Index: projects/clang1100-import/lib/libcasper/services/Makefile
===================================================================
--- projects/clang1100-import/lib/libcasper/services/Makefile	(revision 364278)
+++ projects/clang1100-import/lib/libcasper/services/Makefile	(revision 364279)
@@ -1,16 +1,17 @@
 # $FreeBSD$
 
 .include <src.opts.mk>
 
 SUBDIR=		cap_dns
 SUBDIR+=	cap_fileargs
 SUBDIR+=	cap_grp
+SUBDIR+=	cap_net
 SUBDIR+=	cap_pwd
 SUBDIR+=	cap_sysctl
 SUBDIR+=	cap_syslog
 
 SUBDIR.${MK_TESTS}+=	tests
 
 SUBDIR_PARALLEL=
 
 .include <bsd.subdir.mk>
Index: projects/clang1100-import/lib/libcasper/services/cap_dns/Makefile
===================================================================
--- projects/clang1100-import/lib/libcasper/services/cap_dns/Makefile	(revision 364278)
+++ projects/clang1100-import/lib/libcasper/services/cap_dns/Makefile	(revision 364279)
@@ -1,38 +1,33 @@
 # $FreeBSD$
 
 SHLIBDIR?=	/lib/casper
 
 .include <src.opts.mk>
 
 PACKAGE=	runtime
 
 SHLIB_MAJOR=	2
 INCSDIR?=	${INCLUDEDIR}/casper
 
 .if ${MK_CASPER} != "no"
 SHLIB=	cap_dns
 
 SRCS=	cap_dns.c
 .endif
 
 INCS=	cap_dns.h
 
 LIBADD=	nv
 
 CFLAGS+=-I${.CURDIR}
 
 HAS_TESTS=
 SUBDIR.${MK_TESTS}+= tests
 
 MAN+=	cap_dns.3
 
 MLINKS+=cap_dns.3 libcap_dns.3
-MLINKS+=cap_dns.3 cap_gethostbyname.3
-MLINKS+=cap_dns.3 cap_gethostbyname2.3
-MLINKS+=cap_dns.3 cap_gethostbyaddr.3
-MLINKS+=cap_dns.3 cap_getaddrinfo.3
-MLINKS+=cap_dns.3 cap_getnameinfo.3
 MLINKS+=cap_dns.3 cap_dns_type_limit.3
 MLINKS+=cap_dns.3 cap_dns_family_limit.3
 
 .include <bsd.lib.mk>
Index: projects/clang1100-import/lib/libcasper/services/cap_dns/cap_dns.3
===================================================================
--- projects/clang1100-import/lib/libcasper/services/cap_dns/cap_dns.3	(revision 364278)
+++ projects/clang1100-import/lib/libcasper/services/cap_dns/cap_dns.3	(revision 364279)
@@ -1,239 +1,242 @@
 .\" Copyright (c) 2018 Mariusz Zaborski <oshogbo@FreeBSD.org>
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd May 5, 2020
+.Dd August 15, 2020
 .Dt CAP_DNS 3
 .Os
 .Sh NAME
 .Nm cap_getaddrinfo ,
 .Nm cap_getnameinfo ,
 .Nm cap_gethostbyname ,
 .Nm cap_gethostbyname2 ,
 .Nm cap_gethostbyaddr ,
 .Nm cap_dns_type_limit ,
 .Nm cap_dns_family_limit
 .Nd "library for getting network host entry in capability mode"
 .Sh LIBRARY
 .Lb libcap_dns
 .Sh SYNOPSIS
 .In sys/nv.h
 .In libcasper.h
 .In casper/cap_dns.h
 .Ft int
 .Fn cap_getaddrinfo "cap_channel_t *chan" "const char *hostname" "const char *servname" "const struct addrinfo *hints" "struct addrinfo **res"
 .Ft int
 .Fn cap_getnameinfo "cap_channel_t *chan" "const struct sockaddr *sa" "socklen_t salen" "char *host" "size_t hostlen" "char *serv" "size_t servlen" "int flags"
 .Ft "struct hostent *"
 .Fn cap_gethostbyname "const cap_channel_t *chan" "const char *name"
 .Ft "struct hostent *"
 .Fn cap_gethostbyname2 "const cap_channel_t *chan" "const char *name" "int af"
 .Ft "struct hostent *"
 .Fn cap_gethostbyaddr "const cap_channel_t *chan" "const void *addr" "socklen_t len" "int af"
 .Ft "int"
 .Fn cap_dns_type_limit "cap_channel_t *chan" "const char * const *types" "size_t ntypes"
 .Ft "int"
 .Fn cap_dns_family_limit "const cap_channel_t *chan" "const int *families" "size_t nfamilies"
 .Sh DESCRIPTION
 .Bf -symbolic
+This service is obsolete and
+.Xr cap_net 3
+should be used instead.
 The
 .Fn cap_getaddrinfo ,
 and
 .Fn cap_getnameinfo ,
 functions are preferred over the
 .Fn cap_gethostbyname ,
 .Fn cap_gethostbyname2 ,
 and
 .Fn cap_gethostbyaddr
 functions.
 .Ef
 .Pp
 The functions
 .Fn cap_gethostbyname ,
 .Fn cap_gethostbyname2 ,
 .Fn cep_gethostbyaddr
 and
 .Fn cap_getnameinfo
 are respectively equivalent to
 .Xr gethostbyname 3 ,
 .Xr gethostbyname2 3 ,
 .Xr gethostbyaddr 3
 and
 .Xr getnameinfo 3
 except that the connection to the
 .Nm system.dns
 service needs to be provided.
 .Pp
 The
 .Fn cap_dns_type_limit
 function limits the functions allowed in the service.
 The
 .Fa types
 variable can be set to
 .Dv ADDR2NAME
 or
 .Dv NAME2ADDR .
 See the
 .Sx LIMITS
 section for more details.
 The
 .Fa ntpyes
 variable contains the number of
 .Fa types
 provided.
 .Pp
 The
 .Fn cap_dns_family_limit
 functions allows to limit address families.
 For details see
 .Sx LIMITS .
 The
 .Fa nfamilies
 variable contains the number of
 .Fa families
 provided.
 .Sh LIMITS
 The preferred way of setting limits is to use the
 .Fn cap_dns_type_limit
 and
 .Fn cap_dns_family_limit
 functions, but the limits of service can be set also using
 .Xr cap_limit_set 3 .
 The
 .Xr nvlist 9
 for that function can contain the following values and types:
 .Bl -ohang -offset indent
 .It type ( NV_TYPE_STRING )
 The
 .Va type
 can have two values:
 .Dv ADDR2NAME
 or
 .Dv NAME2ADDR .
 The
 .Dv ADDR2NAME
 means that reverse DNS lookups are allowed with
 .Fn cap_getnameinfo
 and
 .Fn cap_gethostbyaddr
 functions.
 In case when
 .Va type
 is set to
 .Dv NAME2ADDR
 the name resolution is allowed with
 .Fn cap_getaddrinfo ,
 .Fn cap_gethostbyname ,
 and
 .Fn cap_gethostbyname2
 functions.
 .It family ( NV_TYPE_NUMBER )
 The
 .Va family
 limits service to one of the address families (e.g.
 .Dv AF_INET , AF_INET6 ,
 etc.).
 .Sh EXAMPLES
 The following example first opens a capability to casper and then uses this
 capability to create the
 .Nm system.dns
 casper service and uses it to resolve an IP address.
 .Bd -literal
 cap_channel_t *capcas, *capdns;
 int familylimit, error;
 const char *ipstr = "127.0.0.1";
 const char *typelimit = "ADDR2NAME";
 char hname[NI_MAXHOST];
 struct addrinfo hints, *res;
 
 /* Open capability to Casper. */
 capcas = cap_init();
 if (capcas == NULL)
 	err(1, "Unable to contact Casper");
 
 /* Cache NLA for gai_strerror. */
 caph_cache_catpages();
 
 /* Enter capability mode sandbox. */
 if (caph_enter() < 0)
 	err(1, "Unable to enter capability mode");
 
 /* Use Casper capability to create capability to the system.dns service. */
 capdns = cap_service_open(capcas, "system.dns");
 if (capdns == NULL)
 	err(1, "Unable to open system.dns service");
 
 /* Close Casper capability, we don't need it anymore. */
 cap_close(capcas);
 
 /* Limit system.dns to reserve IPv4 addresses */
 familylimit = AF_INET;
 if (cap_dns_family_limit(capdns, &familylimit, 1) < 0)
 	err(1, "Unable to limit access to the system.dns service");
 
 /* Convert IP address in C-string to struct sockaddr. */
 memset(&hints, 0, sizeof(hints));
 hints.ai_family = familylimit;
 hints.ai_flags = AI_NUMERICHOST;
 error = cap_getaddrinfo(capdns, ipstr, NULL, &hints, &res);
 if (error != 0)
        errx(1, "cap_getaddrinfo(): %s: %s", ipstr, gai_strerror(error));
 
 /* Limit system.dns to reverse DNS lookups. */
 if (cap_dns_type_limit(capdns, &typelimit, 1) < 0)
 	err(1, "Unable to limit access to the system.dns service");
 
 /* Find hostname for the given IP address. */
 error = cap_getnameinfo(capdns, res->ai_addr, res->ai_addrlen, hname, sizeof(hname),
     NULL, 0, 0);
 if (error != 0)
 	errx(1, "cap_getnameinfo(): %s: %s", ipstr, gai_strerror(error));
 
 printf("Name associated with %s is %s.\\n", ipstr, hname);
 .Ed
 .Sh SEE ALSO
 .Xr cap_enter 2 ,
 .Xr caph_enter 3 ,
 .Xr err 3 ,
 .Xr gethostbyaddr 3 ,
 .Xr gethostbyname 3 ,
 .Xr gethostbyname2 3 ,
 .Xr getnameinfo 3 ,
 .Xr capsicum 4 ,
 .Xr nv 9
 .Sh HISTORY
 The
 .Nm cap_dns
 service first appeared in
 .Fx 10.3 .
 .Sh AUTHORS
 The
 .Nm cap_dns
 service was implemented by
 .An Pawel Jakub Dawidek Aq Mt pawel@dawidek.net
 under sponsorship from the FreeBSD Foundation.
 .Pp
 This manual page was written by
 .An Mariusz Zaborski Aq Mt oshogbo@FreeBSD.org .
Index: projects/clang1100-import/lib/libcasper/services/cap_net/Makefile
===================================================================
--- projects/clang1100-import/lib/libcasper/services/cap_net/Makefile	(nonexistent)
+++ projects/clang1100-import/lib/libcasper/services/cap_net/Makefile	(revision 364279)
@@ -0,0 +1,48 @@
+# $FreeBSD$
+
+SHLIBDIR?=	/lib/casper
+
+.include <src.opts.mk>
+
+PACKAGE=libcasper
+
+SHLIB_MAJOR=	1
+INCSDIR?=	${INCLUDEDIR}/casper
+# SHLIB/SRCS are set only when MK_CASPER is enabled; INCS and MAN always are.
+.if ${MK_CASPER} != "no"
+SHLIB=	cap_net
+
+SRCS=	cap_net.c
+.endif
+
+INCS=	cap_net.h
+
+LIBADD=	nv
+
+CFLAGS+=-I${.CURDIR}
+CFLAGS+=-DWITH_CASPER
+
+HAS_TESTS=
+SUBDIR.${MK_TESTS}+= tests
+
+MAN+=	cap_net.3
+
+MLINKS+=cap_net.3 libcap_net.3
+MLINKS+=cap_net.3 cap_bind.3
+MLINKS+=cap_net.3 cap_connect.3
+MLINKS+=cap_net.3 cap_net_free.3
+MLINKS+=cap_net.3 cap_net_limit.3
+MLINKS+=cap_net.3 cap_net_limit_addr2name.3
+MLINKS+=cap_net.3 cap_net_limit_addr2name_family.3
+MLINKS+=cap_net.3 cap_net_limit_bind.3
+MLINKS+=cap_net.3 cap_net_limit_connect.3
+MLINKS+=cap_net.3 cap_net_limit_init.3
+MLINKS+=cap_net.3 cap_net_limit_name2addr.3
+MLINKS+=cap_net.3 cap_net_limit_name2addr_family.3
+MLINKS+=cap_net.3 cap_getaddrinfo.3
+MLINKS+=cap_net.3 cap_gethostbyaddr.3
+MLINKS+=cap_net.3 cap_gethostbyname.3
+MLINKS+=cap_net.3 cap_gethostbyname2.3
+MLINKS+=cap_net.3 cap_getnameinfo.3
+
+.include <bsd.lib.mk>

Property changes on: projects/clang1100-import/lib/libcasper/services/cap_net/Makefile
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.3
===================================================================
--- projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.3	(nonexistent)
+++ projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.3	(revision 364279)
@@ -0,0 +1,287 @@
+.\" Copyright (c) 2020 Mariusz Zaborski <oshogbo@FreeBSD.org>
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd August 15, 2020
+.Dt CAP_NET 3
+.Os
+.Sh NAME
+.Nm cap_bind ,
+.Nm cap_connect ,
+.Nm cap_getaddrinfo ,
+.Nm cap_gethostbyaddr ,
+.Nm cap_gethostbyname ,
+.Nm cap_gethostbyname2 ,
+.Nm cap_getnameinfo ,
+.Nm cap_net_free ,
+.Nm cap_net_limit ,
+.Nm cap_net_limit_addr2name ,
+.Nm cap_net_limit_addr2name_family ,
+.Nm cap_net_limit_bind ,
+.Nm cap_net_limit_connect ,
+.Nm cap_net_limit_init ,
+.Nm cap_net_limit_name2addr ,
+.Nm cap_net_limit_name2addr_family
+.Nd "library for networking in capability mode"
+.Sh LIBRARY
+.Lb libcap_net
+.Sh SYNOPSIS
+.In sys/nv.h
+.In libcasper.h
+.In casper/cap_net.h
+.Ft int
+.Fn cap_bind "cap_channel_t *chan" "int s" "const struct sockaddr *addr" "socklen_t addrlen"
+.Ft int
+.Fn cap_connect "cap_channel_t *chan" "int s" "const struct sockaddr *name" "socklen_t namelen"
+.Ft int
+.Fn cap_getaddrinfo "cap_channel_t *chan" "const char *hostname" "const char *servname" "const struct addrinfo *hints" "struct addrinfo **res"
+.Ft int
+.Fn cap_getnameinfo "cap_channel_t *chan" "const struct sockaddr *sa" "socklen_t salen" "char *host" "size_t hostlen" "char *serv" "size_t servlen" "int flags"
+.Ft "struct hostent *"
+.Fn cap_gethostbyname "const cap_channel_t *chan" "const char *name"
+.Ft "struct hostent *"
+.Fn cap_gethostbyname2 "const cap_channel_t *chan" "const char *name" "int af"
+.Ft "struct hostent *"
+.Fn cap_gethostbyaddr "const cap_channel_t *chan" "const void *addr" "socklen_t len" "int af"
+.Ft "cap_net_limit_t *"
+.Fn cap_net_limit_init "cap_channel_t *chan" "uint64_t mode"
+.Ft int
+.Fn cap_net_limit "cap_net_limit_t *limit"
+.Ft void
+.Fn cap_net_free "cap_net_limit_t *limit"
+.Ft "cap_net_limit_t *"
+.Fn cap_net_limit_addr2name_family "cap_net_limit_t *limit" "int *family" "size_t size"
+.Ft "cap_net_limit_t *"
+.Fn cap_net_limit_addr2name "cap_net_limit_t *limit" "const struct sockaddr *sa" "socklen_t salen"
+.Ft "cap_net_limit_t *"
+.Fn cap_net_limit_name2addr_family "cap_net_limit_t *limit" "int *family" "size_t size"
+.Ft "cap_net_limit_t *"
+.Fn cap_net_limit_name2addr "cap_net_limit_t *limit" "const char *name" "const char *serv"
+.Ft "cap_net_limit_t *"
+.Fn cap_net_limit_connect "cap_net_limit_t *limit" "const struct sockaddr *sa" "socklen_t salen"
+.Ft "cap_net_limit_t *"
+.Fn cap_net_limit_bind "cap_net_limit_t *limit" "const struct sockaddr *sa" "socklen_t salen"
+.Sh DESCRIPTION
+.Pp
+The functions
+.Fn cap_bind ,
+.Fn cap_connect ,
+.Fn cap_gethostbyname ,
+.Fn cap_gethostbyname2 ,
+.Fn cap_gethostbyaddr
+and
+.Fn cap_getnameinfo
+are respectively equivalent to
+.Xr bind 2 ,
+.Xr connect 2 ,
+.Xr gethostbyname 3 ,
+.Xr gethostbyname2 3 ,
+.Xr gethostbyaddr 3
+and
+.Xr getnameinfo 3
+except that the connection to the
+.Nm system.net
+service needs to be provided.
+.Sh LIMITS
+By default, the cap_net capability provides unrestricted access to the network
+namespace.
+Applications typically only require access to a small portion of the network
+namespace:
+.Fn cap_net_limit
+interface can be used to restrict access to the network.
+.Fn cap_net_limit_init
+returns an opaque limit handle used to store a list of capabilities.
+The
+.Fa mode
+restricts the functionality of the service.
+Modes are encoded using the following flags:
+.Pp
+.Bd -literal -offset indent -compact
+CAPNET_ADDR2NAME		reverse DNS lookups are allowed with
+				cap_getnameinfo
+CAPNET_NAME2ADDR		name resolution is allowed with
+				cap_getaddrinfo
+CAPNET_DEPRECATED_ADDR2NAME	reverse DNS lookups are allowed with
+				cap_gethostbyaddr
+CAPNET_DEPRECATED_NAME2ADDR	name resolution is allowed with
+				cap_gethostbyname and cap_gethostbyname2
+CAPNET_BIND			bind syscall is allowed
+CAPNET_CONNECT			connect syscall is allowed
+CAPNET_CONNECTDNS		connect syscall is allowed to the values
+				returned from a previous call to
+				the cap_getaddrinfo or cap_gethostbyname
+.Ed
+.Pp
+.Fn cap_net_limit_addr2name_family
+limits the
+.Fn cap_getnameinfo
+and
+.Fn cap_gethostbyaddr
+to do reverse DNS lookups only for specific families (AF_INET, AF_INET6, etc.).
+.Pp
+.Fn cap_net_limit_addr2name
+limits the
+.Fn cap_getnameinfo
+and
+.Fn cap_gethostbyaddr
+to do reverse DNS lookups only on those specific structures.
+.Pp
+.Fn cap_net_limit_name2addr_family
+limits the
+.Fn cap_getaddrinfo ,
+.Fn cap_gethostbyname
+and
+.Fn cap_gethostbyname2
+to resolve names only for specific families (AF_INET, AF_INET6, etc.).
+.Pp
+.Fn cap_net_limit_name2addr
+restricts
+.Fn cap_getaddrinfo ,
+.Fn cap_gethostbyname
+and
+.Fn cap_gethostbyname2
+to a set of domains.
+.Pp
+.Fn cap_net_limit_bind
+limits
+.Fn cap_bind
+to bind only on those specific structures.
+.Pp
+.Fn cap_net_limit_connect
+limits
+.Fn cap_connect
+to connect only on those specific structures.
+If the CAPNET_CONNECTDNS is set the limits are extended to the values returned
+by
+.Fn cap_getaddrinfo ,
+.Fn cap_gethostbyname
+and
+.Fn cap_gethostbyname2 .
+In case of the
+.Fn cap_getaddrinfo
+the restriction is strict.
+In case of the
+.Fn cap_gethostbyname
+and
+.Fn cap_gethostbyname2
+any port will be accepted in the
+.Fn cap_connect
+function.
+.Pp
+.Fn cap_net_limit
+applies the set of limits to the capability, denying any network operation
+not allowed by the set.
+.Pp
+Once a set of limits is applied, subsequent calls to
+.Fn cap_net_limit
+will fail unless the new set is a subset of the current set.
+.Pp
+The
+.Fn cap_net_limit
+will consume the limits.
+If the
+.Fn cap_net_limit
+was not called the rights may be freed using
+.Fn cap_net_free .
+Multiple calls to
+.Fn cap_net_limit_addr2name_family ,
+.Fn cap_net_limit_addr2name ,
+.Fn cap_net_limit_name2addr_family ,
+.Fn cap_net_limit_name2addr ,
+.Fn cap_net_limit_connect ,
+and
+.Fn cap_net_limit_bind
+are supported; each call extends the previous capabilities.
+.Sh EXAMPLES
+The following example first opens a capability to casper and then uses this
+capability to create the
+.Nm system.net
+casper service and uses it to resolve a host and connect to it.
+.Bd -literal
+cap_channel_t *capcas, *capnet;
+cap_net_limit_t *limit;
+int familylimit, error, s;
+const char *host = "example.com";
+struct addrinfo hints, *res;
+
+/* Open capability to Casper. */
+capcas = cap_init();
+if (capcas == NULL)
+	err(1, "Unable to contact Casper");
+
+/* Cache NLS data for gai_strerror. */
+caph_cache_catpages();
+
+/* Enter capability mode sandbox. */
+if (caph_enter_casper() < 0)
+	err(1, "Unable to enter capability mode");
+
+/* Use Casper capability to create capability to the system.net service. */
+capnet = cap_service_open(capcas, "system.net");
+if (capnet == NULL)
+	err(1, "Unable to open system.net service");
+
+/* Close Casper capability. */
+cap_close(capcas);
+
+/* Limit system.net to resolving IPv4 addresses of host example.com. */
+limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR | CAPNET_CONNECTDNS);
+if (limit == NULL)
+	err(1, "Unable to create limits.");
+cap_net_limit_name2addr(limit, host, "80");
+familylimit = AF_INET;
+cap_net_limit_name2addr_family(limit, &familylimit, 1);
+if (cap_net_limit(limit) < 0)
+	err(1, "Unable to apply limits.");
+
+/* Find IP addresses for the given host. */
+memset(&hints, 0, sizeof(hints));
+hints.ai_family = AF_INET;
+hints.ai_socktype = SOCK_STREAM;
+
+error = cap_getaddrinfo(capnet, host, "80", &hints, &res);
+if (error != 0)
+	errx(1, "cap_getaddrinfo(): %s: %s", host, gai_strerror(error));
+
+s = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
+if (s < 0)
+	err(1, "Unable to create socket");
+
+if (cap_connect(capnet, s, res->ai_addr,  res->ai_addrlen) < 0)
+	err(1, "Unable to connect to host");
+.Ed
+.Sh SEE ALSO
+.Xr bind 2 ,
+.Xr cap_enter 2 ,
+.Xr connect 2 ,
+.Xr caph_enter 3 ,
+.Xr err 3 ,
+.Xr gethostbyaddr 3 ,
+.Xr gethostbyname 3 ,
+.Xr gethostbyname2 3 ,
+.Xr getnameinfo 3 ,
+.Xr capsicum 4 ,
+.Xr nv 9
+.Sh AUTHORS
+.An Mariusz Zaborski Aq Mt oshogbo@FreeBSD.org

Property changes on: projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.c
===================================================================
--- projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.c	(nonexistent)
+++ projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.c	(revision 364279)
@@ -0,0 +1,1385 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Mariusz Zaborski <oshogbo@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/cnv.h>
+#include <sys/dnv.h>
+#include <sys/nv.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <netdb.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <libcasper.h>
+#include <libcasper_service.h>
+
+#include "cap_net.h"
+/* Union of every CAPNET_* mode flag. */
+#define	CAPNET_MASK	(CAPNET_ADDR2NAME | CAPNET_NAME2ADDR | \
+    CAPNET_DEPRECATED_ADDR2NAME | CAPNET_DEPRECATED_NAME2ADDR | \
+    CAPNET_CONNECT | CAPNET_BIND | CAPNET_CONNECTDNS)
+
+/*
+ * Defines for the names of the limits.
+ * XXX: we should convert all string constants to this to avoid typos.
+ */
+#define	LIMIT_NV_BIND			"bind"
+#define	LIMIT_NV_CONNECT		"connect"
+#define	LIMIT_NV_ADDR2NAME		"addr2name"
+#define	LIMIT_NV_NAME2ADDR		"name2addr"
+
+struct cap_net_limit {
+	cap_channel_t	*cnl_chan;	/* channel given to cap_limit_set() */
+	uint64_t	 cnl_mode;	/* mode from cap_net_limit_init() */
+	nvlist_t	*cnl_addr2name;	/* packed under "addr2name" */
+	nvlist_t	*cnl_name2addr;	/* packed under "name2addr" */
+	nvlist_t	*cnl_connect;	/* packed under "connect" */
+	nvlist_t	*cnl_bind;	/* packed under "bind" */
+};
+
+static struct hostent hent;	/* result storage reused by cap_gethostby*() */
+
+static void
+hostent_free(struct hostent *hp)
+{
+	unsigned int ii;
+
+	free(hp->h_name);	/* frees the members only, never hp itself */
+	hp->h_name = NULL;
+	if (hp->h_aliases != NULL) {
+		for (ii = 0; hp->h_aliases[ii] != NULL; ii++)
+			free(hp->h_aliases[ii]);	/* NULL-terminated array */
+		free(hp->h_aliases);
+		hp->h_aliases = NULL;	/* pointers are reset after freeing */
+	}
+	if (hp->h_addr_list != NULL) {
+		for (ii = 0; hp->h_addr_list[ii] != NULL; ii++)
+			free(hp->h_addr_list[ii]);
+		free(hp->h_addr_list);
+		hp->h_addr_list = NULL;
+	}
+}
+
+static struct hostent *
+hostent_unpack(const nvlist_t *nvl, struct hostent *hp)
+{
+	unsigned int ii, nitems;
+	char nvlname[64];
+	int n;
+
+	hostent_free(hp);	/* drop whatever a previous call left in hp */
+
+	hp->h_name = strdup(nvlist_get_string(nvl, "name"));
+	if (hp->h_name == NULL)
+		goto fail;
+	hp->h_addrtype = (int)nvlist_get_number(nvl, "addrtype");
+	hp->h_length = (int)nvlist_get_number(nvl, "length");
+
+	nitems = (unsigned int)nvlist_get_number(nvl, "naliases");
+	hp->h_aliases = calloc(sizeof(hp->h_aliases[0]), nitems + 1);
+	if (hp->h_aliases == NULL)
+		goto fail;
+	for (ii = 0; ii < nitems; ii++) {
+		n = snprintf(nvlname, sizeof(nvlname), "alias%u", ii);
+		assert(n > 0 && n < (int)sizeof(nvlname));
+		hp->h_aliases[ii] =
+		    strdup(nvlist_get_string(nvl, nvlname));
+		if (hp->h_aliases[ii] == NULL)
+			goto fail;
+	}
+	hp->h_aliases[ii] = NULL;	/* the extra slot terminates the array */
+
+	nitems = (unsigned int)nvlist_get_number(nvl, "naddrs");
+	hp->h_addr_list = calloc(sizeof(hp->h_addr_list[0]), nitems + 1);
+	if (hp->h_addr_list == NULL)
+		goto fail;
+	for (ii = 0; ii < nitems; ii++) {
+		hp->h_addr_list[ii] = malloc(hp->h_length);
+		if (hp->h_addr_list[ii] == NULL)
+			goto fail;
+		n = snprintf(nvlname, sizeof(nvlname), "addr%u", ii);
+		assert(n > 0 && n < (int)sizeof(nvlname));
+		bcopy(nvlist_get_binary(nvl, nvlname, NULL),
+		    hp->h_addr_list[ii], hp->h_length);
+	}
+	hp->h_addr_list[ii] = NULL;
+
+	return (hp);
+fail:	/* calloc'ed arrays stay NULL-terminated, so a partial hp is freeable */
+	hostent_free(hp);
+	h_errno = NO_RECOVERY;
+	return (NULL);
+}
+
+static int
+request_cb(cap_channel_t *chan, const char *name, int s,
+    const struct sockaddr *saddr, socklen_t len)
+{
+	nvlist_t *nvl;
+	int serrno;
+
+	nvl = nvlist_create(0);	/* request: command name, socket, address */
+	nvlist_add_string(nvl, "cmd", name);
+	nvlist_add_descriptor(nvl, "s", s);
+	nvlist_add_binary(nvl, "saddr", saddr, len);
+
+	nvl = cap_xfer_nvlist(chan, nvl);
+	if (nvl == NULL)
+		return (-1);
+
+	if (nvlist_get_number(nvl, "error") != 0) {
+		serrno = (int)nvlist_get_number(nvl, "error");
+		nvlist_destroy(nvl);
+		errno = serrno;	/* set last so nvlist_destroy() cannot clobber it */
+		return (-1);
+	}
+	/* NOTE(review): dup2(2) is (oldfd, newfd); confirm the argument order. */
+	s = dup2(s, nvlist_get_descriptor(nvl, "s"));
+	nvlist_destroy(nvl);
+
+	return (s == -1 ? -1 : 0);
+}
+
+int
+cap_bind(cap_channel_t *chan, int s, const struct sockaddr *addr,
+    socklen_t addrlen)
+{
+	/* Sends the "bind" command for socket s through request_cb(). */
+	return (request_cb(chan, LIMIT_NV_BIND, s, addr, addrlen));
+}
+
+int
+cap_connect(cap_channel_t *chan, int s, const struct sockaddr *name,
+    socklen_t namelen)
+{
+	/* Sends the "connect" command for socket s through request_cb(). */
+	return (request_cb(chan, LIMIT_NV_CONNECT, s, name, namelen));
+}
+
+
+struct hostent *
+cap_gethostbyname(cap_channel_t *chan, const char *name)
+{
+	/* Same as cap_gethostbyname2() with the family fixed to AF_INET. */
+	return (cap_gethostbyname2(chan, name, AF_INET));
+}
+
+struct hostent *
+cap_gethostbyname2(cap_channel_t *chan, const char *name, int af)
+{
+	struct hostent *hp;
+	nvlist_t *nvl;
+
+	nvl = nvlist_create(0);	/* request: cmd, family, name */
+	nvlist_add_string(nvl, "cmd", "gethostbyname");
+	nvlist_add_number(nvl, "family", (uint64_t)af);
+	nvlist_add_string(nvl, "name", name);
+	nvl = cap_xfer_nvlist(chan, nvl);
+	if (nvl == NULL) {
+		h_errno = NO_RECOVERY;
+		return (NULL);
+	}
+	if (nvlist_get_number(nvl, "error") != 0) {
+		h_errno = (int)nvlist_get_number(nvl, "error");
+		nvlist_destroy(nvl);
+		return (NULL);
+	}
+
+	hp = hostent_unpack(nvl, &hent);	/* result lives in static hent */
+	nvlist_destroy(nvl);
+	return (hp);
+}
+
+struct hostent *
+cap_gethostbyaddr(cap_channel_t *chan, const void *addr, socklen_t len,
+    int af)
+{
+	struct hostent *hp;
+	nvlist_t *nvl;
+
+	nvl = nvlist_create(0);	/* request: cmd, raw address bytes, family */
+	nvlist_add_string(nvl, "cmd", "gethostbyaddr");
+	nvlist_add_binary(nvl, "addr", addr, (size_t)len);
+	nvlist_add_number(nvl, "family", (uint64_t)af);
+	nvl = cap_xfer_nvlist(chan, nvl);
+	if (nvl == NULL) {
+		h_errno = NO_RECOVERY;
+		return (NULL);
+	}
+	if (nvlist_get_number(nvl, "error") != 0) {
+		h_errno = (int)nvlist_get_number(nvl, "error");
+		nvlist_destroy(nvl);
+		return (NULL);
+	}
+	hp = hostent_unpack(nvl, &hent);	/* result lives in static hent */
+	nvlist_destroy(nvl);
+	return (hp);
+}
+
+static struct addrinfo *
+addrinfo_unpack(const nvlist_t *nvl)
+{
+	struct addrinfo *ai;
+	const void *addr;
+	size_t addrlen;
+	const char *canonname;
+
+	addr = nvlist_get_binary(nvl, "ai_addr", &addrlen);
+	ai = malloc(sizeof(*ai) + addrlen);	/* address is stored after *ai */
+	if (ai == NULL)
+		return (NULL);
+	ai->ai_flags = (int)nvlist_get_number(nvl, "ai_flags");
+	ai->ai_family = (int)nvlist_get_number(nvl, "ai_family");
+	ai->ai_socktype = (int)nvlist_get_number(nvl, "ai_socktype");
+	ai->ai_protocol = (int)nvlist_get_number(nvl, "ai_protocol");
+	ai->ai_addrlen = (socklen_t)addrlen;
+	canonname = dnvlist_get_string(nvl, "ai_canonname", NULL);
+	if (canonname != NULL) {	/* "ai_canonname" is optional */
+		ai->ai_canonname = strdup(canonname);
+		if (ai->ai_canonname == NULL) {
+			free(ai);
+			return (NULL);
+		}
+	} else {
+		ai->ai_canonname = NULL;
+	}
+	ai->ai_addr = (void *)(ai + 1);	/* points into the same allocation */
+	bcopy(addr, ai->ai_addr, addrlen);
+	ai->ai_next = NULL;
+
+	return (ai);
+}
+
+int
+cap_getaddrinfo(cap_channel_t *chan, const char *hostname, const char *servname,
+    const struct addrinfo *hints, struct addrinfo **res)
+{
+	struct addrinfo *firstai, *prevai, *curai;
+	unsigned int ii;
+	const nvlist_t *nvlai;
+	char nvlname[64];
+	nvlist_t *nvl;
+	int error, n;
+
+	nvl = nvlist_create(0);	/* request: cmd plus the optional arguments */
+	nvlist_add_string(nvl, "cmd", "getaddrinfo");
+	if (hostname != NULL)
+		nvlist_add_string(nvl, "hostname", hostname);
+	if (servname != NULL)
+		nvlist_add_string(nvl, "servname", servname);
+	if (hints != NULL) {
+		nvlist_add_number(nvl, "hints.ai_flags",
+		    (uint64_t)hints->ai_flags);
+		nvlist_add_number(nvl, "hints.ai_family",
+		    (uint64_t)hints->ai_family);
+		nvlist_add_number(nvl, "hints.ai_socktype",
+		    (uint64_t)hints->ai_socktype);
+		nvlist_add_number(nvl, "hints.ai_protocol",
+		    (uint64_t)hints->ai_protocol);
+	}
+	nvl = cap_xfer_nvlist(chan, nvl);
+	if (nvl == NULL)
+		return (EAI_MEMORY);
+	if (nvlist_get_number(nvl, "error") != 0) {
+		error = (int)nvlist_get_number(nvl, "error");
+		nvlist_destroy(nvl);
+		return (error);
+	}
+
+	nvlai = NULL;
+	firstai = prevai = curai = NULL;
+	for (ii = 0; ; ii++) {	/* results arrive as "res0", "res1", ... */
+		n = snprintf(nvlname, sizeof(nvlname), "res%u", ii);
+		assert(n > 0 && n < (int)sizeof(nvlname));
+		if (!nvlist_exists_nvlist(nvl, nvlname))
+			break;
+		nvlai = nvlist_get_nvlist(nvl, nvlname);
+		curai = addrinfo_unpack(nvlai);
+		if (curai == NULL)
+			break;	/* out of memory; cleaned up after the loop */
+		if (prevai != NULL)
+			prevai->ai_next = curai;
+		else
+			firstai = curai;
+		prevai = curai;
+	}
+	nvlist_destroy(nvl);
+	if (curai == NULL && nvlai != NULL) {	/* last unpack attempt failed */
+		if (firstai != NULL)
+			freeaddrinfo(firstai);	/* drop the partial list */
+		return (EAI_MEMORY);
+	}
+
+	*res = firstai;	/* written on success only */
+	return (0);
+}
+
+int
+cap_getnameinfo(cap_channel_t *chan, const struct sockaddr *sa, socklen_t salen,
+    char *host, size_t hostlen, char *serv, size_t servlen, int flags)
+{
+	nvlist_t *nvl;
+	int error;
+
+	nvl = nvlist_create(0);	/* request: cmd, buffer sizes, address, flags */
+	nvlist_add_string(nvl, "cmd", "getnameinfo");
+	nvlist_add_number(nvl, "hostlen", (uint64_t)hostlen);
+	nvlist_add_number(nvl, "servlen", (uint64_t)servlen);
+	nvlist_add_binary(nvl, "sa", sa, (size_t)salen);
+	nvlist_add_number(nvl, "flags", (uint64_t)flags);
+	nvl = cap_xfer_nvlist(chan, nvl);
+	if (nvl == NULL)
+		return (EAI_MEMORY);
+	if (nvlist_get_number(nvl, "error") != 0) {
+		error = (int)nvlist_get_number(nvl, "error");
+		nvlist_destroy(nvl);
+		return (error);
+	}
+	/* hostlen/servlen are the full buffer sizes, NUL included: no "+ 1". */
+	if (host != NULL && nvlist_exists_string(nvl, "host"))
+		strlcpy(host, nvlist_get_string(nvl, "host"), hostlen);
+	if (serv != NULL && nvlist_exists_string(nvl, "serv"))
+		strlcpy(serv, nvlist_get_string(nvl, "serv"), servlen);
+	nvlist_destroy(nvl);
+	return (0);
+}
+
+cap_net_limit_t *
+cap_net_limit_init(cap_channel_t *chan, uint64_t mode)
+{
+	cap_net_limit_t *limit;
+
+	limit = calloc(1, sizeof(*limit));	/* NULL is returned on failure */
+	if (limit != NULL) {
+		limit->cnl_mode = mode;
+		limit->cnl_chan = chan;
+		limit->cnl_addr2name = nvlist_create(0);	/* start empty */
+		limit->cnl_name2addr = nvlist_create(0);
+		limit->cnl_connect = nvlist_create(0);
+		limit->cnl_bind = nvlist_create(0);
+	}
+
+	return (limit);
+}
+
+static void
+pack_limit(nvlist_t *lnvl, const char *name, nvlist_t *limit)
+{
+	/* Move a non-empty list into lnvl under name; destroy an empty one. */
+	if (!nvlist_empty(limit)) {
+		nvlist_move_nvlist(lnvl, name, limit);
+	} else {
+		nvlist_destroy(limit);
+	}
+}
+
+/*
+ * Apply the limits collected in limit to its channel.
+ *
+ * The mode and every non-empty per-function list are packed into a single
+ * nvlist that is passed to cap_limit_set().  The limit description is
+ * consumed: its lists are moved or destroyed and the structure is freed, so
+ * the caller must not use it afterwards.
+ *
+ * Returns the value of cap_limit_set().
+ */
+int
+cap_net_limit(cap_net_limit_t *limit)
+{
+	cap_channel_t *chan;
+	nvlist_t *request;
+
+	chan = limit->cnl_chan;
+
+	request = nvlist_create(0);
+	nvlist_add_number(request, "mode", limit->cnl_mode);
+	pack_limit(request, LIMIT_NV_ADDR2NAME, limit->cnl_addr2name);
+	pack_limit(request, LIMIT_NV_NAME2ADDR, limit->cnl_name2addr);
+	pack_limit(request, LIMIT_NV_CONNECT, limit->cnl_connect);
+	pack_limit(request, LIMIT_NV_BIND, limit->cnl_bind);
+
+	free(limit);
+
+	return (cap_limit_set(chan, request));
+}
+
+/*
+ * Release a limit description that has not been passed to cap_net_limit().
+ * A NULL limit is accepted and ignored.
+ */
+void
+cap_net_free(cap_net_limit_t *limit)
+{
+
+	if (limit != NULL) {
+		nvlist_destroy(limit->cnl_addr2name);
+		nvlist_destroy(limit->cnl_name2addr);
+		nvlist_destroy(limit->cnl_connect);
+		nvlist_destroy(limit->cnl_bind);
+
+		free(limit);
+	}
+}
+
+/*
+ * Append the size address families in family to the "family" number array
+ * of nvl, creating the array from the first element when it does not exist
+ * yet.
+ */
+static void
+pack_family(nvlist_t *nvl, int *family, size_t size)
+{
+	size_t i;
+
+	/* An empty set adds nothing; in particular family[0] is not read. */
+	if (size == 0)
+		return;
+
+	i = 0;
+	if (!nvlist_exists_number_array(nvl, "family")) {
+		uint64_t val;
+
+		val = family[0];
+		nvlist_add_number_array(nvl, "family", &val, 1);
+		i += 1;
+	}
+
+	for (; i < size; i++) {
+		nvlist_append_number_array(nvl, "family", family[i]);
+	}
+}
+
+/*
+ * Add the socket address sa (salen bytes) to the "sockaddr" list of res.
+ *
+ * The list is created with NV_FLAG_NO_UNIQUE on first use, which lets every
+ * address be stored under the same empty name.
+ */
+static void
+pack_sockaddr(nvlist_t *res, const struct sockaddr *sa, socklen_t salen)
+{
+	nvlist_t *salist;
+
+	if (nvlist_exists_nvlist(res, "sockaddr"))
+		salist = nvlist_take_nvlist(res, "sockaddr");
+	else
+		salist = nvlist_create(NV_FLAG_NO_UNIQUE);
+
+	nvlist_add_binary(salist, "", sa, salen);
+	nvlist_move_nvlist(res, "sockaddr", salist);
+}
+
+/*
+ * Add size address families to the addr2name limits.
+ * Returns limit so that calls can be chained.
+ */
+cap_net_limit_t *
+cap_net_limit_addr2name_family(cap_net_limit_t *limit, int *family, size_t size)
+{
+	nvlist_t *addr2name;
+
+	addr2name = limit->cnl_addr2name;
+	pack_family(addr2name, family, size);
+
+	return (limit);
+}
+
+/*
+ * Add size address families to the name2addr limits.
+ * Returns limit so that calls can be chained.
+ */
+cap_net_limit_t *
+cap_net_limit_name2addr_family(cap_net_limit_t *limit, int *family, size_t size)
+{
+	nvlist_t *name2addr;
+
+	name2addr = limit->cnl_name2addr;
+	pack_family(name2addr, family, size);
+
+	return (limit);
+}
+
+/*
+ * Add a host/service pair to the "hosts" list of the name2addr limits.
+ *
+ * The host is stored as the entry name and the service as its string value;
+ * a NULL host or serv is stored as the empty string.
+ * Returns limit so that calls can be chained.
+ */
+cap_net_limit_t *
+cap_net_limit_name2addr(cap_net_limit_t *limit, const char *host,
+    const char *serv)
+{
+	const char *hostname, *servname;
+	nvlist_t *hosts;
+
+	if (nvlist_exists_nvlist(limit->cnl_name2addr, "hosts"))
+		hosts = nvlist_take_nvlist(limit->cnl_name2addr, "hosts");
+	else
+		hosts = nvlist_create(NV_FLAG_NO_UNIQUE);
+
+	hostname = (host == NULL) ? "" : host;
+	servname = (serv == NULL) ? "" : serv;
+	nvlist_add_string(hosts, hostname, servname);
+
+	nvlist_move_nvlist(limit->cnl_name2addr, "hosts", hosts);
+	return (limit);
+}
+
+/*
+ * Add the socket address sa to the addr2name limits.
+ * Returns limit so that calls can be chained.
+ */
+cap_net_limit_t *
+cap_net_limit_addr2name(cap_net_limit_t *limit, const struct sockaddr *sa,
+    socklen_t salen)
+{
+	nvlist_t *addr2name;
+
+	addr2name = limit->cnl_addr2name;
+	pack_sockaddr(addr2name, sa, salen);
+
+	return (limit);
+}
+
+
+/*
+ * Add the socket address sa to the connect limits.
+ * Returns limit so that calls can be chained.
+ */
+cap_net_limit_t *
+cap_net_limit_connect(cap_net_limit_t *limit, const struct sockaddr *sa,
+    socklen_t salen)
+{
+	nvlist_t *connectlimit;
+
+	connectlimit = limit->cnl_connect;
+	pack_sockaddr(connectlimit, sa, salen);
+
+	return (limit);
+}
+
+/*
+ * Add the socket address sa to the bind limits.
+ * Returns limit so that calls can be chained.
+ */
+cap_net_limit_t *
+cap_net_limit_bind(cap_net_limit_t *limit, const struct sockaddr *sa,
+    socklen_t salen)
+{
+	nvlist_t *bindlimit;
+
+	bindlimit = limit->cnl_bind;
+	pack_sockaddr(bindlimit, sa, salen);
+
+	return (limit);
+}
+
+/*
+ * Service functions.
+ */
+
+/*
+ * Socket addresses produced by name lookups, created lazily by
+ * net_add_sockaddr_to_cache().  Every entry is a binary sockaddr; entries
+ * named "d" come from the hostent-based (deprecated) lookups and carry no
+ * port, all others are named "".  net_connect() consults the cache for the
+ * CAPNET_CONNECTDNS mode.
+ */
+static nvlist_t *capdnscache;
+
+/*
+ * Remember the socket address sa (salen bytes) in capdnscache.
+ *
+ * The cache is created on first use.  An address that is already cached
+ * byte-for-byte is not added again.  Entries are named "d" when deprecated
+ * is set and "" otherwise.
+ */
+static void
+net_add_sockaddr_to_cache(struct sockaddr *sa, socklen_t salen, bool deprecated)
+{
+	const void *cached;
+	const char *entryname;
+	void *cookie;
+	size_t cachedsize;
+
+	if (capdnscache != NULL) {
+		/* Keep the cache free of duplicates. */
+		for (cookie = NULL;
+		    nvlist_next(capdnscache, NULL, &cookie) != NULL; ) {
+			assert(cnvlist_type(cookie) == NV_TYPE_BINARY);
+
+			cached = cnvlist_get_binary(cookie, &cachedsize);
+			if (cachedsize == salen &&
+			    memcmp(cached, sa, cachedsize) == 0)
+				return;
+		}
+	} else {
+		capdnscache = nvlist_create(NV_FLAG_NO_UNIQUE);
+	}
+
+	entryname = deprecated ? "d" : "";
+	nvlist_add_binary(capdnscache, entryname, sa, salen);
+}
+
+/*
+ * Cache one address taken from a struct hostent.
+ *
+ * address points at asize bytes of raw address data of the given family.
+ * Only AF_INET and AF_INET6 are cached; the address is wrapped in a zeroed
+ * sockaddr with port 0 and stored as a "deprecated" entry, for which
+ * net_allowed_bsaddr_impl() ignores the port when matching.
+ */
+static void
+net_add_hostent_to_cache(const char *address, size_t asize, int family)
+{
+
+	if (family != AF_INET && family != AF_INET6)
+		return;
+
+	if (family == AF_INET6) {
+		struct sockaddr_in6 connaddr;
+
+		/* Never copy more than the address field can hold. */
+		if (asize > sizeof(connaddr.sin6_addr))
+			return;
+
+		memset(&connaddr, 0, sizeof(connaddr));
+		connaddr.sin6_family = AF_INET6;
+		memcpy((char *)&connaddr.sin6_addr, address, asize);
+		connaddr.sin6_port = 0;
+
+		net_add_sockaddr_to_cache((struct sockaddr *)&connaddr,
+		    sizeof(connaddr), true);
+	} else {
+		struct sockaddr_in connaddr;
+
+		/* Never copy more than the address field can hold. */
+		if (asize > sizeof(connaddr.sin_addr.s_addr))
+			return;
+
+		memset(&connaddr, 0, sizeof(connaddr));
+		connaddr.sin_family = AF_INET;
+		memcpy((char *)&connaddr.sin_addr.s_addr, address, asize);
+		connaddr.sin_port = 0;
+
+		net_add_sockaddr_to_cache((struct sockaddr *)&connaddr,
+		    sizeof(connaddr), true);
+	}
+}
+
+/*
+ * Check whether every bit of mode is present in the "mode" number of limits.
+ * Without limits (limits == NULL) every mode is allowed.
+ */
+static bool
+net_allowed_mode(const nvlist_t *limits, uint64_t mode)
+{
+	uint64_t allowed;
+
+	if (limits == NULL)
+		return (true);
+
+	allowed = nvlist_get_number(limits, "mode");
+	return ((allowed & mode) == mode);
+}
+
+/*
+ * Check whether family is listed in the "family" number array of limits.
+ * Without limits, or without a "family" array, every family is allowed.
+ */
+static bool
+net_allowed_family(const nvlist_t *limits, int family)
+{
+	const uint64_t *families;
+	uint64_t wanted;
+	size_t idx, nfamilies;
+
+	/* No limits, or no families at all: allow any family. */
+	if (limits == NULL || !nvlist_exists_number_array(limits, "family"))
+		return (true);
+
+	families = nvlist_get_number_array(limits, "family", &nfamilies);
+	wanted = (uint64_t)family;
+	for (idx = 0; idx < nfamilies; idx++) {
+		/* XXX: what with AF_UNSPEC? */
+		if (families[idx] == wanted)
+			return (true);
+	}
+
+	return (false);
+}
+
+/*
+ * Check whether the binary address saddr (saddrsize bytes) matches one of
+ * the binary entries of salimits.
+ *
+ * An entry matches when it has the same size and the same bytes.  Entries
+ * named "d" originate from the deprecated hostent-based lookups, which know
+ * nothing about ports; for those, AF_INET and AF_INET6 sized addresses are
+ * compared once more with the port of saddr substituted into the entry.
+ */
+static bool
+net_allowed_bsaddr_impl(const nvlist_t *salimits, const void *saddr,
+    size_t saddrsize)
+{
+	struct sockaddr_in6 entry6;
+	struct sockaddr_in entry4;
+	const void *entry;
+	void *cookie;
+	size_t entrysize;
+
+	cookie = NULL;
+	while (nvlist_next(salimits, NULL, &cookie) != NULL) {
+		entry = cnvlist_get_binary(cookie, &entrysize);
+
+		/* An entry of a different size can never match. */
+		if (entrysize != saddrsize)
+			continue;
+
+		if (memcmp(entry, saddr, entrysize) == 0)
+			return (true);
+
+		/*
+		 * Only entries of the deprecated kind get the port-agnostic
+		 * comparison, and only for AF_INET and AF_INET6 sized
+		 * addresses.
+		 */
+		if (strcmp(cnvlist_name(cookie), "d") != 0)
+			continue;
+
+		if (saddrsize == sizeof(struct sockaddr_in)) {
+			memcpy(&entry4, entry, sizeof(entry4));
+			entry4.sin_port =
+			    ((const struct sockaddr_in *)saddr)->sin_port;
+
+			if (memcmp(&entry4, saddr, saddrsize) == 0)
+				return (true);
+		} else if (saddrsize == sizeof(struct sockaddr_in6)) {
+			memcpy(&entry6, entry, sizeof(entry6));
+			entry6.sin6_port =
+			    ((const struct sockaddr_in6 *)saddr)->sin6_port;
+
+			if (memcmp(&entry6, saddr, saddrsize) == 0)
+				return (true);
+		}
+	}
+
+	return (false);
+}
+
+/*
+ * Check saddr against the "sockaddr" list of the per-function limits.
+ * Without limits, or without a "sockaddr" list, every address is allowed.
+ */
+static bool
+net_allowed_bsaddr(const nvlist_t *limits, const void *saddr, size_t saddrsize)
+{
+	const nvlist_t *salimits;
+
+	if (limits == NULL || !nvlist_exists_nvlist(limits, "sockaddr"))
+		return (true);
+
+	salimits = nvlist_get_nvlist(limits, "sockaddr");
+	return (net_allowed_bsaddr_impl(salimits, saddr, saddrsize));
+}
+
+/*
+ * Check a host/service pair against the "hosts" list of the per-function
+ * limits.
+ *
+ * Each entry stores the host as its name and the service as its string
+ * value; an empty host or service in an entry matches anything.  A NULL
+ * name or srvname is compared as the empty string.  Without limits, or
+ * without a "hosts" list, every pair is allowed.
+ */
+static bool
+net_allowed_hosts(const nvlist_t *limits, const char *name, const char *srvname)
+{
+	const nvlist_t *hostlimits;
+	const char *allowedhost, *allowedserv;
+	const char *wantedhost, *wantedserv;
+	void *cookie;
+
+	/* No limits, or no hosts at all: allow any. */
+	if (limits == NULL || !nvlist_exists_nvlist(limits, "hosts"))
+		return (true);
+
+	wantedhost = (name != NULL) ? name : "";
+	wantedserv = (srvname != NULL) ? srvname : "";
+	hostlimits = nvlist_get_nvlist(limits, "hosts");
+
+	cookie = NULL;
+	while (nvlist_next(hostlimits, NULL, &cookie) != NULL) {
+		allowedhost = cnvlist_name(cookie);
+		if (strcmp(allowedhost, "") != 0 &&
+		    strcmp(allowedhost, wantedhost) != 0)
+			continue;
+
+		allowedserv = cnvlist_get_string(cookie);
+		if (strcmp(allowedserv, "") == 0 ||
+		    strcmp(allowedserv, wantedserv) == 0)
+			return (true);
+	}
+
+	return (false);
+}
+
+/*
+ * Serialize the hostent hp into nvl.
+ *
+ * Stored keys: "name", "addrtype", "length", "alias<N>" plus "naliases",
+ * and "addr<N>" plus "naddrs".  When addtocache is set every address is also
+ * handed to net_add_hostent_to_cache().
+ */
+static void
+hostent_pack(const struct hostent *hp, nvlist_t *nvl, bool addtocache)
+{
+	char key[64];
+	unsigned int naliases, naddrs;
+	int len;
+
+	nvlist_add_string(nvl, "name", hp->h_name);
+	nvlist_add_number(nvl, "addrtype", (uint64_t)hp->h_addrtype);
+	nvlist_add_number(nvl, "length", (uint64_t)hp->h_length);
+
+	naliases = 0;
+	if (hp->h_aliases != NULL) {
+		while (hp->h_aliases[naliases] != NULL) {
+			len = snprintf(key, sizeof(key), "alias%u", naliases);
+			assert(len > 0 && len < (int)sizeof(key));
+			nvlist_add_string(nvl, key, hp->h_aliases[naliases]);
+			naliases++;
+		}
+	}
+	nvlist_add_number(nvl, "naliases", (uint64_t)naliases);
+
+	naddrs = 0;
+	if (hp->h_addr_list != NULL) {
+		while (hp->h_addr_list[naddrs] != NULL) {
+			len = snprintf(key, sizeof(key), "addr%u", naddrs);
+			assert(len > 0 && len < (int)sizeof(key));
+			nvlist_add_binary(nvl, key, hp->h_addr_list[naddrs],
+			    (size_t)hp->h_length);
+			if (addtocache) {
+				net_add_hostent_to_cache(
+				    hp->h_addr_list[naddrs], hp->h_length,
+				    hp->h_addrtype);
+			}
+			naddrs++;
+		}
+	}
+	nvlist_add_number(nvl, "naddrs", (uint64_t)naddrs);
+}
+
+/*
+ * Service side of the "gethostbyname" command.
+ *
+ * Requires the CAPNET_DEPRECATED_NAME2ADDR mode and checks the requested
+ * "family" and "name" against the name2addr limits.  The result of
+ * gethostbyname2() is packed into nvlout; when the CAPNET_CONNECTDNS mode
+ * is allowed the returned addresses are also added to the DNS cache.
+ *
+ * Returns 0 on success, ENOTCAPABLE when a limit forbids the request, or
+ * h_errno when the lookup fails.
+ */
+static int
+net_gethostbyname(const nvlist_t *limits, const nvlist_t *nvlin,
+    nvlist_t *nvlout)
+{
+	struct hostent *hp;
+	int family;
+	const nvlist_t *funclimit;
+	const char *name;
+	bool dnscache;
+
+	if (!net_allowed_mode(limits, CAPNET_DEPRECATED_NAME2ADDR))
+		return (ENOTCAPABLE);
+
+	dnscache = net_allowed_mode(limits, CAPNET_CONNECTDNS);
+
+	/*
+	 * limits is NULL until a limit has been set; only look up the
+	 * per-function limits in an existing list, as the verify_*()
+	 * helpers do.
+	 */
+	funclimit = NULL;
+	if (limits != NULL) {
+		funclimit = dnvlist_get_nvlist(limits, LIMIT_NV_NAME2ADDR,
+		    NULL);
+	}
+
+	family = (int)nvlist_get_number(nvlin, "family");
+	if (!net_allowed_family(funclimit, family))
+		return (ENOTCAPABLE);
+
+	name = nvlist_get_string(nvlin, "name");
+	if (!net_allowed_hosts(funclimit, name, ""))
+		return (ENOTCAPABLE);
+
+	hp = gethostbyname2(name, family);
+	if (hp == NULL)
+		return (h_errno);
+	hostent_pack(hp, nvlout, dnscache);
+	return (0);
+}
+
+/*
+ * Service side of the "gethostbyaddr" command.
+ *
+ * Requires the CAPNET_DEPRECATED_ADDR2NAME mode and checks the requested
+ * "family" and "addr" against the addr2name limits.  The result of
+ * gethostbyaddr() is packed into nvlout; nothing is added to the DNS cache.
+ *
+ * Returns 0 on success, ENOTCAPABLE when a limit forbids the request, or
+ * h_errno when the lookup fails.
+ */
+static int
+net_gethostbyaddr(const nvlist_t *limits, const nvlist_t *nvlin,
+    nvlist_t *nvlout)
+{
+	struct hostent *hp;
+	const void *addr;
+	size_t addrsize;
+	int family;
+	const nvlist_t *funclimit;
+
+	if (!net_allowed_mode(limits, CAPNET_DEPRECATED_ADDR2NAME))
+		return (ENOTCAPABLE);
+
+	/*
+	 * limits is NULL until a limit has been set; only look up the
+	 * per-function limits in an existing list, as the verify_*()
+	 * helpers do.
+	 */
+	funclimit = NULL;
+	if (limits != NULL) {
+		funclimit = dnvlist_get_nvlist(limits, LIMIT_NV_ADDR2NAME,
+		    NULL);
+	}
+
+	family = (int)nvlist_get_number(nvlin, "family");
+	if (!net_allowed_family(funclimit, family))
+		return (ENOTCAPABLE);
+
+	addr = nvlist_get_binary(nvlin, "addr", &addrsize);
+	if (!net_allowed_bsaddr(funclimit, addr, addrsize))
+		return (ENOTCAPABLE);
+
+	hp = gethostbyaddr(addr, (socklen_t)addrsize, family);
+	if (hp == NULL)
+		return (h_errno);
+	hostent_pack(hp, nvlout, false);
+	return (0);
+}
+
+/*
+ * Service side of the "getnameinfo" command.
+ *
+ * Requires the CAPNET_ADDR2NAME mode.  The socket address "sa" must be an
+ * AF_INET or AF_INET6 address of the matching size and must pass the
+ * addr2name address and family limits.  "hostlen" and "servlen" select which
+ * of the two strings are requested; the results are moved into nvlout as
+ * "host" and "serv".
+ *
+ * Returns 0 on success, ENOTCAPABLE when a limit forbids the request,
+ * EAI_MEMORY or EAI_FAIL for local failures, or the getnameinfo() error.
+ * Every failure path goes through "out" so that the temporary buffers are
+ * released.
+ */
+static int
+net_getnameinfo(const nvlist_t *limits, const nvlist_t *nvlin, nvlist_t *nvlout)
+{
+	struct sockaddr_storage sast;
+	const void *sabin;
+	char *host, *serv;
+	size_t sabinsize, hostlen, servlen;
+	socklen_t salen;
+	int error, flags;
+	const nvlist_t *funclimit;
+
+	if (!net_allowed_mode(limits, CAPNET_ADDR2NAME))
+		return (ENOTCAPABLE);
+
+	/*
+	 * limits is NULL until a limit has been set; only look up the
+	 * per-function limits in an existing list, as the verify_*()
+	 * helpers do.
+	 */
+	funclimit = NULL;
+	if (limits != NULL) {
+		funclimit = dnvlist_get_nvlist(limits, LIMIT_NV_ADDR2NAME,
+		    NULL);
+	}
+
+	error = 0;
+	host = serv = NULL;
+	memset(&sast, 0, sizeof(sast));
+
+	hostlen = (size_t)nvlist_get_number(nvlin, "hostlen");
+	servlen = (size_t)nvlist_get_number(nvlin, "servlen");
+
+	if (hostlen > 0) {
+		host = calloc(1, hostlen + 1);
+		if (host == NULL) {
+			error = EAI_MEMORY;
+			goto out;
+		}
+	}
+	if (servlen > 0) {
+		serv = calloc(1, servlen + 1);
+		if (serv == NULL) {
+			error = EAI_MEMORY;
+			goto out;
+		}
+	}
+
+	sabin = nvlist_get_binary(nvlin, "sa", &sabinsize);
+	if (sabinsize > sizeof(sast)) {
+		error = EAI_FAIL;
+		goto out;
+	}
+	/* host and serv are already allocated here, so do not return. */
+	if (!net_allowed_bsaddr(funclimit, sabin, sabinsize)) {
+		error = ENOTCAPABLE;
+		goto out;
+	}
+
+	memcpy(&sast, sabin, sabinsize);
+	salen = (socklen_t)sabinsize;
+
+	if ((sast.ss_family != AF_INET ||
+	     salen != sizeof(struct sockaddr_in)) &&
+	    (sast.ss_family != AF_INET6 ||
+	     salen != sizeof(struct sockaddr_in6))) {
+		error = EAI_FAIL;
+		goto out;
+	}
+
+	if (!net_allowed_family(funclimit, (int)sast.ss_family)) {
+		error = ENOTCAPABLE;
+		goto out;
+	}
+
+	flags = (int)nvlist_get_number(nvlin, "flags");
+
+	error = getnameinfo((struct sockaddr *)&sast, salen, host, hostlen,
+	    serv, servlen, flags);
+	if (error != 0)
+		goto out;
+
+	/* On success nvlout takes ownership of the buffers. */
+	if (host != NULL)
+		nvlist_move_string(nvlout, "host", host);
+	if (serv != NULL)
+		nvlist_move_string(nvlout, "serv", serv);
+out:
+	if (error != 0) {
+		free(host);
+		free(serv);
+	}
+	return (error);
+}
+
+/*
+ * Serialize a single addrinfo entry into a newly created nvlist.
+ *
+ * Stored keys: "ai_flags", "ai_family", "ai_socktype", "ai_protocol",
+ * "ai_addr" (binary, ai_addrlen bytes) and, when set, "ai_canonname".
+ * ai_next is not followed.  The caller owns the returned list.
+ */
+static nvlist_t *
+addrinfo_pack(const struct addrinfo *ai)
+{
+	nvlist_t *packed;
+
+	packed = nvlist_create(0);
+
+	nvlist_add_number(packed, "ai_flags", (uint64_t)ai->ai_flags);
+	nvlist_add_number(packed, "ai_family", (uint64_t)ai->ai_family);
+	nvlist_add_number(packed, "ai_socktype", (uint64_t)ai->ai_socktype);
+	nvlist_add_number(packed, "ai_protocol", (uint64_t)ai->ai_protocol);
+	nvlist_add_binary(packed, "ai_addr", ai->ai_addr,
+	    (size_t)ai->ai_addrlen);
+	if (ai->ai_canonname != NULL) {
+		nvlist_add_string(packed, "ai_canonname", ai->ai_canonname);
+	}
+
+	return (packed);
+}
+
+/*
+ * Service side of the "getaddrinfo" command.
+ *
+ * Requires the CAPNET_NAME2ADDR mode.  The optional "hostname", "servname"
+ * and "hints.*" values are checked against the name2addr family and hosts
+ * limits; without hints the family is AF_UNSPEC.  Every result is stored in
+ * nvlout as "res<N>", and when the CAPNET_CONNECTDNS mode is allowed its
+ * address is also added to the DNS cache.
+ *
+ * Returns 0 on success, ENOTCAPABLE when a limit forbids the request, or
+ * the getaddrinfo() error.
+ */
+static int
+net_getaddrinfo(const nvlist_t *limits, const nvlist_t *nvlin, nvlist_t *nvlout)
+{
+	struct addrinfo hints, *hintsp, *res, *cur;
+	const char *hostname, *servname;
+	char nvlname[64];
+	nvlist_t *elem;
+	unsigned int ii;
+	int error, family, n;
+	const nvlist_t *funclimit;
+	bool dnscache;
+
+	if (!net_allowed_mode(limits, CAPNET_NAME2ADDR))
+		return (ENOTCAPABLE);
+	dnscache = net_allowed_mode(limits, CAPNET_CONNECTDNS);
+
+	/*
+	 * limits is NULL until a limit has been set; only look up the
+	 * per-function limits in an existing list, as the verify_*()
+	 * helpers do.
+	 */
+	funclimit = NULL;
+	if (limits != NULL) {
+		funclimit = dnvlist_get_nvlist(limits, LIMIT_NV_NAME2ADDR,
+		    NULL);
+	}
+
+	hostname = dnvlist_get_string(nvlin, "hostname", NULL);
+	servname = dnvlist_get_string(nvlin, "servname", NULL);
+	if (nvlist_exists_number(nvlin, "hints.ai_flags")) {
+		hints.ai_flags = (int)nvlist_get_number(nvlin,
+		    "hints.ai_flags");
+		hints.ai_family = (int)nvlist_get_number(nvlin,
+		    "hints.ai_family");
+		hints.ai_socktype = (int)nvlist_get_number(nvlin,
+		    "hints.ai_socktype");
+		hints.ai_protocol = (int)nvlist_get_number(nvlin,
+		    "hints.ai_protocol");
+		hints.ai_addrlen = 0;
+		hints.ai_addr = NULL;
+		hints.ai_canonname = NULL;
+		hints.ai_next = NULL;
+		hintsp = &hints;
+		family = hints.ai_family;
+	} else {
+		hintsp = NULL;
+		family = AF_UNSPEC;
+	}
+
+	if (!net_allowed_family(funclimit, family))
+		return (ENOTCAPABLE);
+	if (!net_allowed_hosts(funclimit, hostname, servname))
+		return (ENOTCAPABLE);
+	error = getaddrinfo(hostname, servname, hintsp, &res);
+	if (error != 0)
+		return (error);
+
+	for (cur = res, ii = 0; cur != NULL; cur = cur->ai_next, ii++) {
+		elem = addrinfo_pack(cur);
+		n = snprintf(nvlname, sizeof(nvlname), "res%u", ii);
+		assert(n > 0 && n < (int)sizeof(nvlname));
+		nvlist_move_nvlist(nvlout, nvlname, elem);
+		if (dnscache) {
+			net_add_sockaddr_to_cache(cur->ai_addr,
+			    cur->ai_addrlen, false);
+		}
+	}
+
+	freeaddrinfo(res);
+	return (0);
+}
+
+/*
+ * Service side of the "bind" command.
+ *
+ * Requires the CAPNET_BIND mode and checks the address "saddr" against the
+ * bind limits.  The descriptor "s" is taken out of nvlin, bound, and moved
+ * into nvlout as "s".
+ *
+ * Returns 0 on success, ENOTCAPABLE when a limit forbids the request, or
+ * the errno of bind(); in the latter case the descriptor is closed.
+ */
+static int
+net_bind(const nvlist_t *limits, nvlist_t *nvlin, nvlist_t *nvlout)
+{
+	int socket, serrno;
+	const void *saddr;
+	size_t len;
+	const nvlist_t *funclimit;
+
+	if (!net_allowed_mode(limits, CAPNET_BIND))
+		return (ENOTCAPABLE);
+
+	/*
+	 * limits is NULL until a limit has been set; only look up the
+	 * per-function limits in an existing list, as the verify_*()
+	 * helpers do.
+	 */
+	funclimit = NULL;
+	if (limits != NULL)
+		funclimit = dnvlist_get_nvlist(limits, LIMIT_NV_BIND, NULL);
+
+	saddr = nvlist_get_binary(nvlin, "saddr", &len);
+
+	if (!net_allowed_bsaddr(funclimit, saddr, len))
+		return (ENOTCAPABLE);
+
+	socket = nvlist_take_descriptor(nvlin, "s");
+	if (bind(socket, saddr, len) < 0) {
+		/* Save errno before close() can overwrite it. */
+		serrno = errno;
+		close(socket);
+		return (serrno);
+	}
+
+	nvlist_move_descriptor(nvlout, "s", socket);
+
+	return (0);
+}
+
+/*
+ * Service side of the "connect" command.
+ *
+ * Requires the CAPNET_CONNECT or the CAPNET_CONNECTDNS mode.  With
+ * CAPNET_CONNECT the address "saddr" is checked against the connect limits.
+ * Otherwise (CAPNET_CONNECTDNS only) the address must be present in the DNS
+ * cache filled by earlier name lookups.  A duplicate of the descriptor "s"
+ * is connected and moved into nvlout as "s".
+ *
+ * Returns 0 on success, ENOTCAPABLE when the request is not allowed, or the
+ * errno of connect(); in the latter case the duplicate is closed.
+ */
+static int
+net_connect(const nvlist_t *limits, nvlist_t *nvlin, nvlist_t *nvlout)
+{
+	int socket, serrno;
+	const void *saddr;
+	const nvlist_t *funclimit;
+	size_t len;
+	bool conn, conndns;
+
+	conn = net_allowed_mode(limits, CAPNET_CONNECT);
+	conndns = net_allowed_mode(limits, CAPNET_CONNECTDNS);
+
+	if (!conn && !conndns)
+		return (ENOTCAPABLE);
+
+	/*
+	 * limits is NULL until a limit has been set; only look up the
+	 * per-function limits in an existing list, as the verify_*()
+	 * helpers do.
+	 */
+	funclimit = NULL;
+	if (limits != NULL) {
+		funclimit = dnvlist_get_nvlist(limits, LIMIT_NV_CONNECT,
+		    NULL);
+	}
+
+	saddr = nvlist_get_binary(nvlin, "saddr", &len);
+	if (conn && !net_allowed_bsaddr(funclimit, saddr, len)) {
+		return (ENOTCAPABLE);
+	} else if (conndns && (capdnscache == NULL ||
+	   !net_allowed_bsaddr_impl(capdnscache, saddr, len))) {
+		return (ENOTCAPABLE);
+	}
+	socket = dup(nvlist_get_descriptor(nvlin, "s"));
+	if (connect(socket, saddr, len) < 0) {
+		/* Save errno before close() can overwrite it. */
+		serrno = errno;
+		close(socket);
+		return (serrno);
+	}
+
+	nvlist_move_descriptor(nvlout, "s", socket);
+
+	return (0);
+}
+
+/*
+ * Verify new per-function limits that may only consist of "sockaddr" lists.
+ *
+ * Every element of newfunclimit has to be an nvlist named "sockaddr" whose
+ * entries are all binary and allowed by oldfunclimits.
+ */
+static bool
+verify_only_sa_newlimts(const nvlist_t *oldfunclimits,
+    const nvlist_t *newfunclimit)
+{
+	const nvlist_t *salist;
+	const void *sa;
+	void *cookie, *sacookie;
+	size_t sasize;
+
+	cookie = NULL;
+	while (nvlist_next(newfunclimit, NULL, &cookie) != NULL) {
+		if (strcmp(cnvlist_name(cookie), "sockaddr") != 0 ||
+		    cnvlist_type(cookie) != NV_TYPE_NVLIST)
+			return (false);
+
+		salist = cnvlist_get_nvlist(cookie);
+		sacookie = NULL;
+		while (nvlist_next(salist, NULL, &sacookie) != NULL) {
+			if (cnvlist_type(sacookie) != NV_TYPE_BINARY)
+				return (false);
+
+			sa = cnvlist_get_binary(sacookie, &sasize);
+			if (!net_allowed_bsaddr(oldfunclimits, sa, sasize))
+				return (false);
+		}
+	}
+
+	return (true);
+}
+
+/*
+ * Verify the new bind limits against the bind part of the current limits
+ * (none when no limits are set yet).
+ */
+static bool
+verify_bind_newlimts(const nvlist_t *oldlimits,
+    const nvlist_t *newfunclimit)
+{
+	const nvlist_t *oldbind;
+
+	if (oldlimits == NULL)
+		oldbind = NULL;
+	else
+		oldbind = dnvlist_get_nvlist(oldlimits, LIMIT_NV_BIND, NULL);
+
+	return (verify_only_sa_newlimts(oldbind, newfunclimit));
+}
+
+
+/*
+ * Verify the new connect limits against the connect part of the current
+ * limits (none when no limits are set yet).
+ */
+static bool
+verify_connect_newlimits(const nvlist_t *oldlimits,
+    const nvlist_t *newfunclimit)
+{
+	const nvlist_t *oldconnect;
+
+	if (oldlimits == NULL) {
+		oldconnect = NULL;
+	} else {
+		oldconnect = dnvlist_get_nvlist(oldlimits, LIMIT_NV_CONNECT,
+		    NULL);
+	}
+
+	return (verify_only_sa_newlimts(oldconnect, newfunclimit));
+}
+
+/*
+ * Verify the new addr2name limits against the current ones.
+ *
+ * newfunclimit may contain a "family" number array and "sockaddr" lists of
+ * binary entries; every family and address has to be allowed by the
+ * addr2name part of oldlimits.  Any other element, or an element of the
+ * wrong type, is rejected.
+ */
+static bool
+verify_addr2name_newlimits(const nvlist_t *oldlimits,
+    const nvlist_t *newfunclimit)
+{
+	const nvlist_t *oldfunclimits, *salist;
+	const uint64_t *families;
+	const char *entryname;
+	const void *sa;
+	void *cookie, *sacookie;
+	size_t i, sfamilies, sasize;
+
+	if (oldlimits == NULL) {
+		oldfunclimits = NULL;
+	} else {
+		oldfunclimits = dnvlist_get_nvlist(oldlimits,
+		    LIMIT_NV_ADDR2NAME, NULL);
+	}
+
+	cookie = NULL;
+	while (nvlist_next(newfunclimit, NULL, &cookie) != NULL) {
+		entryname = cnvlist_name(cookie);
+
+		if (strcmp(entryname, "sockaddr") == 0) {
+			if (cnvlist_type(cookie) != NV_TYPE_NVLIST)
+				return (false);
+
+			salist = cnvlist_get_nvlist(cookie);
+			sacookie = NULL;
+			while (nvlist_next(salist, NULL, &sacookie) != NULL) {
+				if (cnvlist_type(sacookie) != NV_TYPE_BINARY)
+					return (false);
+
+				sa = cnvlist_get_binary(sacookie, &sasize);
+				if (!net_allowed_bsaddr(oldfunclimits, sa,
+				    sasize))
+					return (false);
+			}
+			continue;
+		}
+
+		/* Apart from "sockaddr" only "family" is accepted. */
+		if (strcmp(entryname, "family") != 0)
+			return (false);
+		if (cnvlist_type(cookie) != NV_TYPE_NUMBER_ARRAY)
+			return (false);
+
+		families = cnvlist_get_number_array(cookie, &sfamilies);
+		for (i = 0; i < sfamilies; i++) {
+			if (!net_allowed_family(oldfunclimits, families[i]))
+				return (false);
+		}
+	}
+
+	return (true);
+}
+
+/*
+ * Verify the new name2addr limits against the current ones.
+ *
+ * newfunclimit may contain a "family" number array and "hosts" lists of
+ * string entries (host as name, service as value); every family and every
+ * host/service pair has to be allowed by the name2addr part of oldlimits.
+ * Any other element, or an element of the wrong type, is rejected.
+ */
+static bool
+verify_name2addr_newlimits(const nvlist_t *oldlimits,
+    const nvlist_t *newfunclimit)
+{
+	void *cookie;
+	const nvlist_t *oldfunclimits;
+
+	/*
+	 * The new limits have to be compared with the current name2addr
+	 * limits; the addr2name limits hold neither hosts nor the families
+	 * that apply to name lookups.
+	 */
+	oldfunclimits = NULL;
+	if (oldlimits != NULL) {
+		oldfunclimits = dnvlist_get_nvlist(oldlimits,
+		    LIMIT_NV_NAME2ADDR, NULL);
+	}
+
+	cookie = NULL;
+	while (nvlist_next(newfunclimit, NULL, &cookie) != NULL) {
+		if (strcmp(cnvlist_name(cookie), "hosts") == 0) {
+			void *hostcookie;
+
+			if (cnvlist_type(cookie) != NV_TYPE_NVLIST)
+				return (false);
+
+			hostcookie = NULL;
+			while (nvlist_next(cnvlist_get_nvlist(cookie), NULL,
+			    &hostcookie) != NULL) {
+				if (cnvlist_type(hostcookie) != NV_TYPE_STRING)
+					return (false);
+
+				if (!net_allowed_hosts(oldfunclimits,
+				    cnvlist_name(hostcookie),
+				    cnvlist_get_string(hostcookie))) {
+					return (false);
+				}
+			}
+		} else if (strcmp(cnvlist_name(cookie), "family") == 0) {
+			size_t i, sfamilies;
+			const uint64_t *families;
+
+			if (cnvlist_type(cookie) != NV_TYPE_NUMBER_ARRAY)
+				return (false);
+
+			families = cnvlist_get_number_array(cookie, &sfamilies);
+			for (i = 0; i < sfamilies; i++) {
+				if (!net_allowed_family(oldfunclimits,
+				    families[i])) {
+					return (false);
+				}
+			}
+		} else {
+			return (false);
+		}
+	}
+
+	return (true);
+}
+
+/*
+ * Limit callback of the service: decide whether newlimits may replace
+ * oldlimits.
+ *
+ * Returns 0 when the new limits are acceptable, NO_RECOVERY when an element
+ * has an unexpected type, and ENOTCAPABLE when the new limits are missing
+ * the mode or would grant more than the current limits do.
+ */
+static int
+net_limit(const nvlist_t *oldlimits, const nvlist_t *newlimits)
+{
+	const char *name;
+	void *cookie;
+	bool hasmode, hasconnect, hasbind, hasaddr2name, hasname2addr;
+
+	/*
+	 * Modes:
+	 *	ADDR2NAME:
+	 *		getnameinfo
+	 *	DEPRECATED_ADDR2NAME:
+	 *		gethostbyaddr
+	 *
+	 *	NAME2ADDR:
+	 *		getaddrinfo
+	 *	DEPRECATED_NAME2ADDR:
+	 *		gethostbyname
+	 *
+	 * Limit scheme:
+	 *	mode	: NV_TYPE_NUMBER
+	 *	connect : NV_TYPE_NVLIST
+	 *		sockaddr : NV_TYPE_NVLIST
+	 *			""	: NV_TYPE_BINARY
+	 *			...	: NV_TYPE_BINARY
+	 *	bind	: NV_TYPE_NVLIST
+	 *		sockaddr : NV_TYPE_NVLIST
+	 *			""	: NV_TYPE_BINARY
+	 *			...	: NV_TYPE_BINARY
+	 *	addr2name : NV_TYPE_NVLIST
+	 *		family  : NV_TYPE_NUMBER_ARRAY
+	 *		sockaddr : NV_TYPE_NVLIST
+	 *			""	: NV_TYPE_BINARY
+	 *			...	: NV_TYPE_BINARY
+	 *	name2addr : NV_TYPE_NVLIST
+	 *		family : NV_TYPE_NUMBER_ARRAY
+	 *		hosts	: NV_TYPE_NVLIST
+	 *			host	: servname : NV_TYPE_STRING
+	 */
+
+	hasmode = false;
+	hasconnect = false;
+	hasbind = false;
+	hasaddr2name = false;
+	hasname2addr = false;
+
+	cookie = NULL;
+	while ((name = nvlist_next(newlimits, NULL, &cookie)) != NULL) {
+		if (strcmp(name, "mode") == 0) {
+			if (cnvlist_type(cookie) != NV_TYPE_NUMBER) {
+				return (NO_RECOVERY);
+			}
+			if (!net_allowed_mode(oldlimits,
+			    cnvlist_get_number(cookie))) {
+				return (ENOTCAPABLE);
+			}
+			hasmode = true;
+			continue;
+		}
+
+		/* Everything apart from the mode is a per-function nvlist. */
+		if (cnvlist_type(cookie) != NV_TYPE_NVLIST) {
+			return (NO_RECOVERY);
+		}
+
+		if (strcmp(name, LIMIT_NV_BIND) == 0) {
+			hasbind = true;
+			if (!verify_bind_newlimts(oldlimits,
+			    cnvlist_get_nvlist(cookie))) {
+				return (ENOTCAPABLE);
+			}
+		} else if (strcmp(name, LIMIT_NV_CONNECT) == 0) {
+			hasconnect = true;
+			if (!verify_connect_newlimits(oldlimits,
+			    cnvlist_get_nvlist(cookie))) {
+				return (ENOTCAPABLE);
+			}
+		} else if (strcmp(name, LIMIT_NV_ADDR2NAME) == 0) {
+			hasaddr2name = true;
+			if (!verify_addr2name_newlimits(oldlimits,
+			    cnvlist_get_nvlist(cookie))) {
+				return (ENOTCAPABLE);
+			}
+		} else if (strcmp(name, LIMIT_NV_NAME2ADDR) == 0) {
+			hasname2addr = true;
+			if (!verify_name2addr_newlimits(oldlimits,
+			    cnvlist_get_nvlist(cookie))) {
+				return (ENOTCAPABLE);
+			}
+		}
+	}
+
+	/* Mode is required. */
+	if (!hasmode)
+		return (ENOTCAPABLE);
+
+	/*
+	 * A missing per-function limit means that the function is not
+	 * restricted at all.  Hence a per-function limit that is present in
+	 * the current limits must also be present in the new ones; leaving
+	 * it out would lift the restriction.
+	 */
+	if (oldlimits == NULL)
+		return (0);
+	if (!hasbind && nvlist_exists(oldlimits, LIMIT_NV_BIND))
+		return (ENOTCAPABLE);
+	if (!hasconnect && nvlist_exists(oldlimits, LIMIT_NV_CONNECT))
+		return (ENOTCAPABLE);
+	if (!hasaddr2name && nvlist_exists(oldlimits, LIMIT_NV_ADDR2NAME))
+		return (ENOTCAPABLE);
+	if (!hasname2addr && nvlist_exists(oldlimits, LIMIT_NV_NAME2ADDR))
+		return (ENOTCAPABLE);
+	return (0);
+}
+
+/*
+ * Command callback of the service: dispatch cmd to its handler and return
+ * the handler's result, or EINVAL for an unknown command.
+ */
+static int
+net_command(const char *cmd, const nvlist_t *limits, nvlist_t *nvlin,
+    nvlist_t *nvlout)
+{
+
+	if (strcmp(cmd, "bind") == 0)
+		return (net_bind(limits, nvlin, nvlout));
+	if (strcmp(cmd, "connect") == 0)
+		return (net_connect(limits, nvlin, nvlout));
+	if (strcmp(cmd, "gethostbyname") == 0)
+		return (net_gethostbyname(limits, nvlin, nvlout));
+	if (strcmp(cmd, "gethostbyaddr") == 0)
+		return (net_gethostbyaddr(limits, nvlin, nvlout));
+	if (strcmp(cmd, "getnameinfo") == 0)
+		return (net_getnameinfo(limits, nvlin, nvlout));
+	if (strcmp(cmd, "getaddrinfo") == 0)
+		return (net_getaddrinfo(limits, nvlin, nvlout));
+
+	return (EINVAL);
+}
+
+/*
+ * Register the service under the name "system.net" with net_limit() as its
+ * limit callback and net_command() as its command callback.
+ * NOTE(review): the last argument is presumably the service flags (none
+ * here) -- confirm against the CREATE_SERVICE() definition.
+ */
+CREATE_SERVICE("system.net", net_limit, net_command, 0);

Property changes on: projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.h
===================================================================
--- projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.h	(nonexistent)
+++ projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.h	(revision 364279)
@@ -0,0 +1,165 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Mariusz Zaborski <oshogbo@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _CAP_NETWORK_H_
+#define	_CAP_NETWORK_H_
+
+#ifdef HAVE_CASPER
+#define WITH_CASPER
+#endif
+
+#include <sys/dnv.h>
+#include <sys/nv.h>
+
+#include <sys/socket.h>
+
+struct addrinfo;
+struct hostent;
+
+struct cap_net_limit;
+typedef struct cap_net_limit cap_net_limit_t;
+
+/*
+ * Mode bits for cap_net_limit_init(), as checked by the command handlers in
+ * cap_net.c:
+ *	CAPNET_ADDR2NAME		getnameinfo
+ *	CAPNET_NAME2ADDR		getaddrinfo
+ *	CAPNET_DEPRECATED_ADDR2NAME	gethostbyaddr
+ *	CAPNET_DEPRECATED_NAME2ADDR	gethostbyname
+ *	CAPNET_CONNECT			connect, subject to the connect limits
+ *	CAPNET_BIND			bind, subject to the bind limits
+ *	CAPNET_CONNECTDNS		connect to addresses returned by
+ *					earlier name lookups
+ */
+#define CAPNET_ADDR2NAME		(0x01)
+#define CAPNET_NAME2ADDR		(0x02)
+#define CAPNET_DEPRECATED_ADDR2NAME	(0x04)
+#define CAPNET_DEPRECATED_NAME2ADDR	(0x08)
+#define CAPNET_CONNECT			(0x10)
+#define CAPNET_BIND			(0x20)
+#define CAPNET_CONNECTDNS		(0x40)
+
+#ifdef WITH_CASPER
+/*
+ * Capability functions.
+ * Each one takes the service channel as its first argument, followed by the
+ * arguments of the libc function it is named after.
+ */
+int cap_bind(cap_channel_t *chan, int s, const struct sockaddr *addr,
+    socklen_t addrlen);
+int cap_connect(cap_channel_t *chan, int s, const struct sockaddr *name,
+    socklen_t namelen);
+
+int cap_getaddrinfo(cap_channel_t *chan, const char *hostname,
+    const char *servname, const struct addrinfo *hints, struct addrinfo **res);
+int cap_getnameinfo(cap_channel_t *chan, const struct sockaddr *sa,
+    socklen_t salen, char *host, size_t hostlen, char *serv, size_t servlen,
+    int flags);
+
+/*
+ * Limit functions.
+ *
+ * cap_net_limit_init() allocates a limit description for chan with the given
+ * CAPNET_* mode mask and returns NULL when the allocation fails.  The
+ * cap_net_limit_*() setters add entries to it and return the description so
+ * that calls can be chained.  cap_net_limit() applies the description to the
+ * channel and frees it; cap_net_free() frees a description that was not
+ * applied and accepts NULL.
+ */
+cap_net_limit_t *cap_net_limit_init(cap_channel_t *chan, uint64_t mode);
+int cap_net_limit(cap_net_limit_t *limit);
+void cap_net_free(cap_net_limit_t *limit);
+
+/* Restrict address to name lookups to the given families / addresses. */
+cap_net_limit_t *cap_net_limit_addr2name_family(cap_net_limit_t *limit,
+    int *family, size_t size);
+cap_net_limit_t *cap_net_limit_addr2name(cap_net_limit_t *limit,
+    const struct sockaddr *sa, socklen_t salen);
+
+/*
+ * Restrict name to address lookups to the given families / host and service
+ * pairs.  A NULL name or serv is stored as the empty string, which matches
+ * any host or service respectively.
+ */
+cap_net_limit_t *cap_net_limit_name2addr_family(cap_net_limit_t *limit,
+    int *family, size_t size);
+cap_net_limit_t *cap_net_limit_name2addr(cap_net_limit_t *limit,
+    const char *name, const char *serv);
+
+/* Restrict connect to the given addresses. */
+cap_net_limit_t *cap_net_limit_connect(cap_net_limit_t *limit,
+    const struct sockaddr *sa, socklen_t salen);
+
+/* Restrict bind to the given addresses. */
+cap_net_limit_t *cap_net_limit_bind(cap_net_limit_t *limit,
+    const struct sockaddr *sa, socklen_t salen);
+
+/* Deprecated functions. */
+struct hostent *cap_gethostbyname(cap_channel_t *chan, const char *name);
+struct hostent *cap_gethostbyname2(cap_channel_t *chan, const char *name,
+    int af);
+struct hostent *cap_gethostbyaddr(cap_channel_t *chan, const void *addr,
+    socklen_t len, int af);
+#else
+/*
+ * Capability functions.
+ * Without Casper they expand to the plain libc calls; the chan argument is
+ * dropped.
+ */
+#define cap_bind(chan, s, addr, addrlen)					\
+	bind(s, addr, addrlen)
+#define cap_connect(chan, s, name, namelen)					\
+	connect(s, name, namelen)
+#define	cap_getaddrinfo(chan, hostname, servname, hints, res)			\
+	getaddrinfo(hostname, servname, hints, res)
+#define	cap_getnameinfo(chan, sa, salen, host, hostlen, serv, servlen, flags)	\
+	getnameinfo(sa, salen, host, hostlen, serv, servlen, flags)
+
+/*
+ * Limit functions.
+ * Without Casper nothing is limited: cap_net_limit_init() only returns a
+ * small allocation that cap_net_limit() or cap_net_free() releases, and the
+ * setters below return limit unchanged.
+ */
+#define cap_net_limit_init(chan, mode)	((cap_net_limit_t *)malloc(8))
+#define cap_net_free(limit)		free(limit)
+/* Release the placeholder allocation and report success. */
+static inline int
+cap_net_limit(cap_net_limit_t *limit)
+{
+	free(limit);
+	return (0);
+}
+
+/* No-op: returns limit unchanged. */
+static inline cap_net_limit_t *
+cap_net_limit_addr2name_family(cap_net_limit_t *limit,
+    int *family __unused, size_t size __unused)
+{
+	return (limit);
+}
+
+/* No-op: returns limit unchanged. */
+static inline cap_net_limit_t *
+cap_net_limit_addr2name(cap_net_limit_t *limit,
+    const struct sockaddr *sa __unused, socklen_t salen __unused)
+{
+	return (limit);
+}
+
+/* No-op: returns limit unchanged. */
+static inline cap_net_limit_t *
+cap_net_limit_name2addr_family(cap_net_limit_t *limit,
+    int *family __unused, size_t size __unused)
+{
+	return (limit);
+}
+
+/* No-op: returns limit unchanged. */
+static inline cap_net_limit_t *
+cap_net_limit_name2addr(cap_net_limit_t *limit,
+    const char *name __unused, const char *serv __unused)
+{
+	return (limit);
+}
+
+/* No-op: returns limit unchanged. */
+static inline cap_net_limit_t *
+cap_net_limit_connect(cap_net_limit_t *limit,
+    const struct sockaddr *sa __unused, socklen_t salen __unused)
+{
+	return (limit);
+}
+
+/* No-op: returns limit unchanged. */
+static inline cap_net_limit_t *
+cap_net_limit_bind(cap_net_limit_t *limit,
+    const struct sockaddr *sa __unused, socklen_t salen __unused)
+{
+	return (limit);
+}
+
+/*
+ * Deprecated functions.
+ * Without Casper they expand to the plain libc calls; the chan argument is
+ * dropped.
+ */
+#define	cap_gethostbyname(chan, name)		 gethostbyname(name)
+#define	cap_gethostbyname2(chan, name, type)	 gethostbyname2(name, type)
+#define	cap_gethostbyaddr(chan, addr, len, type) gethostbyaddr(addr, len, type)
+#endif
+
+#endif	/* !_CAP_NETWORK_H_ */

Property changes on: projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/clang1100-import/lib/libcasper/services/cap_net/tests/Makefile
===================================================================
--- projects/clang1100-import/lib/libcasper/services/cap_net/tests/Makefile	(nonexistent)
+++ projects/clang1100-import/lib/libcasper/services/cap_net/tests/Makefile	(revision 364279)
@@ -0,0 +1,16 @@
+# $FreeBSD$
+
+.include <src.opts.mk>
+
+# A single ATF C test program.
+ATF_TESTS_C=	net_test
+
+# With Casper enabled the test links against libcasper and libcap_net and is
+# compiled with WITH_CASPER defined; libnv is linked in either case.
+.if ${MK_CASPER} != "no"
+LIBADD+=	casper
+LIBADD+=	cap_net
+CFLAGS+=-DWITH_CASPER
+.endif
+LIBADD+=	nv
+
+WARNS?=		3
+
+.include <bsd.test.mk>

Property changes on: projects/clang1100-import/lib/libcasper/services/cap_net/tests/Makefile
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/clang1100-import/lib/libcasper/services/cap_net/tests/net_test.c
===================================================================
--- projects/clang1100-import/lib/libcasper/services/cap_net/tests/net_test.c	(nonexistent)
+++ projects/clang1100-import/lib/libcasper/services/cap_net/tests/net_test.c	(revision 364279)
@@ -0,0 +1,1160 @@
+/*-
+ * Copyright (c) 2020 Mariusz Zaborski <oshogbo@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include <errno.h>
+#include <netdb.h>
+
+#include <atf-c.h>
+
+#include <libcasper.h>
+#include <casper/cap_net.h>
+
+#define	TEST_DOMAIN_0	"example.com"
+#define	TEST_DOMAIN_1	"freebsd.org"
+#define	TEST_IPV4	"1.1.1.1"
+#define	TEST_IPV6	"2001:4860:4860::8888"
+#define	TEST_BIND_IPV4	"127.0.0.1"
+
+static cap_channel_t *
+create_network_service(void)
+{
+	cap_channel_t *capcas, *capnet;
+
+	capcas = cap_init();
+	ATF_REQUIRE(capcas != NULL);
+	/* Open the "system.net" service over the channel from cap_init(). */
+	capnet = cap_service_open(capcas, "system.net");
+	ATF_REQUIRE(capnet != NULL);
+	/* Only the service channel is returned; the initial one is closed. */
+	cap_close(capcas);
+	return (capnet);
+}
+
+static int
+test_getnameinfo_v4(cap_channel_t *chan, int family, const char *ip)
+{
+	struct sockaddr_in ipaddr;
+	char capfn[MAXHOSTNAMELEN];
+	char origfn[MAXHOSTNAMELEN];
+	int ret;
+
+	memset(&ipaddr, 0, sizeof(ipaddr));
+	ipaddr.sin_family = family;
+	inet_pton(family, ip, &ipaddr.sin_addr);
+	/* A failure of the Casper lookup is returned rather than asserted. */
+	ret = cap_getnameinfo(chan, (struct sockaddr *)&ipaddr, sizeof(ipaddr),
+	    capfn, sizeof(capfn), NULL, 0, NI_NAMEREQD);
+	if (ret != 0) {
+		return (ret);
+	}
+	/* The libc lookup must then succeed and give the same host name. */
+	ret = getnameinfo((struct sockaddr *)&ipaddr, sizeof(ipaddr), origfn,
+	    sizeof(origfn), NULL, 0, NI_NAMEREQD);
+	ATF_REQUIRE(ret == 0);
+	ATF_REQUIRE(strcmp(origfn, capfn) == 0);
+
+	return (0);
+}
+
+static int
+test_getnameinfo_v6(cap_channel_t *chan, const char *ip)
+{
+	struct sockaddr_in6 ipaddr;
+	char capfn[MAXHOSTNAMELEN];
+	char origfn[MAXHOSTNAMELEN];
+	int ret;
+
+	memset(&ipaddr, 0, sizeof(ipaddr));
+	ipaddr.sin6_family = AF_INET6;
+	inet_pton(AF_INET6, ip, &ipaddr.sin6_addr);
+	/* A failure of the Casper lookup is returned rather than asserted. */
+	ret = cap_getnameinfo(chan, (struct sockaddr *)&ipaddr, sizeof(ipaddr),
+	    capfn, sizeof(capfn), NULL, 0, NI_NAMEREQD);
+	if (ret != 0) {
+		return (ret);
+	}
+	/* The libc lookup must then succeed and give the same host name. */
+	ret = getnameinfo((struct sockaddr *)&ipaddr, sizeof(ipaddr), origfn,
+	    sizeof(origfn), NULL, 0, NI_NAMEREQD);
+	ATF_REQUIRE(ret == 0);
+	ATF_REQUIRE(strcmp(origfn, capfn) == 0);
+
+	return (0);
+}
+
+static int
+test_getnameinfo(cap_channel_t *chan, int family, const char *ip)
+{
+
+	/* Only AF_INET6 needs sockaddr_in6; the rest take the v4 helper. */
+	if (family != AF_INET6)
+		return (test_getnameinfo_v4(chan, family, ip));
+
+	return (test_getnameinfo_v6(chan, ip));
+}
+
+static int
+test_gethostbyaddr_v4(cap_channel_t *chan, int family, const char *ip)
+{
+	struct in_addr ipaddr;
+	struct hostent *caphp, *orighp;
+
+	memset(&ipaddr, 0, sizeof(ipaddr));
+	inet_pton(AF_INET, ip, &ipaddr);
+
+	/* A failure of the Casper lookup is handed back through h_errno. */
+	caphp = cap_gethostbyaddr(chan, &ipaddr, sizeof(ipaddr), family);
+	if (caphp == NULL)
+		return (h_errno);
+	/* The libc answer must exist and carry the same canonical name. */
+	orighp = gethostbyaddr(&ipaddr, sizeof(ipaddr), family);
+	ATF_REQUIRE(orighp != NULL);
+	ATF_REQUIRE(strcmp(caphp->h_name, orighp->h_name) == 0);
+
+	return (0);
+}
+
+static int
+test_gethostbyaddr_v6(cap_channel_t *chan, const char *ip)
+{
+	struct in6_addr ipaddr;
+	struct hostent *caphp, *orighp;
+
+	memset(&ipaddr, 0, sizeof(ipaddr));
+	inet_pton(AF_INET6, ip, &ipaddr);
+	/* A failure of the Casper lookup is handed back through h_errno. */
+	caphp = cap_gethostbyaddr(chan, &ipaddr, sizeof(ipaddr), AF_INET6);
+	if (caphp == NULL)
+		return (h_errno);
+	/* The libc answer must exist and carry the same canonical name. */
+	orighp = gethostbyaddr(&ipaddr, sizeof(ipaddr), AF_INET6);
+	ATF_REQUIRE(orighp != NULL);
+	ATF_REQUIRE(strcmp(caphp->h_name, orighp->h_name) == 0);
+
+	return (0);
+}
+
+static int
+test_gethostbyaddr(cap_channel_t *chan, int family, const char *ip)
+{
+
+	/* Dispatch on family: the v6 helper hard-codes AF_INET6 itself. */
+	if (family != AF_INET6)
+		return (test_gethostbyaddr_v4(chan, family, ip));
+
+	return (test_gethostbyaddr_v6(chan, ip));
+}
+
+static int
+test_getaddrinfo(cap_channel_t *chan, int family, const char *domain,
+    const char *servname)
+{
+	struct addrinfo hints, *capres, *origres, *res0, *res1;
+	bool found;
+	int ret;
+
+	memset(&hints, 0, sizeof(hints));
+	hints.ai_family = family;
+	hints.ai_socktype = SOCK_STREAM;
+	/* A failure of the Casper lookup is returned rather than asserted. */
+	ret = cap_getaddrinfo(chan, domain, servname, &hints, &capres);
+	if (ret != 0) {
+		return (ret);
+	}
+
+	ret = getaddrinfo(domain, servname, &hints, &origres);
+	ATF_REQUIRE(ret == 0);
+	/* Every address Casper returned must also be in the libc result. */
+	for (res0 = capres; res0 != NULL; res0 = res0->ai_next) {
+		found = false;
+		for (res1 = origres; res1 != NULL; res1 = res1->ai_next) {
+			if (res1->ai_addrlen == res0->ai_addrlen &&
+			    memcmp(res1->ai_addr, res0->ai_addr,
+			    res0->ai_addrlen) == 0) {
+				found = true;
+				break;
+			}
+		}
+		ATF_REQUIRE(found);
+	}
+
+	freeaddrinfo(capres);
+	freeaddrinfo(origres);
+	return (0);
+}
+
+static int
+test_gethostbyname(cap_channel_t *chan, int family, const char *domain)
+{
+	struct hostent *caphp, *orighp;
+
+	caphp = cap_gethostbyname2(chan, domain, family);
+	if (caphp == NULL) {
+		return (h_errno);
+	}
+	/* The libc lookup must succeed and agree on the canonical name. */
+	orighp = gethostbyname2(domain, family);
+	ATF_REQUIRE(orighp != NULL);
+	ATF_REQUIRE(strcmp(caphp->h_name, orighp->h_name) == 0);
+
+	return (0);
+}
+
+static int
+test_bind(cap_channel_t *chan, const char *ip)
+{
+	struct sockaddr_in ipv4;
+	int capfd, ret, serrno;
+
+	capfd = socket(AF_INET, SOCK_STREAM, 0);
+	ATF_REQUIRE(capfd >= 0);	/* 0 is a valid descriptor */
+
+	memset(&ipv4, 0, sizeof(ipv4));
+	ipv4.sin_family = AF_INET;
+	inet_pton(AF_INET, ip, &ipv4.sin_addr);
+	/* Save errno before close() gets a chance to overwrite it. */
+	ret = cap_bind(chan, capfd, (struct sockaddr *)&ipv4, sizeof(ipv4));
+	serrno = errno;
+	close(capfd);
+	/* Report 0 on success, otherwise the errno left by cap_bind(). */
+	return (ret < 0 ? serrno : 0);
+}
+
+static int
+test_connect(cap_channel_t *chan, const char *ip, unsigned short port)
+{
+	struct sockaddr_in ipv4;
+	int capfd, ret, serrno;
+
+	capfd = socket(AF_INET, SOCK_STREAM, 0);
+	ATF_REQUIRE(capfd >= 0);	/* 0 is a valid descriptor */
+
+	memset(&ipv4, 0, sizeof(ipv4));
+	ipv4.sin_family = AF_INET;
+	ipv4.sin_port = htons(port);
+	inet_pton(AF_INET, ip, &ipv4.sin_addr);
+	/* Save errno before close() gets a chance to overwrite it. */
+	ret = cap_connect(chan, capfd, (struct sockaddr *)&ipv4, sizeof(ipv4));
+	serrno = errno;
+	close(capfd);
+	/* Report 0 on success, otherwise the errno left by cap_connect(). */
+	return (ret < 0 ? serrno : 0);
+}
+
+static void
+test_extend_mode(cap_channel_t *capnet, int current)
+{
+	cap_net_limit_t *limit;
+	const int rights[] = {
+		CAPNET_ADDR2NAME,
+		CAPNET_NAME2ADDR,
+		CAPNET_DEPRECATED_ADDR2NAME,
+		CAPNET_DEPRECATED_NAME2ADDR,
+		CAPNET_CONNECT,
+		CAPNET_BIND,
+		CAPNET_CONNECTDNS
+	};
+	size_t i;
+
+	for (i = 0; i < nitems(rights); i++) {
+		if (current == rights[i])
+			continue;
+		/* Widening the limit with another right must be refused. */
+		limit = cap_net_limit_init(capnet, current | rights[i]);
+		ATF_REQUIRE(limit != NULL);
+		ATF_REQUIRE(cap_net_limit(limit) != 0);
+	}
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_addr2name_mode);
+ATF_TC_BODY(capnet__limits_addr2name_mode, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+
+	capnet = create_network_service();
+
+	/* LIMIT: restrict the service to the CAPNET_ADDR2NAME mode only. */
+	limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME);
+	ATF_REQUIRE(limit != NULL);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	/* ALLOWED: the getnameinfo() style lookup still has to work. */
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == 0);
+
+	/* DISALLOWED: every other operation must report ENOTCAPABLE. */
+	ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_bind(capnet, TEST_BIND_IPV4) == ENOTCAPABLE);
+	ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 80) == ENOTCAPABLE);
+	/* Once limited, the mode can not be widened again. */
+	test_extend_mode(capnet, CAPNET_ADDR2NAME);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_addr2name_family);
+ATF_TC_BODY(capnet__limits_addr2name_family, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+	int family[] = { AF_INET6, AF_INET };
+
+	capnet = create_network_service();
+
+	/* Limit to AF_INET6 and AF_INET, passing both in a single call. */
+	limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_addr2name_family(limit, family, nitems(family));
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == 0);
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET6, TEST_IPV6) == 0);
+
+	/* Limit to AF_INET6 and AF_INET again, one family per call. */
+	limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_addr2name_family(limit, &family[0], 1);
+	cap_net_limit_addr2name_family(limit, &family[1], 1);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == 0);
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET6, TEST_IPV6) == 0);
+
+	/* Limit to AF_INET6 (family[0]); AF_INET must now be refused. */
+	limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_addr2name_family(limit, family, 1);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET6, TEST_IPV6) == 0);
+
+	/* An empty limit would mean full access again, so it must fail. */
+	limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME);
+	ATF_REQUIRE(cap_net_limit(limit) != 0);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_addr2name);
+ATF_TC_BODY(capnet__limits_addr2name, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+	struct sockaddr_in ipaddrv4;
+	struct sockaddr_in6 ipaddrv6;
+
+	capnet = create_network_service();
+
+	/* Limit to the two addresses TEST_IPV4 and TEST_IPV6. */
+	memset(&ipaddrv4, 0, sizeof(ipaddrv4));
+	memset(&ipaddrv6, 0, sizeof(ipaddrv6));
+
+	ipaddrv4.sin_family = AF_INET;
+	inet_pton(AF_INET, TEST_IPV4, &ipaddrv4.sin_addr);
+
+	ipaddrv6.sin6_family = AF_INET6;
+	inet_pton(AF_INET6, TEST_IPV6, &ipaddrv6.sin6_addr);
+
+	limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME);
+	ATF_REQUIRE(limit != NULL);
+
+	cap_net_limit_addr2name(limit, (struct sockaddr *)&ipaddrv4,
+	    sizeof(ipaddrv4));
+	cap_net_limit_addr2name(limit, (struct sockaddr *)&ipaddrv6,
+	    sizeof(ipaddrv6));
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == 0);
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET6, TEST_IPV6) == 0);
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, "127.0.0.1") ==
+	    ENOTCAPABLE);
+
+	/* Limit to the TEST_IPV4 address only. */
+	limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_addr2name(limit, (struct sockaddr *)&ipaddrv4,
+	    sizeof(ipaddrv4));
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == 0);
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET6, TEST_IPV6) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, "127.0.0.1") ==
+	    ENOTCAPABLE);
+
+	/* An empty limit would mean full access again, so it must fail. */
+	limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME);
+	ATF_REQUIRE(cap_net_limit(limit) != 0);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_deprecated_addr2name_mode);
+ATF_TC_BODY(capnet__limits_deprecated_addr2name_mode, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+
+	capnet = create_network_service();
+
+	/* LIMIT: restrict the service to CAPNET_DEPRECATED_ADDR2NAME only. */
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME);
+	ATF_REQUIRE(limit != NULL);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	/* ALLOWED: the gethostbyaddr() style lookup still has to work. */
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == 0);
+
+	/* DISALLOWED: every other operation must report ENOTCAPABLE. */
+	ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_bind(capnet, TEST_BIND_IPV4) == ENOTCAPABLE);
+	ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 80) == ENOTCAPABLE);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_deprecated_addr2name_family);
+ATF_TC_BODY(capnet__limits_deprecated_addr2name_family, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+	int family[] = { AF_INET6, AF_INET };
+
+	capnet = create_network_service();
+
+	/* Limit to AF_INET6 and AF_INET, passing both in a single call. */
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_addr2name_family(limit, family, nitems(family));
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == 0);
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET6, TEST_IPV6) == 0);
+	ATF_REQUIRE(test_gethostbyaddr(capnet, PF_LINK, TEST_IPV4) ==
+	    ENOTCAPABLE);
+
+	/* Limit to AF_INET6 and AF_INET again, one family per call. */
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_addr2name_family(limit, &family[0], 1);
+	cap_net_limit_addr2name_family(limit, &family[1], 1);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == 0);
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET6, TEST_IPV6) == 0);
+	ATF_REQUIRE(test_gethostbyaddr(capnet, PF_LINK, TEST_IPV4) ==
+	    ENOTCAPABLE);
+
+	/* Limit to AF_INET6 (family[0]); AF_INET must now be refused. */
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_addr2name_family(limit, family, 1);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET6, TEST_IPV6) == 0);
+	ATF_REQUIRE(test_gethostbyaddr(capnet, PF_LINK, TEST_IPV4) ==
+	    ENOTCAPABLE);
+
+	/* An empty limit would mean full access again, so it must fail. */
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME);
+	ATF_REQUIRE(cap_net_limit(limit) != 0);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_deprecated_addr2name);
+ATF_TC_BODY(capnet__limits_deprecated_addr2name, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+	struct in_addr ipaddrv4;
+	struct in6_addr ipaddrv6;
+
+	capnet = create_network_service();
+
+	/* Limit to the two addresses TEST_IPV4 and TEST_IPV6. */
+	memset(&ipaddrv4, 0, sizeof(ipaddrv4));
+	memset(&ipaddrv6, 0, sizeof(ipaddrv6));
+
+	inet_pton(AF_INET, TEST_IPV4, &ipaddrv4);
+	inet_pton(AF_INET6, TEST_IPV6, &ipaddrv6);
+
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME);
+	ATF_REQUIRE(limit != NULL);
+	/* Raw in_addr/in6_addr bytes are passed here, not full sockaddrs. */
+	cap_net_limit_addr2name(limit, (struct sockaddr *)&ipaddrv4,
+	    sizeof(ipaddrv4));
+	cap_net_limit_addr2name(limit, (struct sockaddr *)&ipaddrv6,
+	    sizeof(ipaddrv6));
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == 0);
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET6, TEST_IPV6) == 0);
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, "127.0.0.1") ==
+	    ENOTCAPABLE);
+
+	/* Limit to the TEST_IPV4 address only. */
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_addr2name(limit, (struct sockaddr *)&ipaddrv4,
+	    sizeof(ipaddrv4));
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == 0);
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET6, TEST_IPV6) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, "127.0.0.1") ==
+	    ENOTCAPABLE);
+
+	/* An empty limit would mean full access again, so it must fail. */
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME);
+	ATF_REQUIRE(cap_net_limit(limit) != 0);
+
+	cap_close(capnet);
+}
+
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_name2addr_mode);
+ATF_TC_BODY(capnet__limits_name2addr_mode, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+
+	capnet = create_network_service();
+
+	/* LIMIT: restrict the service to the CAPNET_NAME2ADDR mode only. */
+	limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	/* ALLOWED: the getaddrinfo() style lookup still has to work. */
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    0);
+
+	/* DISALLOWED: every other operation must report ENOTCAPABLE. */
+	ATF_REQUIRE(
+	    test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == ENOTCAPABLE);
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_bind(capnet, TEST_BIND_IPV4) == ENOTCAPABLE);
+	ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 80) == ENOTCAPABLE);
+	/* Try to widen starting from the mode that is actually in force. */
+	test_extend_mode(capnet, CAPNET_NAME2ADDR);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_name2addr_hosts);
+ATF_TC_BODY(capnet__limits_name2addr_hosts, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+
+	capnet = create_network_service();
+
+	/* Limit lookups to TEST_DOMAIN_0 and "localhost" only. */
+	limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL);
+	cap_net_limit_name2addr(limit, "localhost", NULL);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    0);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, "localhost", NULL) == 0);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_1, NULL) ==
+	    ENOTCAPABLE);
+
+	/* Narrow to TEST_DOMAIN_0 only; "localhost" is refused from now on. */
+	limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, "localhost", NULL) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_1, NULL) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    0);
+
+	/* An empty limit would mean full access again, so it must fail. */
+	limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR);
+	ATF_REQUIRE(cap_net_limit(limit) != 0);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_name2addr_hosts_servnames_strict);
+ATF_TC_BODY(capnet__limits_name2addr_hosts_servnames_strict, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+
+	capnet = create_network_service();
+
+	/*
+	 * Limit to TEST_DOMAIN_0 combined with the "http" service only.
+	 */
+	limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_name2addr(limit, TEST_DOMAIN_0, "http");
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+	/* Host and servname both have to match the single allowed pair. */
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, "http") ==
+	    0);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, "snmp") ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_1, "http") ==
+	    ENOTCAPABLE);
+
+	/* An empty limit would mean full access again, so it must fail. */
+	limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR);
+	ATF_REQUIRE(cap_net_limit(limit) != 0);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_name2addr_hosts_servnames_mix);
+ATF_TC_BODY(capnet__limits_name2addr_hosts_servnames_mix, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+
+	capnet = create_network_service();
+
+	/*
+	 * Limit to TEST_DOMAIN_0 with any servname, and to any domain with
+	 * the servname "http".
+	 */
+	limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL);
+	cap_net_limit_name2addr(limit, NULL, "http");
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, "http") ==
+	    0);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    0);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_1, "http") ==
+	    0);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    0);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_1, "snmp") ==
+	    ENOTCAPABLE);
+
+	/* Narrow to the "http" servname only, still for any domain. */
+	limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_name2addr(limit, NULL, "http");
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, "http") ==
+	    0);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_1, "http") ==
+	    0);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_1, "snmp") ==
+	    ENOTCAPABLE);
+
+	/* An empty limit would mean full access again, so it must fail. */
+	limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR);
+	ATF_REQUIRE(cap_net_limit(limit) != 0);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_name2addr_family);
+ATF_TC_BODY(capnet__limits_name2addr_family, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+	int family[] = { AF_INET6, AF_INET };
+
+	capnet = create_network_service();
+
+	/* Limit to AF_INET and AF_INET6, passing both in a single call. */
+	limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL);
+	cap_net_limit_name2addr_family(limit, family, nitems(family));
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    0);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET6, TEST_DOMAIN_0, NULL) ==
+	    0);
+	ATF_REQUIRE(test_getaddrinfo(capnet, PF_LINK, TEST_DOMAIN_0, NULL) ==
+	    ENOTCAPABLE);
+
+	/* Limit to AF_INET and AF_INET6 again, one family per call. */
+	limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL);
+	cap_net_limit_name2addr_family(limit, &family[0], 1);
+	cap_net_limit_name2addr_family(limit, &family[1], 1);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    0);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET6, TEST_DOMAIN_0, NULL) ==
+	    0);
+	ATF_REQUIRE(test_getaddrinfo(capnet, PF_LINK, TEST_DOMAIN_0, NULL) ==
+	    ENOTCAPABLE);
+
+	/* Limit to AF_INET6 (family[0]); AF_INET must now be refused. */
+	limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL);
+	cap_net_limit_name2addr_family(limit, family, 1);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET6, TEST_DOMAIN_0, NULL) ==
+	    0);
+	ATF_REQUIRE(test_getaddrinfo(capnet, PF_LINK, TEST_DOMAIN_0, NULL) ==
+	    ENOTCAPABLE);
+
+	/* An empty limit would mean full access again, so it must fail. */
+	limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR);
+	ATF_REQUIRE(cap_net_limit(limit) != 0);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_deprecated_name2addr_mode);
+ATF_TC_BODY(capnet__limits_deprecated_name2addr_mode, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+
+	capnet = create_network_service();
+
+	/* LIMIT: restrict the service to CAPNET_DEPRECATED_NAME2ADDR only. */
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	/* ALLOWED: the gethostbyname() style lookup still has to work. */
+	ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == 0);
+
+	/* DISALLOWED: every other operation must report ENOTCAPABLE. */
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_bind(capnet, TEST_BIND_IPV4) == ENOTCAPABLE);
+	ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 80) == ENOTCAPABLE);
+	/* Try to widen starting from the mode that is actually in force. */
+	test_extend_mode(capnet, CAPNET_DEPRECATED_NAME2ADDR);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_deprecated_name2addr_hosts);
+ATF_TC_BODY(capnet__limits_deprecated_name2addr_hosts, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+
+	capnet = create_network_service();
+
+	/* Limit lookups to TEST_DOMAIN_0 and "localhost" only. */
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL);
+	cap_net_limit_name2addr(limit, "localhost", NULL);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == 0);
+	ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, "localhost") == 0);
+	ATF_REQUIRE(
+	    test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_1) == ENOTCAPABLE);
+
+	/* Narrow to TEST_DOMAIN_0 only; "localhost" is refused from now on. */
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(
+	    test_gethostbyname(capnet, AF_INET, "localhost") == ENOTCAPABLE);
+	ATF_REQUIRE(
+	    test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_1) == ENOTCAPABLE);
+	ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == 0);
+
+	/* An empty limit would mean full access again, so it must fail. */
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR);
+	ATF_REQUIRE(cap_net_limit(limit) != 0);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_deprecated_name2addr_family);
+ATF_TC_BODY(capnet__limits_deprecated_name2addr_family, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+	int family[] = { AF_INET6, AF_INET };
+
+	capnet = create_network_service();
+
+	/* Limit to AF_INET and AF_INET6, passing both in a single call. */
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL);
+	cap_net_limit_name2addr_family(limit, family, nitems(family));
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == 0);
+	ATF_REQUIRE(test_gethostbyname(capnet, AF_INET6, TEST_DOMAIN_0) == 0);
+	ATF_REQUIRE(
+	    test_gethostbyname(capnet, PF_LINK, TEST_DOMAIN_0) == ENOTCAPABLE);
+
+	/* Limit to AF_INET and AF_INET6 again, one family per call. */
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL);
+	cap_net_limit_name2addr_family(limit, &family[0], 1);
+	cap_net_limit_name2addr_family(limit, &family[1], 1);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == 0);
+	ATF_REQUIRE(test_gethostbyname(capnet, AF_INET6, TEST_DOMAIN_0) == 0);
+	ATF_REQUIRE(
+	    test_gethostbyname(capnet, PF_LINK, TEST_DOMAIN_0) == ENOTCAPABLE);
+
+	/* Limit to AF_INET6 (family[0]); AF_INET must now be refused. */
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL);
+	cap_net_limit_name2addr_family(limit, family, 1);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(
+	    test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == ENOTCAPABLE);
+	ATF_REQUIRE(test_gethostbyname(capnet, AF_INET6, TEST_DOMAIN_0) == 0);
+	ATF_REQUIRE(
+	    test_gethostbyname(capnet, PF_LINK, TEST_DOMAIN_0) == ENOTCAPABLE);
+
+	/* An empty limit would mean full access again, so it must fail. */
+	limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR);
+	ATF_REQUIRE(cap_net_limit(limit) != 0);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_bind_mode);
+ATF_TC_BODY(capnet__limits_bind_mode, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+
+	capnet = create_network_service();
+
+	/* LIMIT: restrict the service to the CAPNET_BIND mode only. */
+	limit = cap_net_limit_init(capnet, CAPNET_BIND);
+	ATF_REQUIRE(limit != NULL);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	/* ALLOWED: binding a socket through the service still has to work. */
+	ATF_REQUIRE(test_bind(capnet, TEST_BIND_IPV4) == 0);
+
+	/* DISALLOWED: every other operation must report ENOTCAPABLE. */
+	ATF_REQUIRE(
+	    test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == ENOTCAPABLE);
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 80) == ENOTCAPABLE);
+	/* Try to widen starting from the mode that is actually in force. */
+	test_extend_mode(capnet, CAPNET_BIND);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_bind);
+ATF_TC_BODY(capnet__limits_bind, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+	struct sockaddr_in ipv4;
+
+	capnet = create_network_service();
+
+	limit = cap_net_limit_init(capnet, CAPNET_BIND);
+	ATF_REQUIRE(limit != NULL);
+	/* Only TEST_BIND_IPV4 (port left at 0) is added to the bind limit. */
+	memset(&ipv4, 0, sizeof(ipv4));
+	ipv4.sin_family = AF_INET;
+	inet_pton(AF_INET, TEST_BIND_IPV4, &ipv4.sin_addr);
+
+	cap_net_limit_bind(limit, (struct sockaddr *)&ipv4, sizeof(ipv4));
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+	/* The listed address binds; a different address must be refused. */
+	ATF_REQUIRE(test_bind(capnet, TEST_BIND_IPV4) == 0);
+	ATF_REQUIRE(test_bind(capnet, "127.0.0.2") == ENOTCAPABLE);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_connect_mode);
+ATF_TC_BODY(capnet__limits_connect_mode, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+
+	capnet = create_network_service();
+
+	/* LIMIT: restrict the service to the CAPNET_CONNECT mode only. */
+	limit = cap_net_limit_init(capnet, CAPNET_CONNECT);
+	ATF_REQUIRE(limit != NULL);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	/* ALLOWED: connecting through the service still has to work. */
+	ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 80) == 0);
+
+	/* DISALLOWED: every other operation must report ENOTCAPABLE. */
+	ATF_REQUIRE(
+	    test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == ENOTCAPABLE);
+	ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) ==
+	    ENOTCAPABLE);
+	ATF_REQUIRE(test_bind(capnet, TEST_BIND_IPV4) == ENOTCAPABLE);
+	/* Try to widen starting from the mode that is actually in force. */
+	test_extend_mode(capnet, CAPNET_CONNECT);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_connect);
+ATF_TC_BODY(capnet__limits_connect, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+	struct sockaddr_in ipv4;
+
+	capnet = create_network_service();
+
+	/* Limit only to TEST_IPV4 on port 80 and 443. */
+	limit = cap_net_limit_init(capnet, CAPNET_CONNECT);
+	ATF_REQUIRE(limit != NULL);
+	memset(&ipv4, 0, sizeof(ipv4));
+	ipv4.sin_family = AF_INET;
+	ipv4.sin_port = htons(80);
+	inet_pton(AF_INET, TEST_IPV4, &ipv4.sin_addr);
+	cap_net_limit_connect(limit, (struct sockaddr *)&ipv4, sizeof(ipv4));
+	ipv4.sin_port = htons(443);
+	cap_net_limit_connect(limit, (struct sockaddr *)&ipv4, sizeof(ipv4));
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+	/* Only the listed address/port pairs may be connected to. */
+	ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 80) == 0);
+	ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 80) == ENOTCAPABLE);
+	ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 443) == ENOTCAPABLE);
+	ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 443) == 0);
+
+	/* Narrow to TEST_IPV4 on port 443; ipv4 still holds that endpoint. */
+	limit = cap_net_limit_init(capnet, CAPNET_CONNECT);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_connect(limit, (struct sockaddr *)&ipv4, sizeof(ipv4));
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+	/* Port 80 was allowed before the narrowing and must be refused now. */
+	ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 80) == ENOTCAPABLE);
+	ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 80) == ENOTCAPABLE);
+	ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 443) == ENOTCAPABLE);
+	ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 443) == 0);
+
+	/* An empty limit would mean full access again, so it must fail. */
+	limit = cap_net_limit_init(capnet, CAPNET_CONNECT);
+	ATF_REQUIRE(cap_net_limit(limit) != 0);
+
+	cap_close(capnet);
+}
+
+ATF_TC_WITHOUT_HEAD(capnet__limits_connecttodns);
+ATF_TC_BODY(capnet__limits_connecttodns, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+	struct addrinfo hints, *capres, *res;
+	int family[] = { AF_INET };
+
+	capnet = create_network_service();
+	/* Addresses resolved through the service must be connectable. */
+	limit = cap_net_limit_init(capnet, CAPNET_CONNECTDNS |
+	    CAPNET_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_name2addr(limit, TEST_IPV4, "80");
+	cap_net_limit_name2addr_family(limit, family, 1);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	memset(&hints, 0, sizeof(hints));
+	hints.ai_family = AF_INET;
+	hints.ai_socktype = SOCK_STREAM;
+	/* 8.8.8.8 must be refused both before and after the lookup. */
+	ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 433) == ENOTCAPABLE);
+	ATF_REQUIRE(cap_getaddrinfo(capnet, TEST_IPV4, "80", &hints, &capres) ==
+	    0);
+	ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 433) == ENOTCAPABLE);
+	/* Each result must be IPv4/stream and accept a connection. */
+	for (res = capres; res != NULL; res = res->ai_next) {
+		int s;
+
+		ATF_REQUIRE(res->ai_family == AF_INET);
+		ATF_REQUIRE(res->ai_socktype == SOCK_STREAM);
+
+		s = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
+		ATF_REQUIRE(s >= 0);
+
+		ATF_REQUIRE(cap_connect(capnet, s, res->ai_addr,
+		    res->ai_addrlen) == 0);
+		close(s);
+	}
+
+	freeaddrinfo(capres);
+	cap_close(capnet);
+}
+
+/*
+ * Deprecated-API variant of the connect-to-DNS check: every address returned
+ * by cap_gethostbyname2() must be connectable, 8.8.8.8 must stay refused.
+ */
+ATF_TC_WITHOUT_HEAD(capnet__limits_deprecated_connecttodns);
+ATF_TC_BODY(capnet__limits_deprecated_connecttodns, tc)
+{
+	cap_channel_t *capnet;
+	cap_net_limit_t *limit;
+	struct hostent *caphp;
+	struct sockaddr_in connaddr;
+	int family[] = { AF_INET };
+	int i;
+
+	capnet = create_network_service();
+
+	limit = cap_net_limit_init(capnet, CAPNET_CONNECTDNS |
+	    CAPNET_DEPRECATED_NAME2ADDR);
+	ATF_REQUIRE(limit != NULL);
+	cap_net_limit_name2addr(limit, TEST_IPV4, NULL);
+	cap_net_limit_name2addr_family(limit, family, 1);
+	ATF_REQUIRE(cap_net_limit(limit) == 0);
+
+	ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 433) == ENOTCAPABLE);
+	caphp = cap_gethostbyname2(capnet, TEST_IPV4, AF_INET);
+	ATF_REQUIRE(caphp != NULL);
+	ATF_REQUIRE(caphp->h_addrtype == AF_INET);
+	ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 433) == ENOTCAPABLE);
+
+	/* Connect to each address the lookup returned, on port 80. */
+	for (i = 0; caphp->h_addr_list[i] != NULL; i++) {
+		int s;
+
+		s = socket(AF_INET, SOCK_STREAM, 0);
+		ATF_REQUIRE(s >= 0);
+
+		memset(&connaddr, 0, sizeof(connaddr));
+		connaddr.sin_family = AF_INET;
+		memcpy((char *)&connaddr.sin_addr.s_addr,
+		    (char *)caphp->h_addr_list[i], caphp->h_length);
+		connaddr.sin_port = htons(80);
+
+		ATF_REQUIRE(cap_connect(capnet, s, (struct sockaddr *)&connaddr,
+		    sizeof(connaddr)) == 0);
+		close(s);
+	}
+
+	cap_close(capnet);
+}
+
+ATF_TP_ADD_TCS(tp)
+{
+	/* Register every cap_net limit test case with the test program. */
+	ATF_TP_ADD_TC(tp, capnet__limits_addr2name_mode);
+	ATF_TP_ADD_TC(tp, capnet__limits_addr2name_family);
+	ATF_TP_ADD_TC(tp, capnet__limits_addr2name);
+
+	ATF_TP_ADD_TC(tp, capnet__limits_deprecated_addr2name_mode);
+	ATF_TP_ADD_TC(tp, capnet__limits_deprecated_addr2name_family);
+	ATF_TP_ADD_TC(tp, capnet__limits_deprecated_addr2name);
+
+	ATF_TP_ADD_TC(tp, capnet__limits_name2addr_mode);
+	ATF_TP_ADD_TC(tp, capnet__limits_name2addr_hosts);
+	ATF_TP_ADD_TC(tp, capnet__limits_name2addr_hosts_servnames_strict);
+	ATF_TP_ADD_TC(tp, capnet__limits_name2addr_hosts_servnames_mix);
+	ATF_TP_ADD_TC(tp, capnet__limits_name2addr_family);
+
+	ATF_TP_ADD_TC(tp, capnet__limits_deprecated_name2addr_mode);
+	ATF_TP_ADD_TC(tp, capnet__limits_deprecated_name2addr_hosts);
+	ATF_TP_ADD_TC(tp, capnet__limits_deprecated_name2addr_family);
+
+	ATF_TP_ADD_TC(tp, capnet__limits_bind_mode);
+	ATF_TP_ADD_TC(tp, capnet__limits_bind);
+
+	ATF_TP_ADD_TC(tp, capnet__limits_connect_mode);
+	ATF_TP_ADD_TC(tp, capnet__limits_connect);
+
+	ATF_TP_ADD_TC(tp, capnet__limits_connecttodns);
+	ATF_TP_ADD_TC(tp, capnet__limits_deprecated_connecttodns);
+
+	return (atf_no_error());
+}

Property changes on: projects/clang1100-import/lib/libcasper/services/cap_net/tests/net_test.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/clang1100-import/share/mk/src.libnames.mk
===================================================================
--- projects/clang1100-import/share/mk/src.libnames.mk	(revision 364278)
+++ projects/clang1100-import/share/mk/src.libnames.mk	(revision 364279)
@@ -1,696 +1,698 @@
 # $FreeBSD$
 #
 # The include file <src.libnames.mk> define library names suitable
 # for INTERNALLIB and PRIVATELIB definition
 
 .if !target(__<bsd.init.mk>__)
 .error src.libnames.mk cannot be included directly.
 .endif
 
 .if !target(__<src.libnames.mk>__)
 __<src.libnames.mk>__:
 
 .include <src.opts.mk>
 
 _PRIVATELIBS=	\
 		atf_c \
 		atf_cxx \
 		auditd \
 		bsdstat \
 		devdctl \
 		event1 \
 		gmock \
 		gtest \
 		gmock_main \
 		gtest_main \
 		heimipcc \
 		heimipcs \
 		ldns \
 		sqlite3 \
 		ssh \
 		ucl \
 		unbound \
 		zstd
 
 _INTERNALLIBS=	\
 		amu \
 		bsnmptools \
 		c_nossp_pic \
 		cron \
 		elftc \
 		fifolog \
 		ifconfig \
 		ipf \
 		kyua_cli \
 		kyua_drivers \
 		kyua_engine \
 		kyua_model \
 		kyua_store \
 		kyua_utils \
 		lpr \
 		lua \
 		lutok \
 		netbsd \
 		ntp \
 		ntpevent \
 		openbsd \
 		opts \
 		parse \
 		pe \
 		pmcstat \
 		sl \
 		sm \
 		smdb \
 		smutil \
 		telnet \
 		vers
 
 _LIBRARIES=	\
 		${_PRIVATELIBS} \
 		${_INTERNALLIBS} \
 		${LOCAL_LIBRARIES} \
 		80211 \
 		alias \
 		archive \
 		asn1 \
 		avl \
 		be \
 		begemot \
 		bluetooth \
 		bsdxml \
 		bsm \
 		bsnmp \
 		bz2 \
 		c \
 		c_pic \
 		calendar \
 		cam \
 		casper \
 		cap_dns \
 		cap_fileargs \
 		cap_grp \
+		cap_net \
 		cap_pwd \
 		cap_sysctl \
 		cap_syslog \
 		com_err \
 		compiler_rt \
 		crypt \
 		crypto \
 		ctf \
 		cuse \
 		cxxrt \
 		devctl \
 		devdctl \
 		devinfo \
 		devstat \
 		dialog \
 		dl \
 		dpv \
 		dtrace \
 		dwarf \
 		edit \
 		efivar \
 		elf \
 		execinfo \
 		fetch \
 		figpar \
 		geom \
 		gnuregex \
 		gpio \
 		gssapi \
 		gssapi_krb5 \
 		hdb \
 		heimbase \
 		heimntlm \
 		heimsqlite \
 		hx509 \
 		ipsec \
 		ipt \
 		jail \
 		kadm5clnt \
 		kadm5srv \
 		kafs5 \
 		kdc \
 		kiconv \
 		krb5 \
 		kvm \
 		l \
 		lzma \
 		m \
 		magic \
 		md \
 		memstat \
 		mp \
 		mt \
 		ncurses \
 		ncursesw \
 		netgraph \
 		ngatm \
 		nv \
 		nvpair \
 		opencsd \
 		opie \
 		pam \
 		panel \
 		panelw \
 		pcap \
 		pcsclite \
 		pjdlog \
 		pmc \
 		proc \
 		procstat \
 		pthread \
 		radius \
 		regex \
 		roken \
 		rpcsec_gss \
 		rpcsvc \
 		rt \
 		rtld_db \
 		sbuf \
 		sdp \
 		sm \
 		smb \
 		ssl \
 		ssp_nonshared \
 		stats \
 		stdthreads \
 		supcplusplus \
 		sysdecode \
 		tacplus \
 		termcap \
 		termcapw \
 		ufs \
 		ugidfw \
 		ulog \
 		umem \
 		usb \
 		usbhid \
 		util \
 		uutil \
 		vmmapi \
 		wind \
 		wrap \
 		xo \
 		y \
 		ypclnt \
 		z \
 		zfs_core \
 		zfs \
 		zpool \
 
 .if ${MK_BLACKLIST} != "no"
 _LIBRARIES+= \
 		blacklist \
 
 .endif
 
 .if ${MK_OFED} != "no"
 _LIBRARIES+= \
 		cxgb4 \
 		ibcm \
 		ibmad \
 		ibnetdisc \
 		ibumad \
 		ibverbs \
 		mlx4 \
 		mlx5 \
 		rdmacm \
 		osmcomp \
 		opensm \
 		osmvendor
 .endif
 
 .if ${MK_BEARSSL} == "yes"
 _LIBRARIES+= \
 		bearssl \
 		secureboot \
 
 LIBBEARSSL?=	${LIBBEARSSLDIR}/libbearssl.a
 LIBSECUREBOOT?=	${LIBSECUREBOOTDIR}/libsecureboot.a
 .endif
 
 .if ${MK_VERIEXEC} == "yes"
 _LIBRARIES+= veriexec
 
 LIBVERIEXEC?=	${LIBVERIEXECDIR}/libveriexec.a
 .endif
 
 # Each library's LIBADD needs to be duplicated here for static linkage of
 # 2nd+ order consumers.  Auto-generating this would be better.
 _DP_80211=	sbuf bsdxml
 _DP_archive=	z bz2 lzma bsdxml zstd
 _DP_zstd=	pthread
 .if ${MK_BLACKLIST} != "no"
 _DP_blacklist+=	pthread
 .endif
 _DP_crypto=	pthread
 .if ${MK_OPENSSL} != "no"
 _DP_archive+=	crypto
 .else
 _DP_archive+=	md
 .endif
 _DP_sqlite3=	pthread
 _DP_ssl=	crypto
 _DP_ssh=	crypto crypt z
 .if ${MK_LDNS} != "no"
 _DP_ssh+=	ldns
 .endif
 _DP_edit=	ncursesw
 .if ${MK_OPENSSL} != "no"
 _DP_bsnmp=	crypto
 .endif
 _DP_geom=	bsdxml sbuf
 _DP_cam=	sbuf
 _DP_kvm=	elf
 _DP_kyua_cli=		kyua_drivers kyua_engine kyua_model kyua_store kyua_utils
 _DP_kyua_drivers=	kyua_model kyua_engine kyua_store
 _DP_kyua_engine=	lutok kyua_utils
 _DP_kyua_model=		lutok
 _DP_kyua_utils=		lutok
 _DP_kyua_store=		kyua_model kyua_utils sqlite3
 _DP_casper=	nv
 _DP_cap_dns=	nv
 _DP_cap_fileargs=	nv
 _DP_cap_grp=	nv
 _DP_cap_pwd=	nv
 _DP_cap_sysctl=	nv
 _DP_cap_syslog=	nv
 .if ${MK_OFED} != "no"
 _DP_pcap=	ibverbs mlx5
 .endif
 _DP_pjdlog=	util
 _DP_opie=	md
 _DP_usb=	pthread
 _DP_unbound=	ssl crypto pthread
 _DP_rt=	pthread
 .if ${MK_OPENSSL} == "no"
 _DP_radius=	md
 .else
 _DP_radius=	crypto
 .endif
 _DP_rtld_db=	elf procstat
 _DP_procstat=	kvm util elf
 .if ${MK_CXX} == "yes"
 .if ${MK_LIBCPLUSPLUS} != "no"
 _DP_proc=	cxxrt
 .else
 _DP_proc=	supcplusplus
 .endif
 .endif
 .if ${MK_CDDL} != "no"
 _DP_proc+=	ctf
 .endif
 _DP_proc+=	elf procstat rtld_db util
 _DP_mp=	crypto
 _DP_memstat=	kvm
 _DP_magic=	z
 _DP_mt=		sbuf bsdxml
 _DP_ldns=	ssl crypto
 _DP_lua=	m
 _DP_lutok=	lua
 .if ${MK_OPENSSL} != "no"
 _DP_fetch=	ssl crypto
 .else
 _DP_fetch=	md
 .endif
 _DP_execinfo=	elf
 _DP_dwarf=	elf
 _DP_dpv=	dialog figpar util ncursesw
 _DP_dialog=	ncursesw m
 _DP_cuse=	pthread
 _DP_atf_cxx=	atf_c
 _DP_gtest=	pthread regex
 _DP_gmock=	gtest
 _DP_gmock_main=	gmock
 _DP_gtest_main=	gtest
 _DP_devstat=	kvm
 _DP_pam=	radius tacplus opie md util
 .if ${MK_KERBEROS} != "no"
 _DP_pam+=	krb5
 .endif
 .if ${MK_OPENSSH} != "no"
 _DP_pam+=	ssh
 .endif
 .if ${MK_NIS} != "no"
 _DP_pam+=	ypclnt
 .endif
 _DP_roken=	crypt
 _DP_kadm5clnt=	com_err krb5 roken
 _DP_kadm5srv=	com_err hdb krb5 roken
 _DP_heimntlm=	crypto com_err krb5 roken
 _DP_hx509=	asn1 com_err crypto roken wind
 _DP_hdb=	asn1 com_err krb5 roken sqlite3
 _DP_asn1=	com_err roken
 _DP_kdc=	roken hdb hx509 krb5 heimntlm asn1 crypto
 _DP_wind=	com_err roken
 _DP_heimbase=	pthread
 _DP_heimipcc=	heimbase roken pthread
 _DP_heimipcs=	heimbase roken pthread
 _DP_kafs5=	asn1 krb5 roken
 _DP_krb5+=	asn1 com_err crypt crypto hx509 roken wind heimbase heimipcc
 _DP_gssapi_krb5+=	gssapi krb5 crypto roken asn1 com_err
 _DP_lzma=	md pthread
 _DP_ucl=	m
 _DP_vmmapi=	util
 _DP_opencsd=	cxxrt
 _DP_ctf=	z
 _DP_dtrace=	ctf elf proc pthread rtld_db
 _DP_xo=		util
 # The libc dependencies are not strictly needed but are defined to make the
 # assert happy.
 _DP_c=		compiler_rt
 .if ${MK_SSP} != "no" && \
     (${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH:Mpower*} != "")
 _DP_c+=		ssp_nonshared
 .endif
 _DP_stats=	sbuf pthread
 _DP_stdthreads=	pthread
 _DP_tacplus=	md
 _DP_panel=	ncurses
 _DP_panelw=	ncursesw
 _DP_rpcsec_gss=	gssapi
 _DP_smb=	kiconv
 _DP_ulog=	md
 _DP_fifolog=	z
 _DP_ipf=	kvm
 _DP_zfs=	md pthread umem util uutil m nvpair avl bsdxml geom nvpair z \
 		zfs_core
 _DP_zfs_core=	nvpair
 _DP_zpool=	md pthread z nvpair avl umem
 _DP_be=		zfs nvpair
 
 # OFED support
 .if ${MK_OFED} != "no"
 _DP_cxgb4=	ibverbs pthread
 _DP_ibcm=	ibverbs
 _DP_ibmad=	ibumad
 _DP_ibnetdisc=	osmcomp ibmad ibumad
 _DP_ibumad=	
 _DP_ibverbs=
 _DP_mlx4=	ibverbs pthread
 _DP_mlx5=	ibverbs pthread
 _DP_rdmacm=	ibverbs
 _DP_osmcomp=	pthread
 _DP_opensm=	pthread
 _DP_osmvendor=	ibumad pthread
 .endif
 
 # Define special cases
 LDADD_supcplusplus=	-lsupc++
 LIBATF_C=	${LIBDESTDIR}${LIBDIR_BASE}/libprivateatf-c.a
 LIBATF_CXX=	${LIBDESTDIR}${LIBDIR_BASE}/libprivateatf-c++.a
 LDADD_atf_c=	-lprivateatf-c
 LDADD_atf_cxx=	-lprivateatf-c++
 
 LIBGMOCK=	${LIBDESTDIR}${LIBDIR_BASE}/libprivategmock.a
 LIBGMOCK_MAIN=	${LIBDESTDIR}${LIBDIR_BASE}/libprivategmock_main.a
 LIBGTEST=	${LIBDESTDIR}${LIBDIR_BASE}/libprivategtest.a
 LIBGTEST_MAIN=	${LIBDESTDIR}${LIBDIR_BASE}/libprivategtest_main.a
 LDADD_gmock=	-lprivategmock
 LDADD_gtest=	-lprivategtest
 LDADD_gmock_main= -lprivategmock_main
 LDADD_gtest_main= -lprivategtest_main
 
 .for _l in ${_PRIVATELIBS}
 LIB${_l:tu}?=	${LIBDESTDIR}${LIBDIR_BASE}/libprivate${_l}.a
 .endfor
 
 .if ${MK_PIE} != "no"
 PIE_SUFFIX=	_pie
 .endif
 
 .for _l in ${_LIBRARIES}
 .if ${_INTERNALLIBS:M${_l}} || !defined(SYSROOT)
 LDADD_${_l}_L+=		-L${LIB${_l:tu}DIR}
 .endif
 DPADD_${_l}?=	${LIB${_l:tu}}
 .if ${_PRIVATELIBS:M${_l}}
 LDADD_${_l}?=	-lprivate${_l}
 .elif ${_INTERNALLIBS:M${_l}}
 LDADD_${_l}?=	${LDADD_${_l}_L} -l${_l:S/${PIE_SUFFIX}//}${PIE_SUFFIX}
 .else
 LDADD_${_l}?=	${LDADD_${_l}_L} -l${_l}
 .endif
 # Add in all dependencies for static linkage.
 .if defined(_DP_${_l}) && (${_INTERNALLIBS:M${_l}} || \
     (defined(NO_SHARED) && ${NO_SHARED:tl} != "no"))
 .for _d in ${_DP_${_l}}
 DPADD_${_l}+=	${DPADD_${_d}}
 LDADD_${_l}+=	${LDADD_${_d}}
 .endfor
 .endif
 .endfor
 
 # These are special cases where the library is broken and anything that uses
 # it needs to add more dependencies.  Broken usually means that it has a
 # cyclic dependency and cannot link its own dependencies.  This is bad, please
 # fix the library instead.
 # Unless the library itself is broken then the proper place to define
 # dependencies is _DP_* above.
 
 # libatf-c++ exposes libatf-c abi hence we need to explicit link to atf_c for
 # atf_cxx
 DPADD_atf_cxx+=	${DPADD_atf_c}
 LDADD_atf_cxx+=	${LDADD_atf_c}
 
 DPADD_gmock+=	${DPADD_gtest}
 LDADD_gmock+=	${LDADD_gtest}
 
 DPADD_gmock_main+=	${DPADD_gmock}
 LDADD_gmock_main+=	${LDADD_gmock}
 
 DPADD_gtest_main+=	${DPADD_gtest}
 LDADD_gtest_main+=	${LDADD_gtest}
 
 # Detect LDADD/DPADD that should be LIBADD, before modifying LDADD here.
 _BADLDADD=
 .for _l in ${LDADD:M-l*:N-l*/*:C,^-l,,}
 .if ${_LIBRARIES:M${_l}} && !${_PRIVATELIBS:M${_l}}
 _BADLDADD+=	${_l}
 .endif
 .endfor
 .if !empty(_BADLDADD)
 .error ${.CURDIR}: These libraries should be LIBADD+=foo rather than DPADD/LDADD+=-lfoo: ${_BADLDADD}
 .endif
 
 .for _l in ${LIBADD}
 DPADD+=		${DPADD_${_l}}
 LDADD+=		${LDADD_${_l}}
 .endfor
 
 _LIB_OBJTOP?=	${OBJTOP}
 # INTERNALLIB definitions.
 LIBELFTCDIR=	${_LIB_OBJTOP}/lib/libelftc
 LIBELFTC?=	${LIBELFTCDIR}/libelftc${PIE_SUFFIX}.a
 
 LIBKYUA_CLIDIR=	${_LIB_OBJTOP}/lib/kyua/cli
 LIBKYUA_CLI?=	${LIBKYUA_CLIDIR}/libkyua_cli${PIE_SUFFIX}.a
 
 LIBKYUA_DRIVERSDIR=	${_LIB_OBJTOP}/lib/kyua/drivers
 LIBKYUA_DRIVERS?=	${LIBKYUA_DRIVERSDIR}/libkyua_drivers${PIE_SUFFIX}.a
 
 LIBKYUA_ENGINEDIR=	${_LIB_OBJTOP}/lib/kyua/engine
 LIBKYUA_ENGINE?=	${LIBKYUA_ENGINEDIR}/libkyua_engine${PIE_SUFFIX}.a
 
 LIBKYUA_MODELDIR=	${_LIB_OBJTOP}/lib/kyua/model
 LIBKYUA_MODEL?=		${LIBKYUA_MODELDIR}/libkyua_model${PIE_SUFFIX}.a
 
 LIBKYUA_STOREDIR=	${_LIB_OBJTOP}/lib/kyua/store
 LIBKYUA_STORE?=		${LIBKYUA_STOREDIR}/libkyua_store${PIE_SUFFIX}.a
 
 LIBKYUA_UTILSDIR=	${_LIB_OBJTOP}/lib/kyua/utils
 LIBKYUA_UTILS?=		${LIBKYUA_UTILSDIR}/libkyua_utils${PIE_SUFFIX}.a
 
 LIBLUADIR=	${_LIB_OBJTOP}/lib/liblua
 LIBLUA?=	${LIBLUADIR}/liblua${PIE_SUFFIX}.a
 
 LIBLUTOKDIR=	${_LIB_OBJTOP}/lib/liblutok
 LIBLUTOK?=	${LIBLUTOKDIR}/liblutok${PIE_SUFFIX}.a
 
 LIBPEDIR=	${_LIB_OBJTOP}/lib/libpe
 LIBPE?=		${LIBPEDIR}/libpe${PIE_SUFFIX}.a
 
 LIBOPENBSDDIR=	${_LIB_OBJTOP}/lib/libopenbsd
 LIBOPENBSD?=	${LIBOPENBSDDIR}/libopenbsd${PIE_SUFFIX}.a
 
 LIBSMDIR=	${_LIB_OBJTOP}/lib/libsm
 LIBSM?=		${LIBSMDIR}/libsm${PIE_SUFFIX}.a
 
 LIBSMDBDIR=	${_LIB_OBJTOP}/lib/libsmdb
 LIBSMDB?=	${LIBSMDBDIR}/libsmdb${PIE_SUFFIX}.a
 
 LIBSMUTILDIR=	${_LIB_OBJTOP}/lib/libsmutil
 LIBSMUTIL?=	${LIBSMUTILDIR}/libsmutil${PIE_SUFFIX}.a
 
 LIBNETBSDDIR?=	${_LIB_OBJTOP}/lib/libnetbsd
 LIBNETBSD?=	${LIBNETBSDDIR}/libnetbsd${PIE_SUFFIX}.a
 
 LIBVERSDIR?=	${_LIB_OBJTOP}/kerberos5/lib/libvers
 LIBVERS?=	${LIBVERSDIR}/libvers${PIE_SUFFIX}.a
 
 LIBSLDIR=	${_LIB_OBJTOP}/kerberos5/lib/libsl
 LIBSL?=		${LIBSLDIR}/libsl${PIE_SUFFIX}.a
 
 LIBIFCONFIGDIR=	${_LIB_OBJTOP}/lib/libifconfig
 LIBIFCONFIG?=	${LIBIFCONFIGDIR}/libifconfig${PIE_SUFFIX}.a
 
 LIBIPFDIR=	${_LIB_OBJTOP}/sbin/ipf/libipf
 LIBIPF?=	${LIBIPFDIR}/libipf${PIE_SUFFIX}.a
 
 LIBTELNETDIR=	${_LIB_OBJTOP}/lib/libtelnet
 LIBTELNET?=	${LIBTELNETDIR}/libtelnet${PIE_SUFFIX}.a
 
 LIBCRONDIR=	${_LIB_OBJTOP}/usr.sbin/cron/lib
 LIBCRON?=	${LIBCRONDIR}/libcron${PIE_SUFFIX}.a
 
 LIBNTPDIR=	${_LIB_OBJTOP}/usr.sbin/ntp/libntp
 LIBNTP?=	${LIBNTPDIR}/libntp${PIE_SUFFIX}.a
 
 LIBNTPEVENTDIR=	${_LIB_OBJTOP}/usr.sbin/ntp/libntpevent
 LIBNTPEVENT?=	${LIBNTPEVENTDIR}/libntpevent${PIE_SUFFIX}.a
 
 LIBOPTSDIR=	${_LIB_OBJTOP}/usr.sbin/ntp/libopts
 LIBOPTS?=	${LIBOPTSDIR}/libopts${PIE_SUFFIX}.a
 
 LIBPARSEDIR=	${_LIB_OBJTOP}/usr.sbin/ntp/libparse
 LIBPARSE?=	${LIBPARSEDIR}/libparse${PIE_SUFFIX}.a
 
 LIBLPRDIR=	${_LIB_OBJTOP}/usr.sbin/lpr/common_source
 LIBLPR?=	${LIBLPRDIR}/liblpr${PIE_SUFFIX}.a
 
 LIBFIFOLOGDIR=	${_LIB_OBJTOP}/usr.sbin/fifolog/lib
 LIBFIFOLOG?=	${LIBFIFOLOGDIR}/libfifolog${PIE_SUFFIX}.a
 
 LIBBSNMPTOOLSDIR=	${_LIB_OBJTOP}/usr.sbin/bsnmpd/tools/libbsnmptools
 LIBBSNMPTOOLS?=	${LIBBSNMPTOOLSDIR}/libbsnmptools${PIE_SUFFIX}.a
 
 LIBAMUDIR=	${_LIB_OBJTOP}/usr.sbin/amd/libamu
 LIBAMU?=	${LIBAMUDIR}/libamu${PIE_SUFFIX}.a
 
 LIBBE?=		${LIBBEDIR}/libbe${PIE_SUFFIX}.a
 
 LIBPMCSTATDIR=	${_LIB_OBJTOP}/lib/libpmcstat
 LIBPMCSTAT?=	${LIBPMCSTATDIR}/libpmcstat${PIE_SUFFIX}.a
 
 LIBC_NOSSP_PICDIR=	${_LIB_OBJTOP}/lib/libc
 LIBC_NOSSP_PIC?=	${LIBC_NOSSP_PICDIR}/libc_nossp_pic.a
 
 # Define a directory for each library.  This is useful for adding -L in when
 # not using a --sysroot or for meta mode bootstrapping when there is no
 # Makefile.depend.  These are sorted by directory.
 LIBAVLDIR=	${OBJTOP}/cddl/lib/libavl
 LIBCTFDIR=	${OBJTOP}/cddl/lib/libctf
 LIBDTRACEDIR=	${OBJTOP}/cddl/lib/libdtrace
 LIBNVPAIRDIR=	${OBJTOP}/cddl/lib/libnvpair
 LIBUMEMDIR=	${OBJTOP}/cddl/lib/libumem
 LIBUUTILDIR=	${OBJTOP}/cddl/lib/libuutil
 LIBZFSDIR=	${OBJTOP}/cddl/lib/libzfs
 LIBZFS_COREDIR=	${OBJTOP}/cddl/lib/libzfs_core
 LIBZPOOLDIR=	${OBJTOP}/cddl/lib/libzpool
 
 # OFED support
 LIBCXGB4DIR=	${OBJTOP}/lib/ofed/libcxgb4
 LIBIBCMDIR=	${OBJTOP}/lib/ofed/libibcm
 LIBIBMADDIR=	${OBJTOP}/lib/ofed/libibmad
 LIBIBNETDISCDIR=${OBJTOP}/lib/ofed/libibnetdisc
 LIBIBUMADDIR=	${OBJTOP}/lib/ofed/libibumad
 LIBIBVERBSDIR=	${OBJTOP}/lib/ofed/libibverbs
 LIBMLX4DIR=	${OBJTOP}/lib/ofed/libmlx4
 LIBMLX5DIR=	${OBJTOP}/lib/ofed/libmlx5
 LIBRDMACMDIR=	${OBJTOP}/lib/ofed/librdmacm
 LIBOSMCOMPDIR=	${OBJTOP}/lib/ofed/complib
 LIBOPENSMDIR=	${OBJTOP}/lib/ofed/libopensm
 LIBOSMVENDORDIR=${OBJTOP}/lib/ofed/libvendor
 
 LIBDIALOGDIR=	${OBJTOP}/gnu/lib/libdialog
 LIBGNUREGEXDIR=	${OBJTOP}/gnu/lib/libregex
 LIBSSPDIR=	${OBJTOP}/lib/libssp
 LIBSSP_NONSHAREDDIR=	${OBJTOP}/lib/libssp_nonshared
 LIBASN1DIR=	${OBJTOP}/kerberos5/lib/libasn1
 LIBGSSAPI_KRB5DIR=	${OBJTOP}/kerberos5/lib/libgssapi_krb5
 LIBGSSAPI_NTLMDIR=	${OBJTOP}/kerberos5/lib/libgssapi_ntlm
 LIBGSSAPI_SPNEGODIR=	${OBJTOP}/kerberos5/lib/libgssapi_spnego
 LIBHDBDIR=	${OBJTOP}/kerberos5/lib/libhdb
 LIBHEIMBASEDIR=	${OBJTOP}/kerberos5/lib/libheimbase
 LIBHEIMIPCCDIR=	${OBJTOP}/kerberos5/lib/libheimipcc
 LIBHEIMIPCSDIR=	${OBJTOP}/kerberos5/lib/libheimipcs
 LIBHEIMNTLMDIR=	${OBJTOP}/kerberos5/lib/libheimntlm
 LIBHX509DIR=	${OBJTOP}/kerberos5/lib/libhx509
 LIBKADM5CLNTDIR=	${OBJTOP}/kerberos5/lib/libkadm5clnt
 LIBKADM5SRVDIR=	${OBJTOP}/kerberos5/lib/libkadm5srv
 LIBKAFS5DIR=	${OBJTOP}/kerberos5/lib/libkafs5
 LIBKDCDIR=	${OBJTOP}/kerberos5/lib/libkdc
 LIBKRB5DIR=	${OBJTOP}/kerberos5/lib/libkrb5
 LIBROKENDIR=	${OBJTOP}/kerberos5/lib/libroken
 LIBWINDDIR=	${OBJTOP}/kerberos5/lib/libwind
 LIBATF_CDIR=	${OBJTOP}/lib/atf/libatf-c
 LIBATF_CXXDIR=	${OBJTOP}/lib/atf/libatf-c++
 LIBGMOCKDIR=	${OBJTOP}/lib/googletest/gmock
 LIBGMOCK_MAINDIR=	${OBJTOP}/lib/googletest/gmock_main
 LIBGTESTDIR=	${OBJTOP}/lib/googletest/gtest
 LIBGTEST_MAINDIR=	${OBJTOP}/lib/googletest/gtest_main
 LIBALIASDIR=	${OBJTOP}/lib/libalias/libalias
 LIBBLACKLISTDIR=	${OBJTOP}/lib/libblacklist
 LIBBLOCKSRUNTIMEDIR=	${OBJTOP}/lib/libblocksruntime
 LIBBSNMPDIR=	${OBJTOP}/lib/libbsnmp/libbsnmp
 LIBCASPERDIR=	${OBJTOP}/lib/libcasper/libcasper
 LIBCAP_DNSDIR=	${OBJTOP}/lib/libcasper/services/cap_dns
 LIBCAP_GRPDIR=	${OBJTOP}/lib/libcasper/services/cap_grp
+LIBCAP_NETDIR=	${OBJTOP}/lib/libcasper/services/cap_net
 LIBCAP_PWDDIR=	${OBJTOP}/lib/libcasper/services/cap_pwd
 LIBCAP_SYSCTLDIR=	${OBJTOP}/lib/libcasper/services/cap_sysctl
 LIBCAP_SYSLOGDIR=	${OBJTOP}/lib/libcasper/services/cap_syslog
 LIBBSDXMLDIR=	${OBJTOP}/lib/libexpat
 LIBKVMDIR=	${OBJTOP}/lib/libkvm
 LIBPTHREADDIR=	${OBJTOP}/lib/libthr
 LIBMDIR=	${OBJTOP}/lib/msun
 LIBFORMDIR=	${OBJTOP}/lib/ncurses/form
 LIBFORMLIBWDIR=	${OBJTOP}/lib/ncurses/formw
 LIBMENUDIR=	${OBJTOP}/lib/ncurses/menu
 LIBMENULIBWDIR=	${OBJTOP}/lib/ncurses/menuw
 LIBNCURSESDIR=	${OBJTOP}/lib/ncurses/ncurses
 LIBNCURSESWDIR=	${OBJTOP}/lib/ncurses/ncursesw
 LIBPANELDIR=	${OBJTOP}/lib/ncurses/panel
 LIBPANELWDIR=	${OBJTOP}/lib/ncurses/panelw
 LIBCRYPTODIR=	${OBJTOP}/secure/lib/libcrypto
 LIBSSHDIR=	${OBJTOP}/secure/lib/libssh
 LIBSSLDIR=	${OBJTOP}/secure/lib/libssl
 LIBTEKENDIR=	${OBJTOP}/sys/teken/libteken
 LIBEGACYDIR=	${OBJTOP}/tools/build
 LIBLNDIR=	${OBJTOP}/usr.bin/lex/lib
 
 LIBTERMCAPDIR=	${LIBNCURSESDIR}
 LIBTERMCAPWDIR=	${LIBNCURSESWDIR}
 
 # Default other library directories to lib/libNAME.
 .for lib in ${_LIBRARIES}
 LIB${lib:tu}DIR?=	${OBJTOP}/lib/lib${lib}
 .endfor
 
 # Validate that listed LIBADD are valid.
 .for _l in ${LIBADD}
 .if empty(_LIBRARIES:M${_l})
 _BADLIBADD+= ${_l}
 .endif
 .endfor
 .if !empty(_BADLIBADD)
 .error ${.CURDIR}: Invalid LIBADD used which may need to be added to ${_this:T}: ${_BADLIBADD}
 .endif
 
 # Sanity check that libraries are defined here properly when building them.
 .if defined(LIB) && ${_LIBRARIES:M${LIB}} != ""
 .if !empty(LIBADD) && \
     (!defined(_DP_${LIB}) || ${LIBADD:O:u} != ${_DP_${LIB}:O:u})
 .error ${.CURDIR}: Missing or incorrect _DP_${LIB} entry in ${_this:T}.  Should match LIBADD for ${LIB} ('${LIBADD}' vs '${_DP_${LIB}}')
 .endif
 # Note that OBJTOP is not yet defined here but for the purpose of the check
 # it is fine as it resolves to the SRC directory.
 .if !defined(LIB${LIB:tu}DIR) || !exists(${SRCTOP}/${LIB${LIB:tu}DIR:S,^${OBJTOP}/,,})
 .error ${.CURDIR}: Missing or incorrect value for LIB${LIB:tu}DIR in ${_this:T}: ${LIB${LIB:tu}DIR:S,^${OBJTOP}/,,}
 .endif
 .if ${_INTERNALLIBS:M${LIB}} != "" && !defined(LIB${LIB:tu})
 .error ${.CURDIR}: Missing value for LIB${LIB:tu} in ${_this:T}.  Likely should be: LIB${LIB:tu}?= $${LIB${LIB:tu}DIR}/lib${LIB}.a
 .endif
 .endif
 
 .endif	# !target(__<src.libnames.mk>__)
Index: projects/clang1100-import/sys/compat/linuxkpi/common/include/linux/fs.h
===================================================================
--- projects/clang1100-import/sys/compat/linuxkpi/common/include/linux/fs.h	(revision 364278)
+++ projects/clang1100-import/sys/compat/linuxkpi/common/include/linux/fs.h	(revision 364279)
@@ -1,305 +1,305 @@
 /*-
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
  * Copyright (c) 2013-2018 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #ifndef	_LINUX_FS_H_
 #define	_LINUX_FS_H_
 
 #include <sys/cdefs.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <linux/semaphore.h>
 #include <linux/spinlock.h>
 #include <linux/dcache.h>
 
 struct module;
 struct kiocb;
 struct iovec;
 struct dentry;
 struct page;
 struct file_lock;
 struct pipe_inode_info;
 struct vm_area_struct;
 struct poll_table_struct;
 struct files_struct;
 struct pfs_node;
 struct linux_cdev;
 
 #define	inode	vnode
 #define	i_cdev	v_rdev
 #define	i_private v_data
 
 #define	S_IRUGO	(S_IRUSR | S_IRGRP | S_IROTH)
 #define	S_IWUGO	(S_IWUSR | S_IWGRP | S_IWOTH)
 
 
 typedef struct files_struct *fl_owner_t;
 
 struct file_operations;
 
 struct linux_file_wait_queue {
 	struct wait_queue wq;
 	struct wait_queue_head *wqh;
 	atomic_t state;
 #define	LINUX_FWQ_STATE_INIT 0
 #define	LINUX_FWQ_STATE_NOT_READY 1
 #define	LINUX_FWQ_STATE_QUEUED 2
 #define	LINUX_FWQ_STATE_READY 3
 #define	LINUX_FWQ_STATE_MAX 4
 };
 
 struct linux_file {
 	struct file	*_file;
 	const struct file_operations	*f_op;
 	void		*private_data;
 	int		f_flags;
 	int		f_mode;	/* Just starting mode. */
 	struct dentry	*f_dentry;
 	struct dentry	f_dentry_store;
 	struct selinfo	f_selinfo;
 	struct sigio	*f_sigio;
 	struct vnode	*f_vnode;
 #define	f_inode	f_vnode
 	volatile u_int	f_count;
 
 	/* anonymous shmem object */
 	vm_object_t	f_shmem;
 
 	/* kqfilter support */
 	int		f_kqflags;
 #define	LINUX_KQ_FLAG_HAS_READ (1 << 0)
 #define	LINUX_KQ_FLAG_HAS_WRITE (1 << 1)
 #define	LINUX_KQ_FLAG_NEED_READ (1 << 2)
 #define	LINUX_KQ_FLAG_NEED_WRITE (1 << 3)
 	/* protects f_selinfo.si_note */
 	spinlock_t	f_kqlock;
 	struct linux_file_wait_queue f_wait_queue;
 
 	/* pointer to associated character device, if any */
 	struct linux_cdev *f_cdev;
 };
 
 #define	file		linux_file
 #define	fasync_struct	sigio *
 
 #define	fasync_helper(fd, filp, on, queue)				\
 ({									\
 	if ((on))							\
 		*(queue) = &(filp)->f_sigio;				\
 	else								\
 		*(queue) = NULL;					\
 	0;								\
 })
 
 #define	kill_fasync(queue, sig, pollstat)				\
 do {									\
 	if (*(queue) != NULL)						\
 		pgsigio(*(queue), (sig), 0);				\
 } while (0)
 
 typedef int (*filldir_t)(void *, const char *, int, off_t, u64, unsigned);
 
 struct file_operations {
 	struct module *owner;
 	ssize_t (*read)(struct linux_file *, char __user *, size_t, off_t *);
 	ssize_t (*write)(struct linux_file *, const char __user *, size_t, off_t *);
 	unsigned int (*poll) (struct linux_file *, struct poll_table_struct *);
 	long (*unlocked_ioctl)(struct linux_file *, unsigned int, unsigned long);
 	long (*compat_ioctl)(struct linux_file *, unsigned int, unsigned long);
 	int (*mmap)(struct linux_file *, struct vm_area_struct *);
 	int (*open)(struct inode *, struct file *);
 	int (*release)(struct inode *, struct linux_file *);
 	int (*fasync)(int, struct linux_file *, int);
 
 /* Although not supported in FreeBSD, to align with Linux code
  * we are adding llseek() only when it is mapped to no_llseek which returns
  * an illegal seek error
  */
 	off_t (*llseek)(struct linux_file *, off_t, int);
 #if 0
 	/* We do not support these methods.  Don't permit them to compile. */
 	loff_t (*llseek)(struct file *, loff_t, int);
 	ssize_t (*aio_read)(struct kiocb *, const struct iovec *,
 	    unsigned long, loff_t);
 	ssize_t (*aio_write)(struct kiocb *, const struct iovec *,
 	    unsigned long, loff_t);
 	int (*readdir)(struct file *, void *, filldir_t);
 	int (*ioctl)(struct inode *, struct file *, unsigned int,
 	    unsigned long);
 	int (*flush)(struct file *, fl_owner_t id);
 	int (*fsync)(struct file *, struct dentry *, int datasync);
 	int (*aio_fsync)(struct kiocb *, int datasync);
 	int (*lock)(struct file *, int, struct file_lock *);
 	ssize_t (*sendpage)(struct file *, struct page *, int, size_t,
 	    loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long,
 	    unsigned long, unsigned long, unsigned long);
 	int (*check_flags)(int);
 	int (*flock)(struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
 	    loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *,
 	    struct pipe_inode_info *, size_t, unsigned int);
 	int (*setlease)(struct file *, long, struct file_lock **);
 #endif
 };
 #define	fops_get(fops)		(fops)
 #define	replace_fops(f, fops)	((f)->f_op = (fops))
 
 #define	FMODE_READ	FREAD
 #define	FMODE_WRITE	FWRITE
 #define	FMODE_EXEC	FEXEC
 
 int __register_chrdev(unsigned int major, unsigned int baseminor,
     unsigned int count, const char *name,
     const struct file_operations *fops);
 int __register_chrdev_p(unsigned int major, unsigned int baseminor,
     unsigned int count, const char *name,
     const struct file_operations *fops, uid_t uid,
     gid_t gid, int mode);
 void __unregister_chrdev(unsigned int major, unsigned int baseminor,
     unsigned int count, const char *name);
 
 static inline void
 unregister_chrdev(unsigned int major, const char *name)
 {
 
 	__unregister_chrdev(major, 0, 256, name);
 }
 
 static inline int
 register_chrdev(unsigned int major, const char *name,
     const struct file_operations *fops)
 {
 
 	return (__register_chrdev(major, 0, 256, name, fops));
 }
 
 static inline int
 register_chrdev_p(unsigned int major, const char *name,
     const struct file_operations *fops, uid_t uid, gid_t gid, int mode)
 {
 
 	return (__register_chrdev_p(major, 0, 256, name, fops, uid, gid, mode));
 }
 
 static inline int
 register_chrdev_region(dev_t dev, unsigned range, const char *name)
 {
 
 	return 0;
 }
 
 static inline void
 unregister_chrdev_region(dev_t dev, unsigned range)
 {
 
 	return;
 }
 
 static inline int
 alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
 			const char *name)
 {
 
 	return 0;
 }
 
 /* No current support for seek op in FreeBSD */
 static inline int
 nonseekable_open(struct inode *inode, struct file *filp)
 {
 	return 0;
 }
 
 extern unsigned int linux_iminor(struct inode *);
 #define	iminor(...) linux_iminor(__VA_ARGS__)
 
 static inline struct linux_file *
 get_file(struct linux_file *f)
 {
 
 	refcount_acquire(f->_file == NULL ? &f->f_count : &f->_file->f_count);
 	return (f);
 }
 
 static inline struct inode *
 igrab(struct inode *inode)
 {
 	int error;
 
-	error = vget(inode, 0, curthread);
+	error = vget(inode, 0);
 	if (error)
 		return (NULL);
 
 	return (inode);
 }
 
 static inline void
 iput(struct inode *inode)
 {
 
 	vrele(inode);
 }
 
 static inline loff_t
 no_llseek(struct file *file, loff_t offset, int whence)
 {
 
 	return (-ESPIPE);
 }
 
 static inline loff_t
 noop_llseek(struct linux_file *file, loff_t offset, int whence)
 {
 
 	return (file->_file->f_offset);
 }
 
 static inline struct vnode *
 file_inode(const struct linux_file *file)
 {
 
 	return (file->f_vnode);
 }
 
 static inline int
 call_mmap(struct linux_file *file, struct vm_area_struct *vma)
 {
 
 	return (file->f_op->mmap(file, vma));
 }
 
 #endif /* _LINUX_FS_H_ */
Index: projects/clang1100-import/sys/fs/autofs/autofs_vnops.c
===================================================================
--- projects/clang1100-import/sys/fs/autofs/autofs_vnops.c	(revision 364278)
+++ projects/clang1100-import/sys/fs/autofs/autofs_vnops.c	(revision 364279)
@@ -1,715 +1,715 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2014 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/condvar.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/lock.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 #include <sys/vnode.h>
 #include <machine/atomic.h>
 #include <vm/uma.h>
 
 #include <fs/autofs/autofs.h>
 
 static int	autofs_trigger_vn(struct vnode *vp, const char *path,
 		    int pathlen, struct vnode **newvp);
 
 extern struct autofs_softc	*autofs_softc;
 
 static int
 autofs_access(struct vop_access_args *ap)
 {
 
 	/*
 	 * Nothing to do here; the only kind of access control
 	 * needed is in autofs_mkdir().
 	 */
 
 	return (0);
 }
 
 static int
 autofs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp, *newvp;
 	struct autofs_node *anp;
 	struct mount *mp;
 	struct vattr *vap;
 	int error;
 
 	vp = ap->a_vp;
 	anp = vp->v_data;
 	mp = vp->v_mount;
 	vap = ap->a_vap;
 
 	KASSERT(ap->a_vp->v_type == VDIR, ("!VDIR"));
 
 	/*
 	 * The reason we must do this is that some tree-walking software,
 	 * namely fts(3), assumes that stat(".") results will not change
 	 * between chdir("subdir") and chdir(".."), and fails with ENOENT
 	 * otherwise.
 	 */
 	if (autofs_mount_on_stat && autofs_cached(anp, NULL, 0) == false &&
 	    autofs_ignore_thread(curthread) == false) {
 		error = autofs_trigger_vn(vp, "", 0, &newvp);
 		if (error != 0)
 			return (error);
 
 		if (newvp != NULL) {
 			error = VOP_GETATTR(newvp, ap->a_vap,
 			    ap->a_cred);
 			vput(newvp);
 			return (error);
 		}
 	}
 
 	vap->va_type = VDIR;
 	vap->va_mode = 0755;
 	vap->va_nlink = 3; /* XXX */
 	vap->va_uid = 0;
 	vap->va_gid = 0;
 	vap->va_rdev = NODEV;
 	vap->va_fsid = mp->mnt_stat.f_fsid.val[0];
 	vap->va_fileid = anp->an_fileno;
 	vap->va_size = S_BLKSIZE;
 	vap->va_blocksize = S_BLKSIZE;
 	vap->va_mtime = anp->an_ctime;
 	vap->va_atime = anp->an_ctime;
 	vap->va_ctime = anp->an_ctime;
 	vap->va_birthtime = anp->an_ctime;
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_rdev = 0;
 	vap->va_bytes = S_BLKSIZE;
 	vap->va_filerev = 0;
 	vap->va_spare = 0;
 
 	return (0);
 }
 
 /*
  * Unlock the vnode, request automountd(8) action, and then lock it back.
  * If anything got mounted on top of the vnode, return the new filesystem's
  * root vnode in 'newvp', locked.
  */
 static int
 autofs_trigger_vn(struct vnode *vp, const char *path, int pathlen,
     struct vnode **newvp)
 {
 	struct autofs_node *anp;
 	int error, lock_flags;
 
 	anp = vp->v_data;
 
 	/*
 	 * Release the vnode lock, so that other operations, in partcular
 	 * mounting a filesystem on top of it, can proceed.  Increase use
 	 * count, to prevent the vnode from being deallocated and to prevent
 	 * filesystem from being unmounted.
 	 */
 	lock_flags = VOP_ISLOCKED(vp);
 	vref(vp);
 	VOP_UNLOCK(vp);
 
 	sx_xlock(&autofs_softc->sc_lock);
 
 	/*
 	 * XXX: Workaround for mounting the same thing multiple times; revisit.
 	 */
 	if (vp->v_mountedhere != NULL) {
 		error = 0;
 		goto mounted;
 	}
 
 	error = autofs_trigger(anp, path, pathlen);
 mounted:
 	sx_xunlock(&autofs_softc->sc_lock);
 	vn_lock(vp, lock_flags | LK_RETRY);
 	vunref(vp);
 	if (VN_IS_DOOMED(vp)) {
 		AUTOFS_DEBUG("VIRF_DOOMED");
 		return (ENOENT);
 	}
 
 	if (error != 0)
 		return (error);
 
 	if (vp->v_mountedhere == NULL) {
 		*newvp = NULL;
 		return (0);
 	} else {
 		/*
 		 * If the operation that succeeded was mount, then mark
 		 * the node as non-cached.  Otherwise, if someone unmounts
 		 * the filesystem before the cache times out, we will fail
 		 * to trigger.
 		 */
 		anp->an_cached = false;
 	}
 
 	error = VFS_ROOT(vp->v_mountedhere, lock_flags, newvp);
 	if (error != 0) {
 		AUTOFS_WARN("VFS_ROOT() failed with error %d", error);
 		return (error);
 	}
 
 	return (0);
 }
 
 static int
 autofs_vget_callback(struct mount *mp, void *arg, int flags,
     struct vnode **vpp)
 {
 
 
 	return (autofs_node_vn(arg, mp, flags, vpp));
 }
 
 static int
 autofs_lookup(struct vop_lookup_args *ap)
 {
 	struct vnode *dvp, *newvp, **vpp;
 	struct mount *mp;
 	struct autofs_mount *amp;
 	struct autofs_node *anp, *child;
 	struct componentname *cnp;
 	int error;
 
 	dvp = ap->a_dvp;
 	vpp = ap->a_vpp;
 	mp = dvp->v_mount;
 	amp = VFSTOAUTOFS(mp);
 	anp = dvp->v_data;
 	cnp = ap->a_cnp;
 
 	if (cnp->cn_flags & ISDOTDOT) {
 		KASSERT(anp->an_parent != NULL, ("NULL parent"));
 		/*
 		 * Note that in this case, dvp is the child vnode, and we
 		 * are looking up the parent vnode - exactly reverse from
 		 * normal operation.  Unlocking dvp requires some rather
 		 * tricky unlock/relock dance to prevent mp from being freed;
 		 * use vn_vget_ino_gen() which takes care of all that.
 		 */
 		error = vn_vget_ino_gen(dvp, autofs_vget_callback,
 		    anp->an_parent, cnp->cn_lkflags, vpp);
 		if (error != 0) {
 			AUTOFS_WARN("vn_vget_ino_gen() failed with error %d",
 			    error);
 			return (error);
 		}
 		return (error);
 	}
 
 	if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
 		vref(dvp);
 		*vpp = dvp;
 
 		return (0);
 	}
 
 	if (autofs_cached(anp, cnp->cn_nameptr, cnp->cn_namelen) == false &&
 	    autofs_ignore_thread(cnp->cn_thread) == false) {
 		error = autofs_trigger_vn(dvp,
 		    cnp->cn_nameptr, cnp->cn_namelen, &newvp);
 		if (error != 0)
 			return (error);
 
 		if (newvp != NULL) {
 			/*
 			 * The target filesystem got automounted.
 			 * Let the lookup(9) go around with the same
 			 * path component.
 			 */
 			vput(newvp);
 			return (ERELOOKUP);
 		}
 	}
 
 	AUTOFS_SLOCK(amp);
 	error = autofs_node_find(anp, cnp->cn_nameptr, cnp->cn_namelen, &child);
 	if (error != 0) {
 		if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE) {
 			AUTOFS_SUNLOCK(amp);
 			return (EJUSTRETURN);
 		}
 
 		AUTOFS_SUNLOCK(amp);
 		return (ENOENT);
 	}
 
 	/*
 	 * XXX: Dropping the node here is ok, because we never remove nodes.
 	 */
 	AUTOFS_SUNLOCK(amp);
 
 	error = autofs_node_vn(child, mp, cnp->cn_lkflags, vpp);
 	if (error != 0) {
 		if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE)
 			return (EJUSTRETURN);
 
 		return (error);
 	}
 
 	return (0);
 }
 
 static int
 autofs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct vnode *vp;
 	struct autofs_node *anp;
 	struct autofs_mount *amp;
 	struct autofs_node *child;
 	int error;
 
 	vp = ap->a_dvp;
 	anp = vp->v_data;
 	amp = VFSTOAUTOFS(vp->v_mount);
 
 	/*
 	 * Do not allow mkdir() if the calling thread is not
 	 * automountd(8) descendant.
 	 */
 	if (autofs_ignore_thread(curthread) == false)
 		return (EPERM);
 
 	AUTOFS_XLOCK(amp);
 	error = autofs_node_new(anp, amp, ap->a_cnp->cn_nameptr,
 	    ap->a_cnp->cn_namelen, &child);
 	if (error != 0) {
 		AUTOFS_XUNLOCK(amp);
 		return (error);
 	}
 	AUTOFS_XUNLOCK(amp);
 
 	error = autofs_node_vn(child, vp->v_mount, LK_EXCLUSIVE, ap->a_vpp);
 
 	return (error);
 }
 
 static int
 autofs_print(struct vop_print_args *ap)
 {
 	struct vnode *vp;
 	struct autofs_node *anp;
 
 	vp = ap->a_vp;
 	anp = vp->v_data;
 
 	printf("    name \"%s\", fileno %d, cached %d, wildcards %d\n",
 	    anp->an_name, anp->an_fileno, anp->an_cached, anp->an_wildcards);
 
 	return (0);
 }
 
 /*
  * Write out a single 'struct dirent', based on 'name' and 'fileno' arguments.
  */
 static int
 autofs_readdir_one(struct uio *uio, const char *name, int fileno,
     size_t *reclenp)
 {
 	struct dirent dirent;
 	size_t namlen, reclen;
 	int error;
 
 	namlen = strlen(name);
 	reclen = _GENERIC_DIRLEN(namlen);
 	if (reclenp != NULL)
 		*reclenp = reclen;
 
 	if (uio == NULL)
 		return (0);
 
 	if (uio->uio_resid < reclen)
 		return (EINVAL);
 
 	dirent.d_fileno = fileno;
 	dirent.d_reclen = reclen;
 	dirent.d_type = DT_DIR;
 	dirent.d_namlen = namlen;
 	memcpy(dirent.d_name, name, namlen);
 	dirent_terminate(&dirent);
 	error = uiomove(&dirent, reclen, uio);
 
 	return (error);
 }
 
 static size_t
 autofs_dirent_reclen(const char *name)
 {
 	size_t reclen;
 
 	(void)autofs_readdir_one(NULL, name, -1, &reclen);
 
 	return (reclen);
 }
 
 static int
 autofs_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp, *newvp;
 	struct autofs_mount *amp;
 	struct autofs_node *anp, *child;
 	struct uio *uio;
 	size_t reclen, reclens;
 	ssize_t initial_resid;
 	int error;
 
 	vp = ap->a_vp;
 	amp = VFSTOAUTOFS(vp->v_mount);
 	anp = vp->v_data;
 	uio = ap->a_uio;
 	initial_resid = ap->a_uio->uio_resid;
 
 	KASSERT(vp->v_type == VDIR, ("!VDIR"));
 
 	if (autofs_cached(anp, NULL, 0) == false &&
 	    autofs_ignore_thread(curthread) == false) {
 		error = autofs_trigger_vn(vp, "", 0, &newvp);
 		if (error != 0)
 			return (error);
 
 		if (newvp != NULL) {
 			error = VOP_READDIR(newvp, ap->a_uio, ap->a_cred,
 			    ap->a_eofflag, ap->a_ncookies, ap->a_cookies);
 			vput(newvp);
 			return (error);
 		}
 	}
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	if (ap->a_eofflag != NULL)
 		*ap->a_eofflag = FALSE;
 
 	/*
 	 * Write out the directory entry for ".".  This is conditional
 	 * on the current offset into the directory; same applies to the
 	 * other two cases below.
 	 */
 	if (uio->uio_offset == 0) {
 		error = autofs_readdir_one(uio, ".", anp->an_fileno, &reclen);
 		if (error != 0)
 			goto out;
 	}
 	reclens = autofs_dirent_reclen(".");
 
 	/*
 	 * Write out the directory entry for "..".
 	 */
 	if (uio->uio_offset <= reclens) {
 		if (uio->uio_offset != reclens)
 			return (EINVAL);
 		if (anp->an_parent == NULL) {
 			error = autofs_readdir_one(uio, "..",
 			    anp->an_fileno, &reclen);
 		} else {
 			error = autofs_readdir_one(uio, "..",
 			    anp->an_parent->an_fileno, &reclen);
 		}
 		if (error != 0)
 			goto out;
 	}
 
 	reclens += autofs_dirent_reclen("..");
 
 	/*
 	 * Write out the directory entries for subdirectories.
 	 */
 	AUTOFS_SLOCK(amp);
 	RB_FOREACH(child, autofs_node_tree, &anp->an_children) {
 		/*
 		 * Check the offset to skip entries returned by previous
 		 * calls to getdents().
 		 */
 		if (uio->uio_offset > reclens) {
 			reclens += autofs_dirent_reclen(child->an_name);
 			continue;
 		}
 
 		/*
 		 * Prevent seeking into the middle of dirent.
 		 */
 		if (uio->uio_offset != reclens) {
 			AUTOFS_SUNLOCK(amp);
 			return (EINVAL);
 		}
 
 		error = autofs_readdir_one(uio, child->an_name,
 		    child->an_fileno, &reclen);
 		reclens += reclen;
 		if (error != 0) {
 			AUTOFS_SUNLOCK(amp);
 			goto out;
 		}
 	}
 	AUTOFS_SUNLOCK(amp);
 
 	if (ap->a_eofflag != NULL)
 		*ap->a_eofflag = TRUE;
 
 	return (0);
 
 out:
 	/*
 	 * Return error if the initial buffer was too small to do anything.
 	 */
 	if (uio->uio_resid == initial_resid)
 		return (error);
 
 	/*
 	 * Don't return an error if we managed to copy out some entries.
 	 */
 	if (uio->uio_resid < reclen)
 		return (0);
 
 	return (error);
 }
 
 static int
 autofs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp;
 	struct autofs_node *anp;
 
 	vp = ap->a_vp;
 	anp = vp->v_data;
 
 	/*
 	 * We do not free autofs_node here; instead we are
 	 * destroying them in autofs_node_delete().
 	 */
 	sx_xlock(&anp->an_vnode_lock);
 	anp->an_vnode = NULL;
 	vp->v_data = NULL;
 	sx_xunlock(&anp->an_vnode_lock);
 
 	return (0);
 }
 
 struct vop_vector autofs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		autofs_access,
 	.vop_lookup =		autofs_lookup,
 	.vop_create =		VOP_EOPNOTSUPP,
 	.vop_getattr =		autofs_getattr,
 	.vop_link =		VOP_EOPNOTSUPP,
 	.vop_mkdir =		autofs_mkdir,
 	.vop_mknod =		VOP_EOPNOTSUPP,
 	.vop_print =		autofs_print,
 	.vop_read =		VOP_EOPNOTSUPP,
 	.vop_readdir =		autofs_readdir,
 	.vop_remove =		VOP_EOPNOTSUPP,
 	.vop_rename =		VOP_EOPNOTSUPP,
 	.vop_rmdir =		VOP_EOPNOTSUPP,
 	.vop_setattr =		VOP_EOPNOTSUPP,
 	.vop_symlink =		VOP_EOPNOTSUPP,
 	.vop_write =		VOP_EOPNOTSUPP,
 	.vop_reclaim =		autofs_reclaim,
 };
 VFS_VOP_VECTOR_REGISTER(autofs_vnodeops);
 
 int
 autofs_node_new(struct autofs_node *parent, struct autofs_mount *amp,
     const char *name, int namelen, struct autofs_node **anpp)
 {
 	struct autofs_node *anp;
 
 	if (parent != NULL) {
 		AUTOFS_ASSERT_XLOCKED(parent->an_mount);
 
 		KASSERT(autofs_node_find(parent, name, namelen, NULL) == ENOENT,
 		    ("node \"%s\" already exists", name));
 	}
 
 	anp = uma_zalloc(autofs_node_zone, M_WAITOK | M_ZERO);
 	if (namelen >= 0)
 		anp->an_name = strndup(name, namelen, M_AUTOFS);
 	else
 		anp->an_name = strdup(name, M_AUTOFS);
 	anp->an_fileno = atomic_fetchadd_int(&amp->am_last_fileno, 1);
 	callout_init(&anp->an_callout, 1);
 	/*
 	 * The reason for SX_NOWITNESS here is that witness(4)
 	 * cannot tell vnodes apart, so the following perfectly
 	 * valid lock order...
 	 *
 	 * vnode lock A -> autofsvlk B -> vnode lock B
 	 *
 	 * ... gets reported as a LOR.
 	 */
 	sx_init_flags(&anp->an_vnode_lock, "autofsvlk", SX_NOWITNESS);
 	getnanotime(&anp->an_ctime);
 	anp->an_parent = parent;
 	anp->an_mount = amp;
 	if (parent != NULL)
 		RB_INSERT(autofs_node_tree, &parent->an_children, anp);
 	RB_INIT(&anp->an_children);
 
 	*anpp = anp;
 	return (0);
 }
 
 int
 autofs_node_find(struct autofs_node *parent, const char *name,
     int namelen, struct autofs_node **anpp)
 {
 	struct autofs_node *anp, find;
 	int error;
 
 	AUTOFS_ASSERT_LOCKED(parent->an_mount);
 
 	if (namelen >= 0)
 		find.an_name = strndup(name, namelen, M_AUTOFS);
 	else
 		find.an_name = strdup(name, M_AUTOFS);
 
 	anp = RB_FIND(autofs_node_tree, &parent->an_children, &find);
 	if (anp != NULL) {
 		error = 0;
 		if (anpp != NULL)
 			*anpp = anp;
 	} else {
 		error = ENOENT;
 	}
 
 	free(find.an_name, M_AUTOFS);
 
 	return (error);
 }
 
 void
 autofs_node_delete(struct autofs_node *anp)
 {
 	struct autofs_node *parent;
 
 	AUTOFS_ASSERT_XLOCKED(anp->an_mount);
 	KASSERT(RB_EMPTY(&anp->an_children), ("have children"));
 
 	callout_drain(&anp->an_callout);
 
 	parent = anp->an_parent;
 	if (parent != NULL)
 		RB_REMOVE(autofs_node_tree, &parent->an_children, anp);
 	sx_destroy(&anp->an_vnode_lock);
 	free(anp->an_name, M_AUTOFS);
 	uma_zfree(autofs_node_zone, anp);
 }
 
 int
 autofs_node_vn(struct autofs_node *anp, struct mount *mp, int flags,
     struct vnode **vpp)
 {
 	struct vnode *vp;
 	int error;
 
 	AUTOFS_ASSERT_UNLOCKED(anp->an_mount);
 
 	sx_xlock(&anp->an_vnode_lock);
 
 	vp = anp->an_vnode;
 	if (vp != NULL) {
-		error = vget(vp, flags | LK_RETRY, curthread);
+		error = vget(vp, flags | LK_RETRY);
 		if (error != 0) {
 			AUTOFS_WARN("vget failed with error %d", error);
 			sx_xunlock(&anp->an_vnode_lock);
 			return (error);
 		}
 		if (VN_IS_DOOMED(vp)) {
 			/*
 			 * We got forcibly unmounted.
 			 */
 			AUTOFS_DEBUG("doomed vnode");
 			sx_xunlock(&anp->an_vnode_lock);
 			vput(vp);
 
 			return (ENOENT);
 		}
 
 		*vpp = vp;
 		sx_xunlock(&anp->an_vnode_lock);
 		return (0);
 	}
 
 	error = getnewvnode("autofs", mp, &autofs_vnodeops, &vp);
 	if (error != 0) {
 		sx_xunlock(&anp->an_vnode_lock);
 		return (error);
 	}
 
 	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (error != 0) {
 		sx_xunlock(&anp->an_vnode_lock);
 		vdrop(vp);
 		return (error);
 	}
 
 	vp->v_type = VDIR;
 	if (anp->an_parent == NULL)
 		vp->v_vflag |= VV_ROOT;
 	vp->v_data = anp;
 
 	VN_LOCK_ASHARE(vp);
 
 	error = insmntque(vp, mp);
 	if (error != 0) {
 		AUTOFS_DEBUG("insmntque() failed with error %d", error);
 		sx_xunlock(&anp->an_vnode_lock);
 		return (error);
 	}
 
 	KASSERT(anp->an_vnode == NULL, ("lost race"));
 	anp->an_vnode = vp;
 
 	sx_xunlock(&anp->an_vnode_lock);
 
 	*vpp = vp;
 	return (0);
 }
Index: projects/clang1100-import/sys/fs/ext2fs/ext2_vfsops.c
===================================================================
--- projects/clang1100-import/sys/fs/ext2fs/ext2_vfsops.c	(revision 364278)
+++ projects/clang1100-import/sys/fs/ext2fs/ext2_vfsops.c	(revision 364279)
@@ -1,1445 +1,1445 @@
 /*-
  *  modified for EXT2FS support in Lites 1.1
  *
  *  Aug 1995, Godmar Back (gback@cs.utah.edu)
  *  University of Utah, Department of Computer Science
  */
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1991, 1993, 1994
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_vfsops.c	8.8 (Berkeley) 4/18/94
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/kernel.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/endian.h>
 #include <sys/fcntl.h>
 #include <sys/malloc.h>
 #include <sys/sdt.h>
 #include <sys/stat.h>
 #include <sys/mutex.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
 
 #include <fs/ext2fs/fs.h>
 #include <fs/ext2fs/ext2_mount.h>
 #include <fs/ext2fs/inode.h>
 
 #include <fs/ext2fs/ext2fs.h>
 #include <fs/ext2fs/ext2_dinode.h>
 #include <fs/ext2fs/ext2_extern.h>
 #include <fs/ext2fs/ext2_extents.h>
 
 SDT_PROVIDER_DECLARE(ext2fs);
 /*
  * ext2fs trace probe:
  * arg0: verbosity. Higher numbers give more verbose messages
  * arg1: Textual message
  */
 SDT_PROBE_DEFINE2(ext2fs, , vfsops, trace, "int", "char*");
 SDT_PROBE_DEFINE2(ext2fs, , vfsops, ext2_cg_validate_error, "char*", "int");
 SDT_PROBE_DEFINE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "char*");
 
 
 static int	ext2_flushfiles(struct mount *mp, int flags, struct thread *td);
 static int	ext2_mountfs(struct vnode *, struct mount *);
 static int	ext2_reload(struct mount *mp, struct thread *td);
 static int	ext2_sbupdate(struct ext2mount *, int);
 static int	ext2_cgupdate(struct ext2mount *, int);
 static vfs_unmount_t		ext2_unmount;
 static vfs_root_t		ext2_root;
 static vfs_statfs_t		ext2_statfs;
 static vfs_sync_t		ext2_sync;
 static vfs_vget_t		ext2_vget;
 static vfs_fhtovp_t		ext2_fhtovp;
 static vfs_mount_t		ext2_mount;
 
 MALLOC_DEFINE(M_EXT2NODE, "ext2_node", "EXT2 vnode private part");
 static MALLOC_DEFINE(M_EXT2MNT, "ext2_mount", "EXT2 mount structure");
 
 static struct vfsops ext2fs_vfsops = {
 	.vfs_fhtovp =		ext2_fhtovp,
 	.vfs_mount =		ext2_mount,
 	.vfs_root =		ext2_root,	/* root inode via vget */
 	.vfs_statfs =		ext2_statfs,
 	.vfs_sync =		ext2_sync,
 	.vfs_unmount =		ext2_unmount,
 	.vfs_vget =		ext2_vget,
 };
 
 VFS_SET(ext2fs_vfsops, ext2fs, 0);
 
 static int	ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev,
 		    int ronly);
 static int	ext2_compute_sb_data(struct vnode * devvp,
 		    struct ext2fs * es, struct m_ext2fs * fs);
 
 static const char *ext2_opts[] = { "acls", "async", "noatime", "noclusterr", 
     "noclusterw", "noexec", "export", "force", "from", "multilabel",
     "suiddir", "nosymfollow", "sync", "union", NULL };
 
 /*
  * VFS Operations.
  *
  * mount system call
  */
 static int
 ext2_mount(struct mount *mp)
 {
 	struct vfsoptlist *opts;
 	struct vnode *devvp;
 	struct thread *td;
 	struct ext2mount *ump = NULL;
 	struct m_ext2fs *fs;
 	struct nameidata nd, *ndp = &nd;
 	accmode_t accmode;
 	char *path, *fspec;
 	int error, flags, len;
 
 	td = curthread;
 	opts = mp->mnt_optnew;
 
 	if (vfs_filteropt(opts, ext2_opts))
 		return (EINVAL);
 
 	vfs_getopt(opts, "fspath", (void **)&path, NULL);
 	/* Double-check the length of path.. */
 	if (strlen(path) >= MAXMNTLEN)
 		return (ENAMETOOLONG);
 
 	fspec = NULL;
 	error = vfs_getopt(opts, "from", (void **)&fspec, &len);
 	if (!error && fspec[len - 1] != '\0')
 		return (EINVAL);
 
 	/*
 	 * If updating, check whether changing from read-only to
 	 * read/write; if there is no device name, that's all we do.
 	 */
 	if (mp->mnt_flag & MNT_UPDATE) {
 		ump = VFSTOEXT2(mp);
 		fs = ump->um_e2fs;
 		error = 0;
 		if (fs->e2fs_ronly == 0 &&
 		    vfs_flagopt(opts, "ro", NULL, 0)) {
 			error = VFS_SYNC(mp, MNT_WAIT);
 			if (error)
 				return (error);
 			flags = WRITECLOSE;
 			if (mp->mnt_flag & MNT_FORCE)
 				flags |= FORCECLOSE;
 			error = ext2_flushfiles(mp, flags, td);
 			if (error == 0 && fs->e2fs_wasvalid &&
 			    ext2_cgupdate(ump, MNT_WAIT) == 0) {
 				fs->e2fs->e2fs_state =
 				    htole16((le16toh(fs->e2fs->e2fs_state) |
 				    E2FS_ISCLEAN));
 				ext2_sbupdate(ump, MNT_WAIT);
 			}
 			fs->e2fs_ronly = 1;
 			vfs_flagopt(opts, "ro", &mp->mnt_flag, MNT_RDONLY);
 			g_topology_lock();
 			g_access(ump->um_cp, 0, -1, 0);
 			g_topology_unlock();
 		}
 		if (!error && (mp->mnt_flag & MNT_RELOAD))
 			error = ext2_reload(mp, td);
 		if (error)
 			return (error);
 		devvp = ump->um_devvp;
 		if (fs->e2fs_ronly && !vfs_flagopt(opts, "ro", NULL, 0)) {
 			if (ext2_check_sb_compat(fs->e2fs, devvp->v_rdev, 0))
 				return (EPERM);
 
 			/*
 			 * If upgrade to read-write by non-root, then verify
 			 * that user has necessary permissions on the device.
 			 */
 			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 			error = VOP_ACCESS(devvp, VREAD | VWRITE,
 			    td->td_ucred, td);
 			if (error)
 				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 			if (error) {
 				VOP_UNLOCK(devvp);
 				return (error);
 			}
 			VOP_UNLOCK(devvp);
 			g_topology_lock();
 			error = g_access(ump->um_cp, 0, 1, 0);
 			g_topology_unlock();
 			if (error)
 				return (error);
 
 			if ((le16toh(fs->e2fs->e2fs_state) & E2FS_ISCLEAN) == 0 ||
 			    (le16toh(fs->e2fs->e2fs_state) & E2FS_ERRORS)) {
 				if (mp->mnt_flag & MNT_FORCE) {
 					printf(
 "WARNING: %s was not properly dismounted\n", fs->e2fs_fsmnt);
 				} else {
 					printf(
 "WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
 					    fs->e2fs_fsmnt);
 					return (EPERM);
 				}
 			}
 			fs->e2fs->e2fs_state =
 			    htole16(le16toh(fs->e2fs->e2fs_state) & ~E2FS_ISCLEAN);
 			(void)ext2_cgupdate(ump, MNT_WAIT);
 			fs->e2fs_ronly = 0;
 			MNT_ILOCK(mp);
 			mp->mnt_flag &= ~MNT_RDONLY;
 			MNT_IUNLOCK(mp);
 		}
 		if (vfs_flagopt(opts, "export", NULL, 0)) {
 			/* Process export requests in vfs_mount.c. */
 			return (error);
 		}
 	}
 
 	/*
 	 * Not an update, or updating the name: look up the name
 	 * and verify that it refers to a sensible disk device.
 	 */
 	if (fspec == NULL)
 		return (EINVAL);
 	NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
 	if ((error = namei(ndp)) != 0)
 		return (error);
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 	devvp = ndp->ni_vp;
 
 	if (!vn_isdisk(devvp, &error)) {
 		vput(devvp);
 		return (error);
 	}
 
 	/*
 	 * If mount by non-root, then verify that user has necessary
 	 * permissions on the device.
 	 *
 	 * XXXRW: VOP_ACCESS() enough?
 	 */
 	accmode = VREAD;
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		accmode |= VWRITE;
 	error = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
 	if (error)
 		error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 	if (error) {
 		vput(devvp);
 		return (error);
 	}
 
 	if ((mp->mnt_flag & MNT_UPDATE) == 0) {
 		error = ext2_mountfs(devvp, mp);
 	} else {
 		if (devvp != ump->um_devvp) {
 			vput(devvp);
 			return (EINVAL);	/* needs translation */
 		} else
 			vput(devvp);
 	}
 	if (error) {
 		vrele(devvp);
 		return (error);
 	}
 	ump = VFSTOEXT2(mp);
 	fs = ump->um_e2fs;
 
 	/*
 	 * Note that this strncpy() is ok because of a check at the start
 	 * of ext2_mount().
 	 */
 	strncpy(fs->e2fs_fsmnt, path, MAXMNTLEN);
 	fs->e2fs_fsmnt[MAXMNTLEN - 1] = '\0';
 	vfs_mountedfrom(mp, fspec);
 	return (0);
 }
 
 static int
 ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev, int ronly)
 {
 	uint32_t i, mask;
 
 	if (le16toh(es->e2fs_magic) != E2FS_MAGIC) {
 		printf("ext2fs: %s: wrong magic number %#x (expected %#x)\n",
 		    devtoname(dev), le16toh(es->e2fs_magic), E2FS_MAGIC);
 		return (1);
 	}
 	if (le32toh(es->e2fs_rev) > E2FS_REV0) {
 		mask = le32toh(es->e2fs_features_incompat) & ~(EXT2F_INCOMPAT_SUPP);
 		if (mask) {
 			printf("WARNING: mount of %s denied due to "
 			    "unsupported optional features:\n", devtoname(dev));
 			for (i = 0;
 			    i < sizeof(incompat)/sizeof(struct ext2_feature);
 			    i++)
 				if (mask & incompat[i].mask)
 					printf("%s ", incompat[i].name);
 			printf("\n");
 			return (1);
 		}
 		mask = le32toh(es->e2fs_features_rocompat) & ~EXT2F_ROCOMPAT_SUPP;
 		if (!ronly && mask) {
 			printf("WARNING: R/W mount of %s denied due to "
 			    "unsupported optional features:\n", devtoname(dev));
 			for (i = 0;
 			    i < sizeof(ro_compat)/sizeof(struct ext2_feature);
 			    i++)
 				if (mask & ro_compat[i].mask)
 					printf("%s ", ro_compat[i].name);
 			printf("\n");
 			return (1);
 		}
 	}
 	return (0);
 }
 
 static e4fs_daddr_t
 ext2_cg_location(struct m_ext2fs *fs, int number)
 {
 	int cg, descpb, logical_sb, has_super = 0;
 
 	/*
 	 * Adjust logical superblock block number.
 	 * Godmar thinks: if the blocksize is greater than 1024, then
 	 * the superblock is logically part of block zero.
 	 */
 	logical_sb = fs->e2fs_bsize > SBSIZE ? 0 : 1;
 
 	if (!EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_META_BG) ||
 	    number < le32toh(fs->e2fs->e3fs_first_meta_bg))
 		return (logical_sb + number + 1);
 
 	if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT))
 		descpb = fs->e2fs_bsize / sizeof(struct ext2_gd);
 	else
 		descpb = fs->e2fs_bsize / E2FS_REV0_GD_SIZE;
 
 	cg = descpb * number;
 
 	if (ext2_cg_has_sb(fs, cg))
 		has_super = 1;
 
 	return (has_super + cg * (e4fs_daddr_t)EXT2_BLOCKS_PER_GROUP(fs) +
 	    le32toh(fs->e2fs->e2fs_first_dblock));
 }
 
 static int
 ext2_cg_validate(struct m_ext2fs *fs)
 {
 	uint64_t b_bitmap;
 	uint64_t i_bitmap;
 	uint64_t i_tables;
 	uint64_t first_block, last_block, last_cg_block;
 	struct ext2_gd *gd;
 	unsigned int i, cg_count;
 
 	first_block = le32toh(fs->e2fs->e2fs_first_dblock);
 	last_cg_block = ext2_cg_number_gdb(fs, 0);
 	cg_count = fs->e2fs_gcount;
 
 	for (i = 0; i < fs->e2fs_gcount; i++) {
 		gd = &fs->e2fs_gd[i];
 
 		if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_FLEX_BG) ||
 		    i == fs->e2fs_gcount - 1) {
 			last_block = fs->e2fs_bcount - 1;
 		} else {
 			last_block = first_block +
 			    (EXT2_BLOCKS_PER_GROUP(fs) - 1);
 		}
 
 		if ((cg_count == fs->e2fs_gcount) &&
 		    !(le16toh(gd->ext4bgd_flags) & EXT2_BG_INODE_ZEROED))
 			cg_count = i;
 
 		b_bitmap = e2fs_gd_get_b_bitmap(gd);
 		if (b_bitmap == 0) {
 			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
 			    "block bitmap is zero", i);
 			return (EINVAL);
 
 		}
 		if (b_bitmap <= last_cg_block) {
 			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
 			    "block bitmap overlaps gds", i);
 			return (EINVAL);
 		}
 		if (b_bitmap < first_block || b_bitmap > last_block) {
 			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
 			    "block bitmap not in group", i);
 			return (EINVAL);
 		}
 
 		i_bitmap = e2fs_gd_get_i_bitmap(gd);
 		if (i_bitmap == 0) {
 			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
 			    "inode bitmap is zero", i);
 			return (EINVAL);
 		}
 		if (i_bitmap <= last_cg_block) {
 			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
 			    "inode bitmap overlaps gds", i);
 			return (EINVAL);
 		}
 		if (i_bitmap < first_block || i_bitmap > last_block) {
 			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
 			    "inode bitmap not in group blk", i);
 			return (EINVAL);
 		}
 
 		i_tables = e2fs_gd_get_i_tables(gd);
 		if (i_tables == 0) {
 			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
 			    "inode table is zero", i);
 			return (EINVAL);
 		}
 		if (i_tables <= last_cg_block) {
 			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
 			    "inode talbes overlaps gds", i);
 			return (EINVAL);
 		}
 		if (i_tables < first_block ||
 		    i_tables + fs->e2fs_itpg - 1 > last_block) {
 			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
 			    "inode tables not in group blk", i);
 			return (EINVAL);
 		}
 
 		if (!EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_FLEX_BG))
 			first_block += EXT2_BLOCKS_PER_GROUP(fs);
 	}
 
 	return (0);
 }
 
 /*
  * This computes the fields of the m_ext2fs structure from the
  * data in the ext2fs structure read in.
  *
  * Each superblock field of 'es' is validated before the value derived
  * from it is stored in 'fs'.  Once the geometry is known, the group
  * descriptor blocks are read through 'devvp' into a newly allocated
  * fs->e2fs_gd array and checked with ext2_cg_validate().
  *
  * Returns 0 on success, EINVAL when a superblock field fails validation,
  * or the error reported by bread(), ext2_cg_validate() or the checksum
  * helpers.  When a failure happens after the allocations below,
  * fs->e2fs_gd and fs->e2fs_contigdirs are NOT freed here; that is left
  * to the caller.
  */
 static int
 ext2_compute_sb_data(struct vnode *devvp, struct ext2fs *es,
     struct m_ext2fs *fs)
 {
 	struct buf *bp;
 	uint32_t e2fs_descpb, e2fs_gdbcount_alloc;
 	int i, j;
 	int g_count = 0;
 	int error;
 
 	/* Check checksum features: GDT_CSUM and METADATA_CKSUM are exclusive */
 	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) &&
 	    EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "incorrect checksum features combination");
 		return (EINVAL);
 	}
 
 	/* Precompute checksum seed for all metadata */
 	ext2_sb_csum_set_seed(fs);
 
 	/* Verify sb csum if possible */
 	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) {
 		error = ext2_sb_csum_verify(fs);
 		if (error) {
 			return (error);
 		}
 	}
 
 	/* Check for block size = 1K|2K|4K */
 	if (le32toh(es->e2fs_log_bsize) > 2) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "bad block size");
 		return (EINVAL);
 	}
 
 	/* Derive the block shift, size, fs-block to disk-block shift and mask */
 	fs->e2fs_bshift = EXT2_MIN_BLOCK_LOG_SIZE + le32toh(es->e2fs_log_bsize);
 	fs->e2fs_bsize = 1U << fs->e2fs_bshift;
 	fs->e2fs_fsbtodb = le32toh(es->e2fs_log_bsize) + 1;
 	fs->e2fs_qbmask = fs->e2fs_bsize - 1;
 
 	/* Check for fragment size */
 	if (le32toh(es->e2fs_log_fsize) >
 	    (EXT2_MAX_FRAG_LOG_SIZE - EXT2_MIN_BLOCK_LOG_SIZE)) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "invalid log cluster size");
 		return (EINVAL);
 	}
 
 	/* Only filesystems whose fragment size equals the block size pass */
 	fs->e2fs_fsize = EXT2_MIN_FRAG_SIZE << le32toh(es->e2fs_log_fsize);
 	if (fs->e2fs_fsize != fs->e2fs_bsize) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "fragment size != block size");
 		return (EINVAL);
 	}
 
 	fs->e2fs_fpb = fs->e2fs_bsize / fs->e2fs_fsize;
 
 	/* Check reserved gdt blocks for future filesystem expansion */
 	if (le16toh(es->e2fs_reserved_ngdb) > (fs->e2fs_bsize / 4)) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "number of reserved GDT blocks too large");
 		return (EINVAL);
 	}
 
 	/* Revision 0 uses a fixed inode size; later revisions store it */
 	if (le32toh(es->e2fs_rev) == E2FS_REV0) {
 		fs->e2fs_isize = E2FS_REV0_INODE_SIZE;
 	} else {
 		fs->e2fs_isize = le16toh(es->e2fs_inode_size);
 
 		/*
 		 * Check first ino.
 		 */
 		if (le32toh(es->e2fs_first_ino) < EXT2_FIRSTINO) {
 			SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 			    "invalid first ino");
 			return (EINVAL);
 		}
 
 		/*
 		 * Simple sanity check for superblock inode size value:
 		 * not smaller than the revision 0 inode, not larger than
 		 * a block, and a power of two.
 		 */
 		if (EXT2_INODE_SIZE(fs) < E2FS_REV0_INODE_SIZE ||
 		    EXT2_INODE_SIZE(fs) > fs->e2fs_bsize ||
 		    (fs->e2fs_isize & (fs->e2fs_isize - 1)) != 0) {
 			SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 			    "invalid inode size");
 			return (EINVAL);
 		}
 	}
 
 	/* Check group descriptors */
 	if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT) &&
 	    le16toh(es->e3fs_desc_size) != E2FS_64BIT_GD_SIZE) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "unsupported 64bit descriptor size");
 		return (EINVAL);
 	}
 
 	/* Blocks and fragments per group must be non-zero and equal */
 	fs->e2fs_bpg = le32toh(es->e2fs_bpg);
 	fs->e2fs_fpg = le32toh(es->e2fs_fpg);
 	if (fs->e2fs_bpg == 0 || fs->e2fs_fpg == 0) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "zero blocks/fragments per group");
 		return (EINVAL);
 	} else if (fs->e2fs_bpg != fs->e2fs_fpg) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "blocks per group not equal fragments per group");
 		return (EINVAL);
 	}
 
 	/* Only groups of exactly (block size * 8) blocks are accepted */
 	if (fs->e2fs_bpg != fs->e2fs_bsize * 8) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "non-standard group size unsupported");
 		return (EINVAL);
 	}
 
 	/* Inodes per block */
 	fs->e2fs_ipb = fs->e2fs_bsize / EXT2_INODE_SIZE(fs);
 	if (fs->e2fs_ipb == 0 ||
 	    fs->e2fs_ipb > fs->e2fs_bsize / E2FS_REV0_INODE_SIZE) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "bad inodes per block size");
 		return (EINVAL);
 	}
 
 	/* Inodes per group: at least one block's worth, at most bsize * 8 */
 	fs->e2fs_ipg = le32toh(es->e2fs_ipg);
 	if (fs->e2fs_ipg < fs->e2fs_ipb || fs->e2fs_ipg >  fs->e2fs_bsize * 8) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "invalid inodes per group");
 		return (EINVAL);
 	}
 
 	/* Inode table blocks per group */
 	fs->e2fs_itpg = fs->e2fs_ipg / fs->e2fs_ipb;
 
 	/*
 	 * Block counters: the low 32 bits always come from the superblock;
 	 * with the 64BIT feature the high halves are merged in as well.
 	 */
 	fs->e2fs_bcount = le32toh(es->e2fs_bcount);
 	fs->e2fs_rbcount = le32toh(es->e2fs_rbcount);
 	fs->e2fs_fbcount = le32toh(es->e2fs_fbcount);
 	if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) {
 		fs->e2fs_bcount |= (uint64_t)(le32toh(es->e4fs_bcount_hi)) << 32;
 		fs->e2fs_rbcount |= (uint64_t)(le32toh(es->e4fs_rbcount_hi)) << 32;
 		fs->e2fs_fbcount |= (uint64_t)(le32toh(es->e4fs_fbcount_hi)) << 32;
 	}
 	if (fs->e2fs_rbcount > fs->e2fs_bcount ||
 	    fs->e2fs_fbcount > fs->e2fs_bcount) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "invalid block count");
 		return (EINVAL);
 	}
 
 	fs->e2fs_ficount = le32toh(es->e2fs_ficount);
 	if (fs->e2fs_ficount > le32toh(es->e2fs_icount)) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "invalid number of free inodes");
 		return (EINVAL);
 	}
 
 	if (le32toh(es->e2fs_first_dblock) >= fs->e2fs_bcount) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "first data block out of range");
 		return (EINVAL);
 	}
 
 	/* Number of block groups covering the blocks after first_dblock */
 	fs->e2fs_gcount = howmany(fs->e2fs_bcount -
 	    le32toh(es->e2fs_first_dblock), EXT2_BLOCKS_PER_GROUP(fs));
 	if (fs->e2fs_gcount > ((uint64_t)1 << 32) - EXT2_DESCS_PER_BLOCK(fs)) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "groups count too large");
 		return (EINVAL);
 	}
 
 	/* Check for extra isize in big inodes. */
 	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_EXTRA_ISIZE) &&
 	    EXT2_INODE_SIZE(fs) < sizeof(struct ext2fs_dinode)) {
 		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
 		    "no space for extra inode timestamps");
 		return (EINVAL);
 	}
 
 	/* s_resuid / s_resgid ? */
 
 	/*
 	 * e2fs_descpb is the number of on-disk descriptors per block and
 	 * determines how many descriptor blocks are read (e2fs_gdbcount).
 	 * e2fs_gdbcount_alloc is the number of block-sized units allocated
 	 * for the in-core array; without the 64BIT feature it is computed
 	 * from sizeof(struct ext2_gd) instead of the on-disk size.
 	 */
 	if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) {
 		e2fs_descpb = fs->e2fs_bsize / E2FS_64BIT_GD_SIZE;
 		e2fs_gdbcount_alloc = howmany(fs->e2fs_gcount, e2fs_descpb);
 	} else {
 		e2fs_descpb = fs->e2fs_bsize / E2FS_REV0_GD_SIZE;
 		e2fs_gdbcount_alloc = howmany(fs->e2fs_gcount,
 		    fs->e2fs_bsize / sizeof(struct ext2_gd));
 	}
 	fs->e2fs_gdbcount = howmany(fs->e2fs_gcount, e2fs_descpb);
 	fs->e2fs_gd = malloc(e2fs_gdbcount_alloc * fs->e2fs_bsize,
 	    M_EXT2MNT, M_WAITOK | M_ZERO);
 	fs->e2fs_contigdirs = malloc(fs->e2fs_gcount *
 	    sizeof(*fs->e2fs_contigdirs), M_EXT2MNT, M_WAITOK | M_ZERO);
 
 	/* Read every group descriptor block into the in-core array */
 	for (i = 0; i < fs->e2fs_gdbcount; i++) {
 		error = bread(devvp,
 		    fsbtodb(fs, ext2_cg_location(fs, i)),
 		    fs->e2fs_bsize, NOCRED, &bp);
 		if (error) {
 			/*
 			 * fs->e2fs_gd and fs->e2fs_contigdirs
 			 * will be freed later by the caller,
 			 * because this function could be called from
 			 * MNT_UPDATE path.
 			 */
 			return (error);
 		}
 		if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) {
 			/* 64bit: the block is copied into the array as a whole */
 			memcpy(&fs->e2fs_gd[
 			    i * fs->e2fs_bsize / sizeof(struct ext2_gd)],
 			    bp->b_data, fs->e2fs_bsize);
 		} else {
 			/*
 			 * Otherwise each E2FS_REV0_GD_SIZE descriptor is
 			 * copied into its own in-core slot; g_count runs
 			 * across blocks and stops at e2fs_gcount.
 			 */
 			for (j = 0; j < e2fs_descpb &&
 			    g_count < fs->e2fs_gcount; j++, g_count++)
 				memcpy(&fs->e2fs_gd[g_count],
 				    bp->b_data + j * E2FS_REV0_GD_SIZE,
 				    E2FS_REV0_GD_SIZE);
 		}
 		brelse(bp);
 		bp = NULL;
 	}
 
 	/* Validate cgs consistency */
 	error = ext2_cg_validate(fs);
 	if (error)
 		return (error);
 
 	/* Verify cgs csum */
 	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) ||
 	    EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) {
 		error = ext2_gd_csum_verify(fs, devvp->v_rdev);
 		if (error)
 			return (error);
 	}
 	/* Initialization for the ext2 Orlov allocator variant. */
 	fs->e2fs_total_dir = 0;
 	for (i = 0; i < fs->e2fs_gcount; i++)
 		fs->e2fs_total_dir += e2fs_gd_get_ndirs(&fs->e2fs_gd[i]);
 
 	/* Maximum file size depends on revision and LARGEFILE/HUGE_FILE */
 	if (le32toh(es->e2fs_rev) == E2FS_REV0 ||
 	    !EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_LARGEFILE))
 		fs->e2fs_maxfilesize = 0x7fffffff;
 	else {
 		fs->e2fs_maxfilesize = 0xffffffffffff;
 		if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_HUGE_FILE))
 			fs->e2fs_maxfilesize = 0x7fffffffffffffff;
 	}
 	/*
 	 * Directory hash signedness: honour an explicit flag; when neither
 	 * flag is set, record the compiler's char signedness in es.
 	 */
 	if (le32toh(es->e4fs_flags) & E2FS_UNSIGNED_HASH) {
 		fs->e2fs_uhash = 3;
 	} else if ((le32toh(es->e4fs_flags) & E2FS_SIGNED_HASH) == 0) {
 #ifdef __CHAR_UNSIGNED__
 		es->e4fs_flags = htole32(le32toh(es->e4fs_flags) | E2FS_UNSIGNED_HASH);
 		fs->e2fs_uhash = 3;
 #else
 		es->e4fs_flags = htole32(le32toh(es->e4fs_flags) | E2FS_SIGNED_HASH);
 #endif
 	}
 	/* The superblock checksum is verified once more; its result is returned */
 	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM))
 		error = ext2_sb_csum_verify(fs);
 
 	return (error);
 }
 
 /*
  * Reload all incore data for a filesystem (used after running fsck on
  * the root filesystem and finding things to fix). The filesystem must
  * be mounted read-only.
  *
  * Things to do to update the mount:
  *	1) invalidate all cached meta-data.
  *	2) re-read superblock from disk.
  *	3) invalidate all cluster summary information.
  *	4) invalidate all inactive vnodes.
  *	5) invalidate all cached file data.
  *	6) re-read inode data for all active vnodes.
  * XXX some steps are missing: step 4 has no corresponding code below.
  * This has to be reviewed.
  *
  * Returns EINVAL when the mount is not read-only, EIO when the re-read
  * superblock fails ext2_check_sb_compat(), any error from bread(),
  * ext2_compute_sb_data() or ext2_ei2i(), and 0 otherwise.
  */
 static int
 ext2_reload(struct mount *mp, struct thread *td)
 {
 	struct vnode *vp, *mvp, *devvp;
 	struct inode *ip;
 	struct buf *bp;
 	struct ext2fs *es;
 	struct m_ext2fs *fs;
 	struct csum *sump;
 	int error, i;
 	int32_t *lp;
 
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		return (EINVAL);
 	/*
 	 * Step 1: invalidate all cached meta-data.
 	 */
 	devvp = VFSTOEXT2(mp)->um_devvp;
 	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 	if (vinvalbuf(devvp, 0, 0, 0) != 0)
 		panic("ext2_reload: dirty1");
 	VOP_UNLOCK(devvp);
 
 	/*
 	 * Step 2: re-read superblock from disk.
 	 * constants have been adjusted for ext2
 	 */
 	if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0)
 		return (error);
 	es = (struct ext2fs *)bp->b_data;
 	if (ext2_check_sb_compat(es, devvp->v_rdev, 0) != 0) {
 		brelse(bp);
 		return (EIO);		/* XXX needs translation */
 	}
 	fs = VFSTOEXT2(mp)->um_e2fs;
 	bcopy(bp->b_data, fs->e2fs, sizeof(struct ext2fs));
 
 	/*
 	 * NOTE(review): ext2_compute_sb_data() allocates fs->e2fs_gd and
 	 * fs->e2fs_contigdirs unconditionally, and nothing here frees the
 	 * arrays from the previous mount/reload (nor the new ones on
 	 * error) -- confirm whether this leaks.
 	 */
 	if ((error = ext2_compute_sb_data(devvp, es, fs)) != 0) {
 		brelse(bp);
 		return (error);
 	}
 #ifdef UNKLAR
 	if (fs->fs_sbsize < SBSIZE)
 		bp->b_flags |= B_INVAL;
 #endif
 	brelse(bp);
 
 	/*
 	 * Step 3: invalidate all cluster summary information.
 	 * Every group's max-cluster entry is reset to e2fs_contigsumsize
 	 * and its summary is marked uninitialized.
 	 */
 	if (fs->e2fs_contigsumsize > 0) {
 		lp = fs->e2fs_maxcluster;
 		sump = fs->e2fs_clustersum;
 		for (i = 0; i < fs->e2fs_gcount; i++, sump++) {
 			*lp++ = fs->e2fs_contigsumsize;
 			sump->cs_init = 0;
 			bzero(sump->cs_sum, fs->e2fs_contigsumsize + 1);
 		}
 	}
 
 loop:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		/*
 		 * Step 5: invalidate all cached file data.
 		 * If the vnode cannot be acquired, the iteration is
 		 * aborted and restarted from the beginning.
 		 */
-		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
+		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) {
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			goto loop;
 		}
 		if (vinvalbuf(vp, 0, 0, 0))
 			panic("ext2_reload: dirty2");
 
 		/*
 		 * Step 6: re-read inode data for all active vnodes.
 		 */
 		ip = VTOI(vp);
 		error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 		    (int)fs->e2fs_bsize, NOCRED, &bp);
 		if (error) {
 			VOP_UNLOCK(vp);
 			vrele(vp);
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			return (error);
 		}
 
 		/* Convert the on-disk inode inside the block into 'ip' */
 		error = ext2_ei2i((struct ext2fs_dinode *)((char *)bp->b_data +
 		    EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ip->i_number)), ip);
 
 		brelse(bp);
 		VOP_UNLOCK(vp);
 		vrele(vp);
 
 		if (error) {
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			return (error);
 		}
 	}
 	return (0);
 }
 
 /*
  * Common code for mount and mountroot.
  *
  * Opens the GEOM consumer for 'devvp', reads and validates the
  * superblock, allocates the ext2mount / m_ext2fs / ext2fs structures,
  * fills them in through ext2_compute_sb_data(), sets up the cluster
  * summary arrays and finally attaches everything to 'mp'.
  *
  * Returns 0 on success.  On failure the error is returned after the
  * 'out' path has released the buffer, closed the consumer and freed
  * whatever was allocated.
  */
 static int
 ext2_mountfs(struct vnode *devvp, struct mount *mp)
 {
 	struct ext2mount *ump;
 	struct buf *bp;
 	struct m_ext2fs *fs;
 	struct ext2fs *es;
 	struct cdev *dev = devvp->v_rdev;
 	struct g_consumer *cp;
 	struct bufobj *bo;
 	struct csum *sump;
 	int error;
 	int ronly;
 	int i;
 	u_long size;
 	int32_t *lp;
 	int32_t e2fs_maxcontig;
 
 	ronly = vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0);
 	/* XXX: use VOP_ACCESS to check FS perms */
 	g_topology_lock();
 	error = g_vfs_open(devvp, &cp, "ext2fs", ronly ? 0 : 1);
 	g_topology_unlock();
 	VOP_UNLOCK(devvp);
 	if (error)
 		return (error);
 
 	/* XXX: should we check for some sectorsize or 512 instead? */
 	/* The superblock must span a whole number of provider sectors */
 	if (((SBSIZE % cp->provider->sectorsize) != 0) ||
 	    (SBSIZE < cp->provider->sectorsize)) {
 		g_topology_lock();
 		g_vfs_close(cp);
 		g_topology_unlock();
 		return (EINVAL);
 	}
 
 	bo = &devvp->v_bufobj;
 	bo->bo_private = cp;
 	bo->bo_ops = g_vfs_bufops;
 	/* Take the device's I/O size limit if it has one; cap at MAXPHYS */
 	if (devvp->v_rdev->si_iosize_max != 0)
 		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
 	if (mp->mnt_iosize_max > MAXPHYS)
 		mp->mnt_iosize_max = MAXPHYS;
 
 	/* The 'out' path tests both pointers, so start from NULL */
 	bp = NULL;
 	ump = NULL;
 	if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0)
 		goto out;
 	es = (struct ext2fs *)bp->b_data;
 	if (ext2_check_sb_compat(es, dev, ronly) != 0) {
 		error = EINVAL;		/* XXX needs translation */
 		goto out;
 	}
 	/*
 	 * An unclean or errored filesystem is only mounted read-only or
 	 * with MNT_FORCE; otherwise the mount is refused with EPERM.
 	 */
 	if ((le16toh(es->e2fs_state) & E2FS_ISCLEAN) == 0 ||
 	    (le16toh(es->e2fs_state) & E2FS_ERRORS)) {
 		if (ronly || (mp->mnt_flag & MNT_FORCE)) {
 			printf(
 "WARNING: Filesystem was not properly dismounted\n");
 		} else {
 			printf(
 "WARNING: R/W mount denied.  Filesystem is not clean - run fsck\n");
 			error = EPERM;
 			goto out;
 		}
 	}
 	ump = malloc(sizeof(*ump), M_EXT2MNT, M_WAITOK | M_ZERO);
 
 	/*
 	 * I don't know whether this is the right strategy. Note that
 	 * we dynamically allocate both an m_ext2fs and an ext2fs
 	 * while Linux keeps the super block in a locked buffer.
 	 */
 	ump->um_e2fs = malloc(sizeof(struct m_ext2fs),
 	    M_EXT2MNT, M_WAITOK | M_ZERO);
 	ump->um_e2fs->e2fs = malloc(sizeof(struct ext2fs),
 	    M_EXT2MNT, M_WAITOK);
 	mtx_init(EXT2_MTX(ump), "EXT2FS", "EXT2FS Lock", MTX_DEF);
 	/* Work on a private copy of the on-disk superblock from here on */
 	bcopy(es, ump->um_e2fs->e2fs, (u_int)sizeof(struct ext2fs));
 	if ((error = ext2_compute_sb_data(devvp, ump->um_e2fs->e2fs, ump->um_e2fs)))
 		goto out;
 
 	/*
 	 * Calculate the maximum contiguous blocks and size of cluster summary
 	 * array.  In FFS this is done by newfs; however, the superblock
 	 * in ext2fs doesn't have these variables, so we can calculate
 	 * them here.
 	 */
 	e2fs_maxcontig = MAX(1, MAXPHYS / ump->um_e2fs->e2fs_bsize);
 	ump->um_e2fs->e2fs_contigsumsize = MIN(e2fs_maxcontig, EXT2_MAXCONTIG);
 	if (ump->um_e2fs->e2fs_contigsumsize > 0) {
 		/* One max-cluster entry and one summary per block group */
 		size = ump->um_e2fs->e2fs_gcount * sizeof(int32_t);
 		ump->um_e2fs->e2fs_maxcluster = malloc(size, M_EXT2MNT, M_WAITOK);
 		size = ump->um_e2fs->e2fs_gcount * sizeof(struct csum);
 		ump->um_e2fs->e2fs_clustersum = malloc(size, M_EXT2MNT, M_WAITOK);
 		lp = ump->um_e2fs->e2fs_maxcluster;
 		sump = ump->um_e2fs->e2fs_clustersum;
 		for (i = 0; i < ump->um_e2fs->e2fs_gcount; i++, sump++) {
 			*lp++ = ump->um_e2fs->e2fs_contigsumsize;
 			sump->cs_init = 0;
 			sump->cs_sum = malloc((ump->um_e2fs->e2fs_contigsumsize + 1) *
 			    sizeof(int32_t), M_EXT2MNT, M_WAITOK | M_ZERO);
 		}
 	}
 
 	brelse(bp);
 	bp = NULL;
 	fs = ump->um_e2fs;
 	fs->e2fs_ronly = ronly;	/* ronly is set according to mnt_flags */
 
 	/*
 	 * If the fs is not mounted read-only, make sure the super block is
 	 * always written back on a sync().
 	 */
 	/* Remember whether the on-disk state was clean (used at unmount) */
 	fs->e2fs_wasvalid = le16toh(fs->e2fs->e2fs_state) & E2FS_ISCLEAN ? 1 : 0;
 	if (ronly == 0) {
 		fs->e2fs_fmod = 1;	/* mark it modified and set fs invalid */
 		fs->e2fs->e2fs_state =
 		    htole16(le16toh(fs->e2fs->e2fs_state) & ~E2FS_ISCLEAN);
 	}
 	mp->mnt_data = ump;
 	mp->mnt_stat.f_fsid.val[0] = dev2udev(dev);
 	mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
 	mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	ump->um_mountp = mp;
 	ump->um_dev = dev;
 	ump->um_devvp = devvp;
 	ump->um_bo = &devvp->v_bufobj;
 	ump->um_cp = cp;
 
 	/*
 	 * Setting those two parameters allowed us to use
 	 * ufs_bmap w/o changes!
 	 */
 	ump->um_nindir = EXT2_ADDR_PER_BLOCK(fs);
 	ump->um_bptrtodb = le32toh(fs->e2fs->e2fs_log_bsize) + 1;
 	ump->um_seqinc = EXT2_FRAGS_PER_BLOCK(fs);
 	/* On a read-write mount the "not clean" state goes to disk right away */
 	if (ronly == 0)
 		ext2_sbupdate(ump, MNT_WAIT);
 	/*
 	 * Initialize filesystem stat information in mount struct.
 	 */
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED |
 	    MNTK_USES_BCACHE;
 	MNT_IUNLOCK(mp);
 	return (0);
 out:
 	/* Error exit: undo, in order, everything acquired above */
 	if (bp)
 		brelse(bp);
 	if (cp != NULL) {
 		g_topology_lock();
 		g_vfs_close(cp);
 		g_topology_unlock();
 	}
 	if (ump) {
 		mtx_destroy(EXT2_MTX(ump));
 		free(ump->um_e2fs->e2fs_gd, M_EXT2MNT);
 		free(ump->um_e2fs->e2fs_contigdirs, M_EXT2MNT);
 		free(ump->um_e2fs->e2fs, M_EXT2MNT);
 		free(ump->um_e2fs, M_EXT2MNT);
 		free(ump, M_EXT2MNT);
 		mp->mnt_data = NULL;
 	}
 	return (error);
 }
 
 /*
  * Unmount system call.
  */
 static int
 ext2_unmount(struct mount *mp, int mntflags)
 {
 	struct ext2mount *ump;
 	struct m_ext2fs *fs;
 	struct csum *sump;
 	int error, flags, i, ronly;
 
 	flags = 0;
 	if (mntflags & MNT_FORCE) {
 		if (mp->mnt_flag & MNT_ROOTFS)
 			return (EINVAL);
 		flags |= FORCECLOSE;
 	}
 	if ((error = ext2_flushfiles(mp, flags, curthread)) != 0)
 		return (error);
 	ump = VFSTOEXT2(mp);
 	fs = ump->um_e2fs;
 	ronly = fs->e2fs_ronly;
 	if (ronly == 0 && ext2_cgupdate(ump, MNT_WAIT) == 0) {
 		if (fs->e2fs_wasvalid)
 			fs->e2fs->e2fs_state =
 			    htole16(le16toh(fs->e2fs->e2fs_state) | E2FS_ISCLEAN);
 		ext2_sbupdate(ump, MNT_WAIT);
 	}
 
 	g_topology_lock();
 	g_vfs_close(ump->um_cp);
 	g_topology_unlock();
 	vrele(ump->um_devvp);
 	sump = fs->e2fs_clustersum;
 	for (i = 0; i < fs->e2fs_gcount; i++, sump++)
 		free(sump->cs_sum, M_EXT2MNT);
 	free(fs->e2fs_clustersum, M_EXT2MNT);
 	free(fs->e2fs_maxcluster, M_EXT2MNT);
 	free(fs->e2fs_gd, M_EXT2MNT);
 	free(fs->e2fs_contigdirs, M_EXT2MNT);
 	free(fs->e2fs, M_EXT2MNT);
 	free(fs, M_EXT2MNT);
 	free(ump, M_EXT2MNT);
 	mp->mnt_data = NULL;
 	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	return (error);
 }
 
 /*
  * Flush out all the files in a filesystem.
  *
  * Thin wrapper that hands 'flags' and 'td' to vflush(), passing 0 as
  * its second argument, and returns vflush()'s result unchanged.
  */
 static int
 ext2_flushfiles(struct mount *mp, int flags, struct thread *td)
 {
 
 	return (vflush(mp, 0, flags, td));
 }
 
 /*
  * Get filesystem statistics.
  */
 int
 ext2_statfs(struct mount *mp, struct statfs *sbp)
 {
 	struct ext2mount *ump;
 	struct m_ext2fs *fs;
 	uint32_t overhead, overhead_per_group, ngdb;
 	int i, ngroups;
 
 	ump = VFSTOEXT2(mp);
 	fs = ump->um_e2fs;
 	if (le16toh(fs->e2fs->e2fs_magic) != E2FS_MAGIC)
 		panic("ext2_statfs");
 
 	/*
 	 * Compute the overhead (FS structures)
 	 */
 	overhead_per_group =
 	    1 /* block bitmap */ +
 	    1 /* inode bitmap */ +
 	    fs->e2fs_itpg;
 	overhead = le32toh(fs->e2fs->e2fs_first_dblock) +
 	    fs->e2fs_gcount * overhead_per_group;
 	if (le32toh(fs->e2fs->e2fs_rev) > E2FS_REV0 &&
 	    le32toh(fs->e2fs->e2fs_features_rocompat) & EXT2F_ROCOMPAT_SPARSESUPER) {
 		for (i = 0, ngroups = 0; i < fs->e2fs_gcount; i++) {
 			if (ext2_cg_has_sb(fs, i))
 				ngroups++;
 		}
 	} else {
 		ngroups = fs->e2fs_gcount;
 	}
 	ngdb = fs->e2fs_gdbcount;
 	if (le32toh(fs->e2fs->e2fs_rev) > E2FS_REV0 &&
 	    le32toh(fs->e2fs->e2fs_features_compat) & EXT2F_COMPAT_RESIZE)
 		ngdb += le16toh(fs->e2fs->e2fs_reserved_ngdb);
 	overhead += ngroups * (1 /* superblock */ + ngdb);
 
 	sbp->f_bsize = EXT2_FRAG_SIZE(fs);
 	sbp->f_iosize = EXT2_BLOCK_SIZE(fs);
 	sbp->f_blocks = fs->e2fs_bcount - overhead;
 	sbp->f_bfree = fs->e2fs_fbcount;
 	sbp->f_bavail = sbp->f_bfree - fs->e2fs_rbcount;
 	sbp->f_files = le32toh(fs->e2fs->e2fs_icount);
 	sbp->f_ffree = fs->e2fs_ficount;
 	return (0);
 }
 
 /*
  * Go through the disk queues to initiate sandbagged IO;
  * go through the inodes to write those that have been modified;
  * initiate the writing of the super block if it has been modified.
  *
  * Note: we are always called with the filesystem marked `MPBUSY'.
  *
  * Errors from the individual write-backs do not stop the sync; the
  * most recent one is remembered in 'allerror' and returned (0 when
  * nothing failed).  Panics if a read-only filesystem is marked modified.
  */
 static int
 ext2_sync(struct mount *mp, int waitfor)
 {
 	struct vnode *mvp, *vp;
 	struct thread *td;
 	struct inode *ip;
 	struct ext2mount *ump = VFSTOEXT2(mp);
 	struct m_ext2fs *fs;
 	int error, allerror = 0;
 
 	td = curthread;
 	fs = ump->um_e2fs;
 	if (fs->e2fs_fmod != 0 && fs->e2fs_ronly != 0) {		/* XXX */
 		panic("ext2_sync: rofs mod fs=%s", fs->e2fs_fsmnt);
 	}
 
 	/*
 	 * Write back each (modified) inode.
 	 */
 loop:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		if (vp->v_type == VNON) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		ip = VTOI(vp);
 		/*
 		 * Skip vnodes whose inode has no pending flag and that
 		 * either have no dirty buffers or are being synced lazily.
 		 */
 		if ((ip->i_flag &
 		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
 		    (vp->v_bufobj.bo_dirty.bv_cnt == 0 ||
 		    waitfor == MNT_LAZY)) {
 			VI_UNLOCK(vp);
 			continue;
 		}
-		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td);
+		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK);
 		if (error) {
 			/* ENOENT restarts the scan; other errors skip the vnode */
 			if (error == ENOENT) {
 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 				goto loop;
 			}
 			continue;
 		}
 		if ((error = VOP_FSYNC(vp, waitfor, td)) != 0)
 			allerror = error;
 		VOP_UNLOCK(vp);
 		vrele(vp);
 	}
 
 	/*
 	 * Force stale filesystem control information to be flushed.
 	 */
 	if (waitfor != MNT_LAZY) {
 		vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
 		if ((error = VOP_FSYNC(ump->um_devvp, waitfor, td)) != 0)
 			allerror = error;
 		VOP_UNLOCK(ump->um_devvp);
 	}
 
 	/*
 	 * Write back modified superblock.
 	 * The modified flag is cleared and the write time stamped before
 	 * ext2_cgupdate() writes the superblock and group descriptors.
 	 */
 	if (fs->e2fs_fmod != 0) {
 		fs->e2fs_fmod = 0;
 		fs->e2fs->e2fs_wtime = htole32(time_second);
 		if ((error = ext2_cgupdate(ump, waitfor)) != 0)
 			allerror = error;
 	}
 	return (allerror);
 }
 
 /*
  * Look up an EXT2FS dinode number to find its incore vnode, otherwise read it
  * in from disk.  If it is in core, wait for the lock bit to clear, then
  * return the inode locked.  Detection and handling of mount points must be
  * done by the calling routine.
  *
  * On success 0 is returned and *vpp holds the vnode.  On failure *vpp is
  * set to NULL and the error from vfs_hash_get(), getnewvnode(),
  * insmntque(), vfs_hash_insert(), bread(), ext2_ei2i() or ext2_vinit()
  * is returned.
  */
 static int
 ext2_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
 {
 	struct m_ext2fs *fs;
 	struct inode *ip;
 	struct ext2mount *ump;
 	struct buf *bp;
 	struct vnode *vp;
 	struct thread *td;
 	unsigned int i, used_blocks;
 	int error;
 
 	td = curthread;
 	/* Fast path: the vnode is already in the hash. */
 	error = vfs_hash_get(mp, ino, flags, td, vpp, NULL, NULL);
 	if (error || *vpp != NULL)
 		return (error);
 
 	ump = VFSTOEXT2(mp);
 	ip = malloc(sizeof(struct inode), M_EXT2NODE, M_WAITOK | M_ZERO);
 
 	/* Allocate a new vnode/inode. */
 	if ((error = getnewvnode("ext2fs", mp, &ext2_vnodeops, &vp)) != 0) {
 		*vpp = NULL;
 		free(ip, M_EXT2NODE);
 		return (error);
 	}
 	vp->v_data = ip;
 	ip->i_vnode = vp;
 	ip->i_e2fs = fs = ump->um_e2fs;
 	ip->i_ump = ump;
 	ip->i_number = ino;
 
 	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
 	error = insmntque(vp, mp);
 	if (error != 0) {
 		free(ip, M_EXT2NODE);
 		*vpp = NULL;
 		return (error);
 	}
 	/* Another thread may have inserted the same inode in the meantime. */
 	error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL);
 	if (error || *vpp != NULL)
 		return (error);
 
 	/* Read in the disk contents for the inode, copy into the inode. */
 	bp = NULL;
 	if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
 	    (int)fs->e2fs_bsize, NOCRED, &bp)) != 0) {
 		/*
 		 * The inode does not contain anything useful, so it would
 		 * be misleading to leave it on its hash chain. With mode
 		 * still zero, it will be unlinked and returned to the free
 		 * list by vput().
 		 *
 		 * Every other bread() failure path in this file treats
 		 * the buffer as not held, so only release it if bread()
 		 * actually handed one back.
 		 */
 		if (bp != NULL)
 			brelse(bp);
 		vput(vp);
 		*vpp = NULL;
 		return (error);
 	}
 	/* convert ext2 inode to dinode */
 	error = ext2_ei2i((struct ext2fs_dinode *)((char *)bp->b_data +
 	    EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ino)), ip);
 	if (error) {
 		brelse(bp);
 		vput(vp);
 		*vpp = NULL;
 		return (error);
 	}
 	ip->i_block_group = ino_to_cg(fs, ino);
 	ip->i_next_alloc_block = 0;
 	ip->i_next_alloc_goal = 0;
 
 	/*
 	 * Now we want to make sure that block pointers for unused
 	 * blocks are zeroed out - ext2_balloc depends on this
 	 * although for regular files and directories only
 	 *
 	 * If IN_E4EXTENTS is enabled, unused blocks are not zeroed
 	 * out because we could corrupt the extent tree.
 	 */
 	if (!(ip->i_flag & IN_E4EXTENTS) &&
 	    (S_ISDIR(ip->i_mode) || S_ISREG(ip->i_mode))) {
 		used_blocks = howmany(ip->i_size, fs->e2fs_bsize);
 		for (i = used_blocks; i < EXT2_NDIR_BLOCKS; i++)
 			ip->i_db[i] = 0;
 	}
 #ifdef EXT2FS_PRINT_EXTENTS
 	ext2_print_inode(ip);
 	ext4_ext_print_extent_tree_status(ip);
 #endif
 	bqrelse(bp);
 
 	/*
 	 * Initialize the vnode from the inode, check for aliases.
 	 * Note that the underlying vnode may have changed.
 	 */
 	if ((error = ext2_vinit(mp, &ext2_fifoops, &vp)) != 0) {
 		vput(vp);
 		*vpp = NULL;
 		return (error);
 	}
 
 	/*
 	 * Finish inode initialization.
 	 */
 
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * File handle to vnode
  *
  * Have to be really careful about stale file handles:
  * - check that the inode number is valid
  * - call VFS_VGET() to get the locked inode
  * - check for an unallocated inode (i_mode == 0), a generation
  *   mismatch or a link count that is not positive
  *
  * Returns 0 with the vnode in *vpp, ESTALE for a stale handle, or the
  * error from VFS_VGET() (with *vpp set to NULLVP).
  */
 static int
 ext2_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
 {
 	struct ufid *ufhp = (struct ufid *)fhp;
 	struct m_ext2fs *fs = VFSTOEXT2(mp)->um_e2fs;
 	struct vnode *nvp;
 	struct inode *ip;
 	int error;
 
 	/* The inode number must lie in [EXT2_ROOTINO, gcount * ipg]. */
 	if (ufhp->ufid_ino < EXT2_ROOTINO)
 		return (ESTALE);
 	if (ufhp->ufid_ino > fs->e2fs_gcount * fs->e2fs_ipg)
 		return (ESTALE);
 
 	error = VFS_VGET(mp, ufhp->ufid_ino, LK_EXCLUSIVE, &nvp);
 	if (error != 0) {
 		*vpp = NULLVP;
 		return (error);
 	}
 
 	ip = VTOI(nvp);
 	if (ip->i_mode != 0 && ip->i_gen == ufhp->ufid_gen &&
 	    ip->i_nlink > 0) {
 		/* The handle still names this inode: hand the vnode out. */
 		*vpp = nvp;
 		vnode_create_vobject(*vpp, 0, curthread);
 		return (0);
 	}
 
 	/* Unallocated, recycled or unlinked inode: the handle is stale. */
 	vput(nvp);
 	*vpp = NULLVP;
 	return (ESTALE);
 }
 
 /*
  * Write a superblock and associated information back to disk.
  *
  * The in-core counters are first folded back into the on-disk layout
  * (little-endian, split into low/high halves with the 64BIT feature),
  * the checksum is refreshed when METADATA_CKSUM is on, and the result
  * is written to block SBLOCK.  With MNT_WAIT the write is synchronous
  * and its status is returned; otherwise it is queued and 0 is returned.
  */
 static int
 ext2_sbupdate(struct ext2mount *mp, int waitfor)
 {
 	struct m_ext2fs *fs;
 	struct ext2fs *es;
 	struct buf *bp;
 
 	fs = mp->um_e2fs;
 	es = fs->e2fs;
 
 	es->e2fs_ficount = htole32(fs->e2fs_ficount);
 
 	es->e2fs_bcount = htole32(fs->e2fs_bcount & 0xffffffff);
 	es->e2fs_rbcount = htole32(fs->e2fs_rbcount & 0xffffffff);
 	es->e2fs_fbcount = htole32(fs->e2fs_fbcount & 0xffffffff);
 	if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) {
 		es->e4fs_bcount_hi = htole32(fs->e2fs_bcount >> 32);
 		es->e4fs_rbcount_hi = htole32(fs->e2fs_rbcount >> 32);
 		es->e4fs_fbcount_hi = htole32(fs->e2fs_fbcount >> 32);
 	}
 
 	/* The checksum has to cover the counters stored just above. */
 	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM))
 		ext2_sb_csum_set(fs);
 
 	bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0, 0);
 	bcopy((caddr_t)es, bp->b_data, (u_int)sizeof(struct ext2fs));
 
 	/*
 	 * The buffers for group descriptors, inode bitmaps and block bitmaps
 	 * are not busy at this point and are (hopefully) written by the
 	 * usual sync mechanism. No need to write them here.
 	 */
 	if (waitfor != MNT_WAIT) {
 		bawrite(bp);
 		return (0);
 	}
 	return (bwrite(bp));
 }
 /*
  * Write the superblock and every group descriptor block back to disk.
  *
  * The superblock goes out first through ext2_sbupdate(); the group
  * descriptor checksums are then refreshed (when a checksum feature is
  * enabled) and each descriptor block is rebuilt from the in-core
  * fs->e2fs_gd array and written.  With MNT_WAIT the writes are
  * synchronous, otherwise they are queued with bawrite().
  *
  * Returns the first error encountered: the superblock write error if
  * there was one, otherwise the first failing descriptor block write,
  * otherwise 0.
  */
 int
 ext2_cgupdate(struct ext2mount *mp, int waitfor)
 {
 	struct m_ext2fs *fs = mp->um_e2fs;
 	struct buf *bp;
 	int i, j, g_count = 0, error, allerror;
 
 	allerror = ext2_sbupdate(mp, waitfor);
 
 	/* Update gd csums */
 	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) ||
 	    EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM))
 		ext2_gd_csum_set(fs);
 
 	for (i = 0; i < fs->e2fs_gdbcount; i++) {
 		bp = getblk(mp->um_devvp, fsbtodb(fs,
 		    ext2_cg_location(fs, i)),
 		    fs->e2fs_bsize, 0, 0, 0);
 		if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) {
 			/* 64bit: the block is copied out as a whole. */
 			memcpy(bp->b_data, &fs->e2fs_gd[
 			    i * fs->e2fs_bsize / sizeof(struct ext2_gd)],
 			    fs->e2fs_bsize);
 		} else {
 			/*
 			 * Otherwise each in-core descriptor is packed into
 			 * its E2FS_REV0_GD_SIZE on-disk slot; g_count runs
 			 * across blocks and stops at e2fs_gcount.
 			 */
 			for (j = 0; j < fs->e2fs_bsize / E2FS_REV0_GD_SIZE &&
 			    g_count < fs->e2fs_gcount; j++, g_count++)
 				memcpy(bp->b_data + j * E2FS_REV0_GD_SIZE,
 				    &fs->e2fs_gd[g_count], E2FS_REV0_GD_SIZE);
 		}
 		if (waitfor == MNT_WAIT) {
 			/*
 			 * Record the failure right away so that a later
 			 * successful write cannot hide it.
 			 */
 			error = bwrite(bp);
 			if (error != 0 && allerror == 0)
 				allerror = error;
 		} else
 			bawrite(bp);
 	}
 
 	return (allerror);
 }
 
 /*
  * Return the root of a filesystem.
  */
 static int
 ext2_root(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct vnode *nvp;
 	int error;
 
 	error = VFS_VGET(mp, EXT2_ROOTINO, LK_EXCLUSIVE, &nvp);
 	if (error)
 		return (error);
 	*vpp = nvp;
 	return (0);
 }
Index: projects/clang1100-import/sys/fs/fdescfs/fdesc_vfsops.c
===================================================================
--- projects/clang1100-import/sys/fs/fdescfs/fdesc_vfsops.c	(revision 364278)
+++ projects/clang1100-import/sys/fs/fdescfs/fdesc_vfsops.c	(revision 364279)
@@ -1,231 +1,231 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1992, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)fdesc_vfsops.c	8.4 (Berkeley) 1/21/94
  *
  * $FreeBSD$
  */
 
 /*
  * /dev/fd Filesystem
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/vnode.h>
 
 #include <fs/fdescfs/fdesc.h>
 
 static MALLOC_DEFINE(M_FDESCMNT, "fdesc_mount", "FDESC mount structure");
 
 static vfs_cmount_t	fdesc_cmount;
 static vfs_mount_t	fdesc_mount;
 static vfs_unmount_t	fdesc_unmount;
 static vfs_statfs_t	fdesc_statfs;
 static vfs_root_t	fdesc_root;
 
 /*
  * Compatibility shim for old mount(2) system call.
  *
  * 'data' is not used; the argument list and flags are handed to
  * kernel_mount() and its result is returned.
  */
 int
 fdesc_cmount(struct mntarg *ma, void *data, uint64_t flags)
 {
 	int error;
 
 	error = kernel_mount(ma, flags);
 	return (error);
 }
 
 /*
  * Mount the per-process file descriptors (/dev/fd)
  *
  * Allocates the per-mount structure, records the "linrdlnk" option,
  * creates the root vnode and gives the mount a new fsid.  Returns
  * EOPNOTSUPP for update and root mounts, the error from
  * fdesc_allocvp() if the root vnode cannot be created, and 0 otherwise.
  */
 static int
 fdesc_mount(struct mount *mp)
 {
 	struct fdescmount *fmp;
 	struct vnode *rvp;
 	int error;
 
 	/*
 	 * Neither updating an existing mount nor mounting as the root
 	 * filesystem is supported.
 	 */
 	if ((mp->mnt_flag & (MNT_UPDATE | MNT_ROOTFS)) != 0)
 		return (EOPNOTSUPP);
 
 	fmp = malloc(sizeof(*fmp), M_FDESCMNT, M_WAITOK);	/* XXX */
 
 	/*
 	 * We need to initialize a few bits of our local mount point struct to
 	 * avoid confusion in allocvp.
 	 */
 	mp->mnt_data = fmp;
 	fmp->flags = 0;
 	if (vfs_getopt(mp->mnt_optnew, "linrdlnk", NULL, NULL) == 0)
 		fmp->flags |= FMNT_LINRDLNKF;
 
 	error = fdesc_allocvp(Froot, -1, FD_ROOT, mp, &rvp);
 	if (error != 0) {
 		/* Undo the allocation and detach it from the mount. */
 		free(fmp, M_FDESCMNT);
 		mp->mnt_data = NULL;
 		return (error);
 	}
 
 	/* Turn the new vnode into the root directory of the mount. */
 	rvp->v_type = VDIR;
 	rvp->v_vflag |= VV_ROOT;
 	fmp->f_root = rvp;
 	VOP_UNLOCK(rvp);
 
 	/* XXX -- don't mark as local to work around fts() problems */
 	/*mp->mnt_flag |= MNT_LOCAL;*/
 	vfs_getnewfsid(mp);
 
 	vfs_mountedfrom(mp, "fdescfs");
 	return (0);
 }
 
 static int
 fdesc_unmount(struct mount *mp, int mntflags)
 {
 	struct fdescmount *fmp;
 	int error, flags;
 
 	flags = 0;
 	fmp = mp->mnt_data;
 	if (mntflags & MNT_FORCE) {
 		/* The hash mutex protects the private mount flags. */
 		mtx_lock(&fdesc_hashmtx);
 		fmp->flags |= FMNT_UNMOUNTF;
 		mtx_unlock(&fdesc_hashmtx);
 		flags |= FORCECLOSE;
 	}
 
 	/*
 	 * Clear out buffer cache.  I don't think we
 	 * ever get anything cached at this level at the
 	 * moment, but who knows...
 	 *
 	 * There is 1 extra root vnode reference corresponding
 	 * to f_root.
 	 */
 	if ((error = vflush(mp, 1, flags, curthread)) != 0)
 		return (error);
 
 	/*
 	 * Finally, throw away the fdescmount structure.
 	 */
 	mp->mnt_data = NULL;
 	free(fmp, M_FDESCMNT);
 	return (0);
 }
 
 /*
  * Return the root vnode of an fdescfs mount.
  *
  * The vnode saved in f_root at mount time is acquired with vget()
  * (exclusive lock, LK_RETRY) and stored in *vpp.  The result of vget()
  * is not checked; the function always returns 0.
  */
 static int
 fdesc_root(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct vnode *vp;
 
 	/*
 	 * Return locked reference to root.
 	 */
 	vp = VFSTOFDESC(mp)->f_root;
-	vget(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+	vget(vp, LK_EXCLUSIVE | LK_RETRY);
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Report filesystem statistics for fdescfs.
  *
  * The file counts are derived from the calling thread's process: the
  * descriptor limit (the smaller of RLIMIT_NOFILE and the racct limit)
  * and the number of unused descriptor slots.  Block counts are fixed.
  * Always returns 0.
  */
 static int
 fdesc_statfs(struct mount *mp, struct statfs *sbp)
 {
 	struct thread *td = curthread;
 	struct filedesc *fdp;
 	uint64_t racct_lim;
 	int fd, nfree, nofile, upper;
 
 	/*
 	 * Compute number of free file descriptors.
 	 * [ Strange results will ensue if the open file
 	 * limit is ever reduced below the current number
 	 * of open files... ]
 	 */
 	nofile = lim_cur(td, RLIMIT_NOFILE);
 	fdp = td->td_proc->p_fd;
 	FILEDESC_SLOCK(fdp);
 	racct_lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
 	if (nofile > racct_lim)
 		nofile = racct_lim;
 
 	/* Count the empty slots from the first free one up to the limit. */
 	upper = min(fdp->fd_nfiles, nofile);
 	nfree = 0;
 	for (fd = fdp->fd_freefile; fd < upper; fd++) {
 		if (fdp->fd_ofiles[fd].fde_file == NULL)
 			nfree++;
 	}
 
 	/*
 	 * Adjust for the fact that the fdesc array may not
 	 * have been fully allocated yet.
 	 */
 	if (fdp->fd_nfiles < nofile)
 		nfree += (nofile - fdp->fd_nfiles);
 	FILEDESC_SUNLOCK(fdp);
 
 	sbp->f_flags = 0;
 	sbp->f_bsize = DEV_BSIZE;
 	sbp->f_iosize = DEV_BSIZE;
 	sbp->f_blocks = 2;		/* 1K to keep df happy */
 	sbp->f_bfree = 0;
 	sbp->f_bavail = 0;
 	sbp->f_files = nofile + 1;	/* Allow for "." */
 	sbp->f_ffree = nfree;		/* See comments above */
 	return (0);
 }
 
 /*
  * VFS operations vector for fdescfs.  Only the operations listed here
  * are set by this initializer.
  */
 static struct vfsops fdesc_vfsops = {
 	.vfs_cmount =		fdesc_cmount,
 	.vfs_init =		fdesc_init,
 	.vfs_mount =		fdesc_mount,
 	.vfs_root =		fdesc_root,
 	.vfs_statfs =		fdesc_statfs,
 	.vfs_uninit =		fdesc_uninit,
 	.vfs_unmount =		fdesc_unmount,
 };
 
 /* Register the vector as "fdescfs" with VFCF_SYNTHETIC | VFCF_JAIL. */
 VFS_SET(fdesc_vfsops, fdescfs, VFCF_SYNTHETIC | VFCF_JAIL);
Index: projects/clang1100-import/sys/fs/fdescfs/fdesc_vnops.c
===================================================================
--- projects/clang1100-import/sys/fs/fdescfs/fdesc_vnops.c	(revision 364278)
+++ projects/clang1100-import/sys/fs/fdescfs/fdesc_vnops.c	(revision 364279)
@@ -1,661 +1,661 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)fdesc_vnops.c	8.9 (Berkeley) 1/21/94
  *
  * $FreeBSD$
  */
 
 /*
  * /dev/fd Filesystem
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>	/* boottime */
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/file.h>	/* Must come after sys/malloc.h */
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #include <fs/fdescfs/fdesc.h>
 
 /* Element-count hint passed to hashinit() in fdesc_init(). */
 #define	NFDCACHE 4
 /* Map a node index to its hash chain by masking the index with fdhash. */
 #define FD_NHASH(ix) \
 	(&fdhashtbl[(ix) & fdhash])
 /* Hash table of fdescnodes and its mask; both are set up by fdesc_init(). */
 static LIST_HEAD(fdhashhead, fdescnode) *fdhashtbl;
 static u_long fdhash;
 
 /*
  * Mutex taken by fdesc_allocvp() and fdesc_remove_entry() around their
  * accesses to the hash chains.
  */
 struct mtx fdesc_hashmtx;
 
 /* Forward declarations of the vnode operations implemented in this file. */
 static vop_getattr_t	fdesc_getattr;
 static vop_lookup_t	fdesc_lookup;
 static vop_open_t	fdesc_open;
 static vop_pathconf_t	fdesc_pathconf;
 static vop_readdir_t	fdesc_readdir;
 static vop_readlink_t	fdesc_readlink;
 static vop_reclaim_t	fdesc_reclaim;
 static vop_setattr_t	fdesc_setattr;
 
 /*
  * Vnode operations vector for fdescfs vnodes.  Operations not listed fall
  * back to default_vnodeops through .vop_default; access checks are wired
  * to VOP_NULL.
  */
 static struct vop_vector fdesc_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		VOP_NULL,
 	.vop_getattr =		fdesc_getattr,
 	.vop_lookup =		fdesc_lookup,
 	.vop_open =		fdesc_open,
 	.vop_pathconf =		fdesc_pathconf,
 	.vop_readdir =		fdesc_readdir,
 	.vop_readlink =		fdesc_readlink,
 	.vop_reclaim =		fdesc_reclaim,
 	.vop_setattr =		fdesc_setattr,
 };
 VFS_VOP_VECTOR_REGISTER(fdesc_vnodeops);
 
 /* Private helpers defined further down in this file. */
 static void fdesc_insmntque_dtr(struct vnode *, void *);
 static void fdesc_remove_entry(struct fdescnode *);
 
 /*
  * Initialise cache headers: set up fdesc_hashmtx and allocate the vnode
  * hash table.  hashinit() fills in fdhash, which FD_NHASH() uses as the
  * bucket mask.  vfsp is not used.  Always returns 0.
  */
 int
 fdesc_init(struct vfsconf *vfsp)
 {
 
 	mtx_init(&fdesc_hashmtx, "fdescfs_hash", NULL, MTX_DEF);
 	fdhashtbl = hashinit(NFDCACHE, M_CACHE, &fdhash);
 	return (0);
 }
 
 /*
  * Uninit ready for unload: release the hash table allocated in
  * fdesc_init() and destroy the hash mutex.  vfsp is not used.
  * Always returns 0.
  */
 int
 fdesc_uninit(struct vfsconf *vfsp)
 {
 
 	hashdestroy(fdhashtbl, M_CACHE, fdhash);
 	mtx_destroy(&fdesc_hashmtx);
 	return (0);
 }
 
 /*
  * If allocating vnode fails, call this.
  *
  * Passed to insmntque1() by fdesc_allocvp() as the destructor callback:
  * the vnode is torn down with vgone() and then released with vput().
  * arg is not used.
  */
 static void
 fdesc_insmntque_dtr(struct vnode *vp, void *arg)
 {
 
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * Remove an entry from the hash if it exists.
  *
  * The chain selected by fd->fd_ix is searched under fdesc_hashmtx and fd
  * is unlinked only when it is actually found there, so calling this for a
  * node that was never inserted is harmless.
  */
 static void
 fdesc_remove_entry(struct fdescnode *fd)
 {
 	struct fdhashhead *fc;
 	struct fdescnode *fd2;
 
 	fc = FD_NHASH(fd->fd_ix);
 	mtx_lock(&fdesc_hashmtx);
 	LIST_FOREACH(fd2, fc, fd_hash) {
 		if (fd == fd2) {
 			LIST_REMOVE(fd, fd_hash);
 			break;
 		}
 	}
 	mtx_unlock(&fdesc_hashmtx);
 }
 
 /*
  * Find or create the fdescfs vnode with node index ix on mount mp.
  *
  * An existing hashed node for (ix, mp) is returned after vget() with
  * LK_EXCLUSIVE; if that vget() fails the whole lookup is retried.
  * Otherwise a new vnode and fdescnode are allocated, the vnode is locked
  * exclusively, attached to the mount with insmntque1() and inserted into
  * the hash, unless another thread inserted the same node in the meantime,
  * in which case the new vnode is discarded and the other one is used.
  *
  * Returns 0 with *vpp set on success; -1 when mp->mnt_data is NULL or the
  * mount has FMNT_UNMOUNTF set; otherwise the error from getnewvnode(),
  * insmntque1() or the second vget().  *vpp is left untouched on the first
  * -1 return and on getnewvnode() failure; the remaining failure paths
  * store NULLVP.
  */
 int
 fdesc_allocvp(fdntype ftype, unsigned fd_fd, int ix, struct mount *mp,
     struct vnode **vpp)
 {
 	struct fdescmount *fmp;
 	struct fdhashhead *fc;
 	struct fdescnode *fd, *fd2;
 	struct vnode *vp, *vp2;
 	struct thread *td;
 	int error;
 
 	/*
 	 * NOTE(review): td is only referenced by the vget() calls in their
 	 * old three-argument form; with the two-argument form it appears to
 	 * be set but unused in this function.
 	 */
 	td = curthread;
 	fc = FD_NHASH(ix);
 loop:
 	mtx_lock(&fdesc_hashmtx);
 	/*
 	 * If a forced unmount is progressing, we need to drop it. The flags are
 	 * protected by the hashmtx.
 	 */
 	fmp = mp->mnt_data;
 	if (fmp == NULL || fmp->flags & FMNT_UNMOUNTF) {
 		mtx_unlock(&fdesc_hashmtx);
 		return (-1);
 	}
 
 	LIST_FOREACH(fd, fc, fd_hash) {
 		if (fd->fd_ix == ix && fd->fd_vnode->v_mount == mp) {
 			/* Get reference to vnode in case it's being free'd */
 			vp = fd->fd_vnode;
 			/*
 			 * Take the vnode interlock before dropping the hash
 			 * mutex; LK_INTERLOCK hands it over to vget().
 			 */
 			VI_LOCK(vp);
 			mtx_unlock(&fdesc_hashmtx);
-			if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td))
+			if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK))
 				goto loop;
 			*vpp = vp;
 			return (0);
 		}
 	}
 	mtx_unlock(&fdesc_hashmtx);
 
 	/* No hashed node: build a fresh fdescnode and vnode pair. */
 	fd = malloc(sizeof(struct fdescnode), M_TEMP, M_WAITOK);
 
 	error = getnewvnode("fdescfs", mp, &fdesc_vnodeops, &vp);
 	if (error) {
 		free(fd, M_TEMP);
 		return (error);
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	vp->v_data = fd;
 	fd->fd_vnode = vp;
 	fd->fd_type = ftype;
 	fd->fd_fd = fd_fd;
 	fd->fd_ix = ix;
 	/*
 	 * NOTE(review): fmp was sampled under fdesc_hashmtx above but is
 	 * dereferenced here after the mutex has been dropped.
 	 */
 	if (ftype == Fdesc && fmp->flags & FMNT_LINRDLNKF)
 		vp->v_vflag |= VV_READLINK;
 	/*
 	 * fd is not freed explicitly on failure here; presumably insmntque1()
 	 * runs fdesc_insmntque_dtr(), whose vgone() leads to fdesc_reclaim()
 	 * freeing v_data -- confirm.
 	 */
 	error = insmntque1(vp, mp, fdesc_insmntque_dtr, NULL);
 	if (error != 0) {
 		*vpp = NULLVP;
 		return (error);
 	}
 
 	/* Make sure that someone didn't beat us when inserting the vnode. */
 	mtx_lock(&fdesc_hashmtx);
 	/*
 	 * If a forced unmount is progressing, we need to drop it. The flags are
 	 * protected by the hashmtx.
 	 */
 	fmp = mp->mnt_data;
 	if (fmp == NULL || fmp->flags & FMNT_UNMOUNTF) {
 		mtx_unlock(&fdesc_hashmtx);
 		vgone(vp);
 		vput(vp);
 		*vpp = NULLVP;
 		return (-1);
 	}
 
 	LIST_FOREACH(fd2, fc, fd_hash) {
 		if (fd2->fd_ix == ix && fd2->fd_vnode->v_mount == mp) {
 			/* Get reference to vnode in case it's being free'd */
 			vp2 = fd2->fd_vnode;
 			VI_LOCK(vp2);
 			mtx_unlock(&fdesc_hashmtx);
-			error = vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK, td);
+			error = vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK);
 			/* Someone beat us, dec use count and wait for reclaim */
 			vgone(vp);
 			vput(vp);
 			/* If we didn't get it, return no vnode. */
 			if (error)
 				vp2 = NULLVP;
 			*vpp = vp2;
 			return (error);
 		}
 	}
 
 	/* If we came here, we can insert it safely. */
 	LIST_INSERT_HEAD(fc, fd, fd_hash);
 	mtx_unlock(&fdesc_hashmtx);
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Argument bundle handed through vn_vget_ino_gen() to
  * fdesc_get_ino_alloc() by fdesc_lookup().
  */
 struct fdesc_get_ino_args {
 	fdntype ftype;		/* node type passed to fdesc_allocvp() */
 	unsigned fd_fd;		/* descriptor number the node stands for */
 	int ix;			/* node index used as the hash key */
 	struct file *fp;	/* file reference dropped by the callback */
 	struct thread *td;	/* thread passed to fdrop() */
 };
 
 /*
  * Allocation callback given to vn_vget_ino_gen() by fdesc_lookup().
  *
  * arg points to a struct fdesc_get_ino_args.  The vnode is obtained with
  * fdesc_allocvp() and stored in *rvp; the file reference carried in the
  * arguments is dropped afterwards regardless of the outcome.  lkflags is
  * not used.  Returns the result of fdesc_allocvp().
  */
 static int
 fdesc_get_ino_alloc(struct mount *mp, void *arg, int lkflags,
     struct vnode **rvp)
 {
 	struct fdesc_get_ino_args *a;
 	int error;
 
 	a = arg;
 	error = fdesc_allocvp(a->ftype, a->fd_fd, a->ix, mp, rvp);
 	fdrop(a->fp, a->td);
 	return (error);
 }
 
 
 /*
  * vp is the current namei directory
  * ndp is the name to locate in that directory...
  *
  * Lookup operation.  "." resolves to dvp itself.  Any other name must be
  * a decimal descriptor number without a leading zero (except "0" itself)
  * that names an open descriptor of the calling thread; the matching
  * Fdesc vnode is returned in *ap->a_vpp.
  *
  * Returns 0 on success.  On failure *ap->a_vpp is set to NULL and the
  * result is EROFS (DELETE/RENAME of the last component), ENOTDIR (dvp is
  * not the root node), ENOENT (malformed name, or dvp doomed while
  * unlocked), or the error from fget() or vn_vget_ino_gen().
  */
 static int
 fdesc_lookup(struct vop_lookup_args *ap)
 {
 	struct vnode **vpp = ap->a_vpp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	char *pname = cnp->cn_nameptr;
 	struct thread *td = cnp->cn_thread;
 	struct file *fp;
 	struct fdesc_get_ino_args arg;
 	int nlen = cnp->cn_namelen;
 	u_int fd, fd1;
 	int error;
 	struct vnode *fvp;
 
 	if ((cnp->cn_flags & ISLASTCN) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		error = EROFS;
 		goto bad;
 	}
 
 	if (cnp->cn_namelen == 1 && *pname == '.') {
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	if (VTOFDESC(dvp)->fd_type != Froot) {
 		error = ENOTDIR;
 		goto bad;
 	}
 
 	fd = 0;
 	/* the only time a leading 0 is acceptable is if it's "0" */
 	if (*pname == '0' && nlen != 1) {
 		error = ENOENT;
 		goto bad;
 	}
 	/* Parse the name as an unsigned decimal number. */
 	while (nlen--) {
 		if (*pname < '0' || *pname > '9') {
 			error = ENOENT;
 			goto bad;
 		}
 		fd1 = 10 * fd + *pname++ - '0';
 		/* Reject the name when the accumulated value wrapped below fd. */
 		if (fd1 < fd) {
 			error = ENOENT;
 			goto bad;
 		}
 		fd = fd1;
 	}
 
 	/*
 	 * No rights to check since 'fp' isn't actually used.
 	 */
 	if ((error = fget(td, fd, &cap_no_rights, &fp)) != 0)
 		goto bad;
 
 	/* Check if we're looking up ourselves. */
 	if (VTOFDESC(dvp)->fd_ix == FD_DESC + fd) {
 		/*
 		 * In case we're holding the last reference to the file, the dvp
 		 * will be re-acquired.
 		 */
 		vhold(dvp);
 		VOP_UNLOCK(dvp);
 		fdrop(fp, td);
 
 		/* Re-acquire the lock afterwards. */
 		vn_lock(dvp, LK_RETRY | LK_EXCLUSIVE);
 		vdrop(dvp);
 		fvp = dvp;
 		/* dvp may have been doomed while it was unlocked. */
 		if (VN_IS_DOOMED(dvp))
 			error = ENOENT;
 	} else {
 		/*
 		 * Unlock our root node (dvp) when doing this, since we might
 		 * deadlock since the vnode might be locked by another thread
 		 * and the root vnode lock will be obtained afterwards (in case
 		 * we're looking up the fd of the root vnode), which will be the
 		 * opposite lock order. Vhold the root vnode first so we don't
 		 * lose it.
 		 */
 		/* fp is handed to the callback, which drops the reference. */
 		arg.ftype = Fdesc;
 		arg.fd_fd = fd;
 		arg.ix = FD_DESC + fd;
 		arg.fp = fp;
 		arg.td = td;
 		error = vn_vget_ino_gen(dvp, fdesc_get_ino_alloc, &arg,
 		    LK_EXCLUSIVE, &fvp);
 	}
 
 	if (error)
 		goto bad;
 	*vpp = fvp;
 	return (0);
 
 bad:
 	*vpp = NULL;
 	return (error);
 }
 
 /*
  * Open operation.  Opening the root directory succeeds (returns 0).  For
  * a descriptor node the descriptor number is recorded in the opening
  * thread's td_dupfd and ENODEV is returned.
  */
 static int
 fdesc_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (VTOFDESC(vp)->fd_type == Froot)
 		return (0);
 
 	/*
 	 * XXX Kludge: set td->td_dupfd to contain the value of the file
 	 * descriptor being sought for duplication. The error return ensures
 	 * that the vnode for this device will be released by vn_open. Open
 	 * will detect this special error and take the actions in dupfdopen.
 	 * Other callers of vn_open or VOP_OPEN will simply report the
 	 * error.
 	 */
 	ap->a_td->td_dupfd = VTOFDESC(vp)->fd_fd;	/* XXX */
 	return (ENODEV);
 }
 
 /*
  * Pathconf operation.
  *
  * _PC_NAME_MAX and _PC_LINK_MAX are answered locally (link count 2 for
  * the root node, 1 otherwise).  Any other name is answered by
  * vop_stdpathconf() for the root node, and for a descriptor node by
  * kern_fpathconf() on the descriptor the node stands for.
  */
 static int
 fdesc_pathconf(struct vop_pathconf_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int error;
 
 	switch (ap->a_name) {
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		return (0);
 	case _PC_LINK_MAX:
 		if (VTOFDESC(vp)->fd_type == Froot)
 			*ap->a_retval = 2;
 		else
 			*ap->a_retval = 1;
 		return (0);
 	default:
 		if (VTOFDESC(vp)->fd_type == Froot)
 			return (vop_stdpathconf(ap));
 		/*
 		 * Keep a reference while the vnode is unlocked around the
 		 * call on the underlying descriptor.  The lock is re-taken
 		 * as LK_SHARED regardless of the mode held on entry.
 		 */
 		vref(vp);
 		VOP_UNLOCK(vp);
 		error = kern_fpathconf(curthread, VTOFDESC(vp)->fd_fd,
 		    ap->a_name, ap->a_retval);
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		vunref(vp);
 		return (error);
 	}
 }
 
 /*
  * Getattr operation: synthesize attributes for an fdescfs vnode.
  *
  * Ownership is root:root, mode is r-xr-xr-x, all timestamps are the boot
  * time and the file id is the node index.  The root node reports as a
  * directory; a descriptor node reports as a character device, or as a
  * symbolic link when VV_READLINK is set on the vnode.  As a side effect
  * vp->v_type is updated to the reported type.  Always returns 0; an
  * unknown node type panics.
  */
 static int
 fdesc_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct timeval boottime;
 
 	getboottime(&boottime);
 	vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
 	vap->va_fileid = VTOFDESC(vp)->fd_ix;
 	vap->va_uid = 0;
 	vap->va_gid = 0;
 	vap->va_blocksize = DEV_BSIZE;
 	vap->va_atime.tv_sec = boottime.tv_sec;
 	vap->va_atime.tv_nsec = 0;
 	vap->va_mtime = vap->va_atime;
 	vap->va_ctime = vap->va_mtime;
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_bytes = 0;
 	vap->va_filerev = 0;
 
 	switch (VTOFDESC(vp)->fd_type) {
 	case Froot:
 		vap->va_type = VDIR;
 		vap->va_nlink = 2;
 		vap->va_size = DEV_BSIZE;
 		vap->va_rdev = NODEV;
 		break;
 
 	case Fdesc:
 		vap->va_type = (vp->v_vflag & VV_READLINK) == 0 ? VCHR : VLNK;
 		vap->va_nlink = 1;
 		vap->va_size = 0;
 		vap->va_rdev = makedev(0, vap->va_fileid);
 		break;
 
 	default:
 		panic("fdesc_getattr");
 		break;
 	}
 
 	/* Keep the vnode's own type in step with what was just reported. */
 	vp->v_type = vap->va_type;
 	return (0);
 }
 
 /*
  * Setattr operation: forward the request to the vnode behind the
  * descriptor that this node stands for.
  *
  * Returns EACCES for the root node.  When the descriptor is not backed by
  * a vnode (getvnode() returns EINVAL) the request is silently accepted,
  * except that an attempt to change flags yields EOPNOTSUPP.  Otherwise
  * returns the result of vn_start_write() or VOP_SETATTR() on the
  * underlying vnode, or the getvnode() error.
  */
 static int
 fdesc_setattr(struct vop_setattr_args *ap)
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode *vp;
 	struct mount *mp;
 	struct file *fp;
 	struct thread *td = curthread;
 	cap_rights_t rights;
 	unsigned fd;
 	int error;
 
 	/*
 	 * Can't mess with the root vnode
 	 */
 	if (VTOFDESC(ap->a_vp)->fd_type == Froot)
 		return (EACCES);
 
 	fd = VTOFDESC(ap->a_vp)->fd_fd;
 
 	/*
 	 * Allow setattr where there is an underlying vnode.
 	 */
 	error = getvnode(td, fd,
 	    cap_rights_init(&rights, CAP_EXTATTR_SET), &fp);
 	if (error) {
 		/*
 		 * getvnode() returns EINVAL if the file descriptor is not
 		 * backed by a vnode.  Silently drop all changes except
 		 * chflags(2) in this case.
 		 */
 		if (error == EINVAL) {
 			if (vap->va_flags != VNOVAL)
 				error = EOPNOTSUPP;
 			else
 				error = 0;
 		}
 		return (error);
 	}
 	/* From here on vp is the underlying vnode, not the fdescfs one. */
 	vp = fp->f_vnode;
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) == 0) {
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_SETATTR(vp, ap->a_vap, ap->a_cred);
 		VOP_UNLOCK(vp);
 		vn_finished_write(mp);
 	}
 	fdrop(fp, td);
 	return (error);
 }
 
 /* Fixed record size used for every directory entry emitted below. */
 #define UIO_MX _GENERIC_DIRLEN(10) /* number of symbols in INT_MAX printout */
 
 /*
  * Readdir operation for the root directory.
  *
  * Every entry occupies exactly UIO_MX bytes, so the uio offset divided by
  * UIO_MX is the entry index: 0 is ".", 1 is "..", and index i >= 2 is
  * descriptor i - 2 of the process that owns the uio.  Unused descriptor
  * slots produce no entry.
  *
  * Returns EINVAL when the offset is negative, does not fit in an int, is
  * not a multiple of UIO_MX, or when less than UIO_MX bytes of buffer
  * remain; otherwise 0 or the error from uiomove().  No cookies are
  * produced (*a_ncookies is set to 0 when requested).  Panics when called
  * on a vnode that is not the root node.
  */
 static int
 fdesc_readdir(struct vop_readdir_args *ap)
 {
 	struct fdescmount *fmp;
 	struct uio *uio = ap->a_uio;
 	struct filedesc *fdp;
 	struct dirent d;
 	struct dirent *dp = &d;
 	int error, i, off, fcnt;
 
 	if (VTOFDESC(ap->a_vp)->fd_type != Froot)
 		panic("fdesc_readdir: not dir");
 
 	fmp = VFSTOFDESC(ap->a_vp->v_mount);
 	if (ap->a_ncookies != NULL)
 		*ap->a_ncookies = 0;
 
 	off = (int)uio->uio_offset;
 	if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 ||
 	    uio->uio_resid < UIO_MX)
 		return (EINVAL);
 	i = (u_int)off / UIO_MX;
 	fdp = uio->uio_td->td_proc->p_fd;
 	error = 0;
 
 	fcnt = i - 2;		/* The first two nodes are `.' and `..' */
 
 	FILEDESC_SLOCK(fdp);
 	while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) {
 		/* A zeroed d_namlen marks "nothing to emit for this index". */
 		bzero((caddr_t)dp, UIO_MX);
 		switch (i) {
 		case 0:	/* `.' */
 		case 1: /* `..' */
 			dp->d_fileno = i + FD_ROOT;
 			dp->d_namlen = i + 1;
 			dp->d_reclen = UIO_MX;
 			/* Copies one dot for index 0 and two dots for index 1. */
 			bcopy("..", dp->d_name, dp->d_namlen);
 			dp->d_type = DT_DIR;
 			dirent_terminate(dp);
 			break;
 		default:
 			if (fdp->fd_ofiles[fcnt].fde_file == NULL)
 				break;
 			dp->d_namlen = sprintf(dp->d_name, "%d", fcnt);
 			dp->d_reclen = UIO_MX;
 			dp->d_type = (fmp->flags & FMNT_LINRDLNKF) == 0 ?
 			    DT_CHR : DT_LNK;
 			/*
 			 * NOTE(review): i equals fcnt + 2 here, whereas
 			 * fdesc_lookup() uses FD_DESC + fd as the node index
 			 * that fdesc_getattr() reports as the file id --
 			 * confirm that the difference is intended.
 			 */
 			dp->d_fileno = i + FD_DESC;
 			dirent_terminate(dp);
 			break;
 		}
 		/* NOTE: d_off is the offset of the *next* entry. */
 		dp->d_off = UIO_MX * (i + 1);
 		if (dp->d_namlen != 0) {
 			/*
 			 * And ship to userland
 			 */
 			/* The descriptor table lock is not held across uiomove(). */
 			FILEDESC_SUNLOCK(fdp);
 			error = uiomove(dp, UIO_MX, uio);
 			if (error)
 				goto done;
 			FILEDESC_SLOCK(fdp);
 		}
 		i++;
 		fcnt++;
 	}
 	FILEDESC_SUNLOCK(fdp);
 
 done:
 	/* Record how far the scan got so the next call resumes from there. */
 	uio->uio_offset = i * UIO_MX;
 	return (error);
 }
 
 /*
  * Reclaim operation: unhook the node from the hash (if it is there), free
  * the fdescnode stored in v_data and clear the pointer.
  * Always returns 0.
  */
 static int
 fdesc_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp;
 	struct fdescnode *fd;
 
  	vp = ap->a_vp;
  	fd = VTOFDESC(vp);
 	fdesc_remove_entry(fd);
 	free(vp->v_data, M_TEMP);
 	vp->v_data = NULL;
 	return (0);
 }
 
 /*
  * Readlink operation for a descriptor node.
  *
  * For a descriptor backed by a vnode, the path produced by vn_fullpath()
  * is copied to the caller's uio; for any other file type the fixed string
  * "anon_inode:[unknown]" is used.  The text is copied without a
  * terminating NUL.
  *
  * The vnode is unlocked for the duration of the call and re-locked in the
  * mode it was found in before returning.  Returns 0 or the error from
  * fget_cap(), vn_fullpath() or uiomove().  Panics when called on a node
  * that is not of type Fdesc.
  */
 static int
 fdesc_readlink(struct vop_readlink_args *va)
 {
 	struct vnode *vp, *vn;
 	struct thread *td;
 	struct uio *uio;
 	struct file *fp;
 	char *freepath, *fullpath;
 	size_t pathlen;
 	int lockflags, fd_fd;
 	int error;
 
 	freepath = NULL;
 	vn = va->a_vp;
 	if (VTOFDESC(vn)->fd_type != Fdesc)
 		panic("fdesc_readlink: not fdescfs link");
 	fd_fd = ((struct fdescnode *)vn->v_data)->fd_fd;
 	/* Remember the current lock mode so it can be restored at "out". */
 	lockflags = VOP_ISLOCKED(vn);
 	VOP_UNLOCK(vn);
 
 	td = curthread;
 	error = fget_cap(td, fd_fd, &cap_no_rights, &fp, NULL);
 	if (error != 0)
 		goto out;
 
 	switch (fp->f_type) {
 	case DTYPE_VNODE:
 		vp = fp->f_vnode;
 		error = vn_fullpath(td, vp, &fullpath, &freepath);
 		break;
 	default:
 		/* error is still 0 here from the successful fget_cap(). */
 		fullpath = "anon_inode:[unknown]";
 		break;
 	}
 	if (error == 0) {
 		uio = va->a_uio;
 		pathlen = strlen(fullpath);
 		error = uiomove(fullpath, pathlen, uio);
 	}
 	/* freepath is only non-NULL if vn_fullpath() stored a buffer in it. */
 	if (freepath != NULL)
 		free(freepath, M_TEMP);
 	fdrop(fp, td);
 
 out:
 	vn_lock(vn, lockflags | LK_RETRY);
 	return (error);
 }
Index: projects/clang1100-import/sys/fs/fuse/fuse_vfsops.c
===================================================================
--- projects/clang1100-import/sys/fs/fuse/fuse_vfsops.c	(revision 364278)
+++ projects/clang1100-import/sys/fs/fuse/fuse_vfsops.c	(revision 364279)
@@ -1,695 +1,695 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Google Inc. and Amit Singh
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of Google Inc. nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * Copyright (C) 2005 Csaba Henk.
  * All rights reserved.
  *
  * Copyright (c) 2019 The FreeBSD Foundation
  *
  * Portions of this software were developed by BFF Storage Systems, LLC under
  * sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/buf.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/filedesc.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/mount.h>
 #include <sys/sysctl.h>
 #include <sys/fcntl.h>
 
 #include "fuse.h"
 #include "fuse_node.h"
 #include "fuse_ipc.h"
 #include "fuse_internal.h"
 
 #include <sys/priv.h>
 #include <security/mac/mac_framework.h>
 
 SDT_PROVIDER_DECLARE(fusefs);
 /*
  * Fuse trace probe:
  * arg0: verbosity.  Higher numbers give more verbose messages
  * arg1: Textual message
  */
 SDT_PROBE_DEFINE2(fusefs, , vfsops, trace, "int", "char*");
 
 /*
  * This will do for privilege types for now: each FUSE-specific privilege
  * falls back to PRIV_VFS_MOUNT_NONUSER unless it is already defined.
  */
 #ifndef PRIV_VFS_FUSE_ALLOWOTHER
 #define PRIV_VFS_FUSE_ALLOWOTHER PRIV_VFS_MOUNT_NONUSER
 #endif
 #ifndef PRIV_VFS_FUSE_MOUNT_NONUSER
 #define PRIV_VFS_FUSE_MOUNT_NONUSER PRIV_VFS_MOUNT_NONUSER
 #endif
 #ifndef PRIV_VFS_FUSE_SYNC_UNMOUNT
 #define PRIV_VFS_FUSE_SYNC_UNMOUNT PRIV_VFS_MOUNT_NONUSER
 #endif
 
 /* Forward declarations of the VFS operations implemented in this file. */
 static vfs_fhtovp_t fuse_vfsop_fhtovp;
 static vfs_mount_t fuse_vfsop_mount;
 static vfs_unmount_t fuse_vfsop_unmount;
 static vfs_root_t fuse_vfsop_root;
 static vfs_statfs_t fuse_vfsop_statfs;
 static vfs_vget_t fuse_vfsop_vget;
 
 /*
  * VFS operations vector for fusefs.  Not static, so it is visible to
  * other translation units.
  */
 struct vfsops fuse_vfsops = {
 	.vfs_fhtovp = fuse_vfsop_fhtovp,
 	.vfs_mount = fuse_vfsop_mount,
 	.vfs_unmount = fuse_vfsop_unmount,
 	.vfs_root = fuse_vfsop_root,
 	.vfs_statfs = fuse_vfsop_statfs,
 	.vfs_vget = fuse_vfsop_vget,
 };
 
 /*
  * Read-write tunable (default 0).  When non-zero, fuse_getdevice() checks
  * that the mounting thread has read and write access to the fuse device
  * node.
  */
 static int fuse_enforce_dev_perms = 0;
 
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, enforce_dev_perms, CTLFLAG_RW,
     &fuse_enforce_dev_perms, 0,
     "enforce fuse device permissions for secondary mounts");
 
 /* Malloc type for allocations made by the fuse VFS layer. */
 MALLOC_DEFINE(M_FUSEVFS, "fuse_filesystem", "buffer for fuse vfs layer");
 
 /*
  * Resolve the path fspec to the fuse character device it names.
  *
  * On success returns 0 and stores the device in *fdevp with a reference
  * taken by dev_ref(); the caller is responsible for releasing it.
  * Returns the namei() error when the lookup fails, ENXIO when the path is
  * not a character device or the device's driver is not named "fuse", or
  * the VOP_ACCESS() error when fuse_enforce_dev_perms is set and the
  * thread lacks read/write access.  *fdevp is not written on failure.
  */
 static int
 fuse_getdevice(const char *fspec, struct thread *td, struct cdev **fdevp)
 {
 	struct nameidata nd, *ndp = &nd;
 	struct vnode *devvp;
 	struct cdev *fdev;
 	int err;
 
 	/*
 	 * Not an update, or updating the name: look up the name
 	 * and verify that it refers to a sensible disk device.
 	 */
 
 	NDINIT(ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, fspec, td);
 	if ((err = namei(ndp)) != 0)
 		return err;
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 	devvp = ndp->ni_vp;
 
 	if (devvp->v_type != VCHR) {
 		vrele(devvp);
 		return ENXIO;
 	}
 	fdev = devvp->v_rdev;
 	dev_ref(fdev);
 
 	if (fuse_enforce_dev_perms) {
 		/*
 	         * Check if mounter can open the fuse device.
 	         *
 	         * This has significance only if we are doing a secondary mount
 	         * which doesn't involve actually opening fuse devices, but we
 	         * still want to enforce the permissions of the device (in
 	         * order to keep control over the circle of fuse users).
 	         *
 	         * (In case of primary mounts, we are either the superuser so
 	         * we can do anything anyway, or we can mount only if the
 	         * device is already opened by us, ie. we are permitted to open
 	         * the device.)
 	         */
 		/* The MAC check is compiled out, so VOP_ACCESS() always runs. */
 #if 0
 #ifdef MAC
 		err = mac_check_vnode_open(td->td_ucred, devvp, VREAD | VWRITE);
 		if (!err)
 #endif
 #endif /* 0 */
 			err = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td);
 		if (err) {
 			vrele(devvp);
 			dev_rel(fdev);
 			return err;
 		}
 	}
 	/*
 	 * according to coda code, no extra lock is needed --
 	 * although in sys/vnode.h this field is marked "v"
 	 */
 	vrele(devvp);
 
 	/* Only accept devices whose driver is named "fuse". */
 	if (!fdev->si_devsw ||
 	    strcmp("fuse", fdev->si_devsw->d_name)) {
 		dev_rel(fdev);
 		return ENXIO;
 	}
 	*fdevp = fdev;
 
 	return 0;
 }
 
 /*
  * Record a boolean mount option in two masks: option "fnam" sets fval in
  * mntopts and option "__fnam" sets fval in __mntopts.  Expands in place
  * and relies on the locals opts, mntopts and __mntopts of the enclosing
  * function.
  */
 #define FUSE_FLAGOPT(fnam, fval) do {				\
 	vfs_flagopt(opts, #fnam, &mntopts, fval);		\
 	vfs_flagopt(opts, "__" #fnam, &__mntopts, fval);	\
 } while (0)
 
 /* Probes fired by the mount paths below: parsed options and mount errors. */
 SDT_PROBE_DEFINE1(fusefs, , vfsops, mntopts, "uint64_t");
 SDT_PROBE_DEFINE4(fusefs, , vfsops, mount_err, "char*", "struct fuse_data*",
 	"struct mount*", "int");
 
 /*
  * Handle a MNT_UPDATE request on an existing fusefs mount.
  *
  * No option is actually changed: the request is accepted only when the
  * MNT_USER bit, the FUSE mount options (within FSESS_MNTOPTS_MASK),
  * max_read and daemon_timeout all equal the values recorded at mount
  * time.  Runs under FUSE_LOCK().
  *
  * Returns 0 on success, EOPNOTSUPP when any of the above differs,
  * ENOTCONN when the session is dead, or the priv_check() error when the
  * caller lacks the privilege required for the requested options or for
  * mounting a session owned by another user.
  */
 static int
 fuse_vfs_remount(struct mount *mp, struct thread *td, uint64_t mntopts,
 	uint32_t max_read, int daemon_timeout)
 {
 	int err = 0;
 	struct fuse_data *data = fuse_get_mpdata(mp);
 	/* Don't allow these options to be changed */
 	const static unsigned long long cant_update_opts = 
 		MNT_USER;	/* Mount owner must be the user running the daemon */
 
 	FUSE_LOCK();
 
 	/* XOR exposes the bits that differ from the recorded mount flags. */
 	if ((mp->mnt_flag ^ data->mnt_flag) & cant_update_opts) {
 		err = EOPNOTSUPP;
 		SDT_PROBE4(fusefs, , vfsops, mount_err,
 			"Can't change these mount options during remount",
 			data, mp, err);
 		goto out;
 	}
 	if (((data->dataflags ^ mntopts) & FSESS_MNTOPTS_MASK) ||
 	     (data->max_read != max_read) ||
 	     (data->daemon_timeout != daemon_timeout)) {
 		// TODO: allow changing options where it makes sense
 		err = EOPNOTSUPP;
 		SDT_PROBE4(fusefs, , vfsops, mount_err,
 			"Can't change fuse mount options during remount",
 			data, mp, err);
 		goto out;
 	}
 
 	if (fdata_get_dead(data)) {
 		err = ENOTCONN;
 		SDT_PROBE4(fusefs, , vfsops, mount_err,
 			"device is dead during mount", data, mp, err);
 		goto out;
 	}
 
 	/* Sanity + permission checks */
 	if (!data->daemoncred)
 		panic("fuse daemon found, but identity unknown");
 	if (mntopts & FSESS_DAEMON_CAN_SPY)
 		err = priv_check(td, PRIV_VFS_FUSE_ALLOWOTHER);
 	if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid)
 		/* are we allowed to do the first mount? */
 		err = priv_check(td, PRIV_VFS_FUSE_MOUNT_NONUSER);
 
 out:
 	FUSE_UNLOCK();
 	return err;
 }
 
 /*
  * VFS fhtovp operation: translate a file handle into a vnode.
  *
  * The handle is interpreted as a struct fuse_fid.  The vnode for its node
  * id is fetched with VFS_VGET() (LK_EXCLUSIVE) and accepted only when its
  * generation matches the one stored in the handle.  flags is not used.
  *
  * Returns 0 with *vpp set; EOPNOTSUPP when the session lacks
  * FSESS_EXPORT_SUPPORT (*vpp untouched); the VFS_VGET() error; or ESTALE
  * on a generation mismatch (*vpp set to NULLVP in the last two cases).
  */
 static int
 fuse_vfsop_fhtovp(struct mount *mp, struct fid *fhp, int flags,
 	struct vnode **vpp)
 {
 	struct fuse_fid *ffhp = (struct fuse_fid *)fhp;
 	struct fuse_vnode_data *fvdat;
 	struct vnode *nvp;
 	int error;
 
 	if (!(fuse_get_mpdata(mp)->dataflags & FSESS_EXPORT_SUPPORT))
 		return EOPNOTSUPP;
 
 	error = VFS_VGET(mp, ffhp->nid, LK_EXCLUSIVE, &nvp);
 	if (error) {
 		*vpp = NULLVP;
 		return (error);
 	}
 	fvdat = VTOFUD(nvp);
 	/* A different generation means the handle refers to an older node. */
 	if (fvdat->generation != ffhp->gen ) {
 		vput(nvp);
 		*vpp = NULLVP;
 		return (ESTALE);
 	}
 	*vpp = nvp;
 	vnode_create_vobject(*vpp, 0, curthread);
 	return (0);
 }
 
 /*
  * VFS mount operation for fusefs.
  *
  * Parses the new mount options from mp->mnt_optnew.  An MNT_UPDATE
  * request is delegated to fuse_vfs_remount().  Otherwise the fuse device
  * named by "from" is resolved, the session data attached to descriptor
  * "fd" is fetched, permissions are checked, the session is bound to mp
  * and the INIT handshake with the daemon is started.
  *
  * Returns 0 on success; EINVAL when no options were passed or "fd" is
  * missing; the vfs_getopts() error for a missing "fspath" or "from";
  * ENXIO when the descriptor has no usable session data; ENOTCONN when the
  * session is dead; or the error from fuse_getdevice(), fget() or
  * priv_check().
  */
 static int
 fuse_vfsop_mount(struct mount *mp)
 {
 	int err;
 
 	uint64_t mntopts, __mntopts;
 	uint32_t max_read;
 	int daemon_timeout;
 	int fd;
 
 	struct cdev *fdev;
 	struct fuse_data *data = NULL;
 	struct thread *td;
 	struct file *fp, *fptmp;
 	char *fspec, *subtype;
 	struct vfsoptlist *opts;
 
 	subtype = NULL;
 	max_read = ~0;
 	err = 0;
 	mntopts = 0;
 	__mntopts = 0;
 	td = curthread;
 
 	/* Get the new options passed to mount */
 	opts = mp->mnt_optnew;
 
 	if (!opts)
 		return EINVAL;
 
 	/* `fspath' contains the mount point (eg. /mnt/fuse/sshfs); REQUIRED */
 	if (!vfs_getopts(opts, "fspath", &err))
 		return err;
 
 	/*
 	 * With the help of underscored options the mount program
 	 * can inform us from the flags it sets by default
 	 */
 	FUSE_FLAGOPT(allow_other, FSESS_DAEMON_CAN_SPY);
 	FUSE_FLAGOPT(push_symlinks_in, FSESS_PUSH_SYMLINKS_IN);
 	FUSE_FLAGOPT(default_permissions, FSESS_DEFAULT_PERMISSIONS);
 	FUSE_FLAGOPT(intr, FSESS_INTR);
 
 	(void)vfs_scanopt(opts, "max_read=", "%u", &max_read);
 	/*
 	 * Clamp an explicit timeout into the supported range.
 	 * NOTE(review): "%u" is scanned into an int here.
 	 */
 	if (vfs_scanopt(opts, "timeout=", "%u", &daemon_timeout) == 1) {
 		if (daemon_timeout < FUSE_MIN_DAEMON_TIMEOUT)
 			daemon_timeout = FUSE_MIN_DAEMON_TIMEOUT;
 		else if (daemon_timeout > FUSE_MAX_DAEMON_TIMEOUT)
 			daemon_timeout = FUSE_MAX_DAEMON_TIMEOUT;
 	} else {
 		daemon_timeout = FUSE_DEFAULT_DAEMON_TIMEOUT;
 	}
 	/*
 	 * Optional.  Whatever this stores in err is not examined; on the
 	 * non-update path err is overwritten before it is next read.
 	 */
 	subtype = vfs_getopts(opts, "subtype=", &err);
 
 	SDT_PROBE1(fusefs, , vfsops, mntopts, mntopts);
 
 	if (mp->mnt_flag & MNT_UPDATE) {
 		return fuse_vfs_remount(mp, td, mntopts, max_read,
 			daemon_timeout);
 	}
 
 	/* `from' contains the device name (eg. /dev/fuse0); REQUIRED */
 	fspec = vfs_getopts(opts, "from", &err);
 	if (!fspec)
 		return err;
 
 	/* `fd' contains the filedescriptor for this session; REQUIRED */
 	if (vfs_scanopt(opts, "fd", "%d", &fd) != 1)
 		return EINVAL;
 
 	/* From here on fdev holds a device reference; "out" drops it on error. */
 	err = fuse_getdevice(fspec, td, &fdev);
 	if (err != 0)
 		return err;
 
 	err = fget(td, fd, &cap_read_rights, &fp);
 	if (err != 0) {
 		SDT_PROBE2(fusefs, , vfsops, trace, 1,
 			"invalid or not opened device");
 		goto out;
 	}
 	/*
 	 * Temporarily make fp the thread's td_fpop around
 	 * devfs_get_cdevpriv() so the session data attached to that file
 	 * is returned, then restore the previous value.
 	 */
 	fptmp = td->td_fpop;
 	td->td_fpop = fp;
 	err = devfs_get_cdevpriv((void **)&data);
 	td->td_fpop = fptmp;
 	fdrop(fp, td);
 	FUSE_LOCK();
 
 	if (err != 0 || data == NULL) {
 		err = ENXIO;
 		SDT_PROBE4(fusefs, , vfsops, mount_err,
 			"invalid or not opened device", data, mp, err);
 		FUSE_UNLOCK();
 		goto out;
 	}
 	if (fdata_get_dead(data)) {
 		err = ENOTCONN;
 		SDT_PROBE4(fusefs, , vfsops, mount_err,
 			"device is dead during mount", data, mp, err);
 		FUSE_UNLOCK();
 		goto out;
 	}
 	/* Sanity + permission checks */
 	if (!data->daemoncred)
 		panic("fuse daemon found, but identity unknown");
 	if (mntopts & FSESS_DAEMON_CAN_SPY)
 		err = priv_check(td, PRIV_VFS_FUSE_ALLOWOTHER);
 	if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid)
 		/* are we allowed to do the first mount? */
 		err = priv_check(td, PRIV_VFS_FUSE_MOUNT_NONUSER);
 	if (err) {
 		FUSE_UNLOCK();
 		goto out;
 	}
 	/* All checks passed: bind the session to this mount. */
 	data->ref++;
 	data->mp = mp;
 	data->dataflags |= mntopts;
 	data->max_read = max_read;
 	data->daemon_timeout = daemon_timeout;
 	data->mnt_flag = mp->mnt_flag & MNT_UPDATEMASK;
 	FUSE_UNLOCK();
 
 	vfs_getnewfsid(mp);
 	MNT_ILOCK(mp);
 	mp->mnt_data = data;
 	/* 
 	 * FUSE file systems can be either local or remote, but the kernel
 	 * can't tell the difference.
 	 */
 	mp->mnt_flag &= ~MNT_LOCAL;
 	mp->mnt_kern_flag |= MNTK_USES_BCACHE;
 	/* 
 	 * Disable nullfs caching because it can consume too many resources in
 	 * the FUSE server.
 	 */
 	mp->mnt_kern_flag |= MNTK_NULL_NOCACHE;
 	MNT_IUNLOCK(mp);
 	/* We need this here as this slot is used by getnewvnode() */
 	mp->mnt_stat.f_iosize = maxbcachebuf;
 	/* Append ".<subtype>" to the reported filesystem type name. */
 	if (subtype) {
 		strlcat(mp->mnt_stat.f_fstypename, ".", MFSNAMELEN);
 		strlcat(mp->mnt_stat.f_fstypename, subtype, MFSNAMELEN);
 	}
 	memset(mp->mnt_stat.f_mntfromname, 0, MNAMELEN);
 	strlcpy(mp->mnt_stat.f_mntfromname, fspec, MNAMELEN);
 	mp->mnt_iosize_max = MAXPHYS;
 
 	/* Now handshaking with daemon */
 	fuse_internal_send_init(data, td);
 
 out:
 	if (err) {
 		FUSE_LOCK();
 		if (data != NULL && data->mp == mp) {
 			/*
 			 * Destroy device only if we acquired reference to
 			 * it
 			 */
 			SDT_PROBE4(fusefs, , vfsops, mount_err,
 				"mount failed, destroy device", data, mp, err);
 			data->mp = NULL;
 			mp->mnt_data = NULL;
 			fdata_trydestroy(data);
 		}
 		FUSE_UNLOCK();
 		dev_rel(fdev);
 	}
 	return err;
 }
 
 /*
  * VFS unmount operation for fusefs.
  *
  * Drops the cached root vnode reference, flushes all vnodes of the mount
  * (with FORCECLOSE when MNT_FORCE is given), sends FUSE_DESTROY to the
  * daemon when the session is still alive and the operation is
  * implemented, marks the session dead, detaches it from mp and releases
  * the device reference.
  *
  * Returns 0 on success or the vflush() error.  Panics when mp has no
  * session data.
  */
 static int
 fuse_vfsop_unmount(struct mount *mp, int mntflags)
 {
 	int err = 0;
 	int flags = 0;
 
 	struct cdev *fdev;
 	struct fuse_data *data;
 	struct fuse_dispatcher fdi;
 	struct thread *td = curthread;
 
 	if (mntflags & MNT_FORCE) {
 		flags |= FORCECLOSE;
 	}
 	data = fuse_get_mpdata(mp);
 	if (!data) {
 		panic("no private data for mount point?");
 	}
 	/* There is 1 extra root vnode reference (mp->mnt_data). */
 	FUSE_LOCK();
 	if (data->vroot != NULL) {
 		struct vnode *vroot = data->vroot;
 
 		/* Clear the cached pointer under the lock, release outside it. */
 		data->vroot = NULL;
 		FUSE_UNLOCK();
 		vrele(vroot);
 	} else
 		FUSE_UNLOCK();
 	/*
 	 * NOTE(review): when vflush() fails the unmount is abandoned, but the
 	 * cached root vnode reference has already been dropped above.
 	 */
 	err = vflush(mp, 0, flags, td);
 	if (err) {
 		return err;
 	}
 	if (fdata_get_dead(data)) {
 		goto alreadydead;
 	}
 	if (fsess_isimpl(mp, FUSE_DESTROY)) {
 		fdisp_init(&fdi, 0);
 		fdisp_make(&fdi, FUSE_DESTROY, mp, 0, td, NULL);
 
 		/* The daemon's answer to FUSE_DESTROY is deliberately ignored. */
 		(void)fdisp_wait_answ(&fdi);
 		fdisp_destroy(&fdi);
 	}
 
 	fdata_set_dead(data);
 
 alreadydead:
 	FUSE_LOCK();
 	data->mp = NULL;
 	/* Read fdev before fdata_trydestroy(), which may free data. */
 	fdev = data->fdev;
 	fdata_trydestroy(data);
 	FUSE_UNLOCK();
 
 	MNT_ILOCK(mp);
 	mp->mnt_data = NULL;
 	MNT_IUNLOCK(mp);
 
 	dev_rel(fdev);
 
 	return 0;
 }
 
 SDT_PROBE_DEFINE1(fusefs, , vfsops, invalidate_without_export,
 	"struct mount*");
 /*
  * VFS vget operation: obtain the vnode for FUSE node id ino.
  *
  * A cached vnode is returned when fuse_internal_get_cached_vnode() finds
  * one.  Otherwise the daemon is asked with a FUSE_LOOKUP of "." relative
  * to the node id itself, a vnode is created from the reply with
  * fuse_vnode_get(), and the returned attributes and entry timeout are
  * cached on it.
  *
  * Returns 0 with *vpp set; EOPNOTSUPP when the session lacks
  * FSESS_EXPORT_SUPPORT; ENOENT when the daemon answers with node id 0; or
  * the error from the cache lookup, fdisp_wait_answ() or fuse_vnode_get().
  */
 static int
 fuse_vfsop_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
 {
 	struct fuse_data *data = fuse_get_mpdata(mp);
 	uint64_t nodeid = ino;
 	struct thread *td = curthread;
 	struct fuse_dispatcher fdi;
 	struct fuse_entry_out *feo;
 	struct fuse_vnode_data *fvdat;
 	const char dot[] = ".";
 	off_t filesize;
 	enum vtype vtyp;
 	int error;
 
 	if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) {
 		/*
 		 * Unreachable unless you do something stupid, like export a
 		 * nullfs mount of a fusefs file system.
 		 */
 		SDT_PROBE1(fusefs, , vfsops, invalidate_without_export, mp);
 		return (EOPNOTSUPP);
 	}
 
 	error = fuse_internal_get_cached_vnode(mp, ino, flags, vpp);
 	if (error || *vpp != NULL)
 		return error;
 
 	/* Do a LOOKUP, using nodeid as the parent and "." as filename */
 	fdisp_init(&fdi, sizeof(dot));
 	fdisp_make(&fdi, FUSE_LOOKUP, mp, nodeid, td, td->td_ucred);
 	memcpy(fdi.indata, dot, sizeof(dot));
 	error = fdisp_wait_answ(&fdi);
 
 	/*
 	 * NOTE(review): this early return skips the fdisp_destroy() done at
 	 * "out"; presumably fdisp_wait_answ() cleans up on failure -- confirm.
 	 */
 	if (error)
 		return error;
 
 	feo = (struct fuse_entry_out *)fdi.answ;
 	if (feo->nodeid == 0) {
 		/* zero nodeid means ENOENT and cache it */
 		error = ENOENT;
 		goto out;
 	}
 
 	vtyp = IFTOVT(feo->attr.mode);
 	error = fuse_vnode_get(mp, feo, nodeid, NULL, vpp, NULL, vtyp);
 	if (error)
 		goto out;
 	filesize = feo->attr.size;
 
 	/*
 	 * In the case where we are looking up a FUSE node represented by an
 	 * existing cached vnode, and the true size reported by FUSE_LOOKUP
 	 * doesn't match the vnode's cached size, then any cached writes beyond
 	 * the file's current size are lost.
 	 *
 	 * We can get here:
 	 * * following attribute cache expiration, or
 	 * * due to a bug in the daemon.
 	 */
 	fvdat = VTOFUD(*vpp);
 	if (vnode_isreg(*vpp) &&
 	    filesize != fvdat->cached_attrs.va_size &&
 	    fvdat->flag & FN_SIZECHANGE) {
 		printf("%s: WB cache incoherent on %s!\n", __func__,
 		    vnode_mount(*vpp)->mnt_stat.f_mntonname);
 
 		fvdat->flag &= ~FN_SIZECHANGE;
 	}
 
 	fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid,
 		feo->attr_valid_nsec, NULL);
 	fuse_validity_2_bintime(feo->entry_valid, feo->entry_valid_nsec,
 		&fvdat->entry_cache_timeout);
 out:
 	fdisp_destroy(&fdi);
 	return error;
 }
 
 /*
  * Return the root vnode of a fusefs mount in *vpp.
  *
  * If a root vnode is already cached in the mount's fuse_data it is
  * acquired with vget() using lkflags.  Otherwise a vnode for FUSE_ROOT_ID
  * is obtained with fuse_vnode_get() and cached in data->vroot, taking an
  * additional reference with vref() on behalf of the cache.
  *
  * Returns 0 on success or the error from vget()/fuse_vnode_get().
  */
 static int
 fuse_vfsop_root(struct mount *mp, int lkflags, struct vnode **vpp)
 {
 	struct fuse_data *data = fuse_get_mpdata(mp);
 	int err = 0;
 
 	if (data->vroot != NULL) {
 		/* Cached root: just acquire it. */
-		err = vget(data->vroot, lkflags, curthread);
+		err = vget(data->vroot, lkflags);
 		if (err == 0)
 			*vpp = data->vroot;
 	} else {
 		err = fuse_vnode_get(mp, NULL, FUSE_ROOT_ID, NULL, vpp, NULL,
 		    VDIR);
 		if (err == 0) {
 			/*
 			 * data->vroot was read without the lock above, so
 			 * re-check it under FUSE_LOCK before publishing.
 			 */
 			FUSE_LOCK();
 			MPASS(data->vroot == NULL || data->vroot == *vpp);
 			if (data->vroot == NULL) {
 				SDT_PROBE2(fusefs, , vfsops, trace, 1,
 					"new root vnode");
 				data->vroot = *vpp;
 				FUSE_UNLOCK();
 				/* Extra reference held by data->vroot. */
 				vref(*vpp);
 			} else if (data->vroot != *vpp) {
 				/*
 				 * Another vnode was cached in the meantime:
 				 * discard ours and hand back the cached one.
 				 */
 				SDT_PROBE2(fusefs, , vfsops, trace, 1,
 					"root vnode race");
 				FUSE_UNLOCK();
 				VOP_UNLOCK(*vpp);
 				vrele(*vpp);
 				vrecycle(*vpp);
 				*vpp = data->vroot;
 			} else
 				FUSE_UNLOCK();
 		}
 	}
 	return err;
 }
 
 /*
  * Report file system statistics for a fusefs mount.
  *
  * When the session has completed initialization the daemon is asked via
  * FUSE_STATFS.  If the session is not yet initialized, or the request
  * fails with ENOTCONN, zeroed placeholder statistics are returned instead
  * so that the mount keeps looking like a legitimate file system.
  *
  * Returns 0, or the FUSE_STATFS error if it is anything but ENOTCONN.
  */
 static int
 fuse_vfsop_statfs(struct mount *mp, struct statfs *sbp)
 {
 	struct fuse_dispatcher fdi;
 	struct fuse_statfs_out *fsfo;
 	struct fuse_data *data;
 	int err = 0;
 
 	data = fuse_get_mpdata(mp);
 
 	if ((data->dataflags & FSESS_INITED) != 0) {
 		fdisp_init(&fdi, 0);
 		fdisp_make(&fdi, FUSE_STATFS, mp, FUSE_ROOT_ID, NULL, NULL);
 		err = fdisp_wait_answ(&fdi);
 		if (err == 0) {
 			fsfo = fdi.answ;
 
 			sbp->f_blocks = fsfo->st.blocks;
 			sbp->f_bfree = fsfo->st.bfree;
 			sbp->f_bavail = fsfo->st.bavail;
 			sbp->f_files = fsfo->st.files;
 			/* cast from uint64_t to int64_t */
 			sbp->f_ffree = fsfo->st.ffree;
 			sbp->f_namemax = fsfo->st.namelen;
 			/* cast from uint32_t to uint64_t */
 			sbp->f_bsize = fsfo->st.frsize;
 
 			fdisp_destroy(&fdi);
 			return (0);
 		}
 
 		fdisp_destroy(&fdi);
 		if (err != ENOTCONN)
 			return (err);
 		/*
 		 * ENOTCONN: we want to seem a legitimate fs even if the
 		 * daemon is dead (so that, e.g., path based unmounting
 		 * still works after the daemon dies).  Fall through to
 		 * the placeholder values.
 		 */
 	}
 
 	/* Placeholder statistics. */
 	sbp->f_blocks = 0;
 	sbp->f_bfree = 0;
 	sbp->f_bavail = 0;
 	sbp->f_files = 0;
 	sbp->f_ffree = 0;
 	sbp->f_namemax = 0;
 	sbp->f_bsize = S_BLKSIZE;
 
 	return (0);
 }
Index: projects/clang1100-import/sys/fs/msdosfs/msdosfs_vfsops.c
===================================================================
--- projects/clang1100-import/sys/fs/msdosfs/msdosfs_vfsops.c	(revision 364278)
+++ projects/clang1100-import/sys/fs/msdosfs/msdosfs_vfsops.c	(revision 364279)
@@ -1,985 +1,985 @@
 /* $FreeBSD$ */
 /*	$NetBSD: msdosfs_vfsops.c,v 1.51 1997/11/17 15:36:58 ws Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank.
  * Copyright (C) 1994, 1995, 1997 TooLs GmbH.
  * All rights reserved.
  * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Written by Paul Popelka (paulp@uts.amdahl.com)
  *
  * You can do anything you want with this software, just don't say you wrote
  * it, and don't remove this notice.
  *
  * This software is provided "as is".
  *
  * The author supplies this software to be publicly redistributed on the
  * understanding that the author is not responsible for the correct
  * functioning of this software in any circumstances and is not liable for
  * any damages caused by this software.
  *
  * October 1992
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/iconv.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/vnode.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
 
 #include <fs/msdosfs/bootsect.h>
 #include <fs/msdosfs/bpb.h>
 #include <fs/msdosfs/direntry.h>
 #include <fs/msdosfs/denode.h>
 #include <fs/msdosfs/fat.h>
 #include <fs/msdosfs/msdosfsmount.h>
 
 #ifdef MSDOSFS_DEBUG
 #include <sys/rwlock.h>
 #endif
 
 /* Name given to the per-mount FAT lock when it is set up with lockinit(). */
 static const char msdosfs_lock_msg[] = "fatlk";
 
 /*
  * Mount options that we support.  NULL-terminated; msdosfs_mount() passes
  * this list to vfs_filteropt() and rejects the mount with EINVAL if the
  * filter reports a problem.
  */
 static const char *msdosfs_opts[] = {
 	"async", "noatime", "noclusterr", "noclusterw",
 	"export", "force", "from", "sync",
 	"cs_dos", "cs_local", "cs_win", "dirmask",
 	"gid", "kiconv", "longname",
 	"longnames", "mask", "shortname", "shortnames",
 	"uid", "win95", "nowin95",
 	NULL
 };
 
 #if 1 /*def PC98*/
 /*
  * XXX - The boot signature written by NEC PC-98 DOS looks like garbage
  *       or a random value :-{
  *       To use media with such a broken signature, define the following
  *       symbol even on PC/AT
  *       (e.g. to mount a PC-98 DOS formatted floppy on a PC/AT).
  *
  *       The symbol is currently always defined, which compiles out the
  *       boot sector signature check in mountmsdosfs().
  */
 #define	MSDOSFS_NOCHECKSIG
 #endif
 
 /* malloc(9) types for the per-mount structure and the in-use cluster map. */
 MALLOC_DEFINE(M_MSDOSFSMNT, "msdosfs_mount", "MSDOSFS mount structure");
 static MALLOC_DEFINE(M_MSDOSFSFAT, "msdosfs_fat", "MSDOSFS file allocation table");
 
 /*
  * Character set conversion hooks.  NULL when no conversion functions are
  * available; callers in this file check for NULL before using it.
  */
 struct iconv_functions *msdosfs_iconv;
 
 /* Forward declarations for the helpers and VFS operations defined below. */
 static int	update_mp(struct mount *mp, struct thread *td);
 static int	mountmsdosfs(struct vnode *devvp, struct mount *mp);
 static vfs_fhtovp_t	msdosfs_fhtovp;
 static vfs_mount_t	msdosfs_mount;
 static vfs_root_t	msdosfs_root;
 static vfs_statfs_t	msdosfs_statfs;
 static vfs_sync_t	msdosfs_sync;
 static vfs_unmount_t	msdosfs_unmount;
 
 /* Maximum length of a character set name (arbitrary). */
 #define	MAXCSLEN	64
 
 static int
 update_mp(struct mount *mp, struct thread *td)
 {
 	struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
 	void *dos, *win, *local;
 	int error, v;
 
 	if (!vfs_getopt(mp->mnt_optnew, "kiconv", NULL, NULL)) {
 		if (msdosfs_iconv != NULL) {
 			error = vfs_getopt(mp->mnt_optnew,
 			    "cs_win", &win, NULL);
 			if (!error)
 				error = vfs_getopt(mp->mnt_optnew,
 				    "cs_local", &local, NULL);
 			if (!error)
 				error = vfs_getopt(mp->mnt_optnew,
 				    "cs_dos", &dos, NULL);
 			if (!error) {
 				msdosfs_iconv->open(win, local, &pmp->pm_u2w);
 				msdosfs_iconv->open(local, win, &pmp->pm_w2u);
 				msdosfs_iconv->open(dos, local, &pmp->pm_u2d);
 				msdosfs_iconv->open(local, dos, &pmp->pm_d2u);
 			}
 			if (error != 0)
 				return (error);
 		} else {
 			pmp->pm_w2u = NULL;
 			pmp->pm_u2w = NULL;
 			pmp->pm_d2u = NULL;
 			pmp->pm_u2d = NULL;
 		}
 	}
 
 	if (vfs_scanopt(mp->mnt_optnew, "gid", "%d", &v) == 1)
 		pmp->pm_gid = v;
 	if (vfs_scanopt(mp->mnt_optnew, "uid", "%d", &v) == 1)
 		pmp->pm_uid = v;
 	if (vfs_scanopt(mp->mnt_optnew, "mask", "%d", &v) == 1)
 		pmp->pm_mask = v & ALLPERMS;
 	if (vfs_scanopt(mp->mnt_optnew, "dirmask", "%d", &v) == 1)
 		pmp->pm_dirmask = v & ALLPERMS;
 	vfs_flagopt(mp->mnt_optnew, "shortname",
 	    &pmp->pm_flags, MSDOSFSMNT_SHORTNAME);
 	vfs_flagopt(mp->mnt_optnew, "shortnames",
 	    &pmp->pm_flags, MSDOSFSMNT_SHORTNAME);
 	vfs_flagopt(mp->mnt_optnew, "longname",
 	    &pmp->pm_flags, MSDOSFSMNT_LONGNAME);
 	vfs_flagopt(mp->mnt_optnew, "longnames",
 	    &pmp->pm_flags, MSDOSFSMNT_LONGNAME);
 	vfs_flagopt(mp->mnt_optnew, "kiconv",
 	    &pmp->pm_flags, MSDOSFSMNT_KICONV);
 
 	if (vfs_getopt(mp->mnt_optnew, "nowin95", NULL, NULL) == 0)
 		pmp->pm_flags |= MSDOSFSMNT_NOWIN95;
 	else
 		pmp->pm_flags &= ~MSDOSFSMNT_NOWIN95;
 
 	if (pmp->pm_flags & MSDOSFSMNT_NOWIN95)
 		pmp->pm_flags |= MSDOSFSMNT_SHORTNAME;
 	else
 		pmp->pm_flags |= MSDOSFSMNT_LONGNAME;
 	return 0;
 }
 
 /*
  * Compatibility mount entry point: copy a struct msdosfs_args in from
  * user space, translate its fields into name/value mount options and
  * hand them to kernel_mount().
  *
  * Returns EINVAL if no argument structure was supplied, the copyin()
  * error if it could not be read, otherwise the result of kernel_mount().
  */
 static int
 msdosfs_cmount(struct mntarg *ma, void *data, uint64_t flags)
 {
 	struct msdosfs_args args;
 	int error;
 
 	if (data == NULL)
 		return (EINVAL);
 
 	error = copyin(data, &args, sizeof(args));
 	if (error != 0)
 		return (error);
 
 	/* Device, export information and ownership/permission options. */
 	ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
 	ma = mount_arg(ma, "export", &args.export, sizeof(args.export));
 	ma = mount_argf(ma, "uid", "%d", args.uid);
 	ma = mount_argf(ma, "gid", "%d", args.gid);
 	ma = mount_argf(ma, "mask", "%d", args.mask);
 	ma = mount_argf(ma, "dirmask", "%d", args.dirmask);
 
 	/* Boolean options derived from the flag word. */
 	ma = mount_argb(ma, args.flags & MSDOSFSMNT_SHORTNAME, "noshortname");
 	ma = mount_argb(ma, args.flags & MSDOSFSMNT_LONGNAME, "nolongname");
 	ma = mount_argb(ma, !(args.flags & MSDOSFSMNT_NOWIN95), "nowin95");
 	ma = mount_argb(ma, args.flags & MSDOSFSMNT_KICONV, "nokiconv");
 
 	/* Character set names, each limited to MAXCSLEN bytes. */
 	ma = mount_argsu(ma, "cs_win", args.cs_win, MAXCSLEN);
 	ma = mount_argsu(ma, "cs_dos", args.cs_dos, MAXCSLEN);
 	ma = mount_argsu(ma, "cs_local", args.cs_local, MAXCSLEN);
 
 	return (kernel_mount(ma, flags));
 }
 
 /*
  * VFS mount entry point for msdosfs.
  *
  * mp - the mount point; the requested options are in mp->mnt_optnew and
  *      the "from" option names the device to mount.
  *
  * For MNT_UPDATE this handles rw -> ro and ro -> rw transitions and then
  * re-applies the options.  For a fresh mount it looks up the device,
  * checks permissions on it, and calls mountmsdosfs().
  *
  * Returns 0 on success or an errno value.
  */
 static int
 msdosfs_mount(struct mount *mp)
 {
 	struct vnode *devvp;	  /* vnode for blk device to mount */
 	struct thread *td;
 	/* msdosfs specific mount control block */
 	struct msdosfsmount *pmp = NULL;
 	struct nameidata ndp;
 	int error, flags;
 	accmode_t accmode;
 	char *from;
 
 	td = curthread;
 	/* Refuse options that are not in msdosfs_opts. */
 	if (vfs_filteropt(mp->mnt_optnew, msdosfs_opts))
 		return (EINVAL);
 
 	/*
 	 * If updating, check whether changing from read-only to
 	 * read/write; if there is no device name, that's all we do.
 	 */
 	if (mp->mnt_flag & MNT_UPDATE) {
 		pmp = VFSTOMSDOSFS(mp);
 		if (!(pmp->pm_flags & MSDOSFSMNT_RONLY) &&
 		    vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
 			/* rw -> ro: sync and flush writers first. */
 			error = VFS_SYNC(mp, MNT_WAIT);
 			if (error)
 				return (error);
 			flags = WRITECLOSE;
 			if (mp->mnt_flag & MNT_FORCE)
 				flags |= FORCECLOSE;
 			error = vflush(mp, 0, flags, td);
 			if (error)
 				return (error);
 
 			/*
 			 * Now the volume is clean.  Mark it so while the
 			 * device is still rw.
 			 */
 			error = markvoldirty(pmp, 0);
 			if (error) {
 				(void)markvoldirty(pmp, 1);
 				return (error);
 			}
 
 			/* Downgrade the device from rw to ro. */
 			g_topology_lock();
 			error = g_access(pmp->pm_cp, 0, -1, 0);
 			g_topology_unlock();
 			if (error) {
 				(void)markvoldirty(pmp, 1);
 				return (error);
 			}
 
 			/*
 			 * Backing out after an error was painful in the
 			 * above.  Now we are committed to succeeding.
 			 */
 			pmp->pm_fmod = 0;
 			pmp->pm_flags |= MSDOSFSMNT_RONLY;
 			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_RDONLY;
 			MNT_IUNLOCK(mp);
 		} else if ((pmp->pm_flags & MSDOSFSMNT_RONLY) &&
 		    !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
 			/*
 			 * If upgrade to read-write by non-root, then verify
 			 * that user has necessary permissions on the device.
 			 */
 			devvp = pmp->pm_devvp;
 			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 			error = VOP_ACCESS(devvp, VREAD | VWRITE,
 			    td->td_ucred, td);
 			/* An access failure can be overridden by privilege. */
 			if (error)
 				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 			if (error) {
 				VOP_UNLOCK(devvp);
 				return (error);
 			}
 			VOP_UNLOCK(devvp);
 			/* Acquire write access on the GEOM consumer. */
 			g_topology_lock();
 			error = g_access(pmp->pm_cp, 0, 1, 0);
 			g_topology_unlock();
 			if (error)
 				return (error);
 
 			/* Now that the volume is modifiable, mark it dirty. */
 			error = markvoldirty_upgrade(pmp, true, true);
 			if (error) {
 				/*
 				 * If dirtying the superblock failed, drop GEOM
 				 * 'w' refs (we're still RO).
 				 */
 				g_topology_lock();
 				(void)g_access(pmp->pm_cp, 0, -1, 0);
 				g_topology_unlock();
 
 				return (error);
 			}
 
 			pmp->pm_fmod = 1;
 			pmp->pm_flags &= ~MSDOSFSMNT_RONLY;
 			MNT_ILOCK(mp);
 			mp->mnt_flag &= ~MNT_RDONLY;
 			MNT_IUNLOCK(mp);
 		}
 	}
 	/*
 	 * Not an update, or updating the name: look up the name
 	 * and verify that it refers to a sensible disk device.
 	 */
 	if (vfs_getopt(mp->mnt_optnew, "from", (void **)&from, NULL))
 		return (EINVAL);
 	/* LOCKLEAF: namei() hands back devvp locked. */
 	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td);
 	error = namei(&ndp);
 	if (error)
 		return (error);
 	devvp = ndp.ni_vp;
 	NDFREE(&ndp, NDF_ONLY_PNBUF);
 
 	if (!vn_isdisk(devvp, &error)) {
 		vput(devvp);
 		return (error);
 	}
 	/*
 	 * If mount by non-root, then verify that user has necessary
 	 * permissions on the device.
 	 */
 	accmode = VREAD;
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		accmode |= VWRITE;
 	error = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
 	if (error)
 		error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 	if (error) {
 		vput(devvp);
 		return (error);
 	}
 	if ((mp->mnt_flag & MNT_UPDATE) == 0) {
 		/* mountmsdosfs() unlocks devvp on all of its paths. */
 		error = mountmsdosfs(devvp, mp);
 #ifdef MSDOSFS_DEBUG		/* only needed for the printf below */
 		pmp = VFSTOMSDOSFS(mp);
 #endif
 	} else {
 		/*
 		 * Update: the looked-up vnode is only compared against the
 		 * device already mounted, so release it right away.
 		 */
 		vput(devvp);
 		if (devvp != pmp->pm_devvp)
 			return (EINVAL);	/* XXX needs translation */
 	}
 	if (error) {
 		/* Only reachable after a failed mountmsdosfs(). */
 		vrele(devvp);
 		return (error);
 	}
 
 	error = update_mp(mp, td);
 	if (error) {
 		/* Undo a fresh mount; an update keeps the existing mount. */
 		if ((mp->mnt_flag & MNT_UPDATE) == 0)
 			msdosfs_unmount(mp, MNT_FORCE);
 		return error;
 	}
 
 	vfs_mountedfrom(mp, from);
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_mount(): mp %p, pmp %p, inusemap %p\n", mp, pmp, pmp->pm_inusemap);
 #endif
 	return (0);
 }
 
 /*
  * Do the real work of a fresh msdosfs mount.
  *
  * devvp - vnode of the device to mount; it is unlocked on every path
  *         through this function.
  * mp    - the mount point being set up.
  *
  * Claims the device, opens it through GEOM, reads and validates the boot
  * sector (BPB), derives the FAT geometry, reads the optional FSInfo
  * sector, builds the in-use cluster map and finally publishes the new
  * msdosfsmount in mp->mnt_data.
  *
  * Returns 0 on success, EBUSY if the device is already claimed by a mount,
  * EINVAL for an unacceptable boot sector, or an I/O or GEOM error.
  */
 static int
 mountmsdosfs(struct vnode *devvp, struct mount *mp)
 {
 	struct msdosfsmount *pmp;
 	struct buf *bp;
 	struct cdev *dev;
 	union bootsector *bsp;
 	struct byte_bpb33 *b33;
 	struct byte_bpb50 *b50;
 	struct byte_bpb710 *b710;
 	uint8_t SecPerClust;
 	u_long clusters;
 	int ronly, error;
 	struct g_consumer *cp;
 	struct bufobj *bo;
 
 	bp = NULL;		/* This and pmp both used in error_exit. */
 	pmp = NULL;
 	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 
 	dev = devvp->v_rdev;
 	/* Claim the device: si_mountpt must go from 0 to mp atomically. */
 	if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0,
 	    (uintptr_t)mp) == 0) {
 		VOP_UNLOCK(devvp);
 		return (EBUSY);
 	}
 	g_topology_lock();
 	error = g_vfs_open(devvp, &cp, "msdosfs", ronly ? 0 : 1);
 	g_topology_unlock();
 	if (error != 0) {
 		/* Give the device claim back. */
 		atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
 		VOP_UNLOCK(devvp);
 		return (error);
 	}
 	/* From here on failures go through error_exit, which undoes these. */
 	dev_ref(dev);
 	bo = &devvp->v_bufobj;
 	VOP_UNLOCK(devvp);
 	if (dev->si_iosize_max != 0)
 		mp->mnt_iosize_max = dev->si_iosize_max;
 	if (mp->mnt_iosize_max > MAXPHYS)
 		mp->mnt_iosize_max = MAXPHYS;
 
 	/*
 	 * Read the boot sector of the filesystem, and then check the
 	 * boot signature.  If not a dos boot sector then error out.
 	 *
 	 * NOTE: 8192 is a magic size that works for ffs.
 	 */
 	error = bread(devvp, 0, 8192, NOCRED, &bp);
 	if (error)
 		goto error_exit;
 	bp->b_flags |= B_AGE;
 	/* Three views of the same BPB, one per on-disk layout variant. */
 	bsp = (union bootsector *)bp->b_data;
 	b33 = (struct byte_bpb33 *)bsp->bs33.bsBPB;
 	b50 = (struct byte_bpb50 *)bsp->bs50.bsBPB;
 	b710 = (struct byte_bpb710 *)bsp->bs710.bsBPB;
 
 #ifndef MSDOSFS_NOCHECKSIG
 	if (bsp->bs50.bsBootSectSig0 != BOOTSIG0
 	    || bsp->bs50.bsBootSectSig1 != BOOTSIG1) {
 		error = EINVAL;
 		goto error_exit;
 	}
 #endif
 
 	/* M_ZERO: fields not set below (e.g. pm_fatmask) start out as 0. */
 	pmp = malloc(sizeof *pmp, M_MSDOSFSMNT, M_WAITOK | M_ZERO);
 	pmp->pm_mountp = mp;
 	pmp->pm_cp = cp;
 	pmp->pm_bo = bo;
 
 	lockinit(&pmp->pm_fatlock, 0, msdosfs_lock_msg, 0, 0);
 
 	/*
 	 * Initialize ownerships and permissions, since nothing else will
 	 * initialize them iff we are mounting root.
 	 */
 	pmp->pm_uid = UID_ROOT;
 	pmp->pm_gid = GID_WHEEL;
 	pmp->pm_mask = pmp->pm_dirmask = S_IXUSR | S_IXGRP | S_IXOTH |
 	    S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR;
 
 	/*
 	 * Compute several useful quantities from the bpb in the
 	 * bootsector.  Copy in the dos 5 variant of the bpb then fix up
 	 * the fields that are different between dos 5 and dos 3.3.
 	 */
 	SecPerClust = b50->bpbSecPerClust;
 	pmp->pm_BytesPerSec = getushort(b50->bpbBytesPerSec);
 	/* Checked early: pm_BlkPerSec below divides by DEV_BSIZE. */
 	if (pmp->pm_BytesPerSec < DEV_BSIZE) {
 		error = EINVAL;
 		goto error_exit;
 	}
 	pmp->pm_ResSectors = getushort(b50->bpbResSectors);
 	pmp->pm_FATs = b50->bpbFATs;
 	pmp->pm_RootDirEnts = getushort(b50->bpbRootDirEnts);
 	pmp->pm_Sectors = getushort(b50->bpbSectors);
 	pmp->pm_FATsecs = getushort(b50->bpbFATsecs);
 	pmp->pm_SecPerTrack = getushort(b50->bpbSecPerTrack);
 	pmp->pm_Heads = getushort(b50->bpbHeads);
 	pmp->pm_Media = b50->bpbMedia;
 
 	/* calculate the ratio of sector size to DEV_BSIZE */
 	pmp->pm_BlkPerSec = pmp->pm_BytesPerSec / DEV_BSIZE;
 
 	/*
 	 * We don't check pm_Heads nor pm_SecPerTrack, because
 	 * these may not be set for EFI file systems. We don't
 	 * use these anyway, so we're unaffected if they are
 	 * invalid.
 	 */
 	if (!pmp->pm_BytesPerSec || !SecPerClust) {
 		error = EINVAL;
 		goto error_exit;
 	}
 
 	/* A zero 16-bit sector count means the 32-bit fields are in use. */
 	if (pmp->pm_Sectors == 0) {
 		pmp->pm_HiddenSects = getulong(b50->bpbHiddenSecs);
 		pmp->pm_HugeSectors = getulong(b50->bpbHugeSectors);
 	} else {
 		pmp->pm_HiddenSects = getushort(b33->bpbHiddenSecs);
 		pmp->pm_HugeSectors = pmp->pm_Sectors;
 	}
 
 	/* No fixed root directory entries: treat the volume as FAT32. */
 	if (pmp->pm_RootDirEnts == 0) {
 		if (pmp->pm_FATsecs
 		    || getushort(b710->bpbFSVers)) {
 			error = EINVAL;
 #ifdef MSDOSFS_DEBUG
 			printf("mountmsdosfs(): bad FAT32 filesystem\n");
 #endif
 			goto error_exit;
 		}
 		pmp->pm_fatmask = FAT32_MASK;
 		pmp->pm_fatmult = 4;
 		pmp->pm_fatdiv = 1;
 		pmp->pm_FATsecs = getulong(b710->bpbBigFATsecs);
 		if (getushort(b710->bpbExtFlags) & FATMIRROR)
 			pmp->pm_curfat = getushort(b710->bpbExtFlags) & FATNUM;
 		else
 			pmp->pm_flags |= MSDOSFS_FATMIRROR;
 	} else
 		pmp->pm_flags |= MSDOSFS_FATMIRROR;
 
 	/*
 	 * Check a few values (could do some more):
 	 * - logical sector size: power of 2, >= block size
 	 * - sectors per cluster: power of 2, >= 1
 	 * - number of sectors:   >= 1, <= size of partition
 	 * - number of FAT sectors: >= 1
 	 */
 	if ( (SecPerClust == 0)
 	  || (SecPerClust & (SecPerClust - 1))
 	  || (pmp->pm_BytesPerSec < DEV_BSIZE)
 	  || (pmp->pm_BytesPerSec & (pmp->pm_BytesPerSec - 1))
 	  || (pmp->pm_HugeSectors == 0)
 	  || (pmp->pm_FATsecs == 0)
 	  || (SecPerClust * pmp->pm_BlkPerSec > MAXBSIZE / DEV_BSIZE)
 	) {
 		error = EINVAL;
 		goto error_exit;
 	}
 
 	/* Convert sector counts into counts of DEV_BSIZE blocks. */
 	pmp->pm_HugeSectors *= pmp->pm_BlkPerSec;
 	pmp->pm_HiddenSects *= pmp->pm_BlkPerSec;	/* XXX not used? */
 	pmp->pm_FATsecs     *= pmp->pm_BlkPerSec;
 	SecPerClust         *= pmp->pm_BlkPerSec;
 
 	pmp->pm_fatblk = pmp->pm_ResSectors * pmp->pm_BlkPerSec;
 
 	if (FAT32(pmp)) {
 		pmp->pm_rootdirblk = getulong(b710->bpbRootClust);
 		pmp->pm_firstcluster = pmp->pm_fatblk
 			+ (pmp->pm_FATs * pmp->pm_FATsecs);
 		pmp->pm_fsinfo = getushort(b710->bpbFSInfo) * pmp->pm_BlkPerSec;
 	} else {
 		pmp->pm_rootdirblk = pmp->pm_fatblk +
 			(pmp->pm_FATs * pmp->pm_FATsecs);
 		pmp->pm_rootdirsize = howmany(pmp->pm_RootDirEnts *
 			sizeof(struct direntry), DEV_BSIZE); /* in blocks */
 		pmp->pm_firstcluster = pmp->pm_rootdirblk + pmp->pm_rootdirsize;
 	}
 
 	pmp->pm_maxcluster = (pmp->pm_HugeSectors - pmp->pm_firstcluster) /
 	    SecPerClust + 1;
 	pmp->pm_fatsize = pmp->pm_FATsecs * DEV_BSIZE;	/* XXX not used? */
 
 	/* Not FAT32 (mask still zero): choose FAT12 or FAT16 by size. */
 	if (pmp->pm_fatmask == 0) {
 		if (pmp->pm_maxcluster
 		    <= ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK)) {
 			/*
 			 * This will usually be a floppy disk. This size makes
 			 * sure that one FAT entry will not be split across
 			 * multiple blocks.
 			 */
 			pmp->pm_fatmask = FAT12_MASK;
 			pmp->pm_fatmult = 3;
 			pmp->pm_fatdiv = 2;
 		} else {
 			pmp->pm_fatmask = FAT16_MASK;
 			pmp->pm_fatmult = 2;
 			pmp->pm_fatdiv = 1;
 		}
 	}
 
 	/* Clamp the cluster count to what the FAT can actually describe. */
 	clusters = (pmp->pm_fatsize / pmp->pm_fatmult) * pmp->pm_fatdiv;
 	if (pmp->pm_maxcluster >= clusters) {
 #ifdef MSDOSFS_DEBUG
 		printf("Warning: number of clusters (%ld) exceeds FAT "
 		    "capacity (%ld)\n", pmp->pm_maxcluster + 1, clusters);
 #endif
 		pmp->pm_maxcluster = clusters - 1;
 	}
 
 	if (FAT12(pmp))
 		pmp->pm_fatblocksize = 3 * 512;
 	else
 		pmp->pm_fatblocksize = PAGE_SIZE;
 	pmp->pm_fatblocksize = roundup(pmp->pm_fatblocksize,
 	    pmp->pm_BytesPerSec);
 	pmp->pm_fatblocksec = pmp->pm_fatblocksize / DEV_BSIZE;
 	pmp->pm_bnshift = ffs(DEV_BSIZE) - 1;
 
 	/*
 	 * Compute mask and shift value for isolating cluster relative byte
 	 * offsets and cluster numbers from a file offset.
 	 */
 	pmp->pm_bpcluster = SecPerClust * DEV_BSIZE;
 	pmp->pm_crbomask = pmp->pm_bpcluster - 1;
 	pmp->pm_cnshift = ffs(pmp->pm_bpcluster) - 1;
 
 	/*
 	 * Check for valid cluster size
 	 * must be a power of 2
 	 */
 	if (pmp->pm_bpcluster ^ (1 << pmp->pm_cnshift)) {
 		error = EINVAL;
 		goto error_exit;
 	}
 
 	/*
 	 * Release the bootsector buffer.
 	 */
 	brelse(bp);
 	bp = NULL;
 
 	/*
 	 * Check the fsinfo sector if we have one.  Silently fix up our
 	 * in-core copy of fp->fsinxtfree if it is unknown (0xffffffff)
 	 * or too large.  Ignore fp->fsinfree for now, since we need to
 	 * read the entire FAT anyway to fill the inuse map.
 	 */
 	if (pmp->pm_fsinfo) {
 		struct fsinfo *fp;
 
 		if ((error = bread(devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec,
 		    NOCRED, &bp)) != 0)
 			goto error_exit;
 		fp = (struct fsinfo *)bp->b_data;
 		/* All three signatures must match, else ignore the sector. */
 		if (!bcmp(fp->fsisig1, "RRaA", 4)
 		    && !bcmp(fp->fsisig2, "rrAa", 4)
 		    && !bcmp(fp->fsisig3, "\0\0\125\252", 4)) {
 			pmp->pm_nxtfree = getulong(fp->fsinxtfree);
 			if (pmp->pm_nxtfree > pmp->pm_maxcluster)
 				pmp->pm_nxtfree = CLUST_FIRST;
 		} else
 			pmp->pm_fsinfo = 0;
 		brelse(bp);
 		bp = NULL;
 	}
 
 	/*
 	 * Finish initializing pmp->pm_nxtfree (just in case the first few
 	 * sectors aren't properly reserved in the FAT).  This completes
 	 * the fixup for fp->fsinxtfree, and fixes up the zero-initialized
 	 * value if there is no fsinfo.  We will use pmp->pm_nxtfree
 	 * internally even if there is no fsinfo.
 	 */
 	if (pmp->pm_nxtfree < CLUST_FIRST)
 		pmp->pm_nxtfree = CLUST_FIRST;
 
 	/*
 	 * Allocate memory for the bitmap of allocated clusters, and then
 	 * fill it in.
 	 */
 	pmp->pm_inusemap = malloc(howmany(pmp->pm_maxcluster + 1, N_INUSEBITS)
 				  * sizeof(*pmp->pm_inusemap),
 				  M_MSDOSFSFAT, M_WAITOK);
 
 	/*
 	 * fillinusemap() needs pm_devvp.
 	 */
 	pmp->pm_devvp = devvp;
 	pmp->pm_dev = dev;
 
 	/*
 	 * Have the inuse map filled in.
 	 */
 	MSDOSFS_LOCK_MP(pmp);
 	error = fillinusemap(pmp);
 	MSDOSFS_UNLOCK_MP(pmp);
 	if (error != 0)
 		goto error_exit;
 
 	/*
 	 * If they want FAT updates to be synchronous then let them suffer
 	 * the performance degradation in exchange for the on disk copy of
 	 * the FAT being correct just about all the time.  I suppose this
 	 * would be a good thing to turn on if the kernel is still flakey.
 	 */
 	if (mp->mnt_flag & MNT_SYNCHRONOUS)
 		pmp->pm_flags |= MSDOSFSMNT_WAITONFAT;
 
 	/*
 	 * Finish up.
 	 */
 	if (ronly)
 		pmp->pm_flags |= MSDOSFSMNT_RONLY;
 	else {
 		/* Read-write mount: flag the volume dirty while mounted. */
 		if ((error = markvoldirty(pmp, 1)) != 0)
 			goto error_exit;
 		pmp->pm_fmod = 1;
 	}
 	mp->mnt_data =  pmp;
 	mp->mnt_stat.f_fsid.val[0] = dev2udev(dev);
 	mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	mp->mnt_kern_flag |= MNTK_USES_BCACHE | MNTK_NO_IOPF;
 	MNT_IUNLOCK(mp);
 
 	return (0);
 
 	/*
 	 * Common failure path: release the buffer, close the GEOM consumer,
 	 * free the partially built mount structure, and give back the
 	 * device claim and reference taken at the top.
 	 */
 error_exit:
 	if (bp)
 		brelse(bp);
 	if (cp != NULL) {
 		g_topology_lock();
 		g_vfs_close(cp);
 		g_topology_unlock();
 	}
 	if (pmp) {
 		lockdestroy(&pmp->pm_fatlock);
 		free(pmp->pm_inusemap, M_MSDOSFSFAT);
 		free(pmp, M_MSDOSFSMNT);
 		mp->mnt_data = NULL;
 	}
 	atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
 	dev_rel(dev);
 	return (error);
 }
 
 /*
  * Unmount the filesystem described by mp.
  */
 static int
 msdosfs_unmount(struct mount *mp, int mntflags)
 {
 	struct msdosfsmount *pmp;
 	int error, flags;
 
 	error = flags = 0;
 	pmp = VFSTOMSDOSFS(mp);
 	if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0)
 		error = msdosfs_sync(mp, MNT_WAIT);
 	if ((mntflags & MNT_FORCE) != 0)
 		flags |= FORCECLOSE;
 	else if (error != 0)
 		return (error);
 	error = vflush(mp, 0, flags, curthread);
 	if (error != 0 && error != ENXIO)
 		return (error);
 	if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0) {
 		error = markvoldirty(pmp, 0);
 		if (error && error != ENXIO) {
 			(void)markvoldirty(pmp, 1);
 			return (error);
 		}
 	}
 	if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) {
 		if (pmp->pm_w2u)
 			msdosfs_iconv->close(pmp->pm_w2u);
 		if (pmp->pm_u2w)
 			msdosfs_iconv->close(pmp->pm_u2w);
 		if (pmp->pm_d2u)
 			msdosfs_iconv->close(pmp->pm_d2u);
 		if (pmp->pm_u2d)
 			msdosfs_iconv->close(pmp->pm_u2d);
 	}
 
 #ifdef MSDOSFS_DEBUG
 	{
 		struct vnode *vp = pmp->pm_devvp;
 		struct bufobj *bo;
 
 		bo = &vp->v_bufobj;
 		BO_LOCK(bo);
 		VI_LOCK(vp);
 		vn_printf(vp,
 		    "msdosfs_umount(): just before calling VOP_CLOSE()\n");
 		printf("freef %p, freeb %p, mount %p\n",
 		    TAILQ_NEXT(vp, v_vnodelist), vp->v_vnodelist.tqe_prev,
 		    vp->v_mount);
 		printf("cleanblkhd %p, dirtyblkhd %p, numoutput %ld, type %d\n",
 		    TAILQ_FIRST(&vp->v_bufobj.bo_clean.bv_hd),
 		    TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd),
 		    vp->v_bufobj.bo_numoutput, vp->v_type);
 		VI_UNLOCK(vp);
 		BO_UNLOCK(bo);
 	}
 #endif
 	g_topology_lock();
 	g_vfs_close(pmp->pm_cp);
 	g_topology_unlock();
 	atomic_store_rel_ptr((uintptr_t *)&pmp->pm_dev->si_mountpt, 0);
 	vrele(pmp->pm_devvp);
 	dev_rel(pmp->pm_dev);
 	free(pmp->pm_inusemap, M_MSDOSFSFAT);
 	lockdestroy(&pmp->pm_fatlock);
 	free(pmp, M_MSDOSFSMNT);
 	mp->mnt_data = NULL;
 	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	return (error);
 }
 
 static int
 msdosfs_root(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
 	struct denode *ndep;
 	int error;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_root(); mp %p, pmp %p\n", mp, pmp);
 #endif
 	error = deget(pmp, MSDOSFSROOT, MSDOSFSROOT_OFS, &ndep);
 	if (error)
 		return (error);
 	*vpp = DETOV(ndep);
 	return (0);
 }
 
 /*
  * Fill in file system statistics from the in-core mount structure.
  * Sizes are reported in units of one cluster.  Always returns 0.
  */
 static int
 msdosfs_statfs(struct mount *mp, struct statfs *sbp)
 {
 	struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
 
 	/* Block size and preferred I/O size are both one cluster. */
 	sbp->f_iosize = pmp->pm_bpcluster;
 	sbp->f_bsize = pmp->pm_bpcluster;
 
 	/* Total and free space, counted in clusters. */
 	sbp->f_blocks = pmp->pm_maxcluster + 1;
 	sbp->f_bavail = pmp->pm_freeclustercount;
 	sbp->f_bfree = pmp->pm_freeclustercount;
 
 	/* File counts are only rough stand-ins. */
 	sbp->f_files = pmp->pm_RootDirEnts;	/* XXX */
 	sbp->f_ffree = 0;	/* what to put in here? */
 
 	return (0);
 }
 
 /*
  * If we have an FSInfo block, update it.
  */
 static int
 msdosfs_fsiflush(struct msdosfsmount *pmp, int waitfor)
 {
 	struct fsinfo *fp;
 	struct buf *bp;
 	int error;
 
 	MSDOSFS_LOCK_MP(pmp);
 	if (pmp->pm_fsinfo == 0 || (pmp->pm_flags & MSDOSFS_FSIMOD) == 0) {
 		error = 0;
 		goto unlock;
 	}
 	error = bread(pmp->pm_devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec,
 	    NOCRED, &bp);
 	if (error != 0) {
 		goto unlock;
 	}
 	fp = (struct fsinfo *)bp->b_data;
 	putulong(fp->fsinfree, pmp->pm_freeclustercount);
 	putulong(fp->fsinxtfree, pmp->pm_nxtfree);
 	pmp->pm_flags &= ~MSDOSFS_FSIMOD;
 	if (waitfor == MNT_WAIT)
 		error = bwrite(bp);
 	else
 		bawrite(bp);
 unlock:
 	MSDOSFS_UNLOCK_MP(pmp);
 	return (error);
 }
 
 /*
  * Flush modified state of the file system to disk.
  *
  * Walks every vnode of the mount and fsyncs those whose denode has
  * pending time/modification flags or (unless waitfor is MNT_LAZY) dirty
  * buffers, then fsyncs the device vnode (again skipped for MNT_LAZY) and
  * finally updates the FSInfo sector.
  *
  * Individual failures do not stop the walk; the most recent error seen is
  * returned, or 0 if everything succeeded.
  */
 static int
 msdosfs_sync(struct mount *mp, int waitfor)
 {
 	struct vnode *vp, *nvp;
 	struct thread *td;
 	struct denode *dep;
 	struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
 	int error, allerror = 0;
 
 	td = curthread;
 
 	/*
 	 * If we ever switch to not updating all of the FATs all the time,
 	 * this would be the place to update them from the first one.
 	 */
 	if (pmp->pm_fmod != 0) {
 		if (pmp->pm_flags & MSDOSFSMNT_RONLY)
 			panic("msdosfs_sync: rofs mod");
 		else {
 			/* update FATs here */
 		}
 	}
 	/*
 	 * Write back each (modified) denode.
 	 */
 loop:
 	MNT_VNODE_FOREACH_ALL(vp, mp, nvp) {
 		/* Each skip path below drops the vnode interlock itself. */
 		if (vp->v_type == VNON) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		dep = VTODE(vp);
 		/* Nothing to write back for this vnode. */
 		if ((dep->de_flag &
 		    (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0 &&
 		    (vp->v_bufobj.bo_dirty.bv_cnt == 0 ||
 		    waitfor == MNT_LAZY)) {
 			VI_UNLOCK(vp);
 			continue;
 		}
-		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td);
+		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK);
 		if (error) {
 			/* ENOENT: abort the iteration and restart the walk. */
 			if (error == ENOENT) {
 				MNT_VNODE_FOREACH_ALL_ABORT(mp, nvp);
 				goto loop;
 			}
 			/* Any other failure (LK_NOWAIT): skip this vnode. */
 			continue;
 		}
 		error = VOP_FSYNC(vp, waitfor, td);
 		if (error)
 			allerror = error;
 		VOP_UNLOCK(vp);
 		vrele(vp);
 	}
 
 	/*
 	 * Flush filesystem control info.
 	 */
 	if (waitfor != MNT_LAZY) {
 		vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_FSYNC(pmp->pm_devvp, waitfor, td);
 		if (error)
 			allerror = error;
 		VOP_UNLOCK(pmp->pm_devvp);
 	}
 
 	/* Write the FSInfo sector if it was modified. */
 	error = msdosfs_fsiflush(pmp, waitfor);
 	if (error != 0)
 		allerror = error;
 	return (allerror);
 }
 
 static int
 msdosfs_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
 {
 	struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
 	struct defid *defhp = (struct defid *) fhp;
 	struct denode *dep;
 	int error;
 
 	error = deget(pmp, defhp->defid_dirclust, defhp->defid_dirofs, &dep);
 	if (error) {
 		*vpp = NULLVP;
 		return (error);
 	}
 	*vpp = DETOV(dep);
 	vnode_create_vobject(*vpp, dep->de_FileSize, curthread);
 	return (0);
 }
 
 /*
  * VFS operations vector for msdosfs.  Only the operations implemented in
  * this file are listed; the remaining members are left unset here.
  */
 static struct vfsops msdosfs_vfsops = {
 	.vfs_fhtovp =		msdosfs_fhtovp,
 	.vfs_mount =		msdosfs_mount,
 	.vfs_cmount =		msdosfs_cmount,
 	.vfs_root =		msdosfs_root,
 	.vfs_statfs =		msdosfs_statfs,
 	.vfs_sync =		msdosfs_sync,
 	.vfs_unmount =		msdosfs_unmount,
 };
 
 /* Register the file system type "msdosfs" and declare its module version. */
 VFS_SET(msdosfs_vfsops, msdosfs, 0);
 MODULE_VERSION(msdosfs, 1);
Index: projects/clang1100-import/sys/fs/nfsclient/nfs_clvfsops.c
===================================================================
--- projects/clang1100-import/sys/fs/nfsclient/nfs_clvfsops.c	(revision 364278)
+++ projects/clang1100-import/sys/fs/nfsclient/nfs_clvfsops.c	(revision 364279)
@@ -1,2061 +1,2061 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from nfs_vfsops.c	8.12 (Berkeley) 5/20/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 
 #include "opt_bootp.h"
 #include "opt_nfsroot.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/clock.h>
 #include <sys/jail.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #include <sys/signalvar.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <netinet/in.h>
 
 #include <fs/nfs/nfsport.h>
 #include <fs/nfsclient/nfsnode.h>
 #include <fs/nfsclient/nfsmount.h>
 #include <fs/nfsclient/nfs.h>
 #include <nfs/nfsdiskless.h>
 
 /* Kernel feature string for this client. */
 FEATURE(nfscl, "NFSv4 client");
 
 /* Variables and locks that are defined outside this file. */
 extern int nfscl_ticks;
 extern struct timeval nfsboottime;
 extern int nfsrv_useacl;
 extern int nfscl_debuglevel;
 extern enum nfsiod_state ncl_iodwant[NFS_MAXASYNCDAEMON];
 extern struct nfsmount *ncl_iodmount[NFS_MAXASYNCDAEMON];
 extern struct mtx ncl_iod_mutex;
 NFSCLSTATEMUTEX;
 extern struct mtx nfsrv_dslock_mtx;
 
 /* malloc(9) types used for NFS request headers and mount structures. */
 MALLOC_DEFINE(M_NEWNFSREQ, "newnfsclient_req", "NFS request header");
 MALLOC_DEFINE(M_NEWNFSMNT, "newnfsmnt", "NFS mount struct");
 
 /* Tunables exported under the vfs.nfs sysctl node. */
 SYSCTL_DECL(_vfs_nfs);
 static int nfs_ip_paranoia = 1;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_ip_paranoia, CTLFLAG_RW,
     &nfs_ip_paranoia, 0, "");
 /* NOTE(review): presumably the delay before the first such message -- confirm */
 static int nfs_tprintf_initial_delay = NFS_TPRINTF_INITIAL_DELAY;
 SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_INITIAL_DELAY,
         downdelayinitial, CTLFLAG_RW, &nfs_tprintf_initial_delay, 0, "");
 /* how long between console messages "nfs server foo not responding" */
 static int nfs_tprintf_delay = NFS_TPRINTF_DELAY;
 SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_DELAY,
         downdelayinterval, CTLFLAG_RW, &nfs_tprintf_delay, 0, "");
 #ifdef NFS_DEBUG
 /* Debug flag, only present in kernels built with NFS_DEBUG. */
 int nfs_debug;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, debug, CTLFLAG_RW, &nfs_debug, 0,
     "Toggle debug flag");
 #endif
 
 /* Forward declarations for the file-local helpers defined below. */
 static int	nfs_mountroot(struct mount *);
 static void	nfs_sec_name(char *, int *);
 static void	nfs_decode_args(struct mount *mp, struct nfsmount *nmp,
 		    struct nfs_args *argp, const char *, struct ucred *,
 		    struct thread *);
 static int	mountnfs(struct nfs_args *, struct mount *,
 		    struct sockaddr *, char *, u_char *, int, u_char *, int,
 		    u_char *, int, struct vnode **, struct ucred *,
 		    struct thread *, int, int, int);
 static void	nfs_getnlminfo(struct vnode *, uint8_t *, size_t *,
 		    struct sockaddr_storage *, int *, off_t *,
 		    struct timeval *);
 /* VFS entry points referenced from nfs_vfsops. */
 static vfs_mount_t nfs_mount;
 static vfs_cmount_t nfs_cmount;
 static vfs_unmount_t nfs_unmount;
 static vfs_root_t nfs_root;
 static vfs_statfs_t nfs_statfs;
 static vfs_sync_t nfs_sync;
 static vfs_sysctl_t nfs_sysctl;
 static vfs_purge_t nfs_purge;
 
 /*
  * nfs vfs operations.
  *
  * Note that .vfs_root is the generic vfs_cache_root while the
  * filesystem's own nfs_root is installed as .vfs_cachedroot.
  */
 static struct vfsops nfs_vfsops = {
 	.vfs_init =		ncl_init,
 	.vfs_mount =		nfs_mount,
 	.vfs_cmount =		nfs_cmount,
 	.vfs_root =		vfs_cache_root,
 	.vfs_cachedroot =	nfs_root,
 	.vfs_statfs =		nfs_statfs,
 	.vfs_sync =		nfs_sync,
 	.vfs_uninit =		ncl_uninit,
 	.vfs_unmount =		nfs_unmount,
 	.vfs_sysctl =		nfs_sysctl,
 	.vfs_purge =		nfs_purge,
 };
 /* Declare the "nfs" filesystem type with the network and SBDRY flags. */
 VFS_SET(nfs_vfsops, nfs, VFCF_NETWORK | VFCF_SBDRY);
 
 /* So that loader and kldload(2) can find us, wherever we are.. */
 MODULE_VERSION(nfs, 1);
 /* Modules this one depends on (version 1 of each). */
 MODULE_DEPEND(nfs, nfscommon, 1, 1, 1);
 MODULE_DEPEND(nfs, krpc, 1, 1, 1);
 MODULE_DEPEND(nfs, nfssvc, 1, 1, 1);
 
 /*
  * This structure is now defined in sys/nfs/nfs_diskless.c so that it
  * can be shared by both NFS clients. It is declared here so that it
  * will be defined for kernels built without NFS_ROOT, although it
  * isn't used in that case.
  */
 #if !defined(NFS_ROOT)
 struct nfs_diskless	nfs_diskless = { { { 0 } } };
 struct nfsv3_diskless	nfsv3_diskless = { { { 0 } } };
 int			nfs_diskless_valid = 0;
 #endif
 
 /* Read-only sysctls exposing the diskless configuration under vfs.nfs. */
 SYSCTL_INT(_vfs_nfs, OID_AUTO, diskless_valid, CTLFLAG_RD,
     &nfs_diskless_valid, 0,
     "Has the diskless struct been filled correctly");
 
 SYSCTL_STRING(_vfs_nfs, OID_AUTO, diskless_rootpath, CTLFLAG_RD,
     nfsv3_diskless.root_hostnam, 0, "Path to nfs root");
 
 SYSCTL_OPAQUE(_vfs_nfs, OID_AUTO, diskless_rootaddr, CTLFLAG_RD,
     &nfsv3_diskless.root_saddr, sizeof(nfsv3_diskless.root_saddr),
     "%Ssockaddr_in", "Diskless root nfs address");
 
 
 /* Prototypes for the diskless-boot helpers. */
 void		newnfsargs_ntoh(struct nfs_args *);
 static int	nfs_mountdiskless(char *,
 		    struct sockaddr_in *, struct nfs_args *,
 		    struct thread *, struct vnode **, struct mount *);
 static void	nfs_convert_diskless(void);
 static void	nfs_convert_oargs(struct nfs_args *args,
 		    struct onfs_args *oargs);
 
 /*
  * Clamp the mount's read, write and readdir transfer sizes to what the
  * protocol version and transport allow, then compute the I/O buffer
  * size, store it in the mount's f_iosize and return it.
  */
 int
 newnfs_iosize(struct nfsmount *nmp)
 {
 	int bufsz, ceiling;
 
 	/* Protocol/transport dependent ceiling for a single transfer. */
 	if ((nmp->nm_flag & NFSMNT_NFSV4) != 0) {
 		ceiling = NFS_MAXBSIZE;
 	} else if ((nmp->nm_flag & NFSMNT_NFSV3) != 0) {
 		if (nmp->nm_sotype != SOCK_DGRAM)
 			ceiling = NFS_MAXBSIZE;
 		else
 			ceiling = NFS_MAXDGRAMDATA;
 	} else {
 		ceiling = NFS_V2MAXDATA;
 	}
 
 	/* Read size: an unset (0) or oversized value becomes the ceiling. */
 	if (nmp->nm_rsize == 0 || nmp->nm_rsize > ceiling)
 		nmp->nm_rsize = ceiling;
 	if (nmp->nm_rsize > NFS_MAXBSIZE)
 		nmp->nm_rsize = NFS_MAXBSIZE;
 
 	/* Readdir size: same rule, and never larger than the read size. */
 	if (nmp->nm_readdirsize == 0 || nmp->nm_readdirsize > ceiling)
 		nmp->nm_readdirsize = ceiling;
 	if (nmp->nm_readdirsize > nmp->nm_rsize)
 		nmp->nm_readdirsize = nmp->nm_rsize;
 
 	/* Write size: same rule as the read size. */
 	if (nmp->nm_wsize == 0 || nmp->nm_wsize > ceiling)
 		nmp->nm_wsize = ceiling;
 	if (nmp->nm_wsize > NFS_MAXBSIZE)
 		nmp->nm_wsize = NFS_MAXBSIZE;
 
 	/*
 	 * The buffer size is the larger of the read and write sizes, so
 	 * that fewer requests are needed, but never below one VM page
 	 * (smaller buffers waste buffer space) nor below NFS_DIRBLKSIZ
 	 * (the buffer size used for directories).
 	 */
 	bufsz = imax(imax(imax(nmp->nm_rsize, nmp->nm_wsize), PAGE_SIZE),
 	    NFS_DIRBLKSIZ);
 	nmp->nm_mountp->mnt_stat.f_iosize = bufsz;
 	return (bufsz);
 }
 
 /*
  * Populate a current-format nfs_args from an old-format onfs_args.
  * The version is stamped as NFS_ARGSVERSION; every other field assigned
  * here is copied from the member of the same name in *oargs.
  */
 static void
 nfs_convert_oargs(struct nfs_args *args, struct onfs_args *oargs)
 {
 	struct nfs_args *dst = args;
 	struct onfs_args *src = oargs;
 
 	dst->version = NFS_ARGSVERSION;
 
 	/* Server address and transport. */
 	dst->addr = src->addr;
 	dst->addrlen = src->addrlen;
 	dst->sotype = src->sotype;
 	dst->proto = src->proto;
 
 	/* File handle. */
 	dst->fh = src->fh;
 	dst->fhsize = src->fhsize;
 
 	/* Flags and tunables. */
 	dst->flags = src->flags;
 	dst->wsize = src->wsize;
 	dst->rsize = src->rsize;
 	dst->readdirsize = src->readdirsize;
 	dst->timeo = src->timeo;
 	dst->retrans = src->retrans;
 	dst->readahead = src->readahead;
 
 	dst->hostname = src->hostname;
 }
 
 /*
  * Convert the global nfs_diskless structure into nfsv3_diskless and set
  * nfs_diskless_valid to 3 once done.
  */
 static void
 nfs_convert_diskless(void)
 {
 	int fhlen;
 
 	/* Interface and gateway settings are copied verbatim. */
 	bcopy(&nfs_diskless.myif, &nfsv3_diskless.myif,
 	    sizeof(struct ifaliasreq));
 	bcopy(&nfs_diskless.mygateway, &nfsv3_diskless.mygateway,
 	    sizeof(struct sockaddr_in));
 
 	/* Root mount arguments go through the old-to-new converter. */
 	nfs_convert_oargs(&nfsv3_diskless.root_args, &nfs_diskless.root_args);
 
 	/* The file handle length depends on whether NFSv3 was requested. */
 	if ((nfsv3_diskless.root_args.flags & NFSMNT_NFSV3) != 0)
 		fhlen = NFSX_MYFH;
 	else
 		fhlen = NFSX_V2FH;
 	nfsv3_diskless.root_fhsize = fhlen;
 	bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, fhlen);
 
 	/* Server address, root path, time and our own host name. */
 	bcopy(&nfs_diskless.root_saddr, &nfsv3_diskless.root_saddr,
 	    sizeof(struct sockaddr_in));
 	bcopy(nfs_diskless.root_hostnam, nfsv3_diskless.root_hostnam,
 	    MNAMELEN);
 	nfsv3_diskless.root_time = nfs_diskless.root_time;
 	bcopy(nfs_diskless.my_hostnam, nfsv3_diskless.my_hostnam,
 	    MAXHOSTNAMELEN);
 
 	nfs_diskless_valid = 3;
 }
 
 /*
  * nfs statfs call
  *
  * Fill *sbp with statistics for the mount mp.  The mount is busied with
  * MBF_NOWAIT and the vnode for the mount's file handle (nmp->nm_fh) is
  * obtained with LK_EXCLUSIVE; both are released before returning on
  * every path that got that far.
  *
  * Returns 0 on success.  Otherwise returns the error from vfs_busy(),
  * ncl_nget() or the fsinfo/statfs RPCs; for an NFSv4 vnode the RPC error
  * is first passed through nfscl_maperr().
  */
 static int
 nfs_statfs(struct mount *mp, struct statfs *sbp)
 {
 	struct vnode *vp;
 	struct thread *td;
 	struct nfsmount *nmp = VFSTONFS(mp);
 	struct nfsvattr nfsva;
 	struct nfsfsinfo fs;
 	struct nfsstatfs sb;
 	int error = 0, attrflag, gotfsinfo = 0, ret;
 	struct nfsnode *np;
 
 	td = curthread;
 
 	error = vfs_busy(mp, MBF_NOWAIT);
 	if (error)
 		return (error);
 	error = ncl_nget(mp, nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE);
 	if (error) {
 		vfs_unbusy(mp);
 		return (error);
 	}
 	vp = NFSTOV(np);
 	/*
 	 * For an NFSv3 mount that has not yet got its fsinfo, do the fsinfo
 	 * RPC first.  The mount mutex is only held for the test itself and
 	 * is dropped before any RPC is issued.
 	 */
 	mtx_lock(&nmp->nm_mtx);
 	if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp)) {
 		mtx_unlock(&nmp->nm_mtx);
 		error = nfsrpc_fsinfo(vp, &fs, td->td_ucred, td, &nfsva,
 		    &attrflag, NULL);
 		if (!error)
 			gotfsinfo = 1;
 	} else
 		mtx_unlock(&nmp->nm_mtx);
 	if (!error)
 		error = nfsrpc_statfs(vp, &sb, &fs, td->td_ucred, td, &nfsva,
 		    &attrflag, NULL);
 	if (error != 0)
 		NFSCL_DEBUG(2, "statfs=%d\n", error);
 	/*
 	 * attrflag is only written through the pointer handed to the RPCs
 	 * above.  When it reads as 0, fetch the attributes separately.
 	 */
 	if (attrflag == 0) {
 		ret = nfsrpc_getattrnovp(nmp, nmp->nm_fh, nmp->nm_fhsize, 1,
 		    td->td_ucred, td, &nfsva, NULL, NULL);
 		if (ret) {
 			/*
 			 * Just set default values to get things going.
 			 */
 			NFSBZERO((caddr_t)&nfsva, sizeof (struct nfsvattr));
 			nfsva.na_vattr.va_type = VDIR;
 			nfsva.na_vattr.va_mode = 0777;
 			nfsva.na_vattr.va_nlink = 100;
 			nfsva.na_vattr.va_uid = (uid_t)0;
 			nfsva.na_vattr.va_gid = (gid_t)0;
 			nfsva.na_vattr.va_fileid = 2;
 			nfsva.na_vattr.va_gen = 1;
 			nfsva.na_vattr.va_blocksize = NFS_FABLKSIZE;
 			nfsva.na_vattr.va_size = 512 * 1024;
 		}
 	}
 	/* Load whatever attributes we ended up with; the result is ignored. */
 	(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 	if (!error) {
 	    /* Publish the results while holding the mount mutex. */
 	    mtx_lock(&nmp->nm_mtx);
 	    if (gotfsinfo || (nmp->nm_flag & NFSMNT_NFSV4))
 		nfscl_loadfsinfo(nmp, &fs);
 	    nfscl_loadsbinfo(nmp, &sb, sbp);
 	    sbp->f_iosize = newnfs_iosize(nmp);
 	    mtx_unlock(&nmp->nm_mtx);
 	    /* A caller-supplied statfs also needs the mount names copied. */
 	    if (sbp != &mp->mnt_stat) {
 		bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN);
 		bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN);
 	    }
 	    strncpy(&sbp->f_fstypename[0], mp->mnt_vfc->vfc_name, MFSNAMELEN);
 	} else if (NFS_ISV4(vp)) {
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	}
 	vput(vp);
 	vfs_unbusy(mp);
 	return (error);
 }
 
 /*
  * nfs version 3 fsinfo rpc call
  */
 int
 ncl_fsinfo(struct nfsmount *nmp, struct vnode *vp, struct ucred *cred,
     struct thread *td)
 {
 	struct nfsfsinfo fs;
 	struct nfsvattr nfsva;
 	int error, attrflag;
 	
 	error = nfsrpc_fsinfo(vp, &fs, cred, td, &nfsva, &attrflag, NULL);
 	if (!error) {
 		if (attrflag)
 			(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0,
 			    1);
 		mtx_lock(&nmp->nm_mtx);
 		nfscl_loadfsinfo(nmp, &fs);
 		mtx_unlock(&nmp->nm_mtx);
 	}
 	return (error);
 }
 
 /*
  * Mount a remote root fs via. nfs. This depends on the info in the
  * nfs_diskless structure that has been filled in properly by some primary
  * bootstrap.
  * It goes something like this:
  * - do enough of "ifconfig" by calling ifioctl() so that the system
  *   can talk to the server
  * - If nfs_diskless.mygateway is filled in, use that address as
  *   a default gateway.
  * - build the rootfs mount point and call mountnfs() to do the rest.
  *
  * It is assumed to be safe to read, modify, and write the nfsv3_diskless
  * structure, as well as other global NFS client variables here, as
  * nfs_mountroot() will be called once in the boot before any other NFS
  * client activity occurs.
  */
 static int
 nfs_mountroot(struct mount *mp)
 {
 	struct thread *td = curthread;
 	struct nfsv3_diskless *nd = &nfsv3_diskless;
 	struct socket *so;
 	struct vnode *vp;
 	struct ifreq ir;
 	int error;
 	u_long l;
 	char buf[128];
 	char *cp;
 
 #if defined(BOOTP_NFSROOT) && defined(BOOTP)
 	bootpc_init();		/* use bootp to get nfs_diskless filled in */
 #elif defined(NFS_ROOT)
 	nfs_setup_diskless();
 #endif
 
 	if (nfs_diskless_valid == 0)
 		return (-1);
 	if (nfs_diskless_valid == 1)
 		nfs_convert_diskless();
 
 	/*
 	 * Do enough of ifconfig(8) so that the critical net interface can
 	 * talk to the server.
 	 */
 	error = socreate(nd->myif.ifra_addr.sa_family, &so, nd->root_args.sotype, 0,
 	    td->td_ucred, td);
 	if (error)
 		panic("nfs_mountroot: socreate(%04x): %d",
 			nd->myif.ifra_addr.sa_family, error);
 
 #if 0 /* XXX Bad idea */
 	/*
 	 * We might not have been told the right interface, so we pass
 	 * over the first ten interfaces of the same kind, until we get
 	 * one of them configured.
 	 */
 
 	for (i = strlen(nd->myif.ifra_name) - 1;
 		nd->myif.ifra_name[i] >= '0' &&
 		nd->myif.ifra_name[i] <= '9';
 		nd->myif.ifra_name[i] ++) {
 		error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td);
 		if(!error)
 			break;
 	}
 #endif
 	error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td);
 	if (error)
 		panic("nfs_mountroot: SIOCAIFADDR: %d", error);
 	if ((cp = kern_getenv("boot.netif.mtu")) != NULL) {
 		ir.ifr_mtu = strtol(cp, NULL, 10);
 		bcopy(nd->myif.ifra_name, ir.ifr_name, IFNAMSIZ);
 		freeenv(cp);
 		error = ifioctl(so, SIOCSIFMTU, (caddr_t)&ir, td);
 		if (error)
 			printf("nfs_mountroot: SIOCSIFMTU: %d", error);
 	}
 	soclose(so);
 
 	/*
 	 * If the gateway field is filled in, set it as the default route.
 	 * Note that pxeboot will set a default route of 0 if the route
 	 * is not set by the DHCP server.  Check also for a value of 0
 	 * to avoid panicking inappropriately in that situation.
 	 */
 	if (nd->mygateway.sin_len != 0 &&
 	    nd->mygateway.sin_addr.s_addr != 0) {
 		struct sockaddr_in mask, sin;
 		struct epoch_tracker et;
 		struct rt_addrinfo info;
 		struct rib_cmd_info rc;
 
 		bzero((caddr_t)&mask, sizeof(mask));
 		sin = mask;
 		sin.sin_family = AF_INET;
 		sin.sin_len = sizeof(sin);
                 /* XXX MRT use table 0 for this sort of thing */
 		NET_EPOCH_ENTER(et);
 		CURVNET_SET(TD_TO_VNET(td));
 
 		bzero((caddr_t)&info, sizeof(info));
 		info.rti_flags = RTF_UP | RTF_GATEWAY;
 		info.rti_info[RTAX_DST] = (struct sockaddr *)&sin;
 		info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&nd->mygateway;
 		info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&mask;
 
 		error = rib_action(RT_DEFAULT_FIB, RTM_ADD, &info, &rc);
 		CURVNET_RESTORE();
 		NET_EPOCH_EXIT(et);
 		if (error)
 			panic("nfs_mountroot: RTM_ADD: %d", error);
 	}
 
 	/*
 	 * Create the rootfs mount point.
 	 */
 	nd->root_args.fh = nd->root_fh;
 	nd->root_args.fhsize = nd->root_fhsize;
 	l = ntohl(nd->root_saddr.sin_addr.s_addr);
 	snprintf(buf, sizeof(buf), "%ld.%ld.%ld.%ld:%s",
 		(l >> 24) & 0xff, (l >> 16) & 0xff,
 		(l >>  8) & 0xff, (l >>  0) & 0xff, nd->root_hostnam);
 	printf("NFS ROOT: %s\n", buf);
 	nd->root_args.hostname = buf;
 	if ((error = nfs_mountdiskless(buf,
 	    &nd->root_saddr, &nd->root_args, td, &vp, mp)) != 0) {
 		return (error);
 	}
 
 	/*
 	 * This is not really an nfs issue, but it is much easier to
 	 * set hostname here and then let the "/etc/rc.xxx" files
 	 * mount the right /var based upon its preset value.
 	 */
 	mtx_lock(&prison0.pr_mtx);
 	strlcpy(prison0.pr_hostname, nd->my_hostnam,
 	    sizeof(prison0.pr_hostname));
 	mtx_unlock(&prison0.pr_mtx);
 	inittodr(ntohl(nd->root_time));
 	return (0);
 }
 
 /*
  * Internal version of mount system call for diskless setup.
  *
  * "path" has the form "<server>:<directory>".  The part after the first
  * ':' (if any) is passed to mountnfs() as the directory path, together
  * with a duplicate of the server address.  Returns mountnfs()'s result,
  * logging a message when it is non-zero.
  */
 static int
 nfs_mountdiskless(char *path,
     struct sockaddr_in *sin, struct nfs_args *args, struct thread *td,
     struct vnode **vpp, struct mount *mp)
 {
 	struct sockaddr *srvaddr;
 	char *colon, *dirpath;
 	int dirlen, rc;
 
 	/*
 	 * Split off the directory, which follows the server's name/ip
 	 * address and a ':' inside "path".
 	 */
 	colon = strchr(path, ':');
 	if (colon == NULL) {
 		dirpath = NULL;
 		dirlen = 0;
 	} else {
 		dirpath = colon + 1;
 		dirlen = strlen(dirpath);
 	}
 
 	srvaddr = sodupsockaddr((struct sockaddr *)sin, M_WAITOK);
 	rc = mountnfs(args, mp, srvaddr, path, NULL, 0, dirpath, dirlen,
 	    NULL, 0, vpp, td->td_ucred, td, NFS_DEFAULT_NAMETIMEO,
 	    NFS_DEFAULT_NEGNAMETIMEO, 0);
 	if (rc == 0)
 		return (0);
 
 	printf("nfs_mountroot: mount %s on /: %d\n", path, rc);
 	return (rc);
 }
 
 /*
  * Translate a security flavor name into mount flags.  "krb5", "krb5i"
  * and "krb5p" OR the matching NFSMNT_* bits into *flagsp; any other
  * name leaves *flagsp untouched.
  */
 static void
 nfs_sec_name(char *sec, int *flagsp)
 {
 	static const struct {
 		const char	*name;
 		int		bits;
 	} flavors[] = {
 		{ "krb5",	NFSMNT_KERB },
 		{ "krb5i",	(NFSMNT_KERB | NFSMNT_INTEGRITY) },
 		{ "krb5p",	(NFSMNT_KERB | NFSMNT_PRIVACY) },
 	};
 	size_t idx;
 
 	for (idx = 0; idx < sizeof(flavors) / sizeof(flavors[0]); idx++) {
 		if (strcmp(sec, flavors[idx].name) == 0) {
 			*flagsp |= flavors[idx].bits;
 			return;
 		}
 	}
 }
 
 /*
  * Apply the mount arguments in *argp to the NFS mount *nmp.
  *
  * In order, this:
  * - sets or clears MNT_RDONLY on mp according to the "ro" option,
  * - merges argp->flags into nmp->nm_flag,
  * - derives the timeout, retry, transfer size, attribute cache,
  *   read-ahead and write-commit values,
  * - if a client handle already exists and a transport-affecting setting
  *   changed, disconnects the socket (and, for SOCK_DGRAM, reconnects),
  * - when hostname is non-NULL, stores its part before the first ':' in
  *   nmp->nm_hostname.
  *
  * Note that argp->flags may itself be modified: RDIRPLUS and ONEOPENOWN
  * are cleared where they do not apply.
  */
 static void
 nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp,
     const char *hostname, struct ucred *cred, struct thread *td)
 {
 	int adjsock;
 	char *p;
 
 	/*
 	 * Set read-only flag if requested; otherwise, clear it if this is
 	 * an update.  If this is not an update, then either the read-only
 	 * flag is already clear, or this is a root mount and it was set
 	 * intentionally at some previous point.
 	 */
 	if (vfs_getopt(mp->mnt_optnew, "ro", NULL, NULL) == 0) {
 		MNT_ILOCK(mp);
 		mp->mnt_flag |= MNT_RDONLY;
 		MNT_IUNLOCK(mp);
 	} else if (mp->mnt_flag & MNT_UPDATE) {
 		MNT_ILOCK(mp);
 		mp->mnt_flag &= ~MNT_RDONLY;
 		MNT_IUNLOCK(mp);
 	}
 
 	/*
 	 * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes
 	 * no sense in that context.  Also, set up appropriate retransmit
 	 * and soft timeout behavior.
 	 */
 	if (argp->sotype == SOCK_STREAM) {
 		nmp->nm_flag &= ~NFSMNT_NOCONN;
 		nmp->nm_timeo = NFS_MAXTIMEO;
 		if ((argp->flags & NFSMNT_NFSV4) != 0)
 			nmp->nm_retry = INT_MAX;
 		else
 			nmp->nm_retry = NFS_RETRANS_TCP;
 	}
 
 	/* Also clear RDIRPLUS if NFSv2, it crashes some servers */
 	if ((argp->flags & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0) {
 		argp->flags &= ~NFSMNT_RDIRPLUS;
 		nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
 	}
 
 	/* Clear ONEOPENOWN for NFSv2, 3 and 4.0. */
 	if (nmp->nm_minorvers == 0) {
 		argp->flags &= ~NFSMNT_ONEOPENOWN;
 		nmp->nm_flag &= ~NFSMNT_ONEOPENOWN;
 	}
 
 	/*
 	 * adjsock accumulates the reasons the socket has to be torn down;
 	 * it must be computed before the flags are merged below.
 	 */
 	/* Re-bind if rsrvd port requested and wasn't on one */
 	adjsock = !(nmp->nm_flag & NFSMNT_RESVPORT)
 		  && (argp->flags & NFSMNT_RESVPORT);
 	/* Also re-bind if we're switching to/from a connected UDP socket */
 	adjsock |= ((nmp->nm_flag & NFSMNT_NOCONN) !=
 		    (argp->flags & NFSMNT_NOCONN));
 
 	/* Update flags atomically.  Don't change the lock bits. */
 	nmp->nm_flag = argp->flags | nmp->nm_flag;
 
 	/* Scale the requested timeout by NFS_HZ / 10, rounding, then clamp. */
 	if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) {
 		nmp->nm_timeo = (argp->timeo * NFS_HZ + 5) / 10;
 		if (nmp->nm_timeo < NFS_MINTIMEO)
 			nmp->nm_timeo = NFS_MINTIMEO;
 		else if (nmp->nm_timeo > NFS_MAXTIMEO)
 			nmp->nm_timeo = NFS_MAXTIMEO;
 	}
 
 	/* Retransmit counts of 1 or less are ignored; cap at NFS_MAXREXMIT. */
 	if ((argp->flags & NFSMNT_RETRANS) && argp->retrans > 1) {
 		nmp->nm_retry = argp->retrans;
 		if (nmp->nm_retry > NFS_MAXREXMIT)
 			nmp->nm_retry = NFS_MAXREXMIT;
 	}
 
 	if ((argp->flags & NFSMNT_WSIZE) && argp->wsize > 0) {
 		nmp->nm_wsize = argp->wsize;
 		/*
 		 * Clip at the power of 2 below the size. There is an
 		 * issue (not isolated) that causes intermittent page
 		 * faults if this is not done.
 		 */
 		if (nmp->nm_wsize > NFS_FABLKSIZE)
 			nmp->nm_wsize = 1 << (fls(nmp->nm_wsize) - 1);
 		else
 			nmp->nm_wsize = NFS_FABLKSIZE;
 	}
 
 	if ((argp->flags & NFSMNT_RSIZE) && argp->rsize > 0) {
 		nmp->nm_rsize = argp->rsize;
 		/*
 		 * Clip at the power of 2 below the size. There is an
 		 * issue (not isolated) that causes intermittent page
 		 * faults if this is not done.
 		 */
 		if (nmp->nm_rsize > NFS_FABLKSIZE)
 			nmp->nm_rsize = 1 << (fls(nmp->nm_rsize) - 1);
 		else
 			nmp->nm_rsize = NFS_FABLKSIZE;
 	}
 
 	if ((argp->flags & NFSMNT_READDIRSIZE) && argp->readdirsize > 0) {
 		nmp->nm_readdirsize = argp->readdirsize;
 	}
 
 	/*
 	 * Attribute cache timeouts: take the caller's value when its flag
 	 * is set and the value is non-negative, otherwise the default.
 	 * Afterwards each minimum is lowered to its maximum if it exceeds it.
 	 */
 	if ((argp->flags & NFSMNT_ACREGMIN) && argp->acregmin >= 0)
 		nmp->nm_acregmin = argp->acregmin;
 	else
 		nmp->nm_acregmin = NFS_MINATTRTIMO;
 	if ((argp->flags & NFSMNT_ACREGMAX) && argp->acregmax >= 0)
 		nmp->nm_acregmax = argp->acregmax;
 	else
 		nmp->nm_acregmax = NFS_MAXATTRTIMO;
 	if ((argp->flags & NFSMNT_ACDIRMIN) && argp->acdirmin >= 0)
 		nmp->nm_acdirmin = argp->acdirmin;
 	else
 		nmp->nm_acdirmin = NFS_MINDIRATTRTIMO;
 	if ((argp->flags & NFSMNT_ACDIRMAX) && argp->acdirmax >= 0)
 		nmp->nm_acdirmax = argp->acdirmax;
 	else
 		nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO;
 	if (nmp->nm_acdirmin > nmp->nm_acdirmax)
 		nmp->nm_acdirmin = nmp->nm_acdirmax;
 	if (nmp->nm_acregmin > nmp->nm_acregmax)
 		nmp->nm_acregmin = nmp->nm_acregmax;
 
 	/* Read-ahead is capped at NFS_MAXRAHEAD. */
 	if ((argp->flags & NFSMNT_READAHEAD) && argp->readahead >= 0) {
 		if (argp->readahead <= NFS_MAXRAHEAD)
 			nmp->nm_readahead = argp->readahead;
 		else
 			nmp->nm_readahead = NFS_MAXRAHEAD;
 	}
 	/* The write commit size is never set below the write size. */
 	if ((argp->flags & NFSMNT_WCOMMITSIZE) && argp->wcommitsize >= 0) {
 		if (argp->wcommitsize < nmp->nm_wsize)
 			nmp->nm_wcommitsize = nmp->nm_wsize;
 		else
 			nmp->nm_wcommitsize = argp->wcommitsize;
 	}
 
 	/* A change of socket type or protocol also forces a re-bind. */
 	adjsock |= ((nmp->nm_sotype != argp->sotype) ||
 		    (nmp->nm_soproto != argp->proto));
 
 	if (nmp->nm_client != NULL && adjsock) {
 		int haslock = 0, error = 0;
 
 		/* Stream sockets take the send lock around the disconnect. */
 		if (nmp->nm_sotype == SOCK_STREAM) {
 			error = newnfs_sndlock(&nmp->nm_sockreq.nr_lock);
 			if (!error)
 				haslock = 1;
 		}
 		if (!error) {
 		    newnfs_disconnect(&nmp->nm_sockreq);
 		    if (haslock)
 			newnfs_sndunlock(&nmp->nm_sockreq.nr_lock);
 		    nmp->nm_sotype = argp->sotype;
 		    nmp->nm_soproto = argp->proto;
 		    /* Datagram sockets are reconnected here, retrying until it works. */
 		    if (nmp->nm_sotype == SOCK_DGRAM)
 			while (newnfs_connect(nmp, &nmp->nm_sockreq,
 			    cred, td, 0, false)) {
 				printf("newnfs_args: retrying connect\n");
 				(void) nfs_catnap(PSOCK, 0, "nfscon");
 			}
 		}
 	} else {
 		nmp->nm_sotype = argp->sotype;
 		nmp->nm_soproto = argp->proto;
 	}
 
 	/* Keep only the host part of "host:path" as the mount's host name. */
 	if (hostname != NULL) {
 		strlcpy(nmp->nm_hostname, hostname,
 		    sizeof(nmp->nm_hostname));
 		p = strchr(nmp->nm_hostname, ':');
 		if (p != NULL)
 			*p = '\0';
 	}
 }
 
 /*
  * NULL-terminated list of mount option names; nfs_mount() passes it to
  * vfs_filteropt() and fails the mount with EINVAL if that check fails.
  */
 static const char *nfs_opts[] = { "from", "nfs_args",
     "noac", "noatime", "noexec", "suiddir", "nosuid", "nosymfollow", "union",
     "noclusterr", "noclusterw", "multilabel", "acls", "force", "update",
     "async", "noconn", "nolockd", "conn", "lockd", "intr", "rdirplus",
     "readdirsize", "soft", "hard", "mntudp", "tcp", "udp", "wsize", "rsize",
     "retrans", "actimeo", "acregmin", "acregmax", "acdirmin", "acdirmax",
     "resvport", "readahead", "hostname", "timeo", "timeout", "addr", "fh",
     "nfsv3", "sec", "principal", "nfsv4", "gssname", "allgssname", "dirpath",
     "minorversion", "nametimeo", "negnametimeo", "nocto", "noncontigwr",
     "pnfs", "wcommitsize", "oneopenown",
     NULL };
 
 /*
  * Parse the "from" mountarg, passed by the generic mount(8) program
  * or the mountroot code.  This is used when rerooting into NFS.
  *
  * Note that the "hostname" is actually a "hostname:/share/path" string.
  *
  * Accepted forms are "[host]:path", "host:path" and the deprecated
  * "path@host".  The option string is split in place (a NUL replaces the
  * delimiter and trailing slashes are trimmed from the path).
  *
  * On success returns 0 and:
  * - *hostnamep is a newly allocated (M_NEWNFSMNT) "host:path" string,
  *   or "" when the host part is empty,
  * - *sinp is a newly allocated (M_SONAME) IPv4 address for the host,
  *   port 2049,
  * - dirpath (at most dirpathsize bytes) holds the path and *dirlenp its
  *   length.
  * The caller owns both allocations.
  *
  * Returns vfs_getopt()'s error when there is no "from" option, or EINVAL
  * when the string has no delimiter, is too long for MNAMELEN, or the
  * host is not a numeric IPv4 address.  Nothing is returned through the
  * output pointers on error.
  */
 static int
 nfs_mount_parse_from(struct vfsoptlist *opts, char **hostnamep,
     struct sockaddr_in **sinp, char *dirpath, size_t dirpathsize, int *dirlenp)
 {
 	char *nam, *delimp, *hostp, *spec;
 	int error, have_bracket = 0, rv, speclen;
 	struct sockaddr_in *sin;
 
 	error = vfs_getopt(opts, "from", (void **)&spec, &speclen);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * This part comes from sbin/mount_nfs/mount_nfs.c:getnfsargs().
 	 */
 	if (*spec == '[' && (delimp = strchr(spec + 1, ']')) != NULL &&
 	    *(delimp + 1) == ':') {
 		hostp = spec + 1;
 		spec = delimp + 2;
 		have_bracket = 1;
 	} else if ((delimp = strrchr(spec, ':')) != NULL) {
 		hostp = spec;
 		spec = delimp + 1;
 	} else if ((delimp = strrchr(spec, '@')) != NULL) {
 		printf("%s: path@server syntax is deprecated, "
 		    "use server:path\n", __func__);
 		hostp = delimp + 1;
 	} else {
 		printf("%s: no <host>:<dirpath> nfs-name\n", __func__);
 		return (EINVAL);
 	}
 	*delimp = '\0';
 
 	/*
 	 * If there has been a trailing slash at mounttime it seems
 	 * that some mountd implementations fail to remove the mount
 	 * entries from their mountlist while unmounting.
 	 */
 	for (speclen = strlen(spec);
 	    speclen > 1 && spec[speclen - 1] == '/';
 	    speclen--)
 		spec[speclen - 1] = '\0';
 
 	/*
 	 * The rebuilt name is host + ':' + path, plus the two brackets
 	 * when the host was given as "[host]".  The brackets have to be
 	 * counted here, otherwise the name could exceed the buffer below.
 	 */
 	if (strlen(hostp) + strlen(spec) + 1 + (have_bracket ? 2 : 0) >
 	    MNAMELEN) {
 		printf("%s: %s:%s: name too long\n", __func__, hostp, spec);
 		return (EINVAL);
 	}
 
 	/* Make both '@' and ':' notations equal */
 	nam = malloc(MNAMELEN + 1, M_TEMP, M_WAITOK);
 	if (*hostp == '\0')
 		nam[0] = '\0';
 	else if (have_bracket)
 		snprintf(nam, MNAMELEN + 1, "[%s]:%s", hostp, spec);
 	else
 		snprintf(nam, MNAMELEN + 1, "%s:%s", hostp, spec);
 
 	/*
 	 * XXX: IPv6
 	 */
 	sin = malloc(sizeof(*sin), M_SONAME, M_WAITOK);
 	rv = inet_pton(AF_INET, hostp, &sin->sin_addr);
 	if (rv != 1) {
 		printf("%s: cannot parse '%s', inet_pton() returned %d\n",
 		    __func__, hostp, rv);
 		free(nam, M_TEMP);
 		free(sin, M_SONAME);
 		return (EINVAL);
 	}
 
 	sin->sin_len = sizeof(*sin);
 	sin->sin_family = AF_INET;
 	/*
 	 * XXX: hardcoded port number.
 	 */
 	sin->sin_port = htons(2049);
 
 	*hostnamep = strdup(nam, M_NEWNFSMNT);
 	*sinp = sin;
 	strlcpy(dirpath, spec, dirpathsize);
 	*dirlenp = strlen(dirpath);
 
 	free(nam, M_TEMP);
 	return (0);
 }
 
 /*
  * VFS Operations.
  *
  * mount system call
  * It seems a bit dumb to copyinstr() the host and path here and then
  * bcopy() them in mountnfs(), but I wanted to detect errors before
  * doing the getsockaddr() call because getsockaddr() allocates an mbuf and
  * an error after that means that I have to release the mbuf.
  */
 /* ARGSUSED */
 static int
 nfs_mount(struct mount *mp)
 {
 	struct nfs_args args = {
 	    .version = NFS_ARGSVERSION,
 	    .addr = NULL,
 	    .addrlen = sizeof (struct sockaddr_in),
 	    .sotype = SOCK_STREAM,
 	    .proto = 0,
 	    .fh = NULL,
 	    .fhsize = 0,
 	    .flags = NFSMNT_RESVPORT,
 	    .wsize = NFS_WSIZE,
 	    .rsize = NFS_RSIZE,
 	    .readdirsize = NFS_READDIRSIZE,
 	    .timeo = 10,
 	    .retrans = NFS_RETRANS,
 	    .readahead = NFS_DEFRAHEAD,
 	    .wcommitsize = 0,			/* was: NQ_DEFLEASE */
 	    .hostname = NULL,
 	    .acregmin = NFS_MINATTRTIMO,
 	    .acregmax = NFS_MAXATTRTIMO,
 	    .acdirmin = NFS_MINDIRATTRTIMO,
 	    .acdirmax = NFS_MAXDIRATTRTIMO,
 	};
 	int error = 0, ret, len;
 	struct sockaddr *nam = NULL;
 	struct vnode *vp;
 	struct thread *td;
 	char *hst;
 	u_char nfh[NFSX_FHMAX], krbname[100], dirpath[100], srvkrbname[100];
 	char *cp, *opt, *name, *secname;
 	int nametimeo = NFS_DEFAULT_NAMETIMEO;
 	int negnametimeo = NFS_DEFAULT_NEGNAMETIMEO;
 	int minvers = 0;
 	int dirlen, has_nfs_args_opt, has_nfs_from_opt,
 	    krbnamelen, srvkrbnamelen;
 	size_t hstlen;
 
 	has_nfs_args_opt = 0;
 	has_nfs_from_opt = 0;
 	hst = malloc(MNAMELEN, M_TEMP, M_WAITOK);
 	if (vfs_filteropt(mp->mnt_optnew, nfs_opts)) {
 		error = EINVAL;
 		goto out;
 	}
 
 	td = curthread;
 	if ((mp->mnt_flag & (MNT_ROOTFS | MNT_UPDATE)) == MNT_ROOTFS &&
 	    nfs_diskless_valid != 0) {
 		error = nfs_mountroot(mp);
 		goto out;
 	}
 
 	nfscl_init();
 
 	/*
 	 * The old mount_nfs program passed the struct nfs_args
 	 * from userspace to kernel.  The new mount_nfs program
 	 * passes string options via nmount() from userspace to kernel
 	 * and we populate the struct nfs_args in the kernel.
 	 */
 	if (vfs_getopt(mp->mnt_optnew, "nfs_args", NULL, NULL) == 0) {
 		error = vfs_copyopt(mp->mnt_optnew, "nfs_args", &args,
 		    sizeof(args));
 		if (error != 0)
 			goto out;
 
 		if (args.version != NFS_ARGSVERSION) {
 			error = EPROGMISMATCH;
 			goto out;
 		}
 		has_nfs_args_opt = 1;
 	}
 
 	/* Handle the new style options. */
 	if (vfs_getopt(mp->mnt_optnew, "noac", NULL, NULL) == 0) {
 		args.acdirmin = args.acdirmax =
 		    args.acregmin = args.acregmax = 0;
 		args.flags |= NFSMNT_ACDIRMIN | NFSMNT_ACDIRMAX |
 		    NFSMNT_ACREGMIN | NFSMNT_ACREGMAX;
 	}
 	if (vfs_getopt(mp->mnt_optnew, "noconn", NULL, NULL) == 0)
 		args.flags |= NFSMNT_NOCONN;
 	if (vfs_getopt(mp->mnt_optnew, "conn", NULL, NULL) == 0)
 		args.flags &= ~NFSMNT_NOCONN;
 	if (vfs_getopt(mp->mnt_optnew, "nolockd", NULL, NULL) == 0)
 		args.flags |= NFSMNT_NOLOCKD;
 	if (vfs_getopt(mp->mnt_optnew, "lockd", NULL, NULL) == 0)
 		args.flags &= ~NFSMNT_NOLOCKD;
 	if (vfs_getopt(mp->mnt_optnew, "intr", NULL, NULL) == 0)
 		args.flags |= NFSMNT_INT;
 	if (vfs_getopt(mp->mnt_optnew, "rdirplus", NULL, NULL) == 0)
 		args.flags |= NFSMNT_RDIRPLUS;
 	if (vfs_getopt(mp->mnt_optnew, "resvport", NULL, NULL) == 0)
 		args.flags |= NFSMNT_RESVPORT;
 	if (vfs_getopt(mp->mnt_optnew, "noresvport", NULL, NULL) == 0)
 		args.flags &= ~NFSMNT_RESVPORT;
 	if (vfs_getopt(mp->mnt_optnew, "soft", NULL, NULL) == 0)
 		args.flags |= NFSMNT_SOFT;
 	if (vfs_getopt(mp->mnt_optnew, "hard", NULL, NULL) == 0)
 		args.flags &= ~NFSMNT_SOFT;
 	if (vfs_getopt(mp->mnt_optnew, "mntudp", NULL, NULL) == 0)
 		args.sotype = SOCK_DGRAM;
 	if (vfs_getopt(mp->mnt_optnew, "udp", NULL, NULL) == 0)
 		args.sotype = SOCK_DGRAM;
 	if (vfs_getopt(mp->mnt_optnew, "tcp", NULL, NULL) == 0)
 		args.sotype = SOCK_STREAM;
 	if (vfs_getopt(mp->mnt_optnew, "nfsv3", NULL, NULL) == 0)
 		args.flags |= NFSMNT_NFSV3;
 	if (vfs_getopt(mp->mnt_optnew, "nfsv4", NULL, NULL) == 0) {
 		args.flags |= NFSMNT_NFSV4;
 		args.sotype = SOCK_STREAM;
 	}
 	if (vfs_getopt(mp->mnt_optnew, "allgssname", NULL, NULL) == 0)
 		args.flags |= NFSMNT_ALLGSSNAME;
 	if (vfs_getopt(mp->mnt_optnew, "nocto", NULL, NULL) == 0)
 		args.flags |= NFSMNT_NOCTO;
 	if (vfs_getopt(mp->mnt_optnew, "noncontigwr", NULL, NULL) == 0)
 		args.flags |= NFSMNT_NONCONTIGWR;
 	if (vfs_getopt(mp->mnt_optnew, "pnfs", NULL, NULL) == 0)
 		args.flags |= NFSMNT_PNFS;
 	if (vfs_getopt(mp->mnt_optnew, "oneopenown", NULL, NULL) == 0)
 		args.flags |= NFSMNT_ONEOPENOWN;
 	if (vfs_getopt(mp->mnt_optnew, "readdirsize", (void **)&opt, NULL) == 0) {
 		if (opt == NULL) { 
 			vfs_mount_error(mp, "illegal readdirsize");
 			error = EINVAL;
 			goto out;
 		}
 		ret = sscanf(opt, "%d", &args.readdirsize);
 		if (ret != 1 || args.readdirsize <= 0) {
 			vfs_mount_error(mp, "illegal readdirsize: %s",
 			    opt);
 			error = EINVAL;
 			goto out;
 		}
 		args.flags |= NFSMNT_READDIRSIZE;
 	}
 	if (vfs_getopt(mp->mnt_optnew, "readahead", (void **)&opt, NULL) == 0) {
 		if (opt == NULL) { 
 			vfs_mount_error(mp, "illegal readahead");
 			error = EINVAL;
 			goto out;
 		}
 		ret = sscanf(opt, "%d", &args.readahead);
 		if (ret != 1 || args.readahead <= 0) {
 			vfs_mount_error(mp, "illegal readahead: %s",
 			    opt);
 			error = EINVAL;
 			goto out;
 		}
 		args.flags |= NFSMNT_READAHEAD;
 	}
 	if (vfs_getopt(mp->mnt_optnew, "wsize", (void **)&opt, NULL) == 0) {
 		if (opt == NULL) { 
 			vfs_mount_error(mp, "illegal wsize");
 			error = EINVAL;
 			goto out;
 		}
 		ret = sscanf(opt, "%d", &args.wsize);
 		if (ret != 1 || args.wsize <= 0) {
 			vfs_mount_error(mp, "illegal wsize: %s",
 			    opt);
 			error = EINVAL;
 			goto out;
 		}
 		args.flags |= NFSMNT_WSIZE;
 	}
 	if (vfs_getopt(mp->mnt_optnew, "rsize", (void **)&opt, NULL) == 0) {
 		if (opt == NULL) { 
 			vfs_mount_error(mp, "illegal rsize");
 			error = EINVAL;
 			goto out;
 		}
 		ret = sscanf(opt, "%d", &args.rsize);
 		if (ret != 1 || args.rsize <= 0) {
 			vfs_mount_error(mp, "illegal wsize: %s",
 			    opt);
 			error = EINVAL;
 			goto out;
 		}
 		args.flags |= NFSMNT_RSIZE;
 	}
 	if (vfs_getopt(mp->mnt_optnew, "retrans", (void **)&opt, NULL) == 0) {
 		if (opt == NULL) { 
 			vfs_mount_error(mp, "illegal retrans");
 			error = EINVAL;
 			goto out;
 		}
 		ret = sscanf(opt, "%d", &args.retrans);
 		if (ret != 1 || args.retrans <= 0) {
 			vfs_mount_error(mp, "illegal retrans: %s",
 			    opt);
 			error = EINVAL;
 			goto out;
 		}
 		args.flags |= NFSMNT_RETRANS;
 	}
 	if (vfs_getopt(mp->mnt_optnew, "actimeo", (void **)&opt, NULL) == 0) {
 		ret = sscanf(opt, "%d", &args.acregmin);
 		if (ret != 1 || args.acregmin < 0) {
 			vfs_mount_error(mp, "illegal actimeo: %s",
 			    opt);
 			error = EINVAL;
 			goto out;
 		}
 		args.acdirmin = args.acdirmax = args.acregmax = args.acregmin;
 		args.flags |= NFSMNT_ACDIRMIN | NFSMNT_ACDIRMAX |
 		    NFSMNT_ACREGMIN | NFSMNT_ACREGMAX;
 	}
 	if (vfs_getopt(mp->mnt_optnew, "acregmin", (void **)&opt, NULL) == 0) {
 		ret = sscanf(opt, "%d", &args.acregmin);
 		if (ret != 1 || args.acregmin < 0) {
 			vfs_mount_error(mp, "illegal acregmin: %s",
 			    opt);
 			error = EINVAL;
 			goto out;
 		}
 		args.flags |= NFSMNT_ACREGMIN;
 	}
 	if (vfs_getopt(mp->mnt_optnew, "acregmax", (void **)&opt, NULL) == 0) {
 		ret = sscanf(opt, "%d", &args.acregmax);
 		if (ret != 1 || args.acregmax < 0) {
 			vfs_mount_error(mp, "illegal acregmax: %s",
 			    opt);
 			error = EINVAL;
 			goto out;
 		}
 		args.flags |= NFSMNT_ACREGMAX;
 	}
 	if (vfs_getopt(mp->mnt_optnew, "acdirmin", (void **)&opt, NULL) == 0) {
 		ret = sscanf(opt, "%d", &args.acdirmin);
 		if (ret != 1 || args.acdirmin < 0) {
 			vfs_mount_error(mp, "illegal acdirmin: %s",
 			    opt);
 			error = EINVAL;
 			goto out;
 		}
 		args.flags |= NFSMNT_ACDIRMIN;
 	}
 	if (vfs_getopt(mp->mnt_optnew, "acdirmax", (void **)&opt, NULL) == 0) {
 		ret = sscanf(opt, "%d", &args.acdirmax);
 		if (ret != 1 || args.acdirmax < 0) {
 			vfs_mount_error(mp, "illegal acdirmax: %s",
 			    opt);
 			error = EINVAL;
 			goto out;
 		}
 		args.flags |= NFSMNT_ACDIRMAX;
 	}
 	if (vfs_getopt(mp->mnt_optnew, "wcommitsize", (void **)&opt, NULL) == 0) {
 		ret = sscanf(opt, "%d", &args.wcommitsize);
 		if (ret != 1 || args.wcommitsize < 0) {
 			vfs_mount_error(mp, "illegal wcommitsize: %s", opt);
 			error = EINVAL;
 			goto out;
 		}
 		args.flags |= NFSMNT_WCOMMITSIZE;
 	}
 	if (vfs_getopt(mp->mnt_optnew, "timeo", (void **)&opt, NULL) == 0) {
 		ret = sscanf(opt, "%d", &args.timeo);
 		if (ret != 1 || args.timeo <= 0) {
 			vfs_mount_error(mp, "illegal timeo: %s",
 			    opt);
 			error = EINVAL;
 			goto out;
 		}
 		args.flags |= NFSMNT_TIMEO;
 	}
 	if (vfs_getopt(mp->mnt_optnew, "timeout", (void **)&opt, NULL) == 0) {
 		ret = sscanf(opt, "%d", &args.timeo);
 		if (ret != 1 || args.timeo <= 0) {
 			vfs_mount_error(mp, "illegal timeout: %s",
 			    opt);
 			error = EINVAL;
 			goto out;
 		}
 		args.flags |= NFSMNT_TIMEO;
 	}
 	if (vfs_getopt(mp->mnt_optnew, "nametimeo", (void **)&opt, NULL) == 0) {
 		ret = sscanf(opt, "%d", &nametimeo);
 		if (ret != 1 || nametimeo < 0) {
 			vfs_mount_error(mp, "illegal nametimeo: %s", opt);
 			error = EINVAL;
 			goto out;
 		}
 	}
 	if (vfs_getopt(mp->mnt_optnew, "negnametimeo", (void **)&opt, NULL)
 	    == 0) {
 		ret = sscanf(opt, "%d", &negnametimeo);
 		if (ret != 1 || negnametimeo < 0) {
 			vfs_mount_error(mp, "illegal negnametimeo: %s",
 			    opt);
 			error = EINVAL;
 			goto out;
 		}
 	}
 	if (vfs_getopt(mp->mnt_optnew, "minorversion", (void **)&opt, NULL) ==
 	    0) {
 		ret = sscanf(opt, "%d", &minvers);
 		if (ret != 1 || minvers < 0 || minvers > 2 ||
 		    (args.flags & NFSMNT_NFSV4) == 0) {
 			vfs_mount_error(mp, "illegal minorversion: %s", opt);
 			error = EINVAL;
 			goto out;
 		}
 	}
 	if (vfs_getopt(mp->mnt_optnew, "sec",
 		(void **) &secname, NULL) == 0)
 		nfs_sec_name(secname, &args.flags);
 
 	if (mp->mnt_flag & MNT_UPDATE) {
 		struct nfsmount *nmp = VFSTONFS(mp);
 
 		if (nmp == NULL) {
 			error = EIO;
 			goto out;
 		}
 
 		/*
 		 * If a change from TCP->UDP is done and there are thread(s)
 		 * that have I/O RPC(s) in progress with a transfer size
 		 * greater than NFS_MAXDGRAMDATA, those thread(s) will be
 		 * hung, retrying the RPC(s) forever. Usually these threads
 		 * will be seen doing an uninterruptible sleep on wait channel
 		 * "nfsreq".
 		 */
 		if (args.sotype == SOCK_DGRAM && nmp->nm_sotype == SOCK_STREAM)
 			tprintf(td->td_proc, LOG_WARNING,
 	"Warning: mount -u that changes TCP->UDP can result in hung threads\n");
 
 		/*
 		 * When doing an update, we can't change version,
 		 * security, switch lockd strategies, change cookie
 		 * translation or switch oneopenown.
 		 */
 		args.flags = (args.flags &
 		    ~(NFSMNT_NFSV3 |
 		      NFSMNT_NFSV4 |
 		      NFSMNT_KERB |
 		      NFSMNT_INTEGRITY |
 		      NFSMNT_PRIVACY |
 		      NFSMNT_ONEOPENOWN |
 		      NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/)) |
 		    (nmp->nm_flag &
 			(NFSMNT_NFSV3 |
 			 NFSMNT_NFSV4 |
 			 NFSMNT_KERB |
 			 NFSMNT_INTEGRITY |
 			 NFSMNT_PRIVACY |
 			 NFSMNT_ONEOPENOWN |
 			 NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/));
 		nfs_decode_args(mp, nmp, &args, NULL, td->td_ucred, td);
 		goto out;
 	}
 
 	/*
 	 * Make the nfs_ip_paranoia sysctl serve as the default connection
 	 * or no-connection mode for those protocols that support 
 	 * no-connection mode (the flag will be cleared later for protocols
 	 * that do not support no-connection mode).  This will allow a client
 	 * to receive replies from a different IP than the request was
 	 * sent to.  Note: default value for nfs_ip_paranoia is 1 (paranoid),
 	 * not 0.
 	 */
 	if (nfs_ip_paranoia == 0)
 		args.flags |= NFSMNT_NOCONN;
 
 	if (has_nfs_args_opt != 0) {
 		/*
 		 * In the 'nfs_args' case, the pointers in the args
 		 * structure are in userland - we copy them in here.
 		 */
 		if (args.fhsize < 0 || args.fhsize > NFSX_V3FHMAX) {
 			vfs_mount_error(mp, "Bad file handle");
 			error = EINVAL;
 			goto out;
 		}
 		error = copyin((caddr_t)args.fh, (caddr_t)nfh,
 		    args.fhsize);
 		if (error != 0)
 			goto out;
 		error = copyinstr(args.hostname, hst, MNAMELEN - 1, &hstlen);
 		if (error != 0)
 			goto out;
 		bzero(&hst[hstlen], MNAMELEN - hstlen);
 		args.hostname = hst;
 		/* getsockaddr() call must be after above copyin() calls */
 		error = getsockaddr(&nam, args.addr, args.addrlen);
 		if (error != 0)
 			goto out;
 	} else if (nfs_mount_parse_from(mp->mnt_optnew,
 	    &args.hostname, (struct sockaddr_in **)&nam, dirpath,
 	    sizeof(dirpath), &dirlen) == 0) {
 		has_nfs_from_opt = 1;
 		bcopy(args.hostname, hst, MNAMELEN);
 		hst[MNAMELEN - 1] = '\0';
 
 		/*
 		 * This only works with NFSv4 for now.
 		 */
 		args.fhsize = 0;
 		args.flags |= NFSMNT_NFSV4;
 		args.sotype = SOCK_STREAM;
 	} else {
 		if (vfs_getopt(mp->mnt_optnew, "fh", (void **)&args.fh,
 		    &args.fhsize) == 0) {
 			if (args.fhsize < 0 || args.fhsize > NFSX_FHMAX) {
 				vfs_mount_error(mp, "Bad file handle");
 				error = EINVAL;
 				goto out;
 			}
 			bcopy(args.fh, nfh, args.fhsize);
 		} else {
 			args.fhsize = 0;
 		}
 		(void) vfs_getopt(mp->mnt_optnew, "hostname",
 		    (void **)&args.hostname, &len);
 		if (args.hostname == NULL) {
 			vfs_mount_error(mp, "Invalid hostname");
 			error = EINVAL;
 			goto out;
 		}
 		if (len >= MNAMELEN) {
 			vfs_mount_error(mp, "Hostname too long");
 			error = EINVAL;
 			goto out;
 		}
 		bcopy(args.hostname, hst, len);
 		hst[len] = '\0';
 	}
 
 	if (vfs_getopt(mp->mnt_optnew, "principal", (void **)&name, NULL) == 0)
 		strlcpy(srvkrbname, name, sizeof (srvkrbname));
 	else {
 		snprintf(srvkrbname, sizeof (srvkrbname), "nfs@%s", hst);
 		cp = strchr(srvkrbname, ':');
 		if (cp != NULL)
 			*cp = '\0';
 	}
 	srvkrbnamelen = strlen(srvkrbname);
 
 	if (vfs_getopt(mp->mnt_optnew, "gssname", (void **)&name, NULL) == 0)
 		strlcpy(krbname, name, sizeof (krbname));
 	else
 		krbname[0] = '\0';
 	krbnamelen = strlen(krbname);
 
 	if (has_nfs_from_opt == 0) {
 		if (vfs_getopt(mp->mnt_optnew,
 		    "dirpath", (void **)&name, NULL) == 0)
 			strlcpy(dirpath, name, sizeof (dirpath));
 		else
 			dirpath[0] = '\0';
 		dirlen = strlen(dirpath);
 	}
 
 	if (has_nfs_args_opt == 0 && has_nfs_from_opt == 0) {
 		if (vfs_getopt(mp->mnt_optnew, "addr",
 		    (void **)&args.addr, &args.addrlen) == 0) {
 			if (args.addrlen > SOCK_MAXADDRLEN) {
 				error = ENAMETOOLONG;
 				goto out;
 			}
 			nam = malloc(args.addrlen, M_SONAME, M_WAITOK);
 			bcopy(args.addr, nam, args.addrlen);
 			nam->sa_len = args.addrlen;
 		} else {
 			vfs_mount_error(mp, "No server address");
 			error = EINVAL;
 			goto out;
 		}
 	}
 
 	args.fh = nfh;
 	error = mountnfs(&args, mp, nam, hst, krbname, krbnamelen, dirpath,
 	    dirlen, srvkrbname, srvkrbnamelen, &vp, td->td_ucred, td,
 	    nametimeo, negnametimeo, minvers);
 out:
 	if (!error) {
 		MNT_ILOCK(mp);
 		mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_NO_IOPF |
 		    MNTK_USES_BCACHE;
 		if ((VFSTONFS(mp)->nm_flag & NFSMNT_NFSV4) != 0)
 			mp->mnt_kern_flag |= MNTK_NULL_NOCACHE;
 		MNT_IUNLOCK(mp);
 	}
 	free(hst, M_TEMP);
 	return (error);
 }
 
 
 /*
  * VFS Operations.
  *
  * mount system call
  *
  * Copy a struct nfs_args in from the address given by "data", attach it
  * to the mount argument list under the name "nfs_args" and pass the list
  * on to kernel_mount() together with "flags".  Returns the copyin() error
  * if the copy fails, otherwise whatever kernel_mount() returns.
  *
  * Historical note retained from the previous comment: it seems a bit dumb
  * to copyinstr() the host and path here and then bcopy() them in
  * mountnfs(), but the intent was to detect errors before the
  * getsockaddr() call, because getsockaddr() allocates and an error after
  * that point means the allocation has to be released.
  */
 /* ARGSUSED */
 static int
 nfs_cmount(struct mntarg *ma, void *data, uint64_t flags)
 {
 	struct nfs_args uargs;
 	int err;
 
 	err = copyin(data, &uargs, sizeof(uargs));
 	if (err != 0)
 		return (err);
 
 	ma = mount_arg(ma, "nfs_args", &uargs, sizeof(uargs));
 	return (kernel_mount(ma, flags));
 }
 
 /*
  * Common code for mount and mountroot
  *
  * Allocate and initialize a new struct nfsmount for "mp", connect to the
  * server and, when a root file handle is available, instantiate the root
  * vnode.
  *
  * argp      - mount arguments; flags, fh/fhsize, sotype and proto are read
  *             here and the structure is also handed to nfs_decode_args().
  * mp        - mount point; mnt_data is set to the new nfsmount.
  * nam       - server address.  It is stored in nm_nam and is freed with
  *             M_SONAME on the failure path and on the MNT_UPDATE path.
  * hst       - MNAMELEN bytes are copied into mnt_stat.f_mntfromname.
  * krbname, dirpath, srvkrbname (with their lengths) - copied into the
  *             name storage that trails the nfsmount.
  * vpp       - on success, set to the root vnode (unlocked, reference kept).
  * cred, td  - credentials and thread used for the RPCs issued here.
  * nametimeo, negnametimeo, minvers - stored in the nfsmount; minvers is
  *             only used when NFSMNT_NFSV4 is set in argp->flags.
  *
  * Returns 0 on success (and for MNT_UPDATE, which is only logged here),
  * otherwise an error.  EIO is returned when no root file handle is
  * available after the optional NFSv4 path lookup.
  */
 static int
 mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam,
     char *hst, u_char *krbname, int krbnamelen, u_char *dirpath, int dirlen,
     u_char *srvkrbname, int srvkrbnamelen, struct vnode **vpp,
     struct ucred *cred, struct thread *td, int nametimeo, int negnametimeo,
     int minvers)
 {
 	struct nfsmount *nmp;
 	struct nfsnode *np;
 	int error, trycnt, ret;
 	struct nfsvattr nfsva;
 	struct nfsclclient *clp;
 	struct nfsclds *dsp, *tdsp;
 	uint32_t lease;
 	/* Seeded from nfsboottime on first use, then incremented per mount. */
 	static u_int64_t clval = 0;
 
 	NFSCL_DEBUG(3, "in mnt\n");
 	clp = NULL;
 	if (mp->mnt_flag & MNT_UPDATE) {
 		nmp = VFSTONFS(mp);
 		printf("%s: MNT_UPDATE is no longer handled here\n", __func__);
 		free(nam, M_SONAME);
 		return (0);
 	} else {
 		/*
 		 * The three name strings are stored after the structure;
 		 * the extra 2 bytes are part of the space reserved for them.
 		 */
 		nmp = malloc(sizeof (struct nfsmount) +
 		    krbnamelen + dirlen + srvkrbnamelen + 2,
 		    M_NEWNFSMNT, M_WAITOK | M_ZERO);
 		TAILQ_INIT(&nmp->nm_bufq);
 		TAILQ_INIT(&nmp->nm_sess);
 		if (clval == 0)
 			clval = (u_int64_t)nfsboottime.tv_sec;
 		nmp->nm_clval = clval++;
 		nmp->nm_krbnamelen = krbnamelen;
 		nmp->nm_dirpathlen = dirlen;
 		nmp->nm_srvkrbnamelen = srvkrbnamelen;
 		if (td->td_ucred->cr_uid != (uid_t)0) {
 			/*
 			 * nm_uid is used to get KerberosV credentials for
 			 * the nfsv4 state handling operations if there is
 			 * no host based principal set. Use the uid of
 			 * this user if not root, since they are doing the
 			 * mount. I don't think setting this for root will
 			 * work, since root normally does not have user
 			 * credentials in a credentials cache.
 			 */
 			nmp->nm_uid = td->td_ucred->cr_uid;
 		} else {
 			/*
 			 * Just set to -1, so it won't be used.
 			 */
 			nmp->nm_uid = (uid_t)-1;
 		}
 
 		/* Copy and null terminate all the names */
 		if (nmp->nm_krbnamelen > 0) {
 			bcopy(krbname, nmp->nm_krbname, nmp->nm_krbnamelen);
 			nmp->nm_name[nmp->nm_krbnamelen] = '\0';
 		}
 		if (nmp->nm_dirpathlen > 0) {
 			bcopy(dirpath, NFSMNT_DIRPATH(nmp),
 			    nmp->nm_dirpathlen);
 			nmp->nm_name[nmp->nm_krbnamelen + nmp->nm_dirpathlen
 			    + 1] = '\0';
 		}
 		if (nmp->nm_srvkrbnamelen > 0) {
 			bcopy(srvkrbname, NFSMNT_SRVKRBNAME(nmp),
 			    nmp->nm_srvkrbnamelen);
 			nmp->nm_name[nmp->nm_krbnamelen + nmp->nm_dirpathlen
 			    + nmp->nm_srvkrbnamelen + 2] = '\0';
 		}
 		/* The reference taken here is dropped by crfree() at "bad". */
 		nmp->nm_sockreq.nr_cred = crhold(cred);
 		mtx_init(&nmp->nm_sockreq.nr_mtx, "nfssock", NULL, MTX_DEF);
 		mp->mnt_data = nmp;
 		nmp->nm_getinfo = nfs_getnlminfo;
 		nmp->nm_vinvalbuf = ncl_vinvalbuf;
 	}
 	vfs_getnewfsid(mp);
 	nmp->nm_mountp = mp;
 	mtx_init(&nmp->nm_mtx, "NFSmount lock", NULL, MTX_DEF | MTX_DUPOK);
 
 	/*
 	 * Since nfs_decode_args() might optionally set them, these
 	 * need to be set to defaults before the call, so that the
 	 * optional settings aren't overwritten.
 	 */
 	nmp->nm_nametimeo = nametimeo;
 	nmp->nm_negnametimeo = negnametimeo;
 	nmp->nm_timeo = NFS_TIMEO;
 	nmp->nm_retry = NFS_RETRANS;
 	nmp->nm_readahead = NFS_DEFRAHEAD;
 
 	/* This is empirical approximation of sqrt(hibufspace) * 256. */
 	nmp->nm_wcommitsize = NFS_MAXBSIZE / 256;
 	while ((long)nmp->nm_wcommitsize * nmp->nm_wcommitsize < hibufspace)
 		nmp->nm_wcommitsize *= 2;
 	nmp->nm_wcommitsize *= 256;
 
 	/* The minor version only applies to NFSv4 mounts. */
 	if ((argp->flags & NFSMNT_NFSV4) != 0)
 		nmp->nm_minorvers = minvers;
 	else
 		nmp->nm_minorvers = 0;
 
 	nfs_decode_args(mp, nmp, argp, hst, cred, td);
 
 	/*
 	 * V2 can only handle 32 bit filesizes.  A 4GB-1 limit may be too
 	 * high, depending on whether we end up with negative offsets in
 	 * the client or server somewhere.  2GB-1 may be safer.
 	 *
 	 * For V3, ncl_fsinfo will adjust this as necessary.  Assume maximum
 	 * that we can handle until we find out otherwise.
 	 */
 	if ((argp->flags & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0)
 		nmp->nm_maxfilesize = 0xffffffffLL;
 	else
 		nmp->nm_maxfilesize = OFF_MAX;
 
 	/* Neither V3 nor V4 flag set (V2): use the fixed default sizes. */
 	if ((argp->flags & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0) {
 		nmp->nm_wsize = NFS_WSIZE;
 		nmp->nm_rsize = NFS_RSIZE;
 		nmp->nm_readdirsize = NFS_READDIRSIZE;
 	}
 	nmp->nm_numgrps = NFS_MAXGRPS;
 	/* Negative tprintf delays are clamped to 0. */
 	nmp->nm_tprintf_delay = nfs_tprintf_delay;
 	if (nmp->nm_tprintf_delay < 0)
 		nmp->nm_tprintf_delay = 0;
 	nmp->nm_tprintf_initial_delay = nfs_tprintf_initial_delay;
 	if (nmp->nm_tprintf_initial_delay < 0)
 		nmp->nm_tprintf_initial_delay = 0;
 	nmp->nm_fhsize = argp->fhsize;
 	if (nmp->nm_fhsize > 0)
 		bcopy((caddr_t)argp->fh, (caddr_t)nmp->nm_fh, argp->fhsize);
 	bcopy(hst, mp->mnt_stat.f_mntfromname, MNAMELEN);
 	nmp->nm_nam = nam;
 	/* Set up the sockets and per-host congestion */
 	nmp->nm_sotype = argp->sotype;
 	nmp->nm_soproto = argp->proto;
 	nmp->nm_sockreq.nr_prog = NFS_PROG;
 	if ((argp->flags & NFSMNT_NFSV4))
 		nmp->nm_sockreq.nr_vers = NFS_VER4;
 	else if ((argp->flags & NFSMNT_NFSV3))
 		nmp->nm_sockreq.nr_vers = NFS_VER3;
 	else
 		nmp->nm_sockreq.nr_vers = NFS_VER2;
 
 
 	if ((error = newnfs_connect(nmp, &nmp->nm_sockreq, cred, td, 0, false)))
 		goto bad;
 	/* For NFSv4.1, get the clientid now. */
 	if (nmp->nm_minorvers > 0) {
 		NFSCL_DEBUG(3, "at getcl\n");
 		error = nfscl_getcl(mp, cred, td, 0, &clp);
 		NFSCL_DEBUG(3, "aft getcl=%d\n", error);
 		if (error != 0)
 			goto bad;
 	}
 
 	if (nmp->nm_fhsize == 0 && (nmp->nm_flag & NFSMNT_NFSV4) &&
 	    nmp->nm_dirpathlen > 0) {
 		NFSCL_DEBUG(3, "in dirp\n");
 		/*
 		 * If the fhsize on the mount point == 0 for V4, the mount
 		 * path needs to be looked up.
 		 */
 		/* Up to three attempts, napping after each failure. */
 		trycnt = 3;
 		do {
 			error = nfsrpc_getdirpath(nmp, NFSMNT_DIRPATH(nmp),
 			    cred, td);
 			NFSCL_DEBUG(3, "aft dirp=%d\n", error);
 			if (error)
 				(void) nfs_catnap(PZERO, error, "nfsgetdirp");
 		} while (error && --trycnt > 0);
 		if (error)
 			goto bad;
 	}
 
 	/*
 	 * A reference count is needed on the nfsnode representing the
 	 * remote root.  If this object is not persistent, then backward
 	 * traversals of the mount point (i.e. "..") will not work if
 	 * the nfsnode gets flushed out of the cache. Ufs does not have
 	 * this problem, because one can identify root inodes by their
 	 * number == UFS_ROOTINO (2).
 	 */
 	if (nmp->nm_fhsize > 0) {
 		/*
 		 * Set f_iosize to NFS_DIRBLKSIZ so that bo_bsize gets set
 		 * non-zero for the root vnode. f_iosize will be set correctly
 		 * by nfs_statfs() before any I/O occurs.
 		 */
 		mp->mnt_stat.f_iosize = NFS_DIRBLKSIZ;
 		error = ncl_nget(mp, nmp->nm_fh, nmp->nm_fhsize, &np,
 		    LK_EXCLUSIVE);
 		if (error)
 			goto bad;
 		*vpp = NFSTOV(np);
 	
 		/*
 		 * Get file attributes and transfer parameters for the
 		 * mountpoint.  This has the side effect of filling in
 		 * (*vpp)->v_type with the correct value.
 		 */
 		/*
 		 * A failure here is not fatal: "ret" is kept separate from
 		 * "error" and placeholder attributes are used instead.
 		 */
 		ret = nfsrpc_getattrnovp(nmp, nmp->nm_fh, nmp->nm_fhsize, 1,
 		    cred, td, &nfsva, NULL, &lease);
 		if (ret) {
 			/*
 			 * Just set default values to get things going.
 			 */
 			NFSBZERO((caddr_t)&nfsva, sizeof (struct nfsvattr));
 			nfsva.na_vattr.va_type = VDIR;
 			nfsva.na_vattr.va_mode = 0777;
 			nfsva.na_vattr.va_nlink = 100;
 			nfsva.na_vattr.va_uid = (uid_t)0;
 			nfsva.na_vattr.va_gid = (gid_t)0;
 			nfsva.na_vattr.va_fileid = 2;
 			nfsva.na_vattr.va_gen = 1;
 			nfsva.na_vattr.va_blocksize = NFS_FABLKSIZE;
 			nfsva.na_vattr.va_size = 512 * 1024;
 			lease = 60;
 		}
 		(void) nfscl_loadattrcache(vpp, &nfsva, NULL, NULL, 0, 1);
 		if (nmp->nm_minorvers > 0) {
 			NFSCL_DEBUG(3, "lease=%d\n", (int)lease);
 			NFSLOCKCLSTATE();
 			clp->nfsc_renew = NFSCL_RENEW(lease);
 			clp->nfsc_expire = NFSD_MONOSEC + clp->nfsc_renew;
 			/* Bump the revision, skipping 0 if it wraps. */
 			clp->nfsc_clientidrev++;
 			if (clp->nfsc_clientidrev == 0)
 				clp->nfsc_clientidrev++;
 			NFSUNLOCKCLSTATE();
 			/*
 			 * Mount will succeed, so the renew thread can be
 			 * started now.
 			 */
 			nfscl_start_renewthread(clp);
 			nfscl_clientrelease(clp);
 		}
 		if (argp->flags & NFSMNT_NFSV3)
 			ncl_fsinfo(nmp, *vpp, cred, td);
 	
 		/* Mark if the mount point supports NFSv4 ACLs. */
 		if ((argp->flags & NFSMNT_NFSV4) != 0 && nfsrv_useacl != 0 &&
 		    ret == 0 &&
 		    NFSISSET_ATTRBIT(&nfsva.na_suppattr, NFSATTRBIT_ACL)) {
 			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_NFS4ACLS;
 			MNT_IUNLOCK(mp);
 		}
 	
 		/*
 		 * Lose the lock but keep the ref.
 		 */
 		NFSVOPUNLOCK(*vpp);
 		vfs_cache_root_set(mp, *vpp);
 		return (0);
 	}
 	/* No root file handle is available, so the mount cannot proceed. */
 	error = EIO;
 
 bad:
 	/*
 	 * Failure path: release everything set up above, including the
 	 * nfsmount itself and the caller-supplied address "nam".
 	 */
 	if (clp != NULL)
 		nfscl_clientrelease(clp);
 	newnfs_disconnect(&nmp->nm_sockreq);
 	crfree(nmp->nm_sockreq.nr_cred);
 	if (nmp->nm_sockreq.nr_auth != NULL)
 		AUTH_DESTROY(nmp->nm_sockreq.nr_auth);
 	mtx_destroy(&nmp->nm_sockreq.nr_mtx);
 	mtx_destroy(&nmp->nm_mtx);
 	if (nmp->nm_clp != NULL) {
 		NFSLOCKCLSTATE();
 		LIST_REMOVE(nmp->nm_clp, nfsc_list);
 		NFSUNLOCKCLSTATE();
 		free(nmp->nm_clp, M_NFSCLCLIENT);
 	}
 	/*
 	 * The first session entry is not disconnected here; nm_sockreq was
 	 * already disconnected above.
 	 */
 	TAILQ_FOREACH_SAFE(dsp, &nmp->nm_sess, nfsclds_list, tdsp) {
 		if (dsp != TAILQ_FIRST(&nmp->nm_sess) &&
 		    dsp->nfsclds_sockp != NULL)
 			newnfs_disconnect(dsp->nfsclds_sockp);
 		nfscl_freenfsclds(dsp);
 	}
 	free(nmp, M_NEWNFSMNT);
 	free(nam, M_SONAME);
 	return (error);
 }
 
 /*
  * unmount system call
  *
  * Flush the mount's vnodes, detach the nfsmount from "mp" and free it.
  * With MNT_FORCE set in "mntflags", outstanding requests are cancelled
  * first and the vflush() is retried (up to 30 attempts) instead of failing
  * on the first error.
  *
  * Returns 0 on success.  On failure nothing has been freed and the error
  * is ENXIO (forced unmount while nfsv4_findmirror() finds an entry for
  * this mount), or the error from newnfs_nmcancelreqs() or vflush().
  */
 static int
 nfs_unmount(struct mount *mp, int mntflags)
 {
 	struct thread *td;
 	struct nfsmount *nmp;
 	int error, flags = 0, i, trycnt = 0;
 	struct nfsclds *dsp, *tdsp;
 
 	td = curthread;
 
 	if (mntflags & MNT_FORCE)
 		flags |= FORCECLOSE;
 	nmp = VFSTONFS(mp);
 	error = 0;
 	/*
 	 * Goes something like this..
 	 * - Call vflush() to clear out vnodes for this filesystem
 	 * - Close the socket
 	 * - Free up the data structures
 	 */
 	/* In the forced case, cancel any outstanding requests. */
 	if (mntflags & MNT_FORCE) {
 		/* Refuse the forced unmount if a mirror entry is found. */
 		NFSDDSLOCK();
 		if (nfsv4_findmirror(nmp) != NULL)
 			error = ENXIO;
 		NFSDDSUNLOCK();
 		if (error)
 			goto out;
 		error = newnfs_nmcancelreqs(nmp);
 		if (error)
 			goto out;
 		/* For a forced close, get rid of the renew thread now */
 		nfscl_umount(nmp, td);
 	}
 	/* We hold 1 extra ref on the root vnode; see comment in mountnfs(). */
 	/*
 	 * A non-forced unmount makes a single vflush() attempt; a forced
 	 * one naps and retries until it succeeds or trycnt reaches 30.
 	 */
 	do {
 		error = vflush(mp, 1, flags, td);
 		if ((mntflags & MNT_FORCE) && error != 0 && ++trycnt < 30)
 			(void) nfs_catnap(PSOCK, error, "newndm");
 	} while ((mntflags & MNT_FORCE) && error != 0 && trycnt < 30);
 	if (error)
 		goto out;
 
 	/*
 	 * We are now committed to the unmount.
 	 */
 	/*
 	 * In the forced case nfscl_umount() was already called above, so
 	 * only the forced-dismount flag is set here.
 	 */
 	if ((mntflags & MNT_FORCE) == 0)
 		nfscl_umount(nmp, td);
 	else {
 		mtx_lock(&nmp->nm_mtx);
 		nmp->nm_privflag |= NFSMNTP_FORCEDISM;
 		mtx_unlock(&nmp->nm_mtx);
 	}
 	/* Make sure no nfsiods are assigned to this mount. */
 	NFSLOCKIOD();
 	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
 		if (ncl_iodmount[i] == nmp) {
 			ncl_iodwant[i] = NFSIOD_AVAILABLE;
 			ncl_iodmount[i] = NULL;
 		}
 	NFSUNLOCKIOD();
 
 	/*
 	 * We can now set mnt_data to NULL and wait for
 	 * nfssvc(NFSSVC_FORCEDISM) to complete.
 	 */
 	/*
 	 * mnt_data is cleared with both mountlist_mtx and nm_mtx held;
 	 * nm_mtx stays held for the sleep loop that follows.
 	 */
 	mtx_lock(&mountlist_mtx);
 	mtx_lock(&nmp->nm_mtx);
 	mp->mnt_data = NULL;
 	mtx_unlock(&mountlist_mtx);
 	while ((nmp->nm_privflag & NFSMNTP_CANCELRPCS) != 0)
 		msleep(nmp, &nmp->nm_mtx, PVFS, "nfsfdism", 0);
 	mtx_unlock(&nmp->nm_mtx);
 
 	/* Release the connection, credentials, address and locks. */
 	newnfs_disconnect(&nmp->nm_sockreq);
 	crfree(nmp->nm_sockreq.nr_cred);
 	free(nmp->nm_nam, M_SONAME);
 	if (nmp->nm_sockreq.nr_auth != NULL)
 		AUTH_DESTROY(nmp->nm_sockreq.nr_auth);
 	mtx_destroy(&nmp->nm_sockreq.nr_mtx);
 	mtx_destroy(&nmp->nm_mtx);
 	/*
 	 * The first session entry is not disconnected here; nm_sockreq was
 	 * already disconnected above.
 	 */
 	TAILQ_FOREACH_SAFE(dsp, &nmp->nm_sess, nfsclds_list, tdsp) {
 		if (dsp != TAILQ_FIRST(&nmp->nm_sess) &&
 		    dsp->nfsclds_sockp != NULL)
 			newnfs_disconnect(dsp->nfsclds_sockp);
 		nfscl_freenfsclds(dsp);
 	}
 	free(nmp, M_NEWNFSMNT);
 out:
 	return (error);
 }
 
 /*
  * Return root of a filesystem
  *
  * Look up the nfsnode for the mount's stored root file handle using the
  * caller's lock "flags", mark its vnode as the filesystem root and hand it
  * back through "vpp".  Returns 0, or the error from ncl_nget().
  */
 static int
 nfs_root(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct nfsmount *mnt = VFSTONFS(mp);
 	struct nfsnode *rootnp;
 	struct vnode *rootvp;
 	int need_fsinfo, rv;
 
 	rv = ncl_nget(mp, mnt->nm_fh, mnt->nm_fhsize, &rootnp, flags);
 	if (rv != 0)
 		return (rv);
 	rootvp = NFSTOV(rootnp);
 
 	/*
 	 * Get transfer parameters and attributes for root vnode once.
 	 * The test is made under nm_mtx; the call itself is made after the
 	 * mutex has been dropped.
 	 */
 	mtx_lock(&mnt->nm_mtx);
 	need_fsinfo = NFSHASNFSV3(mnt) && !NFSHASGOTFSINFO(mnt);
 	mtx_unlock(&mnt->nm_mtx);
 	if (need_fsinfo)
 		ncl_fsinfo(mnt, rootvp, curthread->td_ucred, curthread);
 
 	/* A vnode whose type is still unset is treated as a directory. */
 	if (rootvp->v_type == VNON)
 		rootvp->v_type = VDIR;
 	rootvp->v_vflag |= VV_ROOT;
 	*vpp = rootvp;
 	return (0);
 }
 
 /*
  * Flush out the buffer cache
  *
  * Walk every vnode of "mp" and VOP_FSYNC() those that have dirty buffers.
  * Returns EBADF immediately when a forced dismount is in progress,
  * otherwise 0 or the most recent VOP_FSYNC() error.  When "waitfor" is
  * MNT_LAZY every vnode is skipped.
  */
 /* ARGSUSED */
 static int
 nfs_sync(struct mount *mp, int waitfor)
 {
 	struct vnode *vp, *mvp;
 	struct thread *td;
 	int error, allerror = 0;
 
 	td = curthread;
 
 	MNT_ILOCK(mp);
 	/*
 	 * If a forced dismount is in progress, return from here so that
 	 * the umount(2) syscall doesn't get stuck in VFS_SYNC() before
 	 * calling VFS_UNMOUNT().
 	 */
 	if (NFSCL_FORCEDISM(mp)) {
 		MNT_IUNLOCK(mp);
 		return (EBADF);
 	}
 	MNT_IUNLOCK(mp);
 
 	/*
 	 * Force stale buffer cache information to be flushed.
 	 */
 loop:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		/* XXX Racy bv_cnt check. */
 		/*
 		 * Skip vnodes that are already locked or have nothing
 		 * dirty.  The vnode interlock is released explicitly here,
 		 * or handed to vget() via LK_INTERLOCK below.
 		 */
 		if (NFSVOPISLOCKED(vp) || vp->v_bufobj.bo_dirty.bv_cnt == 0 ||
 		    waitfor == MNT_LAZY) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		/* If the vnode cannot be acquired, restart the whole scan. */
-		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
+		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) {
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			goto loop;
 		}
 		error = VOP_FSYNC(vp, waitfor, td);
 		/* Remember the failure but keep flushing the other vnodes. */
 		if (error)
 			allerror = error;
 		NFSVOPUNLOCK(vp);
 		vrele(vp);
 	}
 	return (allerror);
 }
 
 /*
  * Handle a per-mount control request "op" for an NFS mount.
  *
  * VFS_CTL_QUERY copies out a struct vfsquery in which VQ_NOTRESP is set
  * when NFSSTA_TIMEO is set in the mount's nm_state.
  * VFS_CTL_TIMEO copies out the current nm_tprintf_initial_delay when an
  * old-value buffer is supplied and, when a new value is supplied, stores
  * it after vfs_suser() succeeds; a negative new value is clamped to 0.
  *
  * Returns 0 on success, ENOTSUP for any other "op", or the error reported
  * by SYSCTL_IN(), SYSCTL_OUT() or vfs_suser().
  */
 static int
 nfs_sysctl(struct mount *mp, fsctlop_t op, struct sysctl_req *req)
 {
 	struct nfsmount *nmp = VFSTONFS(mp);
 	struct vfsquery vq;
 	int error;
 
 	bzero(&vq, sizeof(vq));
 	switch (op) {
 #if 0
 	case VFS_CTL_NOLOCKS:
 		val = (nmp->nm_flag & NFSMNT_NOLOCKS) ? 1 : 0;
 		if (req->oldptr != NULL) {
 			error = SYSCTL_OUT(req, &val, sizeof(val));
 			if (error)
 				return (error);
 		}
 		if (req->newptr != NULL) {
 			error = SYSCTL_IN(req, &val, sizeof(val));
 			if (error)
 				return (error);
 			if (val)
 				nmp->nm_flag |= NFSMNT_NOLOCKS;
 			else
 				nmp->nm_flag &= ~NFSMNT_NOLOCKS;
 		}
 		break;
 #endif
 	case VFS_CTL_QUERY:
 		mtx_lock(&nmp->nm_mtx);
 		if (nmp->nm_state & NFSSTA_TIMEO)
 			vq.vq_flags |= VQ_NOTRESP;
 		mtx_unlock(&nmp->nm_mtx);
 #if 0
 		if (!(nmp->nm_flag & NFSMNT_NOLOCKS) &&
 		    (nmp->nm_state & NFSSTA_LOCKTIMEO))
 			vq.vq_flags |= VQ_NOTRESPLOCK;
 #endif
 		error = SYSCTL_OUT(req, &vq, sizeof(vq));
 		/*
 		 * Report a failed copy-out instead of falling through to
 		 * the unconditional "return (0)" below, as the other
 		 * cases already do.
 		 */
 		if (error)
 			return (error);
 		break;
 	case VFS_CTL_TIMEO:
 		if (req->oldptr != NULL) {
 			error = SYSCTL_OUT(req, &nmp->nm_tprintf_initial_delay,
 			    sizeof(nmp->nm_tprintf_initial_delay));
 			if (error)
 				return (error);
 		}
 		if (req->newptr != NULL) {
 			/* Changing the delay requires vfs_suser() approval. */
 			error = vfs_suser(mp, req->td);
 			if (error)
 				return (error);
 			error = SYSCTL_IN(req, &nmp->nm_tprintf_initial_delay,
 			    sizeof(nmp->nm_tprintf_initial_delay));
 			if (error)
 				return (error);
 			if (nmp->nm_tprintf_initial_delay < 0)
 				nmp->nm_tprintf_initial_delay = 0;
 		}
 		break;
 	default:
 		return (ENOTSUP);
 	}
 	return (0);
 }
 
 /*
  * Purge any RPCs in progress, so that they will all return errors.
  * This allows dounmount() to continue as far as VFS_UNMOUNT() for a
  * forced dismount.
  *
  * The result of newnfs_nmcancelreqs() is not examined.
  */
 static void
 nfs_purge(struct mount *mp)
 {
 
 	newnfs_nmcancelreqs(VFSTONFS(mp));
 }
 
 /*
  * Extract the information needed by the nlm from the nfs vnode.
  *
  * Every output pointer is optional; a NULL pointer means that item is not
  * wanted.  The items are the file handle and its length, the server
  * address (at most sizeof(*sp) bytes), whether the vnode is NFSv3, the
  * node's n_size, and the mount's nm_timeo converted from NFS_HZ units to
  * a struct timeval.
  */
 static void
 nfs_getnlminfo(struct vnode *vp, uint8_t *fhp, size_t *fhlenp,
     struct sockaddr_storage *sp, int *is_v3p, off_t *sizep,
     struct timeval *timeop)
 {
 	struct nfsnode *node;
 	struct nfsmount *mnt;
 
 	node = VTONFS(vp);
 	mnt = VFSTONFS(vp->v_mount);
 
 	if (fhlenp != NULL)
 		*fhlenp = (size_t)node->n_fhp->nfh_len;
 	if (fhp != NULL)
 		bcopy(node->n_fhp->nfh_fh, fhp, node->n_fhp->nfh_len);
 	if (sp != NULL)
 		bcopy(mnt->nm_nam, sp, min(mnt->nm_nam->sa_len, sizeof(*sp)));
 	if (is_v3p != NULL)
 		*is_v3p = NFS_ISV3(vp);
 	if (sizep != NULL)
 		*sizep = node->n_size;
 	if (timeop == NULL)
 		return;
 	/* Whole seconds first, then the remainder scaled to microseconds. */
 	timeop->tv_sec = mnt->nm_timeo / NFS_HZ;
 	timeop->tv_usec = (mnt->nm_timeo % NFS_HZ) * (1000000 / NFS_HZ);
 }
 
 /*
  * Append the option name "opt" at *buf when "testval" is non-zero and the
  * name plus its terminating NUL fits in the *blen bytes that remain.
  * On output *buf is advanced past the name and *blen is reduced by its
  * length; otherwise both are left untouched.  "nmp" is not used.
  */
 static __inline void nfscl_printopt(struct nfsmount *nmp, int testval,
     char *opt, char **buf, size_t *blen)
 {
 	int written;
 
 	/* Condition false, or no room for the name and its NUL. */
 	if (testval == 0 || *blen <= strlen(opt))
 		return;
 
 	written = snprintf(*buf, *blen, "%s", opt);
 	if (written != strlen(opt))
 		printf("EEK!!\n");
 	*buf += written;
 	*blen -= written;
 }
 
 /*
  * This function prints out an option's integer value as "opt=value".
  *
  * The text is written at *buf, which has *blen bytes remaining.  When the
  * whole string and its terminating NUL fit, *buf is advanced and *blen is
  * reduced by the number of characters written.  When it does not fit,
  * nothing is appended: *buf and *blen are unchanged and the buffer is
  * re-terminated at *buf, so that a partially printed number (for example
  * "wsize=6" for 65536) is never left at the end of the option string.
  * "nmp" is not used.
  */
 static __inline void nfscl_printoptval(struct nfsmount *nmp, int optval,
     char *opt, char **buf, size_t *blen)
 {
 	int len;
 
 	/* Cheap pre-check: room for the name, '=' and at least the NUL. */
 	if (*blen > strlen(opt) + 1) {
 		len = snprintf(*buf, *blen, "%s=%d", opt, optval);
 		if (len >= 0 && (size_t)len < *blen) {
 			*buf += len;
 			*blen -= len;
 		} else {
 			/* Truncated: discard the partial text just written. */
 			**buf = '\0';
 		}
 	}
 }
 
 /*
  * Load the option flags and values into the buffer.
  *
  * Builds a comma separated option string for the mount "nmp" in "buffer",
  * which holds "buflen" bytes.  Options are emitted in the fixed order
  * below: the NFS version name(s) come first without a leading comma and
  * every later option carries its own leading comma.  The helpers skip any
  * option that does not fit in the space that remains, so the result can be
  * incomplete when the buffer is small.
  */
 void nfscl_retopts(struct nfsmount *nmp, char *buffer, size_t buflen)
 {
 	char *buf;
 	size_t blen;
 
 	/* buf/blen form a cursor that the helpers advance as they write. */
 	buf = buffer;
 	blen = buflen;
 	nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NFSV4) != 0, "nfsv4", &buf,
 	    &blen);
 	/* These three are only reported for NFSv4 mounts. */
 	if ((nmp->nm_flag & NFSMNT_NFSV4) != 0) {
 		nfscl_printoptval(nmp, nmp->nm_minorvers, ",minorversion", &buf,
 		    &blen);
 		nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_PNFS) != 0, ",pnfs",
 		    &buf, &blen);
 		nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_ONEOPENOWN) != 0 &&
 		    nmp->nm_minorvers > 0, ",oneopenown", &buf, &blen);
 	}
 	nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NFSV3) != 0, "nfsv3", &buf,
 	    &blen);
 	/* Neither the V3 nor the V4 flag set is reported as "nfsv2". */
 	nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0,
 	    "nfsv2", &buf, &blen);
 	/* Any socket type other than SOCK_STREAM is reported as "udp". */
 	nfscl_printopt(nmp, nmp->nm_sotype == SOCK_STREAM, ",tcp", &buf, &blen);
 	nfscl_printopt(nmp, nmp->nm_sotype != SOCK_STREAM, ",udp", &buf, &blen);
 	nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_RESVPORT) != 0, ",resvport",
 	    &buf, &blen);
 	nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NOCONN) != 0, ",noconn",
 	    &buf, &blen);
 	nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_SOFT) == 0, ",hard", &buf,
 	    &blen);
 	nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_SOFT) != 0, ",soft", &buf,
 	    &blen);
 	nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_INT) != 0, ",intr", &buf,
 	    &blen);
 	nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NOCTO) == 0, ",cto", &buf,
 	    &blen);
 	nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NOCTO) != 0, ",nocto", &buf,
 	    &blen);
 	nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NONCONTIGWR) != 0,
 	    ",noncontigwr", &buf, &blen);
 	/* Neither "lockd" nor "nolockd" is printed when NFSMNT_NFSV4 is set. */
 	nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_NOLOCKD | NFSMNT_NFSV4)) ==
 	    0, ",lockd", &buf, &blen);
 	nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_NOLOCKD | NFSMNT_NFSV4)) ==
 	    NFSMNT_NOLOCKD, ",nolockd", &buf, &blen);
 	nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_RDIRPLUS) != 0, ",rdirplus",
 	    &buf, &blen);
 	/*
 	 * Security flavor: "sys" when NFSMNT_KERB is clear, otherwise
 	 * krb5, krb5i or krb5p depending on which one (if any) of the
 	 * integrity and privacy flags accompanies it.
 	 */
 	nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_KERB) == 0, ",sec=sys",
 	    &buf, &blen);
 	nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_KERB | NFSMNT_INTEGRITY |
 	    NFSMNT_PRIVACY)) == NFSMNT_KERB, ",sec=krb5", &buf, &blen);
 	nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_KERB | NFSMNT_INTEGRITY |
 	    NFSMNT_PRIVACY)) == (NFSMNT_KERB | NFSMNT_INTEGRITY), ",sec=krb5i",
 	    &buf, &blen);
 	nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_KERB | NFSMNT_INTEGRITY |
 	    NFSMNT_PRIVACY)) == (NFSMNT_KERB | NFSMNT_PRIVACY), ",sec=krb5p",
 	    &buf, &blen);
 	/* Numeric settings are always reported, as "name=value". */
 	nfscl_printoptval(nmp, nmp->nm_acdirmin, ",acdirmin", &buf, &blen);
 	nfscl_printoptval(nmp, nmp->nm_acdirmax, ",acdirmax", &buf, &blen);
 	nfscl_printoptval(nmp, nmp->nm_acregmin, ",acregmin", &buf, &blen);
 	nfscl_printoptval(nmp, nmp->nm_acregmax, ",acregmax", &buf, &blen);
 	nfscl_printoptval(nmp, nmp->nm_nametimeo, ",nametimeo", &buf, &blen);
 	nfscl_printoptval(nmp, nmp->nm_negnametimeo, ",negnametimeo", &buf,
 	    &blen);
 	nfscl_printoptval(nmp, nmp->nm_rsize, ",rsize", &buf, &blen);
 	nfscl_printoptval(nmp, nmp->nm_wsize, ",wsize", &buf, &blen);
 	nfscl_printoptval(nmp, nmp->nm_readdirsize, ",readdirsize", &buf,
 	    &blen);
 	nfscl_printoptval(nmp, nmp->nm_readahead, ",readahead", &buf, &blen);
 	nfscl_printoptval(nmp, nmp->nm_wcommitsize, ",wcommitsize", &buf,
 	    &blen);
 	nfscl_printoptval(nmp, nmp->nm_timeo, ",timeout", &buf, &blen);
 	nfscl_printoptval(nmp, nmp->nm_retry, ",retrans", &buf, &blen);
 }
 
Index: projects/clang1100-import/sys/fs/nullfs/null_vfsops.c
===================================================================
--- projects/clang1100-import/sys/fs/nullfs/null_vfsops.c	(revision 364278)
+++ projects/clang1100-import/sys/fs/nullfs/null_vfsops.c	(revision 364279)
@@ -1,468 +1,468 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1992, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)null_vfsops.c	8.2 (Berkeley) 1/21/94
  *
  * @(#)lofs_vfsops.c	1.2 (Berkeley) 6/18/92
  * $FreeBSD$
  */
 
 /*
  * Null Layer
  * (See null_vnops.c for a description of what this does.)
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/jail.h>
 
 #include <fs/nullfs/null.h>
 
 /* Malloc type used for the per-mount struct null_mount. */
 static MALLOC_DEFINE(M_NULLFSMNT, "nullfs_mount", "NULLFS mount structure");
 
 /* VFS operations implemented in this file. */
 static vfs_fhtovp_t	nullfs_fhtovp;
 static vfs_mount_t	nullfs_mount;
 static vfs_quotactl_t	nullfs_quotactl;
 static vfs_root_t	nullfs_root;
 static vfs_sync_t	nullfs_sync;
 static vfs_statfs_t	nullfs_statfs;
 static vfs_unmount_t	nullfs_unmount;
 static vfs_vget_t	nullfs_vget;
 static vfs_extattrctl_t	nullfs_extattrctl;
 
 /*
  * Mount a null layer over the directory named by the "target" option.
  *
  * Returns EOPNOTSUPP for a root file system mount and for any update
  * other than an "export" update, EINVAL when "target" cannot be fetched
  * or is not NUL-terminated, EDEADLK when the covered vnode is a nullfs
  * vnode that already aliases the looked-up lower vnode, the error from
  * namei() or null_nodeget() when those fail, and 0 on success.
  */
 static int
 nullfs_mount(struct mount *mp)
 {
 	struct vnode *lowerrootvp;
 	struct vnode *nullm_rootvp;
 	struct null_mount *xmp;
 	struct null_node *nn;
 	struct nameidata nd, *ndp;
 	char *target;
 	int error, len;
 	bool isvnunlocked;
 
 	NULLFSDEBUG("nullfs_mount(mp = %p)\n", (void *)mp);
 
 	if (mp->mnt_flag & MNT_ROOTFS)
 		return (EOPNOTSUPP);
 
 	/*
 	 * An update changes nothing in the null layer itself.
 	 */
 	if (mp->mnt_flag & MNT_UPDATE) {
 		/*
 		 * Only support update mounts for NFS export.
 		 */
 		if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0))
 			return (0);
 		else
 			return (EOPNOTSUPP);
 	}
 
 	/*
 	 * Get argument
 	 */
 	error = vfs_getopt(mp->mnt_optnew, "target", (void **)&target, &len);
 	if (error || target[len - 1] != '\0')
 		return (EINVAL);
 
 	/*
 	 * Unlock lower node to avoid possible deadlock.
 	 */
 	if (mp->mnt_vnodecovered->v_op == &null_vnodeops &&
 	    VOP_ISLOCKED(mp->mnt_vnodecovered) == LK_EXCLUSIVE) {
 		VOP_UNLOCK(mp->mnt_vnodecovered);
 		isvnunlocked = true;
 	} else {
 		isvnunlocked = false;
 	}
 
 	/*
 	 * Find lower node
 	 */
 	ndp = &nd;
 	NDINIT(ndp, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, target, curthread);
 	error = namei(ndp);
 
 	/*
 	 * Re-lock vnode.
 	 * XXXKIB This is deadlock-prone as well.
 	 */
 	if (isvnunlocked)
 		vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY);
 
 	/* The covered vnode is re-locked before the lookup result is checked. */
 	if (error)
 		return (error);
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 
 	/*
 	 * Sanity check on lower vnode
 	 */
 	lowerrootvp = ndp->ni_vp;
 
 	/*
 	 * Check multi null mount to avoid `lock against myself' panic.
 	 */
 	if (mp->mnt_vnodecovered->v_op == &null_vnodeops) {
 		nn = VTONULL(mp->mnt_vnodecovered);
 		if (nn == NULL || lowerrootvp == nn->null_lowervp) {
 			NULLFSDEBUG("nullfs_mount: multi null mount?\n");
 			vput(lowerrootvp);
 			return (EDEADLK);
 		}
 	}
 
 	xmp = (struct null_mount *) malloc(sizeof(struct null_mount),
 	    M_NULLFSMNT, M_WAITOK | M_ZERO);
 
 	/*
 	 * Save pointer to underlying FS and the reference to the
 	 * lower root vnode.
 	 */
 	xmp->nullm_vfs = lowerrootvp->v_mount;
 	vref(lowerrootvp);
 	xmp->nullm_lowerrootvp = lowerrootvp;
 	mp->mnt_data = xmp;
 
 	/*
 	 * Make sure the node alias worked.
 	 */
 	error = null_nodeget(mp, lowerrootvp, &nullm_rootvp);
 	if (error != 0) {
 		/* Undo the vref() taken above and discard the mount data. */
 		vrele(lowerrootvp);
 		free(xmp, M_NULLFSMNT);
 		return (error);
 	}
 
 	/* Inherit MNT_LOCAL from the lower file system. */
 	if (NULLVPTOLOWERVP(nullm_rootvp)->v_mount->mnt_flag & MNT_LOCAL) {
 		MNT_ILOCK(mp);
 		mp->mnt_flag |= MNT_LOCAL;
 		MNT_IUNLOCK(mp);
 	}
 
 	/*
 	 * Caching is enabled unless the "nocache" option is present or the
 	 * lower mount has MNTK_NULL_NOCACHE set.
 	 */
 	xmp->nullm_flags |= NULLM_CACHE;
 	if (vfs_getopt(mp->mnt_optnew, "nocache", NULL, NULL) == 0 ||
 	    (xmp->nullm_vfs->mnt_kern_flag & MNTK_NULL_NOCACHE) != 0)
 		xmp->nullm_flags &= ~NULLM_CACHE;
 
 	/* Copy selected kernel flags from the lower mount. */
 	MNT_ILOCK(mp);
 	if ((xmp->nullm_flags & NULLM_CACHE) != 0) {
 		mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag &
 		    (MNTK_SHARED_WRITES | MNTK_LOOKUP_SHARED |
 		    MNTK_EXTENDED_SHARED);
 	}
 	mp->mnt_kern_flag |= MNTK_LOOKUP_EXCL_DOTDOT | MNTK_NOMSYNC;
 	mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag &
 	    (MNTK_USES_BCACHE | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS);
 	MNT_IUNLOCK(mp);
 	vfs_getnewfsid(mp);
 	/*
 	 * When caching, link this mount onto the lower mount's mnt_uppers
 	 * list; nullfs_unmount() removes it under the same condition.
 	 */
 	if ((xmp->nullm_flags & NULLM_CACHE) != 0) {
 		MNT_ILOCK(xmp->nullm_vfs);
 		TAILQ_INSERT_TAIL(&xmp->nullm_vfs->mnt_uppers, mp,
 		    mnt_upper_link);
 		MNT_IUNLOCK(xmp->nullm_vfs);
 	}
 
 	vfs_mountedfrom(mp, target);
 	vput(nullm_rootvp);
 
 	NULLFSDEBUG("nullfs_mount: lower %s, alias at %s\n",
 		mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
 	return (0);
 }
 
 /*
  * Free reference to null layer
  */
 static int
 nullfs_unmount(mp, mntflags)
 	struct mount *mp;
 	int mntflags;
 {
 	struct null_mount *mntdata;
 	struct mount *ump;
 	int error, flags;
 
 	NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp);
 
 	if (mntflags & MNT_FORCE)
 		flags = FORCECLOSE;
 	else
 		flags = 0;
 
 	for (;;) {
 		/* There is 1 extra root vnode reference (nullm_rootvp). */
 		error = vflush(mp, 0, flags, curthread);
 		if (error)
 			return (error);
 		MNT_ILOCK(mp);
 		if (mp->mnt_nvnodelistsize == 0) {
 			MNT_IUNLOCK(mp);
 			break;
 		}
 		MNT_IUNLOCK(mp);
 		if ((mntflags & MNT_FORCE) == 0)
 			return (EBUSY);
 	}
 
 	/*
 	 * Finally, throw away the null_mount structure
 	 */
 	mntdata = mp->mnt_data;
 	ump = mntdata->nullm_vfs;
 	if ((mntdata->nullm_flags & NULLM_CACHE) != 0) {
 		MNT_ILOCK(ump);
 		while ((ump->mnt_kern_flag & MNTK_VGONE_UPPER) != 0) {
 			ump->mnt_kern_flag |= MNTK_VGONE_WAITER;
 			msleep(&ump->mnt_uppers, &ump->mnt_mtx, 0, "vgnupw", 0);
 		}
 		TAILQ_REMOVE(&ump->mnt_uppers, mp, mnt_upper_link);
 		MNT_IUNLOCK(ump);
 	}
 	vrele(mntdata->nullm_lowerrootvp);
 	mp->mnt_data = NULL;
 	free(mntdata, M_NULLFSMNT);
 	return (0);
 }
 
 /*
  * Return the nullfs root vnode in *vpp.
  *
  * Takes a reference (and the lock requested by flags) on the lower root
  * vnode with vget(), then asks null_nodeget() for the nullfs vnode that
  * aliases it.  *vpp is written only when both steps succeed; otherwise
  * the failing step's error is returned.
  */
 static int
 nullfs_root(mp, flags, vpp)
 	struct mount *mp;
 	int flags;
 	struct vnode **vpp;
 {
 	struct vnode *vp;
 	struct null_mount *mntdata;
 	int error;
 
 	mntdata = MOUNTTONULLMOUNT(mp);
 	NULLFSDEBUG("nullfs_root(mp = %p, vp = %p)\n", mp,
 	    mntdata->nullm_lowerrootvp);
 
-	error = vget(mntdata->nullm_lowerrootvp, flags, curthread);
+	error = vget(mntdata->nullm_lowerrootvp, flags);
 	if (error == 0) {
 		error = null_nodeget(mp, mntdata->nullm_lowerrootvp, &vp);
 		if (error == 0) {
 			*vpp = vp;
 		}
 	}
 	return (error);
 }
 
 /*
  * Forward a quota control request unchanged to the lower file system and
  * return its result.
  */
 static int
 nullfs_quotactl(struct mount *mp, int cmd, uid_t uid, void *arg)
 {
 	struct mount *lowermp;
 
 	lowermp = MOUNTTONULLMOUNT(mp)->nullm_vfs;
 	return (VFS_QUOTACTL(lowermp, cmd, uid, arg));
 }
 
 static int
 nullfs_statfs(mp, sbp)
 	struct mount *mp;
 	struct statfs *sbp;
 {
 	int error;
 	struct statfs *mstat;
 
 	NULLFSDEBUG("nullfs_statfs(mp = %p, vp = %p->%p)\n", (void *)mp,
 	    (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp,
 	    (void *)NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp));
 
 	mstat = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK | M_ZERO);
 
 	error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, mstat);
 	if (error) {
 		free(mstat, M_STATFS);
 		return (error);
 	}
 
 	/* now copy across the "interesting" information and fake the rest */
 	sbp->f_type = mstat->f_type;
 	sbp->f_flags = (sbp->f_flags & (MNT_RDONLY | MNT_NOEXEC | MNT_NOSUID |
 	    MNT_UNION | MNT_NOSYMFOLLOW | MNT_AUTOMOUNTED)) |
 	    (mstat->f_flags & ~(MNT_ROOTFS | MNT_AUTOMOUNTED));
 	sbp->f_bsize = mstat->f_bsize;
 	sbp->f_iosize = mstat->f_iosize;
 	sbp->f_blocks = mstat->f_blocks;
 	sbp->f_bfree = mstat->f_bfree;
 	sbp->f_bavail = mstat->f_bavail;
 	sbp->f_files = mstat->f_files;
 	sbp->f_ffree = mstat->f_ffree;
 
 	free(mstat, M_STATFS);
 	return (0);
 }
 
 /*
  * Sync is a no-op for the null layer; always returns 0.
  */
 static int
 nullfs_sync(struct mount *mp, int waitfor)
 {
 
 	/* XXX - Assumes no data cached at null layer. */
 	return (0);
 }
 
 /*
  * Look up inode number ino on the lower file system and return the
  * nullfs vnode aliasing the result in *vpp.  flags must request a lock
  * type.  Returns the error from VFS_VGET() or null_nodeget(), or 0.
  */
 static int
 nullfs_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
 {
 	struct mount *lowermp;
 	int error;
 
 	KASSERT((flags & LK_TYPE_MASK) != 0,
 	    ("nullfs_vget: no lock requested"));
 
 	lowermp = MOUNTTONULLMOUNT(mp)->nullm_vfs;
 	error = VFS_VGET(lowermp, ino, flags, vpp);
 	if (error == 0) {
 		/* Replace the lower vnode in *vpp with its nullfs alias. */
 		error = null_nodeget(mp, *vpp, vpp);
 	}
 	return (error);
 }
 
 static int
 nullfs_fhtovp(mp, fidp, flags, vpp)
 	struct mount *mp;
 	struct fid *fidp;
 	int flags;
 	struct vnode **vpp;
 {
 	int error;
 
 	error = VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fidp, flags,
 	    vpp);
 	if (error != 0)
 		return (error);
 	return (null_nodeget(mp, *vpp, vpp));
 }
 
 /*
  * Forward an extended attribute control request unchanged to the lower
  * file system and return its result.
  */
 static int
 nullfs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
     int namespace, const char *attrname)
 {
 	struct mount *lowermp;
 
 	lowermp = MOUNTTONULLMOUNT(mp)->nullm_vfs;
 	return (VFS_EXTATTRCTL(lowermp, cmd, filename_vp, namespace,
 	    attrname));
 }
 
 /*
  * vfs_reclaim_lowervp hook: if this mount has a nullfs vnode for
  * lowervp, mark it NULLV_NOUNLOCK, reclaim it with vgone() and vput()
  * it.  Does nothing when null_hashget() finds no such vnode.
  */
 static void
 nullfs_reclaim_lowervp(struct mount *mp, struct vnode *lowervp)
 {
 	struct vnode *nullvp;
 
 	nullvp = null_hashget(mp, lowervp);
 	if (nullvp != NULL) {
 		VTONULL(nullvp)->null_flags |= NULLV_NOUNLOCK;
 		vgone(nullvp);
 		vput(nullvp);
 	}
 }
 
 /*
  * vfs_unlink_lowervp hook: if this mount has a nullfs vnode for lowervp,
  * flag it NULLV_DROP | NULLV_NOUNLOCK and drop the use reference that
  * null_hashget() returned.  A hold reference is taken first so the vnode
  * can still be examined after vunref(); it is released by the final
  * vdrop().
  */
 static void
 nullfs_unlink_lowervp(struct mount *mp, struct vnode *lowervp)
 {
 	struct vnode *vp;
 	struct null_node *xp;
 
 	vp = null_hashget(mp, lowervp);
 	if (vp == NULL)
 		return;
 	xp = VTONULL(vp);
 	xp->null_flags |= NULLV_DROP | NULLV_NOUNLOCK;
 	/* Keep vp itself alive across the vunref() below. */
 	vhold(vp);
 	vunref(vp);
 
 	if (vp->v_usecount == 0) {
 		/*
 		 * If vunref() dropped the last use reference on the
 		 * nullfs vnode, it must be reclaimed, and its lock
 		 * was split from the lower vnode lock.  Need to do
 		 * extra unlock before allowing the final vdrop() to
 		 * free the vnode.
 		 */
 		KASSERT(VN_IS_DOOMED(vp),
 		    ("not reclaimed nullfs vnode %p", vp));
 		VOP_UNLOCK(vp);
 	} else {
 		/*
 		 * Otherwise, the nullfs vnode still shares the lock
 		 * with the lower vnode, and must not be unlocked.
 		 * Also clear the NULLV_NOUNLOCK, the flag is not
 		 * relevant for future reclamations.
 		 */
 		ASSERT_VOP_ELOCKED(vp, "unlink_lowervp");
 		KASSERT(!VN_IS_DOOMED(vp),
 		    ("reclaimed nullfs vnode %p", vp));
 		xp->null_flags &= ~NULLV_NOUNLOCK;
 	}
 	vdrop(vp);
 }
 
 /*
  * VFS operations vector for nullfs.  nullfs_init and nullfs_uninit are
  * not defined in this file.
  */
 static struct vfsops null_vfsops = {
 	.vfs_extattrctl =	nullfs_extattrctl,
 	.vfs_fhtovp =		nullfs_fhtovp,
 	.vfs_init =		nullfs_init,
 	.vfs_mount =		nullfs_mount,
 	.vfs_quotactl =		nullfs_quotactl,
 	.vfs_root =		nullfs_root,
 	.vfs_statfs =		nullfs_statfs,
 	.vfs_sync =		nullfs_sync,
 	.vfs_uninit =		nullfs_uninit,
 	.vfs_unmount =		nullfs_unmount,
 	.vfs_vget =		nullfs_vget,
 	.vfs_reclaim_lowervp =	nullfs_reclaim_lowervp,
 	.vfs_unlink_lowervp =	nullfs_unlink_lowervp,
 };
 
 /* Register the file system under the name "nullfs". */
 VFS_SET(null_vfsops, nullfs, VFCF_LOOPBACK | VFCF_JAIL);
Index: projects/clang1100-import/sys/fs/pseudofs/pseudofs_vncache.c
===================================================================
--- projects/clang1100-import/sys/fs/pseudofs/pseudofs_vncache.c	(revision 364278)
+++ projects/clang1100-import/sys/fs/pseudofs/pseudofs_vncache.c	(revision 364279)
@@ -1,361 +1,361 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_pseudofs.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <fs/pseudofs/pseudofs.h>
 #include <fs/pseudofs/pseudofs_internal.h>
 
 /* Malloc type for struct pfs_vdata entries and the hash table. */
 static MALLOC_DEFINE(M_PFSVNCACHE, "pfs_vncache", "pseudofs vnode cache");
 
 /* Protects the hash chains and pfs_vncache_entries. */
 static struct mtx pfs_vncache_mutex;
 /* Tag of the process_exit handler registered by pfs_vncache_load(). */
 static eventhandler_tag pfs_exit_tag;
 static void pfs_exit(void *arg, struct proc *p);
 static void pfs_purge_all(void);
 
 static SYSCTL_NODE(_vfs_pfs, OID_AUTO, vncache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "pseudofs vnode cache");
 
 /* Read-only statistics exported below the vncache sysctl node. */
 static int pfs_vncache_entries;
 SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, entries, CTLFLAG_RD,
     &pfs_vncache_entries, 0,
     "number of entries in the vnode cache");
 
 static int pfs_vncache_maxentries;
 SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, maxentries, CTLFLAG_RD,
     &pfs_vncache_maxentries, 0,
     "highest number of entries in the vnode cache");
 
 static int pfs_vncache_hits;
 SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, hits, CTLFLAG_RD,
     &pfs_vncache_hits, 0,
     "number of cache hits since initialization");
 
 static int pfs_vncache_misses;
 SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, misses, CTLFLAG_RD,
     &pfs_vncache_misses, 0,
     "number of cache misses since initialization");
 
 extern struct vop_vector pfs_vnodeops;	/* XXX -> .h file */
 
 /*
  * Hash table of cached vnodes, keyed by pid.  pfs_vncache_hash is filled
  * in by hashinit() and is used as a bit mask to select the chain.
  */
 static SLIST_HEAD(pfs_vncache_head, pfs_vdata) *pfs_vncache_hashtbl;
 static u_long pfs_vncache_hash;
 #define PFS_VNCACHE_HASH(pid)	(&pfs_vncache_hashtbl[(pid) & pfs_vncache_hash])
 
 /*
  * Initialize vnode cache: set up the mutex, allocate the hash table
  * (sized from maxproc / 4) and register pfs_exit() as a process_exit
  * event handler.
  */
 void
 pfs_vncache_load(void)
 {
 
 	mtx_init(&pfs_vncache_mutex, "pfs_vncache", NULL, MTX_DEF);
 	pfs_vncache_hashtbl = hashinit(maxproc / 4, M_PFSVNCACHE, &pfs_vncache_hash);
 	pfs_exit_tag = EVENTHANDLER_REGISTER(process_exit, pfs_exit, NULL,
 	    EVENTHANDLER_PRI_ANY);
 }
 
 /*
  * Tear down vnode cache: deregister the process_exit handler, purge all
  * cached vnodes and destroy the mutex.
  *
  * NOTE(review): the table allocated by hashinit() in pfs_vncache_load()
  * is not released here - confirm whether that is intentional.
  */
 void
 pfs_vncache_unload(void)
 {
 
 	EVENTHANDLER_DEREGISTER(process_exit, pfs_exit_tag);
 	pfs_purge_all();
 	KASSERT(pfs_vncache_entries == 0,
 	    ("%d vncache entries remaining", pfs_vncache_entries));
 	mtx_destroy(&pfs_vncache_mutex);
 }
 
 /*
  * Allocate a vnode
  *
  * Returns in *vpp the vnode for node pn and process pid on mount mp,
  * taking it from the cache when present and creating it otherwise.  On
  * success the vnode is exclusively locked (vget() with LK_EXCLUSIVE on a
  * cache hit, vn_lock() for a new vnode) and 0 is returned.  On failure
  * the error from getnewvnode() or insmntque() is returned.
  */
 int
 pfs_vncache_alloc(struct mount *mp, struct vnode **vpp,
 		  struct pfs_node *pn, pid_t pid)
 {
 	struct pfs_vncache_head *hash;
 	struct pfs_vdata *pvd, *pvd2;
 	struct vnode *vp;
 	int error;
 
 	/*
 	 * See if the vnode is in the cache.
 	 */
 	hash = PFS_VNCACHE_HASH(pid);
 	/* This emptiness check is made without pfs_vncache_mutex held. */
 	if (SLIST_EMPTY(hash))
 		goto alloc;
 retry:
 	mtx_lock(&pfs_vncache_mutex);
 	SLIST_FOREACH(pvd, hash, pvd_hash) {
 		if (pvd->pvd_pn == pn && pvd->pvd_pid == pid &&
 		    pvd->pvd_vnode->v_mount == mp) {
 			vp = pvd->pvd_vnode;
 			/*
 			 * Take the vnode interlock before dropping the
 			 * cache mutex; vget() is passed LK_INTERLOCK.
 			 */
 			VI_LOCK(vp);
 			mtx_unlock(&pfs_vncache_mutex);
-			if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) == 0) {
+			if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) {
 				++pfs_vncache_hits;
 				*vpp = vp;
 				/*
 				 * Some callers cache_enter(vp) later, so
 				 * we have to make sure it's not in the
 				 * VFS cache so it doesn't get entered
 				 * twice.  A better solution would be to
 				 * make pfs_vncache_alloc() responsible
 				 * for entering the vnode in the VFS
 				 * cache.
 				 */
 				cache_purge(vp);
 				return (0);
 			}
 			/* vget() failed: rescan the chain from the start. */
 			goto retry;
 		}
 	}
 	mtx_unlock(&pfs_vncache_mutex);
 alloc:
 	/* nope, get a new one */
 	pvd = malloc(sizeof *pvd, M_PFSVNCACHE, M_WAITOK);
 	error = getnewvnode("pseudofs", mp, &pfs_vnodeops, vpp);
 	if (error) {
 		free(pvd, M_PFSVNCACHE);
 		return (error);
 	}
 	pvd->pvd_pn = pn;
 	pvd->pvd_pid = pid;
 	(*vpp)->v_data = pvd;
 	/* Derive the vnode type from the pseudofs node type. */
 	switch (pn->pn_type) {
 	case pfstype_root:
 		(*vpp)->v_vflag = VV_ROOT;
 #if 0
 		printf("root vnode allocated\n");
 #endif
 		/* fall through */
 	case pfstype_dir:
 	case pfstype_this:
 	case pfstype_parent:
 	case pfstype_procdir:
 		(*vpp)->v_type = VDIR;
 		break;
 	case pfstype_file:
 		(*vpp)->v_type = VREG;
 		break;
 	case pfstype_symlink:
 		(*vpp)->v_type = VLNK;
 		break;
 	case pfstype_none:
 		KASSERT(0, ("pfs_vncache_alloc called for null node\n"));
 		/* no break: continues into the default case */
 	default:
 		panic("%s has unexpected type: %d", pn->pn_name, pn->pn_type);
 	}
 	/*
 	 * Propagate flag through to vnode so users know it can change
 	 * if the process changes (i.e. execve)
 	 */
 	if ((pn->pn_flags & PFS_PROCDEP) != 0)
 		(*vpp)->v_vflag |= VV_PROCDEP;
 	pvd->pvd_vnode = *vpp;
 	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 	VN_LOCK_AREC(*vpp);
 	error = insmntque(*vpp, mp);
 	if (error != 0) {
 		free(pvd, M_PFSVNCACHE);
 		*vpp = NULLVP;
 		return (error);
 	}
 retry2:
 	mtx_lock(&pfs_vncache_mutex);
 	/*
 	 * Other thread may race with us, creating the entry we are
 	 * going to insert into the cache. Recheck after
 	 * pfs_vncache_mutex is reacquired.
 	 */
 	SLIST_FOREACH(pvd2, hash, pvd_hash) {
 		if (pvd2->pvd_pn == pn && pvd2->pvd_pid == pid &&
 		    pvd2->pvd_vnode->v_mount == mp) {
 			vp = pvd2->pvd_vnode;
 			VI_LOCK(vp);
 			mtx_unlock(&pfs_vncache_mutex);
-			if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) == 0) {
+			if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) {
 				++pfs_vncache_hits;
 				/*
 				 * Lost the race: discard the vnode just
 				 * created and return the cached one.
 				 */
 				vgone(*vpp);
 				vput(*vpp);
 				*vpp = vp;
 				cache_purge(vp);
 				return (0);
 			}
 			goto retry2;
 		}
 	}
 	/* No competing entry: account for the miss and publish ours. */
 	++pfs_vncache_misses;
 	if (++pfs_vncache_entries > pfs_vncache_maxentries)
 		pfs_vncache_maxentries = pfs_vncache_entries;
 	SLIST_INSERT_HEAD(hash, pvd, pvd_hash);
 	mtx_unlock(&pfs_vncache_mutex);
 	return (0);
 }
 
 /*
  * Release the pseudofs data attached to a vnode.
  *
  * If the vnode's pfs_vdata is linked on its hash chain it is unlinked and
  * the entry count is decremented; in either case the pfs_vdata is freed
  * and vp->v_data is cleared.  Always returns 0.
  */
 int
 pfs_vncache_free(struct vnode *vp)
 {
 	struct pfs_vncache_head *bucket;
 	struct pfs_vdata *pvd, *cur;
 
 	mtx_lock(&pfs_vncache_mutex);
 	pvd = (struct pfs_vdata *)vp->v_data;
 	KASSERT(pvd != NULL, ("pfs_vncache_free(): no vnode data\n"));
 	bucket = PFS_VNCACHE_HASH(pvd->pvd_pid);
 	SLIST_FOREACH(cur, bucket, pvd_hash) {
 		if (cur == pvd) {
 			SLIST_REMOVE(bucket, pvd, pfs_vdata, pvd_hash);
 			--pfs_vncache_entries;
 			break;
 		}
 	}
 	mtx_unlock(&pfs_vncache_mutex);
 
 	free(pvd, M_PFSVNCACHE);
 	vp->v_data = NULL;
 	return (0);
 }
 
 /*
  * Purge the cache of dead entries
  *
  * The code is not very efficient and this perhaps can be addressed without
  * a complete rewrite. Previous iteration was walking a linked list from
  * scratch every time. This code only walks the relevant hash chain (if pid
  * is provided), but still resorts to scanning the entire cache at least twice
  * if a specific component is to be removed which is slower. This can be
  * augmented with resizing the hash.
  *
  * Explanation of the previous state:
  *
  * This is extremely inefficient due to the fact that vgone() not only
  * indirectly modifies the vnode cache, but may also sleep.  We can
  * neither hold pfs_vncache_mutex across a vgone() call, nor make any
  * assumptions about the state of the cache after vgone() returns.  In
  * consequence, we must start over after every vgone() call, and keep
  * trying until we manage to traverse the entire cache.
  *
  * The only way to improve this situation is to change the data structure
  * used to implement the cache.
  */
 
 /*
  * Reclaim a single vnode: lock it exclusively, vgone() it, unlock it and
  * release one hold reference with vdrop().  The callers in this file
  * take that hold with vhold() before calling.
  */
 static void
 pfs_purge_one(struct vnode *vnp)
 {
 
 	VOP_LOCK(vnp, LK_EXCLUSIVE);
 	vgone(vnp);
 	VOP_UNLOCK(vnp);
 	vdrop(vnp);
 }
 
 /*
  * Reclaim every cached vnode that belongs to node pn, or every cached
  * vnode when pn is NULL.
  *
  * pfs_vncache_mutex is dropped around each pfs_purge_one() call, so the
  * current chain is rescanned from its head after every purge, and the
  * whole table is rescanned until a complete pass removes nothing.
  */
 void
 pfs_purge(struct pfs_node *pn)
 {
 	struct pfs_vdata *pvd;
 	struct vnode *vnp;
 	u_long i, removed;
 
 	mtx_lock(&pfs_vncache_mutex);
 restart:
 	removed = 0;
 	/*
 	 * pfs_vncache_hash is the mask filled in by hashinit() and applied
 	 * with '&' in PFS_VNCACHE_HASH(), so it is itself a valid bucket
 	 * index.  The bound must be inclusive or the last chain is never
 	 * purged.
 	 */
 	for (i = 0; i <= pfs_vncache_hash; i++) {
 restart_chain:
 		SLIST_FOREACH(pvd, &pfs_vncache_hashtbl[i], pvd_hash) {
 			if (pn != NULL && pvd->pvd_pn != pn)
 				continue;
 			vnp = pvd->pvd_vnode;
 			/* Hold the vnode so it survives dropping the mutex. */
 			vhold(vnp);
 			mtx_unlock(&pfs_vncache_mutex);
 			pfs_purge_one(vnp);
 			removed++;
 			mtx_lock(&pfs_vncache_mutex);
 			goto restart_chain;
 		}
 	}
 	if (removed > 0)
 		goto restart;
 	mtx_unlock(&pfs_vncache_mutex);
 }
 
 /*
  * Purge every entry in the vnode cache (pfs_purge() with no node filter).
  */
 static void
 pfs_purge_all(void)
 {
 
 	pfs_purge(NULL);
 }
 
 /*
  * process_exit event handler: purge every cached vnode whose pid matches
  * the exiting process.
  *
  * The chain is searched again from its head after each purge because
  * pfs_vncache_mutex is dropped around pfs_purge_one().
  */
 static void
 pfs_exit(void *arg, struct proc *p)
 {
 	struct pfs_vncache_head *bucket;
 	struct pfs_vdata *pvd;
 	struct vnode *vnp;
 	int pid;
 
 	pid = p->p_pid;
 	bucket = PFS_VNCACHE_HASH(pid);
 	/* Quick exit, checked without the mutex held. */
 	if (SLIST_EMPTY(bucket))
 		return;
 
 	for (;;) {
 		mtx_lock(&pfs_vncache_mutex);
 		/* Find the first entry for this pid; pvd is NULL if none. */
 		SLIST_FOREACH(pvd, bucket, pvd_hash) {
 			if (pvd->pvd_pid == pid)
 				break;
 		}
 		if (pvd == NULL) {
 			mtx_unlock(&pfs_vncache_mutex);
 			return;
 		}
 		vnp = pvd->pvd_vnode;
 		vhold(vnp);
 		mtx_unlock(&pfs_vncache_mutex);
 		pfs_purge_one(vnp);
 	}
 }
Index: projects/clang1100-import/sys/fs/smbfs/smbfs_node.c
===================================================================
--- projects/clang1100-import/sys/fs/smbfs/smbfs_node.c	(revision 364278)
+++ projects/clang1100-import/sys/fs/smbfs/smbfs_node.c	(revision 364279)
@@ -1,408 +1,408 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2000-2001 Boris Popov
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fnv_hash.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 
 #include <netsmb/smb.h>
 #include <netsmb/smb_conn.h>
 #include <netsmb/smb_subr.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 /*#include <vm/vm_page.h>
 #include <vm/vm_object.h>*/
 
 #include <fs/smbfs/smbfs.h>
 #include <fs/smbfs/smbfs_node.h>
 #include <fs/smbfs/smbfs_subr.h>
 
 extern struct vop_vector smbfs_vnodeops;	/* XXX -> .h file */
 
 /* Malloc types for struct smbnode and for node name / path strings. */
 static MALLOC_DEFINE(M_SMBNODE, "smbufs_node", "SMBFS vnode private part");
 static MALLOC_DEFINE(M_SMBNODENAME, "smbufs_nname", "SMBFS node name");
 
 u_int32_t __inline
 smbfs_hash(const u_char *name, int nmlen)
 {
 	return (fnv_32_buf(name, nmlen, FNV1_32_INIT)); 
 }
 
 /*
  * Return a NUL-terminated copy of the first nmlen bytes of name,
  * allocated from M_SMBNODENAME with M_WAITOK.
  */
 static char *
 smbfs_name_alloc(const u_char *name, int nmlen)
 {
 	u_char *copy;
 
 	/* One extra byte for the terminator. */
 	copy = malloc(nmlen + 1, M_SMBNODENAME, M_WAITOK);
 	bcopy(name, copy, nmlen);
 	copy[nmlen] = 0;
 	return copy;
 }
 
 /*
  * Free a name buffer allocated from M_SMBNODENAME.
  */
 static void
 smbfs_name_free(u_char *name)
 {
 
 	free(name, M_SMBNODENAME);
 }
 
 /*
  * Comparison callback passed to vfs_hash_get() and vfs_hash_insert().
  * Returns 0 when the vnode's smbnode has the same parent, name length
  * and name bytes as the struct smbcmp in _sc, and 1 otherwise.
  */
 static int __inline
 smbfs_vnode_cmp(struct vnode *vp, void *_sc) 
 {
 	struct smbnode *node = vp->v_data;
 	struct smbcmp *key = _sc;
 
 	if (node->n_parent != key->n_parent)
 		return (1);
 	if (node->n_nmlen != key->n_nmlen)
 		return (1);
 	return (bcmp(key->n_name, node->n_name, key->n_nmlen) != 0);
 }
 
 /*
  * Find or create the vnode for entry "name" (nmlen bytes) in directory
  * dvp on mount mp.
  *
  * dirnm/dirlen give the parent's path and sep the separator placed
  * between it and name ('\0' for none).  fap supplies the attributes for
  * a new node; when it is NULL only an existing vnode is looked up and
  * ENOENT is returned if none is hashed.  On success 0 is returned and
  * *vpp holds the vnode.  Returns EINVAL for ".", for ".." without a
  * parent, for a second root allocation, or for a parent vnode with no
  * smbnode; otherwise the error of the failing call.
  */
 static int
 smbfs_node_alloc(struct mount *mp, struct vnode *dvp, const char *dirnm, 
 	int dirlen, const char *name, int nmlen, char sep, 
 	struct smbfattr *fap, struct vnode **vpp)
 {
 	struct vattr vattr;
 	struct thread *td = curthread;	/* XXX */
 	struct smbmount *smp = VFSTOSMBFS(mp);
 	struct smbnode *np, *dnp;
 	struct vnode *vp, *vp2;
 	struct smbcmp sc;
 	char *p, *rpath;
 	int error, rplen;
 
 	/* Key handed to smbfs_vnode_cmp() by the vfs_hash calls below. */
 	sc.n_parent = dvp;
 	sc.n_nmlen = nmlen;
 	sc.n_name = name;	
 	if (smp->sm_root != NULL && dvp == NULL) {
 		SMBERROR("do not allocate root vnode twice!\n");
 		return EINVAL;
 	}
 	if (nmlen == 2 && bcmp(name, "..", 2) == 0) {
 		if (dvp == NULL)
 			return EINVAL;
 		/* ".." resolves to the vnode of the parent's parent node. */
 		vp = VTOSMB(VTOSMB(dvp)->n_parent)->n_vnode;
-		error = vget(vp, LK_EXCLUSIVE, td);
+		error = vget(vp, LK_EXCLUSIVE);
 		if (error == 0)
 			*vpp = vp;
 		return error;
 	} else if (nmlen == 1 && name[0] == '.') {
 		SMBERROR("do not call me with dot!\n");
 		return EINVAL;
 	}
 	dnp = dvp ? VTOSMB(dvp) : NULL;
 	if (dnp == NULL && dvp != NULL) {
 		vn_printf(dvp, "smbfs_node_alloc: dead parent vnode ");
 		return EINVAL;
 	}
 	error = vfs_hash_get(mp, smbfs_hash(name, nmlen), LK_EXCLUSIVE, td,
 	    vpp, smbfs_vnode_cmp, &sc);
 	if (error)
 		return (error);
 	if (*vpp) {
 		np = VTOSMB(*vpp);
 		/* Force cached attributes to be refreshed if stale. */
 		(void)VOP_GETATTR(*vpp, &vattr, td->td_ucred);
 		/*
 		 * If the file type on the server is inconsistent with
 		 * what it was when we created the vnode, kill the
 		 * bogus vnode now and fall through to the code below
 		 * to create a new one with the right type.
 		 */
 		if (((*vpp)->v_type == VDIR && 
 		    (np->n_dosattr & SMB_FA_DIR) == 0) ||
 	    	    ((*vpp)->v_type == VREG && 
 		    (np->n_dosattr & SMB_FA_DIR) != 0)) {
 			vgone(*vpp);
 			vput(*vpp);
 		}
 		else {
 			SMBVDEBUG("vnode taken from the hashtable\n");
 			return (0);
 		}
 	}
 	/*
 	 * If we don't have node attributes, then it is an explicit lookup
 	 * for an existing vnode.
 	 */
 	if (fap == NULL)
 		return ENOENT;
 
 	error = getnewvnode("smbfs", mp, &smbfs_vnodeops, vpp);
 	if (error)
 		return (error);
 	vp = *vpp;
 	np = malloc(sizeof *np, M_SMBNODE, M_WAITOK | M_ZERO);
 	/* Build the node's path: dirnm, optional separator, then name. */
 	rplen = dirlen;
 	if (sep != '\0')
 		rplen++;
 	rplen += nmlen;
 	rpath = malloc(rplen + 1, M_SMBNODENAME, M_WAITOK);
 	p = rpath;
 	bcopy(dirnm, p, dirlen);
 	p += dirlen;
 	if (sep != '\0')
 		*p++ = sep;
 	if (name != NULL) {
 		bcopy(name, p, nmlen);
 		p += nmlen;
 	}
 	*p = '\0';
 	MPASS(p == rpath + rplen);
 	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
 	/* Vnode initialization */
 	vp->v_type = fap->fa_attr & SMB_FA_DIR ? VDIR : VREG;
 	vp->v_data = np;
 	np->n_vnode = vp;
 	np->n_mount = VFSTOSMBFS(mp);
 	np->n_rpath = rpath;
 	np->n_rplen = rplen;
 	np->n_nmlen = nmlen;
 	np->n_name = smbfs_name_alloc(name, nmlen);
 	np->n_ino = fap->fa_ino;
 	if (dvp) {
 		ASSERT_VOP_LOCKED(dvp, "smbfs_node_alloc");
 		np->n_parent = dvp;
 		np->n_parentino = VTOSMB(dvp)->n_ino;
 		/*
 		 * Hold a reference on a non-root parent; NREFPARENT tells
 		 * smbfs_reclaim() to release it.
 		 */
 		if (/*vp->v_type == VDIR &&*/ (dvp->v_vflag & VV_ROOT) == 0) {
 			vref(dvp);
 			np->n_flag |= NREFPARENT;
 		}
 	} else if (vp->v_type == VREG)
 		SMBERROR("new vnode '%s' born without parent ?\n", np->n_name);
 	error = insmntque(vp, mp);
 	if (error) {
 		/*
 		 * NOTE(review): only np is freed on this path; rpath,
 		 * np->n_name and the parent reference taken above are not
 		 * released here - confirm whether that is a leak.
 		 */
 		free(np, M_SMBNODE);
 		return (error);
 	}
 	error = vfs_hash_insert(vp, smbfs_hash(name, nmlen), LK_EXCLUSIVE,
 	    td, &vp2, smbfs_vnode_cmp, &sc);
 	if (error) 
 		return (error);
 	/* A non-NULL vp2 from vfs_hash_insert() is returned instead of vp. */
 	if (vp2 != NULL)
 		*vpp = vp2;
 	return (0);
 }
 
 int
 smbfs_nget(struct mount *mp, struct vnode *dvp, const char *name, int nmlen,
 	struct smbfattr *fap, struct vnode **vpp)
 {
 	struct smbnode *dnp, *np;
 	struct vnode *vp;
 	int error, sep;
 
 	dnp = (dvp) ? VTOSMB(dvp) : NULL;
 	sep = 0;
 	if (dnp != NULL) {
 		sep = SMBFS_DNP_SEP(dnp); 
 		error = smbfs_node_alloc(mp, dvp, dnp->n_rpath, dnp->n_rplen, 
 		    name, nmlen, sep, fap, &vp); 
 	} else
 		error = smbfs_node_alloc(mp, NULL, "\\", 1, name, nmlen, 
 		    sep, fap, &vp); 
 	if (error)
 		return error;
 	MPASS(vp != NULL);
 	np = VTOSMB(vp);
 	if (fap)
 		smbfs_attr_cacheenter(vp, fap);
 	*vpp = vp;
 	return 0;
 }
 
 /*
  * Reclaim a vnode: remove it from the vfs hash, free the smbnode with its
  * name and path strings, and release the parent reference if the node
  * held one (NREFPARENT).  Always returns 0.
  */
 int
 smbfs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct smbnode *np = VTOSMB(vp);
 	struct smbmount *smp = VTOSMBFS(vp);
 	struct vnode *parentvp;
 
 	SMBVDEBUG("%s,%d\n", np->n_name, vrefcnt(vp));
 
 	KASSERT((np->n_flag & NOPEN) == 0, ("file not closed before reclaim"));
 
 	/* Remember the referenced parent before np is freed. */
 	parentvp = NULL;
 	if (np->n_parent && (np->n_flag & NREFPARENT))
 		parentvp = np->n_parent;
 
 	/* Unhash the vnode, then release everything hanging off np. */
 	vfs_hash_remove(vp);
 	if (np->n_name)
 		smbfs_name_free(np->n_name);
 	if (np->n_rpath)
 		free(np->n_rpath, M_SMBNODENAME);
 	free(np, M_SMBNODE);
 	vp->v_data = NULL;
 
 	if (parentvp == NULL)
 		return 0;
 
 	vrele(parentvp);
 	/*
 	 * Indicate that we released something; see comment
 	 * in smbfs_unmount().
 	 */
 	smp->sm_didrele = 1;
 	return 0;
 }
 
 /*
  * Called when the vnode becomes inactive.
  *
  * If the node is still marked NOPEN, its buffers are invalidated and the
  * server-side handle is closed (file close for VREG, directory search
  * close for VDIR), NOPEN is cleared and the attribute cache entry is
  * dropped.  A node marked NGONE is then recycled.  Always returns 0.
  */
 int
 smbfs_inactive(struct vop_inactive_args *ap)
 {
 	struct thread *td = ap->a_td;
 	struct ucred *cred = td->td_ucred;
 	struct vnode *vp = ap->a_vp;
 	struct smbnode *np = VTOSMB(vp);
 	struct smb_cred *scred;
 	struct vattr va;
 
 	SMBVDEBUG("%s: %d\n", VTOSMB(vp)->n_name, vrefcnt(vp));
 	if ((np->n_flag & NOPEN) != 0) {
 		scred = smbfs_malloc_scred();
 		smb_makescred(scred, td, cred);
 		smbfs_vinvalbuf(vp, td);
 		switch (vp->v_type) {
 		case VREG:
 			VOP_GETATTR(vp, &va, cred);
 			smbfs_smb_close(np->n_mount->sm_share, np->n_fid,
 			    &np->n_mtime, scred);
 			break;
 		case VDIR:
 			if (np->n_dirseq != NULL) {
 				smbfs_findclose(np->n_dirseq, scred);
 				np->n_dirseq = NULL;
 			}
 			break;
 		default:
 			break;
 		}
 		np->n_flag &= ~NOPEN;
 		smbfs_attr_cacheremove(vp);
 		smbfs_free_scred(scred);
 	}
 	if ((np->n_flag & NGONE) != 0)
 		vrecycle(vp);
 	return (0);
 }
 /*
  * Attribute cache maintenance.
  * smbfs_attr_cacheenter: store the attributes in fap into the smbnode.
  *
  * For a regular file the cached size (and the pager size) is updated
  * when it differs from fap; a directory gets a fixed size.  Any other
  * vnode type is ignored.  The modification time, DOS attributes and the
  * cache timestamp are then recorded.
  */
 void
 smbfs_attr_cacheenter(struct vnode *vp, struct smbfattr *fap)
 {
 	struct smbnode *np = VTOSMB(vp);
 
 	switch (vp->v_type) {
 	case VREG:
 		if (np->n_size != fap->fa_size) {
 			np->n_size = fap->fa_size;
 			vnode_pager_setsize(vp, np->n_size);
 		}
 		break;
 	case VDIR:
 		np->n_size = 16384; 		/* should be a better way ... */
 		break;
 	default:
 		return;
 	}
 	np->n_mtime = fap->fa_mtime;
 	np->n_dosattr = fap->fa_attr;
 	np->n_attrage = time_second;
 }
 
 /*
  * Fill *va from the smbnode's cached attributes.
  *
  * Returns ENOENT when the cache entry is more than 2 seconds old, EINVAL
  * for a vnode that is neither VREG nor VDIR (va_type and va_flags have
  * already been written by then), and 0 otherwise.
  */
 int
 smbfs_attr_cachelookup(struct vnode *vp, struct vattr *va)
 {
 	struct smbnode *np = VTOSMB(vp);
 	struct smbmount *smp = VTOSMBFS(vp);
 	int age;
 
 	age = time_second - np->n_attrage;
 	if (age > 2)	/* XXX should be configurable */
 		return ENOENT;
 
 	va->va_type = vp->v_type;		/* vnode type (for create) */
 	va->va_flags = 0;			/* flags defined for file */
 	switch (vp->v_type) {
 	case VREG:
 		/* File mode comes from the mount; read-only drops write bits. */
 		va->va_mode = smp->sm_file_mode;
 		if (np->n_dosattr & SMB_FA_RDONLY) {
 			va->va_mode &= ~(S_IWUSR|S_IWGRP|S_IWOTH);
 			va->va_flags |= UF_READONLY;
 		}
 		break;
 	case VDIR:
 		va->va_mode = smp->sm_dir_mode;
 		break;
 	default:
 		return EINVAL;
 	}
 
 	va->va_size = np->n_size;
 	va->va_nlink = 1;		/* number of references to file */
 	va->va_uid = smp->sm_uid;	/* owner user id */
 	va->va_gid = smp->sm_gid;	/* owner group id */
 	va->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	va->va_fileid = np->n_ino;	/* file id */
 	if (va->va_fileid == 0)
 		va->va_fileid = 2;
 	va->va_blocksize = SSTOVC(smp->sm_share)->vc_txmax;
 	va->va_mtime = np->n_mtime;
 	/* Access and change times mirror the modification time. */
 	va->va_atime = va->va_ctime = va->va_mtime;
 	va->va_gen = VNOVAL;		/* generation number of file */
 
 	/* Map DOS attribute bits onto file flags. */
 	if (np->n_dosattr & SMB_FA_HIDDEN)
 		va->va_flags |= UF_HIDDEN;
 	if (np->n_dosattr & SMB_FA_SYSTEM)
 		va->va_flags |= UF_SYSTEM;
 	/*
 	 * We don't set the archive bit for directories.
 	 */
 	if ((vp->v_type != VDIR) && (np->n_dosattr & SMB_FA_ARCHIVE))
 		va->va_flags |= UF_ARCHIVE;
 
 	va->va_rdev = NODEV;		/* device the special file represents */
 	va->va_bytes = va->va_size;	/* bytes of disk space held by file */
 	va->va_filerev = 0;		/* file modification number */
 	va->va_vaflags = 0;		/* operations flags */
 	return 0;
 }
Index: projects/clang1100-import/sys/fs/smbfs/smbfs_vfsops.c
===================================================================
--- projects/clang1100-import/sys/fs/smbfs/smbfs_vfsops.c	(revision 364278)
+++ projects/clang1100-import/sys/fs/smbfs/smbfs_vfsops.c	(revision 364279)
@@ -1,411 +1,411 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2000-2001 Boris Popov
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/stat.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/sx.h>
 
 
 #include <netsmb/smb.h>
 #include <netsmb/smb_conn.h>
 #include <netsmb/smb_subr.h>
 #include <netsmb/smb_dev.h>
 
 #include <fs/smbfs/smbfs.h>
 #include <fs/smbfs/smbfs_node.h>
 #include <fs/smbfs/smbfs_subr.h>
 
 /* Debug level; writable at run time through the "debuglevel" sysctl below. */
 static int smbfs_debuglevel = 0;
 
 /*
  * Copy of SMBFS_VERSION exported read-only (CTLFLAG_RD) through the
  * "version" sysctl below.  smbfs_cmount() compares the same constant
  * against the version carried in the mount arguments.
  */
 static int smbfs_version = SMBFS_VERSION;
 
 /* Sysctl node "smbfs" under _vfs, followed by its two integer leaves. */
 SYSCTL_NODE(_vfs, OID_AUTO, smbfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "SMB/CIFS filesystem");
 SYSCTL_INT(_vfs_smbfs, OID_AUTO, version, CTLFLAG_RD, &smbfs_version, 0, "");
 SYSCTL_INT(_vfs_smbfs, OID_AUTO, debuglevel, CTLFLAG_RW, &smbfs_debuglevel, 0, "");
 
 /* Forward declarations of the VFS operations implemented in this file. */
 static vfs_init_t       smbfs_init;
 static vfs_uninit_t     smbfs_uninit;
 static vfs_cmount_t     smbfs_cmount;
 static vfs_mount_t      smbfs_mount;
 static vfs_root_t       smbfs_root;
 static vfs_quotactl_t   smbfs_quotactl;
 static vfs_statfs_t     smbfs_statfs;
 static vfs_unmount_t    smbfs_unmount;
 
 /*
  * Operations vector for smbfs.  Every entry except vfs_sync points at a
  * function defined in this file; vfs_sync uses vfs_stdsync.
  */
 static struct vfsops smbfs_vfsops = {
 	.vfs_init =		smbfs_init,
 	.vfs_cmount =		smbfs_cmount,
 	.vfs_mount =		smbfs_mount,
 	.vfs_quotactl =		smbfs_quotactl,
 	.vfs_root =		smbfs_root,
 	.vfs_statfs =		smbfs_statfs,
 	.vfs_sync =		vfs_stdsync,
 	.vfs_uninit =		smbfs_uninit,
 	.vfs_unmount =		smbfs_unmount,
 };
 
 
 /* Register the operations vector under the name "smbfs" (VFCF_NETWORK). */
 VFS_SET(smbfs_vfsops, smbfs, VFCF_NETWORK);
 
 /* Module dependencies: netsmb, libiconv and libmchain. */
 MODULE_DEPEND(smbfs, netsmb, NSMB_VERSION, NSMB_VERSION, NSMB_VERSION);
 MODULE_DEPEND(smbfs, libiconv, 1, 1, 2);
 MODULE_DEPEND(smbfs, libmchain, 1, 1, 1);
 
 /* pbuf zone; created in smbfs_init() and destroyed in smbfs_uninit(). */
 uma_zone_t smbfs_pbuf_zone;
 
 /*
  * vfs_cmount handler (see smbfs_vfsops): copy a struct smbfs_args in from
  * the caller-supplied pointer 'data', check its version, translate each
  * field into a named mount argument on 'ma' and pass the result to
  * kernel_mount().
  *
  * Returns the copyin() error, EINVAL when args.version differs from
  * SMBFS_VERSION, or otherwise the value returned by kernel_mount().
  */
 static int
 smbfs_cmount(struct mntarg *ma, void * data, uint64_t flags)
 {
 	struct smbfs_args args;
 	int error;
 
 	error = copyin(data, &args, sizeof(struct smbfs_args));
 	if (error)
 		return error;
 
 	/* Refuse argument blocks built for a different SMBFS_VERSION. */
 	if (args.version != SMBFS_VERSION) {
 		printf("mount version mismatch: kernel=%d, mount=%d\n",
 		    SMBFS_VERSION, args.version);
 		return EINVAL;
 	}
 	ma = mount_argf(ma, "dev", "%d", args.dev);
 	/*
 	 * Boolean options.  The first four pass the flag bit as-is; "nolong"
 	 * passes the negation of SMBFS_MOUNT_NO_LONG.
 	 */
 	ma = mount_argb(ma, args.flags & SMBFS_MOUNT_SOFT, "nosoft");
 	ma = mount_argb(ma, args.flags & SMBFS_MOUNT_INTR, "nointr");
 	ma = mount_argb(ma, args.flags & SMBFS_MOUNT_STRONG, "nostrong");
 	ma = mount_argb(ma, args.flags & SMBFS_MOUNT_HAVE_NLS, "nohave_nls");
 	ma = mount_argb(ma, !(args.flags & SMBFS_MOUNT_NO_LONG), "nolong");
 	ma = mount_arg(ma, "rootpath", args.root_path, -1);
 	/* Numeric options are passed as decimal strings. */
 	ma = mount_argf(ma, "uid", "%d", args.uid);
 	ma = mount_argf(ma, "gid", "%d", args.gid);
 	ma = mount_argf(ma, "file_mode", "%d", args.file_mode);
 	ma = mount_argf(ma, "dir_mode", "%d", args.dir_mode);
 	ma = mount_argf(ma, "caseopt", "%d", args.caseopt);
 
 	error = kernel_mount(ma, flags);
 
 	return (error);
 }
 
 /*
  * NULL-terminated list of option names handed to vfs_filteropt() in
  * smbfs_mount(); a non-zero result there fails the mount with
  * "Invalid option".
  */
 static const char *smbfs_opts[] = {
 	"fd", "soft", "intr", "strong", "have_nls", "long",
 	"mountpoint", "rootpath", "uid", "gid", "file_mode", "dir_mode",
 	"caseopt", "errmsg", NULL
 };
 
 /*
  * vfs_mount handler: attach the SMB share identified by the "fd" mount
  * option to the mount point 'mp'.
  *
  * Updates and root mounts (MNT_UPDATE, MNT_ROOTFS) are refused with
  * EOPNOTSUPP.  The options "fd", "caseopt", "uid", "gid", "file_mode" and
  * "dir_mode" must all parse as integers, otherwise EINVAL is returned.
  * On success a zeroed struct smbmount is installed in mp->mnt_data, the
  * "from" name in mp->mnt_stat is filled in and the root vnode has been
  * obtained through smbfs_root() and unlocked again.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 smbfs_mount(struct mount *mp)
 {
 	struct smbmount *smp = NULL;
 	struct smb_vc *vcp;
 	struct smb_share *ssp = NULL;
 	struct vnode *vp;
 	struct thread *td;
 	struct smb_dev *dev;
 	struct smb_cred *scred;
 	int error, v;
 	char *pc, *pe;
 
 	dev = NULL;
 	td = curthread;
 	if (mp->mnt_flag & (MNT_UPDATE | MNT_ROOTFS))
 		return EOPNOTSUPP;
 
 	if (vfs_filteropt(mp->mnt_optnew, smbfs_opts)) {
 		vfs_mount_error(mp, "%s", "Invalid option");
 		return (EINVAL);
 	}
 
 	/* From here on every exit path must release scred. */
 	scred = smbfs_malloc_scred();
 	smb_makescred(scred, td, td->td_ucred);
 	
 	/* Ask userspace of `fd`, the file descriptor of this session */
 	if (1 != vfs_scanopt(mp->mnt_optnew, "fd", "%d", &v)) {
 		vfs_mount_error(mp, "No fd option");
 		smbfs_free_scred(scred);
 		return (EINVAL);
 	}
 	error = smb_dev2share(v, SMBM_EXEC, scred, &ssp, &dev);
 	/*
 	 * smp is allocated before 'error' is examined, so the failure
 	 * branch below has to free it again.
 	 */
 	smp = malloc(sizeof(*smp), M_SMBFSDATA, M_WAITOK | M_ZERO);
 	if (error) {
 		printf("invalid device handle %d (%d)\n", v, error);
 		vfs_mount_error(mp, "invalid device handle %d %d\n", v, error);
 		smbfs_free_scred(scred);
 		free(smp, M_SMBFSDATA);
 		return error;
 	}
 	vcp = SSTOVC(ssp);
 	/* Presumably smb_dev2share() returned the share locked -- confirm. */
 	smb_share_unlock(ssp);
 	mp->mnt_stat.f_iosize = SSTOVC(ssp)->vc_txmax;
 	mp->mnt_data = smp;
 	smp->sm_share = ssp;
 	smp->sm_root = NULL;
 	smp->sm_dev = dev;
 	/* Numeric options; each one is mandatory. */
 	if (1 != vfs_scanopt(mp->mnt_optnew,
 	    "caseopt", "%d", &smp->sm_caseopt)) {
 		vfs_mount_error(mp, "Invalid caseopt");
 		error = EINVAL;
 		goto bad;
 	}
 	if (1 != vfs_scanopt(mp->mnt_optnew, "uid", "%d", &v)) {
 		vfs_mount_error(mp, "Invalid uid");
 		error = EINVAL;
 		goto bad;
 	}
 	smp->sm_uid = v;
 
 	if (1 != vfs_scanopt(mp->mnt_optnew, "gid", "%d", &v)) {
 		vfs_mount_error(mp, "Invalid gid");
 		error = EINVAL;
 		goto bad;
 	}
 	smp->sm_gid = v;
 
 	if (1 != vfs_scanopt(mp->mnt_optnew, "file_mode", "%d", &v)) {
 		vfs_mount_error(mp, "Invalid file_mode");
 		error = EINVAL;
 		goto bad;
 	}
 	/* Keep only the rwx permission bits and force the file type bits. */
 	smp->sm_file_mode = (v & (S_IRWXU|S_IRWXG|S_IRWXO)) | S_IFREG;
 
 	if (1 != vfs_scanopt(mp->mnt_optnew, "dir_mode", "%d", &v)) {
 		vfs_mount_error(mp, "Invalid dir_mode");
 		error = EINVAL;
 		goto bad;
 	}
 	smp->sm_dir_mode  = (v & (S_IRWXU|S_IRWXG|S_IRWXO)) | S_IFDIR;
 
 	vfs_flagopt(mp->mnt_optnew,
 	    "nolong", &smp->sm_flags, SMBFS_MOUNT_NO_LONG);
 
 	/*
 	 * Build f_mntfromname as "//username@srvname/sharename".  The
 	 * buffer is zeroed first and each strncpy() is bounded two bytes
 	 * short of the remaining space, so the result stays NUL-terminated
 	 * even when a component is truncated.
 	 */
 	pc = mp->mnt_stat.f_mntfromname;
 	pe = pc + sizeof(mp->mnt_stat.f_mntfromname);
 	bzero(pc, MNAMELEN);
 	*pc++ = '/';
 	*pc++ = '/';
 	pc = strchr(strncpy(pc, vcp->vc_username, pe - pc - 2), 0);
 	if (pc < pe-1) {
 		*(pc++) = '@';
 		pc = strchr(strncpy(pc, vcp->vc_srvname, pe - pc - 2), 0);
 		if (pc < pe - 1) {
 			*(pc++) = '/';
 			strncpy(pc, ssp->ss_name, pe - pc - 2);
 		}
 	}
 	vfs_getnewfsid(mp);
 	/* Instantiate the root vnode now; it comes back locked. */
 	error = smbfs_root(mp, LK_EXCLUSIVE, &vp);
 	if (error) {
 		vfs_mount_error(mp, "smbfs_root error: %d", error);
 		goto bad;
 	}
 	VOP_UNLOCK(vp);
 	SMBVDEBUG("root.v_usecount = %d\n", vrefcnt(vp));
 
 #ifdef DIAGNOSTIC
 	SMBERROR("mp=%p\n", mp);
 #endif
 	smbfs_free_scred(scred);
 	return error;
 bad:
 	/*
 	 * Failure after the share was obtained: drop the share, the
 	 * credentials, the device and finally smp itself.  Every jump to
 	 * this label has 'error' set to a non-zero value.
 	 * NOTE(review): mp->mnt_data was set to smp above and is not
 	 * cleared here, so it is left pointing at freed memory -- confirm
 	 * no caller looks at it after a failed mount.
 	 */
 	if (ssp)
 		smb_share_put(ssp, scred);
 	smbfs_free_scred(scred);	
 	SMB_LOCK();
 	if (error && smp->sm_dev == dev) {
 		smp->sm_dev = NULL;
 		sdp_trydestroy(dev);
 	}
 	SMB_UNLOCK();
 	free(smp, M_SMBFSDATA);
 	return error;
 }
 
 /*
  * Unmount the filesystem described by mp.
  *
  * MNT_FORCE in 'mntflags' is translated to FORCECLOSE for vflush().
  * Returns 0 on success, the vflush() error if vnodes could not be
  * flushed, or the smb_share_lock() error; in both failure cases the
  * mount data is left in place.
  */
 static int
 smbfs_unmount(struct mount *mp, int mntflags)
 {
 	struct thread *td;
 	struct smbmount *smp = VFSTOSMBFS(mp);
 	struct smb_cred *scred;
 	struct smb_dev *dev;
 	int error, flags;
 
 	SMBVDEBUG("smbfs_unmount: flags=%04x\n", mntflags);
 	td = curthread;
 	flags = 0;
 	if (mntflags & MNT_FORCE)
 		flags |= FORCECLOSE;
 	/*
 	 * Keep trying to flush the vnode list for the mount while 
 	 * some are still busy and we are making progress towards
 	 * making them not busy. This is needed because smbfs vnodes
 	 * reference their parent directory but may appear after their
 	 * parent in the list; one pass over the vnode list is not
 	 * sufficient in this case.
 	 */
 	do {
 		/* Cleared here; a non-zero value after vflush() means retry. */
 		smp->sm_didrele = 0;
 		/* There is 1 extra root vnode reference from smbfs_mount(). */
 		error = vflush(mp, 1, flags, td);
 	} while (error == EBUSY && smp->sm_didrele != 0);
 	if (error)
 		return error;
 	scred = smbfs_malloc_scred();
 	smb_makescred(scred, td, td->td_ucred);
 	error = smb_share_lock(smp->sm_share);
 	if (error)
 		goto out;
 	smb_share_put(smp->sm_share, scred);
 	/* Release the device and detach smp from the mount under SMB_LOCK. */
 	SMB_LOCK();
 	dev = smp->sm_dev;
 	if (!dev)
 		panic("No private data for mount point");
 	sdp_trydestroy(dev);
 	mp->mnt_data = NULL;
 	SMB_UNLOCK();
 	free(smp, M_SMBFSDATA);
 	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 out:
 	smbfs_free_scred(scred);
 	return error;
 }
 
 /* 
  * Return locked root vnode of a filesystem
  *
  * If the root node is already cached in smp->sm_root, its vnode is
  * re-acquired with vget().  Otherwise the root attributes are looked up
  * on the server, a vnode is created through smbfs_nget(), flagged
  * VV_ROOT and cached in smp->sm_root.
  *
  * The 'flags' argument is not consulted: the cached path always locks
  * with LK_EXCLUSIVE | LK_RETRY.
  *
  * Returns 0 with *vpp set, or an errno value from vget(),
  * smbfs_smb_lookup() or smbfs_nget().
  */
 static int
 smbfs_root(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct smbmount *smp = VFSTOSMBFS(mp);
 	struct vnode *vp;
 	struct smbnode *np;
 	struct smbfattr fattr;
 	struct thread *td;
 	struct ucred *cred;
 	struct smb_cred *scred;
 	int error;
 
 	td = curthread;
 	cred = td->td_ucred;
 
 	/* Fast path: the root node has been set up by an earlier call. */
 	if (smp->sm_root) {
 		*vpp = SMBTOV(smp->sm_root);
-		return vget(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+		return vget(*vpp, LK_EXCLUSIVE | LK_RETRY);
 	}
 	scred = smbfs_malloc_scred();
 	smb_makescred(scred, td, cred);
 	/* NULL node and name: query the attributes of the share root. */
 	error = smbfs_smb_lookup(NULL, NULL, 0, &fattr, scred);
 	if (error)
 		goto out;
 	error = smbfs_nget(mp, NULL, NULL, 0, &fattr, &vp);
 	if (error)
 		goto out;
 	ASSERT_VOP_LOCKED(vp, "smbfs_root");
 	vp->v_vflag |= VV_ROOT;
 	np = VTOSMB(vp);
 	smp->sm_root = np;
 	*vpp = vp;
 out:
 	smbfs_free_scred(scred);
 	return error;
 }
 
 /*
  * Do operations associated with quotas, not supported
  *
  * All four arguments are ignored; the function logs a debug message and
  * always returns EOPNOTSUPP.  Note that this is an old-style (K&R)
  * function definition.
  */
 /* ARGSUSED */
 static int
 smbfs_quotactl(mp, cmd, uid, arg)
 	struct mount *mp;
 	int cmd;
 	uid_t uid;
 	void *arg;
 {
 	SMBVDEBUG("return EOPNOTSUPP\n");
 	return EOPNOTSUPP;
 }
 
 /*
  * vfs_init handler: create the "smbpbuf" zone sized at nswbuf / 2 and
  * store it in smbfs_pbuf_zone.  'vfsp' is unused.  Always returns 0.
  */
 /*ARGSUSED*/
 int
 smbfs_init(struct vfsconf *vfsp)
 {
 
 	smbfs_pbuf_zone = pbuf_zsecond_create("smbpbuf", nswbuf / 2);
 	SMBVDEBUG("done.\n");
 	return 0;
 }
 
 /*
  * vfs_uninit handler: destroy the zone created by smbfs_init().
  * 'vfsp' is unused.  Always returns 0.
  */
 /*ARGSUSED*/
 int
 smbfs_uninit(struct vfsconf *vfsp)
 {
 
 	uma_zdestroy(smbfs_pbuf_zone);
 	SMBVDEBUG("done.\n");
 	return 0;
 }
 
 /*
  * smbfs_statfs call
  *
  * Fills sbp->f_iosize from the connection's vc_txmax and lets
  * smbfs_smb_statfs() fill in the rest using credentials built from the
  * current thread.
  *
  * Returns EINVAL when the mount has no root node yet, otherwise the
  * result of smbfs_smb_statfs().
  */
 int
 smbfs_statfs(struct mount *mp, struct statfs *sbp)
 {
 	struct thread *td = curthread;
 	struct smbmount *smp = VFSTOSMBFS(mp);
 	struct smbnode *np = smp->sm_root;
 	struct smb_share *ssp = smp->sm_share;
 	struct smb_cred *scred;
 	int error;
 
 	/* np is used only for this check. */
 	if (np == NULL) {
 		vfs_mount_error(mp, "np == NULL");
 		return EINVAL;
 	}
 	
 	sbp->f_iosize = SSTOVC(ssp)->vc_txmax;		/* optimal transfer block size */
 	scred = smbfs_malloc_scred();
 	smb_makescred(scred, td, td->td_ucred);
 	error = smbfs_smb_statfs(ssp, sbp, scred);
 	smbfs_free_scred(scred);
 	return (error);
 }
Index: projects/clang1100-import/sys/fs/tmpfs/tmpfs_subr.c
===================================================================
--- projects/clang1100-import/sys/fs/tmpfs/tmpfs_subr.c	(revision 364278)
+++ projects/clang1100-import/sys/fs/tmpfs/tmpfs_subr.c	(revision 364279)
@@ -1,1939 +1,1938 @@
 /*	$NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-NetBSD
  *
  * Copyright (c) 2005 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
  * 2005 program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * Efficient memory file system supporting functions.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/dirent.h>
 #include <sys/fnv_hash.h>
 #include <sys/lock.h>
 #include <sys/limits.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/random.h>
 #include <sys/rwlock.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/swap_pager.h>
 
 #include <fs/tmpfs/tmpfs.h>
 #include <fs/tmpfs/tmpfs_fifoops.h>
 #include <fs/tmpfs/tmpfs_vnops.h>
 
 /* Sysctl node "tmpfs" under _vfs. */
 SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "tmpfs file system");
 
 /*
  * Reserve, in pages, subtracted in tmpfs_mem_avail().  Adjustable through
  * the memory_reserved sysctl, which never lets it drop below
  * TMPFS_PAGES_MINRESERVED.
  */
 static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED;
 
 /* UMA zones for directory entries and nodes; see tmpfs_subr_init(). */
 static uma_zone_t tmpfs_dirent_pool;
 static uma_zone_t tmpfs_node_pool;
 VFS_SMR_DECLARE;
 
 /*
  * Constructor registered for tmpfs_node_pool in tmpfs_subr_init().
  * Bumps the node's generation number and resets its size, status, flags,
  * link count, vnode pointer and vnode state.  'size', 'arg' and 'flags'
  * are unused.  Always returns 0.
  */
 static int
 tmpfs_node_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct tmpfs_node *node;
 
 	node = mem;
 	/* tn_gen was seeded in tmpfs_node_init() and is not reset here. */
 	node->tn_gen++;
 	node->tn_size = 0;
 	node->tn_status = 0;
 	node->tn_flags = 0;
 	node->tn_links = 0;
 	node->tn_vnode = NULL;
 	node->tn_vpstate = 0;
 	return (0);
 }
 
 /*
  * Destructor registered for tmpfs_node_pool in tmpfs_subr_init().
  * Marks the node as having no type (VNON).  'size' and 'arg' are unused.
  */
 static void
 tmpfs_node_dtor(void *mem, int size, void *arg)
 {
 	struct tmpfs_node *node;
 
 	node = mem;
 	node->tn_type = VNON;
 }
 
 /*
  * Item-init callback registered for tmpfs_node_pool in tmpfs_subr_init().
  * Zeroes tn_id, initializes the node interlock mutex and seeds tn_gen
  * with a random value.  'size' and 'flags' are unused.  Always returns 0.
  */
 static int
 tmpfs_node_init(void *mem, int size, int flags)
 {
 	struct tmpfs_node *node;
 
 	node = mem;
 	node->tn_id = 0;
 	mtx_init(&node->tn_interlock, "tmpfsni", NULL, MTX_DEF);
 	node->tn_gen = arc4random();
 	return (0);
 }
 
 /*
  * Item-fini callback registered for tmpfs_node_pool in tmpfs_subr_init().
  * Destroys the interlock mutex set up by tmpfs_node_init().  'size' is
  * unused.
  */
 static void
 tmpfs_node_fini(void *mem, int size)
 {
 	struct tmpfs_node *node;
 
 	node = mem;
 	mtx_destroy(&node->tn_interlock);
 }
 
 /*
  * Create the two UMA zones used by this file: a plain zone for directory
  * entries and a zone for nodes that carries the ctor/dtor/init/fini
  * callbacks defined above.  The node zone is additionally passed to
  * VFS_SMR_ZONE_SET().
  */
 void
 tmpfs_subr_init(void)
 {
 	tmpfs_dirent_pool = uma_zcreate("TMPFS dirent",
 	    sizeof(struct tmpfs_dirent), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 	tmpfs_node_pool = uma_zcreate("TMPFS node",
 	    sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor,
 	    tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0);
 	VFS_SMR_ZONE_SET(tmpfs_node_pool);
 }
 
 /* Destroy the zones created by tmpfs_subr_init(). */
 void
 tmpfs_subr_uninit(void)
 {
 	uma_zdestroy(tmpfs_node_pool);
 	uma_zdestroy(tmpfs_dirent_pool);
 }
 
 /*
  * Sysctl handler for a long page count stored at arg1.  The value is
  * presented to the sysctl interface in bytes (pages * PAGE_SIZE).
  *
  * On a write the new byte value is converted back to pages with integer
  * division, so any remainder below PAGE_SIZE is discarded.  A result
  * below TMPFS_PAGES_MINRESERVED is rejected with EINVAL and leaves the
  * stored value untouched.
  *
  * Returns 0 on success, EINVAL as described, or the error from
  * sysctl_handle_long().
  */
 static int
 sysctl_mem_reserved(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	long pages, bytes;
 
 	pages = *(long *)arg1;
 	bytes = pages * PAGE_SIZE;
 
 	error = sysctl_handle_long(oidp, &bytes, 0, req);
 	/* Stop here on error or when no new value was supplied. */
 	if (error || !req->newptr)
 		return (error);
 
 	pages = bytes / PAGE_SIZE;
 	if (pages < TMPFS_PAGES_MINRESERVED)
 		return (EINVAL);
 
 	*(long *)arg1 = pages;
 	return (0);
 }
 
 /*
  * "memory_reserved" sysctl under _vfs_tmpfs: read-write long backed by
  * tmpfs_pages_reserved and serviced by sysctl_mem_reserved().
  */
 SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_reserved,
     CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &tmpfs_pages_reserved, 0,
     sysctl_mem_reserved, "L",
     "Amount of available memory and swap below which tmpfs growth stops");
 
 /*
  * Forward declaration of the directory-entry comparison function and the
  * static prototypes of the red-black tree keyed by it.
  */
 static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a,
     struct tmpfs_dirent *b);
 RB_PROTOTYPE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp);
 
 /*
  * Return swap_pager_avail plus the free page count minus
  * tmpfs_pages_reserved, clamped at zero.  The arithmetic is done in a
  * signed type so that a negative result can be detected before the
  * conversion to size_t.  The result is compared against page counts in
  * tmpfs_pages_check_avail().
  */
 size_t
 tmpfs_mem_avail(void)
 {
 	vm_ooffset_t avail;
 
 	avail = swap_pager_avail + vm_free_count() - tmpfs_pages_reserved;
 	if (__predict_false(avail < 0))
 		avail = 0;
 	return (avail);
 }
 
 /*
  * Return the number of pages accounted to the mount 'tmp': the data pages
  * recorded in tm_pages_used plus an estimate for metadata.  The estimate
  * charges one struct tmpfs_node and one struct tmpfs_dirent per node in
  * use and rounds the byte total up to whole pages.
  */
 size_t
 tmpfs_pages_used(struct tmpfs_mount *tmp)
 {
 	const size_t node_size = sizeof(struct tmpfs_node) +
 	    sizeof(struct tmpfs_dirent);
 	size_t meta_pages;
 
 	/* The product is computed in uintmax_t before the division. */
 	meta_pages = howmany((uintmax_t)tmp->tm_nodes_inuse * node_size,
 	    PAGE_SIZE);
 	return (meta_pages + tmp->tm_pages_used);
 }
 
 /*
  * Check whether 'req_pages' more pages may be used by the mount 'tmp'.
  *
  * Returns 0 when tmpfs_mem_avail() is smaller than the request or when
  * the request would push the mount past tm_pages_max, and 1 otherwise.
  * A tm_pages_max of ULONG_MAX disables the per-mount limit check.
  */
 static size_t
 tmpfs_pages_check_avail(struct tmpfs_mount *tmp, size_t req_pages)
 {
 	if (tmpfs_mem_avail() < req_pages)
 		return (0);
 
 	if (tmp->tm_pages_max != ULONG_MAX &&
 	    tmp->tm_pages_max < req_pages + tmpfs_pages_used(tmp))
 			return (0);
 
 	return (1);
 }
 
 /*
  * Take an additional reference on 'node'.  Acquires and releases the node
  * lock around tmpfs_ref_node_locked().
  */
 void
 tmpfs_ref_node(struct tmpfs_node *node)
 {
 
 	TMPFS_NODE_LOCK(node);
 	tmpfs_ref_node_locked(node);
 	TMPFS_NODE_UNLOCK(node);
 }
 
 /*
  * Take an additional reference on 'node'; the caller must hold the node
  * lock.  The assertions require that the node already has at least one
  * reference and that the counter is below UINT_MAX.
  */
 void
 tmpfs_ref_node_locked(struct tmpfs_node *node)
 {
 
 	TMPFS_NODE_ASSERT_LOCKED(node);
 	KASSERT(node->tn_refcount > 0, ("node %p zero refcount", node));
 	KASSERT(node->tn_refcount < UINT_MAX, ("node %p refcount %u", node,
 	    node->tn_refcount));
 	node->tn_refcount++;
 }
 
 /*
  * Allocates a new node of type 'type' inside the 'tmp' mount point, with
  * its owner set to 'uid', its group to 'gid' and its mode set to 'mode'.
  *
  * If the node type is set to 'VDIR', then the parent parameter must point
  * to the parent directory of the node being created.  It may only be NULL
  * while allocating the root node.
  *
  * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter
  * specifies the device the node represents.
  *
  * If the node type is set to 'VLNK', then the parameter target specifies
  * the file name of the target file for the symbolic link that is being
  * created.
  *
  * The node is taken from tmpfs_node_pool, starts with one reference and
  * is linked onto the mount's tm_nodes_used list; the new node is
  * returned in *node.
  *
  * Returns zero on success, ENOSPC when the node limit is reached or no
  * page is available, EBUSY when an unmount is in progress, or EROFS
  * from the MNT_RDONLY test below.
  */
 int
 tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, enum vtype type,
     uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent,
     const char *target, dev_t rdev, struct tmpfs_node **node)
 {
 	struct tmpfs_node *nnode;
 	vm_object_t obj;
 
 	/* If the root directory of the 'tmp' file system is not yet
 	 * allocated, this must be the request to do it. */
 	MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR));
 
 	MPASS(IFF(type == VLNK, target != NULL));
 	MPASS(IFF(type == VBLK || type == VCHR, rdev != VNOVAL));
 
 	if (tmp->tm_nodes_inuse >= tmp->tm_nodes_max)
 		return (ENOSPC);
 	if (tmpfs_pages_check_avail(tmp, 1) == 0)
 		return (ENOSPC);
 
 	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
 		/*
 		 * When a new tmpfs node is created for fully
 		 * constructed mount point, there must be a parent
 		 * node, which vnode is locked exclusively.  As
 		 * consequence, if the unmount is executing in
 		 * parallel, vflush() cannot reclaim the parent vnode.
 		 * Due to this, the check for MNTK_UNMOUNT flag is not
 		 * racy: if we did not see MNTK_UNMOUNT flag, then tmp
 		 * cannot be destroyed until node construction is
 		 * finished and the parent vnode unlocked.
 		 *
 		 * Tmpfs does not need to instantiate new nodes during
 		 * unmount.
 		 */
 		return (EBUSY);
 	}
 	/*
 	 * NOTE(review): MNT_RDONLY is tested against mnt_kern_flag here,
 	 * while the check above uses an MNTK_* constant with the same
 	 * field -- confirm which field is intended before relying on
 	 * this returning EROFS for read-only mounts.
 	 */
 	if ((mp->mnt_kern_flag & MNT_RDONLY) != 0)
 		return (EROFS);
 
 	nnode = uma_zalloc_smr(tmpfs_node_pool, M_WAITOK);
 
 	/* Generic initialization. */
 	nnode->tn_type = type;
 	vfs_timestamp(&nnode->tn_atime);
 	nnode->tn_birthtime = nnode->tn_ctime = nnode->tn_mtime =
 	    nnode->tn_atime;
 	nnode->tn_uid = uid;
 	nnode->tn_gid = gid;
 	nnode->tn_mode = mode;
 	nnode->tn_id = alloc_unr64(&tmp->tm_ino_unr);
 	nnode->tn_refcount = 1;
 
 	/* Type-specific initialization. */
 	switch (nnode->tn_type) {
 	case VBLK:
 	case VCHR:
 		nnode->tn_rdev = rdev;
 		break;
 
 	case VDIR:
 		RB_INIT(&nnode->tn_dir.tn_dirhead);
 		LIST_INIT(&nnode->tn_dir.tn_dupindex);
 		MPASS(parent != nnode);
 		MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL));
 		/* The root directory is its own parent. */
 		nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent;
 		nnode->tn_dir.tn_readdir_lastn = 0;
 		nnode->tn_dir.tn_readdir_lastp = NULL;
 		/*
 		 * One link on the new directory itself and one on its
 		 * parent, the latter taken under the parent's lock.
 		 */
 		nnode->tn_links++;
 		TMPFS_NODE_LOCK(nnode->tn_dir.tn_parent);
 		nnode->tn_dir.tn_parent->tn_links++;
 		TMPFS_NODE_UNLOCK(nnode->tn_dir.tn_parent);
 		break;
 
 	case VFIFO:
 		/* FALLTHROUGH */
 	case VSOCK:
 		break;
 
 	case VLNK:
 		MPASS(strlen(target) < MAXPATHLEN);
 		/*
 		 * tn_link holds exactly tn_size bytes of the target; the
 		 * terminating NUL is not copied.
 		 */
 		nnode->tn_size = strlen(target);
 		nnode->tn_link = malloc(nnode->tn_size, M_TMPFSNAME,
 		    M_WAITOK);
 		memcpy(nnode->tn_link, target, nnode->tn_size);
 		break;
 
 	case VREG:
 		obj = nnode->tn_reg.tn_aobj =
 		    vm_pager_allocate(OBJT_SWAP, NULL, 0, VM_PROT_DEFAULT, 0,
 			NULL /* XXXKIB - tmpfs needs swap reservation */);
 		VM_OBJECT_WLOCK(obj);
 		/* OBJ_TMPFS is set together with the setting of vp->v_object */
 		vm_object_set_flag(obj, OBJ_TMPFS_NODE);
 		VM_OBJECT_WUNLOCK(obj);
 		break;
 
 	default:
 		panic("tmpfs_alloc_node: type %p %d", nnode,
 		    (int)nnode->tn_type);
 	}
 
 	/* Publish the node on the mount and account for it. */
 	TMPFS_LOCK(tmp);
 	LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries);
 	nnode->tn_attached = true;
 	tmp->tm_nodes_inuse++;
 	tmp->tm_refcount++;
 	TMPFS_UNLOCK(tmp);
 
 	*node = nnode;
 	return (0);
 }
 
 /*
  * Drops one reference on 'node' in the file system 'tmp' and destroys the
  * node when that was the last reference.
  * If the node references a directory, no entries are allowed.
  *
  * Takes the mount lock and the node lock and calls
  * tmpfs_free_node_locked().  When that returns false both locks are
  * still held and are released here; when it returns true nothing more
  * is unlocked here.
  */
 void
 tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node)
 {
 
 	TMPFS_LOCK(tmp);
 	TMPFS_NODE_LOCK(node);
 	if (!tmpfs_free_node_locked(tmp, node, false)) {
 		TMPFS_NODE_UNLOCK(node);
 		TMPFS_UNLOCK(tmp);
 	}
 }
 
 /*
  * Drop one reference on 'node'; both the mount lock and the node lock
  * must be held on entry.
  *
  * The node is unlinked from the mount's list of used nodes when it is
  * still attached and either 'detach' is true or the last reference just
  * went away.
  *
  * Returns false when references remain; both locks are then still held.
  * Returns true when the node was freed.  In that case the node lock and
  * the mount lock were released, the type-specific resources and the node
  * itself were freed, and the mount lock was re-acquired and handed to
  * tmpfs_free_tmp() (presumably released there -- confirm against its
  * definition).
  */
 bool
 tmpfs_free_node_locked(struct tmpfs_mount *tmp, struct tmpfs_node *node,
     bool detach)
 {
 	vm_object_t uobj;
 
 	TMPFS_MP_ASSERT_LOCKED(tmp);
 	TMPFS_NODE_ASSERT_LOCKED(node);
 	KASSERT(node->tn_refcount > 0, ("node %p refcount zero", node));
 
 	node->tn_refcount--;
 	if (node->tn_attached && (detach || node->tn_refcount == 0)) {
 		MPASS(tmp->tm_nodes_inuse > 0);
 		tmp->tm_nodes_inuse--;
 		LIST_REMOVE(node, tn_entries);
 		node->tn_attached = false;
 	}
 	if (node->tn_refcount > 0)
 		return (false);
 
 #ifdef INVARIANTS
 	MPASS(node->tn_vnode == NULL);
 	MPASS((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0);
 #endif
 	/* Last reference: the rest runs without either lock held. */
 	TMPFS_NODE_UNLOCK(node);
 	TMPFS_UNLOCK(tmp);
 
 	switch (node->tn_type) {
 	case VBLK:
 		/* FALLTHROUGH */
 	case VCHR:
 		/* FALLTHROUGH */
 	case VDIR:
 		/* FALLTHROUGH */
 	case VFIFO:
 		/* FALLTHROUGH */
 	case VSOCK:
 		break;
 
 	case VLNK:
 		free(node->tn_link, M_TMPFSNAME);
 		break;
 
 	case VREG:
 		uobj = node->tn_reg.tn_aobj;
 		if (uobj != NULL) {
 			/* Return the object's pages to the mount's accounting. */
 			if (uobj->size != 0)
 				atomic_subtract_long(&tmp->tm_pages_used, uobj->size);
 			KASSERT((uobj->flags & OBJ_TMPFS) == 0,
 			    ("leaked OBJ_TMPFS node %p vm_obj %p", node, uobj));
 			vm_object_deallocate(uobj);
 		}
 		break;
 
 	default:
 		panic("tmpfs_free_node: type %p %d", node, (int)node->tn_type);
 	}
 
 	uma_zfree_smr(tmpfs_node_pool, node);
 	TMPFS_LOCK(tmp);
 	tmpfs_free_tmp(tmp);
 	return (true);
 }
 
 /*
  * Compute the hash of a directory entry name: an FNV-1 32-bit hash of the
  * 'len' bytes at 'name', seeded with FNV1_32_INIT + len and masked with
  * TMPFS_DIRCOOKIE_MASK.  Results below TMPFS_DIRCOOKIE_MIN are shifted up
  * by TMPFS_DIRCOOKIE_MIN.
  */
 static __inline uint32_t
 tmpfs_dirent_hash(const char *name, u_int len)
 {
 	uint32_t hash;
 
 	hash = fnv_32_buf(name, len, FNV1_32_INIT + len) & TMPFS_DIRCOOKIE_MASK;
 #ifdef TMPFS_DEBUG_DIRCOOKIE_DUP
 	/* Debug build: keep only 4 bits, presumably to provoke duplicates. */
 	hash &= 0xf;
 #endif
 	if (hash < TMPFS_DIRCOOKIE_MIN)
 		hash += TMPFS_DIRCOOKIE_MIN;
 
 	return (hash);
 }
 
 /*
  * Return the cookie of directory entry 'de', or TMPFS_DIRCOOKIE_EOF when
  * 'de' is NULL.
  */
 static __inline off_t
 tmpfs_dirent_cookie(struct tmpfs_dirent *de)
 {
 	if (de == NULL)
 		return (TMPFS_DIRCOOKIE_EOF);
 
 	MPASS(de->td_cookie >= TMPFS_DIRCOOKIE_MIN);
 
 	return (de->td_cookie);
 }
 
 /* Return whether the TMPFS_DIRCOOKIE_DUP bit is set in de's cookie. */
 static __inline boolean_t
 tmpfs_dirent_dup(struct tmpfs_dirent *de)
 {
 	return ((de->td_cookie & TMPFS_DIRCOOKIE_DUP) != 0);
 }
 
 /* Return whether the TMPFS_DIRCOOKIE_DUPHEAD bit is set in de's cookie. */
 static __inline boolean_t
 tmpfs_dirent_duphead(struct tmpfs_dirent *de)
 {
 	return ((de->td_cookie & TMPFS_DIRCOOKIE_DUPHEAD) != 0);
 }
 
 /*
  * Store the name 'name' of length 'namelen' in directory entry 'de' and
  * set both its hash and its cookie to the name's hash.
  *
  * de->ud.td_name must already point at a buffer of at least 'namelen'
  * bytes; exactly 'namelen' bytes are copied and no terminating NUL is
  * written.
  */
 void
 tmpfs_dirent_init(struct tmpfs_dirent *de, const char *name, u_int namelen)
 {
 	de->td_hash = de->td_cookie = tmpfs_dirent_hash(name, namelen);
 	memcpy(de->ud.td_name, name, namelen);
 	de->td_namelen = namelen;
 }
 
 /*
  * Allocates a new directory entry for the node node with a name of name.
  * The new directory entry is returned in *de.
  *
  * 'name' may be NULL, in which case no name buffer is allocated and the
  * entry's name length is set to zero; its hash, cookie and ud fields are
  * left unset.  'node' may be NULL as well.
  *
  * When 'node' is not NULL its link count is increased by one to reflect
  * the new object referencing it.
  *
  * The 'tmp' argument is not used.  Both allocations use M_WAITOK and the
  * function always returns zero.
  */
 int
 tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node,
     const char *name, u_int len, struct tmpfs_dirent **de)
 {
 	struct tmpfs_dirent *nde;
 
 	nde = uma_zalloc(tmpfs_dirent_pool, M_WAITOK);
 	nde->td_node = node;
 	if (name != NULL) {
 		/* The buffer holds exactly 'len' bytes, without a NUL. */
 		nde->ud.td_name = malloc(len, M_TMPFSNAME, M_WAITOK);
 		tmpfs_dirent_init(nde, name, len);
 	} else
 		nde->td_namelen = 0;
 	if (node != NULL)
 		node->tn_links++;
 
 	*de = nde;
 
 	return 0;
 }
 
 /*
  * Frees a directory entry.  It is the caller's responsibility to destroy
  * the node referenced by it if needed.
  *
  * The link count of the node is decreased by one to reflect the removal
  * of an object that referenced it.  This only happens if de->td_node is
  * not NULL; otherwise no node is accessed.
  *
  * The name buffer is freed unless the entry is a duplicate-list head
  * (TMPFS_DIRCOOKIE_DUPHEAD set in its cookie) or has no name buffer.
  * The 'tmp' argument is not used.
  */
 void
 tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de)
 {
 	struct tmpfs_node *node;
 
 	node = de->td_node;
 	if (node != NULL) {
 		MPASS(node->tn_links > 0);
 		node->tn_links--;
 	}
 	if (!tmpfs_dirent_duphead(de) && de->ud.td_name != NULL)
 		free(de->ud.td_name, M_TMPFSNAME);
 	uma_zfree(tmpfs_dirent_pool, de);
 }
 
 /*
  * Undo the association between the vnode 'vp' and the VM object 'obj'
  * that tmpfs_alloc_vp() sets up for regular files.  'vp' must be
  * exclusively locked.  Nothing is done unless 'vp' is a regular file and
  * 'obj' is not NULL.
  *
  * With the object write lock and the vnode interlock held, the OBJ_TMPFS
  * flag and the object's back pointer to the vnode are cleared, and a
  * negative v_writecount is reset to zero.
  */
 void
 tmpfs_destroy_vobject(struct vnode *vp, vm_object_t obj)
 {
 
 	ASSERT_VOP_ELOCKED(vp, "tmpfs_destroy_vobject");
 	if (vp->v_type != VREG || obj == NULL)
 		return;
 
 	VM_OBJECT_WLOCK(obj);
 	VI_LOCK(vp);
 	vm_object_clear_flag(obj, OBJ_TMPFS);
 	obj->un_pager.swp.swp_tmpfs = NULL;
 	if (vp->v_writecount < 0)
 		vp->v_writecount = 0;
 	VI_UNLOCK(vp);
 	VM_OBJECT_WUNLOCK(obj);
 }
 
 /*
  * Destructor passed to insmntque1() by tmpfs_alloc_vp().
  * Need to clear v_object for insmntque failure.
  *
  * Detaches the VM object and the tmpfs node from 'vp', switches the
  * vnode to dead_vnodeops and disposes of it with vgone() and vput().
  * 'dtr_arg' is unused.
  */
 static void
 tmpfs_insmntque_dtr(struct vnode *vp, void *dtr_arg)
 {
 
 	tmpfs_destroy_vobject(vp, vp->v_object);
 	vp->v_object = NULL;
 	vp->v_data = NULL;
 	vp->v_op = &dead_vnodeops;
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * Allocates a new vnode for the node node or returns a new reference to
  * an existing one if the node had already a vnode referencing it.  The
  * resulting locked vnode is returned in *vpp.
  *
  * A node reference is taken on entry and dropped again through
  * tmpfs_free_node() at the 'out' label, so the node stays valid while
  * its lock is dropped.  'lkflag' is only used when an existing vnode is
  * re-acquired; a freshly created vnode is always locked exclusively.
  *
  * Returns zero on success or an appropriate error code on failure;
  * ENOENT is returned for a doomed node, for a directory whose parent
  * pointer is NULL, and for a doomed vnode when LK_NOWAIT was requested.
  */
 int
 tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag,
     struct vnode **vpp)
 {
 	struct vnode *vp;
+	enum vgetstate vs;
 	struct tmpfs_mount *tm;
 	vm_object_t object;
 	int error;
 
 	error = 0;
 	tm = VFS_TO_TMPFS(mp);
 	TMPFS_NODE_LOCK(node);
 	tmpfs_ref_node_locked(node);
 loop:
 	/* Every jump back to 'loop' arrives with the node lock held. */
 	TMPFS_NODE_ASSERT_LOCKED(node);
 	if ((vp = node->tn_vnode) != NULL) {
 		MPASS((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0);
-		VI_LOCK(vp);
 		if ((node->tn_type == VDIR && node->tn_dir.tn_parent == NULL) ||
 		    (VN_IS_DOOMED(vp) &&
 		     (lkflag & LK_NOWAIT) != 0)) {
-			VI_UNLOCK(vp);
 			TMPFS_NODE_UNLOCK(node);
 			error = ENOENT;
 			vp = NULL;
 			goto out;
 		}
 		/*
 		 * The vnode is doomed and the caller may sleep: wait until
 		 * tmpfs_free_vp() clears TMPFS_VNODE_WRECLAIM and wakes us,
 		 * then start over.
 		 */
 		if (VN_IS_DOOMED(vp)) {
-			VI_UNLOCK(vp);
 			node->tn_vpstate |= TMPFS_VNODE_WRECLAIM;
 			while ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) {
 				msleep(&node->tn_vnode, TMPFS_NODE_MTX(node),
 				    0, "tmpfsE", 0);
 			}
 			goto loop;
 		}
+		vs = vget_prep(vp);
 		TMPFS_NODE_UNLOCK(node);
-		error = vget(vp, lkflag | LK_INTERLOCK, curthread);
+		error = vget_finish(vp, lkflag, vs);
 		if (error == ENOENT) {
 			TMPFS_NODE_LOCK(node);
 			goto loop;
 		}
 		if (error != 0) {
 			vp = NULL;
 			goto out;
 		}
 
 		/*
 		 * Make sure the vnode is still there after
 		 * getting the interlock to avoid racing a free.
 		 */
 		if (node->tn_vnode == NULL || node->tn_vnode != vp) {
 			vput(vp);
 			TMPFS_NODE_LOCK(node);
 			goto loop;
 		}
 
 		goto out;
 	}
 
 	/* No vnode is attached: refuse nodes that are going away. */
 	if ((node->tn_vpstate & TMPFS_VNODE_DOOMED) ||
 	    (node->tn_type == VDIR && node->tn_dir.tn_parent == NULL)) {
 		TMPFS_NODE_UNLOCK(node);
 		error = ENOENT;
 		vp = NULL;
 		goto out;
 	}
 
 	/*
 	 * otherwise lock the vp list while we call getnewvnode
 	 * since that can block.
 	 *
 	 * TMPFS_VNODE_ALLOCATING marks the node so that only one thread
 	 * creates the vnode; others set TMPFS_VNODE_WANT and sleep until
 	 * the wakeup issued after the 'unlock' label.
 	 * NOTE(review): if msleep() ever returned an error here, 'out'
 	 * would be reached with the node lock still held (no PDROP is
 	 * passed) -- confirm that cannot happen with these arguments.
 	 */
 	if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) {
 		node->tn_vpstate |= TMPFS_VNODE_WANT;
 		error = msleep((caddr_t) &node->tn_vpstate,
 		    TMPFS_NODE_MTX(node), 0, "tmpfs_alloc_vp", 0);
 		if (error != 0)
 			goto out;
 		goto loop;
 	} else
 		node->tn_vpstate |= TMPFS_VNODE_ALLOCATING;
 	
 	TMPFS_NODE_UNLOCK(node);
 
 	/* Get a new vnode and associate it with our node. */
 	error = getnewvnode("tmpfs", mp, VFS_TO_TMPFS(mp)->tm_nonc ?
 	    &tmpfs_vnodeop_nonc_entries : &tmpfs_vnodeop_entries, &vp);
 	if (error != 0)
 		goto unlock;
 	MPASS(vp != NULL);
 
 	/* lkflag is ignored, the lock is exclusive */
 	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 	vp->v_data = node;
 	vp->v_type = node->tn_type;
 
 	/* Type-specific initialization. */
 	switch (node->tn_type) {
 	case VBLK:
 		/* FALLTHROUGH */
 	case VCHR:
 		/* FALLTHROUGH */
 	case VLNK:
 		/* FALLTHROUGH */
 	case VSOCK:
 		break;
 	case VFIFO:
 		vp->v_op = &tmpfs_fifoop_entries;
 		break;
 	case VREG:
 		/*
 		 * Cross-link the vnode and the node's VM object and mark
 		 * the object OBJ_TMPFS; tmpfs_destroy_vobject() undoes this.
 		 */
 		object = node->tn_reg.tn_aobj;
 		VM_OBJECT_WLOCK(object);
 		VI_LOCK(vp);
 		KASSERT(vp->v_object == NULL, ("Not NULL v_object in tmpfs"));
 		vp->v_object = object;
 		object->un_pager.swp.swp_tmpfs = vp;
 		vm_object_set_flag(object, OBJ_TMPFS);
 		VI_UNLOCK(vp);
 		VM_OBJECT_WUNLOCK(object);
 		break;
 	case VDIR:
 		MPASS(node->tn_dir.tn_parent != NULL);
 		/* A directory that is its own parent is the root. */
 		if (node->tn_dir.tn_parent == node)
 			vp->v_vflag |= VV_ROOT;
 		break;
 
 	default:
 		panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type);
 	}
 	if (vp->v_type != VFIFO)
 		VN_LOCK_ASHARE(vp);
 
 	/* On failure tmpfs_insmntque_dtr() has already disposed of vp. */
 	error = insmntque1(vp, mp, tmpfs_insmntque_dtr, NULL);
 	if (error != 0)
 		vp = NULL;
 
 unlock:
 	/*
 	 * Record the result (vp is NULL when insmntque1() failed), clear
 	 * the ALLOCATING mark and wake any waiters.
 	 */
 	TMPFS_NODE_LOCK(node);
 
 	MPASS(node->tn_vpstate & TMPFS_VNODE_ALLOCATING);
 	node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING;
 	node->tn_vnode = vp;
 
 	if (node->tn_vpstate & TMPFS_VNODE_WANT) {
 		node->tn_vpstate &= ~TMPFS_VNODE_WANT;
 		TMPFS_NODE_UNLOCK(node);
 		wakeup((caddr_t) &node->tn_vpstate);
 	} else
 		TMPFS_NODE_UNLOCK(node);
 
 out:
 	if (error == 0) {
 		*vpp = vp;
 
 #ifdef INVARIANTS
 		MPASS(*vpp != NULL && VOP_ISLOCKED(*vpp));
 		TMPFS_NODE_LOCK(node);
 		MPASS(*vpp == node->tn_vnode);
 		TMPFS_NODE_UNLOCK(node);
 #endif
 	}
 	/* Drop the reference taken at the top of the function. */
 	tmpfs_free_node(tm, node);
 
 	return (error);
 }
 
 /*
  * Destroys the association between the vnode vp and the node it
  * references.
  *
  * The node lock must be held.  Any thread sleeping in tmpfs_alloc_vp()
  * with TMPFS_VNODE_WRECLAIM set is woken up, and the flag is cleared.
  */
 void
 tmpfs_free_vp(struct vnode *vp)
 {
 	struct tmpfs_node *node;
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	TMPFS_NODE_ASSERT_LOCKED(node);
 	node->tn_vnode = NULL;
 	if ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0)
 		wakeup(&node->tn_vnode);
 	node->tn_vpstate &= ~TMPFS_VNODE_WRECLAIM;
 	vp->v_data = NULL;
 }
 
 /*
  * Creates a new file of the type given in 'vap' inside the directory 'dvp',
  * under the component name described by 'cnp'.  The owner comes from the
  * credentials in 'cnp', the group is inherited from the parent directory
  * and the mode is taken from 'vap'.  'target' and vap->va_rdev are handed
  * to tmpfs_alloc_node() unchanged.
  *
  * On success returns zero and stores the exclusively locked vnode of the
  * new file in *vpp.  On failure returns an error code and leaves *vpp set
  * to NULL; everything allocated so far is released again.
  */
 int
 tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap,
     struct componentname *cnp, const char *target)
 {
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *dnode, *node, *parent;
 	struct tmpfs_dirent *de;
 	int error;
 
 	ASSERT_VOP_ELOCKED(dvp, "tmpfs_alloc_file");
 	MPASS(cnp->cn_flags & HASBUF);
 
 	tmp = VFS_TO_TMPFS(dvp->v_mount);
 	dnode = VP_TO_TMPFS_DIR(dvp);
 	*vpp = NULL;
 
 	/*
 	 * A new sub-directory adds a link to its parent, so refuse the
 	 * request when the parent already sits at the link limit.
 	 */
 	parent = NULL;
 	if (vap->va_type == VDIR) {
 		MPASS(dnode->tn_links <= TMPFS_LINK_MAX);
 		if (dnode->tn_links == TMPFS_LINK_MAX)
 			return (EMLINK);
 		parent = dnode;
 		MPASS(parent != NULL);
 	}
 
 	/* Step 1: the node backing the new file. */
 	error = tmpfs_alloc_node(dvp->v_mount, tmp, vap->va_type,
 	    cnp->cn_cred->cr_uid, dnode->tn_gid, vap->va_mode, parent,
 	    target, vap->va_rdev, &node);
 	if (error != 0)
 		return (error);
 
 	/* Step 2: the directory entry naming that node. */
 	error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen,
 	    &de);
 	if (error != 0)
 		goto fail_node;
 
 	/* Step 3: the vnode handed back to the caller. */
 	error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp);
 	if (error != 0)
 		goto fail_dirent;
 
 	/*
 	 * Every allocation succeeded; linking the entry into the directory
 	 * cannot fail.  A whiteout of the same name is dropped first.
 	 */
 	if (cnp->cn_flags & ISWHITEOUT)
 		tmpfs_dir_whiteout_remove(dvp, cnp);
 	tmpfs_dir_attach(dvp, de);
 	return (0);
 
 fail_dirent:
 	tmpfs_free_dirent(tmp, de);
 fail_node:
 	tmpfs_free_node(tmp, node);
 	return (error);
 }
 
 /*
  * Positions the cursor 'dc' on the first entry of directory 'dnode' and
  * returns that entry, or NULL when the directory tree is empty.  If the
  * smallest tree element is a duphead, the first entry of its duplicate
  * list is returned instead.
  */
 struct tmpfs_dirent *
 tmpfs_dir_first(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc)
 {
 	struct tmpfs_dirent *head, *first;
 
 	head = RB_MIN(tmpfs_dir, &dnode->tn_dir.tn_dirhead);
 	first = head;
 	if (head != NULL && tmpfs_dirent_duphead(head))
 		first = LIST_FIRST(&head->ud.td_duphead);
 
 	dc->tdc_tree = head;
 	dc->tdc_current = first;
 
 	return (first);
 }
 
 /*
  * Advances the cursor 'dc' to the entry following the current one in
  * directory 'dnode' and returns it, or NULL at the end of the directory.
  * Duplicate lists are walked to their end before moving on to the next
  * tree element.
  */
 struct tmpfs_dirent *
 tmpfs_dir_next(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc)
 {
 	struct tmpfs_dirent *cur;
 
 	MPASS(dc->tdc_tree != NULL);
 
 	/* Still inside a duplicate list: try its next member first. */
 	if (tmpfs_dirent_dup(dc->tdc_current)) {
 		cur = LIST_NEXT(dc->tdc_current, uh.td_dup.entries);
 		dc->tdc_current = cur;
 		if (cur != NULL)
 			return (cur);
 	}
 
 	/* Step to the next tree element, descending into a duphead. */
 	cur = RB_NEXT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, dc->tdc_tree);
 	dc->tdc_tree = cur;
 	dc->tdc_current = cur;
 	if (cur != NULL && tmpfs_dirent_duphead(cur)) {
 		dc->tdc_current = LIST_FIRST(&cur->ud.td_duphead);
 		MPASS(dc->tdc_current != NULL);
 	}
 
 	return (dc->tdc_current);
 }
 
 /*
  * Finds the RB-tree element of directory 'dnode' whose hash equals 'hash'.
  * The result may be a duphead entry; NULL is returned when nothing matches.
  */
 static struct tmpfs_dirent *
 tmpfs_dir_xlookup_hash(struct tmpfs_node *dnode, uint32_t hash)
 {
 	struct tmpfs_dirent key;
 
 	/* Only the hash of the key is initialized before the tree search. */
 	key.td_hash = hash;
 	return (RB_FIND(tmpfs_dir, &dnode->tn_dir.tn_dirhead, &key));
 }
 
 /*
  * Looks up the directory entry of 'node' identified by 'cookie' and
  * initializes the directory cursor 'dc' to point at it.
  *
  * Three strategies are tried in order:
  *  1. the cached position of the last readdir (tn_readdir_lastn/lastp);
  *  2. for cookies carrying TMPFS_DIRCOOKIE_DUP, a scan of the dupindex list;
  *  3. otherwise a tree search with RB_NFIND, using the cookie as hash.
  *
  * Returns the entry found (also stored in dc->tdc_current), or NULL when
  * no entry corresponds to the cookie.
  */
 static struct tmpfs_dirent *
 tmpfs_dir_lookup_cookie(struct tmpfs_node *node, off_t cookie,
     struct tmpfs_dir_cursor *dc)
 {
 	struct tmpfs_dir *dirhead = &node->tn_dir.tn_dirhead;
 	struct tmpfs_dirent *de, dekey;
 
 	MPASS(cookie >= TMPFS_DIRCOOKIE_MIN);
 
 	/* Fast path: the caller resumes exactly where the last read ended. */
 	if (cookie == node->tn_dir.tn_readdir_lastn &&
 	    (de = node->tn_dir.tn_readdir_lastp) != NULL) {
 		/* Protect against possible race, tn_readdir_last[pn]
 		 * may be updated with only shared vnode lock held. */
 		if (cookie == tmpfs_dirent_cookie(de))
 			goto out;
 	}
 
 	/* Cookie names a duplicate-hash entry: search the dupindex list. */
 	if ((cookie & TMPFS_DIRCOOKIE_DUP) != 0) {
 		LIST_FOREACH(de, &node->tn_dir.tn_dupindex,
 		    uh.td_dup.index_entries) {
 			MPASS(tmpfs_dirent_dup(de));
 			if (de->td_cookie == cookie)
 				goto out;
 			/* dupindex list is sorted. */
 			if (de->td_cookie < cookie) {
 				/* Passed the slot where it would be: absent. */
 				de = NULL;
 				goto out;
 			}
 		}
 		/* List exhausted without a match. */
 		MPASS(de == NULL);
 		goto out;
 	}
 
 	/* A cookie with bits outside the hash mask cannot name an entry. */
 	if ((cookie & TMPFS_DIRCOOKIE_MASK) != cookie) {
 		de = NULL;
 	} else {
 		dekey.td_hash = cookie;
 		/* Recover if direntry for cookie was removed */
 		de = RB_NFIND(tmpfs_dir, dirhead, &dekey);
 	}
 	dc->tdc_tree = de;
 	dc->tdc_current = de;
 	/* A duphead is only a container: continue with its first member. */
 	if (de != NULL && tmpfs_dirent_duphead(de)) {
 		dc->tdc_current = LIST_FIRST(&de->ud.td_duphead);
 		MPASS(dc->tdc_current != NULL);
 	}
 	return (dc->tdc_current);
 
 out:
 	dc->tdc_tree = de;
 	dc->tdc_current = de;
 	/*
 	 * For an entry living on a duplicate list the tree position is its
 	 * duphead, which is found through the shared hash.
 	 */
 	if (de != NULL && tmpfs_dirent_dup(de))
 		dc->tdc_tree = tmpfs_dir_xlookup_hash(node,
 		    de->td_hash);
 	return (dc->tdc_current);
 }
 
 /*
  * Searches the directory represented by 'node' for the entry whose name
  * is given by 'cnp'.  The '.' and '..' components must not be passed in,
  * as they are not stored inside directories.  When 'f' is not NULL the
  * entry must additionally refer to the node 'f'.
  *
  * Returns the matching entry, or NULL when there is none.
  */
 struct tmpfs_dirent *
 tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f,
     struct componentname *cnp)
 {
 	struct tmpfs_dir_duphead *dups;
 	struct tmpfs_dirent *de;
 	uint32_t hash;
 
 	MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.'));
 	MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' &&
 	    cnp->cn_nameptr[1] == '.')));
 	TMPFS_VALIDATE_DIR(node);
 
 	hash = tmpfs_dirent_hash(cnp->cn_nameptr, cnp->cn_namelen);
 	de = tmpfs_dir_xlookup_hash(node, hash);
 	if (de == NULL)
 		return (NULL);
 
 	if (tmpfs_dirent_duphead(de)) {
 		/* Several names share this hash: compare each of them. */
 		dups = &de->ud.td_duphead;
 		LIST_FOREACH(de, dups, uh.td_dup.entries) {
 			if (TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr,
 			    cnp->cn_namelen))
 				break;
 		}
 		if (de == NULL)
 			return (NULL);
 	} else if (!TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr,
 	    cnp->cn_namelen)) {
 		/* Same hash, different name. */
 		return (NULL);
 	}
 
 	/* Optionally insist on a particular target node. */
 	if (f != NULL && de->td_node != f)
 		return (NULL);
 
 	return (de);
 }
 
 /*
  * Attach duplicate-cookie directory entry nde to dnode and insert to dupindex
  * list, allocate new cookie value.
  *
  * 'duphead' is the list of entries sharing nde's hash; nde is always put
  * at its head.  The per-directory dupindex list is kept sorted by cookie
  * in descending order, so the list head carries the largest cookie in use.
  */
 static void
 tmpfs_dir_attach_dup(struct tmpfs_node *dnode,
     struct tmpfs_dir_duphead *duphead, struct tmpfs_dirent *nde)
 {
 	struct tmpfs_dir_duphead *dupindex;
 	struct tmpfs_dirent *de, *pde;
 
 	dupindex = &dnode->tn_dir.tn_dupindex;
 	de = LIST_FIRST(dupindex);
 	/*
 	 * Common case: the index is empty or its largest cookie is still
 	 * below the maximum, so the next larger value is free.
 	 */
 	if (de == NULL || de->td_cookie < TMPFS_DIRCOOKIE_DUP_MAX) {
 		if (de == NULL)
 			nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN;
 		else
 			nde->td_cookie = de->td_cookie + 1;
 		MPASS(tmpfs_dirent_dup(nde));
 		LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries);
 		LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
 		return;
 	}
 
 	/*
 	 * Cookie numbers are near exhaustion. Scan dupindex list for unused
 	 * numbers. dupindex list is sorted in descending order. Keep it so
 	 * after inserting nde.
 	 */
 	while (1) {
 		/* pde trails de by one element while walking the index. */
 		pde = de;
 		de = LIST_NEXT(de, uh.td_dup.index_entries);
 		if (de == NULL && pde->td_cookie != TMPFS_DIRCOOKIE_DUP_MIN) {
 			/*
 			 * Last element of the index doesn't have minimal cookie
 			 * value, use it.
 			 */
 			nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN;
 			LIST_INSERT_AFTER(pde, nde, uh.td_dup.index_entries);
 			LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
 			return;
 		} else if (de == NULL) {
 			/*
 			 * We are so lucky as to have 2^30 hash duplicates in
 			 * a single directory :) Return the largest possible
 			 * cookie value.  It should be fine except for
 			 * possible issues with VOP_READDIR restart.
 			 */
 			nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MAX;
 			LIST_INSERT_HEAD(dupindex, nde,
 			    uh.td_dup.index_entries);
 			LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
 			return;
 		}
 		if (de->td_cookie + 1 == pde->td_cookie ||
 		    de->td_cookie >= TMPFS_DIRCOOKIE_DUP_MAX)
 			continue;	/* No hole or invalid cookie. */
 		/* Hole between de and pde: take the value right above de. */
 		nde->td_cookie = de->td_cookie + 1;
 		MPASS(tmpfs_dirent_dup(nde));
 		MPASS(pde->td_cookie > nde->td_cookie);
 		MPASS(nde->td_cookie > de->td_cookie);
 		LIST_INSERT_BEFORE(de, nde, uh.td_dup.index_entries);
 		LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
 		return;
 	}
 }
 
 /*
  * Links the directory entry 'de' into the directory represented by 'vp',
  * which must be exclusively locked.  The link count of the node referred
  * to by the entry is not touched here; that is tmpfs_alloc_dirent's job.
  *
  * When another entry with the same hash is already present, both end up
  * on the duplicate list of a duphead element that occupies the tree slot.
  */
 void
 tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de)
 {
 	struct tmpfs_node *dnode;
 	struct tmpfs_dirent *clash, *moved;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	MPASS(de->td_namelen > 0);
 	MPASS(de->td_hash >= TMPFS_DIRCOOKIE_MIN);
 	MPASS(de->td_cookie == de->td_hash);
 
 	dnode = VP_TO_TMPFS_DIR(vp);
 
 	/* Reset the cached readdir position before changing the directory. */
 	dnode->tn_dir.tn_readdir_lastn = 0;
 	dnode->tn_dir.tn_readdir_lastp = NULL;
 
 	MPASS(!tmpfs_dirent_dup(de));
 	clash = RB_INSERT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de);
 	if (clash != NULL) {
 		if (!tmpfs_dirent_duphead(clash)) {
 			/*
 			 * First collision on this hash.  The element already
 			 * in the tree is turned into the duphead in place,
 			 * and its former contents move to a fresh entry, so
 			 * that nothing with this hash has to be removed from
 			 * or added to the tree.
 			 */
 			MPASS(!tmpfs_dirent_dup(clash));
 			tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), NULL,
 			    NULL, 0, &moved);
 			/* Not '*moved = *clash': gcc 4.2.1 may miscompile it. */
 			memcpy(moved, clash, sizeof(*clash));
 			clash->td_cookie |= TMPFS_DIRCOOKIE_DUPHEAD;
 			LIST_INIT(&clash->ud.td_duphead);
 			clash->td_namelen = 0;
 			clash->td_node = NULL;
 			tmpfs_dir_attach_dup(dnode, &clash->ud.td_duphead,
 			    moved);
 		}
 		tmpfs_dir_attach_dup(dnode, &clash->ud.td_duphead, de);
 	}
 
 	dnode->tn_size += sizeof(struct tmpfs_dirent);
 	dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED |
 	    TMPFS_NODE_MODIFIED;
 	tmpfs_update(vp);
 }
 
 /*
  * Unlinks the directory entry 'de' from the directory represented by
  * 'vp', which must be exclusively locked.  The link count of the node
  * referred to by the entry is not touched here; that is done by
  * tmpfs_free_dirent.  The entry itself is not freed.
  */
 void
 tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de)
 {
 	struct tmpfs_mount *tmp;
 	struct tmpfs_dir *head;
 	struct tmpfs_node *dnode;
 	struct tmpfs_dirent *holder;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 
 	dnode = VP_TO_TMPFS_DIR(vp);
 	head = &dnode->tn_dir.tn_dirhead;
 
 	/* Reset the cached readdir position before changing the directory. */
 	dnode->tn_dir.tn_readdir_lastn = 0;
 	dnode->tn_dir.tn_readdir_lastp = NULL;
 
 	if (!tmpfs_dirent_dup(de)) {
 		/* Plain entry: it sits in the tree directly. */
 		RB_REMOVE(tmpfs_dir, head, de);
 	} else {
 		/*
 		 * Only when de is the tail of its duplicate list can the
 		 * list become empty; look the duphead up in that case.
 		 */
 		holder = NULL;
 		if (LIST_NEXT(de, uh.td_dup.entries) == NULL) {
 			holder = tmpfs_dir_xlookup_hash(dnode, de->td_hash);
 			MPASS(tmpfs_dirent_duphead(holder));
 		}
 		LIST_REMOVE(de, uh.td_dup.entries);
 		LIST_REMOVE(de, uh.td_dup.index_entries);
 		if (holder != NULL && LIST_EMPTY(&holder->ud.td_duphead)) {
 			/* The duphead lost its last member; drop it too. */
 			RB_REMOVE(tmpfs_dir, head, holder);
 			tmp = VFS_TO_TMPFS(vp->v_mount);
 			MPASS(holder->td_node == NULL);
 			tmpfs_free_dirent(tmp, holder);
 		}
 		/* The detached entry goes back to its hash-based cookie. */
 		de->td_cookie = de->td_hash;
 	}
 
 	dnode->tn_size -= sizeof(struct tmpfs_dirent);
 	dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED |
 	    TMPFS_NODE_MODIFIED;
 	tmpfs_update(vp);
 }
 
 /*
  * Removes and frees every directory entry of 'dnode', including the
  * members of all duplicate lists.  The td_node pointer of each entry is
  * cleared before the entry is passed to tmpfs_free_dirent().
  */
 void
 tmpfs_dir_destroy(struct tmpfs_mount *tmp, struct tmpfs_node *dnode)
 {
 	struct tmpfs_dirent *ent, *dup, *next;
 
 	RB_FOREACH_SAFE(ent, tmpfs_dir, &dnode->tn_dir.tn_dirhead, next) {
 		RB_REMOVE(tmpfs_dir, &dnode->tn_dir.tn_dirhead, ent);
 		/* Node may already be destroyed. */
 		ent->td_node = NULL;
 		if (tmpfs_dirent_duphead(ent)) {
 			/* Drain the duplicate list from its head. */
 			for (;;) {
 				dup = LIST_FIRST(&ent->ud.td_duphead);
 				if (dup == NULL)
 					break;
 				LIST_REMOVE(dup, uh.td_dup.entries);
 				dup->td_node = NULL;
 				tmpfs_free_dirent(tmp, dup);
 			}
 		}
 		tmpfs_free_dirent(tmp, ent);
 	}
 }
 
 /*
  * Helper function for tmpfs_readdir.  Creates a '.' entry for the given
  * directory and returns it in the uio space.  The function returns 0
  * on success, -1 if there was not enough space in the uio structure to
  * hold the directory entry or an appropriate error code if another
  * error happens.
  */
 static int
 tmpfs_dir_getdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node,
     struct uio *uio)
 {
 	int error;
 	struct dirent dent;
 
 	TMPFS_VALIDATE_DIR(node);
 	MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOT);
 
 	dent.d_fileno = node->tn_id;
 	dent.d_type = DT_DIR;
 	dent.d_namlen = 1;
 	dent.d_name[0] = '.';
 	dent.d_reclen = GENERIC_DIRSIZ(&dent);
 	dirent_terminate(&dent);
 
 	if (dent.d_reclen > uio->uio_resid)
 		error = EJUSTRETURN;
 	else
 		error = uiomove(&dent, dent.d_reclen, uio);
 
 	tmpfs_set_status(tm, node, TMPFS_NODE_ACCESSED);
 
 	return (error);
 }
 
 /*
  * Helper function for tmpfs_readdir.  Creates a '..' entry for the given
  * directory and returns it in the uio space.  The function returns 0
  * on success, -1 if there was not enough space in the uio structure to
  * hold the directory entry or an appropriate error code if another
  * error happens.
  */
 static int
 tmpfs_dir_getdotdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node,
     struct uio *uio)
 {
 	struct tmpfs_node *parent;
 	struct dirent dent;
 	int error;
 
 	TMPFS_VALIDATE_DIR(node);
 	MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT);
 
 	/*
 	 * Return ENOENT if the current node is already removed.
 	 */
 	TMPFS_ASSERT_LOCKED(node);
 	parent = node->tn_dir.tn_parent;
 	if (parent == NULL)
 		return (ENOENT);
 
 	TMPFS_NODE_LOCK(parent);
 	dent.d_fileno = parent->tn_id;
 	TMPFS_NODE_UNLOCK(parent);
 
 	dent.d_type = DT_DIR;
 	dent.d_namlen = 2;
 	dent.d_name[0] = '.';
 	dent.d_name[1] = '.';
 	dent.d_reclen = GENERIC_DIRSIZ(&dent);
 	dirent_terminate(&dent);
 
 	if (dent.d_reclen > uio->uio_resid)
 		error = EJUSTRETURN;
 	else
 		error = uiomove(&dent, dent.d_reclen, uio);
 
 	tmpfs_set_status(tm, node, TMPFS_NODE_ACCESSED);
 
 	return (error);
 }
 
 /*
  * Helper function for tmpfs_readdir.  Returns as many directory entries
  * as can fit in the uio space.  The read starts at uio->uio_offset.
  * The function returns 0 on success, EJUSTRETURN if there was not enough
  * space in the uio structure to hold the next directory entry, EINVAL if
  * uio->uio_offset does not correspond to any entry, or an appropriate
  * error code if another error happens.
  *
  * When 'cookies' is not NULL, the offset of the entry following each
  * entry copied out is appended to it and *ncookies is advanced; the
  * caller supplies the array with room for 'maxcookies' values.
  */
 int
 tmpfs_dir_getdents(struct tmpfs_mount *tm, struct tmpfs_node *node,
     struct uio *uio, int maxcookies, u_long *cookies, int *ncookies)
 {
 	struct tmpfs_dir_cursor dc;
 	struct tmpfs_dirent *de;
 	off_t off;
 	int error;
 
 	TMPFS_VALIDATE_DIR(node);
 
 	off = 0;
 
 	/*
 	 * Lookup the node from the current offset.  The starting offset of
 	 * 0 will lookup both '.' and '..', and then the first real entry,
 	 * or EOF if there are none.  Then find all entries for the dir that
 	 * fit into the buffer.  Once no more entries are found (de == NULL),
 	 * the offset is set to TMPFS_DIRCOOKIE_EOF, which will cause the next
 	 * call to return 0.
 	 */
 	switch (uio->uio_offset) {
 	case TMPFS_DIRCOOKIE_DOT:
 		error = tmpfs_dir_getdotdent(tm, node, uio);
 		if (error != 0)
 			return (error);
 		uio->uio_offset = TMPFS_DIRCOOKIE_DOTDOT;
 		if (cookies != NULL)
 			cookies[(*ncookies)++] = off = uio->uio_offset;
 		/* FALLTHROUGH */
 	case TMPFS_DIRCOOKIE_DOTDOT:
 		error = tmpfs_dir_getdotdotdent(tm, node, uio);
 		if (error != 0)
 			return (error);
 		/* Position the cursor on the first real entry, if any. */
 		de = tmpfs_dir_first(node, &dc);
 		uio->uio_offset = tmpfs_dirent_cookie(de);
 		if (cookies != NULL)
 			cookies[(*ncookies)++] = off = uio->uio_offset;
 		/* EOF. */
 		if (de == NULL)
 			return (0);
 		break;
 	case TMPFS_DIRCOOKIE_EOF:
 		return (0);
 	default:
 		/* Resume in the middle of the directory. */
 		de = tmpfs_dir_lookup_cookie(node, uio->uio_offset, &dc);
 		if (de == NULL)
 			return (EINVAL);
 		if (cookies != NULL)
 			off = tmpfs_dirent_cookie(de);
 	}
 
 	/* Read as many entries as possible; i.e., until we reach the end of
 	 * the directory or we exhaust uio space. */
 	do {
 		struct dirent d;
 
 		/* Create a dirent structure representing the current
 		 * tmpfs_node and fill it. */
 		if (de->td_node == NULL) {
 			/* An entry without a node is reported as a whiteout. */
 			d.d_fileno = 1;
 			d.d_type = DT_WHT;
 		} else {
 			d.d_fileno = de->td_node->tn_id;
 			switch (de->td_node->tn_type) {
 			case VBLK:
 				d.d_type = DT_BLK;
 				break;
 
 			case VCHR:
 				d.d_type = DT_CHR;
 				break;
 
 			case VDIR:
 				d.d_type = DT_DIR;
 				break;
 
 			case VFIFO:
 				d.d_type = DT_FIFO;
 				break;
 
 			case VLNK:
 				d.d_type = DT_LNK;
 				break;
 
 			case VREG:
 				d.d_type = DT_REG;
 				break;
 
 			case VSOCK:
 				d.d_type = DT_SOCK;
 				break;
 
 			default:
 				panic("tmpfs_dir_getdents: type %p %d",
 				    de->td_node, (int)de->td_node->tn_type);
 			}
 		}
 		d.d_namlen = de->td_namelen;
 		MPASS(de->td_namelen < sizeof(d.d_name));
 		(void)memcpy(d.d_name, de->ud.td_name, de->td_namelen);
 		d.d_reclen = GENERIC_DIRSIZ(&d);
 		dirent_terminate(&d);
 
 		/* Stop reading if the directory entry we are treating is
 		 * bigger than the amount of data that can be returned. */
 		if (d.d_reclen > uio->uio_resid) {
 			error = EJUSTRETURN;
 			break;
 		}
 
 		/* Copy the new dirent structure into the output buffer and
 		 * advance pointers. */
 		error = uiomove(&d, d.d_reclen, uio);
 		if (error == 0) {
 			de = tmpfs_dir_next(node, &dc);
 			if (cookies != NULL) {
 				/* Record where the following entry starts. */
 				off = tmpfs_dirent_cookie(de);
 				MPASS(*ncookies < maxcookies);
 				cookies[(*ncookies)++] = off;
 			}
 		}
 	} while (error == 0 && uio->uio_resid > 0 && de != NULL);
 
 	/* Skip setting off when using cookies as it is already done above. */
 	if (cookies == NULL)
 		off = tmpfs_dirent_cookie(de);
 
 	/* Update the offset and cache. */
 	uio->uio_offset = off;
 	node->tn_dir.tn_readdir_lastn = off;
 	node->tn_dir.tn_readdir_lastp = de;
 
 	tmpfs_set_status(tm, node, TMPFS_NODE_ACCESSED);
 	return error;
 }
 
 /*
  * Adds a whiteout, i.e. a directory entry without a node, named after
  * 'cnp' to the directory 'dvp'.  Returns zero on success or the error
  * reported by tmpfs_alloc_dirent().
  */
 int
 tmpfs_dir_whiteout_add(struct vnode *dvp, struct componentname *cnp)
 {
 	struct tmpfs_dirent *wh;
 	int error;
 
 	error = tmpfs_alloc_dirent(VFS_TO_TMPFS(dvp->v_mount), NULL,
 	    cnp->cn_nameptr, cnp->cn_namelen, &wh);
 	if (error == 0)
 		tmpfs_dir_attach(dvp, wh);
 	return (error);
 }
 
 /*
  * Removes the whiteout named by 'cnp' from the directory 'dvp' and frees
  * its directory entry.  The whiteout is expected to exist.
  */
 void
 tmpfs_dir_whiteout_remove(struct vnode *dvp, struct componentname *cnp)
 {
 	struct tmpfs_dirent *wh;
 
 	wh = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp);
 	/* A whiteout is an existing entry that refers to no node. */
 	MPASS(wh != NULL && wh->td_node == NULL);
 
 	tmpfs_dir_detach(dvp, wh);
 	tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), wh);
 }
 
 /*
  * Resizes the aobj associated with the regular file pointed to by 'vp' to the
  * size 'newsize'.  'vp' must point to a vnode that represents a regular file.
  * 'newsize' must be non-negative.
  *
  * 'ignerr' selects what happens when the page holding the new end of file
  * cannot be read back from the pager while shrinking: if set, zeroing of
  * that page is skipped and the resize continues; otherwise EIO is returned
  * and the sizes are left unchanged.
  *
  * Returns zero on success or an appropriate error code on failure
  * (ENOSPC when growing and not enough pages are available, EIO as above).
  */
 int
 tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr)
 {
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *node;
 	vm_object_t uobj;
 	vm_page_t m;
 	vm_pindex_t idx, newpages, oldpages;
 	off_t oldsize;
 	int base, rv;
 
 	MPASS(vp->v_type == VREG);
 	MPASS(newsize >= 0);
 
 	node = VP_TO_TMPFS_NODE(vp);
 	uobj = node->tn_reg.tn_aobj;
 	tmp = VFS_TO_TMPFS(vp->v_mount);
 
 	/*
 	 * Convert the old and new sizes to the number of pages needed to
 	 * store them.  It may happen that we do not need to do anything
 	 * because the last allocated page can accommodate the change on
 	 * its own.
 	 */
 	oldsize = node->tn_size;
 	oldpages = OFF_TO_IDX(oldsize + PAGE_MASK);
 	MPASS(oldpages == uobj->size);
 	newpages = OFF_TO_IDX(newsize + PAGE_MASK);
 
 	/* Growing (or no change) within the same number of pages. */
 	if (__predict_true(newpages == oldpages && newsize >= oldsize)) {
 		node->tn_size = newsize;
 		return (0);
 	}
 
 	/* Growing by whole pages: make sure the mount can afford them. */
 	if (newpages > oldpages &&
 	    tmpfs_pages_check_avail(tmp, newpages - oldpages) == 0)
 		return (ENOSPC);
 
 	VM_OBJECT_WLOCK(uobj);
 	if (newsize < oldsize) {
 		/*
 		 * Zero the truncated part of the last page.
 		 */
 		base = newsize & PAGE_MASK;
 		if (base != 0) {
 			idx = OFF_TO_IDX(newsize);
 retry:
 			m = vm_page_grab(uobj, idx, VM_ALLOC_NOCREAT);
 			if (m != NULL) {
 				MPASS(vm_page_all_valid(m));
 			} else if (vm_pager_has_page(uobj, idx, NULL, NULL)) {
 				/*
 				 * No page was grabbed but the pager has the
 				 * data: allocate a page and read it in.
 				 */
 				m = vm_page_alloc(uobj, idx, VM_ALLOC_NORMAL |
 				    VM_ALLOC_WAITFAIL);
 				if (m == NULL)
 					goto retry;
 				/*
 				 * The object lock is dropped around the pager
 				 * call; the paging-in-progress count is held
 				 * across that window.
 				 */
 				vm_object_pip_add(uobj, 1);
 				VM_OBJECT_WUNLOCK(uobj);
 				rv = vm_pager_get_pages(uobj, &m, 1, NULL,
 				    NULL);
 				VM_OBJECT_WLOCK(uobj);
 				vm_object_pip_wakeup(uobj);
 				if (rv == VM_PAGER_OK) {
 					/*
 					 * Since the page was not resident,
 					 * and therefore not recently
 					 * accessed, immediately enqueue it
 					 * for asynchronous laundering.  The
 					 * current operation is not regarded
 					 * as an access.
 					 */
 					vm_page_launder(m);
 				} else {
 					/* Page-in failed: drop the new page. */
 					vm_page_free(m);
 					if (ignerr)
 						m = NULL;
 					else {
 						VM_OBJECT_WUNLOCK(uobj);
 						return (EIO);
 					}
 				}
 			}
 			if (m != NULL) {
 				/* Clear everything from the new EOF onwards. */
 				pmap_zero_page_area(m, base, PAGE_SIZE - base);
 				vm_page_set_dirty(m);
 				vm_page_xunbusy(m);
 			}
 		}
 
 		/*
 		 * Release any swap space and free any whole pages.
 		 */
 		if (newpages < oldpages)
 			vm_object_page_remove(uobj, newpages, 0, 0);
 	}
 	uobj->size = newpages;
 	VM_OBJECT_WUNLOCK(uobj);
 
 	/* Adjust the mount-wide page count by the change in pages. */
 	atomic_add_long(&tmp->tm_pages_used, newpages - oldpages);
 
 	node->tn_size = newsize;
 	return (0);
 }
 
 /*
  * For a regular file, compares the generation counters of the backing VM
  * object; when they differ, the object is marked clean again and the node
  * gets TMPFS_NODE_MODIFIED and TMPFS_NODE_CHANGED set.  Vnodes of any other
  * type are ignored.  'vp' must be exclusively locked.
  */
 void
 tmpfs_check_mtime(struct vnode *vp)
 {
 	struct tmpfs_node *node;
 	struct vm_object *obj;
 
 	ASSERT_VOP_ELOCKED(vp, "check_mtime");
 	if (vp->v_type != VREG)
 		return;
 
 	obj = vp->v_object;
 	KASSERT((obj->flags & (OBJ_TMPFS_NODE | OBJ_TMPFS)) ==
 	    (OBJ_TMPFS_NODE | OBJ_TMPFS), ("non-tmpfs obj"));
 
 	/* Unlocked peek first; the object lock is taken only if needed. */
 	if (obj->generation == obj->cleangeneration)
 		return;
 
 	VM_OBJECT_WLOCK(obj);
 	/* Look again now that the object is locked. */
 	if (obj->generation != obj->cleangeneration) {
 		obj->cleangeneration = obj->generation;
 		node = VP_TO_TMPFS_NODE(vp);
 		node->tn_status |= TMPFS_NODE_MODIFIED |
 		    TMPFS_NODE_CHANGED;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 }
 
 /*
  * Replaces the file flags of the given vnode with 'flags'.
  * Returns EOPNOTSUPP for flags outside the supported set, EROFS on a
  * read-only mount, EPERM or another error when the caller lacks the
  * required rights, and zero on success.
  * Caller should execute tmpfs_update on vp after a successful execution.
  * The vnode must be locked on entry and remain locked on exit.
  */
 int
 tmpfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred,
     struct thread *p)
 {
 	struct tmpfs_node *node;
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, "chflags");
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/* Reject any flag that is not in the supported set. */
 	if ((flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK |
 	    UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP |
 	    UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE |
 	    UF_SPARSE | UF_SYSTEM)) != 0)
 		return (EOPNOTSUPP);
 
 	/* Nothing may be changed on a read-only mount. */
 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 
 	/* Changing flags requires VADMIN rights on the object. */
 	error = VOP_ACCESS(vp, VADMIN, cred, p);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Without PRIV_VFS_SYSFLAGS, system flags may neither be changed
 	 * nor be present on the file.  With the privilege, a file carrying
 	 * system flags is additionally subject to the securelevel check.
 	 */
 	if (priv_check_cred(cred, PRIV_VFS_SYSFLAGS) != 0) {
 		if (node->tn_flags &
 		    (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) ||
 		    ((flags ^ node->tn_flags) & SF_SETTABLE))
 			return (EPERM);
 	} else if (node->tn_flags &
 	    (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) {
 		error = securelevel_gt(cred, 0);
 		if (error != 0)
 			return (error);
 	}
 
 	node->tn_flags = flags;
 	node->tn_status |= TMPFS_NODE_CHANGED;
 
 	ASSERT_VOP_ELOCKED(vp, "chflags2");
 
 	return (0);
 }
 
 /*
  * Replaces the permission bits (ALLPERMS) of the given vnode's mode with
  * those of 'mode'; the remaining mode bits are kept.
  * Caller should execute tmpfs_update on vp after a successful execution.
  * The vnode must be locked on entry and remain locked on exit.
  */
 int
 tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, struct thread *p)
 {
 	struct tmpfs_node *node;
 	mode_t perm;
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, "chmod");
 	ASSERT_VOP_IN_SEQC(vp);
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/* Nothing may be changed on a read-only mount. */
 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 
 	/* Immutable and append-only files keep their mode. */
 	if (node->tn_flags & (IMMUTABLE | APPEND))
 		return (EPERM);
 
 	/* Changing the mode requires VADMIN rights on the file. */
 	error = VOP_ACCESS(vp, VADMIN, cred, p);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Two requests need extra privilege: the sticky bit on anything
 	 * that is not a directory, and the setgid bit when the caller does
 	 * not belong to the file's group.
 	 */
 	if (vp->v_type != VDIR && (mode & S_ISTXT) &&
 	    priv_check_cred(cred, PRIV_VFS_STICKYFILE))
 		return (EFTYPE);
 	if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID)) {
 		error = priv_check_cred(cred, PRIV_VFS_SETGID);
 		if (error != 0)
 			return (error);
 	}
 
 	perm = (node->tn_mode & ~ALLPERMS) | (mode & ALLPERMS);
 	atomic_store_short(&node->tn_mode, perm);
 
 	node->tn_status |= TMPFS_NODE_CHANGED;
 
 	ASSERT_VOP_ELOCKED(vp, "chmod2");
 
 	return (0);
 }
 
 /*
  * Changes the owner and/or group of the given vnode.  At least one of
  * 'uid' and 'gid' must differ from VNOVAL; an argument equal to VNOVAL
  * leaves the corresponding attribute as it is.  When ownership really
  * changes, the setuid and setgid bits are dropped unless the caller holds
  * PRIV_VFS_RETAINSUGID.
  * Caller should execute tmpfs_update on vp after a successful execution.
  * The vnode must be locked on entry and remain locked on exit.
  */
 int
 tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred,
     struct thread *p)
 {
 	struct tmpfs_node *node;
 	uid_t prev_uid;
 	gid_t prev_gid;
 	mode_t stripped;
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, "chown");
 	ASSERT_VOP_IN_SEQC(vp);
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/* Fill in whichever id the caller left unspecified. */
 	MPASS(uid != VNOVAL || gid != VNOVAL);
 	if (uid == VNOVAL)
 		uid = node->tn_uid;
 	if (gid == VNOVAL)
 		gid = node->tn_gid;
 	MPASS(uid != VNOVAL && gid != VNOVAL);
 
 	/* Nothing may be changed on a read-only mount. */
 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 
 	/* Immutable and append-only files keep their ownership. */
 	if (node->tn_flags & (IMMUTABLE | APPEND))
 		return (EPERM);
 
 	/* Changing ownership requires VADMIN rights on the file. */
 	error = VOP_ACCESS(vp, VADMIN, cred, p);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * A new owner, or a new group the caller does not belong to,
 	 * additionally requires PRIV_VFS_CHOWN.
 	 */
 	if (uid != node->tn_uid ||
 	    (gid != node->tn_gid && !groupmember(gid, cred))) {
 		error = priv_check_cred(cred, PRIV_VFS_CHOWN);
 		if (error != 0)
 			return (error);
 	}
 
 	prev_gid = node->tn_gid;
 	prev_uid = node->tn_uid;
 
 	node->tn_uid = uid;
 	node->tn_gid = gid;
 
 	node->tn_status |= TMPFS_NODE_CHANGED;
 
 	if ((node->tn_mode & (S_ISUID | S_ISGID)) &&
 	    (prev_uid != uid || prev_gid != gid) &&
 	    priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) {
 		stripped = node->tn_mode & ~(S_ISUID | S_ISGID);
 		atomic_store_short(&node->tn_mode, stripped);
 	}
 
 	ASSERT_VOP_ELOCKED(vp, "chown2");
 
 	return (0);
 }
 
 /*
  * Changes the size of the given vnode to 'size'.  Only regular files are
  * actually resized; the request is silently accepted for block devices,
  * character devices and fifos, fails with EISDIR for directories and with
  * EOPNOTSUPP for every other type.
  * Caller should execute tmpfs_update on vp after a successful execution.
  * The vnode must be locked on entry and remain locked on exit.
  */
 int
 tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred,
     struct thread *p)
 {
 	struct tmpfs_node *node;
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, "chsize");
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/* The file type decides whether the operation makes sense. */
 	switch (vp->v_type) {
 	case VREG:
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		break;
 
 	case VDIR:
 		return (EISDIR);
 
 	case VBLK:
 	case VCHR:
 	case VFIFO:
 		/*
 		 * Special files are accepted even on a read-only mount: the
 		 * request concerns the object they stand for, not the file
 		 * stored in this file system.
 		 */
 		return (0);
 
 	default:
 		/* Anything else is unsupported. */
 		return (EOPNOTSUPP);
 	}
 
 	/* Immutable and append-only files keep their size. */
 	if (node->tn_flags & (IMMUTABLE | APPEND))
 		return (EPERM);
 
 	/*
 	 * tmpfs_truncate raises the NOTE_EXTEND and NOTE_ATTRIB kevents and
 	 * updates tn_status itself, so neither is done here.
 	 */
 	error = tmpfs_truncate(vp, size);
 
 	ASSERT_VOP_ELOCKED(vp, "chsize2");
 
 	return (error);
 }
 
 /*
  * Changes the access, modification and birth times of the given vnode
  * according to the fields of 'vap' that are not VNOVAL.
  * Caller should execute tmpfs_update on vp after a successful execution.
  * The vnode must be locked on entry and remain locked on exit.
  */
 int
 tmpfs_chtimes(struct vnode *vp, struct vattr *vap,
     struct ucred *cred, struct thread *l)
 {
 	struct tmpfs_node *node;
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, "chtimes");
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/* Nothing may be changed on a read-only mount. */
 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 
 	/* Immutable and append-only files keep their timestamps. */
 	if (node->tn_flags & (IMMUTABLE | APPEND))
 		return (EPERM);
 
 	error = vn_utimes_perm(vp, vap, cred, l);
 	if (error != 0)
 		return (error);
 
 	/* Flag which timestamps tmpfs_itimes() has to store. */
 	if (vap->va_atime.tv_sec != VNOVAL)
 		node->tn_status |= TMPFS_NODE_ACCESSED;
 	if (vap->va_mtime.tv_sec != VNOVAL ||
 	    vap->va_birthtime.tv_sec != VNOVAL)
 		node->tn_status |= TMPFS_NODE_MODIFIED;
 
 	tmpfs_itimes(vp, &vap->va_atime, &vap->va_mtime);
 
 	/* The birth time is stored directly, after the other timestamps. */
 	if (vap->va_birthtime.tv_sec != VNOVAL)
 		node->tn_birthtime = vap->va_birthtime;
 	ASSERT_VOP_ELOCKED(vp, "chtimes2");
 
 	return (0);
 }
 
 /*
  * Sets the bits of 'status' in node->tn_status under the node lock.
  * Nothing is done when all of them are set already or when the mount is
  * read-only (tm_ronly).
  */
 void
 tmpfs_set_status(struct tmpfs_mount *tm, struct tmpfs_node *node, int status)
 {
 
 	/* The lock is taken only when something actually has to change. */
 	if ((node->tn_status & status) != status && !tm->tm_ronly) {
 		TMPFS_NODE_LOCK(node);
 		node->tn_status |= status;
 		TMPFS_NODE_UNLOCK(node);
 	}
 }
 
 /*
  * Brings the timestamps of the node behind 'vp' up to date according to
  * the pending TMPFS_NODE_ACCESSED, TMPFS_NODE_MODIFIED and
  * TMPFS_NODE_CHANGED bits, then clears those bits.  'acc' and 'mod' give
  * the access and modification times to store; a NULL pointer selects the
  * current time.  The change time is always set to the current time.
  */
 void
 tmpfs_itimes(struct vnode *vp, const struct timespec *acc,
     const struct timespec *mod)
 {
 	struct tmpfs_node *node;
 	struct timespec now;
 
 	ASSERT_VOP_LOCKED(vp, "tmpfs_itimes");
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/* Nothing pending: leave without taking the node lock. */
 	if ((node->tn_status & (TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED |
 	    TMPFS_NODE_CHANGED)) == 0)
 		return;
 
 	vfs_timestamp(&now);
 
 	TMPFS_NODE_LOCK(node);
 	if (node->tn_status & TMPFS_NODE_ACCESSED)
 		node->tn_atime = (acc != NULL) ? *acc : now;
 	if (node->tn_status & TMPFS_NODE_MODIFIED)
 		node->tn_mtime = (mod != NULL) ? *mod : now;
 	if (node->tn_status & TMPFS_NODE_CHANGED)
 		node->tn_ctime = now;
 	node->tn_status &= ~(TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED |
 	    TMPFS_NODE_CHANGED);
 	TMPFS_NODE_UNLOCK(node);
 
 	/* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */
 	random_harvest_queue(node, sizeof(*node), RANDOM_FS_ATIME);
 }
 
 /*
  * Sets the size of the file behind 'vp' to 'length'.  Returns EINVAL for
  * a negative length, EFBIG for a length above the mount's maximum file
  * size, otherwise the result of tmpfs_reg_resize().  tmpfs_update() is
  * called on every path except the EFBIG one.
  */
 int
 tmpfs_truncate(struct vnode *vp, off_t length)
 {
 	struct tmpfs_node *node;
 	int error;
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	if (length < 0) {
 		error = EINVAL;
 	} else if (node->tn_size == length) {
 		/* Already the requested size. */
 		error = 0;
 	} else {
 		if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize)
 			return (EFBIG);
 
 		error = tmpfs_reg_resize(vp, length, FALSE);
 		if (error == 0)
 			node->tn_status |= TMPFS_NODE_CHANGED |
 			    TMPFS_NODE_MODIFIED;
 	}
 
 	tmpfs_update(vp);
 
 	return (error);
 }
 
 /*
  * Ordering function of the directory RB-tree: compares two entries by
  * their td_hash and returns 1, -1 or 0.
  */
 static __inline int
 tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b)
 {
 	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
 	return ((a->td_hash > b->td_hash) - (a->td_hash < b->td_hash));
 }
 
 RB_GENERATE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp);
Index: projects/clang1100-import/sys/fs/tmpfs/tmpfs_vfsops.c
===================================================================
--- projects/clang1100-import/sys/fs/tmpfs/tmpfs_vfsops.c	(revision 364278)
+++ projects/clang1100-import/sys/fs/tmpfs/tmpfs_vfsops.c	(revision 364279)
@@ -1,690 +1,689 @@
 /*	$NetBSD: tmpfs_vfsops.c,v 1.10 2005/12/11 12:24:29 christos Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-NetBSD
  *
  * Copyright (c) 2005 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
  * 2005 program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * Efficient memory file system.
  *
  * tmpfs is a file system that uses FreeBSD's virtual memory
  * sub-system to store file data and metadata in an efficient way.
  * This means that it does not follow the structure of an on-disk file
  * system because it simply does not need to.  Instead, it uses
  * memory-specific data structures and algorithms to automatically
  * allocate and release resources.
  */
 
 #include "opt_tmpfs.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/dirent.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/rwlock.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_param.h>
 
 #include <fs/tmpfs/tmpfs.h>
 
 /*
  * Default permission for root node
  */
 #define TMPFS_DEFAULT_ROOT_MODE	(S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH)
 
 MALLOC_DEFINE(M_TMPFSMNT, "tmpfs mount", "tmpfs mount structures");
 MALLOC_DEFINE(M_TMPFSNAME, "tmpfs name", "tmpfs file names");
 
 static int	tmpfs_mount(struct mount *);
 static int	tmpfs_unmount(struct mount *, int);
 static int	tmpfs_root(struct mount *, int flags, struct vnode **);
 static int	tmpfs_fhtovp(struct mount *, struct fid *, int,
 		    struct vnode **);
 static int	tmpfs_statfs(struct mount *, struct statfs *);
 
 /*
  * NULL-terminated list of option names that tmpfs_mount() passes to
  * vfs_filteropt(); a non-zero result there makes the mount fail with EINVAL.
  */
 static const char *tmpfs_opts[] = {
 	"from", "size", "maxfilesize", "inodes", "uid", "gid", "mode", "export",
 	"union", "nonc", "nomtime", NULL
 };
 
 /*
  * NULL-terminated list of option names checked by tmpfs_mount() on the
  * MNT_UPDATE path; a non-zero vfs_filteropt() result there yields EOPNOTSUPP.
  */
 static const char *tmpfs_updateopts[] = {
 	"from", "export", "nomtime", "size", NULL
 };
 
 /*
  * Handle updates of time from writes to mmaped regions, if allowed.
  * Use MNT_VNODE_FOREACH_ALL instead of MNT_VNODE_FOREACH_LAZY, since
  * unmap of the tmpfs-backed vnode does not call vinactive(), because
  * the vm object type is OBJT_SWAP.  If lazy, only handle delayed update
  * of mtime due to the writes to mapped files.
  *
  * Returns immediately when the mount has tm_nomtime set.
  */
 static void
 tmpfs_update_mtime(struct mount *mp, bool lazy)
 {
 	struct vnode *vp, *mvp;
 	struct vm_object *obj;
 
 	if (VFS_TO_TMPFS(mp)->tm_nomtime)
 		return;
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		/* Only regular files are examined; release and move on. */
 		if (vp->v_type != VREG) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		obj = vp->v_object;
 		KASSERT((obj->flags & (OBJ_TMPFS_NODE | OBJ_TMPFS)) ==
 		    (OBJ_TMPFS_NODE | OBJ_TMPFS), ("non-tmpfs obj"));
 
 		/*
 		 * In lazy case, do unlocked read, avoid taking vnode
 		 * lock if not needed.  Lost update will be handled on
 		 * the next call.
 		 * For non-lazy case, we must flush all pending
 		 * metadata changes now.
 		 */
 		if (!lazy || obj->generation != obj->cleangeneration) {
-			if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK,
-			    curthread) != 0)
+			if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK) != 0)
 				continue;
 			tmpfs_check_mtime(vp);
 			/* Non-lazy callers also push the node's pending updates. */
 			if (!lazy)
 				tmpfs_update(vp);
 			vput(vp);
 		} else {
 			VI_UNLOCK(vp);
 			continue;
 		}
 	}
 }
 
 /*
  * Result cell handed to tmpfs_check_rw_maps_cb() through the callback
  * argument of tmpfs_all_rw_maps().
  */
 struct tmpfs_check_rw_maps_arg {
 	bool found;	/* set to true by the callback when it is invoked */
 };
 
 /*
  * tmpfs_all_rw_maps() callback used to detect writable mappings: records
  * the hit in the caller-supplied struct tmpfs_check_rw_maps_arg and
  * returns true, which makes the walker stop.
  */
 static bool
 tmpfs_check_rw_maps_cb(struct mount *mp __unused, vm_map_t map __unused,
     vm_map_entry_t entry __unused, void *arg)
 {
 	struct tmpfs_check_rw_maps_arg *result = arg;
 
 	result->found = true;
 	return (true);
 }
 
 /*
  * Revoke write permissions from all mappings of regular files
  * belonging to the specified tmpfs mount.
  *
  * Callback for tmpfs_all_rw_maps(); always returns false so that the
  * walk continues over the remaining map entries.
  */
 static bool
 tmpfs_revoke_rw_maps_cb(struct mount *mp __unused, vm_map_t map,
     vm_map_entry_t entry, void *arg __unused)
 {
 	bool writable;
 
 	/*
 	 * XXXKIB: might it be better to invalidate the mapping
 	 * instead?  The process is not going to be happy in any case.
 	 */
 	writable = (entry->protection & VM_PROT_WRITE) != 0;
 	entry->max_protection &= ~VM_PROT_WRITE;
 	if (!writable)
 		return (false);
 
 	/* Drop the current write permission and apply it to the pmap. */
 	entry->protection &= ~VM_PROT_WRITE;
 	pmap_protect(map->pmap, entry->start, entry->end, entry->protection);
 	return (false);
 }
 
 /*
  * Walk the address space of every eligible process and call cb for each
  * map entry that may be written (max_protection includes VM_PROT_WRITE),
  * is not a guard, submap or COW entry, and is backed by a tmpfs object
  * whose vnode belongs to mp.  The walk stops as soon as cb returns true.
  * If allproc_gen changed during the scan and the walk was not terminated,
  * the whole scan is repeated.
  */
 static void
 tmpfs_all_rw_maps(struct mount *mp, bool (*cb)(struct mount *mp, vm_map_t,
     vm_map_entry_t, void *), void *cb_arg)
 {
 	struct proc *p;
 	struct vmspace *vm;
 	vm_map_t map;
 	vm_map_entry_t entry;
 	vm_object_t object;
 	struct vnode *vp;
 	int gen;
 	bool terminate;
 
 	terminate = false;
 	sx_slock(&allproc_lock);
 again:
 	/* Remember the generation so a changed process list is noticed below. */
 	gen = allproc_gen;
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		/* Skip processes that are not in normal state or are flagged. */
 		if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
 		    P_SYSTEM | P_WEXIT)) != 0) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		vm = vmspace_acquire_ref(p);
 		_PHOLD_LITE(p);
 		PROC_UNLOCK(p);
 		if (vm == NULL) {
 			PRELE(p);
 			continue;
 		}
 		/* allproc_lock is dropped while this process' map is examined. */
 		sx_sunlock(&allproc_lock);
 		map = &vm->vm_map;
 
 		vm_map_lock(map);
 		if (map->busy)
 			vm_map_wait_busy(map);
 		VM_MAP_ENTRY_FOREACH(entry, map) {
 			if ((entry->eflags & (MAP_ENTRY_GUARD |
 			    MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_COW)) != 0 ||
 			    (entry->max_protection & VM_PROT_WRITE) == 0)
 				continue;
 			object = entry->object.vm_object;
 			if (object == NULL || object->type != OBJT_SWAP ||
 			    (object->flags & OBJ_TMPFS_NODE) == 0)
 				continue;
 			/*
 			 * No need to dig into shadow chain, mapping
 			 * of the object not at top is readonly.
 			 */
 
 			/* Re-check type and flags with the object lock held. */
 			VM_OBJECT_RLOCK(object);
 			if (object->type == OBJT_DEAD) {
 				VM_OBJECT_RUNLOCK(object);
 				continue;
 			}
 			MPASS(object->ref_count > 1);
 			if ((object->flags & (OBJ_TMPFS_NODE | OBJ_TMPFS)) !=
 			    (OBJ_TMPFS_NODE | OBJ_TMPFS)) {
 				VM_OBJECT_RUNLOCK(object);
 				continue;
 			}
 			vp = object->un_pager.swp.swp_tmpfs;
 			/* Ignore tmpfs objects that belong to a different mount. */
 			if (vp->v_mount != mp) {
 				VM_OBJECT_RUNLOCK(object);
 				continue;
 			}
 
 			terminate = cb(mp, map, entry, cb_arg);
 			VM_OBJECT_RUNLOCK(object);
 			if (terminate)
 				break;
 		}
 		vm_map_unlock(map);
 
 		vmspace_free(vm);
 		/* Retake allproc_lock before releasing the hold and iterating. */
 		sx_slock(&allproc_lock);
 		PRELE(p);
 		if (terminate)
 			break;
 	}
 	if (!terminate && gen != allproc_gen)
 		goto again;
 	sx_sunlock(&allproc_lock);
 }
 
 static bool
 tmpfs_check_rw_maps(struct mount *mp)
 {
 	struct tmpfs_check_rw_maps_arg ca;
 
 	ca.found = false;
 	tmpfs_all_rw_maps(mp, tmpfs_check_rw_maps_cb, &ca);
 	return (ca.found);
 }
 
 /*
  * Switch a mounted tmpfs from read-write to read-only.
  *
  * Writes are suspended for the duration of the transition and resumed at
  * "out".  Without MNT_FORCE the call fails with EBUSY when
  * tmpfs_check_rw_maps() reports a writable mapping.  Otherwise the mount
  * is flagged read-only, and write access is revoked from mappings and
  * vnodes are flushed repeatedly until no writable mapping is left.  If
  * vflush() fails, the read-only flags are rolled back and its error is
  * returned.
  */
 static int
 tmpfs_rw_to_ro(struct mount *mp)
 {
 	int error, flags;
 	bool forced;
 
 	forced = (mp->mnt_flag & MNT_FORCE) != 0;
 	flags = WRITECLOSE | (forced ? FORCECLOSE : 0);
 
 	if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
 		return (error);
 	error = vfs_write_suspend_umnt(mp);
 	if (error != 0)
 		return (error);
 	if (!forced && tmpfs_check_rw_maps(mp)) {
 		error = EBUSY;
 		goto out;
 	}
 	/* Mark both the tmpfs-private and the generic mount state read-only. */
 	VFS_TO_TMPFS(mp)->tm_ronly = 1;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_RDONLY;
 	MNT_IUNLOCK(mp);
 	for (;;) {
 		tmpfs_all_rw_maps(mp, tmpfs_revoke_rw_maps_cb, NULL);
 		tmpfs_update_mtime(mp, false);
 		error = vflush(mp, 0, flags, curthread);
 		if (error != 0) {
 			/* Flush failed: undo the read-only marking. */
 			VFS_TO_TMPFS(mp)->tm_ronly = 0;
 			MNT_ILOCK(mp);
 			mp->mnt_flag &= ~MNT_RDONLY;
 			MNT_IUNLOCK(mp);
 			goto out;
 		}
 		/* Repeat while a writable mapping is still reported. */
 		if (!tmpfs_check_rw_maps(mp))
 			break;
 	}
 out:
 	vfs_write_resume(mp, 0);
 	return (error);
 }
 
 /*
  * Mount, or update the mount of, a tmpfs file system.
  *
  * Update mounts (MNT_UPDATE) only accept the options in tmpfs_updateopts;
  * they may toggle read-only/read-write and "nomtime", but cannot change
  * "size".  A fresh mount parses the size, node and root-attribute options,
  * allocates the tmpfs_mount structure and the root node, and publishes the
  * structure in mp->mnt_data.
  *
  * Returns 0 on success; EINVAL for an unknown option; EOPNOTSUPP for an
  * option or size change not supported on update; ENOSPC when fewer than
  * TMPFS_PAGES_MINRESERVED pages are available; or the error reported by
  * VOP_GETATTR(), tmpfs_rw_to_ro() or tmpfs_alloc_node().
  */
 static int
 tmpfs_mount(struct mount *mp)
 {
 	const size_t nodes_per_page = howmany(PAGE_SIZE,
 	    sizeof(struct tmpfs_dirent) + sizeof(struct tmpfs_node));
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *root;
 	int error;
 	bool nomtime, nonc;
 	/* Size counters. */
 	u_quad_t pages;
 	off_t nodes_max, size_max, maxfilesize;
 
 	/* Root node attributes. */
 	uid_t root_uid;
 	gid_t root_gid;
 	mode_t root_mode;
 
 	struct vattr va;
 
 	if (vfs_filteropt(mp->mnt_optnew, tmpfs_opts))
 		return (EINVAL);
 
 	if (mp->mnt_flag & MNT_UPDATE) {
 		/* Only support update mounts for certain options. */
 		if (vfs_filteropt(mp->mnt_optnew, tmpfs_updateopts) != 0)
 			return (EOPNOTSUPP);
 		tmp = VFS_TO_TMPFS(mp);
 		if (vfs_getopt_size(mp->mnt_optnew, "size", &size_max) == 0) {
 			/*
 			 * On-the-fly resizing is not supported (yet). We still
 			 * need to have "size" listed as "supported", otherwise
 			 * trying to update fs that is listed in fstab with size
 			 * parameter, say trying to change rw to ro or vice
 			 * versa, would cause vfs_filteropt() to bail.
 			 */
 			if (size_max != tmp->tm_size_max)
 				return (EOPNOTSUPP);
 		}
 		if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) &&
 		    !tmp->tm_ronly) {
 			/* RW -> RO */
 			return (tmpfs_rw_to_ro(mp));
 		} else if (!vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) &&
 		    tmp->tm_ronly) {
 			/* RO -> RW */
 			tmp->tm_ronly = 0;
 			MNT_ILOCK(mp);
 			mp->mnt_flag &= ~MNT_RDONLY;
 			MNT_IUNLOCK(mp);
 		}
 		tmp->tm_nomtime = vfs_getopt(mp->mnt_optnew, "nomtime", NULL,
 		    0) == 0;
 		MNT_ILOCK(mp);
 		if ((mp->mnt_flag & MNT_UNION) == 0) {
 			mp->mnt_kern_flag |= MNTK_FPLOOKUP;
 		} else {
 			mp->mnt_kern_flag &= ~MNTK_FPLOOKUP;
 		}
 		MNT_IUNLOCK(mp);
 		return (0);
 	}
 
 	/* The covered vnode's attributes supply the root node defaults. */
 	vn_lock(mp->mnt_vnodecovered, LK_SHARED | LK_RETRY);
 	error = VOP_GETATTR(mp->mnt_vnodecovered, &va, mp->mnt_cred);
 	VOP_UNLOCK(mp->mnt_vnodecovered);
 	if (error)
 		return (error);
 
 	/*
 	 * The "gid", "uid" and "mode" options are honoured only when the
 	 * mounting credential has real uid 0; otherwise the attributes of
 	 * the covered vnode are used.
 	 */
 	if (mp->mnt_cred->cr_ruid != 0 ||
 	    vfs_scanopt(mp->mnt_optnew, "gid", "%d", &root_gid) != 1)
 		root_gid = va.va_gid;
 	if (mp->mnt_cred->cr_ruid != 0 ||
 	    vfs_scanopt(mp->mnt_optnew, "uid", "%d", &root_uid) != 1)
 		root_uid = va.va_uid;
 	if (mp->mnt_cred->cr_ruid != 0 ||
 	    vfs_scanopt(mp->mnt_optnew, "mode", "%ho", &root_mode) != 1)
 		root_mode = va.va_mode;
 	if (vfs_getopt_size(mp->mnt_optnew, "inodes", &nodes_max) != 0)
 		nodes_max = 0;
 	if (vfs_getopt_size(mp->mnt_optnew, "size", &size_max) != 0)
 		size_max = 0;
 	if (vfs_getopt_size(mp->mnt_optnew, "maxfilesize", &maxfilesize) != 0)
 		maxfilesize = 0;
 	nonc = vfs_getopt(mp->mnt_optnew, "nonc", NULL, NULL) == 0;
 	nomtime = vfs_getopt(mp->mnt_optnew, "nomtime", NULL, NULL) == 0;
 
 	/* Do not allow mounts if we do not have enough memory to preserve
 	 * the minimum reserved pages. */
 	if (tmpfs_mem_avail() < TMPFS_PAGES_MINRESERVED)
 		return (ENOSPC);
 
 	/* Get the maximum number of memory pages this file system is
 	 * allowed to use, based on the maximum size the user passed in
 	 * the mount structure.  A value of zero is treated as if the
 	 * maximum available space was requested. */
 	if (size_max == 0 || size_max > OFF_MAX - PAGE_SIZE ||
 	    (SIZE_MAX < OFF_MAX && size_max / PAGE_SIZE >= SIZE_MAX))
 		pages = SIZE_MAX;
 	else {
 		size_max = roundup(size_max, PAGE_SIZE);
 		pages = howmany(size_max, PAGE_SIZE);
 	}
 	MPASS(pages > 0);
 
 	/* Too-small node limits are replaced by one derived from the pages. */
 	if (nodes_max <= 3) {
 		if (pages < INT_MAX / nodes_per_page)
 			nodes_max = pages * nodes_per_page;
 		else
 			nodes_max = INT_MAX;
 	}
 	if (nodes_max > INT_MAX)
 		nodes_max = INT_MAX;
 	MPASS(nodes_max >= 3);
 
 	/* Allocate the tmpfs mount structure and fill it. */
 	tmp = (struct tmpfs_mount *)malloc(sizeof(struct tmpfs_mount),
 	    M_TMPFSMNT, M_WAITOK | M_ZERO);
 
 	mtx_init(&tmp->tm_allnode_lock, "tmpfs allnode lock", NULL, MTX_DEF);
 	tmp->tm_nodes_max = nodes_max;
 	tmp->tm_nodes_inuse = 0;
 	tmp->tm_refcount = 1;
 	tmp->tm_maxfilesize = maxfilesize > 0 ? maxfilesize : OFF_MAX;
 	LIST_INIT(&tmp->tm_nodes_used);
 
 	tmp->tm_size_max = size_max;
 	tmp->tm_pages_max = pages;
 	tmp->tm_pages_used = 0;
 	new_unrhdr64(&tmp->tm_ino_unr, 2);
 	tmp->tm_ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 	tmp->tm_nonc = nonc;
 	tmp->tm_nomtime = nomtime;
 
 	/* Allocate the root node. */
 	error = tmpfs_alloc_node(mp, tmp, VDIR, root_uid, root_gid,
 	    root_mode & ALLPERMS, NULL, NULL, VNOVAL, &root);
 
 	if (error != 0 || root == NULL) {
 		/*
 		 * Tear down the mutex set up by mtx_init() above before
 		 * releasing its storage, as tmpfs_free_tmp() does.
 		 */
 		mtx_destroy(&tmp->tm_allnode_lock);
 		free(tmp, M_TMPFSMNT);
 		return (error);
 	}
 	KASSERT(root->tn_id == 2,
 	    ("tmpfs root with invalid ino: %ju", (uintmax_t)root->tn_id));
 	tmp->tm_root = root;
 
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED |
 	    MNTK_TEXT_REFS | MNTK_NOMSYNC;
 	if (!nonc && (mp->mnt_flag & MNT_UNION) == 0)
 		mp->mnt_kern_flag |= MNTK_FPLOOKUP;
 	MNT_IUNLOCK(mp);
 
 	mp->mnt_data = tmp;
 	mp->mnt_stat.f_namemax = MAXNAMLEN;
 	vfs_getnewfsid(mp);
 	vfs_mountedfrom(mp, "tmpfs");
 
 	return (0);
 }
 
 /*
  * Unmount a tmpfs file system.
  *
  * Suspends writes, flushes the mount's vnodes (forcibly when MNT_FORCE is
  * given), destroys every node still on tm_nodes_used and drops the mount
  * structure reference via tmpfs_free_tmp().  Returns 0 on success, EBUSY
  * when vnodes remain after a non-forced flush, or the error from
  * vfs_write_suspend_umnt() / vflush().
  */
 /* ARGSUSED2 */
 static int
 tmpfs_unmount(struct mount *mp, int mntflags)
 {
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *node;
 	int error, flags;
 
 	flags = (mntflags & MNT_FORCE) != 0 ? FORCECLOSE : 0;
 	tmp = VFS_TO_TMPFS(mp);
 
 	/* Stop writers */
 	error = vfs_write_suspend_umnt(mp);
 	if (error != 0)
 		return (error);
 	/*
 	 * At this point, nodes cannot be destroyed by any other
 	 * thread because write suspension is started.
 	 */
 
 	/* Flush until the mount's vnode list is empty. */
 	for (;;) {
 		error = vflush(mp, 0, flags, curthread);
 		if (error != 0) {
 			vfs_write_resume(mp, VR_START_WRITE);
 			return (error);
 		}
 		MNT_ILOCK(mp);
 		if (mp->mnt_nvnodelistsize == 0) {
 			MNT_IUNLOCK(mp);
 			break;
 		}
 		MNT_IUNLOCK(mp);
 		/* Vnodes remain: only a forced unmount keeps retrying. */
 		if ((mntflags & MNT_FORCE) == 0) {
 			vfs_write_resume(mp, VR_START_WRITE);
 			return (EBUSY);
 		}
 	}
 
 	TMPFS_LOCK(tmp);
 	while ((node = LIST_FIRST(&tmp->tm_nodes_used)) != NULL) {
 		TMPFS_NODE_LOCK(node);
 		if (node->tn_type == VDIR)
 			tmpfs_dir_destroy(tmp, node);
 		/*
 		 * NOTE(review): a true return presumably means the node was
 		 * freed and the locks were dropped, hence the relock of the
 		 * mount here -- confirm against tmpfs_free_node_locked().
 		 */
 		if (tmpfs_free_node_locked(tmp, node, true))
 			TMPFS_LOCK(tmp);
 		else
 			TMPFS_NODE_UNLOCK(node);
 	}
 
 	mp->mnt_data = NULL;
 	/* tmpfs_free_tmp() releases the TMPFS_LOCK taken above. */
 	tmpfs_free_tmp(tmp);
 	vfs_write_resume(mp, VR_START_WRITE);
 
 	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 
 	return (0);
 }
 
 /*
  * Drop one reference on the tmpfs mount structure.
  *
  * The function releases TMPFS_LOCK(tmp) on every path without taking it,
  * so the caller is expected to hold it on entry.  When the reference
  * count reaches zero the allnode mutex is destroyed and the structure is
  * freed.
  */
 void
 tmpfs_free_tmp(struct tmpfs_mount *tmp)
 {
 	bool still_referenced;
 
 	MPASS(tmp->tm_refcount > 0);
 	tmp->tm_refcount--;
 	still_referenced = tmp->tm_refcount > 0;
 	TMPFS_UNLOCK(tmp);
 	if (still_referenced)
 		return;
 
 	/* Last reference gone: tear the structure down. */
 	mtx_destroy(&tmp->tm_allnode_lock);
 	MPASS(tmp->tm_pages_used == 0);
 	MPASS(tmp->tm_nodes_inuse == 0);
 
 	free(tmp, M_TMPFSMNT);
 }
 
 /*
  * Obtain a vnode for the root node of the mount and tag it VV_ROOT.
  * Returns the error from tmpfs_alloc_vp(), in which case *vpp is not
  * touched here.
  */
 static int
 tmpfs_root(struct mount *mp, int flags, struct vnode **vpp)
 {
 	int error;
 
 	error = tmpfs_alloc_vp(mp, VFS_TO_TMPFS(mp)->tm_root, flags, vpp);
 	if (error != 0)
 		return (error);
 
 	(*vpp)->v_vflag |= VV_ROOT;
 	return (0);
 }
 
 /*
  * Translate a file handle into an exclusively locked vnode.
  *
  * The handle must carry a struct tmpfs_fid_data whose id is below the
  * mount's node limit and whose id/generation pair matches a node on the
  * in-use list; otherwise EINVAL is returned.  On a match the result of
  * tmpfs_alloc_vp() is returned.
  */
 static int
 tmpfs_fhtovp(struct mount *mp, struct fid *fhp, int flags,
     struct vnode **vpp)
 {
 	struct tmpfs_fid_data tfd;
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *node;
 	int error;
 
 	if (fhp->fid_len != sizeof(tfd))
 		return (EINVAL);
 
 	/*
 	 * Copy from fid_data onto the stack to avoid unaligned pointer use.
 	 * See the comment in sys/mount.h on struct fid for details.
 	 */
 	memcpy(&tfd, fhp->fid_data, fhp->fid_len);
 
 	tmp = VFS_TO_TMPFS(mp);
 	if (tfd.tfd_id >= tmp->tm_nodes_max)
 		return (EINVAL);
 
 	/* Find the node and take a reference while the list is locked. */
 	TMPFS_LOCK(tmp);
 	LIST_FOREACH(node, &tmp->tm_nodes_used, tn_entries) {
 		if (node->tn_id != tfd.tfd_id || node->tn_gen != tfd.tfd_gen)
 			continue;
 		tmpfs_ref_node(node);
 		break;
 	}
 	TMPFS_UNLOCK(tmp);
 
 	/* The loop leaves node NULL when nothing matched. */
 	if (node == NULL)
 		return (EINVAL);
 
 	error = tmpfs_alloc_vp(mp, node, LK_EXCLUSIVE, vpp);
 	tmpfs_free_node(tmp, node);
 	return (error);
 }
 
 /*
  * Fill in block and file counts for statfs(2).  Block sizes are reported
  * in units of PAGE_SIZE.  Always returns 0.
  */
 /* ARGSUSED2 */
 static int
 tmpfs_statfs(struct mount *mp, struct statfs *sbp)
 {
 	struct tmpfs_mount *tmp;
 	size_t used;
 
 	tmp = VFS_TO_TMPFS(mp);
 
 	sbp->f_bsize = PAGE_SIZE;
 	sbp->f_iosize = PAGE_SIZE;
 
 	/*
 	 * Blocks: a page limit of ULONG_MAX is reported as the pages in
 	 * use plus the memory currently available.
 	 */
 	used = tmpfs_pages_used(tmp);
 	if (tmp->tm_pages_max == ULONG_MAX)
 		sbp->f_blocks = used + tmpfs_mem_avail();
 	else
 		sbp->f_blocks = tmp->tm_pages_max;
 	if (sbp->f_blocks > used)
 		sbp->f_bavail = sbp->f_blocks - used;
 	else
 		sbp->f_bavail = 0;
 	sbp->f_bfree = sbp->f_bavail;
 
 	/* Files: node limit versus nodes currently in use. */
 	used = tmp->tm_nodes_inuse;
 	sbp->f_files = tmp->tm_nodes_max;
 	if (sbp->f_files > used)
 		sbp->f_ffree = sbp->f_files - used;
 	else
 		sbp->f_ffree = 0;
 	/* sbp->f_owner = tmp->tn_uid; */
 
 	return (0);
 }
 
 /*
  * VFS sync entry point.  MNT_SUSPEND marks the mount as suspended;
  * MNT_LAZY runs the lazy mtime update; any other waitfor value is a
  * no-op.  Always returns 0.
  */
 static int
 tmpfs_sync(struct mount *mp, int waitfor)
 {
 
 	switch (waitfor) {
 	case MNT_SUSPEND:
 		MNT_ILOCK(mp);
 		mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED;
 		MNT_IUNLOCK(mp);
 		break;
 	case MNT_LAZY:
 		tmpfs_update_mtime(mp, true);
 		break;
 	default:
 		break;
 	}
 	return (0);
 }
 
 static int
 tmpfs_init(struct vfsconf *conf)
 {
 	tmpfs_subr_init();
 	return (0);
 }
 
 static int
 tmpfs_uninit(struct vfsconf *conf)
 {
 	tmpfs_subr_uninit();
 	return (0);
 }
 
 /*
  * tmpfs vfs operations.
  *
  * Note that tmpfs_root() is installed as the vfs_cachedroot method while
  * vfs_root points at vfs_cache_root().
  */
 struct vfsops tmpfs_vfsops = {
 	.vfs_mount =			tmpfs_mount,
 	.vfs_unmount =			tmpfs_unmount,
 	.vfs_root =			vfs_cache_root,
 	.vfs_cachedroot =		tmpfs_root,
 	.vfs_statfs =			tmpfs_statfs,
 	.vfs_fhtovp =			tmpfs_fhtovp,
 	.vfs_sync =			tmpfs_sync,
 	.vfs_init =			tmpfs_init,
 	.vfs_uninit =			tmpfs_uninit,
 };
 /* Register the file system type under the name "tmpfs" with VFCF_JAIL. */
 VFS_SET(tmpfs_vfsops, tmpfs, VFCF_JAIL);
Index: projects/clang1100-import/sys/kern/uipc_mqueue.c
===================================================================
--- projects/clang1100-import/sys/kern/uipc_mqueue.c	(revision 364278)
+++ projects/clang1100-import/sys/kern/uipc_mqueue.c	(revision 364279)
@@ -1,2945 +1,2945 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2005 David Xu <davidxu@freebsd.org>
  * Copyright (c) 2016-2017 Robert N. M. Watson
  * All rights reserved.
  *
  * Portions of this software were developed by BAE Systems, the University of
  * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
  * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
  * Computing (TC) research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 /*
  * POSIX message queue implementation.
  *
  * 1) A mqueue filesystem can be mounted, each message queue appears
  *    in mounted directory, user can change queue's permission and
  *    ownership, or remove a queue. Manually creating a file in the
  *    directory causes a message queue to be created in the kernel with
  *    default message queue attributes applied and same name used, this
  *    method is not advocated since mq_open syscall allows user to specify
  *    different attributes. Also the file system can be mounted multiple
  *    times at different mount points but shows same contents.
  *
  * 2) Standard POSIX message queue API. The syscalls do not use vfs layer,
  *    but directly operate on internal data structure, this allows user to
  *    use the IPC facility without having to mount mqueue file system.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/buf.h>
 #include <sys/capsicum.h>
 #include <sys/dirent.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/mqueue.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/posix4.h>
 #include <sys/poll.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sysproto.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 #include <machine/atomic.h>
 
 #include <security/audit/audit.h>
 
 FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support");
 
 /*
  * Limits and constants
  */
 #define	MQFS_NAMELEN		NAME_MAX
 #define MQFS_DELEN		(8 + MQFS_NAMELEN)
 
 /* node types */
 typedef enum {
 	mqfstype_none = 0,
 	mqfstype_root,
 	mqfstype_dir,
 	mqfstype_this,
 	mqfstype_parent,
 	mqfstype_file,
 	mqfstype_symlink,
 } mqfs_type_t;
 
 struct mqfs_node;
 
 /*
  * mqfs_info: describes a mqfs instance
  */
 struct mqfs_info {
 	/* Held exclusively around mqfs_destroy() in mqnode_release(). */
 	struct sx		mi_lock;
 	/* Root node; compared against in mqfs_fileno_alloc() for "..". */
 	struct mqfs_node	*mi_root;
 	/* Unit-number allocator that hands out file numbers. */
 	struct unrhdr		*mi_unrhdr;
 };
 
 /*
  * Per-vnode data: the VTON() macro reads mv_node through a vnode's v_data
  * pointer.
  */
 struct mqfs_vdata {
 	/* List linkage; the list head type matches mqfs_node.mn_vnodes. */
 	LIST_ENTRY(mqfs_vdata)	mv_link;
 	struct mqfs_node	*mv_node;	/* node returned by VTON() */
 	/* NOTE(review): presumably the owning vnode -- confirm at use sites. */
 	struct vnode		*mv_vnode;
 	/* NOTE(review): task usage is outside this excerpt. */
 	struct task		mv_task;
 };
 
 /*
  * mqfs_node: describes a node (file or directory) within a mqfs
  */
 struct mqfs_node {
 	/* Name; one byte longer than MQFS_NAMELEN, for the terminating NUL. */
 	char			mn_name[MQFS_NAMELEN+1];
 	/* Owning mqfs instance, inherited from the parent in mqfs_add_node(). */
 	struct mqfs_info	*mn_info;
 	struct mqfs_node	*mn_parent;	/* containing directory */
 	LIST_HEAD(,mqfs_node)	mn_children;	/* entries of a directory */
 	LIST_ENTRY(mqfs_node)	mn_sibling;	/* link on parent's mn_children */
 	LIST_HEAD(,mqfs_vdata)	mn_vnodes;
 	/* Set from cred->cr_prison->pr_root in mqfs_create_node(). */
 	const void		*mn_pr_root;
 	/* Adjusted atomically by mqnode_addref() / mqnode_release(). */
 	int			mn_refcount;
 	mqfs_type_t		mn_type;
 	/* NOTE(review): usage is outside this excerpt. */
 	int			mn_deleted;
 	/* File number; zero is treated as "not allocated yet". */
 	uint32_t		mn_fileno;
 	/* VTOMQ() and FPTOMQ() cast this to a struct mqueue pointer. */
 	void			*mn_data;
 	/* All four timestamps start out equal in mqfs_create_node(). */
 	struct timespec		mn_birth;
 	struct timespec		mn_ctime;
 	struct timespec		mn_atime;
 	struct timespec		mn_mtime;
 	/* Owner and group come from the creating credential. */
 	uid_t			mn_uid;
 	gid_t			mn_gid;
 	int			mn_mode;
 };
 
 #define	VTON(vp)	(((struct mqfs_vdata *)((vp)->v_data))->mv_node)
 #define VTOMQ(vp) 	((struct mqueue *)(VTON(vp)->mn_data))
 #define	VFSTOMQFS(m)	((struct mqfs_info *)((m)->mnt_data))
 #define	FPTOMQ(fp)	((struct mqueue *)(((struct mqfs_node *) \
 				(fp)->f_data)->mn_data))
 
 TAILQ_HEAD(msgq, mqueue_msg);
 
 struct mqueue;
 
 /*
  * Notification registration record; struct mqueue holds a pointer to one
  * in mq_notifier.
  * NOTE(review): the fields are not used within this excerpt; the
  * descriptions below are inferred from their types -- confirm at use sites.
  */
 struct mqueue_notifier {
 	LIST_ENTRY(mqueue_notifier)	nt_link;	/* list linkage */
 	struct sigevent			nt_sigev;	/* presumably the requested event */
 	ksiginfo_t			nt_ksi;
 	struct proc			*nt_proc;	/* presumably the registering process */
 };
 
 /*
  * In-kernel message queue object; reached from a mqfs node through
  * mn_data (see the VTOMQ() and FPTOMQ() macros).
  * NOTE(review): field usage is outside this excerpt; the descriptions
  * below are inferred from names and types -- confirm at use sites.
  */
 struct mqueue {
 	struct mtx	mq_mutex;
 	int		mq_flags;	/* presumably MQ_RSEL / MQ_WSEL bits */
 	long		mq_maxmsg;
 	long		mq_msgsize;
 	long		mq_curmsgs;
 	long		mq_totalbytes;
 	struct msgq	mq_msgq;	/* tail queue of struct mqueue_msg */
 	int		mq_receivers;
 	int		mq_senders;
 	struct selinfo	mq_rsel;
 	struct selinfo	mq_wsel;
 	struct mqueue_notifier	*mq_notifier;
 };
 
 #define	MQ_RSEL		0x01
 #define	MQ_WSEL		0x02
 
 /*
  * Header of one queued message, linked on a struct msgq tail queue.  The
  * message payload is stored directly after this header.
  */
 struct mqueue_msg {
 	TAILQ_ENTRY(mqueue_msg)	msg_link;
 	unsigned int	msg_prio;
 	unsigned int	msg_size;
 	/* following real data... */
 };
 
 static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	"POSIX real time message queue");
 
 static int	default_maxmsg  = 10;
 static int	default_msgsize = 1024;
 
 static int	maxmsg = 100;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW,
     &maxmsg, 0, "Default maximum messages in queue");
 static int	maxmsgsize = 16384;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW,
     &maxmsgsize, 0, "Default maximum message size");
 static int	maxmq = 100;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW,
     &maxmq, 0, "maximum message queues");
 static int	curmq = 0;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW,
     &curmq, 0, "current message queue number");
 static int	unloadable = 0;
 static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data");
 
 static eventhandler_tag exit_tag;
 
 /* Only one instance per-system */
 static struct mqfs_info		mqfs_data;
 static uma_zone_t		mqnode_zone;
 static uma_zone_t		mqueue_zone;
 static uma_zone_t		mvdata_zone;
 static uma_zone_t		mqnoti_zone;
 static struct vop_vector	mqfs_vnodeops;
 static struct fileops		mqueueops;
 static unsigned			mqfs_osd_jail_slot;
 
 /*
  * Directory structure construction and manipulation
  */
 #ifdef notyet
 static struct mqfs_node	*mqfs_create_dir(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 static struct mqfs_node	*mqfs_create_link(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 #endif
 
 static struct mqfs_node	*mqfs_create_file(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 static int	mqfs_destroy(struct mqfs_node *mn);
 static void	mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn);
 static void	mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn);
 static int	mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn);
 static int	mqfs_prison_remove(void *obj, void *data);
 
 /*
  * Message queue construction and manipulation
  */
 static struct mqueue	*mqueue_alloc(const struct mq_attr *attr);
 static void	mqueue_free(struct mqueue *mq);
 static int	mqueue_send(struct mqueue *mq, const char *msg_ptr,
 			size_t msg_len, unsigned msg_prio, int waitok,
 			const struct timespec *abs_timeout);
 static int	mqueue_receive(struct mqueue *mq, char *msg_ptr,
 			size_t msg_len, unsigned *msg_prio, int waitok,
 			const struct timespec *abs_timeout);
 static int	_mqueue_send(struct mqueue *mq, struct mqueue_msg *msg,
 			int timo);
 static int	_mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg,
 			int timo);
 static void	mqueue_send_notification(struct mqueue *mq);
 static void	mqueue_fdclose(struct thread *td, int fd, struct file *fp);
 static void	mq_proc_exit(void *arg, struct proc *p);
 
 /*
  * kqueue filters
  */
 static void	filt_mqdetach(struct knote *kn);
 static int	filt_mqread(struct knote *kn, long hint);
 static int	filt_mqwrite(struct knote *kn, long hint);
 
 struct filterops mq_rfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_mqdetach,
 	.f_event = filt_mqread,
 };
 struct filterops mq_wfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_mqdetach,
 	.f_event = filt_mqwrite,
 };
 
 /*
  * Initialize fileno bitmap
  *
  * Creates the unit-number allocator used for file numbers, covering the
  * range 1..INT_MAX, and stores it in the mqfs instance.
  */
 static void
 mqfs_fileno_init(struct mqfs_info *mi)
 {
 
 	mi->mi_unrhdr = new_unrhdr(1, INT_MAX, NULL);
 }
 
 /*
  * Tear down fileno bitmap
  *
  * The instance's pointer is cleared before the allocator is deleted.
  */
 static void
 mqfs_fileno_uninit(struct mqfs_info *mi)
 {
 	struct unrhdr *hdr = mi->mi_unrhdr;
 
 	mi->mi_unrhdr = NULL;
 	delete_unrhdr(hdr);
 }
 
 /*
  * Allocate a file number
  *
  * Ancestors without a file number get one first.  Root, directory, file
  * and symlink nodes receive a fresh number; a "." node reuses its
  * directory's number, and a ".." node reuses the number of the
  * directory's parent (or of the directory itself when it is the root).
  */
 static void
 mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn)
 {
 	struct mqfs_node *dir;
 
 	/* make sure our parent has a file number */
 	dir = mn->mn_parent;
 	if (dir != NULL && dir->mn_fileno == 0)
 		mqfs_fileno_alloc(mi, dir);
 
 	switch (mn->mn_type) {
 	case mqfstype_root:
 	case mqfstype_dir:
 	case mqfstype_file:
 	case mqfstype_symlink:
 		mn->mn_fileno = alloc_unr(mi->mi_unrhdr);
 		break;
 	case mqfstype_this:
 		KASSERT(dir != NULL,
 		    ("mqfstype_this node has no parent"));
 		mn->mn_fileno = dir->mn_fileno;
 		break;
 	case mqfstype_parent:
 		KASSERT(dir != NULL,
 		    ("mqfstype_parent node has no parent"));
 		if (dir != mi->mi_root) {
 			KASSERT(dir->mn_parent != NULL,
 			    ("mqfstype_parent node has no grandparent"));
 			mn->mn_fileno = dir->mn_parent->mn_fileno;
 		} else {
 			/* ".." of the root refers to the root itself. */
 			mn->mn_fileno = dir->mn_fileno;
 		}
 		break;
 	default:
 		KASSERT(0,
 		    ("mqfs_fileno_alloc() called for unknown type node: %d",
 			mn->mn_type));
 		break;
 	}
 }
 
 /*
  * Release a file number
  */
 static void
 mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn)
 {
 	switch (mn->mn_type) {
 	case mqfstype_root:
 	case mqfstype_dir:
 	case mqfstype_file:
 	case mqfstype_symlink:
 		free_unr(mi->mi_unrhdr, mn->mn_fileno);
 		break;
 	case mqfstype_this:
 	case mqfstype_parent:
 		/* ignore these, as they don't "own" their file number */
 		break;
 	default:
 		KASSERT(0,
 		    ("mqfs_fileno_free() called for unknown type node: %d", 
 			mn->mn_type));
 		break;
 	}
 }
 
 /* Allocate a zero-filled node from mqnode_zone (M_WAITOK | M_ZERO). */
 static __inline struct mqfs_node *
 mqnode_alloc(void)
 {
 
 	return (uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO));
 }
 
 /* Return a node to mqnode_zone. */
 static __inline void
 mqnode_free(struct mqfs_node *node)
 {
 
 	uma_zfree(mqnode_zone, node);
 }
 
 /* Atomically take one more reference on the node. */
 static __inline void
 mqnode_addref(struct mqfs_node *node)
 {
 
 	atomic_add_int(&node->mn_refcount, 1);
 }
 
 /*
  * Drop one reference on the node.  When the count before the decrement
  * equals the baseline for the node type (3 for directories, which include
  * . and .., otherwise 1), the node is destroyed under the instance's
  * exclusive lock, which is taken here only if not already held.
  */
 static __inline void
 mqnode_release(struct mqfs_node *node)
 {
 	struct mqfs_info *mqfs;
 	int baseline, held, prev;
 
 	mqfs = node->mn_info;
 	prev = atomic_fetchadd_int(&node->mn_refcount, -1);
 	if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root)
 		baseline = 3; /* include . and .. */
 	else
 		baseline = 1;
 	if (prev != baseline)
 		return;
 
 	held = sx_xlocked(&mqfs->mi_lock);
 	if (!held)
 		sx_xlock(&mqfs->mi_lock);
 	mqfs_destroy(node);
 	if (!held)
 		sx_xunlock(&mqfs->mi_lock);
 }
 
 /*
  * Add a node to a directory
  *
  * The node inherits the parent's mqfs instance, gets empty child and
  * vnode lists, is put at the head of the parent's child list, and the
  * parent gains a reference.  Always returns 0.
  */
 static int
 mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node)
 {
 	KASSERT(parent != NULL, ("%s(): parent is NULL", __func__));
 	KASSERT(parent->mn_info != NULL,
 	    ("%s(): parent has no mn_info", __func__));
 	KASSERT(parent->mn_type == mqfstype_dir ||
 	    parent->mn_type == mqfstype_root,
 	    ("%s(): parent is not a directory", __func__));
 
 	/* Prepare the node itself... */
 	LIST_INIT(&node->mn_children);
 	LIST_INIT(&node->mn_vnodes);
 	node->mn_parent = parent;
 	node->mn_info = parent->mn_info;
 
 	/* ...then hook it up to the directory. */
 	LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling);
 	mqnode_addref(parent);
 	return (0);
 }
 
 static struct mqfs_node *
 mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode,
 	int nodetype)
 {
 	struct mqfs_node *node;
 
 	node = mqnode_alloc();
 	strncpy(node->mn_name, name, namelen);
 	node->mn_pr_root = cred->cr_prison->pr_root;
 	node->mn_type = nodetype;
 	node->mn_refcount = 1;
 	vfs_timestamp(&node->mn_birth);
 	node->mn_ctime = node->mn_atime = node->mn_mtime
 		= node->mn_birth;
 	node->mn_uid = cred->cr_uid;
 	node->mn_gid = cred->cr_gid;
 	node->mn_mode = mode;
 	return (node);
 }
 
 /*
  * Create a file
  *
  * Builds a mqfstype_file node and links it into parent.  Returns the new
  * node, or NULL (after freeing the node) if it could not be linked.
  */
 static struct mqfs_node *
 mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *file;
 
 	file = mqfs_create_node(name, namelen, cred, mode, mqfstype_file);
 	if (mqfs_add_node(parent, file) == 0)
 		return (file);
 
 	mqnode_free(file);
 	return (NULL);
 }
 
 /*
  * Populate a directory with its "." and ".." entries.
  * Returns 0 on success and -1 if either entry cannot be added.
  */
 static int
 mqfs_fixup_dir(struct mqfs_node *parent)
 {
 	struct mqfs_node *self, *up;
 
 	/* "." */
 	self = mqnode_alloc();
 	self->mn_refcount = 1;
 	self->mn_type = mqfstype_this;
 	self->mn_name[0] = '.';
 	if (mqfs_add_node(parent, self) != 0) {
 		mqnode_free(self);
 		return (-1);
 	}
 
 	/* ".." */
 	up = mqnode_alloc();
 	up->mn_refcount = 1;
 	up->mn_type = mqfstype_parent;
 	up->mn_name[0] = '.';
 	up->mn_name[1] = '.';
 	if (mqfs_add_node(parent, up) != 0) {
 		mqnode_free(up);
 		return (-1);
 	}
 
 	return (0);
 }
 
 #ifdef notyet
 
 /*
  * Create a directory node under parent, including its "." and ".."
  * entries.  Returns the new node, or NULL on failure.
  */
 static struct mqfs_node *
 mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *dir;
 
 	dir = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir);
 	if (mqfs_add_node(parent, dir) != 0) {
 		mqnode_free(dir);
 		return (NULL);
 	}
 	if (mqfs_fixup_dir(dir) == 0)
 		return (dir);
 
 	/* Could not add "." and "..": tear the directory down again. */
 	mqfs_destroy(dir);
 	return (NULL);
 }
 
 /*
  * Create a symlink node and link it under parent.
  * Returns the new node, or NULL if it could not be added.
  */
 static struct mqfs_node *
 mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *link;
 
 	link = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink);
 	if (mqfs_add_node(parent, link) == 0)
 		return (link);
 
 	mqnode_free(link);
 	return (NULL);
 }
 
 #endif
 
 /*
  * Tear down a node; for directories every child is destroyed first.
  *
  * The node is unlinked from its parent's child list, its file number
  * and message queue (if any) are released, and the node itself is
  * freed.  Always returns 0.
  */
 static int
 mqfs_destroy(struct mqfs_node *node)
 {
 	struct mqfs_node *up;
 
 	KASSERT(node != NULL,
 	    ("%s(): node is NULL", __func__));
 	KASSERT(node->mn_info != NULL,
 	    ("%s(): node has no mn_info", __func__));
 
 	/* Children are destroyed head-first until the list is empty. */
 	if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) {
 		while (!LIST_EMPTY(&node->mn_children))
 			mqfs_destroy(LIST_FIRST(&node->mn_children));
 	}
 
 	/* Detach from the parent directory, if there still is one. */
 	up = node->mn_parent;
 	if (up != NULL) {
 		KASSERT(up->mn_info == node->mn_info,
 		    ("%s(): parent has different mn_info", __func__));
 		LIST_REMOVE(node, mn_sibling);
 	}
 
 	if (node->mn_fileno != 0)
 		mqfs_fileno_free(node->mn_info, node);
 	if (node->mn_data != NULL)
 		mqueue_free(node->mn_data);
 	mqnode_free(node);
 	return (0);
 }
 
 /*
  * Mount a mqfs instance.
  *
  * Updates of an existing mount are rejected with EOPNOTSUPP.  The
  * mount's private data points at the global mqfs_data.
  */
 static int
 mqfs_mount(struct mount *mp)
 {
 	struct statfs *st;
 
 	if ((mp->mnt_flag & MNT_UPDATE) != 0)
 		return (EOPNOTSUPP);
 
 	mp->mnt_data = &mqfs_data;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	vfs_getnewfsid(mp);
 	vfs_mountedfrom(mp, "mqueue");
 
 	/* Fixed statistics: one block, one file, nothing free. */
 	st = &mp->mnt_stat;
 	st->f_iosize = PAGE_SIZE;
 	st->f_bsize = PAGE_SIZE;
 	st->f_blocks = 1;
 	st->f_bavail = 0;
 	st->f_bfree = 0;
 	st->f_files = 1;
 	st->f_ffree = 0;
 	return (0);
 }
 
 /*
  * Unmount a mqfs instance by flushing its vnodes; MNT_FORCE in
  * mntflags is translated to FORCECLOSE.
  */
 static int
 mqfs_unmount(struct mount *mp, int mntflags)
 {
 	int flushflags;
 
 	flushflags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
 	return (vflush(mp, 0, flushflags, curthread));
 }
 
 /*
  * Return a vnode for the root node of the filesystem in *vpp.
  * The result of mqfs_allocv() is passed straight through.
  */
 static int
 mqfs_root(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct mqfs_info *mi;
 
 	mi = VFSTOMQFS(mp);
 	return (mqfs_allocv(mp, vpp, mi->mi_root));
 }
 
 /*
  * Return filesystem stats
  *
  * Currently a stub: sbp is left untouched and 0 is returned.
  */
 static int
 mqfs_statfs(struct mount *mp, struct statfs *sbp)
 {
 	/* XXX update statistics */
 	return (0);
 }
 
 /*
  * Initialize a mqfs instance
  *
  * Creates the UMA zones, sets up the global mqfs_data (lock, root
  * directory node, file numbers) and registers the process-exit
  * handler, the fdclose hook, the P1003.1b configuration entry and the
  * jail OSD slot.  Always returns 0.
  */
 static int
 mqfs_init(struct vfsconf *vfc)
 {
 	struct mqfs_node *root;
 	struct mqfs_info *mi;
 	/* Only the prison-removal method is hooked. */
 	osd_method_t methods[PR_MAXMETHOD] = {
 	    [PR_METHOD_REMOVE] = mqfs_prison_remove,
 	};
 
 	/* One zone per object type: nodes, queues, vnode data, notifiers. */
 	mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mvdata_zone = uma_zcreate("mvdata",
 		sizeof(struct mqfs_vdata), NULL, NULL, NULL,
 		NULL, UMA_ALIGN_PTR, 0);
 	mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mi = &mqfs_data;
 	sx_init(&mi->mi_lock, "mqfs lock");
 	/* set up the root directory */
 	root = mqfs_create_node("/", 1, curthread->td_ucred, 01777,
 		mqfstype_root);
 	/* The root has no parent, so its info and lists are set by hand. */
 	root->mn_info = mi;
 	LIST_INIT(&root->mn_children);
 	LIST_INIT(&root->mn_vnodes);
 	mi->mi_root = root;
 	mqfs_fileno_init(mi);
 	mqfs_fileno_alloc(mi, root);
 	/* Add "." and ".."; the return value is not checked here. */
 	mqfs_fixup_dir(root);
 	exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	mq_fdclose = mqueue_fdclose;
 	p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING);
 	mqfs_osd_jail_slot = osd_jail_register(NULL, methods);
 	return (0);
 }
 
 /*
  * Destroy a mqfs instance
  *
  * Refuses with EOPNOTSUPP unless the "unloadable" flag is set.
  * Otherwise undoes mqfs_init(): deregisters the jail OSD slot and the
  * process-exit handler, destroys the node tree, the file number
  * state and the lock, and finally the UMA zones.
  */
 static int
 mqfs_uninit(struct vfsconf *vfc)
 {
 	struct mqfs_info *mi;
 
 	if (!unloadable)
 		return (EOPNOTSUPP);
 	osd_jail_deregister(mqfs_osd_jail_slot);
 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
 	mi = &mqfs_data;
 	/* Destroying the root recursively destroys every remaining node. */
 	mqfs_destroy(mi->mi_root);
 	mi->mi_root = NULL;
 	mqfs_fileno_uninit(mi);
 	sx_destroy(&mi->mi_lock);
 	/* Zones go last, after every object allocated from them is freed. */
 	uma_zdestroy(mqnode_zone);
 	uma_zdestroy(mqueue_zone);
 	uma_zdestroy(mvdata_zone);
 	uma_zdestroy(mqnoti_zone);
 	return (0);
 }
 
 /*
  * Task routine: lock the vnode passed as context, recycle it, unlock
  * it and drop one hold.  do_unlink() takes a matching vhold() before
  * queueing the task.
  */
 static void
 do_recycle(void *context, int pending __unused)
 {
 	struct vnode *vnp;
 
 	vnp = (struct vnode *)context;
 	vn_lock(vnp, LK_EXCLUSIVE | LK_RETRY);
 	vrecycle(vnp);
 	VOP_UNLOCK(vnp);
 	vdrop(vnp);
 }
 
 /*
  * Allocate a vnode
  *
  * On success *vpp holds an exclusively locked vnode for node pn on
  * mount mp.  A vnode already linked on pn->mn_vnodes for this mount is
  * reused; otherwise a new one is created, tied to pn through a
  * mqfs_vdata record, and pn gains a reference.
  */
 static int
 mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn)
 {
 	struct mqfs_vdata *vd;
 	struct mqfs_info  *mqfs;
 	struct vnode *newvpp;
 	int error;
 
 	mqfs = pn->mn_info;
 	*vpp = NULL;
 	sx_xlock(&mqfs->mi_lock);
 	/* Look for an existing vnode of this node on the same mount. */
 	LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
 		if (vd->mv_vnode->v_mount == mp) {
 			/* Hold the vnode before mi_lock is dropped. */
 			vhold(vd->mv_vnode);
 			break;
 		}
 	}
 
 	if (vd != NULL) {
 found:
 		*vpp = vd->mv_vnode;
 		sx_xunlock(&mqfs->mi_lock);
-		error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread);
+		error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE);
 		vdrop(*vpp);
 		return (error);
 	}
 	sx_xunlock(&mqfs->mi_lock);
 
 	/* No vnode yet: create one without holding mi_lock. */
 	error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp);
 	if (error)
 		return (error);
 	vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY);
 	error = insmntque(newvpp, mp);
 	/* NOTE(review): presumably insmntque() disposes of newvpp on failure -- confirm. */
 	if (error != 0)
 		return (error);
 
 	sx_xlock(&mqfs->mi_lock);
 	/*
 	 * Check if it has already been allocated
 	 * while we were blocked.
 	 */
 	LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
 		if (vd->mv_vnode->v_mount == mp) {
 			vhold(vd->mv_vnode);
 			/*
 			 * NOTE(review): mi_lock is released here, yet the
 			 * "found" label reads vd and calls sx_xunlock()
 			 * once more -- looks like a double unlock; confirm.
 			 */
 			sx_xunlock(&mqfs->mi_lock);
 
 			/* Discard the vnode created above. */
 			vgone(newvpp);
 			vput(newvpp);
 			goto found;
 		}
 	}
 
 	*vpp = newvpp;
 
 	/* Tie the new vnode to the node and record it on the node's list. */
 	vd = uma_zalloc(mvdata_zone, M_WAITOK);
 	(*vpp)->v_data = vd;
 	vd->mv_vnode = *vpp;
 	vd->mv_node = pn;
 	TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp);
 	LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link);
 	mqnode_addref(pn);
 	/* Map the node type onto a vnode type. */
 	switch (pn->mn_type) {
 	case mqfstype_root:
 		(*vpp)->v_vflag = VV_ROOT;
 		/* fall through */
 	case mqfstype_dir:
 	case mqfstype_this:
 	case mqfstype_parent:
 		(*vpp)->v_type = VDIR;
 		break;
 	case mqfstype_file:
 		(*vpp)->v_type = VREG;
 		break;
 	case mqfstype_symlink:
 		(*vpp)->v_type = VLNK;
 		break;
 	case mqfstype_none:
 		KASSERT(0, ("mqfs_allocf called for null node\n"));
 		/* no break: continues into the panic below */
 	default:
 		panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type);
 	}
 	sx_xunlock(&mqfs->mi_lock);
 	return (0);
 }
 
 /*
  * Find the child of directory pd whose name is exactly the first len
  * bytes of name and which is visible from cred's prison root.
  * The caller must hold mi_lock.  Returns NULL if there is no match.
  */
 static struct mqfs_node *
 mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred)
 {
 	struct mqfs_node *child;
 	const void *root;
 
 	sx_assert(&pd->mn_info->mi_lock, SX_LOCKED);
 	root = cred->cr_prison->pr_root;
 	LIST_FOREACH(child, &pd->mn_children, mn_sibling) {
 		/* Skip entries that belong to a different prison root. */
 		if (child->mn_pr_root != NULL && child->mn_pr_root != root)
 			continue;
 		if (strncmp(child->mn_name, name, len) != 0)
 			continue;
 		/* The stored name must end where the requested one does. */
 		if (child->mn_name[len] == '\0')
 			return (child);
 	}
 	return (NULL);
 }
 
 /*
  * Look up a file or directory.
  *
  * Resolves the component in ap->a_cnp relative to directory ap->a_dvp
  * and stores the resulting vnode in *ap->a_vpp.  Handles ".", "..",
  * and named children in that order.  Returns 0 on success,
  * EJUSTRETURN when a missing last component may be created, or an
  * errno value.
  */
 static int
 mqfs_lookupx(struct vop_cachedlookup_args *ap)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct mqfs_info *mqfs;
 	int nameiop, flags, error, namelen;
 	char *pname;
 	struct thread *td;
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dvp = ap->a_dvp;
 	pname = cnp->cn_nameptr;
 	namelen = cnp->cn_namelen;
 	td = cnp->cn_thread;
 	flags = cnp->cn_flags;
 	nameiop = cnp->cn_nameiop;
 	pd = VTON(dvp);
 	pn = NULL;
 	mqfs = pd->mn_info;
 	*vpp = NULLVP;
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	/* Searching a directory requires execute permission on it. */
 	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread);
 	if (error)
 		return (error);
 
 	/* shortcut: check if the name is too long */
 	if (cnp->cn_namelen >= MQFS_NAMELEN)
 		return (ENOENT);
 
 	/* self */
 	if (namelen == 1 && pname[0] == '.') {
 		/* "." may only be looked up, not created/deleted/renamed. */
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		pn = pd;
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	/* parent */
 	if (cnp->cn_flags & ISDOTDOT) {
 		if (dvp->v_vflag & VV_ROOT)
 			return (EIO);
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		/* dvp is unlocked while the parent vnode is obtained. */
 		VOP_UNLOCK(dvp);
 		KASSERT(pd->mn_parent, ("non-root directory has no parent"));
 		pn = pd->mn_parent;
 		error = mqfs_allocv(dvp->v_mount, vpp, pn);
 		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		return (error);
 	}
 
 	/* named node */
 	sx_xlock(&mqfs->mi_lock);
 	pn = mqfs_search(pd, pname, namelen, cnp->cn_cred);
 	/* Take a reference before mi_lock is dropped. */
 	if (pn != NULL)
 		mqnode_addref(pn);
 	sx_xunlock(&mqfs->mi_lock);
 	
 	/* found */
 	if (pn != NULL) {
 		/* DELETE */
 		if (nameiop == DELETE && (flags & ISLASTCN)) {
 			/* Deleting an entry needs write permission on dvp. */
 			error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 			if (error) {
 				mqnode_release(pn);
 				return (error);
 			}
 			/*
 			 * NOTE(review): *vpp was set to NULLVP above and is
 			 * not assigned on this path, so this test looks like
 			 * it can never be true -- confirm.
 			 */
 			if (*vpp == dvp) {
 				VREF(dvp);
 				*vpp = dvp;
 				mqnode_release(pn);
 				return (0);
 			}
 		}
 
 		/* allocate vnode */
 		error = mqfs_allocv(dvp->v_mount, vpp, pn);
 		mqnode_release(pn);
 		if (error == 0 && cnp->cn_flags & MAKEENTRY)
 			cache_enter(dvp, *vpp, cnp);
 		return (error);
 	}
 	
 	/* not found */
 
 	/* will create a new entry in the directory ? */
 	if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT)
 	    && (flags & ISLASTCN)) {
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			return (error);
 		/* Keep the name buffer for the create that follows. */
 		cnp->cn_flags |= SAVENAME;
 		return (EJUSTRETURN);
 	}
 	return (ENOENT);
 }
 
 #if 0
 struct vop_lookup_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * vnode lookup operation; a thin wrapper that forwards to
  * mqfs_lookupx() and returns its result unchanged.
  */
 static int
 mqfs_lookup(struct vop_cachedlookup_args *ap)
 {
 	return (mqfs_lookupx(ap));
 }
 
 #if 0
 struct vop_create_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
 };
 #endif
 
 /*
  * vnode creation operation
  *
  * Creates a file node named by ap->a_cnp in directory ap->a_dvp,
  * attaches a freshly allocated message queue with default attributes
  * to it, and returns its vnode in *ap->a_vpp.  Returns ENOTDIR if the
  * parent is not a directory, EAGAIN if no queue can be allocated,
  * ENOSPC if the node cannot be created, or the error from
  * mqfs_allocv().
  */
 static int
 mqfs_create(struct vop_create_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct componentname *cnp = ap->a_cnp;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct mqueue *mq;
 	int error;
 
 	pd = VTON(ap->a_dvp);
 	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 	/* NULL attributes select the default queue limits. */
 	mq = mqueue_alloc(NULL);
 	if (mq == NULL)
 		return (EAGAIN);
 	sx_xlock(&mqfs->mi_lock);
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("%s: no name", __func__);
 	pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen,
 		cnp->cn_cred, ap->a_vap->va_mode);
 	if (pn == NULL) {
 		sx_xunlock(&mqfs->mi_lock);
 		error = ENOSPC;
 	} else {
 		/* Extra reference for the duration of the vnode allocation. */
 		mqnode_addref(pn);
 		sx_xunlock(&mqfs->mi_lock);
 		error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
 		mqnode_release(pn);
 		/*
 		 * NOTE(review): mqfs_destroy() runs here without mi_lock
 		 * held, and the parent reference taken by mqfs_add_node()
 		 * does not appear to be released on this path -- confirm.
 		 */
 		if (error)
 			mqfs_destroy(pn);
 		else
 			pn->mn_data = mq;
 	}
 	/* The queue was never attached to a node, so free it here. */
 	if (error)
 		mqueue_free(mq);
 	return (error);
 }
 
 /*
  * Remove an entry.
  *
  * The caller must hold mi_lock.  Returns EACCES when ucred is neither
  * the owner nor passes the PRIV_MQ_ADMIN check, ENOENT when the node
  * is already deleted, and 0 once the node has been unlinked.
  */
 static int
 do_unlink(struct mqfs_node *pn, struct ucred *ucred)
 {
 	struct mqfs_node *dir;
 	struct mqfs_vdata *vd;
 
 	sx_assert(&pn->mn_info->mi_lock, SX_LOCKED);
 
 	if (ucred->cr_uid != pn->mn_uid &&
 	    priv_check_cred(ucred, PRIV_MQ_ADMIN) != 0)
 		return (EACCES);
 	if (pn->mn_deleted)
 		return (ENOENT);
 
 	/* Detach the node from its directory and mark it deleted. */
 	dir = pn->mn_parent;
 	pn->mn_parent = NULL;
 	pn->mn_deleted = 1;
 	LIST_REMOVE(pn, mn_sibling);
 
 	/* Purge each vnode from the name cache and queue its recycling. */
 	LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
 		cache_purge(vd->mv_vnode);
 		vhold(vd->mv_vnode);
 		taskqueue_enqueue(taskqueue_thread, &vd->mv_task);
 	}
 
 	mqnode_release(pn);
 	mqnode_release(dir);
 	return (0);
 }
 
 #if 0
 struct vop_remove_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * vnode removal operation: unlink the node behind ap->a_vp under
  * mi_lock.  Directories are refused with EPERM.
  */
 static int
 mqfs_remove(struct vop_remove_args *ap)
 {
 	struct mqfs_info *mi;
 	struct mqfs_node *node;
 	int rv;
 
 	mi = VFSTOMQFS(ap->a_dvp->v_mount);
 	if (ap->a_vp->v_type == VDIR)
 		return (EPERM);
 
 	node = VTON(ap->a_vp);
 	sx_xlock(&mi->mi_lock);
 	rv = do_unlink(node, ap->a_cnp->cn_cred);
 	sx_xunlock(&mi->mi_lock);
 	return (rv);
 }
 
 #if 0
 struct vop_inactive_args {
 	struct vnode *a_vp;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * vnode inactive operation: recycle the vnode right away when its
  * node has already been marked deleted.  Always returns 0.
  */
 static int
 mqfs_inactive(struct vop_inactive_args *ap)
 {
 	struct vnode *vp;
 	struct mqfs_node *node;
 
 	vp = ap->a_vp;
 	node = VTON(vp);
 	if (node->mn_deleted)
 		vrecycle(vp);
 	return (0);
 }
 
 #if 0
 struct vop_reclaim_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * vnode reclaim operation: detach the vnode from its node.
  *
  * Under mi_lock the per-vnode data is unlinked from the node's vnode
  * list and freed, and the node reference recorded in it is released.
  * Always returns 0.
  */
 static int
 mqfs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp;
 	struct mqfs_info *mi;
 	struct mqfs_vdata *vdata;
 	struct mqfs_node *node;
 
 	vp = ap->a_vp;
 	mi = VFSTOMQFS(vp->v_mount);
 	vdata = vp->v_data;
 	node = vdata->mv_node;
 
 	sx_xlock(&mi->mi_lock);
 	vp->v_data = NULL;
 	LIST_REMOVE(vdata, mv_link);
 	uma_zfree(mvdata_zone, vdata);
 	mqnode_release(node);
 	sx_xunlock(&mi->mi_lock);
 	return (0);
 }
 
 #if 0
 struct vop_open_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_mode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 	struct file *a_fp;
 };
 #endif
 
 /*
  * vnode open operation: nothing to do, always succeeds.
  */
 static int
 mqfs_open(struct vop_open_args *ap)
 {
 	return (0);
 }
 
 #if 0
 struct vop_close_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_fflag;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * vnode close operation: nothing to do, always succeeds.
  */
 static int
 mqfs_close(struct vop_close_args *ap)
 {
 	return (0);
 }
 
 #if 0
 struct vop_access_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	accmode_t a_accmode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Verify permissions
  */
 static int
 mqfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr vattr;
 	int error;
 
 	error = VOP_GETATTR(vp, &vattr, ap->a_cred);
 	if (error)
 		return (error);
 	error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid,
 	    ap->a_accmode, ap->a_cred);
 	return (error);
 }
 
 #if 0
 struct vop_getattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 };
 #endif
 
 /*
  * Get file attributes: fill *ap->a_vap from the node behind the
  * vnode.  Sizes are always reported as zero.  Always returns 0.
  */
 static int
 mqfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp;
 	struct vattr *vap;
 	struct mqfs_node *node;
 
 	vp = ap->a_vp;
 	vap = ap->a_vap;
 	node = VTON(vp);
 
 	/* Identity. */
 	vap->va_type = vp->v_type;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	vap->va_fileid = node->mn_fileno;
 	vap->va_nlink = 1;
 	vap->va_rdev = NODEV;
 
 	/* Ownership and permissions. */
 	vap->va_mode = node->mn_mode;
 	vap->va_uid = node->mn_uid;
 	vap->va_gid = node->mn_gid;
 
 	/* Sizes. */
 	vap->va_size = 0;
 	vap->va_bytes = 0;
 	vap->va_blocksize = PAGE_SIZE;
 
 	/* Timestamps. */
 	vap->va_atime = node->mn_atime;
 	vap->va_mtime = node->mn_mtime;
 	vap->va_ctime = node->mn_ctime;
 	vap->va_birthtime = node->mn_birth;
 
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_filerev = 0;
 	return (0);
 }
 
 #if 0
 struct vop_setattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 };
 #endif
 /*
  * Set attributes
  *
  * Only ownership, mode and access/modification times can be changed;
  * a request that sets any other attribute is rejected with EINVAL.
  * Whenever something was changed the node's ctime is refreshed.
  */
 static int
 mqfs_setattr(struct vop_setattr_args *ap)
 {
 	struct mqfs_node *pn;
 	struct vattr *vap;
 	struct vnode *vp;
 	struct thread *td;
 	int c, error;
 	uid_t uid;
 	gid_t gid;
 
 	td = curthread;
 	vap = ap->a_vap;
 	vp = ap->a_vp;
 	/* Reject attempts to set attributes that are not settable here. */
 	if ((vap->va_type != VNON) ||
 	    (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) ||
 	    (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) ||
 	    (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
 	    (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) ||
 	    (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 
 	pn = VTON(vp);
 
 	/* c records whether anything was modified. */
 	error = c = 0;
 	/* VNOVAL means "leave unchanged": fall back to the current ids. */
 	if (vap->va_uid == (uid_t)VNOVAL)
 		uid = pn->mn_uid;
 	else
 		uid = vap->va_uid;
 	if (vap->va_gid == (gid_t)VNOVAL)
 		gid = pn->mn_gid;
 	else
 		gid = vap->va_gid;
 
 	if (uid != pn->mn_uid || gid != pn->mn_gid) {
 		/*
 		 * To modify the ownership of a file, must possess VADMIN
 		 * for that file.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)))
 			return (error);
 
 		/*
 		 * XXXRW: Why is there a privilege check here: shouldn't the
 		 * check in VOP_ACCESS() be enough?  Also, are the group bits
 		 * below definitely right?
 		 */
 		if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid ||
 		    (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) &&
 		    (error = priv_check(td, PRIV_MQ_ADMIN)) != 0)
 			return (error);
 		pn->mn_uid = uid;
 		pn->mn_gid = gid;
 		c = 1;
 	}
 
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		/* Only the owner or a PRIV_MQ_ADMIN holder may change the mode. */
 		if ((ap->a_cred->cr_uid != pn->mn_uid) &&
 		    (error = priv_check(td, PRIV_MQ_ADMIN)))
 			return (error);
 		pn->mn_mode = vap->va_mode;
 		c = 1;
 	}
 
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		/* See the comment in ufs_vnops::ufs_setattr(). */
 		/*
 		 * VADMIN suffices; without it the change is still allowed
 		 * when VA_UTIMES_NULL is set and the caller has VWRITE.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) &&
 		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
 		    (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td))))
 			return (error);
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			pn->mn_atime = vap->va_atime;
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			pn->mn_mtime = vap->va_mtime;
 		}
 		c = 1;
 	}
 	if (c) {
 		vfs_timestamp(&pn->mn_ctime);
 	}
 	return (0);
 }
 
 #if 0
 struct vop_read_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	int a_ioflag;
 	struct ucred *a_cred;
 };
 #endif
 
 /*
  * Read from a file: produce a one-line textual summary of the queue
  * (total bytes, message limit, current messages, message size limit)
  * and copy it to the caller.  Only regular files can be read.
  */
 static int
 mqfs_read(struct vop_read_args *ap)
 {
 	char line[80];
 	struct mqueue *mq;
 	int len;
 
 	if (ap->a_vp->v_type != VREG)
 		return (EINVAL);
 
 	mq = VTOMQ(ap->a_vp);
 	snprintf(line, sizeof(line),
 		"QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n",
 		mq->mq_totalbytes,
 		mq->mq_maxmsg,
 		mq->mq_curmsgs,
 		mq->mq_msgsize);
 	/* Belt and braces: force termination of the buffer. */
 	line[sizeof(line) - 1] = '\0';
 	len = strlen(line);
 	return (uiomove_frombuf(line, len, ap->a_uio));
 }
 
 #if 0
 struct vop_readdir_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	struct ucred *a_cred;
 	int *a_eofflag;
 	int *a_ncookies;
 	u_long **a_cookies;
 };
 #endif
 
 /*
  * Return directory entries.
  *
  * Walks the children of the directory under mi_lock and emits one
  * fixed-size dirent per child that is visible from the caller's
  * prison root, starting at uio->uio_offset.  Returns ENOTDIR for
  * non-directories and EINVAL for a negative offset.
  */
 static int
 mqfs_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp;
 	struct mqfs_info *mi;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct dirent entry;
 	struct uio *uio;
 	const void *pr_root;
 	int *tmp_ncookies = NULL;
 	off_t offset;
 	int error, i;
 
 	vp = ap->a_vp;
 	mi = VFSTOMQFS(vp->v_mount);
 	pd = VTON(vp);
 	uio = ap->a_uio;
 
 	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	/*
 	 * Report zero cookies and hide the cookie pointer from
 	 * vfs_read_dirent() for the duration of the walk; it is restored
 	 * before returning.
 	 */
 	if (ap->a_ncookies != NULL) {
 		tmp_ncookies = ap->a_ncookies;
 		*ap->a_ncookies = 0;
 		ap->a_ncookies = NULL;
         }
 
 	error = 0;
 	offset = 0;
 
 	pr_root = ap->a_cred->cr_prison->pr_root;
 	sx_xlock(&mi->mi_lock);
 
 	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
 		/* Every entry occupies a full struct dirent. */
 		entry.d_reclen = sizeof(entry);
 
 		/*
 		 * Only show names within the same prison root directory
 		 * (or not associated with a prison, e.g. "." and "..").
 		 */
 		if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root)
 			continue;
 		/* File numbers are assigned lazily, on first listing. */
 		if (!pn->mn_fileno)
 			mqfs_fileno_alloc(mi, pn);
 		entry.d_fileno = pn->mn_fileno;
 		for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i)
 			entry.d_name[i] = pn->mn_name[i];
 		entry.d_namlen = i;
 		switch (pn->mn_type) {
 		case mqfstype_root:
 		case mqfstype_dir:
 		case mqfstype_this:
 		case mqfstype_parent:
 			entry.d_type = DT_DIR;
 			break;
 		case mqfstype_file:
 			entry.d_type = DT_REG;
 			break;
 		case mqfstype_symlink:
 			entry.d_type = DT_LNK;
 			break;
 		default:
 			panic("%s has unexpected node type: %d", pn->mn_name,
 				pn->mn_type);
 		}
 		dirent_terminate(&entry);
 		/* Stop once the caller's buffer cannot take another entry. */
 		if (entry.d_reclen > uio->uio_resid)
                         break;
 		/* Entries before the requested offset are skipped, not copied. */
 		if (offset >= uio->uio_offset) {
 			error = vfs_read_dirent(ap, &entry, offset);
                         if (error)
                                 break;
                 }
                 offset += entry.d_reclen;
 	}
 	sx_xunlock(&mi->mi_lock);
 
 	uio->uio_offset = offset;
 
 	if (tmp_ncookies != NULL)
 		ap->a_ncookies = tmp_ncookies;
 
 	return (error);
 }
 
 #ifdef notyet
 
 #if 0
 struct vop_mkdir_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
 };
 #endif
 
 /*
  * Create a directory.
  *
  * Creates a directory node named by ap->a_cnp under ap->a_dvp and
  * returns its vnode in *ap->a_vpp.  Returns ENOTDIR if the parent is
  * not a directory, ENOSPC if the node cannot be created, or the error
  * from mqfs_allocv().
  */
 static int
 mqfs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct componentname *cnp = ap->a_cnp;
 	struct mqfs_node *pd = VTON(ap->a_dvp);
 	struct mqfs_node *pn;
 	int error;
 
 	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 	sx_xlock(&mqfs->mi_lock);
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("%s: no name", __func__);
 	/*
 	 * The credential comes from the componentname, as in
 	 * mqfs_create(); struct vattr carries no credential.
 	 */
 	pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen,
 		cnp->cn_cred, ap->a_vap->va_mode);
 	/* Hold the node across the unlock while its vnode is allocated. */
 	if (pn != NULL)
 		mqnode_addref(pn);
 	sx_xunlock(&mqfs->mi_lock);
 	if (pn == NULL) {
 		error = ENOSPC;
 	} else {
 		error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
 		mqnode_release(pn);
 	}
 	return (error);
 }
 
 #if 0
 struct vop_rmdir_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * Remove a directory.
  */
 static int
 mqfs_rmdir(struct vop_rmdir_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct mqfs_node *pn = VTON(ap->a_vp);
 	struct mqfs_node *pt;
 
 	if (pn->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 
 	sx_xlock(&mqfs->mi_lock);
 	if (pn->mn_deleted) {
 		sx_xunlock(&mqfs->mi_lock);
 		return (ENOENT);
 	}
 
 	pt = LIST_FIRST(&pn->mn_children);
 	pt = LIST_NEXT(pt, mn_sibling);
 	pt = LIST_NEXT(pt, mn_sibling);
 	if (pt != NULL) {
 		sx_xunlock(&mqfs->mi_lock);
 		return (ENOTEMPTY);
 	}
 	pt = pn->mn_parent;
 	pn->mn_parent = NULL;
 	pn->mn_deleted = 1;
 	LIST_REMOVE(pn, mn_sibling);
 	mqnode_release(pn);
 	mqnode_release(pt);
 	sx_xunlock(&mqfs->mi_lock);
 	cache_purge(ap->a_vp);
 	return (0);
 }
 
 #endif /* notyet */
 
 /*
  * Jail removal hook: if no other referenced prison shares the root
  * directory of the prison being removed, unlink every queue under the
  * mqfs root that is tagged with that root.  Always returns 0.
  */
 static int
 mqfs_prison_remove(void *obj, void *data __unused)
 {
 	const struct prison *pr, *other;
 	struct mqfs_node *node, *next;
 	int shared;
 
 	pr = obj;
 	shared = 0;
 	TAILQ_FOREACH(other, &allprison, pr_list) {
 		if (other->pr_root == pr->pr_root && other != pr &&
 		    other->pr_ref > 0)
 			shared = 1;
 	}
 	if (shared)
 		return (0);
 
 	/*
 	 * No jails are rooted in this directory anymore,
 	 * so no queues should be either.
 	 */
 	sx_xlock(&mqfs_data.mi_lock);
 	LIST_FOREACH_SAFE(node, &mqfs_data.mi_root->mn_children,
 	    mn_sibling, next) {
 		if (node->mn_pr_root == pr->pr_root)
 			(void)do_unlink(node, curthread->td_ucred);
 	}
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (0);
 }
 
 /*
  * Allocate a message queue.
  *
  * Returns NULL when the number of queues has reached maxmq.  Limits
  * are taken from attr when it is non-NULL and from the defaults
  * otherwise.  The global queue count is incremented on success.
  */
 static struct mqueue *
 mqueue_alloc(const struct mq_attr *attr)
 {
 	struct mqueue *q;
 
 	if (curmq >= maxmq)
 		return (NULL);
 
 	q = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO);
 	TAILQ_INIT(&q->mq_msgq);
 	if (attr == NULL) {
 		q->mq_maxmsg = default_maxmsg;
 		q->mq_msgsize = default_msgsize;
 	} else {
 		q->mq_maxmsg = attr->mq_maxmsg;
 		q->mq_msgsize = attr->mq_msgsize;
 	}
 	/* Both knote lists are protected by the queue mutex. */
 	mtx_init(&q->mq_mutex, "mqueue lock", NULL, MTX_DEF);
 	knlist_init_mtx(&q->mq_rsel.si_note, &q->mq_mutex);
 	knlist_init_mtx(&q->mq_wsel.si_note, &q->mq_mutex);
 	atomic_add_int(&curmq, 1);
 	return (q);
 }
 
 /*
  * Destroy a message queue: free every queued message, tear down the
  * mutex, select and knote state, return the queue to its zone and
  * decrement the global queue count.
  */
 static void
 mqueue_free(struct mqueue *mq)
 {
 	struct mqueue_msg *m;
 
 	while (!TAILQ_EMPTY(&mq->mq_msgq)) {
 		m = TAILQ_FIRST(&mq->mq_msgq);
 		TAILQ_REMOVE(&mq->mq_msgq, m, msg_link);
 		free(m, M_MQUEUEDATA);
 	}
 
 	mtx_destroy(&mq->mq_mutex);
 	seldrain(&mq->mq_rsel);
 	seldrain(&mq->mq_wsel);
 	knlist_destroy(&mq->mq_rsel.si_note);
 	knlist_destroy(&mq->mq_wsel.si_note);
 	uma_zfree(mqueue_zone, mq);
 	atomic_add_int(&curmq, -1);
 }
 
 /*
  * Load a message from user space.
  *
  * Allocates a message header followed by msg_size payload bytes and
  * copies the payload in from msg_ptr.  Returns NULL if the copy
  * fails; otherwise the message with its size and priority set.
  */
 static struct mqueue_msg *
 mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio)
 {
 	struct mqueue_msg *m;
 	char *payload;
 	size_t total;
 
 	total = sizeof(struct mqueue_msg) + msg_size;
 	m = malloc(total, M_MQUEUEDATA, M_WAITOK);
 	/* The payload lives directly behind the header. */
 	payload = ((char *)m) + sizeof(struct mqueue_msg);
 	if (copyin(msg_ptr, payload, msg_size) != 0) {
 		free(m, M_MQUEUEDATA);
 		return (NULL);
 	}
 	m->msg_size = msg_size;
 	m->msg_prio = msg_prio;
 	return (m);
 }
 
 /*
  * Save a message to user space
  */
 static int
 mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio)
 {
 	int error;
 
 	error = copyout(((char *)msg) + sizeof(*msg), msg_ptr,
 		msg->msg_size);
 	if (error == 0 && msg_prio != NULL)
 		error = copyout(&msg->msg_prio, msg_prio, sizeof(int));
 	return (error);
 }
 
 /*
  * Free a message's memory (header and payload are one M_MQUEUEDATA
  * allocation, see mqueue_loadmsg()).
  */
 static __inline void
 mqueue_freemsg(struct mqueue_msg *msg)
 {
 	free(msg, M_MQUEUEDATA);
 }
 
 /*
  * Send a message.  If waitok is false the thread is never blocked
  * when the queue is full; otherwise it waits until the absolute time
  * abs_timeout, or without a deadline when abs_timeout is NULL.
  *
  * Returns 0 once the message is queued.  On failure the copied-in
  * message is freed and an errno value is returned: EINVAL for a
  * priority of MQ_PRIO_MAX or more or a malformed tv_nsec, EMSGSIZE
  * when msg_len exceeds the queue's message size, EFAULT when the
  * message cannot be loaded, or the error from _mqueue_send().
  */
 int
 mqueue_send(struct mqueue *mq, const char *msg_ptr,
 	size_t msg_len, unsigned msg_prio, int waitok,
 	const struct timespec *abs_timeout)
 {
 	struct mqueue_msg *msg;
 	struct timespec ts, ts2;
 	struct timeval tv;
 	int error;
 
 	if (msg_prio >= MQ_PRIO_MAX)
 		return (EINVAL);
 	if (msg_len > mq->mq_msgsize)
 		return (EMSGSIZE);
 	msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio);
 	if (msg == NULL)
 		return (EFAULT);
 
 	/* O_NONBLOCK case */
 	if (!waitok) {
 		/* A negative timeout makes _mqueue_send() fail with EAGAIN instead of sleeping. */
 		error = _mqueue_send(mq, msg, -1);
 		if (error)
 			goto bad;
 		return (0);
 	}
 
 	/* we allow a null timeout (wait forever) */
 	if (abs_timeout == NULL) {
 		error = _mqueue_send(mq, msg, 0);
 		if (error)
 			goto bad;
 		return (0);
 	}
 
 	/* send it before checking time */
 	error = _mqueue_send(mq, msg, -1);
 	if (error == 0)
 		return (0);
 
 	if (error != EAGAIN)
 		goto bad;
 
 	/* The timeout is only validated once the queue turned out to be full. */
 	if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
 		error = EINVAL;
 		goto bad;
 	}
 	/* Recompute the remaining time on every pass until it runs out. */
 	for (;;) {
 		getnanotime(&ts);
 		timespecsub(abs_timeout, &ts, &ts2);
 		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
 			error = ETIMEDOUT;
 			break;
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
 		error = _mqueue_send(mq, msg, tvtohz(&tv));
 		if (error != ETIMEDOUT)
 			break;
 	}
 	if (error == 0)
 		return (0);
 bad:
 	/* The message was never queued, so it is still ours to free. */
 	mqueue_freemsg(msg);
 	return (error);
 }
 
 /*
  * Common routine to send a message
  *
  * Queues msg on mq in priority order.  While the queue is full: a
  * negative timo returns EAGAIN immediately, otherwise the thread
  * sleeps with timo passed to msleep() unchanged.  Returns 0 when the
  * message was queued, else the sleep error (EAGAIN from msleep() is
  * reported as ETIMEDOUT).
  */
 static int
 _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo)
 {	
 	struct mqueue_msg *msg2;
 	int error = 0;
 
 	mtx_lock(&mq->mq_mutex);
 	while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) {
 		if (timo < 0) {
 			mtx_unlock(&mq->mq_mutex);
 			return (EAGAIN);
 		}
 		/* mq_senders counts sleepers so receivers know to wake one. */
 		mq->mq_senders++;
 		error = msleep(&mq->mq_senders, &mq->mq_mutex,
 			    PCATCH, "mqsend", timo);
 		mq->mq_senders--;
 		if (error == EAGAIN)
 			error = ETIMEDOUT;
 	}
 	/* Still full: the sleep ended with an error. */
 	if (mq->mq_curmsgs >= mq->mq_maxmsg) {
 		mtx_unlock(&mq->mq_mutex);
 		return (error);
 	}
 	error = 0;
 	/*
 	 * Keep the list sorted by descending priority; a message goes
 	 * behind existing messages of the same priority.
 	 */
 	if (TAILQ_EMPTY(&mq->mq_msgq)) {
 		TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link);
 	} else {
 		if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) {
 			TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link);
 		} else {
 			/* The tail has lower priority, so a match always exists. */
 			TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) {
 				if (msg2->msg_prio < msg->msg_prio)
 					break;
 			}
 			TAILQ_INSERT_BEFORE(msg2, msg, msg_link);
 		}
 	}
 	mq->mq_curmsgs++;
 	mq->mq_totalbytes += msg->msg_size;
 	/* Prefer waking a blocked receiver over sending a notification. */
 	if (mq->mq_receivers)
 		wakeup_one(&mq->mq_receivers);
 	else if (mq->mq_notifier != NULL)
 		mqueue_send_notification(mq);
 	if (mq->mq_flags & MQ_RSEL) {
 		mq->mq_flags &= ~MQ_RSEL;
 		selwakeup(&mq->mq_rsel);
 	}
 	KNOTE_LOCKED(&mq->mq_rsel.si_note, 0);
 	mtx_unlock(&mq->mq_mutex);
 	return (0);
 }
 
 /*
  * Send a realtime signal to the process that registered itself
  * successfully through mq_notify.
  *
  * Must be called with the queue mutex held.  The registration is
  * one-shot: mq->mq_notifier is cleared on every path.
  */
 static void
 mqueue_send_notification(struct mqueue *mq)
 {
 	struct mqueue_notifier *nt;
 	struct thread *td;
 	struct proc *p;
 	int error;
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	nt = mq->mq_notifier;
 	if (nt->nt_sigev.sigev_notify != SIGEV_NONE) {
 		p = nt->nt_proc;
 		error = sigev_findtd(p, &nt->nt_sigev, &td);
 		if (error) {
 			mq->mq_notifier = NULL;
 			return;
 		}
 		/* Do not queue the signal again while it is still pending. */
 		if (!KSI_ONQ(&nt->nt_ksi)) {
 			ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev);
 			tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi);
 		}
 		/* NOTE(review): presumably sigev_findtd() returns with p locked on success -- confirm. */
 		PROC_UNLOCK(p);
 	}
 	mq->mq_notifier = NULL;
 }
 
 /*
  * Get a message.  If waitok is false the thread is never blocked
  * when the queue is empty; otherwise it waits until the absolute time
  * abs_timeout, or without a deadline when abs_timeout is NULL.
  *
  * The caller's buffer must be at least as large as the queue's
  * message size, else EMSGSIZE is returned.  On success the message is
  * copied out, its size is stored in td_retval[0], and 0 is returned.
  * The dequeued message is freed whether or not the copy-out succeeds.
  */
 int
 mqueue_receive(struct mqueue *mq, char *msg_ptr,
 	size_t msg_len, unsigned *msg_prio, int waitok,
 	const struct timespec *abs_timeout)
 {
 	struct mqueue_msg *msg;
 	struct timespec ts, ts2;
 	struct timeval tv;
 	int error;
 
 	if (msg_len < mq->mq_msgsize)
 		return (EMSGSIZE);
 
 	/* O_NONBLOCK case */
 	if (!waitok) {
 		/* A negative timeout makes _mqueue_recv() fail with EAGAIN instead of sleeping. */
 		error = _mqueue_recv(mq, &msg, -1);
 		if (error)
 			return (error);
 		goto received;
 	}
 
 	/* we allow a null timeout (wait forever). */
 	if (abs_timeout == NULL) {
 		error = _mqueue_recv(mq, &msg, 0);
 		if (error)
 			return (error);
 		goto received;
 	}
 
 	/* try to get a message before checking time */
 	error = _mqueue_recv(mq, &msg, -1);
 	if (error == 0)
 		goto received;
 
 	if (error != EAGAIN)
 		return (error);
 
 	/* The timeout is only validated once the queue turned out to be empty. */
 	if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
 		error = EINVAL;
 		return (error);
 	}
 
 	/* Recompute the remaining time on every pass until it runs out. */
 	for (;;) {
 		getnanotime(&ts);
 		timespecsub(abs_timeout, &ts, &ts2);
 		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
 			error = ETIMEDOUT;
 			return (error);
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
 		error = _mqueue_recv(mq, &msg, tvtohz(&tv));
 		if (error == 0)
 			break;
 		if (error != ETIMEDOUT)
 			return (error);
 	}
 
 received:
 	error = mqueue_savemsg(msg, msg_ptr, msg_prio);
 	if (error == 0) {
 		curthread->td_retval[0] = msg->msg_size;
 		curthread->td_retval[1] = 0;
 	}
 	mqueue_freemsg(msg);
 	return (error);
 }
 
 /*
  * Common routine to receive a message
  *
  * Removes the message at the head of the queue and returns it in
  * *msg.  While the queue is empty: a negative timo returns EAGAIN
  * immediately, otherwise the thread sleeps with timo passed to
  * msleep() unchanged.  Returns 0 when a message was dequeued, else
  * the sleep error (EAGAIN from msleep() is reported as ETIMEDOUT).
  */
 static int
 _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo)
 {	
 	int error = 0;
 	
 	mtx_lock(&mq->mq_mutex);
 	while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) {
 		if (timo < 0) {
 			mtx_unlock(&mq->mq_mutex);
 			return (EAGAIN);
 		}
 		/* mq_receivers counts sleepers so senders know to wake one. */
 		mq->mq_receivers++;
 		error = msleep(&mq->mq_receivers, &mq->mq_mutex,
 			    PCATCH, "mqrecv", timo);
 		mq->mq_receivers--;
 		if (error == EAGAIN)
 			error = ETIMEDOUT;
 	}
 	if (*msg != NULL) {
 		/* A message is available: any sleep error is discarded. */
 		error = 0;
 		TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link);
 		mq->mq_curmsgs--;
 		mq->mq_totalbytes -= (*msg)->msg_size;
 		/* Room was made: let one blocked sender and any pollers know. */
 		if (mq->mq_senders)
 			wakeup_one(&mq->mq_senders);
 		if (mq->mq_flags & MQ_WSEL) {
 			mq->mq_flags &= ~MQ_WSEL;
 			selwakeup(&mq->mq_wsel);
 		}
 		KNOTE_LOCKED(&mq->mq_wsel.si_note, 0);
 	}
 	/*
 	 * Messages remain and nobody is blocked receiving: fire the
 	 * registered notification, if any.
 	 */
 	if (mq->mq_notifier != NULL && mq->mq_receivers == 0 &&
 	    !TAILQ_EMPTY(&mq->mq_msgq)) {
 		mqueue_send_notification(mq);
 	}
 	mtx_unlock(&mq->mq_mutex);
 	return (error);
 }
 
 /*
  * Allocate a zero-filled notifier from mqnoti_zone.  The request uses
  * M_WAITOK; kern_kmq_notify() releases the process lock and the queue
  * mutex before calling this.
  */
 static __inline struct mqueue_notifier *
 notifier_alloc(void)
 {
 	return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO));
 }
 
 /*
  * Return notifier p to mqnoti_zone.
  */
 static __inline void
 notifier_free(struct mqueue_notifier *p)
 {
 	uma_zfree(mqnoti_zone, p);
 }
 
 /*
  * Find the notifier that process p registered for descriptor fd.
  * Walks p->p_mqnotifier and compares each entry's ksi_mqd against fd.
  * Returns the matching notifier, or NULL when there is none.
  */
 static struct mqueue_notifier *
 notifier_search(struct proc *p, int fd)
 {
 	struct mqueue_notifier *cur;
 
 	LIST_FOREACH(cur, &p->p_mqnotifier, nt_link) {
 		if (cur->nt_ksi.ksi_mqd == fd)
 			return (cur);
 	}
 	return (NULL);
 }
 
 /*
  * Link nt at the head of process p's notifier list.  The caller in this
  * file (kern_kmq_notify()) does so with the process lock held.
  */
 static __inline void
 notifier_insert(struct proc *p, struct mqueue_notifier *nt)
 {
 	LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link);
 }
 
 /*
  * Unlink nt from the list it is on and free it.  The p argument is not
  * used by the body.
  */
 static __inline void
 notifier_delete(struct proc *p, struct mqueue_notifier *nt)
 {
 	LIST_REMOVE(nt, nt_link);
 	notifier_free(nt);
 }
 
 /*
  * Drop the notifier, if any, that process p registered for descriptor fd.
  *
  * The caller must hold mq->mq_mutex (asserted).  If the notifier found is
  * the one currently attached to mq, the queue's notifier pointer is
  * cleared as well.  The notifier's ksiginfo is taken back with
  * sigqueue_take() before the notifier is unlinked and freed.
  */
 static void
 notifier_remove(struct proc *p, struct mqueue *mq, int fd)
 {
 	struct mqueue_notifier *nt;
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	PROC_LOCK(p);
 	nt = notifier_search(p, fd);
 	if (nt != NULL) {
 		/* Detach from the queue only if it is this queue's notifier. */
 		if (mq->mq_notifier == nt)
 			mq->mq_notifier = NULL;
 		sigqueue_take(&nt->nt_ksi);
 		notifier_delete(p, nt);
 	}
 	PROC_UNLOCK(p);
 }
 
 /*
  * Open, and optionally create, the message queue named by the user-space
  * string upath and install it on a new file descriptor.
  *
  * td     calling thread; the new descriptor is returned in td_retval[0].
  * upath  user pointer to the queue name: must be "/" followed by at
  *        least one character and contain no further slashes.
  * flags  open flags in kernel (FFLAGS) form.
  * mode   creation mode; masked by the process umask before use.
  * attr   queue attributes, only consulted when O_CREAT is set; may be
  *        NULL.
  *
  * Returns 0 on success or an errno: EINVAL for out-of-range attributes or
  * a malformed name, ENOENT when the queue is missing and O_CREAT is not
  * set, ENFILE/ENOSPC when creation fails, EEXIST for O_CREAT|O_EXCL on an
  * existing queue, or whatever copyinstr(), falloc() or vaccess() return.
  */
 static int
 kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode,
     const struct mq_attr *attr)
 {
 	char path[MQFS_NAMELEN + 1];
 	struct mqfs_node *pn;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct mqueue *mq;
 	int fd, error, len, cmode;
 
 	AUDIT_ARG_FFLAGS(flags);
 	AUDIT_ARG_MODE(mode);
 
 	fdp = td->td_proc->p_fd;
 	/* Apply the umask and drop the sticky bit from the creation mode. */
 	cmode = (((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT);
 	mq = NULL;
 	/* Range-check caller supplied attributes against the limits. */
 	if ((flags & O_CREAT) != 0 && attr != NULL) {
 		if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg)
 			return (EINVAL);
 		if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize)
 			return (EINVAL);
 	}
 
 	error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL);
         if (error)
 		return (error);
 
 	/*
 	 * The first character of name must be a slash (/) character
 	 * and the remaining characters of name cannot include any slash
 	 * characters.
 	 */
 	len = strlen(path);
 	if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
 		return (EINVAL);
 	/*
 	 * "." and ".." are magic directories, populated on the fly, and cannot
 	 * be opened as queues.
 	 */
 	if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0)
 		return (EINVAL);
 	AUDIT_ARG_UPATH1_CANON(path);
 
 	/* Reserve the descriptor before taking the filesystem lock. */
 	error = falloc(td, &fp, &fd, O_CLOEXEC);
 	if (error)
 		return (error);
 
 	sx_xlock(&mqfs_data.mi_lock);
 	/* Look the name up without its leading slash. */
 	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred);
 	if (pn == NULL) {
 		if (!(flags & O_CREAT)) {
 			error = ENOENT;
 		} else {
 			mq = mqueue_alloc(attr);
 			if (mq == NULL) {
 				error = ENFILE;
 			} else {
 				pn = mqfs_create_file(mqfs_data.mi_root,
 				         path + 1, len - 1, td->td_ucred,
 					 cmode);
 				if (pn == NULL) {
 					error = ENOSPC;
 					mqueue_free(mq);
 				}
 			}
 		}
 
 		/* Attach the freshly allocated queue to its new node. */
 		if (error == 0) {
 			pn->mn_data = mq;
 		}
 	} else {
 		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) {
 			error = EEXIST;
 		} else {
 			accmode_t accmode = 0;
 
 			/* Check the requested access against the node. */
 			if (flags & FREAD)
 				accmode |= VREAD;
 			if (flags & FWRITE)
 				accmode |= VWRITE;
 			error = vaccess(VREG, pn->mn_mode, pn->mn_uid,
 			    pn->mn_gid, accmode, td->td_ucred);
 		}
 	}
 
 	if (error) {
 		/* Undo falloc(): release the slot and our file reference. */
 		sx_xunlock(&mqfs_data.mi_lock);
 		fdclose(td, fp, fd);
 		fdrop(fp, td);
 		return (error);
 	}
 
 	/* The file keeps its own reference on the node (see mqf_close()). */
 	mqnode_addref(pn);
 	sx_xunlock(&mqfs_data.mi_lock);
 
 	finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn,
 	    &mqueueops);
 
 	td->td_retval[0] = fd;
 	fdrop(fp, td);
 	return (0);
 }
 
 /*
  * Syscall to open a message queue.
  *
  * Rejects flag combinations where both access-mode bits are set or
  * O_EXEC is requested, converts the flags with FFLAGS(), copies in the
  * attribute structure when O_CREAT is set and one was supplied, and hands
  * off to kern_kmq_open().
  *
  * The attribute pointer passed down is non-NULL only when the structure
  * was actually copied in, so kern_kmq_open() is never given a pointer to
  * uninitialised stack memory (previously that happened when the caller
  * supplied attr without O_CREAT).
  *
  * Returns EINVAL for bad flags, the copyin() error, or the result of
  * kern_kmq_open().
  */
 int
 sys_kmq_open(struct thread *td, struct kmq_open_args *uap)
 {
 	struct mq_attr attr;
 	const struct mq_attr *attrp;
 	int flags, error;
 
 	if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC)
 		return (EINVAL);
 	flags = FFLAGS(uap->flags);
 	attrp = NULL;
 	if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
 		error = copyin(uap->attr, &attr, sizeof(attr));
 		if (error)
 			return (error);
 		attrp = &attr;
 	}
 	return (kern_kmq_open(td, uap->path, flags, uap->mode, attrp));
 }
 
 /*
  * Syscall to unlink a message queue.
  */
 int
 sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
 {
 	char path[MQFS_NAMELEN+1];
 	struct mqfs_node *pn;
 	int error, len;
 
 	error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
         if (error)
 		return (error);
 
 	len = strlen(path);
 	if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
 		return (EINVAL);
 	if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0)
 		return (EINVAL);
 	AUDIT_ARG_UPATH1_CANON(path);
 
 	sx_xlock(&mqfs_data.mi_lock);
 	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred);
 	if (pn != NULL)
 		error = do_unlink(pn, td->td_ucred);
 	else
 		error = ENOENT;
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (error);
 }
 
 typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **);
 
 /*
  * Resolve a descriptor to its message queue.
  *
  * func is the fget-style routine used to obtain the file (with rightsp as
  * its capability rights argument).  On success the referenced file is
  * returned in *fpp and the caller is responsible for fdrop()ing it; the
  * node and queue are stored through ppn and pmq when those are non-NULL.
  * Returns the error from func, or EBADF (after dropping the reference)
  * when the descriptor is not a message queue.
  */
 static int
 _getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func,
        struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq)
 {
 	struct file *fp;
 	struct mqfs_node *node;
 	int error;
 
 	error = func(td, fd, rightsp, fpp);
 	if (error != 0)
 		return (error);
 	fp = *fpp;
 	if (fp->f_ops != &mqueueops) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	node = fp->f_data;
 	if (ppn != NULL)
 		*ppn = node;
 	if (pmq != NULL)
 		*pmq = node->mn_data;
 	return (0);
 }
 
 /*
  * _getmq() using fget() with cap_event_rights.  Used by the setattr and
  * notify paths in this file.
  */
 static __inline int
 getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn,
 	struct mqueue **pmq)
 {
 
 	return _getmq(td, fd, &cap_event_rights, fget,
 	    fpp, ppn, pmq);
 }
 
 /*
  * _getmq() using fget_read() with cap_read_rights.  Used by the receive
  * path.
  */
 static __inline int
 getmq_read(struct thread *td, int fd, struct file **fpp,
 	 struct mqfs_node **ppn, struct mqueue **pmq)
 {
 
 	return _getmq(td, fd, &cap_read_rights, fget_read,
 	    fpp, ppn, pmq);
 }
 
 /*
  * _getmq() using fget_write() with cap_write_rights.  Used by the send
  * path.
  */
 static __inline int
 getmq_write(struct thread *td, int fd, struct file **fpp,
 	struct mqfs_node **ppn, struct mqueue **pmq)
 {
 
 	return _getmq(td, fd, &cap_write_rights, fget_write,
 	    fpp, ppn, pmq);
 }
 
 /*
  * Fetch, and optionally change, the attributes of the queue open on mqd.
  *
  * attr   new attributes or NULL; only O_NONBLOCK in mq_flags may be set,
  *        anything else yields EINVAL.
  * oattr  receives the queue limits, current message count and the
  *        O_NONBLOCK state; it is written unconditionally and therefore
  *        must not be NULL.  When attr is given, the reported flag is the
  *        value from before the change.
  *
  * Returns 0, EINVAL, or the error from getmq().
  */
 static int
 kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr,
     struct mq_attr *oattr)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	u_int oflag, flag;
 	int error;
 
 	AUDIT_ARG_FD(mqd);
 	if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0)
 		return (EINVAL);
 	error = getmq(td, mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	oattr->mq_maxmsg  = mq->mq_maxmsg;
 	oattr->mq_msgsize = mq->mq_msgsize;
 	oattr->mq_curmsgs = mq->mq_curmsgs;
 	if (attr != NULL) {
 		/*
 		 * Replace only the O_NONBLOCK bit of f_flag, retrying the
 		 * compare-and-set until no other update raced with us.
 		 */
 		do {
 			oflag = flag = fp->f_flag;
 			flag &= ~O_NONBLOCK;
 			flag |= (attr->mq_flags & O_NONBLOCK);
 		} while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0);
 	} else
 		oflag = fp->f_flag;
 	oattr->mq_flags = (O_NONBLOCK & oflag);
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
 {
 	struct mq_attr attr, oattr;
 	int error;
 
 	if (uap->attr != NULL) {
 		error = copyin(uap->attr, &attr, sizeof(attr));
 		if (error != 0)
 			return (error);
 	}
 	error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
 	    &oattr);
 	if (error == 0 && uap->oattr != NULL) {
 		bzero(oattr.__reserved, sizeof(oattr.__reserved));
 		error = copyout(&oattr, uap->oattr, sizeof(oattr));
 	}
 	return (error);
 }
 
 int
 sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec *abs_timeout, ets;
 	int error;
 	int waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets, sizeof(ets));
 		if (error != 0)
 			goto out;
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec *abs_timeout, ets;
 	int error, waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets, sizeof(ets));
 		if (error != 0)
 			goto out;
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Register or remove the calling process's notification for the queue
  * open on descriptor mqd.
  *
  * sigev  NULL removes the process's notifier for mqd.  Otherwise its
  *        sigev_notify must be SIGEV_SIGNAL, SIGEV_THREAD_ID or
  *        SIGEV_NONE, and for the first two sigev_signo must be a valid
  *        signal number.
  *
  * Returns 0 on success, EINVAL for a bad sigevent, EBADF when mqd no
  * longer refers to the same file, EBUSY when the queue already has a
  * notifier, or the error from getmq()/cap_check().
  */
 static int
 kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	struct mqueue *mq;
 	struct file *fp, *fp2;
 	struct mqueue_notifier *nt, *newnt = NULL;
 	int error;
 
 	AUDIT_ARG_FD(mqd);
 	if (sigev != NULL) {
 		if (sigev->sigev_notify != SIGEV_SIGNAL &&
 		    sigev->sigev_notify != SIGEV_THREAD_ID &&
 		    sigev->sigev_notify != SIGEV_NONE)
 			return (EINVAL);
 		if ((sigev->sigev_notify == SIGEV_SIGNAL ||
 		    sigev->sigev_notify == SIGEV_THREAD_ID) &&
 		    !_SIG_VALID(sigev->sigev_signo))
 			return (EINVAL);
 	}
 	p = td->td_proc;
 	fdp = td->td_proc->p_fd;
 	error = getmq(td, mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 again:
 	/*
 	 * Re-validate the descriptor under the descriptor-table lock: it
 	 * must still name the file obtained above.  This is repeated after
 	 * every trip through the allocation path below.
 	 */
 	FILEDESC_SLOCK(fdp);
 	fp2 = fget_locked(fdp, mqd);
 	if (fp2 == NULL) {
 		FILEDESC_SUNLOCK(fdp);
 		error = EBADF;
 		goto out;
 	}
 #ifdef CAPABILITIES
 	error = cap_check(cap_rights(fdp, mqd), &cap_event_rights);
 	if (error) {
 		FILEDESC_SUNLOCK(fdp);
 		goto out;
 	}
 #endif
 	if (fp2 != fp) {
 		FILEDESC_SUNLOCK(fdp);
 		error = EBADF;
 		goto out;
 	}
 	/* Take the queue mutex before letting go of the table lock. */
 	mtx_lock(&mq->mq_mutex);
 	FILEDESC_SUNLOCK(fdp);
 	if (sigev != NULL) {
 		if (mq->mq_notifier != NULL) {
 			error = EBUSY;
 		} else {
 			PROC_LOCK(p);
 			nt = notifier_search(p, mqd);
 			if (nt == NULL) {
 				if (newnt == NULL) {
 					/*
 					 * A new notifier is needed: drop both
 					 * locks, allocate, and start over.
 					 */
 					PROC_UNLOCK(p);
 					mtx_unlock(&mq->mq_mutex);
 					newnt = notifier_alloc();
 					goto again;
 				}
 			}
 
 			if (nt != NULL) {
 				/*
 				 * Reuse the existing notifier; a
 				 * preallocated one is no longer needed.
 				 */
 				sigqueue_take(&nt->nt_ksi);
 				if (newnt != NULL) {
 					notifier_free(newnt);
 					newnt = NULL;
 				}
 			} else {
 				/* Initialise and publish the new notifier. */
 				nt = newnt;
 				newnt = NULL;
 				ksiginfo_init(&nt->nt_ksi);
 				nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT;
 				nt->nt_ksi.ksi_code = SI_MESGQ;
 				nt->nt_proc = p;
 				nt->nt_ksi.ksi_mqd = mqd;
 				notifier_insert(p, nt);
 			}
 			nt->nt_sigev = *sigev;
 			mq->mq_notifier = nt;
 			PROC_UNLOCK(p);
 			/*
 			 * If there are no receivers and the message queue
 			 * is not empty, we should send the notification
 			 * as soon as possible.
 			 */
 			if (mq->mq_receivers == 0 &&
 			    !TAILQ_EMPTY(&mq->mq_msgq))
 				mqueue_send_notification(mq);
 		}
 	} else {
 		notifier_remove(p, mq, mqd);
 	}
 	mtx_unlock(&mq->mq_mutex);
 
 out:
 	fdrop(fp, td);
 	/* Free a preallocated notifier that ended up unused. */
 	if (newnt != NULL)
 		notifier_free(newnt);
 	return (error);
 }
 
 /*
  * Syscall to register or remove a queue notification.
  *
  * A NULL uap->sigev is passed through as NULL (removal); otherwise the
  * sigevent is copied in and handed to kern_kmq_notify().
  */
 int
 sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap)
 {
 	struct sigevent ev;
 	int error;
 
 	if (uap->sigev == NULL)
 		return (kern_kmq_notify(td, uap->mqd, NULL));
 
 	error = copyin(uap->sigev, &ev, sizeof(ev));
 	if (error != 0)
 		return (error);
 	return (kern_kmq_notify(td, uap->mqd, &ev));
 }
 
 /*
  * Per-descriptor close processing for message queues.
  *
  * Does nothing unless fp is a message queue file.  Otherwise, with the
  * queue mutex held, removes the calling process's notifier for fd and
  * wakes any select/poll waiters recorded on the queue.  With INVARIANTS
  * the descriptor-table lock is asserted.
  */
 static void
 mqueue_fdclose(struct thread *td, int fd, struct file *fp)
 {
 	struct mqueue *mq;
 #ifdef INVARIANTS
 	struct filedesc *fdp;
  
 	fdp = td->td_proc->p_fd;
 	FILEDESC_LOCK_ASSERT(fdp);
 #endif
 
 	if (fp->f_ops == &mqueueops) {
 		mq = FPTOMQ(fp);
 		mtx_lock(&mq->mq_mutex);
 		notifier_remove(td->td_proc, mq, fd);
 
 		/* have to wakeup thread in same process */
 		if (mq->mq_flags & MQ_RSEL) {
 			mq->mq_flags &= ~MQ_RSEL;
 			selwakeup(&mq->mq_rsel);
 		}
 		if (mq->mq_flags & MQ_WSEL) {
 			mq->mq_flags &= ~MQ_WSEL;
 			selwakeup(&mq->mq_wsel);
 		}
 		mtx_unlock(&mq->mq_mutex);
 	}
 }
 
 /*
  * Process-exit hook: remove every notifier the exiting process p still
  * has registered.
  *
  * Walks p's descriptor table under the shared table lock and, for each
  * message queue descriptor, removes the matching notifier with the queue
  * mutex held.  Afterwards the process's notifier list is asserted to be
  * empty.
  */
 static void
 mq_proc_exit(void *arg __unused, struct proc *p)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	struct mqueue *mq;
 	int i;
 
 	fdp = p->p_fd;
 	FILEDESC_SLOCK(fdp);
 	for (i = 0; i < fdp->fd_nfiles; ++i) {
 		fp = fget_locked(fdp, i);
 		if (fp != NULL && fp->f_ops == &mqueueops) {
 			mq = FPTOMQ(fp);
 			mtx_lock(&mq->mq_mutex);
 			/* FPTOMQ(fp) here is the same queue as mq. */
 			notifier_remove(p, FPTOMQ(fp), i);
 			mtx_unlock(&mq->mq_mutex);
 		}
 	}
 	FILEDESC_SUNLOCK(fdp);
 	KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left"));
 }
 
 /*
  * fo_poll method for message queue files.
  *
  * Readable (POLLIN/POLLRDNORM) when the queue holds at least one message;
  * writable (POLLOUT) when it holds fewer than mq_maxmsg.  For each
  * requested condition that is not currently true, the thread is recorded
  * on the corresponding selinfo and the MQ_RSEL/MQ_WSEL flag is set so a
  * later state change issues the wakeup.  Returns the ready subset of
  * events.
  */
 static int
 mqf_poll(struct file *fp, int events, struct ucred *active_cred,
 	struct thread *td)
 {
 	struct mqueue *mq = FPTOMQ(fp);
 	int revents = 0;
 
 	mtx_lock(&mq->mq_mutex);
 	if (events & (POLLIN | POLLRDNORM)) {
 		if (mq->mq_curmsgs) {
 			revents |= events & (POLLIN | POLLRDNORM);
 		} else {
 			mq->mq_flags |= MQ_RSEL;
 			selrecord(td, &mq->mq_rsel);
  		}
 	}
 	if (events & POLLOUT) {
 		if (mq->mq_curmsgs < mq->mq_maxmsg)
 			revents |= POLLOUT;
 		else {
 			mq->mq_flags |= MQ_WSEL;
 			selrecord(td, &mq->mq_wsel);
 		}
 	}
 	mtx_unlock(&mq->mq_mutex);
 	return (revents);
 }
 
 /*
  * fo_close method for message queue files.
  *
  * Points the file at badfileops and clears f_data, then releases the
  * node reference the file held (taken in kern_kmq_open()) under the mqfs
  * lock.  Always returns 0.
  */
 static int
 mqf_close(struct file *fp, struct thread *td)
 {
 	struct mqfs_node *pn;
 
 	fp->f_ops = &badfileops;
 	pn = fp->f_data;
 	fp->f_data = NULL;
 	sx_xlock(&mqfs_data.mi_lock);
 	mqnode_release(pn);
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (0);
 }
 
 /*
  * fo_stat method for message queue files.
  *
  * Zeroes *st, then fills in the timestamps, owner, group and mode from
  * the backing node while holding the mqfs lock.  The mode is reported
  * with the S_IFIFO file type.  Always returns 0.
  */
 static int
 mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
 	struct thread *td)
 {
 	struct mqfs_node *pn = fp->f_data;
 
 	bzero(st, sizeof *st);
 	sx_xlock(&mqfs_data.mi_lock);
 	st->st_atim = pn->mn_atime;
 	st->st_mtim = pn->mn_mtime;
 	st->st_ctim = pn->mn_ctime;
 	st->st_birthtim = pn->mn_birth;
 	st->st_uid = pn->mn_uid;
 	st->st_gid = pn->mn_gid;
 	st->st_mode = S_IFIFO | pn->mn_mode;
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (0);
 }
 
 /*
  * fo_chmod method for message queue files.
  *
  * Under the mqfs lock, checks that active_cred has VADMIN access to the
  * node and, if so, replaces the node's mode with the ACCESSPERMS bits of
  * mode.  Returns 0 or the vaccess() error.
  */
 static int
 mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct mqfs_node *node;
 	int error;
 
 	node = fp->f_data;
 	sx_xlock(&mqfs_data.mi_lock);
 	error = vaccess(VREG, node->mn_mode, node->mn_uid, node->mn_gid,
 	    VADMIN, active_cred);
 	if (error == 0)
 		node->mn_mode = mode & ACCESSPERMS;
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (error);
 }
 
 /*
  * fo_chown method for message queue files.
  *
  * A uid or gid of -1 means "leave unchanged".  The change is allowed
  * without privilege only when the resulting uid is either the current
  * owner or the caller's uid, and the resulting gid is either the current
  * group or one the caller is a member of; otherwise PRIV_VFS_CHOWN is
  * required.  Returns 0 or the priv_check_cred() error.
  */
 static int
 mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct mqfs_node *pn;
 	int error;
 
 	error = 0;
 	pn = fp->f_data;
 	sx_xlock(&mqfs_data.mi_lock);
 	/* Substitute the current values for "unchanged" requests. */
 	if (uid == (uid_t)-1)
 		uid = pn->mn_uid;
 	if (gid == (gid_t)-1)
 		gid = pn->mn_gid;
 	/* The privilege check only runs when the unprivileged test fails. */
 	if (((uid != pn->mn_uid && uid != active_cred->cr_uid) ||
 	    (gid != pn->mn_gid && !groupmember(gid, active_cred))) &&
 	    (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN)))
 		goto out;
 	pn->mn_uid = uid;
 	pn->mn_gid = gid;
 out:
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (error);
 }
 
 /*
  * fo_kqfilter method for message queue files.
  *
  * Attaches kn to the queue's read or write knote list and installs the
  * matching filter operations.  Only EVFILT_READ and EVFILT_WRITE are
  * supported; any other filter yields EINVAL.
  */
 static int
 mqf_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct mqueue *mq = FPTOMQ(fp);
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		kn->kn_fop = &mq_rfiltops;
 		knlist_add(&mq->mq_rsel.si_note, kn, 0);
 		return (0);
 	case EVFILT_WRITE:
 		kn->kn_fop = &mq_wfiltops;
 		knlist_add(&mq->mq_wsel.si_note, kn, 0);
 		return (0);
 	default:
 		return (EINVAL);
 	}
 }
 
 /*
  * Detach routine for the message queue kqueue filters: removes kn from
  * the read or write knote list it was attached to.  Panics on a filter
  * type other than EVFILT_READ or EVFILT_WRITE.
  */
 static void
 filt_mqdetach(struct knote *kn)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		knlist_remove(&mq->mq_rsel.si_note, kn, 0);
 		break;
 	case EVFILT_WRITE:
 		knlist_remove(&mq->mq_wsel.si_note, kn, 0);
 		break;
 	default:
 		panic("filt_mqdetach");
 	}
 }
 
 /*
  * EVFILT_READ event test: true when the queue holds at least one
  * message.  Must be called with the queue mutex held (asserted); hint is
  * unused.
  */
 static int
 filt_mqread(struct knote *kn, long hint)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	return (mq->mq_curmsgs != 0);
 }
 
 /*
  * EVFILT_WRITE event test: true when the queue holds fewer than
  * mq_maxmsg messages.  Must be called with the queue mutex held
  * (asserted); hint is unused.
  */
 static int
 filt_mqwrite(struct knote *kn, long hint)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	return (mq->mq_curmsgs < mq->mq_maxmsg);
 }
 
 /*
  * fo_fill_kinfo method: marks the kinfo_file record as KF_TYPE_MQUEUE.
  * No other field is filled in here.  Always returns 0.
  */
 static int
 mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	kif->kf_type = KF_TYPE_MQUEUE;
 	return (0);
 }
 
 /*
  * File operations for message queue descriptors; installed on the file by
  * finit() in kern_kmq_open().  read, write, truncate, ioctl and sendfile
  * are routed to the invfo_* handlers; queue I/O goes through the kmq_*
  * system calls instead.
  */
 static struct fileops mqueueops = {
 	.fo_read		= invfo_rdwr,
 	.fo_write		= invfo_rdwr,
 	.fo_truncate		= invfo_truncate,
 	.fo_ioctl		= invfo_ioctl,
 	.fo_poll		= mqf_poll,
 	.fo_kqfilter		= mqf_kqfilter,
 	.fo_stat		= mqf_stat,
 	.fo_close		= mqf_close,
 	.fo_chmod		= mqf_chmod,
 	.fo_chown		= mqf_chown,
 	.fo_sendfile		= invfo_sendfile,
 	.fo_fill_kinfo		= mqf_fill_kinfo,
 	.fo_flags		= DFLAG_PASSABLE,
 };
 
 /*
  * Vnode operations for the mqueuefs filesystem view of the queues.
  * default_vnodeops is named as the vop_default vector; write, mkdir and
  * rmdir are explicitly wired to VOP_EOPNOTSUPP.
  */
 static struct vop_vector mqfs_vnodeops = {
 	.vop_default 		= &default_vnodeops,
 	.vop_access		= mqfs_access,
 	.vop_cachedlookup	= mqfs_lookup,
 	.vop_lookup		= vfs_cache_lookup,
 	.vop_reclaim		= mqfs_reclaim,
 	.vop_create		= mqfs_create,
 	.vop_remove		= mqfs_remove,
 	.vop_inactive		= mqfs_inactive,
 	.vop_open		= mqfs_open,
 	.vop_close		= mqfs_close,
 	.vop_getattr		= mqfs_getattr,
 	.vop_setattr		= mqfs_setattr,
 	.vop_read		= mqfs_read,
 	.vop_write		= VOP_EOPNOTSUPP,
 	.vop_readdir		= mqfs_readdir,
 	.vop_mkdir		= VOP_EOPNOTSUPP,
 	.vop_rmdir		= VOP_EOPNOTSUPP
 };
 VFS_VOP_VECTOR_REGISTER(mqfs_vnodeops);
 
 /*
  * Filesystem-level operations for mqueuefs; referenced from
  * mqueuefs_vfsconf.
  */
 static struct vfsops mqfs_vfsops = {
 	.vfs_init 		= mqfs_init,
 	.vfs_uninit		= mqfs_uninit,
 	.vfs_mount		= mqfs_mount,
 	.vfs_unmount		= mqfs_unmount,
 	.vfs_root		= mqfs_root,
 	.vfs_statfs		= mqfs_statfs,
 };
 
 /*
  * Filesystem configuration record for "mqueuefs", flagged as a synthetic
  * filesystem.  It is the private-data argument of the mqueuefs module
  * declaration, and mq_modload() forwards module events to
  * vfs_modevent().
  */
 static struct vfsconf mqueuefs_vfsconf = {
 	.vfc_version = VFS_VERSION,
 	.vfc_name = "mqueuefs",
 	.vfc_vfsops = &mqfs_vfsops,
 	.vfc_typenum = -1,
 	.vfc_flags = VFCF_SYNTHETIC
 };
 
 /*
  * Native system calls registered by mqinit() and unregistered by
  * mqunload().  kmq_open and kmq_unlink are the only entries registered
  * without SYF_CAPENABLED.
  */
 static struct syscall_helper_data mq_syscalls[] = {
 	SYSCALL_INIT_HELPER(kmq_open),
 	SYSCALL_INIT_HELPER_F(kmq_setattr, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER_F(kmq_timedsend, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER_F(kmq_timedreceive, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER_F(kmq_notify, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER(kmq_unlink),
 	SYSCALL_INIT_LAST
 };
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
 /*
  * Convert a 32-bit compat mq_attr to the native layout.  Copies mq_flags,
  * mq_maxmsg, mq_msgsize and mq_curmsgs; no other part of *to is written.
  */
 static void
 mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to)
 {
 
 	to->mq_flags = from->mq_flags;
 	to->mq_maxmsg = from->mq_maxmsg;
 	to->mq_msgsize = from->mq_msgsize;
 	to->mq_curmsgs = from->mq_curmsgs;
 }
 
 /*
  * Convert a native mq_attr to the 32-bit compat layout.  Copies mq_flags,
  * mq_maxmsg, mq_msgsize and mq_curmsgs; the reserved area of *to is left
  * untouched, so callers copying the result out must clear it themselves
  * (freebsd32_kmq_setattr() does).
  */
 static void
 mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to)
 {
 
 	to->mq_flags = from->mq_flags;
 	to->mq_maxmsg = from->mq_maxmsg;
 	to->mq_msgsize = from->mq_msgsize;
 	to->mq_curmsgs = from->mq_curmsgs;
 }
 
 /*
  * 32-bit compat syscall to open a message queue.
  *
  * Same flag checks as the native entry point.  When O_CREAT is set and
  * attributes were supplied, the 32-bit structure is copied in and
  * converted with mq_attr_from32().
  *
  * The attribute pointer passed to kern_kmq_open() is non-NULL only when
  * the structure was actually copied in and converted, so the callee is
  * never handed a pointer to uninitialised stack memory (previously that
  * happened when the caller supplied attr without O_CREAT).
  *
  * Returns EINVAL for bad flags, the copyin() error, or the result of
  * kern_kmq_open().
  */
 int
 freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap)
 {
 	struct mq_attr attr;
 	struct mq_attr32 attr32;
 	const struct mq_attr *attrp;
 	int flags, error;
 
 	if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC)
 		return (EINVAL);
 	flags = FFLAGS(uap->flags);
 	attrp = NULL;
 	if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
 		error = copyin(uap->attr, &attr32, sizeof(attr32));
 		if (error)
 			return (error);
 		mq_attr_from32(&attr32, &attr);
 		attrp = &attr;
 	}
 	return (kern_kmq_open(td, uap->path, flags, uap->mode, attrp));
 }
 
 int
 freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap)
 {
 	struct mq_attr attr, oattr;
 	struct mq_attr32 attr32, oattr32;
 	int error;
 
 	if (uap->attr != NULL) {
 		error = copyin(uap->attr, &attr32, sizeof(attr32));
 		if (error != 0)
 			return (error);
 		mq_attr_from32(&attr32, &attr);
 	}
 	error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
 	    &oattr);
 	if (error == 0 && uap->oattr != NULL) {
 		mq_attr_to32(&oattr, &oattr32);
 		bzero(oattr32.__reserved, sizeof(oattr32.__reserved));
 		error = copyout(&oattr32, uap->oattr, sizeof(oattr32));
 	}
 	return (error);
 }
 
 int
 freebsd32_kmq_timedsend(struct thread *td,
     struct freebsd32_kmq_timedsend_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec32 ets32;
 	struct timespec *abs_timeout, ets;
 	int error;
 	int waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
 		if (error != 0)
 			goto out;
 		CP(ets32, ets, tv_sec);
 		CP(ets32, ets, tv_nsec);
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 freebsd32_kmq_timedreceive(struct thread *td,
     struct freebsd32_kmq_timedreceive_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec32 ets32;
 	struct timespec *abs_timeout, ets;
 	int error, waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
 		if (error != 0)
 			goto out;
 		CP(ets32, ets, tv_sec);
 		CP(ets32, ets, tv_nsec);
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * 32-bit compat syscall to register or remove a queue notification.
  *
  * A NULL uap->sigev is passed through as NULL (removal); otherwise the
  * sigevent32 is copied in, converted with convert_sigevent32() and
  * handed to kern_kmq_notify().
  */
 int
 freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap)
 {
 	struct sigevent ev;
 	struct sigevent32 ev32;
 	int error;
 
 	if (uap->sigev == NULL)
 		return (kern_kmq_notify(td, uap->mqd, NULL));
 
 	error = copyin(uap->sigev, &ev32, sizeof(ev32));
 	if (error == 0)
 		error = convert_sigevent32(&ev32, &ev);
 	if (error != 0)
 		return (error);
 	return (kern_kmq_notify(td, uap->mqd, &ev));
 }
 
 /*
  * 32-bit compat system calls registered by mqinit() and unregistered by
  * mqunload().  kmq_unlink has no freebsd32_ wrapper in this file and is
  * registered through SYSCALL32_INIT_HELPER_COMPAT using the native
  * name.
  */
 static struct syscall_helper_data mq32_syscalls[] = {
 	SYSCALL32_INIT_HELPER(freebsd32_kmq_open),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_setattr, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedsend, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedreceive, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_notify, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink),
 	SYSCALL_INIT_LAST
 };
 #endif
 
 /*
  * Register the message queue system calls: the native table first and,
  * with COMPAT_FREEBSD32, the 32-bit table if the native registration
  * succeeded.  Returns 0 or the first registration error; nothing is
  * unregistered here on failure.
  */
 static int
 mqinit(void)
 {
 	int error;
 
 	error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD);
 #ifdef COMPAT_FREEBSD32
 	if (error == 0)
 		error = syscall32_helper_register(mq32_syscalls,
 		    SY_THR_STATIC_KLD);
 #endif
 	return (error);
 }
 
 /*
  * Unregister the message queue system calls, 32-bit table first (when
  * built with COMPAT_FREEBSD32) and then the native table.  Always
  * returns 0.  mq_modload() also calls this to clean up after a failed
  * mqinit().
  */
 static int
 mqunload(void)
 {
 
 #ifdef COMPAT_FREEBSD32
 	syscall32_helper_unregister(mq32_syscalls);
 #endif
 	syscall_helper_unregister(mq_syscalls);
 	return (0);
 }
 
 /*
  * Module event handler for mqueuefs.
  *
  * Every event is first forwarded to vfs_modevent(); if that fails its
  * error is returned and nothing else is done.  MOD_LOAD then registers
  * the system calls (unregistering again if registration fails) and
  * MOD_UNLOAD unregisters them.  Other events need no further work.
  */
 static int
 mq_modload(struct module *module, int cmd, void *arg)
 {
 	int error;
 
 	error = vfs_modevent(module, cmd, arg);
 	if (error != 0)
 		return (error);
 
 	if (cmd == MOD_LOAD) {
 		error = mqinit();
 		if (error != 0)
 			mqunload();
 	} else if (cmd == MOD_UNLOAD) {
 		error = mqunload();
 	}
 	return (error);
 }
 
 /*
  * Module glue: the "mqueuefs" module uses mq_modload() as its event
  * handler and receives mqueuefs_vfsconf as its private data.
  */
 static moduledata_t mqueuefs_mod = {
 	"mqueuefs",
 	mq_modload,
 	&mqueuefs_vfsconf
 };
 DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE);
 MODULE_VERSION(mqueuefs, 1);
Index: projects/clang1100-import/sys/kern/vfs_cache.c
===================================================================
--- projects/clang1100-import/sys/kern/vfs_cache.c	(revision 364278)
+++ projects/clang1100-import/sys/kern/vfs_cache.c	(revision 364279)
@@ -1,4181 +1,4179 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Poul-Henning Kamp of the FreeBSD Project.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/counter.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/seqc.h>
 #include <sys/sdt.h>
 #include <sys/smr.h>
 #include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
 #include <ck_queue.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <sys/capsicum.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/uma.h>
 
 SDT_PROVIDER_DECLARE(vfs);
 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
     "struct vnode *");
 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
     "char *");
 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
     "char *", "struct vnode *");
 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
     "struct vnode *", "char *");
 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
     "struct vnode *");
 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
     "struct vnode *", "char *");
 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
     "char *");
 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
     "struct vnode *");
 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
     "char *");
 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
     "char *");
 
 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 
 /*
  * This structure describes the elements in the cache of recent
  * names looked up by namei.
  */
 /*
  * State kept for a negative cache entry.  It shares storage with the
  * target vnode pointer in struct namecache's n_un union; the assertion
  * below guarantees that doing so does not enlarge the union.
  */
 struct negstate {
 	u_char neg_flag;	/* NEG_* flags */
 };
 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
     "the state must fit in a union with a pointer without growing it");
 
 /*
  * One name cache entry.  The name itself is stored directly after the
  * structure (nc_name); the CACHE_ZONE_*_SIZE allocation sizes account for
  * it.  The n_un union holds the target vnode for a positive entry or the
  * negative-entry state otherwise; it is accessed through the nc_vp and
  * nc_neg shorthands.
  */
 struct	namecache {
 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
 	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
 	struct	vnode *nc_dvp;		/* vnode of parent of name */
 	union {
 		struct	vnode *nu_vp;	/* vnode the name refers to */
 		struct	negstate nu_neg;/* negative entry state */
 	} n_un;
 	u_char	nc_flag;		/* flag bits */
 	u_char	nc_nlen;		/* length of name */
 	char	nc_name[0];		/* segment name + nul */
 };
 
 /*
  * struct namecache_ts repeats struct namecache layout up to the
  * nc_nlen member.
  * struct namecache_ts is used in place of struct namecache when time(s) need
  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
  * both a non-dotdot directory name plus dotdot for the directory's
  * parent.
  *
  * See below for alignment requirement.
  */
 struct	namecache_ts {
 	struct	timespec nc_time;	/* timespec provided by fs */
 	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
 	int	nc_ticks;		/* ticks value when entry was added */
 	struct namecache nc_nc;		/* embedded entry; last, name trails it */
 };
 
 /*
  * At least mips n32 performs 64-bit accesses to timespec as found
  * in namecache_ts and requires them to be aligned. Since other
  * platforms may have the same requirement, suffer a little bit and
  * enforce the alignment for everyone. Note this is a nop for 64-bit
  * platforms.
  */
 #define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
 #define	CACHE_PATH_CUTOFF	39
 
 #define CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
 #define CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
 #define CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
 #define CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)
 
 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 
 #define	nc_vp		n_un.nu_vp
 #define	nc_neg		n_un.nu_neg
 
 /*
  * Flags in namecache.nc_flag
  */
 #define NCF_WHITE	0x01
 #define NCF_ISDOTDOT	0x02
 #define	NCF_TS		0x04
 #define	NCF_DTS		0x08
 #define	NCF_DVDROP	0x10
 #define	NCF_NEGATIVE	0x20
 #define	NCF_INVALID	0x40
 #define	NCF_WIP		0x80
 
 /*
  * Flags in negstate.neg_flag
  */
 #define NEG_HOT		0x01
 
 /*
  * Mark an entry as invalid.
  *
  * This is called before it starts getting deconstructed.
  *
  * The entry must not already be marked (asserted).  NCF_INVALID is set
  * with an atomic store and followed by a release fence; lockless readers
  * test the flag through cache_ncp_canuse(), which issues an acquire
  * fence before loading it.
  */
 static void
 cache_ncp_invalidate(struct namecache *ncp)
 {
 
 	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 	    ("%s: entry %p already invalid", __func__, ncp));
 	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
 	atomic_thread_fence_rel();
 }
 
 /*
  * Check whether the entry can be safely used.
  *
  * All places which elide locks are supposed to call this after they are
  * done with reading from an entry.
  *
  * Issues an acquire fence and then returns true only if neither
  * NCF_INVALID nor NCF_WIP is set in the entry's flags.
  */
 static bool
 cache_ncp_canuse(struct namecache *ncp)
 {
 
 	atomic_thread_fence_acq();
 	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
 }
 
 /*
  * Name caching works as follows:
  *
  * Names found by directory scans are retained in a cache
  * for future reference.  It is managed LRU, so frequently
  * used names will hang around.  Cache is indexed by hash value
  * obtained from (dvp, name) where dvp refers to the directory
  * containing name.
  *
  * If it is a "negative" entry, (i.e. for a name that is known NOT to
  * exist) the vnode pointer will be NULL.
  *
  * Upon reaching the last segment of a path, if the reference
  * is for DELETE, or NOCACHE is set (rewrite), and the
  * name is located in the cache, it will be dropped.
  *
  * These locks are used (in the order in which they can be taken):
  * NAME		TYPE	ROLE
  * vnodelock	mtx	vnode lists and v_cache_dd field protection
  * bucketlock	rwlock	for access to given set of hash buckets
  * neglist	mtx	negative entry LRU management
  *
  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
  * shrinking the LRU list.
  *
  * It is legal to take multiple vnodelock and bucketlock locks. The locking
  * order is lower address first. Both are recursive.
  *
  * "." lookups are lockless.
  *
  * ".." and vnode -> name lookups require vnodelock.
  *
  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
  *
  * Insertions and removals of entries require involved vnodes and bucketlocks
  * to be write-locked to prevent other threads from seeing the entry.
  *
  * Some lookups result in removal of the found entry (e.g. getting rid of a
  * negative entry with the intent to create a positive one), which poses a
  * problem when multiple threads reach the state. Similarly, two different
  * threads can purge two different vnodes and try to remove the same name.
  *
  * If the already held vnode lock is lower than the second required lock, we
  * can just take the other lock. However, in the opposite case, this could
  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
  * the first node, locking everything in order and revalidating the state.
  */
 
 VFS_SMR_DECLARE;
 
 /*
  * Structures associated with name caching.
  *
  * NCHHASH() maps a hash value to the head of its chain; nchash is applied as
  * a bit mask, i.e. it holds the highest valid index rather than the number of
  * buckets.
  */
 #define NCHHASH(hash) \
 	(&nchashtbl[(hash) & nchash])
 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 static u_long __read_mostly	nchash;			/* hash table mask (max index, not count) */
 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
     "Size of namecache hash table");
 static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
     "Ratio of negative namecache entries");
 static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
 static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
 u_int ncsizefactor = 2;				/* tunable via vfs.ncsizefactor */
 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
     "Size factor for namecache");
 static u_int __read_mostly	ncpurgeminvnodes;
 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
     "Number of vnodes below which purgevfs ignores the request");
 static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
 
 struct nchstats	nchstats;		/* cache effectiveness statistics */
 
 /* Taken (via trylock) by cache_negative_zap_one() so that only one thread shrinks. */
 static struct mtx __exclusive_cache_line	ncneg_shrink_lock;
 
 /*
  * A list of negative entries linked through nc_dst, along with the mutex
  * protecting it.
  */
 struct neglist {
 	struct mtx		nl_lock;
 	TAILQ_HEAD(, namecache) nl_list;
 } __aligned(CACHE_LINE_SIZE);
 
 static struct neglist __read_mostly	*neglists;	/* cold lists, indexed by NCP2NEGLIST() */
 static struct neglist ncneg_hot;			/* the hot list */
 static u_long numhotneg;				/* number of entries on the hot list */
 
 #define ncneghash	3
 #define	numneglists	(ncneghash + 1)
 static inline struct neglist *
 NCP2NEGLIST(struct namecache *ncp)
 {
 
 	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 }
 
 /*
  * Return the negative-entry state stored inside the entry.  Only valid for
  * entries with NCF_NEGATIVE set (asserted), as the state shares storage with
  * nc_vp.
  */
 static inline struct negstate *
 NCP2NEGSTATE(struct namecache *ncp)
 {
 
 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
 	return (&ncp->nc_neg);
 }
 
 /*
  * Bucket locks.  ncbuckethash is used as a mask, thus there are
  * ncbuckethash + 1 locks.  HASH2BUCKETLOCK() maps a hash value to the lock
  * covering its chain.
  */
 #define	numbucketlocks (ncbuckethash + 1)
 static u_int __read_mostly  ncbuckethash;
 static struct rwlock_padalign __read_mostly  *bucketlocks;
 #define	HASH2BUCKETLOCK(hash) \
 	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
 
 #define	numvnodelocks (ncvnodehash + 1)
 static u_int __read_mostly  ncvnodehash;
 static struct mtx __read_mostly *vnodelocks;
 static inline struct mtx *
 VP2VNODELOCK(struct vnode *vp)
 {
 
 	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 }
 
 /*
  * UMA zones for the VFS cache.
  *
  * The small cache is used for entries with short names, which are the
  * most common.  The large cache is used for entries which are too big to
  * fit in the small cache.
  *
  * Each size comes in two flavors: the plain zones hold struct namecache and
  * the _ts zones hold struct namecache_ts.  See cache_alloc() and cache_free()
  * for the selection logic.
  */
 static uma_zone_t __read_mostly cache_zone_small;
 static uma_zone_t __read_mostly cache_zone_small_ts;
 static uma_zone_t __read_mostly cache_zone_large;
 static uma_zone_t __read_mostly cache_zone_large_ts;
 
 static struct namecache *
 cache_alloc(int len, int ts)
 {
 	struct namecache_ts *ncp_ts;
 	struct namecache *ncp;
 
 	if (__predict_false(ts)) {
 		if (len <= CACHE_PATH_CUTOFF)
 			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 		else
 			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 		ncp = &ncp_ts->nc_nc;
 	} else {
 		if (len <= CACHE_PATH_CUTOFF)
 			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 		else
 			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 	}
 	return (ncp);
 }
 
 static void
 cache_free(struct namecache *ncp)
 {
 	struct namecache_ts *ncp_ts;
 
 	if (ncp == NULL)
 		return;
 	if ((ncp->nc_flag & NCF_DVDROP) != 0)
 		vdrop(ncp->nc_dvp);
 	if (__predict_false(ncp->nc_flag & NCF_TS)) {
 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 		else
 			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 	} else {
 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 			uma_zfree_smr(cache_zone_small, ncp);
 		else
 			uma_zfree_smr(cache_zone_large, ncp);
 	}
 }
 
 static void
 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 {
 	struct namecache_ts *ncp_ts;
 
 	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 	    (tsp == NULL && ticksp == NULL),
 	    ("No NCF_TS"));
 
 	if (tsp == NULL && ticksp == NULL)
 		return;
 
 	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 	if (tsp != NULL)
 		*tsp = ncp_ts->nc_time;
 	if (ticksp != NULL)
 		*ticksp = ncp_ts->nc_ticks;
 }
 
 #ifdef DEBUG_CACHE
 /*
  * Debugging knob (debug.vfscache): when cleared cache_lookup() reports a miss
  * without consulting the cache.
  */
 static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
     "VFS namecache enabled");
 #endif
 
 /* Export size information to userland */
 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
     sizeof(struct namecache), "sizeof(struct namecache)");
 
 /*
  * The new name cache statistics
  *
  * Everything below is exported read-only under vfs.cache.  STATNODE_ULONG()
  * exports an already existing variable, while STATNODE_COUNTER() both defines
  * a counter(9) counter and exports it.
  */
 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Name cache statistics");
 #define STATNODE_ULONG(name, descr)					\
 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
 #define STATNODE_COUNTER(name, descr)					\
 	static COUNTER_U64_DEFINE_EARLY(name);				\
 	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
 	    descr);
 STATNODE_ULONG(numneg, "Number of negative cache entries");
 STATNODE_ULONG(numcache, "Number of cache entries");
 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
 STATNODE_COUNTER(dothits, "Number of '.' hits");
 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
 STATNODE_COUNTER(nummiss, "Number of cache misses");
 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
 STATNODE_COUNTER(numposzaps,
     "Number of cache hits (positive) we do not want to cache");
 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
 STATNODE_COUNTER(numnegzaps,
     "Number of cache hits (negative) we do not want to cache");
 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
 /* These count for vn_getcwd(), too. */
 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
 STATNODE_COUNTER(numfullpathfail2,
     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
     "Number of successful removals after relocking");
 /*
  * NOTE(review): the three variables below are declared as (signed) long yet
  * exported with SYSCTL_ULONG, and they are incremented without atomics.
  * zap_and_exit_bucket_fail and zap_and_exit_bucket_fail2 also share the very
  * same description, which makes them indistinguishable in sysctl -d output.
  */
 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
     "Number of times zap_and_exit failed to lock");
 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
     "Number of times zap_and_exit failed to lock");
 static long cache_lock_vnodes_cel_3_failures;
 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
     "Number of times 3-way vnode locking failed");
 STATNODE_ULONG(numhotneg, "Number of hot negative entries");
 STATNODE_COUNTER(numneg_evicted,
     "Number of negative entries evicted when adding a new entry");
 STATNODE_COUNTER(shrinking_skipped,
     "Number of times shrinking was already in progress");
 static void cache_zap_locked(struct namecache *ncp);
 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
     char **freebuf, size_t *buflen);
 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, size_t *buflen);
 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend);
 
 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 
 static int cache_yield;
 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
     "Number of times cache called yield");
 
 /*
  * Give up the CPU if should_yield() says so, counting each such occurrence
  * in cache_yield.
  */
 static void __noinline
 cache_maybe_yield(void)
 {
 
 	if (!should_yield())
 		return;
 	cache_yield++;
 	kern_yield(PRI_USER);
 }
 
 /*
  * Assert that the given vnode lock is owned by the current thread.  A NULL
  * pointer is tolerated and means there is nothing to check.
  */
 static inline void
 cache_assert_vlp_locked(struct mtx *vlp)
 {
 
 	if (vlp == NULL)
 		return;
 	mtx_assert(vlp, MA_OWNED);
 }
 
 /*
  * Assert that the vnode lock covering vp is owned by the current thread.
  */
 static inline void
 cache_assert_vnode_locked(struct vnode *vp)
 {
 
 	cache_assert_vlp_locked(VP2VNODELOCK(vp));
 }
 
 /*
  * TODO: With the value stored we can do better than computing the hash based
  * on the address and the choice of FNV should also be revisited.
  */
 /*
  * Precompute the per-vnode part of the namecache hash and store it in
  * vp->v_nchash.  The value is the FNV-1 hash of the vnode's address (the
  * pointer itself, not the data it points to); cache_get_hash() uses it as the
  * starting value when hashing a name.
  */
 static void
 cache_prehash(struct vnode *vp)
 {
 
 	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 }
 
 /*
  * Compute the hash of a (directory vnode, name) pair: the len bytes of the
  * name are hashed with FNV, seeded with the value precomputed for dvp by
  * cache_prehash().
  */
 static uint32_t
 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 {
 
 	return (fnv_32_buf(name, len, dvp->v_nchash));
 }
 
 /*
  * Return the hash chain an entry belongs to, as determined by its name and
  * directory vnode.
  */
 static inline struct nchashhead *
 NCP2BUCKET(struct namecache *ncp)
 {
 
 	return (NCHHASH(cache_get_hash(ncp->nc_name, ncp->nc_nlen,
 	    ncp->nc_dvp)));
 }
 
 /*
  * Return the bucket lock covering the hash chain an entry belongs to, as
  * determined by its name and directory vnode.
  */
 static inline struct rwlock *
 NCP2BUCKETLOCK(struct namecache *ncp)
 {
 
 	return (HASH2BUCKETLOCK(cache_get_hash(ncp->nc_name, ncp->nc_nlen,
 	    ncp->nc_dvp)));
 }
 
 #ifdef INVARIANTS
 /*
  * Assert that the bucket lock of the entry is held in the given mode (one of
  * the RA_* values accepted by rw_assert()).  Compiles to nothing without
  * INVARIANTS.
  */
 static void
 cache_assert_bucket_locked(struct namecache *ncp, int mode)
 {
 
 	rw_assert(NCP2BUCKETLOCK(ncp), mode);
 }
 #else
 #define cache_assert_bucket_locked(x, y) do { } while (0)
 #endif
 
 #define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
 /*
  * Order two pointers in place so that *p1 <= *p2 on return.  This is how the
  * "lower address first" lock order is established.  At least one of the
  * pointers has to be non-NULL (asserted); a NULL pointer sorts first.
  */
 static void
 _cache_sort_vnodes(void **p1, void **p2)
 {
 	void *first, *second;
 
 	first = *p1;
 	second = *p2;
 	MPASS(first != NULL || second != NULL);
 
 	if (first <= second)
 		return;
 	*p1 = second;
 	*p2 = first;
 }
 
 static void
 cache_lock_all_buckets(void)
 {
 	u_int i;
 
 	for (i = 0; i < numbucketlocks; i++)
 		rw_wlock(&bucketlocks[i]);
 }
 
 static void
 cache_unlock_all_buckets(void)
 {
 	u_int i;
 
 	for (i = 0; i < numbucketlocks; i++)
 		rw_wunlock(&bucketlocks[i]);
 }
 
 static void
 cache_lock_all_vnodes(void)
 {
 	u_int i;
 
 	for (i = 0; i < numvnodelocks; i++)
 		mtx_lock(&vnodelocks[i]);
 }
 
 static void
 cache_unlock_all_vnodes(void)
 {
 	u_int i;
 
 	for (i = 0; i < numvnodelocks; i++)
 		mtx_unlock(&vnodelocks[i]);
 }
 
 /*
  * Try to acquire up to two vnode locks without sleeping.
  *
  * The locks are sorted first, so at most the lower one may be NULL, in which
  * case it is skipped.  Returns 0 with all the non-NULL locks held, or EAGAIN
  * with none of them held.
  */
 static int
 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 {
 
 	cache_sort_vnodes(&vlp1, &vlp2);
 
 	if (vlp1 != NULL && !mtx_trylock(vlp1))
 		return (EAGAIN);
 	if (mtx_trylock(vlp2))
 		return (0);
 
 	/* Second lock is contended, back out of the first one. */
 	if (vlp1 != NULL)
 		mtx_unlock(vlp1);
 	return (EAGAIN);
 }
 
 /*
  * Acquire up to two vnode locks.
  *
  * The caller has to pass the locks already sorted (vlp1 <= vlp2, asserted),
  * e.g. by means of cache_sort_vnodes().  Either pointer may be NULL, but not
  * both; NULL pointers are skipped.
  */
 static void
 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 {
 
 	MPASS(vlp1 != NULL || vlp2 != NULL);
 	MPASS(vlp1 <= vlp2);
 
 	if (vlp1 != NULL)
 		mtx_lock(vlp1);
 	if (vlp2 != NULL)
 		mtx_lock(vlp2);
 }
 
 /*
  * Release up to two vnode locks.  Either pointer may be NULL, but not both
  * (asserted); NULL pointers are skipped.  Unlike cache_lock_vnodes() no
  * particular order of the arguments is required.
  */
 static void
 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 {
 
 	MPASS(vlp1 != NULL || vlp2 != NULL);
 
 	if (vlp1 != NULL)
 		mtx_unlock(vlp1);
 	if (vlp2 != NULL)
 		mtx_unlock(vlp2);
 }
 
 /*
  * Handler for vfs.cache.nchstats: exports a struct nchstats.
  *
  * The legacy nchstats structure serves as a template; the hit and miss
  * fields are overwritten with values derived from the per-CPU counters.  A
  * request without an output buffer only reports the required size.
  */
 static int
 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 {
 	struct nchstats snap;
 
 	if (req->oldptr != NULL) {
 		snap = nchstats;
 		snap.ncs_goodhits = counter_u64_fetch(numposhits);
 		snap.ncs_neghits = counter_u64_fetch(numneghits);
 		/* "Bad" hits are the ones we zapped instead of using. */
 		snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 		    counter_u64_fetch(numnegzaps);
 		snap.ncs_miss = counter_u64_fetch(nummisszap) +
 		    counter_u64_fetch(nummiss);
 		return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 	}
 
 	return (SYSCTL_OUT(req, 0, sizeof(snap)));
 }
 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
     "VFS cache effectiveness statistics");
 
 #ifdef DIAGNOSTIC
 /*
  * Grab an atomic snapshot of the name cache hash chain lengths
  */
 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
     "hash table stats");
 
 /*
  * Handler for debug.hashstat.rawnchash: exports the length of every hash
  * chain as an array of ints, one element per bucket.
  *
  * A request without an output buffer only reports the required size.
  * Returns 0 or the first error reported by SYSCTL_OUT().
  */
 static int
 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 {
 	struct nchashhead *ncpp;
 	struct namecache *ncp;
 	int i, error, n_nchash, *cntbuf;
 
 retry:
 	n_nchash = nchash + 1;	/* nchash is max index, not count */
 	if (req->oldptr == NULL)
 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 	/* Allocate before taking the locks as M_WAITOK may sleep. */
 	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 	cache_lock_all_buckets();
 	/* The table may have been resized while we were allocating. */
 	if (n_nchash != nchash + 1) {
 		cache_unlock_all_buckets();
 		free(cntbuf, M_TEMP);
 		goto retry;
 	}
 	/* Scan hash tables counting entries */
 	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 			cntbuf[i]++;
 	cache_unlock_all_buckets();
 	/* Copy the snapshot out with no locks held. */
 	for (error = 0, i = 0; i < n_nchash; i++)
 		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 			break;
 	free(cntbuf, M_TEMP);
 	return (error);
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
     "nchash chain lengths");
 
 /*
  * Handler for debug.hashstat.nchash: exports four ints describing the hash
  * table, in this order: total number of buckets, number of non-empty
  * buckets, length of the longest chain and bucket usage expressed in
  * hundredths of a percent (i.e. 10000 means all buckets are in use).
  *
  * A request without an output buffer only reports the required size.
  * Returns 0 or the first error reported by SYSCTL_OUT().
  */
 static int
 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct nchashhead *ncpp;
 	struct namecache *ncp;
 	int n_nchash;
 	int count, maxlength, used, pct;
 
 	if (!req->oldptr)
 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 
 	cache_lock_all_buckets();
 	n_nchash = nchash + 1;	/* nchash is max index, not count */
 	used = 0;
 	maxlength = 0;
 
 	/* Scan hash tables for applicable entries */
 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 		count = 0;
 		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 			count++;
 		}
 		if (count)
 			used++;
 		if (maxlength < count)
 			maxlength = count;
 	}
 	/* The loop above consumed n_nchash, recompute it under the locks. */
 	n_nchash = nchash + 1;
 	cache_unlock_all_buckets();
 	/*
 	 * Scale before dividing and do it in 64 bits.  The previous form,
 	 * (used * 100) / (n_nchash / 100), divided by zero for tables with
 	 * fewer than 100 buckets, overflowed int for very large tables and
 	 * could report more than 100% due to the truncated divisor.
 	 */
 	pct = 0;
 	if (n_nchash > 0)
 		pct = (int)(((int64_t)used * 10000) / n_nchash);
 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &used, sizeof(used));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &pct, sizeof(pct));
 	if (error)
 		return (error);
 	return (0);
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 #endif
 
 /*
  * Negative entries management
  *
  * A variation of LRU scheme is used. New entries are hashed into one of
  * numneglists cold lists. Entries get promoted to the hot list on first hit.
  *
  * The shrinker will demote hot list head and evict from the cold list in a
  * round-robin manner.
  */
 static void
 cache_negative_init(struct namecache *ncp)
 {
 	struct negstate *negstate;
 
 	ncp->nc_flag |= NCF_NEGATIVE;
 	negstate = NCP2NEGSTATE(ncp);
 	negstate->neg_flag = 0;
 }
 
 /*
  * Note a hit on a negative entry: a cold entry is promoted to the tail of the
  * hot list, while an entry which is already hot is left alone.
  */
 static void
 cache_negative_hit(struct namecache *ncp)
 {
 	struct neglist *neglist;
 	struct negstate *negstate;
 
 	negstate = NCP2NEGSTATE(ncp);
 	/* Unlocked check to avoid taking the locks in the common case. */
 	if ((negstate->neg_flag & NEG_HOT) != 0)
 		return;
 	neglist = NCP2NEGLIST(ncp);
 	/* Lock order: the hot list first, then the cold list. */
 	mtx_lock(&ncneg_hot.nl_lock);
 	mtx_lock(&neglist->nl_lock);
 	/* Recheck with the locks held, somebody else may have promoted it. */
 	if ((negstate->neg_flag & NEG_HOT) == 0) {
 		numhotneg++;
 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
 		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
 		negstate->neg_flag |= NEG_HOT;
 	}
 	mtx_unlock(&neglist->nl_lock);
 	mtx_unlock(&ncneg_hot.nl_lock);
 }
 
 /*
  * Link a new negative entry at the tail of its cold list and account for it
  * in numneg.  The bucket lock of the entry has to be write-locked (asserted).
  */
 static void
 cache_negative_insert(struct namecache *ncp)
 {
 	struct neglist *neglist;
 
 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 	neglist = NCP2NEGLIST(ncp);
 	mtx_lock(&neglist->nl_lock);
 	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
 	mtx_unlock(&neglist->nl_lock);
 	atomic_add_rel_long(&numneg, 1);
 }
 
 /*
  * Unlink a negative entry from whichever list (hot or cold) it resides on
  * and account for it in numneg (and numhotneg if it was hot).  The bucket
  * lock of the entry has to be write-locked (asserted).
  *
  * Since NEG_HOT can change until the matching list lock is held, the flag is
  * sampled without locks first and then re-evaluated with locks held, picking
  * up the missing lock if the state turned out to be different.
  */
 static void
 cache_negative_remove(struct namecache *ncp)
 {
 	struct neglist *neglist;
 	struct negstate *negstate;
 	bool hot_locked = false;
 	bool list_locked = false;
 
 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 	neglist = NCP2NEGLIST(ncp);
 	negstate = NCP2NEGSTATE(ncp);
 	if ((negstate->neg_flag & NEG_HOT) != 0) {
 		hot_locked = true;
 		mtx_lock(&ncneg_hot.nl_lock);
 		/* Demoted in the meantime, the cold list lock is needed too. */
 		if ((negstate->neg_flag & NEG_HOT) == 0) {
 			list_locked = true;
 			mtx_lock(&neglist->nl_lock);
 		}
 	} else {
 		list_locked = true;
 		mtx_lock(&neglist->nl_lock);
 		/*
 		 * We may be racing against promotion in lockless lookup.
 		 */
 		if ((negstate->neg_flag & NEG_HOT) != 0) {
 			/* Drop and reacquire to honor the hot -> cold lock order. */
 			mtx_unlock(&neglist->nl_lock);
 			hot_locked = true;
 			mtx_lock(&ncneg_hot.nl_lock);
 			mtx_lock(&neglist->nl_lock);
 		}
 	}
 	if ((negstate->neg_flag & NEG_HOT) != 0) {
 		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
 		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
 		numhotneg--;
 	} else {
 		mtx_assert(&neglist->nl_lock, MA_OWNED);
 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
 	}
 	if (list_locked)
 		mtx_unlock(&neglist->nl_lock);
 	if (hot_locked)
 		mtx_unlock(&ncneg_hot.nl_lock);
 	atomic_subtract_rel_long(&numneg, 1);
 }
 
 /*
  * Pick an eviction candidate among the cold negative lists.
  *
  * The lists are scanned starting at a position which advances by one with
  * every call (round-robin) and the first entry of the first non-empty list is
  * selected.
  *
  * On return *ncpp is the candidate or NULL if all the lists were empty, and
  * *neglistpp is the list examined last.  If a candidate is returned the lock
  * of that list is held and it is up to the caller to release it; otherwise no
  * list lock is held.
  *
  * The only caller visible here invokes this with ncneg_shrink_lock held,
  * which presumably is what protects the static cycle counter.
  */
 static void
 cache_negative_shrink_select(struct namecache **ncpp,
     struct neglist **neglistpp)
 {
 	struct neglist *neglist;
 	struct namecache *ncp;
 	static u_int cycle;
 	u_int i;
 
 	*ncpp = ncp = NULL;
 
 	for (i = 0; i < numneglists; i++) {
 		neglist = &neglists[(cycle + i) % numneglists];
 		/* Unlocked peek so that empty lists are skipped cheaply. */
 		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
 			continue;
 		mtx_lock(&neglist->nl_lock);
 		ncp = TAILQ_FIRST(&neglist->nl_list);
 		if (ncp != NULL)
 			break;
 		mtx_unlock(&neglist->nl_lock);
 	}
 
 	*neglistpp = neglist;
 	*ncpp = ncp;
 	cycle++;
 }
 
 /*
  * Evict one negative entry.
  *
  * Only one thread shrinks at a time; if ncneg_shrink_lock is owned or cannot
  * be acquired right away the call is counted in shrinking_skipped and
  * nothing else happens.  Otherwise:
  * 1. the head of the hot list, if any, is demoted to the tail of its cold
  *    list,
  * 2. a candidate is picked with cache_negative_shrink_select(),
  * 3. the list lock is traded for the vnode and bucket locks needed to zap
  *    the entry and the candidate is revalidated, as it may have been removed
  *    or replaced while no lock was held.
  */
 static void
 cache_negative_zap_one(void)
 {
 	struct namecache *ncp, *ncp2;
 	struct neglist *neglist;
 	struct negstate *negstate;
 	struct mtx *dvlp;
 	struct rwlock *blp;
 
 	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
 	    !mtx_trylock(&ncneg_shrink_lock)) {
 		counter_u64_add(shrinking_skipped, 1);
 		return;
 	}
 
 	/* Demote the head of the hot list. */
 	mtx_lock(&ncneg_hot.nl_lock);
 	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
 	if (ncp != NULL) {
 		neglist = NCP2NEGLIST(ncp);
 		negstate = NCP2NEGSTATE(ncp);
 		mtx_lock(&neglist->nl_lock);
 		MPASS((negstate->neg_flag & NEG_HOT) != 0);
 		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
 		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
 		negstate->neg_flag &= ~NEG_HOT;
 		numhotneg--;
 		mtx_unlock(&neglist->nl_lock);
 	}
 	mtx_unlock(&ncneg_hot.nl_lock);
 
 	/* On success the lock of the returned list is held. */
 	cache_negative_shrink_select(&ncp, &neglist);
 
 	mtx_unlock(&ncneg_shrink_lock);
 	if (ncp == NULL)
 		return;
 
 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
 	/*
 	 * Compute the locks needed for zapping while the list lock is still
 	 * held, then drop it as the vnode and bucket locks come before the
 	 * neglist lock in the lock order.
 	 */
 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
 	blp = NCP2BUCKETLOCK(ncp);
 	mtx_unlock(&neglist->nl_lock);
 	mtx_lock(dvlp);
 	rw_wlock(blp);
 	/*
 	 * Enter SMR to safely check the negative list.
 	 * Even if the found pointer matches, the entry may now be reallocated
 	 * and used by a different vnode.
 	 */
 	vfs_smr_enter();
 	ncp2 = TAILQ_FIRST(&neglist->nl_list);
 	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
 	    blp != NCP2BUCKETLOCK(ncp2)) {
 		vfs_smr_exit();
 		/* Lost the race; nothing gets evicted or freed this time. */
 		ncp = NULL;
 	} else {
 		vfs_smr_exit();
 		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
 		    ncp->nc_name);
 		cache_zap_locked(ncp);
 		counter_u64_add(numneg_evicted, 1);
 	}
 	rw_wunlock(blp);
 	mtx_unlock(dvlp);
 	/* cache_free() accepts NULL. */
 	cache_free(ncp);
 }
 
 /*
  * cache_zap_locked():
  *
  *   Removes a namecache entry from cache, whether it contains an actual
  *   pointer to a vnode or if it is just a negative cache entry.
  *
  *   The caller has to hold the vnode lock of nc_dvp, the vnode lock of nc_vp
  *   for positive entries, and the bucket lock write-locked (all asserted).
  *
  *   The entry is only unlinked and marked invalid here; freeing it with
  *   cache_free() is left to the caller, to be done after the locks are
  *   dropped.
  */
 static void
 cache_zap_locked(struct namecache *ncp)
 {
 	struct nchashhead *ncpp;
 
 	if (!(ncp->nc_flag & NCF_NEGATIVE))
 		cache_assert_vnode_locked(ncp->nc_vp);
 	cache_assert_vnode_locked(ncp->nc_dvp);
 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 
 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
 	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
 
 	/* Mark the entry invalid before it is taken apart. */
 	cache_ncp_invalidate(ncp);
 
 	ncpp = NCP2BUCKET(ncp);
 	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
 	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
 		    ncp->nc_name, ncp->nc_vp);
 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
 		/* Clear the target's v_cache_dd if it points at this entry. */
 		if (ncp == ncp->nc_vp->v_cache_dd) {
 			vn_seqc_write_begin_unheld(ncp->nc_vp);
 			ncp->nc_vp->v_cache_dd = NULL;
 			vn_seqc_write_end(ncp->nc_vp);
 		}
 	} else {
 		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
 		    ncp->nc_name);
 		cache_negative_remove(ncp);
 	}
 	if (ncp->nc_flag & NCF_ISDOTDOT) {
 		/* ".." entries are not linked on v_cache_src. */
 		if (ncp == ncp->nc_dvp->v_cache_dd) {
 			vn_seqc_write_begin_unheld(ncp->nc_dvp);
 			ncp->nc_dvp->v_cache_dd = NULL;
 			vn_seqc_write_end(ncp->nc_dvp);
 		}
 	} else {
 		LIST_REMOVE(ncp, nc_src);
 		/*
 		 * The directory has no entries left; have cache_free() vdrop()
 		 * it.
 		 */
 		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
 			ncp->nc_flag |= NCF_DVDROP;
 			counter_u64_add(numcachehv, -1);
 		}
 	}
 	atomic_subtract_rel_long(&numcache, 1);
 }
 
 /*
  * Zap a negative entry of the directory vp.
  *
  * The vnode lock of vp is held on entry (asserted) and is still held on
  * return.  Negative entries have no target vnode, so only the bucket lock
  * has to be taken on top of it.  Freeing the entry is left to the caller.
  */
 static void
 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
 {
 	struct rwlock *blp;
 
 	MPASS(ncp->nc_dvp == vp);
 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
 	cache_assert_vnode_locked(vp);
 
 	blp = NCP2BUCKETLOCK(ncp);
 	rw_wlock(blp);
 	cache_zap_locked(ncp);
 	rw_wunlock(blp);
 }
 
 /*
  * Zap an entry while keeping the vnode lock of vp held.
  *
  * vp is either the directory or the target of the entry and its vnode lock
  * is held on entry (asserted).  *vlpp is an additional vnode lock the caller
  * may be holding from a previous failed attempt, or NULL.
  *
  * Returns true if the entry got zapped, in which case *vlpp is NULL and any
  * lock it denoted has been released.  Returns false if the second vnode lock
  * could not be acquired without violating the lock order; in this case the
  * locks were retaken in the right order, *vlpp denotes the extra lock which
  * is now held and the caller is expected to re-evaluate the state and call
  * again.
  *
  * In both cases the lock of vp is held on return and freeing the entry is
  * left to the caller.
  */
 static bool
 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
     struct mtx **vlpp)
 {
 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
 	struct rwlock *blp;
 
 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
 	cache_assert_vnode_locked(vp);
 
 	if (ncp->nc_flag & NCF_NEGATIVE) {
 		/* No second vnode is involved, the extra lock is not needed. */
 		if (*vlpp != NULL) {
 			mtx_unlock(*vlpp);
 			*vlpp = NULL;
 		}
 		cache_zap_negative_locked_vnode_kl(ncp, vp);
 		return (true);
 	}
 
 	pvlp = VP2VNODELOCK(vp);
 	blp = NCP2BUCKETLOCK(ncp);
 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
 
 	if (*vlpp == vlp1 || *vlpp == vlp2) {
 		/* The extra lock happens to be one of the locks needed. */
 		to_unlock = *vlpp;
 		*vlpp = NULL;
 	} else {
 		if (*vlpp != NULL) {
 			mtx_unlock(*vlpp);
 			*vlpp = NULL;
 		}
 		cache_sort_vnodes(&vlp1, &vlp2);
 		if (vlp1 == pvlp) {
 			/* We hold the lower lock, taking the higher one is safe. */
 			mtx_lock(vlp2);
 			to_unlock = vlp2;
 		} else {
 			/* We hold the higher lock, the lower can only be tried. */
 			if (!mtx_trylock(vlp1))
 				goto out_relock;
 			to_unlock = vlp1;
 		}
 	}
 	rw_wlock(blp);
 	cache_zap_locked(ncp);
 	rw_wunlock(blp);
 	if (to_unlock != NULL)
 		mtx_unlock(to_unlock);
 	return (true);
 
 out_relock:
 	/* Retake both locks in order and let the caller retry. */
 	mtx_unlock(vlp2);
 	mtx_lock(vlp1);
 	mtx_lock(vlp2);
 	MPASS(*vlpp == NULL);
 	*vlpp = vlp1;
 	return (false);
 }
 
 /*
  * Zap an entry given that the vnode lock of vp is held.
  *
  * vp is either the directory or the target of the entry and its vnode lock
  * is held on entry (asserted).  The lock is always released on return,
  * regardless of the outcome.
  *
  * Returns 0 if the entry got zapped, or EAGAIN if the other vnode lock would
  * have to be taken out of order and trylocking it failed; the entry is left
  * intact in the latter case.  Freeing the entry is left to the caller.
  */
 static int __noinline
 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
 {
 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
 	struct rwlock *blp;
 	int error = 0;
 
 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
 	cache_assert_vnode_locked(vp);
 
 	pvlp = VP2VNODELOCK(vp);
 	if (ncp->nc_flag & NCF_NEGATIVE) {
 		cache_zap_negative_locked_vnode_kl(ncp, vp);
 		goto out;
 	}
 
 	blp = NCP2BUCKETLOCK(ncp);
 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
 	cache_sort_vnodes(&vlp1, &vlp2);
 	if (vlp1 == pvlp) {
 		/* We hold the lower lock, taking the higher one is safe. */
 		mtx_lock(vlp2);
 		to_unlock = vlp2;
 	} else {
 		/* We hold the higher lock, the lower can only be tried. */
 		if (!mtx_trylock(vlp1)) {
 			error = EAGAIN;
 			goto out;
 		}
 		to_unlock = vlp1;
 	}
 	rw_wlock(blp);
 	cache_zap_locked(ncp);
 	rw_wunlock(blp);
 	mtx_unlock(to_unlock);
 out:
 	mtx_unlock(pvlp);
 	return (error);
 }
 
 /*
  * If trylocking failed we can get here. We know enough to take all needed locks
  * in the right order and re-lookup the entry.
  *
  * Called with no locks held.  dvlp and vlp are the vnode locks computed by
  * the caller (vlp may be NULL), hash and blp identify the chain and its lock.
  * ncp is only compared by address during the re-lookup; it is not
  * dereferenced until it is found on the chain again with matching directory
  * and name.
  *
  * Returns 0 if the entry was found and zapped, EAGAIN if it is gone.  All
  * locks are released on return.
  */
 static int
 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
     struct rwlock *blp)
 {
 	struct namecache *rncp;
 
 	cache_assert_bucket_locked(ncp, RA_UNLOCKED);
 
 	cache_sort_vnodes(&dvlp, &vlp);
 	cache_lock_vnodes(dvlp, vlp);
 	rw_wlock(blp);
 	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
 		if (rncp == ncp && rncp->nc_dvp == dvp &&
 		    rncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
 			break;
 	}
 	/* rncp is NULL if the loop ran off the end of the chain. */
 	if (rncp != NULL) {
 		cache_zap_locked(rncp);
 		rw_wunlock(blp);
 		cache_unlock_vnodes(dvlp, vlp);
 		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
 		return (0);
 	}
 
 	rw_wunlock(blp);
 	cache_unlock_vnodes(dvlp, vlp);
 	return (EAGAIN);
 }
 
 /*
  * Zap an entry found on a chain whose bucket lock is write-locked.
  *
  * The vnode locks come before the bucket lock in the lock order, so they can
  * only be trylocked here.  Should that fail, the bucket lock is dropped and
  * cache_zap_unlocked_bucket() retakes everything in order.
  *
  * Returns 0 if the entry got zapped, EAGAIN otherwise.  The bucket lock is
  * released on return either way.  Freeing the entry is left to the caller.
  */
 static int __noinline
 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
     uint32_t hash, struct rwlock *blp)
 {
 	struct mtx *dvlp, *vlp;
 	struct vnode *dvp;
 
 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 
 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
 	vlp = NULL;
 	if (!(ncp->nc_flag & NCF_NEGATIVE))
 		vlp = VP2VNODELOCK(ncp->nc_vp);
 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
 		cache_zap_locked(ncp);
 		rw_wunlock(blp);
 		cache_unlock_vnodes(dvlp, vlp);
 		return (0);
 	}
 
 	/* Read nc_dvp while the bucket lock still keeps the entry stable. */
 	dvp = ncp->nc_dvp;
 	rw_wunlock(blp);
 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
 }
 
 /*
  * Zap an entry found on a chain whose bucket lock is read-locked.
  *
  * Works like cache_zap_wlocked_bucket(), except that after the vnode locks
  * are trylocked the read lock is dropped and the bucket is write-locked.
  * This is not an atomic upgrade; the entry is not looked up again in between.
  * NOTE(review): this appears to rely on the held vnode locks preventing the
  * removal of the entry in the window -- confirm.
  *
  * Returns 0 if the entry got zapped, EAGAIN otherwise.  The bucket lock is
  * released on return either way.  Freeing the entry is left to the caller.
  */
 static int __noinline
 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
     uint32_t hash, struct rwlock *blp)
 {
 	struct mtx *dvlp, *vlp;
 	struct vnode *dvp;
 
 	cache_assert_bucket_locked(ncp, RA_RLOCKED);
 
 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
 	vlp = NULL;
 	if (!(ncp->nc_flag & NCF_NEGATIVE))
 		vlp = VP2VNODELOCK(ncp->nc_vp);
 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
 		rw_runlock(blp);
 		rw_wlock(blp);
 		cache_zap_locked(ncp);
 		rw_wunlock(blp);
 		cache_unlock_vnodes(dvlp, vlp);
 		return (0);
 	}
 
 	/* Read nc_dvp while the bucket lock still keeps the entry stable. */
 	dvp = ncp->nc_dvp;
 	rw_runlock(blp);
 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
 }
 
 /*
  * Zap an entry while keeping the bucket write-locked.
  *
  * *vlpp1 and *vlpp2 are vnode locks the caller may already hold (either can
  * be NULL).  If they are exactly the sorted pair needed for this entry they
  * are used as is; otherwise they are released and the needed locks are
  * trylocked.
  *
  * Returns 0 if the entry got zapped; all vnode locks are released and both
  * *vlpp1 and *vlpp2 are NULL then.  Returns EAGAIN if trylocking failed; in
  * this case the bucket lock was dropped, the needed vnode locks were taken
  * in order and stored in *vlpp1 and *vlpp2, and the bucket was write-locked
  * again.  Since the bucket was unlocked for a while the caller has to
  * revalidate the entry before retrying.
  *
  * The bucket lock is held on return either way.  Freeing the entry is left
  * to the caller.
  */
 static int
 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
     struct mtx **vlpp1, struct mtx **vlpp2)
 {
 	struct mtx *dvlp, *vlp;
 
 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 
 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
 	vlp = NULL;
 	if (!(ncp->nc_flag & NCF_NEGATIVE))
 		vlp = VP2VNODELOCK(ncp->nc_vp);
 	/* After sorting only dvlp can be NULL. */
 	cache_sort_vnodes(&dvlp, &vlp);
 
 	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
 		cache_zap_locked(ncp);
 		cache_unlock_vnodes(dvlp, vlp);
 		*vlpp1 = NULL;
 		*vlpp2 = NULL;
 		return (0);
 	}
 
 	if (*vlpp1 != NULL)
 		mtx_unlock(*vlpp1);
 	if (*vlpp2 != NULL)
 		mtx_unlock(*vlpp2);
 	*vlpp1 = NULL;
 	*vlpp2 = NULL;
 
 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
 		cache_zap_locked(ncp);
 		cache_unlock_vnodes(dvlp, vlp);
 		return (0);
 	}
 
 	/* Fall back to taking everything in the right order. */
 	rw_wunlock(blp);
 	*vlpp1 = dvlp;
 	*vlpp2 = vlp;
 	if (*vlpp1 != NULL)
 		mtx_lock(*vlpp1);
 	mtx_lock(*vlpp2);
 	rw_wlock(blp);
 	return (EAGAIN);
 }
 
 /*
  * Release the lock a lookup was performed under: the read-locked bucket lock
  * if one was used (blp != NULL), the vnode lock otherwise.
  */
 static void
 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
 {
 
 	if (blp == NULL) {
 		mtx_unlock(vlp);
 		return;
 	}
 	rw_runlock(blp);
 }
 
 /*
  * Handle a lookup of ".", which always resolves to dvp itself without
  * consulting the cache.
  *
  * An extra reference is taken on dvp and its lock is adjusted to the type
  * requested in cnp->cn_lkflags.  If tsp is given it is cleared; if ticksp is
  * given it receives the current ticks value.
  *
  * Returns -1 (a positive hit) with *vpp set to dvp, or ENOENT with *vpp set
  * to NULL and the reference dropped if the vnode turned out to be doomed
  * after upgrading the lock.
  */
 static int __noinline
 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct timespec *tsp, int *ticksp)
 {
 	int ltype;
 
 	*vpp = dvp;
 	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
 			dvp, cnp->cn_nameptr);
 	counter_u64_add(dothits, 1);
 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
 	if (tsp != NULL)
 		timespecclear(tsp);
 	if (ticksp != NULL)
 		*ticksp = ticks;
 	vrefact(*vpp);
 	/*
 	 * When we lookup "." we still can be asked to lock it
 	 * differently...
 	 */
 	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
 	if (ltype != VOP_ISLOCKED(*vpp)) {
 		if (ltype == LK_EXCLUSIVE) {
 			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
 			if (VN_IS_DOOMED((*vpp))) {
 				/* forced unmount */
 				vrele(*vpp);
 				*vpp = NULL;
 				return (ENOENT);
 			}
 		} else
 			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
 	}
 	return (-1);
 }
 
 /*
  * Handle a lookup with MAKEENTRY clear: rather than returning the entry for
  * the name, remove it from the cache if present.
  *
  * For ".." the v_cache_dd pointer of dvp is cleared; if it denoted a ".."
  * entry (NCF_ISDOTDOT) that entry is zapped and freed as well.  For any other
  * name the matching entry on the hash chain is zapped and freed.
  *
  * vpp, tsp and ticksp are not used.  Always returns 0, i.e. a cache miss.
  */
 static __noinline int
 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
     struct componentname *cnp, struct timespec *tsp, int *ticksp)
 {
 	struct namecache *ncp;
 	struct rwlock *blp;
 	struct mtx *dvlp, *dvlp2;
 	uint32_t hash;
 	int error;
 
 	if (cnp->cn_namelen == 2 &&
 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
 		counter_u64_add(dotdothits, 1);
 		dvlp = VP2VNODELOCK(dvp);
 		/* Extra lock possibly handed back by cache_zap_locked_vnode_kl2(). */
 		dvlp2 = NULL;
 		mtx_lock(dvlp);
 retry_dotdot:
 		ncp = dvp->v_cache_dd;
 		if (ncp == NULL) {
 			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
 			    "..", NULL);
 			mtx_unlock(dvlp);
 			if (dvlp2 != NULL)
 				mtx_unlock(dvlp2);
 			return (0);
 		}
 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
 			if (ncp->nc_dvp != dvp)
 				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
 			/* On failure the locks were retaken, re-read v_cache_dd. */
 			if (!cache_zap_locked_vnode_kl2(ncp,
 			    dvp, &dvlp2))
 				goto retry_dotdot;
 			MPASS(dvp->v_cache_dd == NULL);
 			mtx_unlock(dvlp);
 			if (dvlp2 != NULL)
 				mtx_unlock(dvlp2);
 			cache_free(ncp);
 		} else {
 			/* Not a ".." entry: only forget the pointer, keep the entry. */
 			vn_seqc_write_begin(dvp);
 			dvp->v_cache_dd = NULL;
 			vn_seqc_write_end(dvp);
 			mtx_unlock(dvlp);
 			if (dvlp2 != NULL)
 				mtx_unlock(dvlp2);
 		}
 		return (0);
 	}
 
 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 	blp = HASH2BUCKETLOCK(hash);
 retry:
 	/* Unlocked check so that empty chains do not cost a lock round trip. */
 	if (CK_SLIST_EMPTY(NCHHASH(hash)))
 		goto out_no_entry;
 
 	rw_wlock(blp);
 
 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 			break;
 	}
 
 	/* We failed to find an entry */
 	if (ncp == NULL) {
 		rw_wunlock(blp);
 		goto out_no_entry;
 	}
 
 	/* The bucket lock is released by the callee regardless of the result. */
 	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
 	if (__predict_false(error != 0)) {
 		zap_and_exit_bucket_fail++;
 		cache_maybe_yield();
 		goto retry;
 	}
 	counter_u64_add(numposzaps, 1);
 	cache_free(ncp);
 	return (0);
 out_no_entry:
 	SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
 	counter_u64_add(nummisszap, 1);
 	return (0);
 }
 
 /**
  * Lookup a name in the name cache
  *
  * # Arguments
  *
  * - dvp:	Parent directory in which to search.
  * - vpp:	Return argument.  Will contain desired vnode on cache hit.
  * - cnp:	Parameters of the name search.  The most interesting bits of
  *   		the cn_flags field have the following meanings:
  *   	- MAKEENTRY:	If clear, free an entry from the cache rather than look
  *   			it up.
  *   	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
  * - tsp:	Return storage for cache timestamp.  On a successful (positive
  *   		or negative) lookup, tsp will be filled with any timespec that
  *   		was stored when this cache entry was created.  However, it will
  *   		be clear for "." entries.
  * - ticksp:	Return storage for alternate cache timestamp.  On a successful
  *   		(positive or negative) lookup, it will contain the ticks value
  *   		that was current when the cache entry was created, unless cnp
  *   		was ".".
  *
  * # Returns
  *
  * - -1:	A positive cache hit.  vpp will contain the desired vnode.
  * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
  *		to a forced unmount.  vpp will not be modified.  If the entry
  *		is a whiteout, then the ISWHITEOUT flag will be set in
  *		cnp->cn_flags.
  * - 0:		A cache miss.  vpp will not be modified.
  *
  * # Locking
  *
  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
  * lock is not recursively acquired.
  */
 int
 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct timespec *tsp, int *ticksp)
 {
 	struct namecache_ts *ncp_ts;
 	struct namecache *ncp;
 	struct negstate *negstate;
 	struct rwlock *blp;
 	struct mtx *dvlp;
 	uint32_t hash;
 	enum vgetstate vs;
 	int error, ltype;
 	bool try_smr, doing_smr, whiteout;
 
 #ifdef DEBUG_CACHE
 	if (__predict_false(!doingcache)) {
 		cnp->cn_flags &= ~MAKEENTRY;
 		return (0);
 	}
 #endif
 
 	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
 		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
 
 	if ((cnp->cn_flags & MAKEENTRY) == 0)
 		return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));
 
 	try_smr = true;
 	if (cnp->cn_nameiop == CREATE)
 		try_smr = false;
 retry:
 	doing_smr = false;
 	blp = NULL;
 	dvlp = NULL;
 	error = 0;
 	if (cnp->cn_namelen == 2 &&
 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
 		counter_u64_add(dotdothits, 1);
 		dvlp = VP2VNODELOCK(dvp);
 		mtx_lock(dvlp);
 		ncp = dvp->v_cache_dd;
 		if (ncp == NULL) {
 			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
 			    "..", NULL);
 			mtx_unlock(dvlp);
 			return (0);
 		}
 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
 			if (ncp->nc_flag & NCF_NEGATIVE)
 				*vpp = NULL;
 			else
 				*vpp = ncp->nc_vp;
 		} else
 			*vpp = ncp->nc_dvp;
 		/* Return failure if negative entry was found. */
 		if (*vpp == NULL)
 			goto negative_success;
 		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
 		    dvp, cnp->cn_nameptr, *vpp);
 		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
 		    *vpp);
 		cache_out_ts(ncp, tsp, ticksp);
 		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
 		    NCF_DTS && tsp != NULL) {
 			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 			*tsp = ncp_ts->nc_dotdottime;
 		}
 		goto success;
 	}
 
 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 retry_hashed:
 	if (try_smr) {
 		vfs_smr_enter();
 		doing_smr = true;
 		try_smr = false;
 	} else {
 		blp = HASH2BUCKETLOCK(hash);
 		rw_rlock(blp);
 	}
 
 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 			break;
 	}
 
 	/* We failed to find an entry */
 	if (__predict_false(ncp == NULL)) {
 		if (doing_smr)
 			vfs_smr_exit();
 		else
 			rw_runlock(blp);
 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
 		    NULL);
 		counter_u64_add(nummiss, 1);
 		return (0);
 	}
 
 	if (ncp->nc_flag & NCF_NEGATIVE)
 		goto negative_success;
 
 	/* We found a "positive" match, return the vnode */
 	counter_u64_add(numposhits, 1);
 	*vpp = ncp->nc_vp;
 	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
 	    dvp, cnp->cn_nameptr, *vpp, ncp);
 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
 	    *vpp);
 	cache_out_ts(ncp, tsp, ticksp);
 success:
 	/*
 	 * On success we return a locked and ref'd vnode as per the lookup
 	 * protocol.
 	 */
 	MPASS(dvp != *vpp);
 	ltype = 0;	/* silence gcc warning */
 	if (cnp->cn_flags & ISDOTDOT) {
 		ltype = VOP_ISLOCKED(dvp);
 		VOP_UNLOCK(dvp);
 	}
 	if (doing_smr) {
 		if (!cache_ncp_canuse(ncp)) {
 			vfs_smr_exit();
 			*vpp = NULL;
 			goto retry;
 		}
 		vs = vget_prep_smr(*vpp);
 		vfs_smr_exit();
 		if (__predict_false(vs == VGET_NONE)) {
 			*vpp = NULL;
 			goto retry;
 		}
 	} else {
 		vs = vget_prep(*vpp);
 		cache_lookup_unlock(blp, dvlp);
 	}
 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
 	if (cnp->cn_flags & ISDOTDOT) {
 		vn_lock(dvp, ltype | LK_RETRY);
 		if (VN_IS_DOOMED(dvp)) {
 			if (error == 0)
 				vput(*vpp);
 			*vpp = NULL;
 			return (ENOENT);
 		}
 	}
 	if (error) {
 		*vpp = NULL;
 		goto retry;
 	}
 	if ((cnp->cn_flags & ISLASTCN) &&
 	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
 		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
 	}
 	return (-1);
 
 negative_success:
 	/* We found a negative match, and want to create it, so purge */
 	if (cnp->cn_nameiop == CREATE) {
 		MPASS(!doing_smr);
 		counter_u64_add(numnegzaps, 1);
 		goto zap_and_exit;
 	}
 
 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
 	cache_out_ts(ncp, tsp, ticksp);
 	counter_u64_add(numneghits, 1);
 	whiteout = (ncp->nc_flag & NCF_WHITE);
 
 	if (doing_smr) {
 		/*
 		 * We need to take locks to promote an entry.
 		 */
 		negstate = NCP2NEGSTATE(ncp);
 		if ((negstate->neg_flag & NEG_HOT) == 0 ||
 		    !cache_ncp_canuse(ncp)) {
 			vfs_smr_exit();
 			doing_smr = false;
 			goto retry_hashed;
 		}
 		vfs_smr_exit();
 	} else {
 		cache_negative_hit(ncp);
 		cache_lookup_unlock(blp, dvlp);
 	}
 	if (whiteout)
 		cnp->cn_flags |= ISWHITEOUT;
 	return (ENOENT);
 
 zap_and_exit:
 	MPASS(!doing_smr);
 	if (blp != NULL)
 		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
 	else
 		error = cache_zap_locked_vnode(ncp, dvp);
 	if (__predict_false(error != 0)) {
 		zap_and_exit_bucket_fail2++;
 		cache_maybe_yield();
 		goto retry;
 	}
 	cache_free(ncp);
 	return (0);
 }
 
 /*
  * Locks held while entering a name: up to three vnode locks and up to two
  * bucket locks.  A NULL slot means "not held".
  */
 struct celockstate {
 	struct mtx *vlp[3];
 	struct rwlock *blp[2];
 };
 /* The lock/unlock helpers index these arrays by hand; pin their sizes. */
 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
 
 /*
  * Reset a celockstate so that every lock slot reads as NULL (not held).
  */
 static inline void
 cache_celockstate_init(struct celockstate *cel)
 {
 
 	memset(cel, 0, sizeof(struct celockstate));
 }
 
 /*
  * Take the vnode locks covering vp and dvp and record them in slots 0 and 1
  * of cel.  The two locks are sorted before being acquired; the first one may
  * be NULL after sorting, in which case only the second is taken.
  */
 static void
 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
     struct vnode *dvp)
 {
 	struct mtx *first, *second;
 
 	MPASS(cel->vlp[0] == NULL);
 	MPASS(cel->vlp[1] == NULL);
 	MPASS(cel->vlp[2] == NULL);
 
 	MPASS(vp != NULL || dvp != NULL);
 
 	first = VP2VNODELOCK(vp);
 	second = VP2VNODELOCK(dvp);
 	cache_sort_vnodes(&first, &second);
 
 	if (first != NULL) {
 		mtx_lock(first);
 		cel->vlp[0] = first;
 	}
 
 	mtx_lock(second);
 	cel->vlp[1] = second;
 }
 
 static void
 cache_unlock_vnodes_cel(struct celockstate *cel)
 {
 
 	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
 
 	if (cel->vlp[0] != NULL)
 		mtx_unlock(cel->vlp[0]);
 	if (cel->vlp[1] != NULL)
 		mtx_unlock(cel->vlp[1]);
 	if (cel->vlp[2] != NULL)
 		mtx_unlock(cel->vlp[2]);
 }
 
 /*
  * Take a third vnode lock (for vp) on top of the two already recorded in cel
  * and store it in slot 2.
  *
  * Returns true if the lock was obtained without dropping the locks already
  * held.  Returns false if all vnode locks had to be released and re-taken
  * in address order; the caller must then re-validate whatever it read under
  * the old locks.
  */
 static bool
 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
 {
 	struct mtx *vlp;
 	bool ret;
 
 	cache_assert_vlp_locked(cel->vlp[0]);
 	cache_assert_vlp_locked(cel->vlp[1]);
 	MPASS(cel->vlp[2] == NULL);
 
 	MPASS(vp != NULL);
 	vlp = VP2VNODELOCK(vp);
 
 	ret = true;
 	if (vlp >= cel->vlp[1]) {
 		/* Not below the highest lock held: blocking keeps the order. */
 		mtx_lock(vlp);
 	} else {
 		/* Out of order: only a non-blocking attempt is allowed. */
 		if (mtx_trylock(vlp))
 			goto out;
 		cache_lock_vnodes_cel_3_failures++;
 		/* Drop everything and re-acquire all three, smallest address first. */
 		cache_unlock_vnodes_cel(cel);
 		if (vlp < cel->vlp[0]) {
 			mtx_lock(vlp);
 			mtx_lock(cel->vlp[0]);
 			mtx_lock(cel->vlp[1]);
 		} else {
 			if (cel->vlp[0] != NULL)
 				mtx_lock(cel->vlp[0]);
 			mtx_lock(vlp);
 			mtx_lock(cel->vlp[1]);
 		}
 		ret = false;
 	}
 out:
 	cel->vlp[2] = vlp;
 	return (ret);
 }
 
 /*
  * Write-lock the given bucket locks in sorted order and record them in cel.
  * After sorting, the first lock may be NULL; only the second is then taken.
  */
 static void
 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
     struct rwlock *blp2)
 {
 	struct rwlock *lower, *upper;
 
 	MPASS(cel->blp[0] == NULL);
 	MPASS(cel->blp[1] == NULL);
 
 	lower = blp1;
 	upper = blp2;
 	cache_sort_vnodes(&lower, &upper);
 
 	if (lower != NULL) {
 		rw_wlock(lower);
 		cel->blp[0] = lower;
 	}
 
 	rw_wlock(upper);
 	cel->blp[1] = upper;
 }
 
 /*
  * Drop the bucket write locks recorded in cel.  Slot 0 is optional, slot 1
  * is always expected to be held.
  */
 static void
 cache_unlock_buckets_cel(struct celockstate *cel)
 {
 	struct rwlock *optional;
 
 	optional = cel->blp[0];
 	if (optional != NULL)
 		rw_wunlock(optional);
 
 	rw_wunlock(cel->blp[1]);
 }
 
 /*
  * Lock part of the cache affected by the insertion.
  *
  * This means vnodelocks for dvp, vp and the relevant bucketlock.
  * However, insertion can result in removal of an old entry. In this
  * case we have an additional vnode and bucketlock pair to lock. If the
  * entry is negative, ncelock is locked instead of the vnode.
  *
  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
  * preserving the locking order (smaller address first).
  *
  * On return all locks taken are recorded in cel; release them with
  * cache_enter_unlock().
  */
 static void
 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
     uint32_t hash)
 {
 	struct namecache *ncp;
 	struct rwlock *blps[2];
 
 	blps[0] = HASH2BUCKETLOCK(hash);
 	for (;;) {
 		blps[1] = NULL;
 		/*
 		 * The callee sorts the two locks, so the argument order here
 		 * does not affect which locks get taken.
 		 */
 		cache_lock_vnodes_cel(cel, dvp, vp);
 		/* Only a directory vp can own a ".." entry that must go. */
 		if (vp == NULL || vp->v_type != VDIR)
 			break;
 		ncp = vp->v_cache_dd;
 		if (ncp == NULL)
 			break;
 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 			break;
 		MPASS(ncp->nc_dvp == vp);
 		blps[1] = NCP2BUCKETLOCK(ncp);
 		/* A negative ".." entry has no target vnode to lock. */
 		if (ncp->nc_flag & NCF_NEGATIVE)
 			break;
 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
 			break;
 		/*
 		 * All vnodes got re-locked. Re-validate the state and if
 		 * nothing changed we are done. Otherwise restart.
 		 */
 		if (ncp == vp->v_cache_dd &&
 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
 			break;
 		cache_unlock_vnodes_cel(cel);
 		cel->vlp[0] = NULL;
 		cel->vlp[1] = NULL;
 		cel->vlp[2] = NULL;
 	}
 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
 }
 
 /*
  * Variant of cache_enter_lock() for replacing the ".." entry of dvp: the
  * entry that may need removal is found through dvp->v_cache_dd rather than
  * through vp.  All locks taken are recorded in cel.
  */
 static void
 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
     uint32_t hash)
 {
 	struct namecache *ncp;
 	struct rwlock *blps[2];
 
 	blps[0] = HASH2BUCKETLOCK(hash);
 	for (;;) {
 		blps[1] = NULL;
 		cache_lock_vnodes_cel(cel, dvp, vp);
 		ncp = dvp->v_cache_dd;
 		if (ncp == NULL)
 			break;
 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 			break;
 		MPASS(ncp->nc_dvp == dvp);
 		blps[1] = NCP2BUCKETLOCK(ncp);
 		/* A negative ".." entry has no target vnode to lock. */
 		if (ncp->nc_flag & NCF_NEGATIVE)
 			break;
 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
 			break;
 		/*
 		 * The vnode locks were dropped and re-taken.  Keep them only
 		 * if the ".." entry is still the same; otherwise start over.
 		 */
 		if (ncp == dvp->v_cache_dd &&
 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
 			break;
 		cache_unlock_vnodes_cel(cel);
 		cel->vlp[0] = NULL;
 		cel->vlp[1] = NULL;
 		cel->vlp[2] = NULL;
 	}
 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
 }
 
 /*
  * Release everything recorded in cel: bucket locks first, then vnode locks
  * (the reverse of the order in which the enter-lock helpers take them).
  */
 static void
 cache_enter_unlock(struct celockstate *cel)
 {
 
 	cache_unlock_buckets_cel(cel);
 	cache_unlock_vnodes_cel(cel);
 }
 
 /*
  * Prepare dvp for entering a new ".." entry: if dvp->v_cache_dd is set,
  * clear it, and if it pointed at a dedicated ".." entry, remove and free
  * that entry as well.
  */
 static void __noinline
 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
     struct componentname *cnp)
 {
 	struct celockstate cel;
 	struct namecache *ncp;
 	uint32_t hash;
 	int len;
 
 	/* Unlocked check; nothing to do when no entry is attached. */
 	if (dvp->v_cache_dd == NULL)
 		return;
 	len = cnp->cn_namelen;
 	cache_celockstate_init(&cel);
 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
 	cache_enter_lock_dd(&cel, dvp, vp, hash);
 	vn_seqc_write_begin(dvp);
 	/* Re-read under the locks; the pointer may have changed. */
 	ncp = dvp->v_cache_dd;
 	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
 		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
 		cache_zap_locked(ncp);
 	} else {
 		/* Not a ".." entry: only detach it, do not free it. */
 		ncp = NULL;
 	}
 	dvp->v_cache_dd = NULL;
 	vn_seqc_write_end(dvp);
 	cache_enter_unlock(&cel);
 	/* ncp may be NULL here; the free happens after the locks are dropped. */
 	cache_free(ncp);
 }
 
 /*
  * Add an entry to the cache.
  *
  * - dvp:	Directory the name lives in.
  * - vp:	Vnode the name resolves to, or NULL to enter a negative entry.
  * - cnp:	Name being entered; "." is never cached.
  * - tsp:	Optional timestamp to store with the entry.
  * - dtsp:	Optional ".." timestamp; only used when tsp is also given.
  *
  * The entry is silently dropped when the cache is at its size limit or when
  * an entry with the same name already exists in dvp (in which case only the
  * existing entry's timestamps are refreshed).
  */
 void
 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
     struct timespec *tsp, struct timespec *dtsp)
 {
 	struct celockstate cel;
 	struct namecache *ncp, *n2, *ndd;
 	struct namecache_ts *ncp_ts, *n2_ts;
 	struct nchashhead *ncpp;
 	uint32_t hash;
 	int flag;
 	int len;
 	u_long lnumcache;
 
 	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
 	VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp,
 	    ("cache_enter: Adding a doomed vnode"));
 	VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp,
 	    ("cache_enter: Doomed vnode used as src"));
 
 #ifdef DEBUG_CACHE
 	if (__predict_false(!doingcache))
 		return;
 #endif
 
 	flag = 0;
 	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
 		if (cnp->cn_namelen == 1)
 			return;
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			/* Detach whatever dvp->v_cache_dd currently holds. */
 			cache_enter_dotdot_prep(dvp, vp, cnp);
 			flag = NCF_ISDOTDOT;
 		}
 	}
 
 	/*
 	 * Avoid blowout in namecache entries.
 	 */
 	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
 	if (__predict_false(lnumcache >= ncsize)) {
 		atomic_add_long(&numcache, -1);
 		counter_u64_add(numdrops, 1);
 		return;
 	}
 
 	cache_celockstate_init(&cel);
 	ndd = NULL;
 	ncp_ts = NULL;
 
 	/*
 	 * Calculate the hash key and setup as much of the new
 	 * namecache entry as possible before acquiring the lock.
 	 */
 	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
 	/* NCF_WIP stays set until the entry is fully linked in (see below). */
 	ncp->nc_flag = flag | NCF_WIP;
 	ncp->nc_vp = vp;
 	if (vp == NULL)
 		cache_negative_init(ncp);
 	ncp->nc_dvp = dvp;
 	if (tsp != NULL) {
 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 		ncp_ts->nc_time = *tsp;
 		ncp_ts->nc_ticks = ticks;
 		ncp_ts->nc_nc.nc_flag |= NCF_TS;
 		if (dtsp != NULL) {
 			ncp_ts->nc_dotdottime = *dtsp;
 			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
 		}
 	}
 	len = ncp->nc_nlen = cnp->cn_namelen;
 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
 	memcpy(ncp->nc_name, cnp->cn_nameptr, len);
 	ncp->nc_name[len] = '\0';
 	cache_enter_lock(&cel, dvp, vp, hash);
 
 	/*
 	 * See if this vnode or negative entry is already in the cache
 	 * with this name.  This can happen with concurrent lookups of
 	 * the same path name.
 	 */
 	ncpp = NCHHASH(hash);
 	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
 		if (n2->nc_dvp == dvp &&
 		    n2->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
 			/* Duplicate: refresh its timestamps, drop ours. */
 			if (tsp != NULL) {
 				KASSERT((n2->nc_flag & NCF_TS) != 0,
 				    ("no NCF_TS"));
 				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
 				n2_ts->nc_time = ncp_ts->nc_time;
 				n2_ts->nc_ticks = ncp_ts->nc_ticks;
 				if (dtsp != NULL) {
 					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
 					n2_ts->nc_nc.nc_flag |= NCF_DTS;
 				}
 			}
 			goto out_unlock_free;
 		}
 	}
 
 	if (flag == NCF_ISDOTDOT) {
 		/*
 		 * See if we are trying to add .. entry, but some other lookup
 		 * has populated v_cache_dd pointer already.
 		 */
 		if (dvp->v_cache_dd != NULL)
 			goto out_unlock_free;
 		KASSERT(vp == NULL || vp->v_type == VDIR,
 		    ("wrong vnode type %p", vp));
 		vn_seqc_write_begin(dvp);
 		dvp->v_cache_dd = ncp;
 		vn_seqc_write_end(dvp);
 	}
 
 	if (vp != NULL) {
 		if (vp->v_type == VDIR) {
 			if (flag != NCF_ISDOTDOT) {
 				/*
 				 * For this case, the cache entry maps both the
 				 * directory name in it and the name ".." for the
 				 * directory's parent.
 				 */
 				vn_seqc_write_begin(vp);
 				/*
 				 * A dedicated ".." entry being replaced is
 				 * zapped now and freed after unlocking.
 				 */
 				if ((ndd = vp->v_cache_dd) != NULL) {
 					if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
 						cache_zap_locked(ndd);
 					else
 						ndd = NULL;
 				}
 				vp->v_cache_dd = ncp;
 				vn_seqc_write_end(vp);
 			}
 		} else {
 			if (vp->v_cache_dd != NULL) {
 				vn_seqc_write_begin(vp);
 				vp->v_cache_dd = NULL;
 				vn_seqc_write_end(vp);
 			}
 		}
 	}
 
 	if (flag != NCF_ISDOTDOT) {
 		/* The first name cached under dvp takes a hold on dvp. */
 		if (LIST_EMPTY(&dvp->v_cache_src)) {
 			vhold(dvp);
 			counter_u64_add(numcachehv, 1);
 		}
 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 	}
 
 	/*
 	 * If the entry is "negative", we place it into the
 	 * "negative" cache queue, otherwise, we place it into the
 	 * destination vnode's cache entries queue.
 	 */
 	if (vp != NULL) {
 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
 		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
 		    vp);
 	} else {
 		if (cnp->cn_flags & ISWHITEOUT)
 			ncp->nc_flag |= NCF_WHITE;
 		cache_negative_insert(ncp);
 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
 		    ncp->nc_name);
 	}
 
 	/*
 	 * Insert the new namecache entry into the appropriate chain
 	 * within the cache entries table.
 	 */
 	CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
 
 	/* Order all stores above before the store that clears NCF_WIP. */
 	atomic_thread_fence_rel();
 	/*
 	 * Mark the entry as fully constructed.
 	 * It is immutable past this point until its removal.
 	 */
 	atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
 
 	cache_enter_unlock(&cel);
 	/* Trim one negative entry if they exceed their share of the cache. */
 	if (numneg * ncnegfactor > lnumcache)
 		cache_negative_zap_one();
 	cache_free(ndd);
 	return;
 out_unlock_free:
 	/* Entry not inserted: undo the numcache reservation and free it. */
 	cache_enter_unlock(&cel);
 	atomic_add_long(&numcache, -1);
 	cache_free(ncp);
 	return;
 }
 
 /*
  * Return the smallest power of two strictly greater than val.
  *
  * If that value does not fit in a u_int (val has its top bit set), the
  * largest representable power of two is returned instead.  Without this
  * bound the shift would wrap res to 0 and the loop would never terminate.
  */
 static u_int
 cache_roundup_2(u_int val)
 {
 	u_int res, top;
 
 	/* Highest power of two representable in a u_int. */
 	top = ~(~(u_int)0 >> 1);
 
 	for (res = 1; res <= val && res != top; res <<= 1)
 		continue;
 
 	return (res);
 }
 
 /*
  * Allocate and initialise a hash table sized from `elements`.  The number
  * of buckets is cache_roundup_2(elements) / 2 and the matching mask
  * (bucket count minus one) is stored in *hashmask.
  */
 static struct nchashhead *
 nchinittbl(u_long elements, u_long *hashmask)
 {
 	struct nchashhead *tbl, *head;
 	u_long nbuckets;
 
 	nbuckets = cache_roundup_2(elements) / 2;
 
 	tbl = malloc(nbuckets * sizeof(*tbl), M_VFSCACHE, M_WAITOK);
 	for (head = tbl; head < tbl + nbuckets; head++)
 		CK_SLIST_INIT(head);
 
 	*hashmask = nbuckets - 1;
 	return (tbl);
 }
 
 /*
  * Free a hash table obtained from nchinittbl().
  */
 static void
 ncfreetbl(struct nchashhead *hashtbl)
 {
 
 	free(hashtbl, M_VFSCACHE);
 }
 
 /*
  * Name cache initialization, from vfs_init() when we are booting
  *
  * Creates the four entry zones, sizes the hash table from desiredvnodes and
  * allocates and initialises the bucket locks, vnode locks and negative
  * entry lists.
  */
 static void
 nchinit(void *dummy __unused)
 {
 	u_int i;
 
 	/* Zones for small/large names, each with and without timestamps. */
 	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 	cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 	cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 
 	VFS_SMR_ZONE_SET(cache_zone_small);
 	VFS_SMR_ZONE_SET(cache_zone_small_ts);
 	VFS_SMR_ZONE_SET(cache_zone_large);
 	VFS_SMR_ZONE_SET(cache_zone_large_ts);
 
 	ncsize = desiredvnodes * ncsizefactor;
 	nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
 	/* Bucket-lock mask: scaled by CPU count, clamped to [7, nchash]. */
 	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
 	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
 		ncbuckethash = 7;
 	if (ncbuckethash > nchash)
 		ncbuckethash = nchash;
 	/*
 	 * NOTE(review): numbucketlocks and numvnodelocks are not assigned
 	 * here; presumably they are derived from ncbuckethash/ncvnodehash,
 	 * which is why those are set before each allocation -- confirm.
 	 */
 	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
 	    M_WAITOK | M_ZERO);
 	for (i = 0; i < numbucketlocks; i++)
 		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
 	ncvnodehash = ncbuckethash;
 	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
 	    M_WAITOK | M_ZERO);
 	for (i = 0; i < numvnodelocks; i++)
 		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
 	/* Mount-size threshold consulted by cache_purgevfs() when !force. */
 	ncpurgeminvnodes = numbucketlocks * 2;
 
 	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
 	    M_WAITOK | M_ZERO);
 	for (i = 0; i < numneglists; i++) {
 		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
 		TAILQ_INIT(&neglists[i].nl_list);
 	}
 	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
 	TAILQ_INIT(&ncneg_hot.nl_list);
 
 	mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
 
 /*
  * Set up the name cache fields of a vnode: no ".." link, empty source and
  * destination lists, and a precomputed hash.
  */
 void
 cache_vnode_init(struct vnode *vp)
 {
 
 	vp->v_cache_dd = NULL;
 	TAILQ_INIT(&vp->v_cache_dst);
 	LIST_INIT(&vp->v_cache_src);
 	cache_prehash(vp);
 }
 
 /*
  * Resize the name cache for a new maximum number of vnodes.
  *
  * A new hash table is allocated up front.  If its size matches the current
  * one it is discarded and nothing else changes (ncsize included).
  * Otherwise every entry is rehashed into the new table while all vnode and
  * bucket locks are held, ncsize is updated, and the old table is freed.
  */
 void
 cache_changesize(u_long newmaxvnodes)
 {
 	struct nchashhead *new_nchashtbl, *old_nchashtbl;
 	u_long new_nchash, old_nchash;
 	struct namecache *ncp;
 	uint32_t hash;
 	u_long newncsize;
 	u_long i;	/* same type as the mask it is compared against */
 
 	newncsize = newmaxvnodes * ncsizefactor;
 	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
 	if (newmaxvnodes < numbucketlocks)
 		newmaxvnodes = numbucketlocks;
 
 	new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
 	/* If same hash table size, nothing to do */
 	if (nchash == new_nchash) {
 		ncfreetbl(new_nchashtbl);
 		return;
 	}
 	/*
 	 * Move everything from the old hash table to the new table.
 	 * None of the namecache entries in the table can be removed
 	 * because to do so, they have to be removed from the hash table.
 	 */
 	cache_lock_all_vnodes();
 	cache_lock_all_buckets();
 	old_nchashtbl = nchashtbl;
 	old_nchash = nchash;
 	/* Switch the globals first so NCHHASH() below targets the new table. */
 	nchashtbl = new_nchashtbl;
 	nchash = new_nchash;
 	for (i = 0; i <= old_nchash; i++) {
 		while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
 			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
 			    ncp->nc_dvp);
 			CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
 			CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
 		}
 	}
 	ncsize = newncsize;
 	cache_unlock_all_buckets();
 	cache_unlock_all_vnodes();
 	ncfreetbl(old_nchashtbl);
 }
 
 /*
  * Invalidate all entries from and to a particular vnode.
  *
  * Must be called with vp's vnode lock held; the lock is released before
  * returning.  Entries are unlinked under the locks and collected on a local
  * list, then freed once the locks have been dropped.
  */
 static void
 cache_purge_impl(struct vnode *vp)
 {
 	TAILQ_HEAD(, namecache) ncps;
 	struct namecache *ncp, *nnp;
 	struct mtx *vlp, *vlp2;
 
 	TAILQ_INIT(&ncps);
 	vlp = VP2VNODELOCK(vp);
 	vlp2 = NULL;
 	mtx_assert(vlp, MA_OWNED);
 retry:
 	/*
 	 * Whenever the zap helper returns false, start again from the top:
 	 * the list heads are simply re-read.
 	 */
 	/* Names for which vp is the directory. */
 	while (!LIST_EMPTY(&vp->v_cache_src)) {
 		ncp = LIST_FIRST(&vp->v_cache_src);
 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 			goto retry;
 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 	}
 	/* Names that resolve to vp. */
 	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 			goto retry;
 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 	}
 	/* Whatever is still attached here must be a dedicated ".." entry. */
 	ncp = vp->v_cache_dd;
 	if (ncp != NULL) {
 		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
 		   ("lost dotdot link"));
 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 			goto retry;
 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 	}
 	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
 	mtx_unlock(vlp);
 	if (vlp2 != NULL)
 		mtx_unlock(vlp2);
 	/* The _SAFE variant is required since each entry is freed in turn. */
 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 		cache_free(ncp);
 	}
 }
 
 /*
  * Drop every name cache entry that refers to vp, either as the directory
  * or as the target.  Nothing is locked when vp has no entries at all.
  */
 void
 cache_purge(struct vnode *vp)
 {
 	struct mtx *vlp;
 
 	SDT_PROBE1(vfs, namecache, purge, done, vp);
 	if (!LIST_EMPTY(&vp->v_cache_src) || !TAILQ_EMPTY(&vp->v_cache_dst) ||
 	    vp->v_cache_dd != NULL) {
 		vlp = VP2VNODELOCK(vp);
 		mtx_lock(vlp);
 		/* The callee releases vlp before returning. */
 		cache_purge_impl(vp);
 	}
 }
 
 /*
  * Only to be used by vgone.
  *
  * Purge a doomed vnode.  When the vnode looks empty, wait for any current
  * holder of its vnode lock to finish and look again before concluding that
  * there is nothing to purge.
  */
 void
 cache_purge_vgone(struct vnode *vp)
 {
 	struct mtx *vlp;
 
 	VNPASS(VN_IS_DOOMED(vp), vp);
 	vlp = VP2VNODELOCK(vp);
 	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
 	    vp->v_cache_dd == NULL) {
 		/*
 		 * All the NULL pointer state we found above may be transient.
 		 * Serialize against a possible thread doing cache_purge.
 		 */
 		mtx_wait_unlocked(vlp);
 		if (LIST_EMPTY(&vp->v_cache_src) &&
 		    TAILQ_EMPTY(&vp->v_cache_dst) && vp->v_cache_dd == NULL)
 			return;
 	}
 
 	mtx_lock(vlp);
 	cache_purge_impl(vp);
 	mtx_assert(vlp, MA_NOTOWNED);
 }
 
 /*
  * Invalidate all negative entries for a particular directory vnode.
  */
 void
 cache_purge_negative(struct vnode *vp)
 {
 	TAILQ_HEAD(, namecache) ncps;
 	struct namecache *ncp, *nnp;
 	struct mtx *vlp;
 
 	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
 	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
 	if (LIST_EMPTY(&vp->v_cache_src))
 		return;
 	TAILQ_INIT(&ncps);
 	vlp = VP2VNODELOCK(vp);
 	mtx_lock(vlp);
 	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
 		if (!(ncp->nc_flag & NCF_NEGATIVE))
 			continue;
 		cache_zap_negative_locked_vnode_kl(ncp, vp);
 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 	}
 	mtx_unlock(vlp);
 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 		cache_free(ncp);
 	}
 }
 
 /*
  * Flush all entries referencing a particular filesystem.
  *
  * Unless force is set, nothing is done for mounts with no more than
  * ncpurgeminvnodes vnodes.  Entries whose directory vnode belongs to mp are
  * unlinked under the locks and freed after all locks have been released.
  */
 void
 cache_purgevfs(struct mount *mp, bool force)
 {
 	TAILQ_HEAD(, namecache) ncps;
 	struct mtx *vlp1, *vlp2;
 	struct rwlock *blp;
 	struct nchashhead *bucket;
 	struct namecache *ncp, *nnp;
 	u_long i, j, n_nchash;
 	int error;
 
 	/* Scan hash tables for applicable entries */
 	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
 	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
 		return;
 	TAILQ_INIT(&ncps);
 	n_nchash = nchash + 1;
 	vlp1 = vlp2 = NULL;
 	/*
 	 * For each bucket lock, walk every hash chain whose index is
 	 * congruent to the lock's index modulo numbucketlocks.
 	 */
 	for (i = 0; i < numbucketlocks; i++) {
 		blp = (struct rwlock *)&bucketlocks[i];
 		rw_wlock(blp);
 		for (j = i; j < n_nchash; j += numbucketlocks) {
 retry:
 			/* A failed zap rescans the same chain from its head. */
 			bucket = &nchashtbl[j];
 			CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
 				cache_assert_bucket_locked(ncp, RA_WLOCKED);
 				if (ncp->nc_dvp->v_mount != mp)
 					continue;
 				error = cache_zap_wlocked_bucket_kl(ncp, blp,
 				    &vlp1, &vlp2);
 				if (error != 0)
 					goto retry;
 				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
 			}
 		}
 		rw_wunlock(blp);
 		/* Yield only while no vnode locks are carried over. */
 		if (vlp1 == NULL && vlp2 == NULL)
 			cache_maybe_yield();
 	}
 	if (vlp1 != NULL)
 		mtx_unlock(vlp1);
 	if (vlp2 != NULL)
 		mtx_unlock(vlp2);
 
 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 		cache_free(ncp);
 	}
 }
 
 /*
  * Perform canonical checks and cache lookup and pass on to filesystem
  * through the vop_cachedlookup only if needed.
  *
  * Returns ENOTDIR if the directory vnode is not a directory, EROFS for a
  * DELETE or RENAME of the last component on a read-only mount, and any
  * error from the execute-permission check.  A cache miss is forwarded to
  * VOP_CACHEDLOOKUP(); a positive hit yields 0; any other cache result is
  * returned unchanged.
  */
 
 int
 vfs_cache_lookup(struct vop_lookup_args *ap)
 {
 	struct componentname *cnp;
 	struct vnode **vpp, *dvp;
 	int error, flags;
 
 	dvp = ap->a_dvp;
 	vpp = ap->a_vpp;
 	cnp = ap->a_cnp;
 	flags = cnp->cn_flags;
 
 	*vpp = NULL;
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 
 	error = vn_dir_check_exec(dvp, cnp);
 	if (error != 0)
 		return (error);
 
 	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 	switch (error) {
 	case 0:
 		/* Cache miss: ask the filesystem. */
 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 	case -1:
 		/* Positive hit: *vpp has been filled in. */
 		return (0);
 	default:
 		return (error);
 	}
 }
 
 /*
  * Implementation of the getcwd syscall.
  *
  * Rejects user buffers shorter than 2 bytes with EINVAL and clamps the
  * working length to MAXPATHLEN.  The path is produced by vn_getcwd() in a
  * temporary kernel buffer and copied out to uap->buf only on success.
  */
 int
 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
 {
 	char *buf, *retbuf;
 	size_t buflen;
 	int error;
 
 	buflen = uap->buflen;
 	if (__predict_false(buflen < 2))
 		return (EINVAL);
 	if (buflen > MAXPATHLEN)
 		buflen = MAXPATHLEN;
 
-	buf = malloc(buflen, M_TEMP, M_WAITOK);
+	buf = uma_zalloc(namei_zone, M_WAITOK);
 	error = vn_getcwd(td, buf, &retbuf, &buflen);
 	/* On success retbuf points into buf and buflen is the copy length. */
 	if (error == 0)
 		error = copyout(retbuf, uap->buf, buflen);
-	free(buf, M_TEMP);
+	uma_zfree(namei_zone, buf);
 	return (error);
 }
 
 /*
  * Resolve the current working directory of td into buf, relative to the
  * thread's root directory.  The arguments buf, retbuf and buflen are passed
  * straight through to vn_fullpath_any().  On success the resulting path is
  * also logged via ktrace when KTR_NAMEI tracing is enabled.
  */
 int
 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen)
 {
 	struct pwd *pwd;
 	int error;
 
 	/* Hold the pwd so cdir/rdir stay valid for the duration of the walk. */
 	pwd = pwd_hold(td);
 	error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen);
 	pwd_drop(pwd);
 
 #ifdef KTRACE
 	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
 		ktrnamei(*retbuf);
 #endif
 	return (error);
 }
 
 /*
  * Resolve path (looked up relative to fd) to a full path and copy it out to
  * buf, which holds at most size bytes.  No flags are defined; any non-zero
  * value is rejected with EINVAL.
  */
 static int
 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
     size_t size, int flags, enum uio_seg pathseg)
 {
 	struct nameidata nd;
 	char *resolved, *tofree;
 	int error;
 
 	if (flags != 0)
 		return (EINVAL);
 
 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
 	    pathseg, path, fd, &cap_fstat_rights, td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 
 	error = vn_fullpath_hardlink(td, &nd, &resolved, &tofree, &size);
 	if (error == 0) {
 		/* The path buffer is only handed back on success. */
 		error = copyout(resolved, buf, size);
 		free(tofree, M_TEMP);
 	}
 	NDFREE(&nd, 0);
 	return (error);
 }
 
 /*
  * Syscall entry point for __realpathat: forwards the user-supplied
  * arguments to kern___realpathat() with the path taken from user space.
  */
 int
 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
 {
 
 	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
 	    uap->flags, UIO_USERSPACE));
 }
 
 /*
  * Retrieve the full filesystem path that correspond to a vnode from the name
  * cache (if available)
  */
 int
 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
 {
 	struct pwd *pwd;
 	char *buf;
 	size_t buflen;
 	int error;
 
 	if (__predict_false(vn == NULL))
 		return (EINVAL);
 
 	buflen = MAXPATHLEN;
 	buf = malloc(buflen, M_TEMP, M_WAITOK);
 	pwd = pwd_hold(td);
 	error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen);
 	pwd_drop(pwd);
 
 	if (!error)
 		*freebuf = buf;
 	else
 		free(buf, M_TEMP);
 	return (error);
 }
 
 /*
  * This function is similar to vn_fullpath, but it attempts to lookup the
  * pathname relative to the global root mount point.  This is required for the
  * auditing sub-system, as audited pathnames must be absolute, relative to the
  * global root mount point.
  */
 int
 vn_fullpath_global(struct thread *td, struct vnode *vn,
     char **retbuf, char **freebuf)
 {
 	char *buf;
 	size_t buflen;
 	int error;
 
 	if (__predict_false(vn == NULL))
 		return (EINVAL);
 	buflen = MAXPATHLEN;
 	buf = malloc(buflen, M_TEMP, M_WAITOK);
 	error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen);
 	if (!error)
 		*freebuf = buf;
 	else
 		free(buf, M_TEMP);
 	return (error);
 }
 
 /*
  * Step one component up the path of *vp.
  *
  * The name of *vp is written into buf ending at offset *buflen (the buffer
  * is filled back to front) and *buflen is reduced accordingly.  *vp is then
  * replaced by the parent directory.  The name cache is consulted first;
  * on a miss the filesystem is asked through VOP_VPTOCNP().
  *
  * The caller must hold a reference on *vp.  That reference is always
  * consumed: on success it is exchanged for a reference on the new *vp, on
  * failure it is simply dropped.
  *
  * Returns 0 on success, ENOMEM if the cached name does not fit, ENOENT if
  * the parent turned out to be doomed, or the error from VOP_VPTOCNP().
  */
 int
 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
 {
 	struct vnode *dvp;
 	struct namecache *ncp;
 	struct mtx *vlp;
 	int error;
 
 	vlp = VP2VNODELOCK(*vp);
 	mtx_lock(vlp);
 	/* Pick the first entry naming *vp that is not a ".." entry. */
 	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 			break;
 	}
 	if (ncp != NULL) {
 		if (*buflen < ncp->nc_nlen) {
 			mtx_unlock(vlp);
 			vrele(*vp);
 			counter_u64_add(numfullpathfail4, 1);
 			error = ENOMEM;
 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
 			    vp, NULL);
 			return (error);
 		}
 		*buflen -= ncp->nc_nlen;
 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
 		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
 		    ncp->nc_name, vp);
 		/* Here dvp temporarily holds the old *vp so it can be released. */
 		dvp = *vp;
 		*vp = ncp->nc_dvp;
 		vref(*vp);
 		mtx_unlock(vlp);
 		vrele(dvp);
 		return (0);
 	}
 	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
 
 	/* Cache miss: fall back to the filesystem. */
 	mtx_unlock(vlp);
 	vn_lock(*vp, LK_SHARED | LK_RETRY);
 	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
 	vput(*vp);
 	if (error) {
 		counter_u64_add(numfullpathfail2, 1);
 		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
 		return (error);
 	}
 
 	*vp = dvp;
 	if (VN_IS_DOOMED(dvp)) {
 		/* forced unmount */
 		vrele(dvp);
 		error = ENOENT;
 		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
 		return (error);
 	}
 	/*
 	 * *vp has its use count incremented still.
 	 */
 
 	return (0);
 }
 
 /*
  * Resolve a directory to a pathname.
  *
  * The name of the directory can always be found in the namecache or fetched
  * from the filesystem. There is also guaranteed to be only one parent, meaning
  * we can just follow vnodes up until we find the root.
  *
  * The vnode must be referenced.
  *
  * The path is built back to front in buf, starting at offset *len.  The walk
  * stops at rdir or at rootvnode, whichever is reached first.  When
  * slash_prefixed is true the caller has already stored a path tail
  * (beginning with '/') at buf + *len; otherwise a terminating NUL is placed
  * first.
  *
  * The reference on vp is consumed on both success and failure.  On success
  * *retbuf points at the start of the path and *len is set to the number of
  * bytes produced by this function plus addend.
  */
 static int
 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend)
 {
 #ifdef KDTRACE_HOOKS
 	struct vnode *startvp = vp;
 #endif
 	struct vnode *vp1;
 	size_t buflen;
 	int error;
 
 	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
 	VNPASS(vp->v_usecount > 0, vp);
 
 	/* buflen is the write offset: free space remaining at the front. */
 	buflen = *len;
 
 	if (!slash_prefixed) {
 		MPASS(*len >= 2);
 		buflen--;
 		buf[buflen] = '\0';
 	}
 
 	error = 0;
 
 	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
 	counter_u64_add(numfullpathcalls, 1);
 	while (vp != rdir && vp != rootvnode) {
 		/*
 		 * The vp vnode must be already fully constructed,
 		 * since it is either found in namecache or obtained
 		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
 		 * without obtaining the vnode lock.
 		 */
 		if ((vp->v_vflag & VV_ROOT) != 0) {
 			vn_lock(vp, LK_RETRY | LK_SHARED);
 
 			/*
 			 * With the vnode locked, check for races with
 			 * unmount, forced or not.  Note that we
 			 * already verified that vp is not equal to
 			 * the root vnode, which means that
 			 * mnt_vnodecovered can be NULL only for the
 			 * case of unmount.
 			 */
 			if (VN_IS_DOOMED(vp) ||
 			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
 			    vp1->v_mountedhere != vp->v_mount) {
 				vput(vp);
 				error = ENOENT;
 				SDT_PROBE3(vfs, namecache, fullpath, return,
 				    error, vp, NULL);
 				break;
 			}
 
 			/* Cross the mount point: continue from the covered vnode. */
 			vref(vp1);
 			vput(vp);
 			vp = vp1;
 			continue;
 		}
 		if (vp->v_type != VDIR) {
 			vrele(vp);
 			counter_u64_add(numfullpathfail1, 1);
 			error = ENOTDIR;
 			SDT_PROBE3(vfs, namecache, fullpath, return,
 			    error, vp, NULL);
 			break;
 		}
 		/* On failure vn_vptocnp() has already dropped the reference. */
 		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
 		if (error)
 			break;
 		/* Need one more byte for the '/' separator. */
 		if (buflen == 0) {
 			vrele(vp);
 			error = ENOMEM;
 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
 			    startvp, NULL);
 			break;
 		}
 		buf[--buflen] = '/';
 		slash_prefixed = true;
 	}
 	if (error)
 		return (error);
 	/* No component was emitted: the path is just "/". */
 	if (!slash_prefixed) {
 		if (buflen == 0) {
 			vrele(vp);
 			counter_u64_add(numfullpathfail4, 1);
 			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
 			    startvp, NULL);
 			return (ENOMEM);
 		}
 		buf[--buflen] = '/';
 	}
 	counter_u64_add(numfullpathfound, 1);
 	vrele(vp);
 
 	*retbuf = buf + buflen;
 	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
 	/* Bytes written here, plus whatever the caller had stored before. */
 	*len -= buflen;
 	*len += addend;
 	return (0);
 }
 
 /*
  * Translate any vnode (directory or not) into a pathname.
  *
  * Caveats:
  * - hardlinks are not tracked, so for a non-directory the result may differ
  *   from the path which was used to find the vnode
  * - the namecache is optional, so the name may be missing from it, in which
  *   case the translation fails
  *
  * For a non-directory the last component is resolved here and stored at the
  * end of buf, the remainder is delegated to vn_fullpath_dir().  *buflen must
  * be at least 2 or EINVAL is returned.
  */
 static int
 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, size_t *buflen)
 {
 	size_t initial_len, used;
 	bool have_slash;
 	int error;
 
 	if (*buflen < 2)
 		return (EINVAL);
 
 	initial_len = *buflen;
 	have_slash = false;
 
 	vref(vp);
 	if (vp->v_type != VDIR) {
 		/* Terminator first, the name gets put in front of it. */
 		buf[--(*buflen)] = '\0';
 		error = vn_vptocnp(&vp, td->td_ucred, buf, buflen);
 		if (error != 0)
 			return (error);
 		if (*buflen == 0) {
 			vrele(vp);
 			return (ENOMEM);
 		}
 		buf[--(*buflen)] = '/';
 		have_slash = true;
 	}
 
 	/* Bytes already consumed above, to be accounted for in the result. */
 	used = initial_len - *buflen;
 	return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, have_slash,
 	    used));
 }
 
 /*
  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
  *
  * Since the namecache does not track hardlinks, the caller is expected to first
  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
  *
  * Then we have 2 cases:
  * - if the found vnode is a directory, the path can be constructed just by
  *   following names up the chain
  * - otherwise we populate the buffer with the saved name and start resolving
  *   from the parent
  *
  * *buflen is the requested buffer size; it is clamped to MAXPATHLEN and must
  * be at least 2 (EINVAL otherwise).  On success *retbuf points at the path,
  * *freebuf at the buffer allocated here from M_TEMP (not freed by this
  * function in that case) and *buflen is updated by vn_fullpath_dir().
  * On failure the buffer is freed here.
  */
 static int
 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
     char **freebuf, size_t *buflen)
 {
 	char *buf, *tmpbuf;
 	struct pwd *pwd;
 	struct componentname *cnp;
 	struct vnode *vp;
 	size_t addend;
 	int error;
 	bool slash_prefixed;
 
 	if (*buflen < 2)
 		return (EINVAL);
 	if (*buflen > MAXPATHLEN)
 		*buflen = MAXPATHLEN;
 
 	slash_prefixed = false;
 
 	buf = malloc(*buflen, M_TEMP, M_WAITOK);
 	pwd = pwd_hold(td);
 
 	addend = 0;
 	vp = ndp->ni_vp;
 	if (vp->v_type != VDIR) {
 		cnp = &ndp->ni_cnd;
 		/* Room for '/', the saved name and the terminating NUL. */
 		addend = cnp->cn_namelen + 2;
 		if (*buflen < addend) {
 			error = ENOMEM;
 			goto out_bad;
 		}
 		/* Store "/<name>\0" at the very end of the buffer. */
 		*buflen -= addend;
 		tmpbuf = buf + *buflen;
 		tmpbuf[0] = '/';
 		memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
 		tmpbuf[addend - 1] = '\0';
 		slash_prefixed = true;
 		/* The rest of the path is that of the parent directory. */
 		vp = ndp->ni_dvp;
 	}
 
 	/* vn_fullpath_dir() expects a referenced vnode. */
 	vref(vp);
 	error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen,
 	    slash_prefixed, addend);
 	if (error != 0)
 		goto out_bad;
 
 	pwd_drop(pwd);
 	*freebuf = buf;
 
 	return (0);
 out_bad:
 	pwd_drop(pwd);
 	free(buf, M_TEMP);
 	return (error);
 }
 
 struct vnode *
 vn_dir_dd_ino(struct vnode *vp)
 {
 	struct namecache *ncp;
 	struct vnode *ddvp;
 	struct mtx *vlp;
 	enum vgetstate vs;
 
 	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
 	vlp = VP2VNODELOCK(vp);
 	mtx_lock(vlp);
 	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
 			continue;
 		ddvp = ncp->nc_dvp;
 		vs = vget_prep(ddvp);
 		mtx_unlock(vlp);
 		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
 			return (NULL);
 		return (ddvp);
 	}
 	mtx_unlock(vlp);
 	return (NULL);
 }
 
 /*
  * Copy the name of the first name cache entry resolving to vp which is not
  * a ".." entry into buf, truncating it to fit and always NUL-terminating.
  *
  * Returns 0 on success, ENOENT if there is no such entry and EINVAL if
  * buflen is 0 (there is no room even for the terminator).
  */
 int
 vn_commname(struct vnode *vp, char *buf, u_int buflen)
 {
 	struct namecache *ncp;
 	struct mtx *vlp;
 	int l;
 
 	/*
 	 * With an empty buffer "buflen - 1" below would wrap around to
 	 * UINT_MAX and both the copy and the terminator would land outside
 	 * of the buffer.
 	 */
 	if (buflen == 0)
 		return (EINVAL);
 
 	vlp = VP2VNODELOCK(vp);
 	mtx_lock(vlp);
 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 			break;
 	if (ncp == NULL) {
 		mtx_unlock(vlp);
 		return (ENOENT);
 	}
 	/* Leave one byte for the terminator. */
 	l = min(ncp->nc_nlen, buflen - 1);
 	memcpy(buf, ncp->nc_name, l);
 	mtx_unlock(vlp);
 	buf[l] = '\0';
 	return (0);
 }
 
 /*
  * This function updates path string to vnode's full global path
  * and checks the size of the new path string against the pathlen argument.
  *
  * Requires a locked, referenced vnode.
  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
  *
  * If vp is a directory, the call to vn_fullpath_global() always succeeds
  * because it falls back to the ".." lookup if the namecache lookup fails.
  *
  * NOTE(review): nothing in this function produces ENODEV explicitly; on the
  * success path the lock is the one obtained by namei() with LOCKLEAF.  On
  * every failure path visible here the caller's reference is released.
  */
 int
 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
     u_int pathlen)
 {
 	struct nameidata nd;
 	struct vnode *vp1;
 	char *rpath, *fbuf;
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 
 	/* Construct global filesystem path from vp. */
 	VOP_UNLOCK(vp);
 	error = vn_fullpath_global(td, vp, &rpath, &fbuf);
 
 	if (error != 0) {
 		vrele(vp);
 		return (error);
 	}
 
 	/* The result, including the terminator, has to fit into path. */
 	if (strlen(rpath) >= pathlen) {
 		vrele(vp);
 		error = ENAMETOOLONG;
 		goto out;
 	}
 
 	/*
 	 * Re-lookup the vnode by path to detect a possible rename.
 	 * As a side effect, the vnode is relocked.
 	 * If vnode was renamed, return ENOENT.
 	 */
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 	    UIO_SYSSPACE, path, td);
 	error = namei(&nd);
 	if (error != 0) {
 		vrele(vp);
 		goto out;
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp1 = nd.ni_vp;
 	/*
 	 * Drop the caller's reference; if the lookup found the same vnode,
 	 * the reference and the lock obtained by namei() take its place.
 	 */
 	vrele(vp);
 	if (vp1 == vp)
 		strcpy(path, rpath);
 	else {
 		vput(vp1);
 		error = ENOENT;
 	}
 
 out:
 	free(fbuf, M_TEMP);
 	return (error);
 }
 
 #ifdef DDB
 /*
  * Print the chain of vnodes leading from vp towards the root, one vnode per
  * line, using the first name cache entry of each vnode.  The walk ends at
  * rootvnode or at a vnode without any name cache entry.
  */
 static void
 db_print_vpath(struct vnode *vp)
 {
 	struct namecache *ncp;
 	int i;
 
 	while (vp != NULL) {
 		db_printf("%p: ", vp);
 		if (vp == rootvnode) {
 			db_printf("/");
 			vp = NULL;
 		} else if (vp->v_vflag & VV_ROOT) {
 			/* Continue with the vnode the filesystem covers. */
 			db_printf("<mount point>");
 			vp = vp->v_mount->mnt_vnodecovered;
 		} else if ((ncp = TAILQ_FIRST(&vp->v_cache_dst)) != NULL) {
 			/* The name is not NUL-terminated, print it by length. */
 			for (i = 0; i < ncp->nc_nlen; i++)
 				db_printf("%c", ncp->nc_name[i]);
 			vp = ncp->nc_dvp;
 		} else {
 			vp = NULL;
 		}
 		db_printf("\n");
 	}
 }
 
 /*
  * DDB command "show vpath <addr>": treat addr as a struct vnode pointer and
  * print its path chain.
  */
 DB_SHOW_COMMAND(vpath, db_show_vpath)
 {
 
 	if (!have_addr) {
 		db_printf("usage: show vpath <struct vnode *>\n");
 		return;
 	}
 
 	db_print_vpath((struct vnode *)addr);
 }
 
 #endif
-
-extern uma_zone_t namei_zone;
 
 /*
  * Run-time switch for the fast path lookup, exported read-write as a bool
  * sysctl under the vfs node.  While it is false cache_can_fplookup() refuses
  * every lookup.
  */
 static bool __read_frequently cache_fast_lookup = true;
 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
     &cache_fast_lookup, 0, "");
 
 /*
  * Internal return value used when the fast path did not come up with a final
  * answer (the lookup got aborted or was only partially resolved).  It is
  * never a valid error to report through cache_fpl_handled().
  */
 #define CACHE_FPL_FAILED	-2020
 
 /*
  * Return the pathname buffer of cnp to namei_zone.  With DIAGNOSTIC the stale
  * pointers into the buffer are cleared as well.
  */
 static void
 cache_fpl_cleanup_cnp(struct componentname *cnp)
 {
 
 	uma_zfree(namei_zone, cnp->cn_pnbuf);
 #ifdef DIAGNOSTIC
 	cnp->cn_nameptr = NULL;
 	cnp->cn_pnbuf = NULL;
 #endif
 }
 
 /*
  * Skip all leading slashes of the path (adjusting ni_pathlen accordingly)
  * and hand back the lookup root directory in *dpp.
  */
 static void
 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
 {
 	struct componentname *cnp;
 
 	cnp = &ndp->ni_cnd;
 	for (; cnp->cn_nameptr[0] == '/'; cnp->cn_nameptr++)
 		ndp->ni_pathlen--;
 
 	*dpp = ndp->ni_rootdir;
 }
 
 /*
  * Components of nameidata (or objects it can point to) which may
  * need restoring in case fast path lookup fails.
  *
  * Filled by cache_fpl_checkpoint() and written back by cache_fpl_restore().
  */
 struct nameidata_saved {
 	long cn_namelen;	/* copy of ni_cnd.cn_namelen */
 	char *cn_nameptr;	/* copy of ni_cnd.cn_nameptr */
 	size_t ni_pathlen;	/* copy of ni_pathlen */
 	int cn_flags;		/* copy of ni_cnd.cn_flags */
 };
 
 /*
  * State of a single fast path lookup, passed between the cache_fplookup_*
  * routines.
  */
 struct cache_fpl {
 	struct nameidata *ndp;		/* the lookup being serviced */
 	struct componentname *cnp;	/* presumably &ndp->ni_cnd; confirm */
 	struct pwd *pwd;		/* compared against the thread's pwd
 					   in cache_fplookup_partial_setup() */
 	struct vnode *dvp;		/* directory currently searched */
 	struct vnode *tvp;		/* result of the current step */
 	seqc_t dvp_seqc;		/* sequence counter snapshot of dvp */
 	seqc_t tvp_seqc;		/* sequence counter snapshot of tvp */
 	struct nameidata_saved snd;	/* last checkpoint, see
 					   cache_fpl_checkpoint() */
 	int line;			/* source line which set the status */
 	enum cache_fpl_status status:8;	/* outcome, set exactly once except
 					   for partial -> aborted */
 	bool in_smr;			/* tracks the vfs SMR section, see the
 					   cache_fpl_smr_* macros */
 };
 
 static void
 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
 {
 
 	snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
 	snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
 	snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
 	snd->ni_pathlen = fpl->ndp->ni_pathlen;
 }
 
 static void
 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
 {
 
 	fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
 	fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
 	fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
 	fpl->ndp->ni_pathlen = snd->ni_pathlen;
 }
 
 /*
  * Assertions about the SMR state of the lookup.  Both the bookkeeping flag in
  * struct cache_fpl and the real state are checked; without INVARIANTS they
  * compile to nothing.
  */
 #ifdef INVARIANTS
 #define cache_fpl_smr_assert_entered(fpl) ({			\
 	struct cache_fpl *_fpl = (fpl);				\
 	MPASS(_fpl->in_smr == true);				\
 	VFS_SMR_ASSERT_ENTERED();				\
 })
 #define cache_fpl_smr_assert_not_entered(fpl) ({		\
 	struct cache_fpl *_fpl = (fpl);				\
 	MPASS(_fpl->in_smr == false);				\
 	VFS_SMR_ASSERT_NOT_ENTERED();				\
 })
 #else
 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
 #endif
 
 /*
  * Enter the SMR section for the first time.  Unlike cache_fpl_smr_enter()
  * the previous value of in_smr is not asserted.
  */
 #define cache_fpl_smr_enter_initial(fpl) ({			\
 	struct cache_fpl *_fpl = (fpl);				\
 	vfs_smr_enter();					\
 	_fpl->in_smr = true;					\
 })
 
 /* Re-enter the SMR section; the lookup must currently be outside of it. */
 #define cache_fpl_smr_enter(fpl) ({				\
 	struct cache_fpl *_fpl = (fpl);				\
 	MPASS(_fpl->in_smr == false);				\
 	vfs_smr_enter();					\
 	_fpl->in_smr = true;					\
 })
 
 /* Leave the SMR section; the lookup must currently be inside of it. */
 #define cache_fpl_smr_exit(fpl) ({				\
 	struct cache_fpl *_fpl = (fpl);				\
 	MPASS(_fpl->in_smr == true);				\
 	vfs_smr_exit();						\
 	_fpl->in_smr = false;					\
 })
 
 /*
  * Mark the lookup as aborted, remembering the source line which did so.
  * Only a lookup with no status yet or a partial one may be converted.
  *
  * Always returns CACHE_FPL_FAILED.
  */
 static int
 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
 {
 
 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET ||
 	    fpl->status == CACHE_FPL_STATUS_PARTIAL,
 	    ("%s: converting to abort from %d at %d, set at %d\n",
 	    __func__, fpl->status, line, fpl->line));
 	fpl->line = line;
 	fpl->status = CACHE_FPL_STATUS_ABORTED;
 	return (CACHE_FPL_FAILED);
 }
 
 #define cache_fpl_aborted(x)	cache_fpl_aborted_impl((x), __LINE__)
 
 /*
  * Mark the lookup as partially resolved, remembering the source line which
  * did so.  No status may be set yet and the SMR section must still be
  * entered.
  *
  * Always returns CACHE_FPL_FAILED.
  */
 static int
 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
 {
 
 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
 	    ("%s: setting to partial at %d, but already set to %d at %d\n",
 	    __func__, line, fpl->status, fpl->line));
 	cache_fpl_smr_assert_entered(fpl);
 	fpl->line = line;
 	fpl->status = CACHE_FPL_STATUS_PARTIAL;
 	return (CACHE_FPL_FAILED);
 }
 
 #define cache_fpl_partial(x)	cache_fpl_partial_impl((x), __LINE__)
 
 /*
  * Mark the lookup as fully handled with the given result, remembering the
  * source line which did so.  No status may be set yet, the SMR section must
  * have been left and error must not be CACHE_FPL_FAILED.
  *
  * Returns error unchanged.
  */
 static int
 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
 {
 
 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
 	    ("%s: setting to handled at %d, but already set to %d at %d\n",
 	    __func__, line, fpl->status, fpl->line));
 	cache_fpl_smr_assert_not_entered(fpl);
 	MPASS(error != CACHE_FPL_FAILED);
 	fpl->line = line;
 	fpl->status = CACHE_FPL_STATUS_HANDLED;
 	return (error);
 }
 
 #define cache_fpl_handled(x, e)	cache_fpl_handled_impl((x), (e), __LINE__)
 
 /*
  * Flags a caller may pass in cn_flags for the fast path to be attempted;
  * cache_can_fplookup() aborts if anything else is set.
  */
 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
 	(LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
 	 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2)
 
 /*
  * Flags which are not accepted from the caller; MAKEENTRY, ISLASTCN and
  * ISDOTDOT get set by cache_fplookup_partial_setup() when handing over to
  * the regular lookup.
  */
 #define CACHE_FPL_INTERNAL_CN_FLAGS \
 	(ISDOTDOT | MAKEENTRY | ISLASTCN)
 
 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
     "supported and internal flags overlap");
 
 /*
  * Tell whether the component just parsed is the last one, i.e. nothing but
  * the terminator follows it.
  */
 static bool
 cache_fpl_islastcn(struct nameidata *ndp)
 {
 
 	return (ndp->ni_next[0] == '\0');
 }
 
 /*
  * Tell whether the current component is exactly "..".
  */
 static bool
 cache_fpl_isdotdot(struct componentname *cnp)
 {
 
 	return (cnp->cn_namelen == 2 &&
 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.');
 }
 
 /*
  * Decide whether the fast path may be attempted for this lookup at all.
  *
  * Returns true if so.  Otherwise the lookup is marked as aborted and false
  * is returned.  Each condition has its own abort call so that the recorded
  * source line identifies the reason.
  */
 static bool
 cache_can_fplookup(struct cache_fpl *fpl)
 {
 	struct nameidata *ndp;
 	struct componentname *cnp;
 	struct thread *td;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 	td = cnp->cn_thread;
 
 	/* Disabled by the administrator. */
 	if (!cache_fast_lookup) {
 		cache_fpl_aborted(fpl);
 		return (false);
 	}
 #ifdef MAC
 	if (mac_vnode_check_lookup_enabled()) {
 		cache_fpl_aborted(fpl);
 		return (false);
 	}
 #endif
 	/* The caller asked for something the fast path does not implement. */
 	if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
 		cache_fpl_aborted(fpl);
 		return (false);
 	}
 	/* Only lookups relative to the current directory are handled. */
 	if (ndp->ni_dirfd != AT_FDCWD) {
 		cache_fpl_aborted(fpl);
 		return (false);
 	}
 	if (IN_CAPABILITY_MODE(td)) {
 		cache_fpl_aborted(fpl);
 		return (false);
 	}
 	if (AUDITING_TD(td)) {
 		cache_fpl_aborted(fpl);
 		return (false);
 	}
 	/* An explicit starting directory is not handled either. */
 	if (ndp->ni_startdir != NULL) {
 		cache_fpl_aborted(fpl);
 		return (false);
 	}
 	return (true);
 }
 
 static bool
 cache_fplookup_vnode_supported(struct vnode *vp)
 {
 
 	return (vp->v_type != VLNK);
 }
 
 /*
  * Move a negative entry to the hot list.
  *
  * We have to take locks, but they may be contended and in the worst
  * case we may need to go off CPU. We don't want to spin within the
  * smr section and we can't block with it. Instead we are going to
  * look up the entry again.
  *
  * oncp is the entry found by the caller and hash the hash it was found
  * with.  On success the lookup is marked as handled and ENOENT is returned;
  * otherwise it is marked as aborted.  In both cases the SMR section has been
  * left by the time the status gets set.
  */
 static int __noinline
 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
     uint32_t hash)
 {
 	struct componentname *cnp;
 	struct namecache *ncp;
 	struct neglist *neglist;
 	struct negstate *negstate;
 	struct vnode *dvp;
 	u_char nc_flag;
 
 	cnp = fpl->cnp;
 	dvp = fpl->dvp;
 
 	/* Keep dvp around while we are outside of the SMR section. */
 	if (!vhold_smr(dvp))
 		return (cache_fpl_aborted(fpl));
 
 	/* Compute the list while oncp is still protected by SMR. */
 	neglist = NCP2NEGLIST(oncp);
 	cache_fpl_smr_exit(fpl);
 
 	/* The hot list lock is taken before the per-entry list lock. */
 	mtx_lock(&ncneg_hot.nl_lock);
 	mtx_lock(&neglist->nl_lock);
 	/*
 	 * For hash iteration.
 	 */
 	cache_fpl_smr_enter(fpl);
 
 	/*
 	 * Avoid all surprises by only succeeding if we got the same entry and
 	 * bailing completely otherwise.
 	 *
 	 * In particular at this point there can be a new ncp which matches the
 	 * search but hashes to a different neglist.
 	 */
 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		if (ncp == oncp)
 			break;
 	}
 
 	/*
 	 * No match to begin with.
 	 */
 	if (__predict_false(ncp == NULL)) {
 		goto out_abort;
 	}
 
 	/*
 	 * The newly found entry may be something different...
 	 */
 	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
 		goto out_abort;
 	}
 
 	/*
 	 * ... and not even negative.
 	 */
 	nc_flag = atomic_load_char(&ncp->nc_flag);
 	if ((nc_flag & NCF_NEGATIVE) == 0) {
 		goto out_abort;
 	}
 
 	if (__predict_false(!cache_ncp_canuse(ncp))) {
 		goto out_abort;
 	}
 
 	/* Somebody else may have promoted the entry in the meantime. */
 	negstate = NCP2NEGSTATE(ncp);
 	if ((negstate->neg_flag & NEG_HOT) == 0) {
 		numhotneg++;
 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
 		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
 		negstate->neg_flag |= NEG_HOT;
 	}
 
 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
 	counter_u64_add(numneghits, 1);
 	cache_fpl_smr_exit(fpl);
 	mtx_unlock(&neglist->nl_lock);
 	mtx_unlock(&ncneg_hot.nl_lock);
 	vdrop(dvp);
 	return (cache_fpl_handled(fpl, ENOENT));
 out_abort:
 	cache_fpl_smr_exit(fpl);
 	mtx_unlock(&neglist->nl_lock);
 	mtx_unlock(&ncneg_hot.nl_lock);
 	vdrop(dvp);
 	return (cache_fpl_aborted(fpl));
 }
 
 /*
  * The target vnode is not supported, prepare for the slow path to take over.
  *
  * Called with the SMR section entered, which is left here.  On success 0 is
  * returned, the nameidata is rolled back to the last checkpoint and dvp is
  * installed, referenced, as ni_startdir.  Otherwise the lookup is marked as
  * aborted.
  *
  * NOTE(review): on success the hold obtained with pwd_hold() is not dropped
  * here; presumably it is handed over to the caller -- confirm.
  */
 static int __noinline
 cache_fplookup_partial_setup(struct cache_fpl *fpl)
 {
 	struct nameidata *ndp;
 	struct componentname *cnp;
 	enum vgetstate dvs;
 	struct vnode *dvp;
 	struct pwd *pwd;
 	seqc_t dvp_seqc;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 	dvp = fpl->dvp;
 	dvp_seqc = fpl->dvp_seqc;
 
 	dvs = vget_prep_smr(dvp);
 	if (__predict_false(dvs == VGET_NONE)) {
 		cache_fpl_smr_exit(fpl);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	cache_fpl_smr_exit(fpl);
 
 	/* Secure dvp, then make sure it did not change under us. */
 	vget_finish_ref(dvp, dvs);
 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 		vrele(dvp);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	/* The pwd the lookup started with must still be the current one. */
 	pwd = pwd_hold(curthread);
 	if (fpl->pwd != pwd) {
 		vrele(dvp);
 		pwd_drop(pwd);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	cache_fpl_restore(fpl, &fpl->snd);
 
 	/* Set up what the regular lookup expects to find. */
 	ndp->ni_startdir = dvp;
 	cnp->cn_flags |= MAKEENTRY;
 	if (cache_fpl_islastcn(ndp))
 		cnp->cn_flags |= ISLASTCN;
 	if (cache_fpl_isdotdot(cnp))
 		cnp->cn_flags |= ISDOTDOT;
 
 	return (0);
 }
 
 static int
 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
 {
 	struct componentname *cnp;
 	struct vnode *tvp;
 	seqc_t tvp_seqc;
 	int error, lkflags;
 
 	cnp = fpl->cnp;
 	tvp = fpl->tvp;
 	tvp_seqc = fpl->tvp_seqc;
 
 	if ((cnp->cn_flags & LOCKLEAF) != 0) {
 		lkflags = LK_SHARED;
 		if ((cnp->cn_flags & LOCKSHARED) == 0)
 			lkflags = LK_EXCLUSIVE;
 		error = vget_finish(tvp, lkflags, tvs);
 		if (__predict_false(error != 0)) {
 			return (cache_fpl_aborted(fpl));
 		}
 	} else {
 		vget_finish_ref(tvp, tvs);
 	}
 
 	if (!vn_seqc_consistent(tvp, tvp_seqc)) {
 		if ((cnp->cn_flags & LOCKLEAF) != 0)
 			vput(tvp);
 		else
 			vrele(tvp);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	return (cache_fpl_handled(fpl, 0));
 }
 
 /*
  * The operation is not a plain LOOKUP, so the caller may want to modify the
  * state of the namecache.
  *
  * No attempt is made to honour the API contract here; the lookup is simply
  * handed over to the regular path as a partial one.
  * TODO: this leaves scalability on the table
  */
 static int
 cache_fplookup_final_modifying(struct cache_fpl *fpl)
 {
 
 	MPASS(fpl->cnp->cn_nameiop != LOOKUP);
 	return (cache_fpl_partial(fpl));
 }
 
 /*
  * Finish a lookup for which the caller also wants the parent directory
  * (LOCKPARENT or WANTPARENT).
  *
  * Both vnodes are prepared for acquire within the SMR section, which is then
  * left.  The parent gets locked exclusively (LOCKPARENT) or just referenced
  * and is validated against its sequence counter snapshot; the child is dealt
  * with by cache_fplookup_final_child().
  *
  * Returns 0 with the lookup marked as handled, or marks it as aborted with
  * everything acquired so far released.
  */
 static int __noinline
 cache_fplookup_final_withparent(struct cache_fpl *fpl)
 {
 	struct componentname *cnp;
 	enum vgetstate dvs, tvs;
 	struct vnode *dvp, *tvp;
 	seqc_t dvp_seqc;
 	int error;
 
 	cnp = fpl->cnp;
 	dvp = fpl->dvp;
 	dvp_seqc = fpl->dvp_seqc;
 	tvp = fpl->tvp;
 
 	MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
 
 	/*
 	 * This is less efficient than it can be for simplicity.
 	 */
 	dvs = vget_prep_smr(dvp);
 	if (__predict_false(dvs == VGET_NONE)) {
 		return (cache_fpl_aborted(fpl));
 	}
 	tvs = vget_prep_smr(tvp);
 	if (__predict_false(tvs == VGET_NONE)) {
 		cache_fpl_smr_exit(fpl);
 		vget_abort(dvp, dvs);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	cache_fpl_smr_exit(fpl);
 
 	if ((cnp->cn_flags & LOCKPARENT) != 0) {
 		error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
 		if (__predict_false(error != 0)) {
 			vget_abort(tvp, tvs);
 			return (cache_fpl_aborted(fpl));
 		}
 	} else {
 		vget_finish_ref(dvp, dvs);
 	}
 
 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 		vget_abort(tvp, tvs);
 		if ((cnp->cn_flags & LOCKPARENT) != 0)
 			vput(dvp);
 		else
 			vrele(dvp);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	/* The child validates itself against fpl->tvp_seqc. */
 	error = cache_fplookup_final_child(fpl, tvs);
 	if (__predict_false(error != 0)) {
 		MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
 		if ((cnp->cn_flags & LOCKPARENT) != 0)
 			vput(dvp);
 		else
 			vrele(dvp);
 		return (error);
 	}
 
 	MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
 	return (0);
 }
 
 /*
  * Handle the last path component once the target vnode has been found.
  *
  * Anything but a plain LOOKUP is handed over to the regular lookup; requests
  * for the parent are dealt with by cache_fplookup_final_withparent().
  * Otherwise the target is prepared for acquire, the parent is validated
  * against its sequence counter snapshot, the SMR section is left and the
  * acquire is completed by cache_fplookup_final_child().
  */
 static int
 cache_fplookup_final(struct cache_fpl *fpl)
 {
 	struct componentname *cnp;
 	enum vgetstate tvs;
 	struct vnode *dvp, *tvp;
 	seqc_t dvp_seqc;
 
 	cnp = fpl->cnp;
 	dvp = fpl->dvp;
 	dvp_seqc = fpl->dvp_seqc;
 	tvp = fpl->tvp;
 
 	VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
 
 	if (cnp->cn_nameiop != LOOKUP) {
 		return (cache_fplookup_final_modifying(fpl));
 	}
 
 	if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
 		return (cache_fplookup_final_withparent(fpl));
 
 	tvs = vget_prep_smr(tvp);
 	if (__predict_false(tvs == VGET_NONE)) {
 		return (cache_fpl_partial(fpl));
 	}
 
 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 		cache_fpl_smr_exit(fpl);
 		vget_abort(tvp, tvs);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	cache_fpl_smr_exit(fpl);
 	return (cache_fplookup_final_child(fpl, tvs));
 }
 
 /*
  * Handle the "." component: the target is the directory itself.
  *
  * Returns 0 with fpl->tvp and fpl->tvp_seqc set, or marks the lookup as
  * aborted if the directory is being modified.
  */
 static int __noinline
 cache_fplookup_dot(struct cache_fpl *fpl)
 {
 	struct vnode *dir;
 
 	dir = fpl->dvp;
 
 	fpl->tvp = dir;
 	fpl->tvp_seqc = vn_seqc_read_any(dir);
 	if (seqc_in_modify(fpl->tvp_seqc))
 		return (cache_fpl_aborted(fpl));
 
 	counter_u64_add(dothits, 1);
 	SDT_PROBE3(vfs, namecache, lookup, hit, dir, ".", dir);
 
 	return (0);
 }
 
 /*
  * Handle the ".." component.
  *
  * If the directory is the lookup root, ni_topdir, rootvnode or the root of a
  * prison in the credential's prison chain, ".." resolves to the directory
  * itself.  Otherwise the parent is taken from the entry hanging off
  * v_cache_dd.  Crossing a mount point upwards is not implemented and aborts.
  *
  * Returns 0 with fpl->tvp and fpl->tvp_seqc set, or CACHE_FPL_FAILED with
  * the lookup marked as aborted or partial.
  */
 static int __noinline
 cache_fplookup_dotdot(struct cache_fpl *fpl)
 {
 	struct nameidata *ndp;
 	struct componentname *cnp;
 	struct namecache *ncp;
 	struct vnode *dvp;
 	struct prison *pr;
 	u_char nc_flag;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 	dvp = fpl->dvp;
 
 	/*
 	 * XXX this is racy the same way regular lookup is
 	 */
 	for (pr = cnp->cn_cred->cr_prison; pr != NULL;
 	    pr = pr->pr_parent)
 		if (dvp == pr->pr_root)
 			break;
 
 	/* pr is non-NULL only if dvp is the root of one of the prisons. */
 	if (dvp == ndp->ni_rootdir ||
 	    dvp == ndp->ni_topdir ||
 	    dvp == rootvnode ||
 	    pr != NULL) {
 		fpl->tvp = dvp;
 		fpl->tvp_seqc = vn_seqc_read_any(dvp);
 		if (seqc_in_modify(fpl->tvp_seqc)) {
 			return (cache_fpl_aborted(fpl));
 		}
 		return (0);
 	}
 
 	if ((dvp->v_vflag & VV_ROOT) != 0) {
 		/*
 		 * TODO
 		 * The opposite of climb mount is needed here.
 		 */
 		return (cache_fpl_aborted(fpl));
 	}
 
 	ncp = atomic_load_ptr(&dvp->v_cache_dd);
 	if (ncp == NULL) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	/*
 	 * The entry is either a ".." entry, whose target is the parent, or
 	 * a regular one, in which case the parent is its directory.
 	 */
 	nc_flag = atomic_load_char(&ncp->nc_flag);
 	if ((nc_flag & NCF_ISDOTDOT) != 0) {
 		if ((nc_flag & NCF_NEGATIVE) != 0)
 			return (cache_fpl_aborted(fpl));
 		fpl->tvp = ncp->nc_vp;
 	} else {
 		fpl->tvp = ncp->nc_dvp;
 	}
 
 	if (__predict_false(!cache_ncp_canuse(ncp))) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
 	if (seqc_in_modify(fpl->tvp_seqc)) {
 		return (cache_fpl_partial(fpl));
 	}
 
 	counter_u64_add(dotdothits, 1);
 	return (0);
 }
 
 /*
  * Look up the current component in the directory fpl->dvp.
  *
  * Possible outcomes:
  * - 0: positive hit, fpl->tvp and fpl->tvp_seqc are set
  * - ENOENT: hit on a hot negative entry, the lookup is marked as handled
  *   and the SMR section has been left
  * - the result of cache_fplookup_negative_promote() for a negative entry
  *   which is not hot yet
  * - CACHE_FPL_FAILED with the lookup marked as partial for everything the
  *   regular lookup has to sort out (no entry, unusable entry, whiteout,
  *   non-LOOKUP operation hitting a negative entry, target being modified or
  *   of unsupported type)
  *
  * "." is delegated to cache_fplookup_dot().
  */
 static int
 cache_fplookup_next(struct cache_fpl *fpl)
 {
 	struct componentname *cnp;
 	struct namecache *ncp;
 	struct negstate *negstate;
 	struct vnode *dvp, *tvp;
 	u_char nc_flag;
 	uint32_t hash;
 	bool neg_hot;
 
 	cnp = fpl->cnp;
 	dvp = fpl->dvp;
 
 	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
 		return (cache_fplookup_dot(fpl));
 	}
 
 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 
 	/* An entry matches on directory, name length and name. */
 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 			break;
 	}
 
 	/*
 	 * If there is no entry we have to punt to the slow path to perform
 	 * actual lookup. Should there be nothing with this name a negative
 	 * entry will be created.
 	 */
 	if (__predict_false(ncp == NULL)) {
 		return (cache_fpl_partial(fpl));
 	}
 
 	/*
 	 * Sample the contents of the entry first, its usability is checked
 	 * with cache_ncp_canuse() afterwards.
 	 */
 	tvp = atomic_load_ptr(&ncp->nc_vp);
 	nc_flag = atomic_load_char(&ncp->nc_flag);
 	if ((nc_flag & NCF_NEGATIVE) != 0) {
 		/*
 		 * If they want to create an entry we need to replace this one.
 		 */
 		if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
 			return (cache_fpl_partial(fpl));
 		}
 		negstate = NCP2NEGSTATE(ncp);
 		neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
 		if (__predict_false(!cache_ncp_canuse(ncp))) {
 			return (cache_fpl_partial(fpl));
 		}
 		if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
 			return (cache_fpl_partial(fpl));
 		}
 		if (!neg_hot) {
 			return (cache_fplookup_negative_promote(fpl, ncp, hash));
 		}
 		SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
 		    ncp->nc_name);
 		counter_u64_add(numneghits, 1);
 		cache_fpl_smr_exit(fpl);
 		return (cache_fpl_handled(fpl, ENOENT));
 	}
 
 	if (__predict_false(!cache_ncp_canuse(ncp))) {
 		return (cache_fpl_partial(fpl));
 	}
 
 	fpl->tvp = tvp;
 	fpl->tvp_seqc = vn_seqc_read_any(tvp);
 	if (seqc_in_modify(fpl->tvp_seqc)) {
 		return (cache_fpl_partial(fpl));
 	}
 
 	if (!cache_fplookup_vnode_supported(tvp)) {
 		return (cache_fpl_partial(fpl));
 	}
 
 	counter_u64_add(numposhits, 1);
 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
 	return (0);
 }
 
 /*
  * Tell whether the mount point opted into the fast path lookup, i.e. it
  * exists and has MNTK_FPLOOKUP set.
  */
 static bool
 cache_fplookup_mp_supported(struct mount *mp)
 {
 
 	return (mp != NULL && (mp->mnt_kern_flag & MNTK_FPLOOKUP) != 0);
 }
 
 /*
  * Walk up the mount stack (if any).
  *
  * Correctness is provided in the following ways:
  * - all vnodes are protected from freeing with SMR
  * - struct mount objects are type stable making them always safe to access
  * - stability of the particular mount is provided by busying it
  * - relationship between the vnode which is mounted on and the mount is
  *   verified with the vnode sequence counter after busying
  * - association between root vnode of the mount and the mount is protected
  *   by busy
  *
  * From that point on we can read the sequence counter of the root vnode
  * and get the next mount on the stack (if any) using the same protection.
  *
  * By the end of successful walk we are guaranteed the reached state was
  * indeed present at least at some point which matches the regular lookup.
  *
  * Returns 0 with fpl->tvp and fpl->tvp_seqc updated to the root vnode of the
  * topmost mount (they are left alone if nothing is mounted on the vnode).
  * Any obstacle marks the lookup as partial and leaves fpl->tvp untouched.
  */
 static int __noinline
 cache_fplookup_climb_mount(struct cache_fpl *fpl)
 {
 	struct mount *mp, *prev_mp;
 	struct vnode *vp;
 	seqc_t vp_seqc;
 
 	vp = fpl->tvp;
 	vp_seqc = fpl->tvp_seqc;
 
 	VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
 	mp = atomic_load_ptr(&vp->v_mountedhere);
 	if (mp == NULL)
 		return (0);
 
 	prev_mp = NULL;
 	for (;;) {
 		/*
 		 * Enter the next mount before leaving the previous one so
 		 * that there is no window without protection.
 		 */
 		if (!vfs_op_thread_enter_crit(mp)) {
 			if (prev_mp != NULL)
 				vfs_op_thread_exit_crit(prev_mp);
 			return (cache_fpl_partial(fpl));
 		}
 		if (prev_mp != NULL)
 			vfs_op_thread_exit_crit(prev_mp);
 		/* Is mp still what is mounted on vp? */
 		if (!vn_seqc_consistent(vp, vp_seqc)) {
 			vfs_op_thread_exit_crit(mp);
 			return (cache_fpl_partial(fpl));
 		}
 		if (!cache_fplookup_mp_supported(mp)) {
 			vfs_op_thread_exit_crit(mp);
 			return (cache_fpl_partial(fpl));
 		}
 		vp = atomic_load_ptr(&mp->mnt_rootvnode);
 		if (vp == NULL || VN_IS_DOOMED(vp)) {
 			vfs_op_thread_exit_crit(mp);
 			return (cache_fpl_partial(fpl));
 		}
 		vp_seqc = vn_seqc_read_any(vp);
 		if (seqc_in_modify(vp_seqc)) {
 			vfs_op_thread_exit_crit(mp);
 			return (cache_fpl_partial(fpl));
 		}
 		/* Something may in turn be mounted on the root vnode. */
 		prev_mp = mp;
 		mp = atomic_load_ptr(&vp->v_mountedhere);
 		if (mp == NULL)
 			break;
 	}
 
 	vfs_op_thread_exit_crit(prev_mp);
 	fpl->tvp = vp;
 	fpl->tvp_seqc = vp_seqc;
 	return (0);
 }
 
 /*
  * Tell whether the target vnode is a directory with something mounted on
  * it, in which case cache_fplookup_climb_mount() has to be called.
  */
 static bool
 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
 {
 	struct vnode *vp;
 
 	vp = fpl->tvp;
 
 	/*
 	 * Hack: while this is a union, the pointer tends to be NULL so save on
 	 * a branch.
 	 */
 	if (atomic_load_ptr(&vp->v_mountedhere) == NULL)
 		return (false);
 	return (vp->v_type == VDIR);
 }
 
 /*
  * Parse the path.
  *
  * The code is mostly copy-pasted from regular lookup, see lookup().
  * The structure is maintained along with comments for easier maintenance.
  * Deduplicating the code will become feasible after fast path lookup
  * becomes more feature-complete.
  *
  * On success 0 is returned, cn_namelen describes the component starting at
  * cn_nameptr, ni_pathlen is reduced accordingly and ni_next points behind
  * the component.  A component longer than NAME_MAX ends the lookup with
  * ENAMETOOLONG; trailing slashes and an empty component are left to the
  * regular lookup (partial).
  */
 static int
 cache_fplookup_parse(struct cache_fpl *fpl)
 {
 	struct nameidata *ndp;
 	struct componentname *cnp;
 	char *cp;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 
 	/*
 	 * Search a new directory.
 	 *
 	 * The last component of the filename is left accessible via
 	 * cnp->cn_nameptr for callers that need the name. Callers needing
 	 * the name set the SAVENAME flag. When done, they assume
 	 * responsibility for freeing the pathname buffer.
 	 */
 	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
 		continue;
 	cnp->cn_namelen = cp - cnp->cn_nameptr;
 	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
 		cache_fpl_smr_exit(fpl);
 		return (cache_fpl_handled(fpl, ENAMETOOLONG));
 	}
 	ndp->ni_pathlen -= cnp->cn_namelen;
 	/* ni_pathlen is unsigned, an underflow shows up as a huge value. */
 	KASSERT(ndp->ni_pathlen <= PATH_MAX,
 	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
 	ndp->ni_next = cp;
 
 	/*
 	 * Replace multiple slashes by a single slash and trailing slashes
 	 * by a null.  This must be done before VOP_LOOKUP() because some
 	 * fs's don't know about trailing slashes.  Remember if there were
 	 * trailing slashes to handle symlinks, existing non-directories
 	 * and non-existing files that won't be directories specially later.
 	 */
 	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
 		cp++;
 		ndp->ni_pathlen--;
 		if (*cp == '\0') {
 			/*
 			 * TODO
 			 * Regular lookup performs the following:
 			 * *ndp->ni_next = '\0';
 			 * cnp->cn_flags |= TRAILINGSLASH;
 			 *
 			 * Which is problematic since it modifies data read
 			 * from userspace. Then if fast path lookup was to
 			 * abort we would have to either restore it or convey
 			 * the flag. Since this is a corner case just ignore
 			 * it for simplicity.
 			 */
 			return (cache_fpl_partial(fpl));
 		}
 	}
 	ndp->ni_next = cp;
 
 	/*
 	 * Check for degenerate name (e.g. / or "")
 	 * which is a way of talking about a directory,
 	 * e.g. like "/." or ".".
 	 *
 	 * TODO
 	 * Another corner case handled by the regular lookup
 	 */
 	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
 		return (cache_fpl_partial(fpl));
 	}
 	return (0);
 }
 
 /*
  * Move on to the next path component: start where the previous parse
  * stopped and skip the separating slashes, accounting for them in
  * ni_pathlen.
  */
 static void
 cache_fplookup_parse_advance(struct cache_fpl *fpl)
 {
 	struct nameidata *ndp;
 	struct componentname *cnp;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 
 	for (cnp->cn_nameptr = ndp->ni_next; *cnp->cn_nameptr == '/';
 	    cnp->cn_nameptr++)
 		ndp->ni_pathlen--;
 }
 
 /*
  * Sort out a failure of VOP_FPLOOKUP_VEXEC().
  *
  * EAGAIN and EOPNOTSUPP hand the lookup over to the regular path (partial).
  * Any other error is final provided the directory did not change in the
  * meantime, otherwise the lookup is aborted.
  *
  * Returns CACHE_FPL_FAILED in the aborted case and the passed error
  * otherwise; the caller acts on the status stored in fpl.
  */
 static int __noinline
 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
 {
 
 	switch (error) {
 	case EAGAIN:
 		/*
 		 * Can happen when racing against vgone.
 		 */
 		/* FALLTHROUGH */
 	case EOPNOTSUPP:
 		cache_fpl_partial(fpl);
 		break;
 	default:
 		/*
 		 * See the API contract for VOP_FPLOOKUP_VEXEC.
 		 */
 		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
 			error = cache_fpl_aborted(fpl);
 		} else {
 			cache_fpl_smr_exit(fpl);
 			cache_fpl_handled(fpl, error);
 		}
 		break;
 	}
 	return (error);
 }
 
 /*
  * The main loop of the fast path lookup, starting at directory dvp.
  *
  * Each iteration parses one component, checks the search permission on the
  * current directory with VOP_FPLOOKUP_VEXEC(), finds the target and either
  * finishes (last component) or descends into it after validating the
  * directory against its sequence counter snapshot.
  *
  * The value computed by the loop matters only if the lookup ended up as
  * handled; otherwise the status stored in fpl decides:
  * - partial: the result of cache_fplookup_partial_setup() is returned
  * - aborted: the SMR section is left if need be and CACHE_FPL_FAILED is
  *   returned
  * - handled: the error (0 on success) is returned with ni_dvp and ni_vp
  *   filled in; the pathname buffer is freed unless the lookup succeeded
  *   with SAVENAME set
  */
 static int
 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
 {
 	struct nameidata *ndp;
 	struct componentname *cnp;
 	struct mount *mp;
 	int error;
 
 	error = CACHE_FPL_FAILED;
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 
 	cache_fpl_checkpoint(fpl, &fpl->snd);
 
 	fpl->dvp = dvp;
 	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
 	if (seqc_in_modify(fpl->dvp_seqc)) {
 		cache_fpl_aborted(fpl);
 		goto out;
 	}
 	mp = atomic_load_ptr(&fpl->dvp->v_mount);
 	if (!cache_fplookup_mp_supported(mp)) {
 		cache_fpl_aborted(fpl);
 		goto out;
 	}
 
 	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
 
 	for (;;) {
 		error = cache_fplookup_parse(fpl);
 		if (__predict_false(error != 0)) {
 			break;
 		}
 
 		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
 
 		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
 		if (__predict_false(error != 0)) {
 			error = cache_fplookup_failed_vexec(fpl, error);
 			break;
 		}
 
 		if (__predict_false(cache_fpl_isdotdot(cnp))) {
 			error = cache_fplookup_dotdot(fpl);
 			if (__predict_false(error != 0)) {
 				break;
 			}
 		} else {
 			error = cache_fplookup_next(fpl);
 			if (__predict_false(error != 0)) {
 				break;
 			}
 
 			VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
 
 			if (cache_fplookup_need_climb_mount(fpl)) {
 				error = cache_fplookup_climb_mount(fpl);
 				if (__predict_false(error != 0)) {
 					break;
 				}
 			}
 		}
 
 		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
 
 		if (cache_fpl_islastcn(ndp)) {
 			error = cache_fplookup_final(fpl);
 			break;
 		}
 
 		/* Nothing changed in the directory while we were crossing it. */
 		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
 			error = cache_fpl_aborted(fpl);
 			break;
 		}
 
 		/* Descend: the target becomes the directory to search. */
 		fpl->dvp = fpl->tvp;
 		fpl->dvp_seqc = fpl->tvp_seqc;
 
 		/*
 		 * Checkpoint at the start of the next component, this is
 		 * where the regular lookup resumes should we go partial.
 		 */
 		cache_fplookup_parse_advance(fpl);
 		cache_fpl_checkpoint(fpl, &fpl->snd);
 	}
 out:
 	switch (fpl->status) {
 	case CACHE_FPL_STATUS_UNSET:
 		/* Every way out of the loop sets a status. */
 		__assert_unreachable();
 		break;
 	case CACHE_FPL_STATUS_PARTIAL:
 		cache_fpl_smr_assert_entered(fpl);
 		return (cache_fplookup_partial_setup(fpl));
 	case CACHE_FPL_STATUS_ABORTED:
 		/* Abort sites are not required to leave the SMR section. */
 		if (fpl->in_smr)
 			cache_fpl_smr_exit(fpl);
 		return (CACHE_FPL_FAILED);
 	case CACHE_FPL_STATUS_HANDLED:
 		MPASS(error != CACHE_FPL_FAILED);
 		cache_fpl_smr_assert_not_entered(fpl);
 		if (__predict_false(error != 0)) {
 			ndp->ni_dvp = NULL;
 			ndp->ni_vp = NULL;
 			cache_fpl_cleanup_cnp(cnp);
 			return (error);
 		}
 		ndp->ni_dvp = fpl->dvp;
 		ndp->ni_vp = fpl->tvp;
 		/* With SAVENAME the buffer is handed over to the caller. */
 		if (cnp->cn_flags & SAVENAME)
 			cnp->cn_flags |= HASBUF;
 		else
 			cache_fpl_cleanup_cnp(cnp);
 		return (error);
 	}
 }
 
 /*
  * Fast path lookup protected with SMR and sequence counters.
  *
  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
  *
  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
  * outlined below.
  *
  * Traditional vnode lookup conceptually looks like this:
  *
  * vn_lock(current);
  * for (;;) {
  *	next = find();
  *	vn_lock(next);
  *	vn_unlock(current);
  *	current = next;
  *	if (last)
  *	    break;
  * }
  * return (current);
  *
  * Each jump to the next vnode is safe memory-wise and atomic with respect to
  * any modifications thanks to holding respective locks.
  *
  * The same guarantee can be provided with a combination of safe memory
  * reclamation and sequence counters instead. If all operations which affect
  * the relationship between the current vnode and the one we are looking for
  * also modify the counter, we can verify whether all the conditions held as
  * we made the jump. This includes things like permissions, mount points etc.
  * Counter modification is provided by enclosing relevant places in
  * vn_seqc_write_begin()/end() calls.
  *
  * Thus this translates to:
  *
  * vfs_smr_enter();
  * dvp_seqc = seqc_read_any(dvp);
  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
  *     abort();
  * for (;;) {
  * 	tvp = find();
  * 	tvp_seqc = seqc_read_any(tvp);
  * 	if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
  * 	    abort();
  * 	if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
  * 	    abort();
  * 	dvp = tvp; // we know nothing of importance has changed
  * 	dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
  * 	if (last)
  * 	    break;
  * }
  * vget(); // secure the vnode
  * if (!seqc_consistent(tvp, tvp_seqc)) // final check
  * 	    abort();
  * // at this point we know nothing has changed for any parent<->child pair
  * // as they were crossed during the lookup, meaning we matched the guarantee
  * // of the locked variant
  * return (tvp);
  *
  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
  * - they are called while within vfs_smr protection which they must never exit
  * - EAGAIN can be returned to denote checking could not be performed, it is
  *   always valid to return it
  * - if the sequence counter has not changed the result must be valid
  * - if the sequence counter has changed both false positives and false negatives
  *   are permitted (since the result will be rejected later)
  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
  *
  * Caveats to watch out for:
  * - vnodes are passed unlocked and unreferenced with nothing stopping
  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
  *   to use atomic_load_ptr to fetch it.
  * - the aforementioned object can also get freed, meaning absent other means it
  *   should be protected with vfs_smr
  * - either safely checking permissions as they are modified or guaranteeing
  *   their stability is left to the routine
  */
 int
 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
     struct pwd **pwdp)
 {
 	struct cache_fpl fpl;
 	struct pwd *pwd;
 	struct vnode *dvp;
 	struct componentname *cnp;
 	struct nameidata_saved orig;
 	int error;
 
 	MPASS(ndp->ni_lcf == 0);
 
 	fpl.status = CACHE_FPL_STATUS_UNSET;
 	fpl.ndp = ndp;
 	fpl.cnp = &ndp->ni_cnd;
 	MPASS(curthread == fpl.cnp->cn_thread);
 
 	/* SAVESTART is only expected for non-LOOKUP operations. */
 	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
 		MPASS(fpl.cnp->cn_nameiop != LOOKUP);
 
 	/*
 	 * The request cannot be served by the fast path at all: hand the
 	 * status left in fpl by cache_can_fplookup() back to the caller.
 	 */
 	if (!cache_can_fplookup(&fpl)) {
 		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
 		*status = fpl.status;
 		return (EOPNOTSUPP);
 	}
 
 	/* Snapshot the state; it is restored below if the lookup aborts. */
 	cache_fpl_checkpoint(&fpl, &orig);
 
 	cache_fpl_smr_enter_initial(&fpl);
 	pwd = pwd_get_smr();
 	fpl.pwd = pwd;
 	ndp->ni_rootdir = pwd->pwd_rdir;
 	ndp->ni_topdir = pwd->pwd_jdir;
 
 	/*
 	 * Choose the starting directory: absolute paths are handled by
 	 * cache_fpl_handle_root(), relative ones start at the current
 	 * working directory (only AT_FDCWD is expected to get here).
 	 */
 	cnp = fpl.cnp;
 	cnp->cn_nameptr = cnp->cn_pnbuf;
 	if (cnp->cn_pnbuf[0] == '/') {
 		cache_fpl_handle_root(ndp, &dvp);
 	} else {
 		MPASS(ndp->ni_dirfd == AT_FDCWD);
 		dvp = pwd->pwd_cdir;
 	}
 
 	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
 
 	error = cache_fplookup_impl(dvp, &fpl);
 	/* Whatever the outcome, the SMR section must have been left by now. */
 	cache_fpl_smr_assert_not_entered(&fpl);
 	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
 
 	*status = fpl.status;
 	switch (fpl.status) {
 	case CACHE_FPL_STATUS_UNSET:
 		__assert_unreachable();
 		break;
 	case CACHE_FPL_STATUS_HANDLED:
 		SDT_PROBE3(vfs, namei, lookup, return, error,
 		    (error == 0 ? ndp->ni_vp : NULL), true);
 		break;
 	case CACHE_FPL_STATUS_PARTIAL:
 		/* Only the partial case hands the pwd back through pwdp. */
 		*pwdp = fpl.pwd;
 		/*
 		 * Status restored by cache_fplookup_partial_setup.
 		 */
 		break;
 	case CACHE_FPL_STATUS_ABORTED:
 		/* Roll back to the snapshot taken before the lookup started. */
 		cache_fpl_restore(&fpl, &orig);
 		break;
 	}
 	return (error);
 }
Index: projects/clang1100-import/sys/kern/vfs_default.c
===================================================================
--- projects/clang1100-import/sys/kern/vfs_default.c	(revision 364278)
+++ projects/clang1100-import/sys/kern/vfs_default.c	(revision 364279)
@@ -1,1576 +1,1576 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed
  * to Berkeley by John Heidemann of the UCLA Ficus project.
  *
  * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/event.h>
 #include <sys/filio.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/lockf.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/rwlock.h>
 #include <sys/fcntl.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/poll.h>
 #include <sys/stat.h>
 #include <security/audit/audit.h>
 #include <sys/priv.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 
 /* File-local fallbacks and helpers defined further down in this file. */
 static int	vop_nolookup(struct vop_lookup_args *);
 static int	vop_norename(struct vop_rename_args *);
 static int	vop_nostrategy(struct vop_strategy_args *);
 static int	get_next_dirent(struct vnode *vp, struct dirent **dpp,
 				char *dirbuf, int dirbuflen, off_t *off,
 				char **cpos, int *len, int *eofflag,
 				struct thread *td);
 static int	dirent_exists(struct vnode *vp, const char *dirname,
 			      struct thread *td);
 
 /*
  * Lower bound accepted for d_reclen by get_next_dirent(): the size of
  * struct dirent with the MAXNAMLEN + 1 byte name area replaced by 4 bytes.
  */
 #define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4)
 
 /* File-local default VOP implementations referenced by default_vnodeops. */
 static int vop_stdis_text(struct vop_is_text_args *ap);
 static int vop_stdunset_text(struct vop_unset_text_args *ap);
 static int vop_stdadd_writecount(struct vop_add_writecount_args *ap);
 static int vop_stdcopy_file_range(struct vop_copy_file_range_args *ap);
 static int vop_stdfdatasync(struct vop_fdatasync_args *ap);
 static int vop_stdgetpages_async(struct vop_getpages_async_args *ap);
 static int vop_stdstat(struct vop_stat_args *ap);
 
 /*
  * This vnode table stores what we want to do if the filesystem doesn't
  * implement a particular VOP.
  *
  * If there is no specific entry here, we will return EOPNOTSUPP.
  *
  * Note that every filesystem has to implement either vop_access
  * or vop_accessx; failing to do so will result in immediate crash
  * due to stack overflow, as vop_stdaccess() calls vop_stdaccessx(),
  * which calls vop_stdaccess() etc.
  */
 
 struct vop_vector default_vnodeops = {
 	/* This is the last resort: there is no further vector to fall back to. */
 	.vop_default =		NULL,
 	/* Operations without an explicit entry below end up here. */
 	.vop_bypass =		VOP_EOPNOTSUPP,
 
 	.vop_access =		vop_stdaccess,
 	.vop_accessx =		vop_stdaccessx,
 	.vop_advise =		vop_stdadvise,
 	.vop_advlock =		vop_stdadvlock,
 	.vop_advlockasync =	vop_stdadvlockasync,
 	.vop_advlockpurge =	vop_stdadvlockpurge,
 	.vop_allocate =		vop_stdallocate,
 	.vop_bmap =		vop_stdbmap,
 	.vop_close =		VOP_NULL,
 	.vop_fsync =		VOP_NULL,
 	.vop_stat =		vop_stdstat,
 	.vop_fdatasync =	vop_stdfdatasync,
 	.vop_getpages =		vop_stdgetpages,
 	.vop_getpages_async =	vop_stdgetpages_async,
 	.vop_getwritemount = 	vop_stdgetwritemount,
 	.vop_inactive =		VOP_NULL,
 	.vop_need_inactive =	vop_stdneed_inactive,
 	.vop_ioctl =		vop_stdioctl,
 	.vop_kqfilter =		vop_stdkqfilter,
 	.vop_islocked =		vop_stdislocked,
 	.vop_lock1 =		vop_stdlock,
 	.vop_lookup =		vop_nolookup,
 	.vop_open =		VOP_NULL,
 	.vop_pathconf =		VOP_EINVAL,
 	.vop_poll =		vop_nopoll,
 	.vop_putpages =		vop_stdputpages,
 	.vop_readlink =		VOP_EINVAL,
 	.vop_rename =		vop_norename,
 	.vop_revoke =		VOP_PANIC,
 	.vop_strategy =		vop_nostrategy,
 	.vop_unlock =		vop_stdunlock,
 	.vop_vptocnp =		vop_stdvptocnp,
 	.vop_vptofh =		vop_stdvptofh,
 	.vop_unp_bind =		vop_stdunp_bind,
 	.vop_unp_connect =	vop_stdunp_connect,
 	.vop_unp_detach =	vop_stdunp_detach,
 	.vop_is_text =		vop_stdis_text,
 	.vop_set_text =		vop_stdset_text,
 	.vop_unset_text =	vop_stdunset_text,
 	.vop_add_writecount =	vop_stdadd_writecount,
 	.vop_copy_file_range =	vop_stdcopy_file_range,
 };
 /* NOTE(review): presumably hooks the vector into VFS initialisation -- confirm against the macro. */
 VFS_VOP_VECTOR_REGISTER(default_vnodeops);
 
 /*
  * Series of placeholder functions for various error returns for
  * VOPs.
  */
 
 /*
  * Placeholder VOP: the operation is not supported.
  * The argument is ignored; the result is always EOPNOTSUPP.
  */
 int
 vop_eopnotsupp(struct vop_generic_args *ap)
 {
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Placeholder VOP: the argument is ignored; the result is always EBADF.
  */
 int
 vop_ebadf(struct vop_generic_args *ap)
 {
 
 	return (EBADF);
 }
 
 /*
  * Placeholder VOP: the argument is ignored; the result is always ENOTTY.
  */
 int
 vop_enotty(struct vop_generic_args *ap)
 {
 
 	return (ENOTTY);
 }
 
 /*
  * Placeholder VOP: the argument is ignored; the result is always EINVAL.
  */
 int
 vop_einval(struct vop_generic_args *ap)
 {
 
 	return (EINVAL);
 }
 
 /*
  * Placeholder VOP: the argument is ignored; the result is always ENOENT.
  */
 int
 vop_enoent(struct vop_generic_args *ap)
 {
 
 	return (ENOENT);
 }
 
 /*
  * Placeholder VOP: do nothing and report success (0).
  */
 int
 vop_null(struct vop_generic_args *ap)
 {
 
 	return (0);
 }
 
 /*
  * Placeholder VOP for operations a filesystem must never receive:
  * panic, naming the offending operation from its descriptor.
  */
 int
 vop_panic(struct vop_generic_args *ap)
 {
 	const char *opname;
 
 	opname = ap->a_desc->vdesc_name;
 	panic("filesystem goof: vop_panic[%s]", opname);
 }
 
 /*
  * vop_std<something> and vop_no<something> are default functions for use by
  * filesystems that need the "default reasonable" implementation for a
  * particular operation.
  *
  * The documentation for the operations they implement exists (if it exists)
  * in the VOP_<SOMETHING>(9) manpage (all uppercase).
  */
 
 /*
  * Default vop for filesystems that do not support name lookup.
  *
  * Clears the result pointer (*ap->a_vpp) and fails with ENOTDIR.  The
  * directory vnode and component name carried in the arguments are not
  * examined.
  */
 static int
 vop_nolookup(struct vop_lookup_args *ap)
 {
 
 	*ap->a_vpp = NULL;
 	return (ENOTDIR);
 }
 
 /*
  * vop_norename:
  *
  * Rename fallback for filesystems without a rename operation.  The
  * unlocking and reference dropping owed on the vop_rename arguments is
  * delegated to vop_rename_fail(); the caller always gets EOPNOTSUPP.
  */
 static int
 vop_norename(struct vop_rename_args *ap)
 {
 
 	vop_rename_fail(ap);
 
 	return (EOPNOTSUPP);
 }
 
 /*
  *	vop_nostrategy:
  *
  *	Strategy routine for VFS devices that have none.
  *
  *	BIO_ERROR and B_INVAL must be cleared prior to calling any strategy
  *	routine.  Typically this is done for a BIO_READ strategy call.
  *	Typically B_INVAL is assumed to already be clear prior to a write
  *	and should not be cleared manually unless you just made the buffer
  *	invalid.  BIO_ERROR should be cleared either way.
  */
 
 static int
 vop_nostrategy (struct vop_strategy_args *ap)
 {
 	printf("No strategy for buffer at %p\n", ap->a_bp);
 	vn_printf(ap->a_vp, "vnode ");
 	ap->a_bp->b_ioflags |= BIO_ERROR;
 	ap->a_bp->b_error = EOPNOTSUPP;
 	bufdone(ap->a_bp);
 	return (EOPNOTSUPP);
 }
 
 /*
  * Step through the entries of the locked directory vnode vp, one entry per
  * call, refilling dirbuf with VOP_READDIR() whenever it has been consumed.
  *
  * vp		locked directory vnode to read
  * dpp		out: the entry found (points into dirbuf)
  * dirbuf	scratch buffer of dirbuflen bytes owned by the caller
  * off		in/out: directory offset for the next VOP_READDIR()
  * cpos	in/out: position of the next unread entry inside dirbuf
  * len		in/out: bytes of dirbuf not yet consumed; must be 0 on the
  *		first call so that the buffer gets filled
  * eofflag	out: end-of-directory indication from VOP_READDIR()
  * td		thread whose credentials are used for the read
  *
  * Returns 0 on success, ENOENT when VOP_READDIR() produced no data, EINVAL
  * for a malformed entry, or the error from the MAC check / VOP_READDIR().
  */
 static int
 get_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf,
 		int dirbuflen, off_t *off, char **cpos, int *len,
 		int *eofflag, struct thread *td)
 {
 	int error, reclen;
 	struct uio uio;
 	struct iovec iov;
 	struct dirent *dp;
 
 	KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp));
 	KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));
 
 	if (*len == 0) {
 		/* Buffer exhausted (or first call): read the next chunk. */
 		iov.iov_base = dirbuf;
 		iov.iov_len = dirbuflen;
 
 		uio.uio_iov = &iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_offset = *off;
 		uio.uio_resid = dirbuflen;
 		uio.uio_segflg = UIO_SYSSPACE;
 		uio.uio_rw = UIO_READ;
 		uio.uio_td = td;
 
 		*eofflag = 0;
 
 #ifdef MAC
 		error = mac_vnode_check_readdir(td->td_ucred, vp);
 		if (error == 0)
 #endif
 			error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag,
 			    NULL, NULL);
 		if (error)
 			return (error);
 
 		*off = uio.uio_offset;
 
 		*cpos = dirbuf;
 		*len = (dirbuflen - uio.uio_resid);
 
 		if (*len == 0)
 			return (ENOENT);
 	}
 
 	dp = (struct dirent *)(*cpos);
 	reclen = dp->d_reclen;
 	*dpp = dp;
 
 	/*
 	 * Check for a malformed directory.  The record must be at least
 	 * minimally sized and must also fit in what is left of the buffer:
 	 * otherwise *len would go negative and *cpos would leave dirbuf,
 	 * and the callers' loops would come back here without a refill and
 	 * dereference memory past the buffer.
 	 */
 	if (reclen < DIRENT_MINSIZE || reclen > *len)
 		return (EINVAL);
 
 	*cpos += reclen;
 	*len -= reclen;
 
 	return (0);
 }
 
 /*
  * Report whether the locked directory vnode vp has a live entry called
  * dirname.  Whiteouts and entries with a zero file number do not count.
  * Returns 1 when found; 0 when not found or when the directory could not
  * be examined (attribute or read failure).
  */
 static int
 dirent_exists(struct vnode *vp, const char *dirname, struct thread *td)
 {
 	struct vattr va;
 	struct dirent *dp;
 	char *dirbuf, *cpos;
 	off_t off;
 	int dirbuflen, eofflag, found, len;
 
 	KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp));
 	KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));
 
 	found = 0;
 
 	if (VOP_GETATTR(vp, &va, td->td_ucred) != 0)
 		return (found);
 
 	/* Scratch buffer: the larger of DEV_BSIZE and the reported block size. */
 	dirbuflen = DEV_BSIZE;
 	if (dirbuflen < va.va_blocksize)
 		dirbuflen = va.va_blocksize;
 	dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK);
 
 	off = 0;
 	len = 0;
 	do {
 		/* Any failure to fetch the next entry ends the search. */
 		if (get_next_dirent(vp, &dp, dirbuf, dirbuflen, &off,
 		    &cpos, &len, &eofflag, td) != 0)
 			break;
 
 		if (dp->d_type != DT_WHT && dp->d_fileno != 0 &&
 		    strcmp(dp->d_name, dirname) == 0) {
 			found = 1;
 			break;
 		}
 	} while (len > 0 || !eofflag);
 
 	free(dirbuf, M_TEMP);
 	return (found);
 }
 
 /*
  * Default VOP_ACCESS: forward the request unchanged to VOP_ACCESSX().
  * Only VEXEC, VWRITE, VREAD, VADMIN and VAPPEND may be set in a_accmode;
  * anything else trips the assertion.
  */
 int
 vop_stdaccess(struct vop_access_args *ap)
 {
 
 	KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN |
 	    VAPPEND)) == 0, ("invalid bit in accmode"));
 
 	return (VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred, ap->a_td));
 }
 
 /*
  * Default VOP_ACCESSX: run the requested mode through
  * vfs_unixify_accmode() and hand what is left to VOP_ACCESS().  A
  * conversion error is returned as is; an empty resulting mode means there
  * is nothing left to check and the request succeeds.
  */
 int
 vop_stdaccessx(struct vop_accessx_args *ap)
 {
 	accmode_t mode;
 	int rv;
 
 	mode = ap->a_accmode;
 	rv = vfs_unixify_accmode(&mode);
 	if (rv != 0)
 		return (rv);
 	if (mode == 0)
 		return (0);
 
 	return (VOP_ACCESS(ap->a_vp, mode, ap->a_cred, ap->a_td));
 }
 
 /*
  * Advisory record locking support.
  *
  * Passes the request to lf_advlock() together with the file size, which
  * is only looked up (under a shared vnode lock) for SEEK_END requests and
  * is 0 otherwise.
  */
 int
 vop_stdadvlock(struct vop_advlock_args *ap)
 {
 	struct vattr vattr;
 	struct vnode *vp;
 	int error;
 
 	vp = ap->a_vp;
 	if (ap->a_fl->l_whence != SEEK_END) {
 		vattr.va_size = 0;
 	} else {
 		/*
 		 * The NFSv4 server must avoid doing a vn_lock() here, since it
 		 * can deadlock the nfsd threads, due to a LOR.  Fortunately
 		 * the NFSv4 server always uses SEEK_SET and this code is
 		 * only required for the SEEK_END case.
 		 */
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_GETATTR(vp, &vattr, curthread->td_ucred);
 		VOP_UNLOCK(vp);
 		if (error != 0)
 			return (error);
 	}
 
 	return (lf_advlock(ap, &vp->v_lockf, vattr.va_size));
 }
 
 /*
  * Asynchronous flavour of the advisory locking default: same size lookup
  * as the synchronous one, with the request handed to lf_advlockasync().
  */
 int
 vop_stdadvlockasync(struct vop_advlockasync_args *ap)
 {
 	struct vattr vattr;
 	struct vnode *vp;
 	int error;
 
 	vp = ap->a_vp;
 	if (ap->a_fl->l_whence != SEEK_END) {
 		vattr.va_size = 0;
 	} else {
 		/* The size argument is only needed for SEEK_END. */
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_GETATTR(vp, &vattr, curthread->td_ucred);
 		VOP_UNLOCK(vp);
 		if (error != 0)
 			return (error);
 	}
 
 	return (lf_advlockasync(ap, &vp->v_lockf, vattr.va_size));
 }
 
 /*
  * Default VOP_ADVLOCKPURGE: hand the vnode's lock state (v_lockf) to
  * lf_purgelocks().  Always returns 0.
  */
 int
 vop_stdadvlockpurge(struct vop_advlockpurge_args *ap)
 {
 
 	lf_purgelocks(ap->a_vp, &ap->a_vp->v_lockf);
 	return (0);
 }
 
 /*
  * vop_stdpathconf:
  *
  * Standard implementation of POSIX pathconf, to get information about limits
  * for a filesystem.
  * Override per filesystem for the case where the filesystem has smaller
  * limits.
  *
  * ap->a_name selects the variable and the answer is stored in
  * *ap->a_retval.  Returns 0 for the names handled here and EINVAL for
  * everything else.
  */
 int
 vop_stdpathconf(struct vop_pathconf_args *ap)
 {
 
 	switch (ap->a_name) {
 	case _PC_ASYNC_IO:
 		*ap->a_retval = _POSIX_ASYNCHRONOUS_IO;
 		return (0);
 	case _PC_PATH_MAX:
 		*ap->a_retval = PATH_MAX;
 		return (0);
 	case _PC_ACL_EXTENDED:
 	case _PC_ACL_NFS4:
 	case _PC_CAP_PRESENT:
 	case _PC_INF_PRESENT:
 	case _PC_MAC_PRESENT:
 		/* Optional features are reported as absent by default. */
 		*ap->a_retval = 0;
 		return (0);
 	default:
 		return (EINVAL);
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Standard lock, unlock and islocked functions.
  */
 
 /*
  * Lock the vnode through its v_vnlock lock with the flags in a_flags.
  * The vnode interlock is handed to lockmgr along with the caller's
  * a_file / a_line.  Returns whatever lockmgr_lock_flags() returns.
  */
 int
 vop_stdlock(struct vop_lock1_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct mtx *ilk;
 
 	ilk = VI_MTX(vp);
 	return (lockmgr_lock_flags(vp->v_vnlock, ap->a_flags,
 	    &ilk->lock_object, ap->a_file, ap->a_line));
 }
 
 /*
  * Standard unlock: release the lock v_vnlock points at.
  * Returns the result of lockmgr_unlock().
  */
 int
 vop_stdunlock(struct vop_unlock_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	return (lockmgr_unlock(vp->v_vnlock));
 }
 
 /*
  * Standard islocked: report lockstatus() of the lock v_vnlock points at.
  */
 int
 vop_stdislocked(struct vop_islocked_args *ap)
 {
 
 	return (lockstatus(ap->a_vp->v_vnlock));
 }
 
 /*
  * Variants of the above set.
  *
  * Differences are:
  * - shared locking disablement is not supported
  * - v_vnlock pointer is not honored
  */
 
 /*
  * Lock the vnode's own v_lock.  Requests that carry nothing beyond a lock
  * type plus LK_NODDLKTREAT / LK_RETRY and ask for LK_SHARED or
  * LK_EXCLUSIVE go straight to lockmgr_slock() / lockmgr_xlock(); every
  * other request takes the general lockmgr_lock_flags() route with the
  * vnode interlock.
  */
 int
 vop_lock(struct vop_lock1_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int flags = ap->a_flags;
 	struct mtx *ilk;
 
 	MPASS(vp->v_vnlock == &vp->v_lock);
 
 	if (__predict_false((flags & ~(LK_TYPE_MASK | LK_NODDLKTREAT | LK_RETRY)) != 0))
 		goto other;
 
 	switch (flags & LK_TYPE_MASK) {
 	case LK_SHARED:
 		return (lockmgr_slock(&vp->v_lock, flags, ap->a_file, ap->a_line));
 	case LK_EXCLUSIVE:
 		return (lockmgr_xlock(&vp->v_lock, flags, ap->a_file, ap->a_line));
 	}
 	/* Any other lock type falls through to the general path. */
 other:
 	ilk = VI_MTX(vp);
 	return (lockmgr_lock_flags(&vp->v_lock, flags,
 	    &ilk->lock_object, ap->a_file, ap->a_line));
 }
 
 /*
  * Unlock the vnode's own v_lock; v_vnlock is asserted to point at it.
  * Returns the result of lockmgr_unlock().
  */
 int
 vop_unlock(struct vop_unlock_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	MPASS(vp->v_vnlock == &vp->v_lock);
 
 	return (lockmgr_unlock(&vp->v_lock));
 }
 
 /*
  * Report lockstatus() of the vnode's own v_lock; v_vnlock is asserted to
  * point at it.
  */
 int
 vop_islocked(struct vop_islocked_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	MPASS(vp->v_vnlock == &vp->v_lock);
 
 	return (lockstatus(&vp->v_lock));
 }
 
 /*
  * Return true for select/poll.
  *
  * A request with any event outside POLLSTANDARD yields POLLNVAL.
  * Otherwise each of POLLIN, POLLOUT, POLLRDNORM and POLLWRNORM that was
  * asked for in a_events is reported as ready.
  */
 int
 vop_nopoll(struct vop_poll_args *ap)
 {
 
 	if (ap->a_events & ~POLLSTANDARD)
 		return (POLLNVAL);
 	return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
 }
 
 /*
  * Implement poll for local filesystems that support it.
  *
  * A request with any event outside POLLSTANDARD is passed to
  * vn_pollrecord().  Otherwise each of POLLIN, POLLOUT, POLLRDNORM and
  * POLLWRNORM that was asked for in a_events is reported as ready.
  */
 int
 vop_stdpoll(struct vop_poll_args *ap)
 {
 
 	if (ap->a_events & ~POLLSTANDARD)
 		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
 	return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
 }
 
 /*
  * Return our mount point, as we will take charge of the writes.
  *
  * On return *ap->a_mpp is the vnode's mount point with a reference added,
  * or NULL when the vnode has no mount point or v_mount changed while the
  * reference was being taken.  Always returns 0.
  */
 int
 vop_stdgetwritemount(struct vop_getwritemount_args *ap)
 {
 	struct mount *mp;
 	struct vnode *vp;
 
 	/*
 	 * Note that having a reference does not prevent forced unmount from
 	 * setting ->v_mount to NULL after the lock gets released. This is of
 	 * no consequence for typical consumers (most notably vn_start_write)
 	 * since in this case the vnode is VIRF_DOOMED. Unmount might have
 	 * progressed far enough that its completion is only delayed by the
 	 * reference obtained here. The consumer only needs to concern itself
 	 * with releasing it.
 	 */
 	vp = ap->a_vp;
 	mp = vp->v_mount;
 	if (mp == NULL) {
 		*(ap->a_mpp) = NULL;
 		return (0);
 	}
 	if (vfs_op_thread_enter(mp)) {
 		/* Per-CPU counting path; recheck v_mount before committing. */
 		if (mp == vp->v_mount) {
 			vfs_mp_count_add_pcpu(mp, ref, 1);
 			vfs_op_thread_exit(mp);
 		} else {
 			vfs_op_thread_exit(mp);
 			mp = NULL;
 		}
 	} else {
 		/* Otherwise take the reference under the mount interlock. */
 		MNT_ILOCK(mp);
 		if (mp == vp->v_mount) {
 			MNT_REF(mp);
 			MNT_IUNLOCK(mp);
 		} else {
 			MNT_IUNLOCK(mp);
 			mp = NULL;
 		}
 	}
 	*(ap->a_mpp) = mp;
 	return (0);
 }
 
 /*
  * If the file system doesn't implement VOP_BMAP, then return sensible defaults:
  * - Return the vnode's bufobj instead of any underlying device's bufobj
  * - Calculate the physical block number as if there were equal size
  *   consecutive blocks, but
  * - Report no contiguous runs of blocks.
  *
  * Each of a_bop, a_bnp, a_runp and a_runb is optional; a NULL output
  * pointer is simply skipped.  Always returns 0.
  */
 int
 vop_stdbmap(struct vop_bmap_args *ap)
 {
 
 	if (ap->a_bop != NULL)
 		*ap->a_bop = &ap->a_vp->v_bufobj;
 	if (ap->a_bnp != NULL)
 		*ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize);
 	if (ap->a_runp != NULL)
 		*ap->a_runp = 0;
 	if (ap->a_runb != NULL)
 		*ap->a_runb = 0;
 	return (0);
 }
 
 /*
  * Standard fsync: hand the vnode and the a_waitfor mode to
  * vn_fsync_buf().  The a_td argument is not used.
  */
 int
 vop_stdfsync(struct vop_fsync_args *ap)
 {
 
 	return (vn_fsync_buf(ap->a_vp, ap->a_waitfor));
 }
 
 /*
  * Default fdatasync: implemented as a full VOP_FSYNC() with MNT_WAIT.
  */
 static int
 vop_stdfdatasync(struct vop_fdatasync_args *ap)
 {
 	struct vnode *vp;
 
 	vp = ap->a_vp;
 	return (VOP_FSYNC(vp, MNT_WAIT, ap->a_td));
 }
 
 /*
  * fdatasync implemented directly on top of vn_fsync_buf() with MNT_WAIT.
  */
 int
 vop_stdfdatasync_buf(struct vop_fdatasync_args *ap)
 {
 	struct vnode *vp;
 
 	vp = ap->a_vp;
 	return (vn_fsync_buf(vp, MNT_WAIT));
 }
 
 /*
  * XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)).
  *
  * Passes the vnode, page array, count and read-behind/read-ahead pointers
  * on to vnode_pager_generic_getpages(), with NULL for its two trailing
  * arguments.
  */
 int
 vop_stdgetpages(struct vop_getpages_args *ap)
 {
 
 	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
 	    ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL));
 }
 
 /*
  * Default "asynchronous" getpages: perform a plain VOP_GETPAGES() and
  * then, if one was supplied, call the a_iodone callback with the result.
  * The VOP_GETPAGES() status is also the return value.
  */
 static int
 vop_stdgetpages_async(struct vop_getpages_async_args *ap)
 {
 	int rv;
 
 	rv = VOP_GETPAGES(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
 	    ap->a_rahead);
 	if (ap->a_iodone == NULL)
 		return (rv);
 
 	ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, rv);
 	return (rv);
 }
 
 /*
  * Default kqfilter: defer entirely to vfs_kqfilter().
  */
 int
 vop_stdkqfilter(struct vop_kqfilter_args *ap)
 {
 
 	return (vfs_kqfilter(ap));
 }
 
 /*
  * XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)).
  *
  * Passes the vnode, page array, count, sync flags and per-page result
  * array on to vnode_pager_generic_putpages().
  */
 int
 vop_stdputpages(struct vop_putpages_args *ap)
 {
 
 	return (vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count,
 	    ap->a_sync, ap->a_rtvals));
 }
 
 /*
  * Default vnode-to-file-handle conversion: not supported.
  */
 int
 vop_stdvptofh(struct vop_vptofh_args *ap)
 {
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Default VOP_VPTOCNP: find the name of directory vp within its parent.
  *
  * The parent is obtained by opening ".." relative to vp and is then
  * scanned for an entry whose d_fileno equals vp's va_fileid.  On success
  * the name is copied (without NUL) to buf + i, where i is the incoming
  * *buflen minus the name length, *buflen is set to i and *ap->a_vpp holds
  * the referenced parent.  vp is unlocked while ".." is opened and is
  * relocked in its original mode before returning.
  *
  * Errors: ENOENT if vp is not a directory, the parent is not a directory,
  * no matching entry exists or the match is "."; ENOMEM if the name does
  * not fit in front of *buflen; otherwise whatever VOP_GETATTR(),
  * vn_open_cred() or the directory read returned.
  */
 int
 vop_stdvptocnp(struct vop_vptocnp_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode **dvp = ap->a_vpp;
 	struct ucred *cred = ap->a_cred;
 	char *buf = ap->a_buf;
 	size_t *buflen = ap->a_buflen;
 	char *dirbuf, *cpos;
 	int i, error, eofflag, dirbuflen, flags, locked, len, covered;
 	off_t off;
 	ino_t fileno;
 	struct vattr va;
 	struct nameidata nd;
 	struct thread *td;
 	struct dirent *dp;
 	struct vnode *mvp;
 
 	i = *buflen;
 	error = 0;
 	covered = 0;
 	td = curthread;
 
 	if (vp->v_type != VDIR)
 		return (ENOENT);
 
 	error = VOP_GETATTR(vp, &va, cred);
 	if (error)
 		return (error);
 
 	/*
 	 * Remember how vp was locked and drop the lock while ".." is
 	 * looked up starting from vp.
 	 */
 	VREF(vp);
 	locked = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp);
 	NDINIT_ATVP(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE,
 	    "..", vp, td);
 	flags = FREAD;
 	error = vn_open_cred(&nd, &flags, 0, VN_OPEN_NOAUDIT, cred, NULL);
 	if (error) {
 		vn_lock(vp, locked | LK_RETRY);
 		return (error);
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	mvp = *dvp = nd.ni_vp;
 
 	/*
 	 * ".." landed on the root of a different, union-mounted filesystem:
 	 * search the vnode that mount covers instead.  mvp stays referenced
 	 * (but closed and unlocked) for the shadowing check below.
 	 */
 	if (vp->v_mount != (*dvp)->v_mount &&
 	    ((*dvp)->v_vflag & VV_ROOT) &&
 	    ((*dvp)->v_mount->mnt_flag & MNT_UNION)) {
 		*dvp = (*dvp)->v_mount->mnt_vnodecovered;
 		VREF(mvp);
 		VOP_UNLOCK(mvp);
 		vn_close(mvp, FREAD, cred, td);
 		VREF(*dvp);
 		vn_lock(*dvp, LK_SHARED | LK_RETRY);
 		covered = 1;
 	}
 
 	fileno = va.va_fileid;
 
 	/* Scratch buffer: the larger of DEV_BSIZE and vp's block size. */
 	dirbuflen = DEV_BSIZE;
 	if (dirbuflen < va.va_blocksize)
 		dirbuflen = va.va_blocksize;
 	dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK);
 
 	if ((*dvp)->v_type != VDIR) {
 		error = ENOENT;
 		goto out;
 	}
 
 	off = 0;
 	len = 0;
 	do {
 		/* call VOP_READDIR of parent */
 		error = get_next_dirent(*dvp, &dp, dirbuf, dirbuflen, &off,
 					&cpos, &len, &eofflag, td);
 		if (error)
 			goto out;
 
 		if ((dp->d_type != DT_WHT) &&
 		    (dp->d_fileno == fileno)) {
 			if (covered) {
 				/*
 				 * An entry of the same name in the upper
 				 * directory (mvp) makes the match ENOENT.
 				 */
 				VOP_UNLOCK(*dvp);
 				vn_lock(mvp, LK_SHARED | LK_RETRY);
 				if (dirent_exists(mvp, dp->d_name, td)) {
 					error = ENOENT;
 					VOP_UNLOCK(mvp);
 					vn_lock(*dvp, LK_SHARED | LK_RETRY);
 					goto out;
 				}
 				VOP_UNLOCK(mvp);
 				vn_lock(*dvp, LK_SHARED | LK_RETRY);
 			}
 			/* The name is placed immediately before offset i. */
 			i -= dp->d_namlen;
 
 			if (i < 0) {
 				error = ENOMEM;
 				goto out;
 			}
 			if (dp->d_namlen == 1 && dp->d_name[0] == '.') {
 				error = ENOENT;
 			} else {
 				bcopy(dp->d_name, buf + i, dp->d_namlen);
 				error = 0;
 			}
 			goto out;
 		}
 	} while (len > 0 || !eofflag);
 	error = ENOENT;
 
 out:
 	free(dirbuf, M_TEMP);
 	if (!error) {
 		*buflen = i;
 		/* Extra reference on the parent for the caller. */
 		vref(*dvp);
 	}
 	if (covered) {
 		vput(*dvp);
 		vrele(mvp);
 	} else {
 		VOP_UNLOCK(mvp);
 		vn_close(mvp, FREAD, cred, td);
 	}
 	vn_lock(vp, locked | LK_RETRY);
 	return (error);
 }
 
 /*
  * Default VOP_ALLOCATE: walk the range [*a_offset, *a_offset + *a_len) in
  * pieces of at most one I/O block and write every piece back to the file.
  * Pieces starting below the current file size are read first and
  * rewritten; pieces starting at or beyond it are written as zeroes.
  *
  * On return *a_len and *a_offset describe the part of the range that has
  * not been processed; the loop may stop early when should_yield() says so,
  * in which case 0 is returned with a non-zero *a_len.
  */
 int
 vop_stdallocate(struct vop_allocate_args *ap)
 {
 #ifdef __notyet__
 	struct statfs *sfs;
 	off_t maxfilesize = 0;
 #endif
 	struct iovec aiov;
 	struct vattr vattr, *vap;
 	struct uio auio;
 	off_t fsize, len, cur, offset;
 	uint8_t *buf;
 	struct thread *td;
 	struct vnode *vp;
 	size_t iosize;
 	int error;
 
 	buf = NULL;
 	error = 0;
 	td = curthread;
 	vap = &vattr;
 	vp = ap->a_vp;
 	len = *ap->a_len;
 	offset = *ap->a_offset;
 
 	error = VOP_GETATTR(vp, vap, td->td_ucred);
 	if (error != 0)
 		goto out;
 	fsize = vap->va_size;
 	/* I/O unit: the file's block size, defaulted and capped. */
 	iosize = vap->va_blocksize;
 	if (iosize == 0)
 		iosize = BLKDEV_IOSIZE;
 	if (iosize > MAXPHYS)
 		iosize = MAXPHYS;
 	buf = malloc(iosize, M_TEMP, M_WAITOK);
 
 #ifdef __notyet__
 	/*
 	 * Check if the filesystem sets f_maxfilesize; if not use
 	 * VOP_SETATTR to perform the check.
 	 */
 	sfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = VFS_STATFS(vp->v_mount, sfs, td);
 	if (error == 0)
 		maxfilesize = sfs->f_maxfilesize;
 	free(sfs, M_STATFS);
 	if (error != 0)
 		goto out;
 	if (maxfilesize) {
 		if (offset > maxfilesize || len > maxfilesize ||
 		    offset + len > maxfilesize) {
 			error = EFBIG;
 			goto out;
 		}
 	} else
 #endif
 	if (offset + len > vap->va_size) {
 		/*
 		 * Test offset + len against the filesystem's maxfilesize.
 		 */
 		/* Grow the file to the target size, then shrink it back. */
 		VATTR_NULL(vap);
 		vap->va_size = offset + len;
 		error = VOP_SETATTR(vp, vap, td->td_ucred);
 		if (error != 0)
 			goto out;
 		VATTR_NULL(vap);
 		vap->va_size = fsize;
 		error = VOP_SETATTR(vp, vap, td->td_ucred);
 		if (error != 0)
 			goto out;
 	}
 
 	for (;;) {
 		/*
 		 * Read and write back anything below the nominal file
 		 * size.  There's currently no way outside the filesystem
 		 * to know whether this area is sparse or not.
 		 */
 		/* Piece size: up to the next iosize boundary, at most len. */
 		cur = iosize;
 		if ((offset % iosize) != 0)
 			cur -= (offset % iosize);
 		if (cur > len)
 			cur = len;
 		if (offset < fsize) {
 			aiov.iov_base = buf;
 			aiov.iov_len = cur;
 			auio.uio_iov = &aiov;
 			auio.uio_iovcnt = 1;
 			auio.uio_offset = offset;
 			auio.uio_resid = cur;
 			auio.uio_segflg = UIO_SYSSPACE;
 			auio.uio_rw = UIO_READ;
 			auio.uio_td = td;
 			error = VOP_READ(vp, &auio, 0, td->td_ucred);
 			if (error != 0)
 				break;
 			/* Short read: zero the part that was not filled in. */
 			if (auio.uio_resid > 0) {
 				bzero(buf + cur - auio.uio_resid,
 				    auio.uio_resid);
 			}
 		} else {
 			bzero(buf, cur);
 		}
 
 		aiov.iov_base = buf;
 		aiov.iov_len = cur;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = offset;
 		auio.uio_resid = cur;
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_rw = UIO_WRITE;
 		auio.uio_td = td;
 
 		error = VOP_WRITE(vp, &auio, 0, td->td_ucred);
 		if (error != 0)
 			break;
 
 		len -= cur;
 		offset += cur;
 		if (len == 0)
 			break;
 		/* Stop early; progress so far is reported through a_len/a_offset. */
 		if (should_yield())
 			break;
 	}
 
  out:
 	*ap->a_len = len;
 	*ap->a_offset = offset;
 	free(buf, M_TEMP);
 	return (error);
 }
 
 /*
  * Default VOP_ADVISE.
  *
  * POSIX_FADV_WILLNEED is accepted and ignored.  POSIX_FADV_DONTNEED locks
  * the vnode exclusively and, unless it is doomed, marks the pages and
  * buffers covering [a_start, a_end) (rounded out to block and page
  * boundaries) as not worth keeping.  Any other advice yields EINVAL.
  */
 int
 vop_stdadvise(struct vop_advise_args *ap)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 	daddr_t startn, endn;
 	off_t bstart, bend, start, end;
 	int bsize, error;
 
 	vp = ap->a_vp;
 	switch (ap->a_advice) {
 	case POSIX_FADV_WILLNEED:
 		/*
 		 * Do nothing for now.  Filesystems should provide a
 		 * custom method which starts an asynchronous read of
 		 * the requested region.
 		 */
 		error = 0;
 		break;
 	case POSIX_FADV_DONTNEED:
 		error = 0;
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		/* Nothing to do for a vnode that is already doomed. */
 		if (VN_IS_DOOMED(vp)) {
 			VOP_UNLOCK(vp);
 			break;
 		}
 
 		/*
 		 * Round to block boundaries (and later possibly further to
 		 * page boundaries).  Applications cannot reasonably be aware
 		 * of the boundaries, and the rounding must be to expand at
 		 * both extremities to cover enough.  It still doesn't cover
 		 * read-ahead.  For partial blocks, this gives unnecessary
 		 * discarding of buffers but is efficient enough since the
 		 * pages usually remain in VMIO for some time.
 		 */
 		bsize = vp->v_bufobj.bo_bsize;
 		bstart = rounddown(ap->a_start, bsize);
 		bend = roundup(ap->a_end, bsize);
 
 		/*
 		 * Deactivate pages in the specified range from the backing VM
 		 * object.  Pages that are resident in the buffer cache will
 		 * remain wired until their corresponding buffers are released
 		 * below.
 		 */
 		if (vp->v_object != NULL) {
 			start = trunc_page(bstart);
 			end = round_page(bend);
 			VM_OBJECT_RLOCK(vp->v_object);
 			vm_object_page_noreuse(vp->v_object, OFF_TO_IDX(start),
 			    OFF_TO_IDX(end));
 			VM_OBJECT_RUNLOCK(vp->v_object);
 		}
 
 		/* Same range in block numbers, for the clean and dirty lists. */
 		bo = &vp->v_bufobj;
 		BO_RLOCK(bo);
 		startn = bstart / bsize;
 		endn = bend / bsize;
 		error = bnoreuselist(&bo->bo_clean, bo, startn, endn);
 		if (error == 0)
 			error = bnoreuselist(&bo->bo_dirty, bo, startn, endn);
 		BO_RUNLOCK(bo);
 		VOP_UNLOCK(vp);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 int
 vop_stdunp_bind(struct vop_unp_bind_args *ap)
 {
 
 	ap->a_vp->v_unpcb = ap->a_unpcb;
 	return (0);
 }
 
 int
 vop_stdunp_connect(struct vop_unp_connect_args *ap)
 {
 
 	*ap->a_unpcb = ap->a_vp->v_unpcb;
 	return (0);
 }
 
 int
 vop_stdunp_detach(struct vop_unp_detach_args *ap)
 {
 
 	ap->a_vp->v_unpcb = NULL;
 	return (0);
 }
 
 static int
 vop_stdis_text(struct vop_is_text_args *ap)
 {
 
 	return (ap->a_vp->v_writecount < 0);
 }
 
 /*
  * Add one text reference to the vnode by decrementing v_writecount
  * under the vnode interlock.
  *
  * Returns ETXTBSY when the vnode has writers (v_writecount > 0),
  * otherwise 0.
  */
 int
 vop_stdset_text(struct vop_set_text_args *ap)
 {
 	struct vnode *vp;
 	struct mount *mp;
 
 	vp = ap->a_vp;
 	VI_LOCK(vp);
 	if (vp->v_writecount > 0) {
 		VI_UNLOCK(vp);
 		return (ETXTBSY);
 	}
 
 	/*
 	 * On the first text reference, and only if the filesystem
 	 * asked for it (MNTK_TEXT_REFS), take a use reference that is
 	 * kept until the last text reference goes away.
 	 */
 	mp = vp->v_mount;
 	if (vp->v_writecount == 0 && mp != NULL &&
 	    (mp->mnt_kern_flag & MNTK_TEXT_REFS) != 0) {
 		vp->v_iflag |= VI_TEXT_REF;
 		vrefl(vp);
 	}
 	vp->v_writecount--;
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * Drop one text reference from the vnode by incrementing v_writecount
  * under the vnode interlock.
  *
  * Returns EINVAL when the vnode holds no text references
  * (v_writecount >= 0), otherwise 0.
  */
 static int
 vop_stdunset_text(struct vop_unset_text_args *ap)
 {
 	struct vnode *vp;
 	bool drop_ref;
 
 	vp = ap->a_vp;
 	VI_LOCK(vp);
 	if (vp->v_writecount >= 0) {
 		VI_UNLOCK(vp);
 		return (EINVAL);
 	}
 
 	/*
 	 * If this is the last text reference and set_text took a use
 	 * reference for it, remember to release that reference once
 	 * the interlock has been dropped.
 	 */
 	drop_ref = (vp->v_iflag & VI_TEXT_REF) != 0 &&
 	    vp->v_writecount == -1;
 	if (drop_ref)
 		vp->v_iflag &= ~VI_TEXT_REF;
 	vp->v_writecount++;
 	VI_UNLOCK(vp);
 
 	if (drop_ref)
 		vunref(vp);
 	return (0);
 }
 
 /*
  * Adjust the vnode's write count by a_inc under the vnode interlock.
  *
  * Returns ETXTBSY when the vnode is in text use (v_writecount < 0),
  * otherwise 0.
  */
 static int
 vop_stdadd_writecount(struct vop_add_writecount_args *ap)
 {
 	struct vnode *vp;
 	struct mount *mp;
 
 	vp = ap->a_vp;
 	VI_LOCK_FLAGS(vp, MTX_DUPOK);
 	if (vp->v_writecount < 0) {
 		VI_UNLOCK(vp);
 		return (ETXTBSY);
 	}
 
 	VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp,
 	    ("neg writecount increment %d", ap->a_inc));
 
 	/*
 	 * Going from zero writers: call vlazy() unless the mount is
 	 * flagged MNTK_NOMSYNC.
 	 */
 	mp = vp->v_mount;
 	if (vp->v_writecount == 0 && mp != NULL &&
 	    (mp->mnt_kern_flag & MNTK_NOMSYNC) == 0)
 		vlazy(vp);
 	vp->v_writecount += ap->a_inc;
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * Default need_inactive method: unconditionally answers 1.  The
  * argument structure is not examined.
  */
 int
 vop_stdneed_inactive(struct vop_need_inactive_args *ap)
 {
 	return (1);
 }
 
 /*
  * Default ioctl method.  Only FIOSEEKDATA and FIOSEEKHOLE are handled,
  * and only for regular files: the file is treated as one data extent
  * followed by a hole at end of file.
  *
  * Returns ENOTTY for any other command or a non-regular vnode, EBADF
  * when the vnode cannot be locked, the VOP_GETATTR() error if that
  * fails, ENXIO when the supplied offset is negative or not below the
  * file size, and 0 otherwise.  For FIOSEEKHOLE the offset is replaced
  * by the file size; for FIOSEEKDATA it is left unchanged.
  */
 int
 vop_stdioctl(struct vop_ioctl_args *ap)
 {
 	struct vnode *vp;
 	struct vattr va;
 	off_t *offp;
 	int error;
 
 	if (ap->a_command != FIOSEEKDATA && ap->a_command != FIOSEEKHOLE)
 		return (ENOTTY);
 
 	vp = ap->a_vp;
 	if (vn_lock(vp, LK_SHARED) != 0)
 		return (EBADF);
 
 	if (vp->v_type != VREG) {
 		error = ENOTTY;
 		goto unlock;
 	}
 	error = VOP_GETATTR(vp, &va, ap->a_cred);
 	if (error != 0)
 		goto unlock;
 
 	offp = ap->a_data;
 	if (*offp < 0 || *offp >= va.va_size)
 		error = ENXIO;
 	else if (ap->a_command == FIOSEEKHOLE)
 		*offp = va.va_size;
 unlock:
 	VOP_UNLOCK(vp);
 	return (error);
 }
 
 /*
  * vfs default ops
  * used to fill the vfs function table to get reasonable default return values.
  */
 /*
  * Default root operation: not supported.  All arguments are ignored
  * and EOPNOTSUPP is always returned.
  */
 int
 vfs_stdroot(struct mount *mp, int flags, struct vnode **vpp)
 {
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Default statfs operation: not supported.  All arguments are ignored
  * and EOPNOTSUPP is always returned.
  */
 int
 vfs_stdstatfs(struct mount *mp, struct statfs *sbp)
 {
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Default quotactl operation: not supported.  All arguments are
  * ignored and EOPNOTSUPP is always returned.
  */
 int
 vfs_stdquotactl(struct mount *mp, int cmds, uid_t uid, void *arg)
 {
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Default sync operation: walk every vnode of the mount and fsync
  * those that have dirty buffers.
  *
  * mp		mount whose vnodes are scanned
  * waitfor	passed through to VOP_FSYNC(); unless it is MNT_WAIT the
  *		vnode lock is requested with LK_NOWAIT
  *
  * Returns 0, or the error of the most recent VOP_FSYNC() that failed.
  */
 int
 vfs_stdsync(mp, waitfor)
 	struct mount *mp;
 	int waitfor;
 {
 	struct vnode *vp, *mvp;
 	struct thread *td;
 	int error, lockreq, allerror = 0;
 
 	td = curthread;
 	lockreq = LK_EXCLUSIVE | LK_INTERLOCK;
 	if (waitfor != MNT_WAIT)
 		lockreq |= LK_NOWAIT;
 	/*
 	 * Force stale buffer cache information to be flushed.
 	 */
 loop:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		/* No dirty buffers: drop the vnode interlock and move on. */
 		if (vp->v_bufobj.bo_dirty.bv_cnt == 0) {
 			VI_UNLOCK(vp);
 			continue;
 		}
-		if ((error = vget(vp, lockreq, td)) != 0) {
+		if ((error = vget(vp, lockreq)) != 0) {
 			/*
 			 * ENOENT aborts the iteration and restarts the scan
 			 * from the beginning; any other vget() failure just
 			 * skips this vnode.
 			 */
 			if (error == ENOENT) {
 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 				goto loop;
 			}
 			continue;
 		}
 		error = VOP_FSYNC(vp, waitfor, td);
 		/* Keep scanning after a failure; remember the latest error. */
 		if (error)
 			allerror = error;
 		vput(vp);
 	}
 	return (allerror);
 }
 
 /*
  * Sync operation that does nothing: both arguments are ignored and 0
  * is always returned.
  */
 int
 vfs_stdnosync(struct mount *mp, int waitfor)
 {
 
 	return (0);
 }
 
 /*
  * Default copy_file_range method: forward every argument unchanged to
  * vn_generic_copy_file_range() and return its result.
  */
 static int
 vop_stdcopy_file_range(struct vop_copy_file_range_args *ap)
 {
 
 	return (vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp,
 	    ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags,
 	    ap->a_incred, ap->a_outcred, ap->a_fsizetd));
 }
 
 /*
  * Default vget operation: not supported.  All arguments are ignored
  * and EOPNOTSUPP is always returned.
  */
 int
 vfs_stdvget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
 {
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Default fhtovp operation: not supported.  All arguments are ignored
  * and EOPNOTSUPP is always returned.
  */
 int
 vfs_stdfhtovp(struct mount *mp, struct fid *fhp, int flags,
     struct vnode **vpp)
 {
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Default init operation: nothing to do.  The argument is ignored and
  * 0 is always returned.
  */
 int
 vfs_stdinit(struct vfsconf *vfsp)
 {
 
 	return (0);
 }
 
 /*
  * Default uninit operation: nothing to do.  The argument is ignored
  * and 0 is always returned.
  */
 int
 vfs_stduninit(struct vfsconf *vfsp)
 {
 
 	return (0);
 }
 
 /*
  * Default extattrctl operation: not supported.
  *
  * When a filename vnode is supplied it is unlocked here before
  * returning.  Every other argument is ignored.  Always returns
  * EOPNOTSUPP.
  */
 int
 vfs_stdextattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
     int attrnamespace, const char *attrname)
 {
 
 	if (filename_vp != NULL)
 		VOP_UNLOCK(filename_vp);
 	return (EOPNOTSUPP);
 }
 
 /*
  * Default sysctl operation: not supported.  All arguments are ignored
  * and EOPNOTSUPP is always returned.
  */
 int
 vfs_stdsysctl(struct mount *mp, fsctlop_t op, struct sysctl_req *req)
 {
 
 	return (EOPNOTSUPP);
 }
 
 static vop_bypass_t *
 bp_by_off(struct vop_vector *vop, struct vop_generic_args *a)
 {
 
 	return (*(vop_bypass_t **)((char *)vop + a->a_desc->vdesc_vop_offset));
 }
 
 /*
  * Invoke the operation that `a' describes from the given vop vector,
  * bracketed by sigdeferstop(SIGDEFERSTOP_SILENT) and sigallowstop().
  * Returns whatever the operation returns.
  */
 int
 vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a)
 {
 	vop_bypass_t *bp;
 	int saved_stops, result;
 
 	bp = bp_by_off(vop, a);
 	MPASS(bp != NULL);
 
 	saved_stops = sigdeferstop(SIGDEFERSTOP_SILENT);
 	result = bp(a);
 	sigallowstop(saved_stops);
 
 	return (result);
 }
 
 /*
  * Default stat method: obtain the vnode's attributes with
  * VOP_GETATTR() and translate them into the caller's struct stat
  * (a->a_sb).
  *
  * vop_stat_helper_pre() runs first and its error is returned directly.
  * After that every exit goes through vop_stat_helper_post(), which is
  * handed the error computed here and supplies the return value.
  * Errors raised locally: EBADF for an unrecognized vnode type and
  * EOVERFLOW when the file size exceeds OFF_MAX.
  */
 static int
 vop_stdstat(struct vop_stat_args *a)
 {
 	struct vattr vattr;
 	struct vattr *vap;
 	struct vnode *vp;
 	struct stat *sb;
 	int error;
 	u_short mode;
 
 	vp = a->a_vp;
 	sb = a->a_sb;
 
 	error = vop_stat_helper_pre(a);
 	if (error != 0)
 		return (error);
 
 	vap = &vattr;
 
 	/*
 	 * Initialize defaults for new and unusual fields, so that file
 	 * systems which don't support these fields don't need to know
 	 * about them.
 	 */
 	vap->va_birthtime.tv_sec = -1;
 	vap->va_birthtime.tv_nsec = 0;
 	vap->va_fsid = VNOVAL;
 	vap->va_rdev = NODEV;
 
 	error = VOP_GETATTR(vp, vap, a->a_active_cred);
 	if (error)
 		goto out;
 
 	/*
 	 * Zero the spare stat fields
 	 */
 	bzero(sb, sizeof *sb);
 
 	/*
 	 * Copy from vattr table
 	 */
 	/* Fall back to the mount's fsid when GETATTR left va_fsid unset. */
 	if (vap->va_fsid != VNOVAL)
 		sb->st_dev = vap->va_fsid;
 	else
 		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
 	sb->st_ino = vap->va_fileid;
 	/* Combine the permission bits with the S_IF* bit for the type. */
 	mode = vap->va_mode;
 	switch (vap->va_type) {
 	case VREG:
 		mode |= S_IFREG;
 		break;
 	case VDIR:
 		mode |= S_IFDIR;
 		break;
 	case VBLK:
 		mode |= S_IFBLK;
 		break;
 	case VCHR:
 		mode |= S_IFCHR;
 		break;
 	case VLNK:
 		mode |= S_IFLNK;
 		break;
 	case VSOCK:
 		mode |= S_IFSOCK;
 		break;
 	case VFIFO:
 		mode |= S_IFIFO;
 		break;
 	default:
 		error = EBADF;
 		goto out;
 	}
 	sb->st_mode = mode;
 	sb->st_nlink = vap->va_nlink;
 	sb->st_uid = vap->va_uid;
 	sb->st_gid = vap->va_gid;
 	sb->st_rdev = vap->va_rdev;
 	/* Refuse sizes above OFF_MAX rather than storing them in st_size. */
 	if (vap->va_size > OFF_MAX) {
 		error = EOVERFLOW;
 		goto out;
 	}
 	sb->st_size = vap->va_size;
 	sb->st_atim.tv_sec = vap->va_atime.tv_sec;
 	sb->st_atim.tv_nsec = vap->va_atime.tv_nsec;
 	sb->st_mtim.tv_sec = vap->va_mtime.tv_sec;
 	sb->st_mtim.tv_nsec = vap->va_mtime.tv_nsec;
 	sb->st_ctim.tv_sec = vap->va_ctime.tv_sec;
 	sb->st_ctim.tv_nsec = vap->va_ctime.tv_nsec;
 	sb->st_birthtim.tv_sec = vap->va_birthtime.tv_sec;
 	sb->st_birthtim.tv_nsec = vap->va_birthtime.tv_nsec;
 
 	/*
 	 * According to www.opengroup.org, the meaning of st_blksize is
 	 *   "a filesystem-specific preferred I/O block size for this
 	 *    object.  In some filesystem types, this may vary from file
 	 *    to file"
 	 * Use minimum/default of PAGE_SIZE (e.g. for VCHR).
 	 */
 
 	sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
 	sb->st_flags = vap->va_flags;
 	/* st_blocks is expressed in units of S_BLKSIZE bytes. */
 	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
 	sb->st_gen = vap->va_gen;
 out:
 	return (vop_stat_helper_post(a, error));
 }
Index: projects/clang1100-import/sys/kern/vfs_lookup.c
===================================================================
--- projects/clang1100-import/sys/kern/vfs_lookup.c	(revision 364278)
+++ projects/clang1100-import/sys/kern/vfs_lookup.c	(revision 364279)
@@ -1,1548 +1,1547 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_lookup.c	8.4 (Berkeley) 2/16/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/capsicum.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/filedesc.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 #ifdef INVARIANTS
 #include <machine/_inttypes.h>
 #endif
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/uma.h>
 
 #define	NAMEI_DIAGNOSTIC 1
 #undef NAMEI_DIAGNOSTIC
 
 /*
  * Tracing probes for path lookup.  The entry probe is fired from
  * namei_setup() with the starting vnode, the path buffer, the
  * component flags and a boolean; the return probe is fired from
  * namei() with the error, the resulting vnode (or NULL) and a boolean.
  */
 SDT_PROVIDER_DEFINE(vfs);
 SDT_PROBE_DEFINE4(vfs, namei, lookup, entry, "struct vnode *", "char *",
     "unsigned long", "bool");
 SDT_PROBE_DEFINE3(vfs, namei, lookup, return, "int", "struct vnode *", "bool");
 
 /* Allocation zone for namei. */
 uma_zone_t namei_zone;
 
 /* Placeholder vnode for mp traversal. */
 static struct vnode *vp_crossmp;
 
 /*
  * islocked method of the crossmp vnode: the lock state is always
  * reported as LK_SHARED, whatever the arguments.
  */
 static int
 crossmp_vop_islocked(struct vop_islocked_args *ap)
 {
 	return (LK_SHARED);
 }
 
 /*
  * lock1 method of the crossmp vnode.  No lock primitive is invoked
  * here: the function validates the request, performs the witness and
  * lock-log bookkeeping, and releases the vnode interlock when the
  * caller passed LK_INTERLOCK.
  *
  * Only requests that include LK_SHARED are accepted; anything else
  * panics.  Always returns 0.
  */
 static int
 crossmp_vop_lock1(struct vop_lock1_args *ap)
 {
 	struct vnode *vp;
 	/*
 	 * Marked __unused presumably because the WITNESS_* / LOCK_LOG_*
 	 * macros below can expand to nothing in some kernel
 	 * configurations -- confirm against their definitions.
 	 */
 	struct lock *lk __unused;
 	const char *file __unused;
 	int flags, line __unused;
 
 	vp = ap->a_vp;
 	lk = vp->v_vnlock;
 	flags = ap->a_flags;
 	file = ap->a_file;
 	line = ap->a_line;
 
 	if ((flags & LK_SHARED) == 0)
 		panic("invalid lock request for crossmp");
 
 	WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line,
 	    flags & LK_INTERLOCK ? &VI_MTX(vp)->lock_object : NULL);
 	WITNESS_LOCK(&lk->lock_object, 0, file, line);
 	/* Honour LK_INTERLOCK: the caller handed us the interlock held. */
 	if ((flags & LK_INTERLOCK) != 0)
 		VI_UNLOCK(vp);
 	LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, ap->a_file, line);
 	return (0);
 }
 
 /*
  * unlock method of the crossmp vnode: the counterpart of
  * crossmp_vop_lock1().  Only the witness and lock-log bookkeeping is
  * performed; no lock primitive is invoked.  Always returns 0.
  */
 static int
 crossmp_vop_unlock(struct vop_unlock_args *ap)
 {
 	struct vnode *vp;
 	/* See crossmp_vop_lock1() for why this may end up unused. */
 	struct lock *lk __unused;
 
 	vp = ap->a_vp;
 	lk = vp->v_vnlock;
 
 	WITNESS_UNLOCK(&lk->lock_object, 0, LOCK_FILE, LOCK_LINE);
 	LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, LOCK_FILE,
 	    LOCK_LINE);
 	return (0);
 }
 
 /*
  * Operations vector for the crossmp placeholder vnode.  Only the three
  * lock-related methods are provided here; vop_default points at
  * default_vnodeops (presumably the fallback for every other
  * operation -- confirm against the vop_vector dispatch code).
  */
 static struct vop_vector crossmp_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_islocked =		crossmp_vop_islocked,
 	.vop_lock1 =		crossmp_vop_lock1,
 	.vop_unlock =		crossmp_vop_unlock,
 };
 /*
  * VFS_VOP_VECTOR_REGISTER(crossmp_vnodeops) is not used here since the vnode
  * gets allocated early. See nameiinit for the direct call below.
  */
 
 /*
  * One directory recorded while a lookup tracks ".." traversal.
  * Elements are appended to ndp->ni_cap_tracker by
  * nameicap_tracker_add(), which takes a hold on the vnode with
  * vhold(); nameicap_cleanup() drops the hold and frees the element.
  */
 struct nameicap_tracker {
 	struct vnode *dp;			/* held directory vnode */
 	TAILQ_ENTRY(nameicap_tracker) nm_link;	/* ni_cap_tracker linkage */
 };
 
 /* Zone for cap mode tracker elements used for dotdot capability checks. */
 static uma_zone_t nt_zone;
 
 /*
  * One-time initialization, registered with SYSINIT below: create the
  * zone for pathname buffers (MAXPATHLEN bytes each) and the zone for
  * nameicap_tracker elements, register the crossmp operations vector
  * and allocate the crossmp placeholder vnode.
  *
  * The return value of getnewvnode() is not checked.
  */
 static void
 nameiinit(void *dummy __unused)
 {
 
 	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 	nt_zone = uma_zcreate("rentr", sizeof(struct nameicap_tracker),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/* Registered by hand; see the note above crossmp_vnodeops. */
 	vfs_vector_op_register(&crossmp_vnodeops);
 	getnewvnode("crossmp", NULL, &crossmp_vnodeops, &vp_crossmp);
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL);
 
 /*
  * vfs.lookup_cap_dotdot: when non-zero, strict-relative lookups get
  * NI_LCF_CAP_DOTDOT set in namei_setup(), which turns on the tracking
  * of visited directories used to vet ".." components.
  */
 static int lookup_cap_dotdot = 1;
 SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN,
     &lookup_cap_dotdot, 0,
     "enables \"..\" components in path lookup in capability mode");
 /*
  * vfs.lookup_cap_dotdot_nonlocal: when zero, nameicap_check_dotdot()
  * rejects ".." on mounts that lack MNT_LOCAL with ENOTCAPABLE.
  */
 static int lookup_cap_dotdot_nonlocal = 1;
 SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN,
     &lookup_cap_dotdot_nonlocal, 0,
     "enables \"..\" components in path lookup in capability mode "
     "on non-local mount");
 
 /*
  * Remember directory `dp' as visited by this lookup so that a later
  * ".." leading back to it can be accepted by nameicap_check_dotdot().
  *
  * Nothing is recorded unless ".." tracking is enabled for the lookup
  * and dp is a directory.  For an absolute BENEATH lookup that has not
  * latched yet, recording starts only once dp is the latch directory.
  */
 static void
 nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp)
 {
 	struct nameicap_tracker *entry;
 
 	if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0)
 		return;
 	if (dp->v_type != VDIR)
 		return;
 
 	if ((ndp->ni_lcf & (NI_LCF_BENEATH_ABS | NI_LCF_BENEATH_LATCHED)) ==
 	    NI_LCF_BENEATH_ABS) {
 		MPASS((ndp->ni_lcf & NI_LCF_LATCH) != 0);
 		if (dp != ndp->ni_beneath_latch)
 			return;
 		ndp->ni_lcf |= NI_LCF_BENEATH_LATCHED;
 	}
 
 	entry = uma_zalloc(nt_zone, M_WAITOK);
 	entry->dp = dp;
 	vhold(dp);
 	TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, entry, nm_link);
 }
 
 /*
  * Empty the list of tracked directories, dropping the hold on each
  * vnode and freeing its element.  When clean_latch is true and a
  * BENEATH latch vnode is recorded (NI_LCF_LATCH), also clear the flag
  * and release that vnode.
  */
 static void
 nameicap_cleanup(struct nameidata *ndp, bool clean_latch)
 {
 	struct nameicap_tracker *nt;
 
 	KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) ||
 	    (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative"));
 
 	/* Drain from the head until nothing is left. */
 	while ((nt = TAILQ_FIRST(&ndp->ni_cap_tracker)) != NULL) {
 		TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link);
 		vdrop(nt->dp);
 		uma_zfree(nt_zone, nt);
 	}
 
 	if (!clean_latch || (ndp->ni_lcf & NI_LCF_LATCH) == 0)
 		return;
 	ndp->ni_lcf &= ~NI_LCF_LATCH;
 	vrele(ndp->ni_beneath_latch);
 }
 
 /*
  * For dotdot lookups in capability mode, only allow the component
  * lookup to succeed if the resulting directory was already traversed
  * during the operation.  Also fail dotdot lookups for non-local
  * filesystems, where external agents might assist local lookups to
  * escape the compartment.
  */
 static int
 nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp)
 {
 	struct nameicap_tracker *nt;
 	struct mount *mp;
 
 	if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp == NULL ||
 	    dp->v_type != VDIR)
 		return (0);
 	mp = dp->v_mount;
 	if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL &&
 	    (mp->mnt_flag & MNT_LOCAL) == 0)
 		return (ENOTCAPABLE);
 	TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head,
 	    nm_link) {
 		if (dp == nt->dp)
 			return (0);
 	}
 	if ((ndp->ni_lcf & NI_LCF_BENEATH_ABS) != 0) {
 		ndp->ni_lcf &= ~NI_LCF_BENEATH_LATCHED;
 		nameicap_cleanup(ndp, false);
 		return (0);
 	}
 	return (ENOTCAPABLE);
 }
 
 /*
  * Return the component name's pathname buffer to namei_zone.  With
  * DIAGNOSTIC the buffer and name pointers are also cleared so that the
  * stale pointers are not left behind in the componentname.
  */
 static void
 namei_cleanup_cnp(struct componentname *cnp)
 {
 
 	uma_zfree(namei_zone, cnp->cn_pnbuf);
 #ifdef DIAGNOSTIC
 	cnp->cn_pnbuf = NULL;
 	cnp->cn_nameptr = NULL;
 #endif
 }
 
 /*
  * Restart the walk at the root directory for a path (or symlink
  * target) that begins with '/'.
  *
  * Strict-relative lookups are refused with ENOTCAPABLE.  Otherwise the
  * leading slashes are consumed from cn_nameptr/ni_pathlen and *dpp is
  * set to ni_rootdir with a reference taken by vrefact().  For BENEATH
  * lookups the absolute-path state is (re)armed and the tracked
  * directories are discarded.  Returns 0 on success.
  */
 static int
 namei_handle_root(struct nameidata *ndp, struct vnode **dpp)
 {
 	struct componentname *cnp;
 	struct vnode *root;
 
 	if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0) {
 #ifdef KTRACE
 		if (KTRPOINT(curthread, KTR_CAPFAIL))
 			ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
 #endif
 		return (ENOTCAPABLE);
 	}
 
 	cnp = &ndp->ni_cnd;
 	if ((cnp->cn_flags & BENEATH) != 0) {
 		ndp->ni_lcf |= NI_LCF_BENEATH_ABS;
 		ndp->ni_lcf &= ~NI_LCF_BENEATH_LATCHED;
 		nameicap_cleanup(ndp, false);
 	}
 
 	/* Skip every leading slash. */
 	for (; *cnp->cn_nameptr == '/'; cnp->cn_nameptr++)
 		ndp->ni_pathlen--;
 
 	root = ndp->ni_rootdir;
 	vrefact(root);
 	*dpp = root;
 	return (0);
 }
 
 /*
  * Choose the directory vnode from which the locked lookup starts and
  * set up the per-lookup capability/BENEATH state in ndp->ni_lcf.
  *
  * ndp	lookup state; ni_rootdir, ni_topdir, ni_lcf, ni_resflags and
  *	possibly ni_filecaps / ni_beneath_latch are filled in
  * dpp	out: starting directory vnode (NULL on failure)
  * pwdp	out: held struct pwd on success (NULL on failure)
  *
  * Returns 0 on success.  On failure an errno is returned and the
  * references taken here (the starting vnode and the pwd) are dropped
  * before returning.
  */
 static int
 namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp)
 {
 	struct componentname *cnp;
 	struct file *dfp;
 	struct thread *td;
 	struct pwd *pwd;
 	cap_rights_t rights;
 	struct filecaps dirfd_caps;
 	int error;
 	bool startdir_used;
 
 	cnp = &ndp->ni_cnd;
 	td = cnp->cn_thread;
 
 	startdir_used = false;
 	*pwdp = NULL;
 	*dpp = NULL;
 
 #ifdef CAPABILITY_MODE
 	/*
 	 * In capability mode, lookups must be restricted to happen in
 	 * the subtree with the root specified by the file descriptor:
 	 * - The root must be real file descriptor, not the pseudo-descriptor
 	 *   AT_FDCWD.
 	 * - The passed path must be relative and not absolute.
 	 * - If lookup_cap_dotdot is disabled, path must not contain the
 	 *   '..' components.
 	 * - If lookup_cap_dotdot is enabled, we verify that all '..'
 	 *   components lookups result in the directories which were
 	 *   previously walked by us, which prevents an escape from
 	 *   the relative root.
 	 */
 	if (IN_CAPABILITY_MODE(td) && (cnp->cn_flags & NOCAPCHECK) == 0) {
 		ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
 		if (ndp->ni_dirfd == AT_FDCWD) {
 #ifdef KTRACE
 			if (KTRPOINT(td, KTR_CAPFAIL))
 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
 #endif
 			return (ECAPMODE);
 		}
 	}
 #endif
 	error = 0;
 
 	/*
 	 * Get starting point for the translation.
 	 */
 	pwd = pwd_hold(td);
 	/*
 	 * The reference on ni_rootdir is acquired in the block below to avoid
 	 * back-to-back atomics for absolute lookups.
 	 */
 	ndp->ni_rootdir = pwd->pwd_rdir;
 	ndp->ni_topdir = pwd->pwd_jdir;
 
 	if (cnp->cn_pnbuf[0] == '/') {
 		/* Absolute path: start from the root directory. */
 		ndp->ni_resflags |= NIRES_ABS;
 		error = namei_handle_root(ndp, dpp);
 	} else {
 		/*
 		 * Relative path.  In order of preference the start is the
 		 * caller-supplied ni_startdir, the current directory
 		 * (AT_FDCWD), or the vnode behind the ni_dirfd descriptor.
 		 */
 		if (ndp->ni_startdir != NULL) {
 			*dpp = ndp->ni_startdir;
 			startdir_used = true;
 		} else if (ndp->ni_dirfd == AT_FDCWD) {
 			*dpp = pwd->pwd_cdir;
 			vrefact(*dpp);
 		} else {
 			rights = *ndp->ni_rightsneeded;
 			cap_rights_set_one(&rights, CAP_LOOKUP);
 
 			if (cnp->cn_flags & AUDITVNODE1)
 				AUDIT_ARG_ATFD1(ndp->ni_dirfd);
 			if (cnp->cn_flags & AUDITVNODE2)
 				AUDIT_ARG_ATFD2(ndp->ni_dirfd);
 			/*
 			 * Effectively inlined fgetvp_rights, because we need to
 			 * inspect the file as well as grabbing the vnode.
 			 */
 			error = fget_cap(td, ndp->ni_dirfd, &rights,
 			    &dfp, &ndp->ni_filecaps);
 			if (error != 0) {
 				/*
 				 * Preserve the error; it should either be EBADF
 				 * or capability-related, both of which can be
 				 * safely returned to the caller.
 				 */
 			} else {
 				if (dfp->f_ops == &badfileops) {
 					error = EBADF;
 				} else if (dfp->f_vnode == NULL) {
 					error = ENOTDIR;
 				} else {
 					*dpp = dfp->f_vnode;
 					vrefact(*dpp);
 
 					/*
 					 * A descriptor opened with FSEARCH
 					 * lets the lookup skip the exec check.
 					 */
 					if ((dfp->f_flag & FSEARCH) != 0)
 						cnp->cn_flags |= NOEXECCHECK;
 				}
 				fdrop(dfp, td);
 			}
 #ifdef CAPABILITIES
 			/*
 			 * If file descriptor doesn't have all rights,
 			 * all lookups relative to it must also be
 			 * strictly relative.
 			 */
 			CAP_ALL(&rights);
 			if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights,
 			    &rights) ||
 			    ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
 			    ndp->ni_filecaps.fc_nioctls != -1) {
 				ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
 			}
 #endif
 		}
 		if (error == 0 && (*dpp)->v_type != VDIR)
 			error = ENOTDIR;
 	}
 	/*
 	 * BENEATH: record the directory the lookup must stay beneath
 	 * (the "latch"): the current directory for AT_FDCWD, otherwise
 	 * the vnode behind ni_dirfd.
 	 */
 	if (error == 0 && (cnp->cn_flags & BENEATH) != 0) {
 		if (ndp->ni_dirfd == AT_FDCWD) {
 			ndp->ni_beneath_latch = pwd->pwd_cdir;
 			vrefact(ndp->ni_beneath_latch);
 		} else {
 			rights = *ndp->ni_rightsneeded;
 			cap_rights_set_one(&rights, CAP_LOOKUP);
 			error = fgetvp_rights(td, ndp->ni_dirfd, &rights,
 			    &dirfd_caps, &ndp->ni_beneath_latch);
 			/*
 			 * NOTE(review): this tests *dpp, which was already
 			 * checked for VDIR above on the relative path,
 			 * rather than the vnode just stored in
 			 * ni_beneath_latch -- confirm which vnode the
 			 * directory check is meant for.
 			 */
 			if (error == 0 && (*dpp)->v_type != VDIR) {
 				vrele(ndp->ni_beneath_latch);
 				error = ENOTDIR;
 			}
 		}
 		if (error == 0)
 			ndp->ni_lcf |= NI_LCF_LATCH;
 	}
 	/*
 	 * If we are auditing the kernel pathname, save the user pathname.
 	 */
 	if (cnp->cn_flags & AUDITVNODE1)
 		AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
 	if (cnp->cn_flags & AUDITVNODE2)
 		AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
 	/* A supplied ni_startdir that was not used is released here. */
 	if (ndp->ni_startdir != NULL && !startdir_used)
 		vrele(ndp->ni_startdir);
 	if (error != 0) {
 		/* Undo the references taken above before failing. */
 		if (*dpp != NULL)
 			vrele(*dpp);
 		pwd_drop(pwd);
 		return (error);
 	}
 	MPASS((ndp->ni_lcf & (NI_LCF_BENEATH_ABS | NI_LCF_LATCH)) !=
 	    NI_LCF_BENEATH_ABS);
 	/*
 	 * Turn on ".." tracking either for strict-relative lookups (when
 	 * the lookup_cap_dotdot sysctl allows it) or for BENEATH lookups
 	 * that are not strict-relative.
 	 */
 	if (((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0 &&
 	    lookup_cap_dotdot != 0) ||
 	    ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) == 0 &&
 	    (cnp->cn_flags & BENEATH) != 0))
 		ndp->ni_lcf |= NI_LCF_CAP_DOTDOT;
 	SDT_PROBE4(vfs, namei, lookup, entry, *dpp, cnp->cn_pnbuf,
 	    cnp->cn_flags, false);
 	*pwdp = pwd;
 	return (0);
 }
 
 /*
  * Convert a pathname into a pointer to a locked vnode.
  *
  * The FOLLOW flag is set when symbolic links are to be followed
  * when they occur at the end of the name translation process.
  * Symbolic links are always followed for all other pathname
  * components other than the last.
  *
  * The segflg defines whether the name is to be copied from user
  * space or kernel space.
  *
  * Overall outline of namei:
  *
  *	copy in name
  *	get starting directory
  *	while (!done && !error) {
  *		call lookup to search path.
  *		if symbolic link, massage name in buffer and continue
  *	}
  */
 int
 namei(struct nameidata *ndp)
 {
 	char *cp;		/* pointer into pathname argument */
 	struct vnode *dp;	/* the directory we are searching */
 	struct iovec aiov;		/* uio for reading symbolic links */
 	struct componentname *cnp;
 	struct thread *td;
 	struct pwd *pwd;
 	struct uio auio;
 	int error, linklen;
 	enum cache_fpl_status status;
 
 	cnp = &ndp->ni_cnd;
 	td = cnp->cn_thread;
 	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
 	KASSERT(cnp->cn_cred && td->td_proc, ("namei: bad cred/proc"));
 	KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
 	    ("namei: nameiop contaminated with flags"));
 	KASSERT((cnp->cn_flags & OPMASK) == 0,
 	    ("namei: flags contaminated with nameiops"));
 	KASSERT((cnp->cn_flags & NAMEI_INTERNAL_FLAGS) == 0,
 	    ("namei: unexpected flags: %" PRIx64 "\n",
 	    cnp->cn_flags & NAMEI_INTERNAL_FLAGS));
 	if (cnp->cn_flags & NOCACHE)
 		KASSERT(cnp->cn_nameiop != LOOKUP,
 		    ("%s: NOCACHE passed with LOOKUP", __func__));
 	MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
 	    ndp->ni_startdir->v_type == VBAD);
 
 	ndp->ni_lcf = 0;
 	ndp->ni_vp = NULL;
 
 	/*
 	 * Get a buffer for the name to be translated, and copy the
 	 * name into the buffer.
 	 */
-	if ((cnp->cn_flags & HASBUF) == 0)
-		cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
+	cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
 	/* ni_segflg says whether ni_dirp is a kernel or a user address. */
 	if (ndp->ni_segflg == UIO_SYSSPACE)
 		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
 		    &ndp->ni_pathlen);
 	else
 		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
 		    &ndp->ni_pathlen);
 
 	if (__predict_false(error != 0)) {
 		namei_cleanup_cnp(cnp);
 		return (error);
 	}
 
 	/*
 	 * Don't allow empty pathnames.
 	 */
 	if (__predict_false(*cnp->cn_pnbuf == '\0')) {
 		namei_cleanup_cnp(cnp);
 		return (ENOENT);
 	}
 
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_NAMEI)) {
 		KASSERT(cnp->cn_thread == curthread,
 		    ("namei not using curthread"));
 		ktrnamei(cnp->cn_pnbuf);
 	}
 #endif
 
 	cnp->cn_nameptr = cnp->cn_pnbuf;
 
 	/*
 	 * First try looking up the target without locking any vnodes.
 	 *
 	 * We may need to start from scratch or pick up where it left off.
 	 */
 	error = cache_fplookup(ndp, &status, &pwd);
 	switch (status) {
 	case CACHE_FPL_STATUS_UNSET:
 		__assert_unreachable();
 		break;
 	case CACHE_FPL_STATUS_HANDLED:
 		/* The lockless lookup produced the final answer. */
 		return (error);
 	case CACHE_FPL_STATUS_PARTIAL:
 		/*
 		 * Continue with the locked lookup from the directory left
 		 * in ni_startdir; pwd is presumably the one handed back
 		 * through &pwd above -- confirm in cache_fplookup().
 		 */
 		TAILQ_INIT(&ndp->ni_cap_tracker);
 		dp = ndp->ni_startdir;
 		break;
 	case CACHE_FPL_STATUS_ABORTED:
 		/* Start from scratch: pick the starting directory here. */
 		TAILQ_INIT(&ndp->ni_cap_tracker);
 		error = namei_setup(ndp, &dp, &pwd);
 		if (error != 0) {
 			namei_cleanup_cnp(cnp);
 			return (error);
 		}
 		break;
 	}
 
 	ndp->ni_loopcnt = 0;
 
 	/*
 	 * Locked lookup.
 	 */
 	for (;;) {
 		ndp->ni_startdir = dp;
 		error = lookup(ndp);
 		if (error != 0)
 			goto out;
 		/*
 		 * If not a symbolic link, we're done.
 		 */
 		if ((cnp->cn_flags & ISSYMLINK) == 0) {
 			/*
 			 * The path buffer is kept (and flagged HASBUF) only
 			 * when the caller asked for SAVENAME or SAVESTART.
 			 */
 			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) {
 				namei_cleanup_cnp(cnp);
 			} else
 				cnp->cn_flags |= HASBUF;
 			/*
 			 * An absolute BENEATH lookup that never latched is
 			 * turned into an ENOTCAPABLE failure.
 			 */
 			if ((ndp->ni_lcf & (NI_LCF_BENEATH_ABS |
 			    NI_LCF_BENEATH_LATCHED)) == NI_LCF_BENEATH_ABS) {
 				NDFREE(ndp, 0);
 				error = ENOTCAPABLE;
 			}
 			nameicap_cleanup(ndp, true);
 			SDT_PROBE3(vfs, namei, lookup, return, error,
 			    (error == 0 ? ndp->ni_vp : NULL), false);
 			pwd_drop(pwd);
 			return (error);
 		}
 		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
 			error = ELOOP;
 			break;
 		}
 #ifdef MAC
 		if ((cnp->cn_flags & NOMACCHECK) == 0) {
 			error = mac_vnode_check_readlink(td->td_ucred,
 			    ndp->ni_vp);
 			if (error != 0)
 				break;
 		}
 #endif
 		/*
 		 * When path components remain after the symlink
 		 * (ni_pathlen > 1) the link is read into a fresh buffer so
 		 * the remainder can be appended to it; otherwise the link
 		 * text is read straight into the existing buffer.
 		 */
 		if (ndp->ni_pathlen > 1)
 			cp = uma_zalloc(namei_zone, M_WAITOK);
 		else
 			cp = cnp->cn_pnbuf;
 		aiov.iov_base = cp;
 		aiov.iov_len = MAXPATHLEN;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_td = td;
 		auio.uio_resid = MAXPATHLEN;
 		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
 		if (error != 0) {
 			if (ndp->ni_pathlen > 1)
 				uma_zfree(namei_zone, cp);
 			break;
 		}
 		linklen = MAXPATHLEN - auio.uio_resid;
 		/* An empty link target cannot be resolved. */
 		if (linklen == 0) {
 			if (ndp->ni_pathlen > 1)
 				uma_zfree(namei_zone, cp);
 			error = ENOENT;
 			break;
 		}
 		/* Link text plus the unconsumed remainder must still fit. */
 		if (linklen + ndp->ni_pathlen > MAXPATHLEN) {
 			if (ndp->ni_pathlen > 1)
 				uma_zfree(namei_zone, cp);
 			error = ENAMETOOLONG;
 			break;
 		}
 		if (ndp->ni_pathlen > 1) {
 			/* Append the remainder and switch to the new buffer. */
 			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
 			uma_zfree(namei_zone, cnp->cn_pnbuf);
 			cnp->cn_pnbuf = cp;
 		} else
 			cnp->cn_pnbuf[linklen] = '\0';
 		ndp->ni_pathlen += linklen;
 		vput(ndp->ni_vp);
 		/* Resume from the directory that contained the symlink. */
 		dp = ndp->ni_dvp;
 		/*
 		 * Check if root directory should replace current directory.
 		 */
 		cnp->cn_nameptr = cnp->cn_pnbuf;
 		if (*(cnp->cn_nameptr) == '/') {
 			vrele(dp);
 			error = namei_handle_root(ndp, &dp);
 			if (error != 0)
 				goto out;
 		}
 	}
 	/*
 	 * Reached only by breaking out of the loop while processing a
 	 * symlink: release the symlink vnode and its parent directory.
 	 */
 	vput(ndp->ni_vp);
 	ndp->ni_vp = NULL;
 	vrele(ndp->ni_dvp);
 out:
 	/* Common failure exit: free the path buffer and tracking state. */
 	MPASS(error != 0);
 	namei_cleanup_cnp(cnp);
 	nameicap_cleanup(ndp, true);
 	SDT_PROBE3(vfs, namei, lookup, return, error, NULL, false);
 	pwd_drop(pwd);
 	return (error);
 }
 
 static int
 compute_cn_lkflags(struct mount *mp, int lkflags, int cnflags)
 {
 
 	if (mp == NULL || ((lkflags & LK_SHARED) &&
 	    (!(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) ||
 	    ((cnflags & ISDOTDOT) &&
 	    (mp->mnt_kern_flag & MNTK_LOOKUP_EXCL_DOTDOT))))) {
 		lkflags &= ~LK_SHARED;
 		lkflags |= LK_EXCLUSIVE;
 	}
 	lkflags |= LK_NODDLKTREAT;
 	return (lkflags);
 }
 
 /*
  * Decide whether the leaf vnode has to be locked exclusively.
  * Returns non-zero when it does, 0 when a shared lock is enough or
  * the question does not apply.
  */
 static __inline int
 needs_exclusive_leaf(struct mount *mp, int flags)
 {
 
 	/*
 	 * Only a last component that is to be returned locked matters;
 	 * intermediate nodes can always use shared locks.
 	 */
 	if ((flags & ISLASTCN) != ISLASTCN ||
 	    (flags & LOCKLEAF) != LOCKLEAF)
 		return (0);
 
 	/* Without LOCKSHARED the leaf lock is always exclusive. */
 	if ((flags & LOCKSHARED) == 0)
 		return (1);
 
 	/*
 	 * LOCKSHARED outside of open() only needs a shared lock.  During
 	 * open() a shared lock is used only when the mount supports
 	 * extended shared operations.
 	 */
 	if ((flags & ISOPEN) == 0)
 		return (0);
 	return (!MNT_EXTENDED_SHARED(mp));
 }
 
 /*
  * Search a pathname.
  * This is a very central and rather complicated routine.
  *
  * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
  * The starting directory is taken from ni_startdir. The pathname is
  * descended until done, or a symbolic link is encountered. The variable
  * ni_more is clear if the path is completed; it is set to one if a
  * symbolic link needing interpretation is encountered.
  *
  * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
  * whether the name is to be looked up, created, renamed, or deleted.
  * When CREATE, RENAME, or DELETE is specified, information usable in
  * creating, renaming, or deleting a directory entry may be calculated.
  * If flag has LOCKPARENT or'ed into it, the parent directory is returned
  * locked. If flag has WANTPARENT or'ed into it, the parent directory is
  * returned unlocked. Otherwise the parent directory is not returned. If
  * the target of the pathname exists and LOCKLEAF is or'ed into the flag
  * the target is returned locked, otherwise it is returned unlocked.
  * When creating or renaming and LOCKPARENT is specified, the target may not
  * be ".".  When deleting and LOCKPARENT is specified, the target may be ".".
  *
  * Overall outline of lookup:
  *
  * dirloop:
  *	identify next component of name at ndp->ni_ptr
  *	handle degenerate case where name is null string
  *	if .. and crossing mount points and on mounted filesys, find parent
  *	call VOP_LOOKUP routine for next component name
  *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
  *	    component vnode returned in ni_vp (if it exists), locked.
  *	if result vnode is mounted on and crossing mount points,
  *	    find mounted on vnode
  *	if more components of name, do next level at dirloop
  *	return the answer in ni_vp, locked if LOCKLEAF set
  *	    if LOCKPARENT set, return locked parent in ni_dvp
  *	    if WANTPARENT set, return unlocked parent in ni_dvp
  */
 /*
  * Resolve the pathname components at cnp->cn_nameptr, starting from the
  * directory handed over in ndp->ni_startdir.  Returns 0 on success
  * (ndp->ni_vp may be NULL when VOP_LOOKUP() answered EJUSTRETURN for a
  * not-yet-existing last component) or an errno value, in which case
  * ndp->ni_vp is set to NULL.
  */
 int
 lookup(struct nameidata *ndp)
 {
 	char *cp;			/* pointer into pathname argument */
 	char *prev_ni_next;		/* saved ndp->ni_next */
 	struct vnode *dp = NULL;	/* the directory we are searching */
 	struct vnode *tdp;		/* saved dp */
 	struct mount *mp;		/* mount table entry */
 	struct prison *pr;
 	size_t prev_ni_pathlen;		/* saved ndp->ni_pathlen */
 	int docache;			/* == 0 do not cache last component */
 	int wantparent;			/* 1 => wantparent or lockparent flag */
 	int rdonly;			/* lookup read-only flag bit */
 	int error = 0;
 	int dpunlocked = 0;		/* dp has already been unlocked */
 	int relookup = 0;		/* do not consume the path component */
 	struct componentname *cnp = &ndp->ni_cnd;
 	int lkflags_save;
 	int ni_dvp_unlocked;		/* for bad2: 1 = unlocked, 2 = released */
 
 	/*
 	 * Setup: break out flag bits into variables.
 	 */
 	ni_dvp_unlocked = 0;
 	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
 	KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
 	    ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
 	/* docache ends up non-zero exactly when NOCACHE is clear. */
 	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
 	if (cnp->cn_nameiop == DELETE ||
 	    (wantparent && cnp->cn_nameiop != CREATE &&
 	     cnp->cn_nameiop != LOOKUP))
 		docache = 0;
 	rdonly = cnp->cn_flags & RDONLY;
 	cnp->cn_flags &= ~ISSYMLINK;
 	ndp->ni_dvp = NULL;
 	/*
 	 * We use shared locks until we hit the parent of the last cn then
 	 * we adjust based on the requesting flags.
 	 */
 	cnp->cn_lkflags = LK_SHARED;
 	/* Take the start directory over from ni_startdir and lock it. */
 	dp = ndp->ni_startdir;
 	ndp->ni_startdir = NULLVP;
 	vn_lock(dp,
 	    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY,
 	    cnp->cn_flags));
 
 dirloop:
 	/*
 	 * Search a new directory.
 	 *
 	 * The last component of the filename is left accessible via
 	 * cnp->cn_nameptr for callers that need the name. Callers needing
 	 * the name set the SAVENAME flag. When done, they assume
 	 * responsibility for freeing the pathname buffer.
 	 */
 	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
 		continue;
 	cnp->cn_namelen = cp - cnp->cn_nameptr;
 	if (cnp->cn_namelen > NAME_MAX) {
 		error = ENAMETOOLONG;
 		goto bad;
 	}
 #ifdef NAMEI_DIAGNOSTIC
 	{ char c = *cp;
 	*cp = '\0';
 	printf("{%s}: ", cnp->cn_nameptr);
 	*cp = c; }
 #endif
 	/*
 	 * Remember the pre-component path state; the relookup case at
 	 * nextname restores it so the same component is parsed again.
 	 */
 	prev_ni_pathlen = ndp->ni_pathlen;
 	ndp->ni_pathlen -= cnp->cn_namelen;
 	KASSERT(ndp->ni_pathlen <= PATH_MAX,
 	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
 	prev_ni_next = ndp->ni_next;
 	ndp->ni_next = cp;
 
 	/*
 	 * Replace multiple slashes by a single slash and trailing slashes
 	 * by a null.  This must be done before VOP_LOOKUP() because some
 	 * fs's don't know about trailing slashes.  Remember if there were
 	 * trailing slashes to handle symlinks, existing non-directories
 	 * and non-existing files that won't be directories specially later.
 	 */
 	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
 		cp++;
 		ndp->ni_pathlen--;
 		if (*cp == '\0') {
 			*ndp->ni_next = '\0';
 			cnp->cn_flags |= TRAILINGSLASH;
 		}
 	}
 	ndp->ni_next = cp;
 
 	/* Derive the per-component flags from the parsed name. */
 	cnp->cn_flags |= MAKEENTRY;
 	if (*cp == '\0' && docache == 0)
 		cnp->cn_flags &= ~MAKEENTRY;
 	if (cnp->cn_namelen == 2 &&
 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
 		cnp->cn_flags |= ISDOTDOT;
 	else
 		cnp->cn_flags &= ~ISDOTDOT;
 	if (*ndp->ni_next == 0)
 		cnp->cn_flags |= ISLASTCN;
 	else
 		cnp->cn_flags &= ~ISLASTCN;
 
 	/* A final "." component cannot be the target of DELETE or RENAME. */
 	if ((cnp->cn_flags & ISLASTCN) != 0 &&
 	    cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		error = EINVAL;
 		goto bad;
 	}
 
 	nameicap_tracker_add(ndp, dp);
 
 	/*
 	 * Check for degenerate name (e.g. / or "")
 	 * which is a way of talking about a directory,
 	 * e.g. like "/." or ".".
 	 */
 	if (cnp->cn_nameptr[0] == '\0') {
 		if (dp->v_type != VDIR) {
 			error = ENOTDIR;
 			goto bad;
 		}
 		if (cnp->cn_nameiop != LOOKUP) {
 			error = EISDIR;
 			goto bad;
 		}
 		/* The directory is both parent and result; dvp gets its own ref. */
 		if (wantparent) {
 			ndp->ni_dvp = dp;
 			VREF(dp);
 		}
 		ndp->ni_vp = dp;
 
 		if (cnp->cn_flags & AUDITVNODE1)
 			AUDIT_ARG_VNODE1(dp);
 		else if (cnp->cn_flags & AUDITVNODE2)
 			AUDIT_ARG_VNODE2(dp);
 
 		if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
 			VOP_UNLOCK(dp);
 		/* XXX This should probably move to the top of function. */
 		if (cnp->cn_flags & SAVESTART)
 			panic("lookup: SAVESTART");
 		goto success;
 	}
 
 	/*
 	 * Handle "..": five special cases.
 	 * 0. If doing a capability lookup and lookup_cap_dotdot is
 	 *    disabled, return ENOTCAPABLE.
 	 * 1. Return an error if this is the last component of
 	 *    the name and the operation is DELETE or RENAME.
 	 * 2. If at root directory (e.g. after chroot)
 	 *    or at absolute root directory
 	 *    then ignore it so can't get out.
 	 * 3. If this vnode is the root of a mounted
 	 *    filesystem, then replace it with the
 	 *    vnode which was mounted on so we take the
 	 *    .. in the other filesystem.
 	 * 4. If the vnode is the top directory of
 	 *    the jail or chroot, don't let them out.
 	 * 5. If doing a capability lookup and lookup_cap_dotdot is
 	 *    enabled, return ENOTCAPABLE if the lookup would escape
 	 *    from the initial file descriptor directory.  Checks are
 	 *    done by ensuring that namei() already traversed the
 	 *    result of dotdot lookup.
 	 */
 	if (cnp->cn_flags & ISDOTDOT) {
 		if ((ndp->ni_lcf & (NI_LCF_STRICTRELATIVE | NI_LCF_CAP_DOTDOT))
 		    == NI_LCF_STRICTRELATIVE) {
 #ifdef KTRACE
 			if (KTRPOINT(curthread, KTR_CAPFAIL))
 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
 #endif
 			error = ENOTCAPABLE;
 			goto bad;
 		}
 		if ((cnp->cn_flags & ISLASTCN) != 0 &&
 		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 			error = EINVAL;
 			goto bad;
 		}
 		for (;;) {
 			/* pr stays non-NULL iff dp is the root of this or a parent prison. */
 			for (pr = cnp->cn_cred->cr_prison; pr != NULL;
 			     pr = pr->pr_parent)
 				if (dp == pr->pr_root)
 					break;
 			if (dp == ndp->ni_rootdir || 
 			    dp == ndp->ni_topdir || 
 			    dp == rootvnode ||
 			    pr != NULL ||
 			    ((dp->v_vflag & VV_ROOT) != 0 &&
 			     (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
 				/* ".." at a boundary resolves to the directory itself. */
 				ndp->ni_dvp = dp;
 				ndp->ni_vp = dp;
 				VREF(dp);
 				goto nextname;
 			}
 			if ((dp->v_vflag & VV_ROOT) == 0)
 				break;
 			if (VN_IS_DOOMED(dp)) {	/* forced unmount */
 				error = ENOENT;
 				goto bad;
 			}
 			/* Step from the filesystem root down to the covered vnode. */
 			tdp = dp;
 			dp = dp->v_mount->mnt_vnodecovered;
 			VREF(dp);
 			vput(tdp);
 			vn_lock(dp,
 			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
 			    LK_RETRY, ISDOTDOT));
 			error = nameicap_check_dotdot(ndp, dp);
 			if (error != 0) {
 #ifdef KTRACE
 				if (KTRPOINT(curthread, KTR_CAPFAIL))
 					ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
 #endif
 				goto bad;
 			}
 		}
 	}
 
 	/*
 	 * We now have a segment name to search for, and a directory to search.
 	 */
 unionlookup:
 #ifdef MAC
 	error = mac_vnode_check_lookup(cnp->cn_thread->td_ucred, dp, cnp);
 	if (error)
 		goto bad;
 #endif
 	ndp->ni_dvp = dp;
 	ndp->ni_vp = NULL;
 	ASSERT_VOP_LOCKED(dp, "lookup");
 	/*
 	 * If we have a shared lock we may need to upgrade the lock for the
 	 * last operation.
 	 */
 	if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN) &&
 	    dp != vp_crossmp && VOP_ISLOCKED(dp) == LK_SHARED)
 		vn_lock(dp, LK_UPGRADE|LK_RETRY);
 	if (VN_IS_DOOMED(dp)) {
 		error = ENOENT;
 		goto bad;
 	}
 	/*
 	 * If we're looking up the last component and we need an exclusive
 	 * lock, adjust our lkflags.
 	 */
 	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags))
 		cnp->cn_lkflags = LK_EXCLUSIVE;
 #ifdef NAMEI_DIAGNOSTIC
 	vn_printf(dp, "lookup in ");
 #endif
 	/*
 	 * Pass the mount-adjusted lock flags to VOP_LOOKUP() only; the
 	 * unadjusted value is put back right after the call.
 	 */
 	lkflags_save = cnp->cn_lkflags;
 	cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags,
 	    cnp->cn_flags);
 	error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp);
 	cnp->cn_lkflags = lkflags_save;
 	if (error != 0) {
 		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
 #ifdef NAMEI_DIAGNOSTIC
 		printf("not found\n");
 #endif
 		/*
 		 * Not found at the root of a union mount: retry the same
 		 * component in the vnode the mount covers.
 		 */
 		if ((error == ENOENT) &&
 		    (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
 		    (dp->v_mount->mnt_flag & MNT_UNION)) {
 			tdp = dp;
 			dp = dp->v_mount->mnt_vnodecovered;
 			VREF(dp);
 			vput(tdp);
 			vn_lock(dp,
 			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
 			    LK_RETRY, cnp->cn_flags));
 			nameicap_tracker_add(ndp, dp);
 			goto unionlookup;
 		}
 
 		/*
 		 * VOP_LOOKUP() returned ERELOOKUP: present dp itself as the
 		 * result (with its own reference) and let nextname rewind
 		 * the path so this component is processed again.
 		 */
 		if (error == ERELOOKUP) {
 			vref(dp);
 			ndp->ni_vp = dp;
 			error = 0;
 			relookup = 1;
 			goto good;
 		}
 
 		if (error != EJUSTRETURN)
 			goto bad;
 		/*
 		 * At this point, we know we're at the end of the
 		 * pathname.  If creating / renaming, we can consider
 		 * allowing the file or directory to be created / renamed,
 		 * provided we're not on a read-only filesystem.
 		 */
 		if (rdonly) {
 			error = EROFS;
 			goto bad;
 		}
 		/* trailing slash only allowed for directories */
 		if ((cnp->cn_flags & TRAILINGSLASH) &&
 		    !(cnp->cn_flags & WILLBEDIR)) {
 			error = ENOENT;
 			goto bad;
 		}
 		if ((cnp->cn_flags & LOCKPARENT) == 0)
 			VOP_UNLOCK(dp);
 		/*
 		 * We return with ni_vp NULL to indicate that the entry
 		 * doesn't currently exist, leaving a pointer to the
 		 * (possibly locked) directory vnode in ndp->ni_dvp.
 		 */
 		if (cnp->cn_flags & SAVESTART) {
 			ndp->ni_startdir = ndp->ni_dvp;
 			VREF(ndp->ni_startdir);
 		}
 		goto success;
 	}
 
 good:
 #ifdef NAMEI_DIAGNOSTIC
 	printf("found\n");
 #endif
 	dp = ndp->ni_vp;
 
 	/*
 	 * Check to see if the vnode has been mounted on;
 	 * if so find the root of the mounted filesystem.
 	 */
 	while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
 	       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
 		/* Could not busy the mount: re-evaluate the loop condition. */
 		if (vfs_busy(mp, 0))
 			continue;
 		vput(dp);
 		if (dp != ndp->ni_dvp)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 		/* From here on the parent is reported as vp_crossmp. */
 		vrefact(vp_crossmp);
 		ndp->ni_dvp = vp_crossmp;
 		error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags,
 		    cnp->cn_flags), &tdp);
 		vfs_unbusy(mp);
 		if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT))
 			panic("vp_crossmp exclusively locked or reclaimed");
 		if (error) {
 			/* dp was already vput() above; bad must not do it again. */
 			dpunlocked = 1;
 			goto bad2;
 		}
 		ndp->ni_vp = dp = tdp;
 	}
 
 	/*
 	 * Check for symbolic link
 	 */
 	if ((dp->v_type == VLNK) &&
 	    ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) ||
 	     *ndp->ni_next == '/')) {
 		cnp->cn_flags |= ISSYMLINK;
 		if (VN_IS_DOOMED(dp)) {
 			/*
 			 * We can't know whether the directory was mounted with
 			 * NOSYMFOLLOW, so we can't follow safely.
 			 */
 			error = ENOENT;
 			goto bad2;
 		}
 		if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
 			error = EACCES;
 			goto bad2;
 		}
 		/*
 		 * Symlink code always expects an unlocked dvp.
 		 */
 		if (ndp->ni_dvp != ndp->ni_vp) {
 			VOP_UNLOCK(ndp->ni_dvp);
 			ni_dvp_unlocked = 1;
 		}
 		goto success;
 	}
 
 nextname:
 	/*
 	 * Not a symbolic link that we will follow.  Continue with the
 	 * next component if there is any; otherwise, we're done.
 	 */
 	KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
 	    ("lookup: invalid path state."));
 	if (relookup) {
 		/* Rewind to the state saved at dirloop and parse again. */
 		relookup = 0;
 		ndp->ni_pathlen = prev_ni_pathlen;
 		ndp->ni_next = prev_ni_next;
 		if (ndp->ni_dvp != dp)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 		goto dirloop;
 	}
 	if (cnp->cn_flags & ISDOTDOT) {
 		error = nameicap_check_dotdot(ndp, ndp->ni_vp);
 		if (error != 0) {
 #ifdef KTRACE
 			if (KTRPOINT(curthread, KTR_CAPFAIL))
 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
 #endif
 			goto bad2;
 		}
 	}
 	if (*ndp->ni_next == '/') {
 		/* More components follow: skip the slashes and descend into dp. */
 		cnp->cn_nameptr = ndp->ni_next;
 		while (*cnp->cn_nameptr == '/') {
 			cnp->cn_nameptr++;
 			ndp->ni_pathlen--;
 		}
 		if (ndp->ni_dvp != dp)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 		goto dirloop;
 	}
 	/*
 	 * If we're processing a path with a trailing slash,
 	 * check that the end result is a directory.
 	 */
 	if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) {
 		error = ENOTDIR;
 		goto bad2;
 	}
 	/*
 	 * Disallow directory write attempts on read-only filesystems.
 	 */
 	if (rdonly &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		error = EROFS;
 		goto bad2;
 	}
 	if (cnp->cn_flags & SAVESTART) {
 		ndp->ni_startdir = ndp->ni_dvp;
 		VREF(ndp->ni_startdir);
 	}
 	if (!wantparent) {
 		/* 2: ni_dvp is fully released here, so bad2 must leave it alone. */
 		ni_dvp_unlocked = 2;
 		if (ndp->ni_dvp != dp)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) {
 		/* 1: ni_dvp stays referenced but unlocked; bad2 will vrele() it. */
 		VOP_UNLOCK(ndp->ni_dvp);
 		ni_dvp_unlocked = 1;
 	}
 
 	if (cnp->cn_flags & AUDITVNODE1)
 		AUDIT_ARG_VNODE1(dp);
 	else if (cnp->cn_flags & AUDITVNODE2)
 		AUDIT_ARG_VNODE2(dp);
 
 	if ((cnp->cn_flags & LOCKLEAF) == 0)
 		VOP_UNLOCK(dp);
 success:
 	/*
 	 * Because of shared lookup we may have the vnode shared locked, but
 	 * the caller may want it to be exclusively locked.
 	 */
 	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) &&
 	    VOP_ISLOCKED(dp) != LK_EXCLUSIVE) {
 		vn_lock(dp, LK_UPGRADE | LK_RETRY);
 		if (VN_IS_DOOMED(dp)) {
 			error = ENOENT;
 			goto bad2;
 		}
 	}
 	return (0);
 
 	/*
 	 * Error exit used once ni_dvp is set up: unless it was already
 	 * released (ni_dvp_unlocked == 2), vput() it when it is a distinct,
 	 * still-locked vnode and vrele() it otherwise, then fall into bad.
 	 */
 bad2:
 	if (ni_dvp_unlocked != 2) {
 		if (dp != ndp->ni_dvp && !ni_dvp_unlocked)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 	}
 	/* Common error exit: release dp unless already done, report no leaf. */
 bad:
 	if (!dpunlocked)
 		vput(dp);
 	ndp->ni_vp = NULL;
 	return (error);
 }
 
 /*
  * relookup - lookup a path name component
  *    Used by lookup to re-acquire things.
  */
 /*
  * Look up the single, final component described by cnp in directory dvp
  * and store the result in *vpp.  The caller must have ISLASTCN and one of
  * LOCKPARENT/WANTPARENT set in cnp->cn_flags (both are asserted below).
  * Returns 0 on success, with *vpp NULL if VOP_LOOKUP() answered
  * EJUSTRETURN, or an errno value with *vpp set to NULL.
  */
 int
 relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
 {
 	struct vnode *dp = NULL;		/* the directory we are searching */
 	int wantparent;			/* 1 => wantparent or lockparent flag */
 	int rdonly;			/* lookup read-only flag bit */
 	int error = 0;
 
 	KASSERT(cnp->cn_flags & ISLASTCN,
 	    ("relookup: Not given last component."));
 	/*
 	 * Setup: break out flag bits into variables.
 	 */
 	wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
 	KASSERT(wantparent, ("relookup: parent not wanted."));
 	rdonly = cnp->cn_flags & RDONLY;
 	cnp->cn_flags &= ~ISSYMLINK;
 	dp = dvp;
 	/* Unlike lookup(), the directory is always locked exclusively here. */
 	cnp->cn_lkflags = LK_EXCLUSIVE;
 	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);
 
 	/*
 	 * Search a new directory.
 	 *
 	 * The last component of the filename is left accessible via
 	 * cnp->cn_nameptr for callers that need the name. Callers needing
 	 * the name set the SAVENAME flag. When done, they assume
 	 * responsibility for freeing the pathname buffer.
 	 */
 #ifdef NAMEI_DIAGNOSTIC
 	printf("{%s}: ", cnp->cn_nameptr);
 #endif
 
 	/*
 	 * Check for "" which represents the root directory after slash
 	 * removal.
 	 */
 	if (cnp->cn_nameptr[0] == '\0') {
 		/*
 		 * Support only LOOKUP for "/" because lookup()
 		 * can't succeed for CREATE, DELETE and RENAME.
 		 */
 		KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP"));
 		KASSERT(dp->v_type == VDIR, ("dp is not a directory"));
 
 		if (!(cnp->cn_flags & LOCKLEAF))
 			VOP_UNLOCK(dp);
 		*vpp = dp;
 		/* XXX This should probably move to the top of function. */
 		if (cnp->cn_flags & SAVESTART)
 			panic("lookup: SAVESTART");
 		return (0);
 	}
 
 	if (cnp->cn_flags & ISDOTDOT)
 		panic ("relookup: lookup on dot-dot");
 
 	/*
 	 * We now have a segment name to search for, and a directory to search.
 	 */
 #ifdef NAMEI_DIAGNOSTIC
 	vn_printf(dp, "search in ");
 #endif
 	if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
 		KASSERT(*vpp == NULL, ("leaf should be empty"));
 		if (error != EJUSTRETURN)
 			goto bad;
 		/*
 		 * If creating and at end of pathname, then can consider
 		 * allowing file to be created.
 		 */
 		if (rdonly) {
 			error = EROFS;
 			goto bad;
 		}
 		/* ASSERT(dvp == ndp->ni_startdir) */
 		if (cnp->cn_flags & SAVESTART)
 			VREF(dvp);
 		if ((cnp->cn_flags & LOCKPARENT) == 0)
 			VOP_UNLOCK(dp);
 		/*
 		 * We return with *vpp NULL to indicate that the entry
 		 * doesn't currently exist; dvp is left referenced and,
 		 * if LOCKPARENT is set, still locked.
 		 */
 		return (0);
 	}
 
 	/* From here on dp is the vnode that was found, not the directory. */
 	dp = *vpp;
 
 	/*
 	 * Disallow directory write attempts on read-only filesystems.
 	 */
 	if (rdonly &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		/* Drop the parent here; bad only releases dp (the leaf). */
 		if (dvp == dp)
 			vrele(dvp);
 		else
 			vput(dvp);
 		error = EROFS;
 		goto bad;
 	}
 	/*
 	 * Set the parent lock/ref state to the requested state.
 	 */
 	if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) {
 		if (wantparent)
 			VOP_UNLOCK(dvp);
 		else
 			vput(dvp);
 	} else if (!wantparent)
 		vrele(dvp);
 	/*
 	 * Check for symbolic link
 	 */
 	KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
 	    ("relookup: symlink found.\n"));
 
 	/* ASSERT(dvp == ndp->ni_startdir) */
 	if (cnp->cn_flags & SAVESTART)
 		VREF(dvp);
 
 	if ((cnp->cn_flags & LOCKLEAF) == 0)
 		VOP_UNLOCK(dp);
 	return (0);
 	/*
 	 * Error exit.  When reached from a VOP_LOOKUP() failure dp is still
 	 * dvp, so the vput() unlocks the directory and drops a reference to
 	 * it; when reached after a successful lookup dp is the leaf.
 	 */
 bad:
 	vput(dp);
 	*vpp = NULL;
 	return (error);
 }
 
 /*
  * Free data allocated by namei(); see namei(9) for details.
  */
 void
 NDFREE_PNBUF(struct nameidata *ndp)
 {
 
 	/* Without HASBUF there is no pathname buffer to hand back. */
 	if ((ndp->ni_cnd.cn_flags & HASBUF) == 0)
 		return;
 
 	/* Return the buffer to namei_zone and record that it is gone. */
 	uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
 	ndp->ni_cnd.cn_flags &= ~HASBUF;
 }
 
 /*
  * Release the resources recorded in ndp, except for those the caller
  * opted out of through the NDF_NO_* bits in flags: the pathname buffer,
  * the lock and reference on ni_vp, the lock and reference on ni_dvp and
  * the reference on ni_startdir, handled in that order.
  */
 void
 (NDFREE)(struct nameidata *ndp, const u_int flags)
 {
 	int leaf_locked;
 	int parent_locked;
 
 	if ((flags & NDF_NO_FREE_PNBUF) == 0)
 		NDFREE_PNBUF(ndp);
 
 	/*
 	 * Work out which vnodes we are expected to unlock.  Both answers
 	 * are computed up front because the parent test compares against
 	 * ni_vp, which may be cleared below.
 	 */
 	leaf_locked = (flags & NDF_NO_VP_UNLOCK) == 0 &&
 	    (ndp->ni_cnd.cn_flags & LOCKLEAF) != 0 && ndp->ni_vp != NULL;
 	parent_locked = (flags & NDF_NO_DVP_UNLOCK) == 0 &&
 	    (ndp->ni_cnd.cn_flags & LOCKPARENT) != 0 &&
 	    ndp->ni_dvp != ndp->ni_vp;
 
 	/* Leaf: unlock and/or release as requested. */
 	if (leaf_locked) {
 		if ((flags & NDF_NO_VP_RELE) == 0) {
 			vput(ndp->ni_vp);
 			ndp->ni_vp = NULL;
 		} else
 			VOP_UNLOCK(ndp->ni_vp);
 	} else if ((flags & NDF_NO_VP_RELE) == 0 && ndp->ni_vp != NULL) {
 		vrele(ndp->ni_vp);
 		ndp->ni_vp = NULL;
 	}
 
 	/* Parent: same treatment, but only lookups that kept one hold it. */
 	if (parent_locked) {
 		if ((flags & NDF_NO_DVP_RELE) == 0) {
 			vput(ndp->ni_dvp);
 			ndp->ni_dvp = NULL;
 		} else
 			VOP_UNLOCK(ndp->ni_dvp);
 	} else if ((flags & NDF_NO_DVP_RELE) == 0 &&
 	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
 		vrele(ndp->ni_dvp);
 		ndp->ni_dvp = NULL;
 	}
 
 	/* Start directory reference saved on behalf of SAVESTART. */
 	if ((flags & NDF_NO_STARTDIR_RELE) == 0 &&
 	    (ndp->ni_cnd.cn_flags & SAVESTART) != 0) {
 		vrele(ndp->ni_startdir);
 		ndp->ni_startdir = NULL;
 	}
 }
 
 /*
  * Determine if there is a suitable alternate filename under the specified
  * prefix for the specified path.  If the create flag is set, then the
  * alternate prefix will be used so long as the parent directory exists.
  * This is used by the various compatibility ABIs so that Linux binaries prefer
  * files under /compat/linux for example.  The chosen path (whether under
  * the prefix or under /) is returned in a kernel malloc'd buffer pointed
  * to by pathbuf.  The caller is responsible for free'ing the buffer from
  * the M_TEMP bucket if one is returned.
  */
 int
 kern_alternate_path(struct thread *td, const char *prefix, const char *path,
     enum uio_seg pathseg, char **pathbuf, int create, int dirfd)
 {
 	struct nameidata nd, ndroot;
 	char *ptr, *buf, *cp;
 	size_t len, sz;
 	int error;
 
 	buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	*pathbuf = buf;
 
 	/* Copy the prefix into the new pathname as a starting point. */
 	len = strlcpy(buf, prefix, MAXPATHLEN);
 	if (len >= MAXPATHLEN) {
 		*pathbuf = NULL;
 		free(buf, M_TEMP);
 		return (EINVAL);
 	}
 	sz = MAXPATHLEN - len;
 	ptr = buf + len;
 
 	/* Append the filename to the prefix. */
 	if (pathseg == UIO_SYSSPACE)
 		error = copystr(path, ptr, sz, &len);
 	else
 		error = copyinstr(path, ptr, sz, &len);
 
 	if (error) {
 		*pathbuf = NULL;
 		free(buf, M_TEMP);
 		return (error);
 	}
 
 	/* Only use a prefix with absolute pathnames. */
 	if (*ptr != '/') {
 		error = EINVAL;
 		goto keeporig;
 	}
 
 	if (dirfd != AT_FDCWD) {
 		/*
 		 * We want the original because the "prefix" is
 		 * included in the already opened dirfd.
 		 */
 		bcopy(ptr, buf, len);
 		return (0);
 	}
 
 	/*
 	 * We know that there is a / somewhere in this pathname.
 	 * Search backwards for it, to find the file's parent dir
 	 * to see if it exists in the alternate tree. If it does,
 	 * and we want to create a file (cflag is set). We don't
 	 * need to worry about the root comparison in this case.
 	 */
 
 	if (create) {
 		for (cp = &ptr[len] - 1; *cp != '/'; cp--);
 		*cp = '\0';
 
 		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf, td);
 		error = namei(&nd);
 		*cp = '/';
 		if (error != 0)
 			goto keeporig;
 	} else {
 		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf, td);
 
 		error = namei(&nd);
 		if (error != 0)
 			goto keeporig;
 
 		/*
 		 * We now compare the vnode of the prefix to the one
 		 * vnode asked. If they resolve to be the same, then we
 		 * ignore the match so that the real root gets used.
 		 * This avoids the problem of traversing "../.." to find the
 		 * root directory and never finding it, because "/" resolves
 		 * to the emulation root directory. This is expensive :-(
 		 */
 		NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix,
 		    td);
 
 		/* We shouldn't ever get an error from this namei(). */
 		error = namei(&ndroot);
 		if (error == 0) {
 			if (nd.ni_vp == ndroot.ni_vp)
 				error = ENOENT;
 
 			NDFREE(&ndroot, NDF_ONLY_PNBUF);
 			vrele(ndroot.ni_vp);
 		}
 	}
 
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vrele(nd.ni_vp);
 
 keeporig:
 	/* If there was an error, use the original path name. */
 	if (error)
 		bcopy(ptr, buf, len);
 	return (error);
 }
Index: projects/clang1100-import/sys/kern/vfs_subr.c
===================================================================
--- projects/clang1100-import/sys/kern/vfs_subr.c	(revision 364278)
+++ projects/clang1100-import/sys/kern/vfs_subr.c	(revision 364279)
@@ -1,6681 +1,6679 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
  */
 
 /*
  * External virtual filesystem routines
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_watchdog.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/conf.h>
 #include <sys/counter.h>
 #include <sys/dirent.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
 #include <sys/extattr.h>
 #include <sys/file.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/lockf.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/pctrie.h>
 #include <sys/priv.h>
 #include <sys/reboot.h>
 #include <sys/refcount.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
 #include <sys/smr.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/watchdog.h>
 
 #include <machine/stdarg.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_kern.h>
 #include <vm/uma.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 static void	delmntque(struct vnode *vp);
 static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
 		    int slpflag, int slptimeo);
 static void	syncer_shutdown(void *arg, int howto);
 static int	vtryrecycle(struct vnode *vp);
 static void	v_init_counters(struct vnode *);
 static void	vgonel(struct vnode *);
 static void	vfs_knllock(void *arg);
 static void	vfs_knlunlock(void *arg);
 static void	vfs_knl_assert_locked(void *arg);
 static void	vfs_knl_assert_unlocked(void *arg);
 static void	destroy_vpollinfo(struct vpollinfo *vi);
 static int	v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
 		    daddr_t startlbn, daddr_t endlbn);
 static void	vnlru_recalc(void);
 
 /*
  * These fences are intended for cases where some synchronization is
  * needed between access of v_iflags and lockless vnode refcount (v_holdcnt
  * and v_usecount) updates.  Access to v_iflags is generally synchronized
  * by the interlock, but we have some internal assertions that check vnode
  * flags without acquiring the lock.  Thus, these fences are INVARIANTS-only
  * for now.
  */
 #ifdef INVARIANTS
 #define	VNODE_REFCOUNT_FENCE_ACQ()	atomic_thread_fence_acq()
 #define	VNODE_REFCOUNT_FENCE_REL()	atomic_thread_fence_rel()
 #else
 #define	VNODE_REFCOUNT_FENCE_ACQ()
 #define	VNODE_REFCOUNT_FENCE_REL()
 #endif
 
 /*
  * Number of vnodes in existence.  Increased whenever getnewvnode()
  * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
  */
 static u_long __exclusive_cache_line numvnodes;
 
 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
     "Number of vnodes in existence");
 
 static counter_u64_t vnodes_created;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
     "Number of vnodes created by getnewvnode");
 
 /*
  * Conversion tables for conversion from vnode types to inode formats
  * and back.
  */
 enum vtype iftovt_tab[16] = {
 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
 };
 int vttoif_tab[10] = {
 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
 };
 
 /*
  * List of allocated vnodes in the system.
  */
 static TAILQ_HEAD(freelst, vnode) vnode_list;
 static struct vnode *vnode_list_free_marker;
 static struct vnode *vnode_list_reclaim_marker;
 
 /*
  * "Free" vnode target.  Free vnodes are rarely completely free, but are
  * just ones that are cheap to recycle.  Usually they are for files which
  * have been stat'd but not read; these usually have inode and namecache
  * data attached to them.  This target is the preferred minimum size of a
  * sub-cache consisting mostly of such files. The system balances the size
  * of this sub-cache with its complement to try to prevent either from
  * thrashing while the other is relatively inactive.  The targets express
  * a preference for the best balance.
  *
  * "Above" this target there are 2 further targets (watermarks) related
  * to recycling of free vnodes.  In the best-operating case, the cache is
  * exactly full, the free list has size between vlowat and vhiwat above the
  * free target, and recycling from it and normal use maintains this state.
  * Sometimes the free list is below vlowat or even empty, but this state
  * is even better for immediate use provided the cache is not full.
  * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
  * ones) to reach one of these states.  The watermarks are currently hard-
  * coded as 4% and 9% of the available space higher.  These and the default
  * of 25% for wantfreevnodes are too large if the memory size is large.
  * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
  * whenever vnlru_proc() becomes active.
  */
 static long wantfreevnodes;
 static long __exclusive_cache_line freevnodes;
 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
     &freevnodes, 0, "Number of \"free\" vnodes");
 static long freevnodes_old;
 
 static counter_u64_t recycles_count;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
     "Number of vnodes recycled to meet vnode cache targets");
 
 static counter_u64_t recycles_free_count;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count,
     "Number of free vnodes recycled to meet vnode cache targets");
 
 static counter_u64_t deferred_inact;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact,
     "Number of times inactive processing was deferred");
 
 /* To keep more than one thread at a time from running vfs_getnewfsid */
 static struct mtx mntid_mtx;
 
 /*
  * Lock for any access to the following:
  *	vnode_list
  *	numvnodes
  *	freevnodes
  */
 static struct mtx __exclusive_cache_line vnode_list_mtx;
 
 /* Publicly exported FS */
 struct nfs_public nfs_pub;
 
 static uma_zone_t buf_trie_zone;
 static smr_t buf_trie_smr;
 
 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
 static uma_zone_t vnode_zone;
 static uma_zone_t vnodepoll_zone;
 
 __read_frequently smr_t vfs_smr;
 
 /*
  * The workitem queue.
  *
  * It is useful to delay writes of file data and filesystem metadata
  * for tens of seconds so that quickly created and deleted files need
  * not waste disk bandwidth being created and removed. To realize this,
  * we append vnodes to a "workitem" queue. When running with a soft
  * updates implementation, most pending metadata dependencies should
  * not wait for more than a few seconds. Thus, mounted on block devices
  * are delayed only about a half the time that file data is delayed.
  * Similarly, directory updates are more critical, so are only delayed
  * about a third the time that file data is delayed. Thus, there are
  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
  * one each second (driven off the filesystem syncer process). The
  * syncer_delayno variable indicates the next queue that is to be processed.
  * Items that need to be processed soon are placed in this queue:
  *
  *	syncer_workitem_pending[syncer_delayno]
  *
  * A delay of fifteen seconds is done by placing the request fifteen
  * entries later in the queue:
  *
  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
  *
  */
 static int syncer_delayno;
 static long syncer_mask;
 LIST_HEAD(synclist, bufobj);
 static struct synclist *syncer_workitem_pending;
 /*
  * The sync_mtx protects:
  *	bo->bo_synclist
  *	sync_vnode_count
  *	syncer_delayno
  *	syncer_state
  *	syncer_workitem_pending
  *	syncer_worklist_len
  *	rushjob
  */
 static struct mtx sync_mtx;
 static struct cv sync_wakeup;
 
 #define SYNCER_MAXDELAY		32
 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
 static int syncdelay = 30;		/* max time to delay syncing data */
 static int filedelay = 30;		/* time to delay syncing files */
 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
     "Time to delay syncing files (in seconds)");
 static int dirdelay = 29;		/* time to delay syncing directories */
 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
     "Time to delay syncing directories (in seconds)");
 static int metadelay = 28;		/* time to delay syncing metadata */
 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
     "Time to delay syncing metadata (in seconds)");
 static int rushjob;		/* number of slots to run ASAP */
 static int stat_rush_requests;	/* number of times I/O speeded up */
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
     "Number of times I/O speeded up (rush requests)");
 
 #define	VDBATCH_SIZE 8
 /*
  * Fixed-size batch of vnode pointers; one instance named "vd" is defined
  * through DPCPU_DEFINE_STATIC() below.
  * NOTE(review): the users of this structure are outside this excerpt, so
  * the field notes marked "presumably" are assumptions to be confirmed.
  */
 struct vdbatch {
 	u_int index;		/* presumably the number of used tab[] slots */
 	long freevnodes;	/* presumably a local free-vnode count -- confirm */
 	struct mtx lock;	/* presumably guards the fields of this batch */
 	struct vnode *tab[VDBATCH_SIZE];	/* up to VDBATCH_SIZE vnodes */
 };
 DPCPU_DEFINE_STATIC(struct vdbatch, vd);
 
 static void	vdbatch_dequeue(struct vnode *vp);
 
 /*
  * When shutting down the syncer, run it at four times normal speed.
  */
 #define SYNCER_SHUTDOWN_SPEEDUP		4
 static int sync_vnode_count;
 static int syncer_worklist_len;
 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
     syncer_state;
 
 /*
  * Target for maximum number of vnodes.  Computed at boot in vntblinit() and
  * adjustable at run time through the kern.maxvnodes sysctl handler.
  */
 u_long desiredvnodes;
 /*
  * gapvnodes, vhiwat and vlowat are derived from desiredvnodes and
  * wantfreevnodes by vnlru_recalc(), which requires vnode_list_mtx.
  */
 static u_long gapvnodes;		/* gap between wanted and desired */
 static u_long vhiwat;		/* enough extras after expansion */
 static u_long vlowat;		/* minimal extras before expansion */
 /* Consumed (and reset to 0) by vnlru_proc() to force a reclaim pass. */
 static u_long vstir;		/* nonzero to stir non-free vnodes */
 /* Used by vnlru_proc() as the page trigger while its force level is < 2. */
 static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */
 
 static u_long vnlru_read_freevnodes(void);
 
 /*
  * Handler for kern.maxvnodes.
  *
  * Reports the current desiredvnodes and, when a new value is supplied that
  * differs from it, installs the new target, resets wantfreevnodes to a
  * quarter of it, recomputes the vnlru watermarks and resizes the vfs hash
  * and the name cache.  The supplied value is used as is; no sanity
  * checking is performed.
  */
 static int
 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS)
 {
 	u_long newmax = desiredvnodes;
 	int rv;
 
 	rv = sysctl_handle_long(oidp, &newmax, 0, req);
 	if (rv != 0)
 		return (rv);
 	/* Pure read, or a write that does not change anything. */
 	if (req->newptr == NULL || newmax == desiredvnodes)
 		return (0);
 
 	mtx_lock(&vnode_list_mtx);
 	desiredvnodes = newmax;
 	wantfreevnodes = desiredvnodes / 4;
 	vnlru_recalc();
 	mtx_unlock(&vnode_list_mtx);
 
 	/*
 	 * XXX Concurrent writers are not serialized against each other; the
 	 * lock taken above only protects vnlru and getnewvnode.
 	 */
 	vfs_hash_changesize(desiredvnodes);
 	cache_changesize(desiredvnodes);
 	return (0);
 }
 
 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
     CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
     "LU", "Target for maximum number of vnodes");
 
 /*
  * Handler for vfs.wantfreevnodes.
  *
  * Reports the current wantfreevnodes and, when a different new value is
  * supplied, stores it and recomputes the vnlru watermarks under
  * vnode_list_mtx.
  */
 static int
 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS)
 {
 	u_long newwant = wantfreevnodes;
 	int rv;
 
 	rv = sysctl_handle_long(oidp, &newwant, 0, req);
 	if (rv != 0)
 		return (rv);
 	/* Pure read, or a write that does not change anything. */
 	if (req->newptr == NULL || newwant == wantfreevnodes)
 		return (0);
 
 	mtx_lock(&vnode_list_mtx);
 	wantfreevnodes = newwant;
 	vnlru_recalc();
 	mtx_unlock(&vnode_list_mtx);
 	return (0);
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes,
     CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
     "LU", "Target for minimum number of \"free\" vnodes");
 
 /*
  * Legacy alias for vfs.wantfreevnodes.  Unlike the vfs.wantfreevnodes
  * handler, this plain SYSCTL_ULONG writes the variable directly and does
  * not call vnlru_recalc().
  */
 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
     &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
 /* Incremented by vnlru_proc() each time every escalation level reclaimed nothing. */
 static int vnlru_nowhere;
 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
 
 static int
 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
 {
 	struct vnode *vp;
 	struct nameidata nd;
 	char *buf;
 	unsigned long ndflags;
 	int error;
 
 	if (req->newptr == NULL)
 		return (EINVAL);
 	if (req->newlen >= PATH_MAX)
 		return (E2BIG);
 
 	buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
 	error = SYSCTL_IN(req, buf, req->newlen);
 	if (error != 0)
 		goto out;
 
 	buf[req->newlen] = '\0';
 
 	ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | SAVENAME;
 	NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread);
 	if ((error = namei(&nd)) != 0)
 		goto out;
 	vp = nd.ni_vp;
 
 	if (VN_IS_DOOMED(vp)) {
 		/*
 		 * This vnode is being recycled.  Return != 0 to let the caller
 		 * know that the sysctl had no effect.  Return EAGAIN because a
 		 * subsequent call will likely succeed (since namei will create
 		 * a new vnode if necessary)
 		 */
 		error = EAGAIN;
 		goto putvnode;
 	}
 
 	counter_u64_add(recycles_count, 1);
 	vgone(vp);
 putvnode:
 	NDFREE(&nd, 0);
 out:
 	free(buf, M_TEMP);
 	return (error);
 }
 
 static int
 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
 {
 	struct thread *td = curthread;
 	struct vnode *vp;
 	struct file *fp;
 	int error;
 	int fd;
 
 	if (req->newptr == NULL)
 		return (EBADF);
 
         error = sysctl_handle_int(oidp, &fd, 0, req);
         if (error != 0)
                 return (error);
 	error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
 	if (error != 0)
 		return (error);
 	vp = fp->f_vnode;
 
 	error = vn_lock(vp, LK_EXCLUSIVE);
 	if (error != 0)
 		goto drop;
 
 	counter_u64_add(recycles_count, 1);
 	vgone(vp);
 	VOP_UNLOCK(vp);
 drop:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Write-only debugging knobs that force a vnode through vgone(), selected
  * either by pathname (string) or by file descriptor number (int).
  */
 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
     CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
     sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
     sysctl_ftry_reclaim_vnode, "I",
     "Try to reclaim a vnode by its file descriptor");
 
 /*
  * Shift count for (uintptr_t)vp to initialize vp->v_hash.
  * Set in vntblinit() to floor(log2(sizeof(struct vnode))).
  */
 static int vnsz2log;
 
 /*
  * Support for the bufobj clean & dirty pctrie.
  */
 /*
  * pctrie node allocator for the bufobj tries.  Allocates from buf_trie_zone
  * without sleeping; the tree argument is not used.
  */
 static void *
 buf_trie_alloc(struct pctrie *ptree)
 {
 	void *node;
 
 	node = uma_zalloc_smr(buf_trie_zone, M_NOWAIT);
 	return (node);
 }
 
 /*
  * pctrie node release hook for the bufobj tries.  Returns the node to
  * buf_trie_zone through the SMR-aware free routine; the tree argument is
  * not used.
  */
 static void
 buf_trie_free(struct pctrie *ptree, void *node)
 {
 	uma_zfree_smr(buf_trie_zone, node);
 }
 /*
  * Instantiate the BUF pctrie, keyed on struct buf's b_lblkno, using the
  * allocator pair above and buf_trie_smr.
  */
 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free,
     buf_trie_smr);
 
 /*
  * Initialize the vnode management data structures.
  *
  * Reevaluate the following cap on the number of vnodes after the physical
  * memory size exceeds 512GB.  In the limit, as the physical memory size
  * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
  */
 /* Upper bound applied to desiredvnodes in vntblinit(); may be predefined. */
 #ifndef	MAXVNODES_MAX
 #define	MAXVNODES_MAX	(512UL * 1024 * 1024 / 64)	/* 8M */
 #endif
 
 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
 
 /*
  * Allocate a zero-filled marker vnode (v_type == VMARKER) associated with
  * the given mount point.  The allocation may sleep.
  */
 static struct vnode *
 vn_alloc_marker(struct mount *mp)
 {
 	struct vnode *marker;
 
 	marker = malloc(sizeof(*marker), M_VNODE_MARKER, M_WAITOK | M_ZERO);
 	marker->v_mount = mp;
 	marker->v_type = VMARKER;
 	return (marker);
 }
 
 /*
  * Release a marker vnode obtained from vn_alloc_marker().
  */
 static void
 vn_free_marker(struct vnode *vp)
 {
 
 	/* Only markers live in M_VNODE_MARKER. */
 	MPASS(vp->v_type == VMARKER);
 	free(vp, M_VNODE_MARKER);
 }
 
 /*
  * UMA init hook for vnode_zone: prepare a vnode the first time its memory
  * enters the zone.  Always returns 0.
  */
 static int
 vnode_init(void *mem, int size, int flags)
 {
 	struct vnode *vp = mem;
 
 	bzero(vp, size);
 
 	/* Locks: the vnode lock pointer defaults to the embedded lock. */
 	vp->v_vnlock = &vp->v_lock;
 	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
 	/* Shared locking stays disabled until a filesystem opts in. */
 	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
 	    LK_NOSHARE | LK_IS_VNODE);
 
 	/* Embedded subsystems: buffer object, name cache, range locks. */
 	bufobj_init(&vp->v_bufobj, vp);
 	cache_vnode_init(vp);
 	rangelock_init(&vp->v_rl);
 
 	/* Not queued on any per-CPU batch yet. */
 	vp->v_dbatchcpu = NOCPU;
 
 	/* Link the new vnode into the global list ahead of the free marker. */
 	mtx_lock(&vnode_list_mtx);
 	TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
 	mtx_unlock(&vnode_list_mtx);
 	return (0);
 }
 
 /*
  * UMA fini hook for vnode_zone: undo vnode_init() when the vnode's memory
  * leaves the zone.
  */
 static void
 vnode_fini(void *mem, int size)
 {
 	struct vnode *vp = mem;
 
 	/* Detach from the per-CPU batch and from the global vnode list. */
 	vdbatch_dequeue(vp);
 	mtx_lock(&vnode_list_mtx);
 	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
 	mtx_unlock(&vnode_list_mtx);
 
 	/* Tear down the locks set up by vnode_init(). */
 	rangelock_destroy(&vp->v_rl);
 	lockdestroy(vp->v_vnlock);
 	mtx_destroy(&vp->v_interlock);
 	rw_destroy(BO_LOCKPTR(&vp->v_bufobj));
 }
 
 /*
  * Provide the size of NFS nclnode and NFS fh for calculation of the
  * vnode memory consumption.  The size is specified directly to
  * eliminate dependency on NFS-private header.
  *
  * Other filesystems may use bigger or smaller (like UFS and ZFS)
  * private inode data, but the NFS-based estimation is ample enough.
  * Still, we care about differences in the size between 64- and 32-bit
  * platforms.
  *
  * Namecache structure size is heuristically
  * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
  */
 /* Per-vnode size estimates used only by the virtvnodes computation below. */
 #ifdef _LP64
 #define	NFS_NCLNODE_SZ	(528 + 64)
 #define	NC_SZ		148
 #else
 #define	NFS_NCLNODE_SZ	(360 + 32)
 #define	NC_SZ		92
 #endif
 
 /*
  * Boot-time initialization of the vnode subsystem, run via SYSINIT at
  * SI_SUB_VFS/SI_ORDER_FIRST.  Sizes desiredvnodes and wantfreevnodes, sets
  * up the global vnode list with its two markers, creates the UMA zones and
  * statistics counters, initializes the syncer work-item table and locks,
  * computes vnsz2log and prepares every CPU's vdbatch.
  */
 static void
 vntblinit(void *dummy __unused)
 {
 	struct vdbatch *vd;
 	int cpu, physvnodes, virtvnodes;
 	u_int i;
 
 	/*
 	 * Desiredvnodes is a function of the physical memory size and the
 	 * kernel's heap size.  Generally speaking, it scales with the
 	 * physical memory size.  The ratio of desiredvnodes to the physical
 	 * memory size is 1:16 until desiredvnodes exceeds 98,304.
 	 * Thereafter, the
 	 * marginal ratio of desiredvnodes to the physical memory size is
 	 * 1:64.  However, desiredvnodes is limited by the kernel's heap
 	 * size.  The memory required by desiredvnodes vnodes and vm objects
 	 * must not exceed 1/10th of the kernel's heap size.
 	 */
 	physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
 	    3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
 	virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
 	    sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
 	desiredvnodes = min(physvnodes, virtvnodes);
 	/* Clamp to the compile-time ceiling. */
 	if (desiredvnodes > MAXVNODES_MAX) {
 		if (bootverbose)
 			printf("Reducing kern.maxvnodes %lu -> %lu\n",
 			    desiredvnodes, MAXVNODES_MAX);
 		desiredvnodes = MAXVNODES_MAX;
 	}
 	wantfreevnodes = desiredvnodes / 4;
 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
 	TAILQ_INIT(&vnode_list);
 	mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF);
 	/*
 	 * The lock is taken to appease WITNESS.
 	 */
 	mtx_lock(&vnode_list_mtx);
 	vnlru_recalc();
 	mtx_unlock(&vnode_list_mtx);
 	/*
 	 * Both markers are inserted at the head, so the reclaim marker ends
 	 * up in front of the free marker.  They must exist before vnode_zone
 	 * is created because vnode_init() links new vnodes relative to the
 	 * free marker.
 	 */
 	vnode_list_free_marker = vn_alloc_marker(NULL);
 	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist);
 	vnode_list_reclaim_marker = vn_alloc_marker(NULL);
 	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
 	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
 	uma_zone_set_smr(vnode_zone, vfs_smr);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/*
 	 * Preallocate enough nodes to support one-per buf so that
 	 * we can not fail an insert.  reassignbuf() callers can not
 	 * tolerate the insertion failure.
 	 */
 	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
 	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 
 	    UMA_ZONE_NOFREE | UMA_ZONE_SMR);
 	buf_trie_smr = uma_zone_get_smr(buf_trie_zone);
 	uma_prealloc(buf_trie_zone, nbuf);
 
 	/* Statistics counters. */
 	vnodes_created = counter_u64_alloc(M_WAITOK);
 	recycles_count = counter_u64_alloc(M_WAITOK);
 	recycles_free_count = counter_u64_alloc(M_WAITOK);
 	deferred_inact = counter_u64_alloc(M_WAITOK);
 
 	/*
 	 * Initialize the filesystem syncer.
 	 */
 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
 	    &syncer_mask);
 	/* Adopt the table size hashinit() actually chose. */
 	syncer_maxdelay = syncer_mask + 1;
 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
 	cv_init(&sync_wakeup, "syncer");
 	/* vnsz2log = floor(log2(sizeof(struct vnode))). */
 	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
 		vnsz2log++;
 	vnsz2log--;
 
 	/* Start every CPU with an empty, unlocked vnode batch. */
 	CPU_FOREACH(cpu) {
 		vd = DPCPU_ID_PTR((cpu), vd);
 		bzero(vd, sizeof(*vd));
 		mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
 	}
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
 
 /*
  * Mark a mount point as busy. Used to synchronize access and to delay
  * unmounting. Eventually, mountlist_mtx is not released on failure.
  *
  * vfs_busy() is a custom lock, it can block the caller.
  * vfs_busy() only sleeps if the unmount is active on the mount point.
  * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
  * vnode belonging to mp.
  *
  * Lookup uses vfs_busy() to traverse mount points.
  * root fs			var fs
  * / vnode lock		A	/ vnode lock (/var)		D
  * /var vnode lock	B	/log vnode lock(/var/log)	E
  * vfs_busy lock	C	vfs_busy lock			F
  *
  * Within each file system, the lock order is C->A->B and F->D->E.
  *
  * When traversing across mounts, the system follows that lock order:
  *
  *        C->A->B
  *              |
  *              +->F->D->E
  *
  * The lookup() process for namei("/var") illustrates the process:
  *  VOP_LOOKUP() obtains B while A is held
  *  vfs_busy() obtains a shared lock on F while A and B are held
  *  vput() releases lock on B
  *  vput() releases lock on A
  *  VFS_ROOT() obtains lock on D while shared lock on F is held
  *  vfs_unbusy() releases shared lock on F
  *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
  *    Attempt to lock A (instead of vp_crossmp) while D is held would
  *    violate the global order, causing deadlocks.
  *
  * dounmount() locks B while F is drained.
  */
 /*
  * Returns 0 with the mount point busied (one reference and one lockref
  * taken), or ENOENT when an unmount is in progress and either MBF_NOWAIT
  * was passed or the unmount has reached MNTK_REFEXPIRE.
  *
  * With MBF_MNTLSTLOCK the caller holds mountlist_mtx on entry; it is
  * released on success and still held when ENOENT is returned.
  */
 int
 vfs_busy(struct mount *mp, int flags)
 {
 
 	MPASS((flags & ~MBF_MASK) == 0);
 	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
 
 	/* Fast path: bump the per-CPU counters without the mount interlock. */
 	if (vfs_op_thread_enter(mp)) {
 		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
 		MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
 		MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
 		vfs_mp_count_add_pcpu(mp, ref, 1);
 		vfs_mp_count_add_pcpu(mp, lockref, 1);
 		vfs_op_thread_exit(mp);
 		if (flags & MBF_MNTLSTLOCK)
 			mtx_unlock(&mountlist_mtx);
 		return (0);
 	}
 
 	/* Slow path: serialize on the mount interlock. */
 	MNT_ILOCK(mp);
 	vfs_assert_mount_counters(mp);
 	MNT_REF(mp);
 	/*
 	 * If mount point is currently being unmounted, sleep until the
 	 * mount point fate is decided.  If thread doing the unmounting fails,
 	 * it will clear MNTK_UNMOUNT flag before waking us up, indicating
 	 * that this mount point has survived the unmount attempt and vfs_busy
 	 * should retry.  Otherwise the unmounter thread will set MNTK_REFEXPIRE
 	 * flag in addition to MNTK_UNMOUNT, indicating that mount point is
 	 * about to be really destroyed.  vfs_busy needs to release its
 	 * reference on the mount point in this case and return with ENOENT,
 	 * telling the caller that the mount it tried to busy is no longer
 	 * valid.
 	 */
 	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
 			MNT_REL(mp);
 			MNT_IUNLOCK(mp);
 			CTR1(KTR_VFS, "%s: failed busying before sleeping",
 			    __func__);
 			return (ENOENT);
 		}
 		/* Do not sleep with mountlist_mtx held. */
 		if (flags & MBF_MNTLSTLOCK)
 			mtx_unlock(&mountlist_mtx);
 		mp->mnt_kern_flag |= MNTK_MWAIT;
 		/* PDROP: the interlock is not held when msleep() returns. */
 		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
 		/* Retake the locks in order: mountlist_mtx, then the interlock. */
 		if (flags & MBF_MNTLSTLOCK)
 			mtx_lock(&mountlist_mtx);
 		MNT_ILOCK(mp);
 	}
 	if (flags & MBF_MNTLSTLOCK)
 		mtx_unlock(&mountlist_mtx);
 	mp->mnt_lockref++;
 	MNT_IUNLOCK(mp);
 	return (0);
 }
 
 /*
  * Free a busy filesystem.
  *
  * Drops the reference and the lockref taken by vfs_busy().  When the last
  * lockref goes away while MNTK_DRAINING is set, the flag is cleared and
  * sleepers on &mp->mnt_lockref are woken.
  */
 void
 vfs_unbusy(struct mount *mp)
 {
 	int c;
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 
 	/* Fast path: adjust the per-CPU counters without the mount interlock. */
 	if (vfs_op_thread_enter(mp)) {
 		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
 		vfs_mp_count_sub_pcpu(mp, lockref, 1);
 		vfs_mp_count_sub_pcpu(mp, ref, 1);
 		vfs_op_thread_exit(mp);
 		return;
 	}
 
 	/* Slow path: update the counters under the mount interlock. */
 	MNT_ILOCK(mp);
 	vfs_assert_mount_counters(mp);
 	MNT_REL(mp);
 	c = --mp->mnt_lockref;
 	/* With no vfs op in progress there is nobody to wake up. */
 	if (mp->mnt_vfs_ops == 0) {
 		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
 		MNT_IUNLOCK(mp);
 		return;
 	}
 	/* A negative count means unbalanced busy/unbusy calls. */
 	if (c < 0)
 		vfs_dump_mount_counters(mp);
 	if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
 		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
 		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
 		mp->mnt_kern_flag &= ~MNTK_DRAINING;
 		wakeup(&mp->mnt_lockref);
 	}
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Find the mount point carrying the given filesystem identifier.
  *
  * On a match the mount point is returned with a reference taken via
  * vfs_ref(); NULL is returned when no mounted filesystem matches.
  */
 struct mount *
 vfs_getvfs(fsid_t *fsid)
 {
 	struct mount *mp;
 
 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0)
 			continue;
 		vfs_ref(mp);
 		break;
 	}
 	mtx_unlock(&mountlist_mtx);
 	/* mp is NULL here exactly when the list was exhausted. */
 	if (mp == NULL) {
 		CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
 	}
 	return (mp);
 }
 
 /*
  * Lookup a mount point by filesystem identifier, busying it before
  * returning.
  *
  * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
  * cache for popular filesystem identifiers.  The cache is lockless, relying
  * on the fact that struct mounts are never freed.  In the worst case we may
  * get a pointer to an unmounted or even a different filesystem, so we have
  * to check what we got and take the slow path if it does not match.
  */
 struct mount *
 vfs_busyfs(fsid_t *fsid)
 {
 #define	FSID_CACHE_SIZE	256
 	typedef struct mount * volatile vmp_t;
 	static vmp_t cache[FSID_CACHE_SIZE];
 	struct mount *mp;
 	uint32_t slot;
 
 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
 
 	/* Fold both fsid words into a cache slot index. */
 	slot = fsid->val[0] ^ fsid->val[1];
 	slot = (slot >> 16 ^ slot) & (FSID_CACHE_SIZE - 1);
 
 	/*
 	 * Fast path: trust the cached pointer only if the fsid matches both
 	 * before and after the mount point has been busied.
 	 */
 	mp = cache[slot];
 	if (mp != NULL && fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
 		if (vfs_busy(mp, 0) == 0) {
 			if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0)
 				return (mp);
 			vfs_unbusy(mp);
 		} else {
 			/* The cached mount could not be busied; drop it. */
 			cache[slot] = NULL;
 		}
 	}
 
 	/* Slow path: walk the mount list under mountlist_mtx. */
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0)
 			continue;
 		/* vfs_busy() leaves mountlist_mtx held when it fails. */
 		if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) {
 			cache[slot] = NULL;
 			mtx_unlock(&mountlist_mtx);
 			return (NULL);
 		}
 		cache[slot] = mp;
 		return (mp);
 	}
 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
 	mtx_unlock(&mountlist_mtx);
 	return (NULL);
 }
 
 /*
  * Decide whether the thread may use privileged mount options on mp.
  *
  * Returns 0 when access is permitted, EPERM when a jail restriction
  * applies, or the error from priv_check() when the caller is not the
  * original mounter and lacks PRIV_VFS_MOUNT_OWNER.
  */
 int
 vfs_suser(struct mount *mp, struct thread *td)
 {
 
 	/*
 	 * A jailed caller is refused outright if its jail does not allow
 	 * this type of file system, or if the file system was mounted
 	 * outside the caller's jail.
 	 */
 	if (jailed(td->td_ucred) &&
 	    (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag) ||
 	    prison_check(td->td_ucred, mp->mnt_cred) != 0))
 		return (EPERM);
 
 	/*
 	 * File systems supporting delegated administration verify ownership
 	 * themselves, so PRIV_VFS_MOUNT_OWNER is not checked for them.
 	 */
 	if ((mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) != 0)
 		return (0);
 
 	/* The user that performed the original mount needs no privilege. */
 	if (mp->mnt_cred->cr_uid == td->td_ucred->cr_uid)
 		return (0);
 
 	return (priv_check(td, PRIV_VFS_MOUNT_OWNER));
 }
 
 /*
  * Get a new unique fsid.  Try to make its val[0] unique, since this value
  * will be used to create fake device numbers for stat().  Also try (but
  * not so hard) make its val[0] unique mod 2^16, since some emulators only
  * support 16-bit device numbers.  We end up with unique val[0]'s for the
  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
  *
  * Keep in mind that several mounts may be running in parallel.  Starting
  * the search one past where the previous search terminated is both a
  * micro-optimization and a defense against returning the same fsid to
  * different mounts.
  */
 void
 vfs_getnewfsid(struct mount *mp)
 {
 	static uint16_t mntid_base;
 	struct mount *nmp;
 	fsid_t tfsid;
 	int mtype;
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 	mtx_lock(&mntid_mtx);
 	mtype = mp->mnt_vfc->vfc_typenum;
 	tfsid.val[1] = mtype;
 	mtype = (mtype & 0xFF) << 24;
 	for (;;) {
 		tfsid.val[0] = makedev(255,
 		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
 		mntid_base++;
 		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
 			break;
 		vfs_rel(nmp);
 	}
 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
 	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
 	mtx_unlock(&mntid_mtx);
 }
 
 /*
  * Knob to control the precision of file timestamps:
  *
  *   0 = seconds only; nanoseconds zeroed.
  *   1 = seconds and nanoseconds, accurate within 1/HZ.
  *   2 = seconds and nanoseconds, truncated to microseconds.
  * >=3 = seconds and nanoseconds, maximum precision.
  */
 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 
 static int timestamp_precision = TSP_USEC;
 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
     "3+: sec + ns (max. precision))");
 
 /*
  * Get a current timestamp.
  */
 void
 vfs_timestamp(struct timespec *tsp)
 {
 	struct timeval tv;
 
 	switch (timestamp_precision) {
 	case TSP_SEC:
 		tsp->tv_sec = time_second;
 		tsp->tv_nsec = 0;
 		break;
 	case TSP_HZ:
 		getnanotime(tsp);
 		break;
 	case TSP_USEC:
 		microtime(&tv);
 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
 		break;
 	case TSP_NSEC:
 	default:
 		nanotime(tsp);
 		break;
 	}
 }
 
 /*
  * Reset a vattr so that every attribute reads as "not specified":
  * the type becomes VNON, va_vaflags is cleared and all remaining
  * fields are set to VNOVAL.
  */
 void
 vattr_null(struct vattr *vap)
 {
 
 	/* The two fields that do not take VNOVAL. */
 	vap->va_type = VNON;
 	vap->va_vaflags = 0;
 
 	/* Timestamps. */
 	vap->va_atime.tv_sec = VNOVAL;
 	vap->va_atime.tv_nsec = VNOVAL;
 	vap->va_mtime.tv_sec = VNOVAL;
 	vap->va_mtime.tv_nsec = VNOVAL;
 	vap->va_ctime.tv_sec = VNOVAL;
 	vap->va_ctime.tv_nsec = VNOVAL;
 	vap->va_birthtime.tv_sec = VNOVAL;
 	vap->va_birthtime.tv_nsec = VNOVAL;
 
 	/* Ownership, mode and link count. */
 	vap->va_mode = VNOVAL;
 	vap->va_nlink = VNOVAL;
 	vap->va_uid = VNOVAL;
 	vap->va_gid = VNOVAL;
 
 	/* Identity and sizes. */
 	vap->va_fsid = VNOVAL;
 	vap->va_fileid = VNOVAL;
 	vap->va_size = VNOVAL;
 	vap->va_bytes = VNOVAL;
 	vap->va_blocksize = VNOVAL;
 	vap->va_rdev = VNOVAL;
 
 	/* Flags and generation. */
 	vap->va_flags = VNOVAL;
 	vap->va_gen = VNOVAL;
 }
 
 /*
  * Try to reduce the total number of vnodes.
  *
  * This routine (and its user) are buggy in at least the following ways:
  * - all parameters were picked years ago when RAM sizes were significantly
  *   smaller
  * - it can pick vnodes based on pages used by the vm object, but filesystems
  *   like ZFS don't use it making the pick broken
  * - since ZFS has its own aging policy it gets partially combated by this one
  * - a dedicated method should be provided for filesystems to let them decide
  *   whether the vnode should be recycled
  *
  * This routine is called when we have too many vnodes.  It attempts
  * to free <count> vnodes and will potentially free vnodes that still
  * have VM backing store (VM backing store is typically the cause
  * of a vnode blowout so we want to do this).  Therefore, this operation
  * is not considered cheap.
  *
  * A number of conditions may prevent a vnode from being reclaimed.
  * the buffer cache may have references on the vnode, a directory
  * vnode may still have references due to the namei cache representing
  * underlying files, or the vnode may be in active use.   It is not
  * desirable to reuse such vnodes.  These conditions may cause the
  * number of vnodes to reach some minimum value regardless of what
  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  *
  * @param reclaim_nc_src Only reclaim directories with outgoing namecache
  * 			 entries if this argument is true
  * @param trigger	 Only reclaim vnodes with fewer than this many resident
  *			 pages.
  * @param target	 How many vnodes to reclaim.
  * @return		 The number of vnodes that were reclaimed.
  */
 static int
 vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
 {
 	struct vnode *vp, *mvp;
 	struct mount *mp;
 	struct vm_object *object;
 	u_long done;
 	bool retried;
 
 	/* Entered and left with vnode_list_mtx held; dropped while reclaiming. */
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
 
 	retried = false;
 	done = 0;
 
 	/* The reclaim marker records where the scan of vnode_list resumes. */
 	mvp = vnode_list_reclaim_marker;
 restart:
 	vp = mvp;
 	while (done < target) {
 		vp = TAILQ_NEXT(vp, v_vnodelist);
 		if (__predict_false(vp == NULL))
 			break;
 
 		if (__predict_false(vp->v_type == VMARKER))
 			continue;
 
 		/*
 		 * If it's been deconstructed already, it's still
 		 * referenced, or it exceeds the trigger, skip it.
 		 * Also skip free vnodes.  We are trying to make space
 		 * to expand the free list, not reduce it.
 		 */
 		/* Unlocked pre-check; repeated below under the interlock. */
 		if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
 		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
 			goto next_iter;
 
 		if (vp->v_type == VBAD || vp->v_type == VNON)
 			goto next_iter;
 
 		/* Never block on the interlock while holding vnode_list_mtx. */
 		if (!VI_TRYLOCK(vp))
 			goto next_iter;
 
 		if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
 		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
 		    VN_IS_DOOMED(vp) || vp->v_type == VNON) {
 			VI_UNLOCK(vp);
 			goto next_iter;
 		}
 
 		/* Vnodes without a VM object are skipped as well. */
 		object = atomic_load_ptr(&vp->v_object);
 		if (object == NULL || object->resident_page_count > trigger) {
 			VI_UNLOCK(vp);
 			goto next_iter;
 		}
 
 		/*
 		 * Candidate found: hold it, park the marker right behind it
 		 * and drop the list lock for the locking operations below.
 		 */
 		vholdl(vp);
 		VI_UNLOCK(vp);
 		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
 		mtx_unlock(&vnode_list_mtx);
 
 		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 			vdrop(vp);
 			goto next_iter_unlocked;
 		}
 		if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) {
 			vdrop(vp);
 			vn_finished_write(mp);
 			goto next_iter_unlocked;
 		}
 
 		/* Re-validate: the vnode may have changed while unlocked. */
 		VI_LOCK(vp);
 		if (vp->v_usecount > 0 ||
 		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
 		    (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VOP_UNLOCK(vp);
 			vdropl(vp);
 			vn_finished_write(mp);
 			goto next_iter_unlocked;
 		}
 		counter_u64_add(recycles_count, 1);
 		vgonel(vp);
 		VOP_UNLOCK(vp);
 		vdropl(vp);
 		vn_finished_write(mp);
 		done++;
 next_iter_unlocked:
 		/* Reached without vnode_list_mtx; retake it and resume at the marker. */
 		if (should_yield())
 			kern_yield(PRI_USER);
 		mtx_lock(&vnode_list_mtx);
 		goto restart;
 next_iter:
 		/* Reached with vnode_list_mtx held and vp rejected. */
 		MPASS(vp->v_type != VMARKER);
 		if (!should_yield())
 			continue;
 		/* Save the scan position in the marker before yielding. */
 		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
 		mtx_unlock(&vnode_list_mtx);
 		kern_yield(PRI_USER);
 		mtx_lock(&vnode_list_mtx);
 		goto restart;
 	}
 	/* Nothing reclaimed: rewind the marker to the head and scan once more. */
 	if (done == 0 && !retried) {
 		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 		TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
 		retried = true;
 		goto restart;
 	}
 	return (done);
 }
 
 /* Upper bound applied to the count argument of vnlru_free_locked(). */
 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
     0,
     "limit on vnode free requests per call to the vnlru_free routine");
 
 /*
  * Attempt to reduce the free list by the requested amount.
  *
  * Called and returns with vnode_list_mtx held; the lock is dropped around
  * each recycle attempt.  When mnt_op is not NULL, vnodes whose mount uses a
  * different vfsops are skipped.  Returns the number of vnodes handed to
  * vtryrecycle(), whether or not each attempt succeeded.
  */
 static int
 vnlru_free_locked(int count, struct vfsops *mnt_op)
 {
 	struct vnode *vp, *mvp;
 	struct mount *mp;
 	int ocount;
 
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
 	if (count > max_vnlru_free)
 		count = max_vnlru_free;
 	ocount = count;
 	/* The free marker records where the scan of vnode_list resumes. */
 	mvp = vnode_list_free_marker;
 restart:
 	vp = mvp;
 	while (count > 0) {
 		vp = TAILQ_NEXT(vp, v_vnodelist);
 		if (__predict_false(vp == NULL)) {
 			/* End of list: park the marker at the tail and stop. */
 			TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 			TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist);
 			break;
 		}
 		if (__predict_false(vp->v_type == VMARKER))
 			continue;
 
 		/*
 		 * Don't recycle if our vnode is from different type
 		 * of mount point.  Note that mp is type-safe, the
 		 * check does not reach unmapped address even if
 		 * vnode is reclaimed.
 		 * Don't recycle if we can't get the interlock without
 		 * blocking.
 		 */
 		/* Held vnodes (v_holdcnt > 0) are skipped as well. */
 		if (vp->v_holdcnt > 0 || (mnt_op != NULL && (mp = vp->v_mount) != NULL &&
 		    mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) {
 			continue;
 		}
 		/* Move the marker behind vp so the scan resumes after it. */
 		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
 		if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		/* Hold the vnode across the unlocked recycle attempt. */
 		vholdl(vp);
 		count--;
 		mtx_unlock(&vnode_list_mtx);
 		VI_UNLOCK(vp);
 		vtryrecycle(vp);
 		vdrop(vp);
 		mtx_lock(&vnode_list_mtx);
 		goto restart;
 	}
 	return (ocount - count);
 }
 
 /*
  * Wrapper around vnlru_free_locked() for callers that do not hold
  * vnode_list_mtx: takes the lock, attempts to free up to count vnodes
  * (optionally restricted to mounts using mnt_op) and discards the result.
  */
 void
 vnlru_free(int count, struct vfsops *mnt_op)
 {
 
 	mtx_lock(&vnode_list_mtx);
 	vnlru_free_locked(count, mnt_op);
 	mtx_unlock(&vnode_list_mtx);
 }
 
 /*
  * Recompute gapvnodes, vhiwat and vlowat from the current desiredvnodes and
  * wantfreevnodes.  Must be called with vnode_list_mtx held.
  */
 static void
 vnlru_recalc(void)
 {
 
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
 	/*
 	 * The gap is clamped to at least 100.
 	 * NOTE(review): imax() presumably takes int arguments, so the u_long
 	 * difference is narrowed before the comparison -- confirm this is
 	 * intended for very large or wrapped values.
 	 */
 	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
 	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
 	vlowat = vhiwat / 2;
 }
 
 /*
  * Attempt to recycle vnodes in a context that is always safe to block.
  * Calling vlrureclaim() from the bowels of filesystem code has some
  * interesting deadlock problems.
  */
 /* The vnlru kernel process (set through vnlru_kp); also its sleep channel. */
 static struct proc *vnlruproc;
 /* Set to 1 by vnlru_kick(), cleared by vnlru_proc() right before it sleeps. */
 static int vnlruproc_sig;
 
 /*
  * The main freevnodes counter is only updated when threads requeue their vnode
  * batches. CPUs are conditionally walked to compute a more accurate total.
  *
  * Limit how much slop we are willing to tolerate. Note: the actual value
  * at any given moment can still exceed the slop, but it should not do so by
  * a significant margin in practice.
  */
 #define VNLRU_FREEVNODES_SLOP 128
 
 /*
  * Return an estimate of the number of free vnodes, never below zero.
  *
  * While the global counter has drifted less than VNLRU_FREEVNODES_SLOP from
  * the last folded total, the global counter alone is reported.  Otherwise
  * the per-CPU batch counters are added to it and the result is remembered
  * in freevnodes_old.  Requires vnode_list_mtx.
  */
 static u_long
 vnlru_read_freevnodes(void)
 {
 	struct vdbatch *batch;
 	long drift;
 	int cpu;
 
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
 
 	/* Absolute distance between the counter and the last folded total. */
 	drift = freevnodes > freevnodes_old ?
 	    freevnodes - freevnodes_old : freevnodes_old - freevnodes;
 	if (drift < VNLRU_FREEVNODES_SLOP)
 		return (freevnodes >= 0 ? freevnodes : 0);
 
 	/* Fold in every CPU's batched contribution. */
 	freevnodes_old = freevnodes;
 	CPU_FOREACH(cpu) {
 		batch = DPCPU_ID_PTR((cpu), vd);
 		freevnodes_old += batch->freevnodes;
 	}
 	return (freevnodes_old >= 0 ? freevnodes_old : 0);
 }
 
 /*
  * Report whether fewer than limit vnodes of headroom remain.
  *
  * Headroom is desiredvnodes - rnumvnodes, plus any free vnodes in excess of
  * wantfreevnodes.  Being over desiredvnodes always counts as "under".
  * Uses vnlru_read_freevnodes(), which asserts vnode_list_mtx.
  */
 static bool
 vnlru_under(u_long rnumvnodes, u_long limit)
 {
 	u_long nfree, room;
 
 	if (__predict_false(rnumvnodes > desiredvnodes))
 		return (true);
 
 	room = desiredvnodes - rnumvnodes;
 	if (room >= limit)
 		return (false);
 
 	/* Too tight on allocation alone; credit surplus free vnodes. */
 	nfree = vnlru_read_freevnodes();
 	if (nfree > wantfreevnodes)
 		room += nfree - wantfreevnodes;
 	return (room < limit);
 }
 
 /*
  * Variant of vnlru_under() that reads the global freevnodes counter with an
  * atomic load instead of calling vnlru_read_freevnodes(), so the per-CPU
  * batch counters are not included.
  */
 static bool
 vnlru_under_unlocked(u_long rnumvnodes, u_long limit)
 {
 	long nfree, room;
 
 	if (__predict_false(rnumvnodes > desiredvnodes))
 		return (true);
 
 	room = desiredvnodes - rnumvnodes;
 	if (room >= limit)
 		return (false);
 
 	/* Too tight on allocation alone; credit surplus free vnodes. */
 	nfree = atomic_load_long(&freevnodes);
 	if (nfree > wantfreevnodes)
 		room += nfree - wantfreevnodes;
 	return (room < limit);
 }
 
 /*
  * Wake the vnlru process unless a wakeup is already pending.
  * Requires vnode_list_mtx.
  */
 static void
 vnlru_kick(void)
 {
 
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
 	if (vnlruproc_sig != 0)
 		return;
 	vnlruproc_sig = 1;
 	wakeup(vnlruproc);
 }
 
 /*
  * Main loop of the vnlru kernel process.  Each iteration trims the free
  * list when numvnodes exceeds desiredvnodes, sleeps while the cache has
  * enough headroom, and otherwise calls vlrureclaim() with a "force" level
  * that escalates from 0/1 to 2 to 3 whenever a pass reclaims nothing.
  */
 static void
 vnlru_proc(void)
 {
 	u_long rnumvnodes, rfreevnodes, target;
 	unsigned long onumvnodes;
 	int done, force, trigger, usevnodes;
 	bool reclaim_nc_src, want_reread;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
 	    SHUTDOWN_PRI_FIRST);
 
 	force = 0;
 	want_reread = false;
 	for (;;) {
 		kproc_suspend_check(vnlruproc);
 		mtx_lock(&vnode_list_mtx);
 		rnumvnodes = atomic_load_long(&numvnodes);
 
 		/* After a reclaim pass, re-derive force from the high watermark. */
 		if (want_reread) {
 			force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
 			want_reread = false;
 		}
 
 		/*
 		 * If numvnodes is too large (due to desiredvnodes being
 		 * adjusted using its sysctl, or emergency growth), first
 		 * try to reduce it by discarding from the free list.
 		 */
 		if (rnumvnodes > desiredvnodes) {
 			vnlru_free_locked(rnumvnodes - desiredvnodes, NULL);
 			rnumvnodes = atomic_load_long(&numvnodes);
 		}
 		/*
 		 * Sleep if the vnode cache is in a good state.  This is
 		 * when it is not over-full and has space for about a 4%
 		 * or 9% expansion (by growing its size or inexcessively
 		 * reducing its free list).  Otherwise, try to reclaim
 		 * space for a 10% expansion.
 		 */
 		if (vstir && force == 0) {
 			force = 1;
 			vstir = 0;
 		}
 		if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) {
 			vnlruproc_sig = 0;
 			wakeup(&vnlruproc_sig);
 			/* PDROP: vnode_list_mtx is not held after the sleep. */
 			msleep(vnlruproc, &vnode_list_mtx,
 			    PVFS|PDROP, "vlruwt", hz);
 			continue;
 		}
 		rfreevnodes = vnlru_read_freevnodes();
 
 		onumvnodes = rnumvnodes;
 		/*
 		 * Calculate parameters for recycling.  These are the same
 		 * throughout the loop to give some semblance of fairness.
 		 * The trigger point is to avoid recycling vnodes with lots
 		 * of resident pages.  We aren't trying to free memory; we
 		 * are trying to recycle or at least free vnodes.
 		 */
 		if (rnumvnodes <= desiredvnodes)
 			usevnodes = rnumvnodes - rfreevnodes;
 		else
 			usevnodes = rnumvnodes;
 		/* Guard the division below against zero or negative values. */
 		if (usevnodes <= 0)
 			usevnodes = 1;
 		/*
 		 * The trigger value is chosen to give a conservatively
 		 * large value to ensure that it alone doesn't prevent
 		 * making progress.  The value can easily be so large that
 		 * it is effectively infinite in some congested and
 		 * misconfigured cases, and this is necessary.  Normally
 		 * it is about 8 to 100 (pages), which is quite large.
 		 */
 		trigger = vm_cnt.v_page_count * 2 / usevnodes;
 		/* Below force level 2 the small fixed trigger is used instead. */
 		if (force < 2)
 			trigger = vsmalltrigger;
 		reclaim_nc_src = force >= 3;
 		target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1);
 		target = target / 10 + 1;
 		done = vlrureclaim(reclaim_nc_src, trigger, target);
 		mtx_unlock(&vnode_list_mtx);
 		/* Dropped back under the limit: drain UMA caches. */
 		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
 			uma_reclaim(UMA_RECLAIM_DRAIN);
 		if (done == 0) {
 			/* Nothing reclaimed: escalate the force level and retry. */
 			if (force == 0 || force == 1) {
 				force = 2;
 				continue;
 			}
 			if (force == 2) {
 				force = 3;
 				continue;
 			}
 			/* Even force level 3 failed: record it and back off. */
 			want_reread = true;
 			force = 0;
 			vnlru_nowhere++;
 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 		} else {
 			want_reread = true;
 			kern_yield(PRI_USER);
 		}
 	}
 }
 
 /*
  * Kernel process descriptor for the vnode recycling daemon.  It bundles
  * the process name "vnlru", the vnlru_proc function and the address of
  * vnlruproc; SYSINIT passes the descriptor to kproc_start() at
  * SI_SUB_KTHREAD_UPDATE.
  */
 static struct kproc_desc vnlru_kp = {
 	"vnlru",
 	vnlru_proc,
 	&vnlruproc
 };
 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
     &vnlru_kp);
 
 /*
  * Routines having to do with the management of the vnode table.
  */
 
 /*
  * Attempt to recycle a vnode that is believed to be free.
  *
  * The caller must hold the vnode (v_holdcnt != 0) so that it cannot be
  * returned to the free list part way through vgone().  Nothing here
  * blocks: the attempt is abandoned when the vnode lock is busy, when a
  * write cannot be started on the vnode's filesystem, or when someone
  * has picked up a use reference in the meantime.
  *
  * Returns 0 when the vnode ends up doomed (whether by us or already),
  * EWOULDBLOCK when the vnode lock could not be taken, and EBUSY for the
  * other failures.
  */
 static int
 vtryrecycle(struct vnode *vp)
 {
 	struct mount *wrmp;
 	int error;
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	VNASSERT(vp->v_holdcnt, vp,
 	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
 
 	/*
 	 * The vnode may have been found and locked through some other
 	 * list; if so it cannot be recycled yet.
 	 */
 	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 		CTR2(KTR_VFS,
 		    "%s: impossible to recycle, vp %p lock is already held",
 		    __func__, vp);
 		return (EWOULDBLOCK);
 	}
 
 	/* Leave the vnode alone while its filesystem is being suspended. */
 	if (vn_start_write(vp, &wrmp, V_NOWAIT) != 0) {
 		VOP_UNLOCK(vp);
 		CTR2(KTR_VFS,
 		    "%s: impossible to recycle, cannot start the write for %p",
 		    __func__, vp);
 		return (EBUSY);
 	}
 
 	/*
 	 * With the interlock held, check whether anyone picked the vnode
 	 * up from another list.  If nobody did, vgonel() marks it DOOMED
 	 * so that later finders skip over it.
 	 */
 	VI_LOCK(vp);
 	if (vp->v_usecount != 0) {
 		error = EBUSY;
 	} else {
 		error = 0;
 		if (!VN_IS_DOOMED(vp)) {
 			counter_u64_add(recycles_free_count, 1);
 			vgonel(vp);
 		}
 	}
 
 	/* Common unwind: vnode lock, interlock, then the write reference. */
 	VOP_UNLOCK(vp);
 	VI_UNLOCK(vp);
 	vn_finished_write(wrmp);
 	if (error != 0) {
 		CTR2(KTR_VFS,
 		    "%s: impossible to recycle, %p is already referenced",
 		    __func__, vp);
 	}
 	return (error);
 }
 
 /*
  * Allocate a new vnode.
  *
  * The operation never returns an error. Returning an error was disabled
  * in r145385 (dated 2005) with the following comment:
  *
  * XXX Not all VFS_VGET/ffs_vget callers check returns.
  *
  * Given the age of this commit (almost 15 years at the time of writing this
  * comment) restoring the ability to fail requires a significant audit of
  * all codepaths.
  *
  * The routine can try to free a vnode or stall for up to 1 second waiting for
  * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation.
  */
 /*
  * Incremented by vn_alloc_hard() on each pass that finds numvnodes at or
  * near desiredvnodes, and reset to 0 otherwise.  While it is non-zero
  * vn_alloc() goes straight to vn_alloc_hard().
  */
 static u_long vn_alloc_cyclecount;
 
 /*
  * Slow path of vn_alloc(): runs under vnode_list_mtx, may reclaim one free
  * vnode or sleep on vnlruproc_sig, then accounts for the new vnode in
  * numvnodes and allocates it from vnode_zone.
  *
  * mp may be NULL; when it is non-NULL and has MNTK_SUSPEND set, the wait
  * for vnlru is skipped.
  */
 static struct vnode * __noinline
 vn_alloc_hard(struct mount *mp)
 {
 	u_long rnumvnodes, rfreevnodes;
 
 	mtx_lock(&vnode_list_mtx);
 	rnumvnodes = atomic_load_long(&numvnodes);
 	/* Still below the target: no reclamation or waiting is needed. */
 	if (rnumvnodes + 1 < desiredvnodes) {
 		vn_alloc_cyclecount = 0;
 		goto alloc;
 	}
 	rfreevnodes = vnlru_read_freevnodes();
 	/*
 	 * Once the pass counter reaches the number of free vnodes, reset it
 	 * and set vstir, which vnlru_proc() converts into force = 1.
 	 */
 	if (vn_alloc_cyclecount++ >= rfreevnodes) {
 		vn_alloc_cyclecount = 0;
 		vstir = 1;
 	}
 	/*
 	 * Grow the vnode cache if it will not be above its target max
 	 * after growing.  Otherwise, if the free list is nonempty, try
 	 * to reclaim 1 item from it before growing the cache (possibly
 	 * above its target max if the reclamation failed or is delayed).
 	 * Otherwise, wait for some space.  In all cases, schedule
 	 * vnlru_proc() if we are getting short of space.  The watermarks
 	 * should be chosen so that we never wait or even reclaim from
 	 * the free list to below its target minimum.
 	 */
 	if (vnlru_free_locked(1, NULL) > 0)
 		goto alloc;
 	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
 		/*
 		 * Wait for space for a new vnode.
 		 */
 		vnlru_kick();
 		/* Sleep for at most hz ticks; vnlru_proc() wakes this channel. */
 		msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
 		/* Still over target with free vnodes around: try once more. */
 		if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
 		    vnlru_read_freevnodes() > 1)
 			vnlru_free_locked(1, NULL);
 	}
 alloc:
 	/* Count the new vnode before allocating it. */
 	rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
 	if (vnlru_under(rnumvnodes, vlowat))
 		vnlru_kick();
 	mtx_unlock(&vnode_list_mtx);
 	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 }
 
 static struct vnode *
 vn_alloc(struct mount *mp)
 {
 	u_long rnumvnodes;
 
 	if (__predict_false(vn_alloc_cyclecount != 0))
 		return (vn_alloc_hard(mp));
 	rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
 	if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) {
 		atomic_subtract_long(&numvnodes, 1);
 		return (vn_alloc_hard(mp));
 	}
 
 	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 }
 
 /*
  * Give a vnode back to vnode_zone and drop it from the numvnodes count.
  */
 static void
 vn_free(struct vnode *vp)
 {
 
 	atomic_subtract_long(&numvnodes, 1);
 	uma_zfree_smr(vnode_zone, vp);
 }
 
 /*
  * Set up a new vnode and return it through *vpp.
  *
  * The vnode is taken from the calling thread's reservation
  * (td_vp_reserved) when one exists, otherwise it comes from vn_alloc().
  * It is then given the lock name 'tag', type VNON and the operations
  * vector 'vops'.  'mp' may be NULL; when it is not, the block size and
  * the VV_NOKNOTE flag are derived from the mount.
  *
  * Always returns 0.
  */
 int
 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
     struct vnode **vpp)
 {
 	struct vnode *vp;
 	struct thread *td;
 	struct lock_object *lo;
 
 	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
 
 	KASSERT(vops->registered,
 	    ("%s: not registered vector op %p\n", __func__, vops));
 
 	td = curthread;
 	/* Consume the thread's reserved vnode, if it has one. */
 	if (td->td_vp_reserved != NULL) {
 		vp = td->td_vp_reserved;
 		td->td_vp_reserved = NULL;
 	} else {
 		vp = vn_alloc(mp);
 	}
 	counter_u64_add(vnodes_created, 1);
 	/*
 	 * Locks are given the generic name "vnode" when created.
 	 * Follow the historic practice of using the filesystem
 	 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
 	 *
 	 * Locks live in a witness group keyed on their name. Thus,
 	 * when a lock is renamed, it must also move from the witness
 	 * group of its old name to the witness group of its new name.
 	 *
 	 * The change only needs to be made when the vnode moves
 	 * from one filesystem type to another. We ensure that each
 	 * filesystem uses a single static name pointer for its tag so
 	 * that we can compare pointers rather than doing a strcmp().
 	 *
 	 * Without WITNESS the name is assigned unconditionally; with
 	 * WITNESS it is assigned, and the witness state rebuilt, only
 	 * when the tag pointer differs.
 	 */
 	lo = &vp->v_vnlock->lock_object;
 #ifdef WITNESS
 	if (lo->lo_name != tag) {
 #endif
 		lo->lo_name = tag;
 #ifdef WITNESS
 		WITNESS_DESTROY(lo);
 		WITNESS_INIT(lo, tag);
 	}
 #endif
 	/*
 	 * By default, don't allow shared locks unless filesystems opt-in.
 	 */
 	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
 	/*
 	 * Finalize various vnode identity bits.
 	 */
 	KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
 	KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
 	KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
 	vp->v_type = VNON;
 	vp->v_op = vops;
 	v_init_counters(vp);
 	vp->v_bufobj.bo_ops = &buf_ops_bio;
 #ifdef DIAGNOSTIC
 	if (mp == NULL && vops != &dead_vnodeops)
 		printf("NULL mp in getnewvnode(9), tag %s\n", tag);
 #endif
 #ifdef MAC
 	mac_vnode_init(vp);
 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
 		mac_vnode_associate_singlelabel(mp, vp);
 #endif
 	if (mp != NULL) {
 		vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 			vp->v_vflag |= VV_NOKNOTE;
 	}
 
 	/*
 	 * For the filesystems which do not use vfs_hash_insert(),
 	 * still initialize v_hash to have vfs_hash_index() useful.
 	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
 	 * its own hashing.
 	 */
 	vp->v_hash = (uintptr_t)vp >> vnsz2log;
 
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Pre-allocate one vnode for the calling thread and park it in
  * td_vp_reserved, where the next getnewvnode() call by this thread will
  * pick it up.  The thread must not already hold a reservation.
  */
 void
 getnewvnode_reserve(void)
 {
 	struct thread *td;
 
 	td = curthread;
 	MPASS(td->td_vp_reserved == NULL);
 	td->td_vp_reserved = vn_alloc(NULL);
 }
 
 /*
  * Release the calling thread's reserved vnode, if it still has one.
  * Calling this without a reservation is a no-op.
  */
 void
 getnewvnode_drop_reserve(void)
 {
 	struct thread *td;
 	struct vnode *reserved;
 
 	td = curthread;
 	reserved = td->td_vp_reserved;
 	if (reserved == NULL)
 		return;
 
 	vn_free(reserved);
 	td->td_vp_reserved = NULL;
 }
 
 /*
  * Final teardown of a vnode: verify that nothing is still attached to it,
  * scrub the fields that must not leak into its next use and return it to
  * the zone through vn_free().
  *
  * The vnode interlock is released here (VI_UNLOCK below), so the caller
  * presumably enters with it held -- confirm against the callers.
  */
 static void
 freevnode(struct vnode *vp)
 {
 	struct bufobj *bo;
 
 	/*
 	 * The vnode has been marked for destruction, so free it.
 	 *
 	 * The vnode will be returned to the zone where it will
 	 * normally remain until it is needed for another vnode. We
 	 * need to cleanup (or verify that the cleanup has already
 	 * been done) any residual data left from its current use
 	 * so as not to contaminate the freshly allocated vnode.
 	 */
 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
 	/*
 	 * Paired with vgone.
 	 */
 	vn_seqc_write_end_locked(vp);
 	VNPASS(vp->v_seqc_users == 0, vp);
 
 	bo = &vp->v_bufobj;
 	/* Sanity checks: no data, references, buffers or name cache left. */
 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
 	VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
 	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
 	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
 	    ("clean blk trie not empty"));
 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
 	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
 	    ("dirty blk trie not empty"));
 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
 	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
 	VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
 	    ("Dangling rangelock waiters"));
 	VI_UNLOCK(vp);
 #ifdef MAC
 	mac_vnode_destroy(vp);
 #endif
 	if (vp->v_pollinfo != NULL) {
 		destroy_vpollinfo(vp->v_pollinfo);
 		vp->v_pollinfo = NULL;
 	}
 #ifdef INVARIANTS
 	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
 	vp->v_op = NULL;
 #endif
 	/* Reset per-use state so the next user starts from a clean vnode. */
 	vp->v_mountedhere = NULL;
 	vp->v_unpcb = NULL;
 	vp->v_rdev = NULL;
 	vp->v_fifoinfo = NULL;
 	vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
 	vp->v_irflag = 0;
 	vp->v_iflag = 0;
 	vp->v_vflag = 0;
 	bo->bo_flag = 0;
 	vn_free(vp);
 }
 
 /*
  * Detach a vnode from the vnode list of the mount it belongs to.
  *
  * Nothing happens when the vnode has no mount.  Otherwise v_mount is
  * cleared under the vnode interlock, the vnode is unlinked from
  * mnt_nvnodelist and the mount reference is released, all while the
  * mount interlock is held.  The vnode must not be on the lazy list.
  */
 static void
 delmntque(struct vnode *vp)
 {
 	struct mount *oldmp;
 
 	VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
 
 	oldmp = vp->v_mount;
 	if (oldmp != NULL) {
 		MNT_ILOCK(oldmp);
 
 		VI_LOCK(vp);
 		vp->v_mount = NULL;
 		VI_UNLOCK(vp);
 
 		VNASSERT(oldmp->mnt_nvnodelistsize > 0, vp,
 		    ("bad mount point vnode list size"));
 		TAILQ_REMOVE(&oldmp->mnt_nvnodelist, vp, v_nmntvnodes);
 		oldmp->mnt_nvnodelistsize--;
 		MNT_REL(oldmp);
 
 		MNT_IUNLOCK(oldmp);
 	}
 }
 
 /*
  * Default destructor that insmntque() passes to insmntque1(): strips the
  * vnode of its private data, switches it to dead_vnodeops, then calls
  * vgone() and vput() on it.  dtr_arg is unused.
  */
 static void
 insmntque_stddtr(struct vnode *vp, void *dtr_arg)
 {
 
 	vp->v_data = NULL;
 	vp->v_op = &dead_vnodeops;
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * Put a vnode on the vnode list of mount point mp.
  *
  * The vnode must be exclusively locked and must not already belong to a
  * mount.  Insertion is refused when the mount is being unmounted and
  * either the unmount is forced or the mount's vnode list is already
  * empty, unless the vnode carries VV_FORCEINSMQ.  On refusal the
  * optional destructor dtr is called with (vp, dtr_arg) after both
  * locks have been dropped.
  *
  * Returns 0 on success and EBUSY when the insertion was refused.
  */
 int
 insmntque1(struct vnode *vp, struct mount *mp,
 	void (*dtr)(struct vnode *, void *), void *dtr_arg)
 {
 
 	KASSERT(vp->v_mount == NULL,
 		("insmntque: vnode already on per mount vnode list"));
 	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
 	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
 
 	/*
 	 * The vnode interlock is taken up front and kept across the whole
 	 * insertion so that the vnode cannot be recycled by another
 	 * process dropping a hold on it before it is fully on the list.
 	 * The mount interlock only covers the list manipulation itself.
 	 */
 	MNT_ILOCK(mp);
 	VI_LOCK(vp);
 	if ((vp->v_vflag & VV_FORCEINSMQ) != 0 ||
 	    (mp->mnt_kern_flag & MNTK_UNMOUNT) == 0 ||
 	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) == 0 &&
 	    mp->mnt_nvnodelistsize != 0)) {
 		/* Insertion allowed: link the vnode and reference the mount. */
 		vp->v_mount = mp;
 		MNT_REF(mp);
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
 			("neg mount point vnode list size"));
 		mp->mnt_nvnodelistsize++;
 		VI_UNLOCK(vp);
 		MNT_IUNLOCK(mp);
 		return (0);
 	}
 
 	/* Insertion refused: let the caller's destructor dispose of vp. */
 	VI_UNLOCK(vp);
 	MNT_IUNLOCK(mp);
 	if (dtr != NULL)
 		dtr(vp, dtr_arg);
 	return (EBUSY);
 }
 
 /*
  * Convenience wrapper around insmntque1() that uses insmntque_stddtr as
  * the destructor.  Returns whatever insmntque1() returns.
  */
 int
 insmntque(struct vnode *vp, struct mount *mp)
 {
 
 	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
 }
 
 /*
  * Flush out and invalidate all buffers associated with a bufobj
  * Called with the underlying object locked.
  *
  * flags selects what is done: V_SAVE waits for writes and syncs dirty
  * buffers first, V_CLEANONLY limits the flush to the clean list, and
  * V_ALT / V_NORMAL / V_VMIO / V_ALLOWCLEAN restrict the later page
  * removal and consistency checks.  slpflag and slptimeo are passed on to
  * bufobj_wwait() and flushbuflist().
  *
  * Returns 0 on success, or the error from bufobj_wwait(), BO_SYNC() or
  * flushbuflist() (other than EAGAIN, which only causes a retry).
  */
 int
 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
 {
 	int error;
 
 	BO_LOCK(bo);
 	if (flags & V_SAVE) {
 		error = bufobj_wwait(bo, slpflag, slptimeo);
 		if (error) {
 			BO_UNLOCK(bo);
 			return (error);
 		}
 		if (bo->bo_dirty.bv_cnt > 0) {
 			/* BO_SYNC() is called without the bufobj lock. */
 			BO_UNLOCK(bo);
 			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
 				return (error);
 			/*
 			 * XXX We could save a lock/unlock if this was only
 			 * enabled under INVARIANTS
 			 */
 			BO_LOCK(bo);
 			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
 				panic("vinvalbuf: dirty bufs");
 		}
 	}
 	/*
 	 * If you alter this loop please notice that interlock is dropped and
 	 * reacquired in flushbuflist.  Special care is needed to ensure that
 	 * no race conditions occur from this.
 	 */
 	do {
 		error = flushbuflist(&bo->bo_clean,
 		    flags, bo, slpflag, slptimeo);
 		if (error == 0 && !(flags & V_CLEANONLY))
 			error = flushbuflist(&bo->bo_dirty,
 			    flags, bo, slpflag, slptimeo);
 		/* EAGAIN means "made progress, scan again"; others are fatal. */
 		if (error != 0 && error != EAGAIN) {
 			BO_UNLOCK(bo);
 			return (error);
 		}
 	} while (error != 0);
 
 	/*
 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
 	 * have write I/O in-progress but if there is a VM object then the
 	 * VM object can also have read-I/O in-progress.
 	 */
 	do {
 		bufobj_wwait(bo, 0, 0);
 		if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) {
 			BO_UNLOCK(bo);
 			vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx");
 			BO_LOCK(bo);
 		}
 	} while (bo->bo_numoutput > 0);
 	BO_UNLOCK(bo);
 
 	/*
 	 * Destroy the copy in the VM cache, too.
 	 */
 	if (bo->bo_object != NULL &&
 	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) {
 		VM_OBJECT_WLOCK(bo->bo_object);
 		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
 		    OBJPR_CLEANONLY : 0);
 		VM_OBJECT_WUNLOCK(bo->bo_object);
 	}
 
 #ifdef INVARIANTS
 	/* After an unrestricted flush no buffers may remain on either list. */
 	BO_LOCK(bo);
 	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO |
 	    V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 ||
 	    bo->bo_clean.bv_cnt > 0))
 		panic("vinvalbuf: flush failed");
 	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 &&
 	    bo->bo_dirty.bv_cnt > 0)
 		panic("vinvalbuf: flush dirty failed");
 	BO_UNLOCK(bo);
 #endif
 	return (0);
 }
 
 /*
  * Flush out and invalidate all buffers associated with a vnode.
  * Called with the underlying object locked.
  *
  * This is bufobj_invalbuf() applied to the vnode's own bufobj; the
  * arguments are passed through unchanged.  Returns 0 without doing
  * anything when the vnode has a VM object whose handle is not this vnode.
  */
 int
 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
 {
 
 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
 	if (vp->v_object != NULL && vp->v_object->handle != vp)
 		return (0);
 	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
 }
 
 /*
  * Flush out buffers on the specified list.
  *
  * Walks bufv (one of bo's clean or dirty lists) and invalidates each
  * buffer selected by flags; with V_SAVE, a delayed-write buffer is
  * written out instead.  Must be entered with the bufobj write-locked.
  * The lock is handed to BUF_TIMELOCK() as interlock for every buffer and
  * re-taken with BO_LOCK() before continuing or returning.
  *
  * Returns 0 when no buffer was selected, EAGAIN when at least one buffer
  * was processed (the caller is expected to rescan), or the error from
  * BUF_TIMELOCK() with ENOLCK translated to EAGAIN.
  */
 static int
 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
     int slptimeo)
 {
 	struct buf *bp, *nbp;
 	int retval, error;
 	daddr_t lblkno;
 	b_xflags_t xflags;
 
 	ASSERT_BO_WLOCKED(bo);
 
 	retval = 0;
 	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
 		/*
 		 * If we are flushing both V_NORMAL and V_ALT buffers then
 		 * do not skip any buffers. If we are flushing only V_NORMAL
 		 * buffers then skip buffers marked as BX_ALTDATA. If we are
 		 * flushing only V_ALT buffers then skip buffers not marked
 		 * as BX_ALTDATA.
 		 */
 		if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) &&
 		   (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) ||
 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) {
 			continue;
 		}
 		/*
 		 * Remember the successor's identity so it can be looked up
 		 * again once the bufobj lock has been dropped and re-taken.
 		 */
 		if (nbp != NULL) {
 			lblkno = nbp->b_lblkno;
 			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
 		}
 		retval = EAGAIN;
 		error = BUF_TIMELOCK(bp,
 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
 		    "flushbuf", slpflag, slptimeo);
 		if (error) {
 			BO_LOCK(bo);
 			return (error != ENOLCK ? error : EAGAIN);
 		}
 		KASSERT(bp->b_bufobj == bo,
 		    ("bp %p wrong b_bufobj %p should be %p",
 		    bp, bp->b_bufobj, bo));
 		/*
 		 * XXX Since there are no node locks for NFS, I
 		 * believe there is a slight chance that a delayed
 		 * write will occur while sleeping just above, so
 		 * check for it.
 		 */
 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
 		    (flags & V_SAVE)) {
 			bremfree(bp);
 			bp->b_flags |= B_ASYNC;
 			bwrite(bp);
 			BO_LOCK(bo);
 			return (EAGAIN);	/* XXX: why not loop ? */
 		}
 		/* Invalidate the buffer and release it. */
 		bremfree(bp);
 		bp->b_flags |= (B_INVAL | B_RELBUF);
 		bp->b_flags &= ~B_ASYNC;
 		brelse(bp);
 		BO_LOCK(bo);
 		if (nbp == NULL)
 			break;
 		/*
 		 * Re-validate the saved successor: stop if it is gone or is
 		 * no longer on the same kind of list as before.
 		 */
 		nbp = gbincore(bo, lblkno);
 		if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 		    != xflags)
 			break;			/* nbp invalid */
 	}
 	return (retval);
 }
 
 /*
  * Release every buffer on bufv whose logical block number lies in
  * [startn, endn), marking each B_RELBUF and, for VMIO buffers, also
  * B_NOREUSE.
  *
  * Must be entered with the bufobj locked.  The lock is passed to
  * BUF_TIMELOCK() as interlock for each buffer and re-taken with
  * BO_RLOCK() afterwards.
  *
  * Returns 0 when the range has been walked, or the BUF_TIMELOCK() error;
  * ENOLCK is not returned but causes the same lookup to be retried.
  */
 int
 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
 {
 	struct buf *bp;
 	int error;
 	daddr_t lblkno;
 
 	ASSERT_BO_LOCKED(bo);
 
 	for (lblkno = startn;;) {
 again:
 		/* First buffer at or after lblkno; stop once outside range. */
 		bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
 		if (bp == NULL || bp->b_lblkno >= endn ||
 		    bp->b_lblkno < startn)
 			break;
 		error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
 		    LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
 		if (error != 0) {
 			BO_RLOCK(bo);
 			if (error == ENOLCK)
 				goto again;
 			return (error);
 		}
 		KASSERT(bp->b_bufobj == bo,
 		    ("bp %p wrong b_bufobj %p should be %p",
 		    bp, bp->b_bufobj, bo));
 		/* Continue the search after the buffer just locked. */
 		lblkno = bp->b_lblkno + 1;
 		if ((bp->b_flags & B_MANAGED) == 0)
 			bremfree(bp);
 		bp->b_flags |= B_RELBUF;
 		/*
 		 * In the VMIO case, use the B_NOREUSE flag to hint that the
 		 * pages backing each buffer in the range are unlikely to be
 		 * reused.  Dirty buffers will have the hint applied once
 		 * they've been written.
 		 */
 		if ((bp->b_flags & B_VMIO) != 0)
 			bp->b_flags |= B_NOREUSE;
 		brelse(bp);
 		BO_RLOCK(bo);
 	}
 	return (0);
 }
 
 /*
  * Truncate a file's buffer and pages to a specified length.  This
  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
  * sync activity.
  *
  * length is the new file length in bytes and blksize the block size used
  * to turn it into a logical block number.  The vnode must be locked.
  * Always returns 0.
  */
 int
 vtruncbuf(struct vnode *vp, off_t length, int blksize)
 {
 	struct buf *bp, *nbp;
 	struct bufobj *bo;
 	daddr_t startlbn;
 
 	CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__,
 	    vp, blksize, (uintmax_t)length);
 
 	/*
 	 * Round up to the *next* lbn.
 	 */
 	startlbn = howmany(length, blksize);
 
 	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
 
 	bo = &vp->v_bufobj;
 restart_unlocked:
 	BO_LOCK(bo);
 
 	/* Drop every buffer from startlbn onwards, retrying on EAGAIN. */
 	while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN)
 		;
 
 	if (length > 0) {
 restartsync:
 		/*
 		 * Push out the dirty buffers with a block number <= 0
 		 * (NOTE(review): presumably metadata such as indirect
 		 * blocks -- confirm).  The scan restarts after each write
 		 * because the bufobj lock was dropped.
 		 */
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno > 0)
 				continue;
 			/*
 			 * Since we hold the vnode lock this should only
 			 * fail if we're racing with the buf daemon.
 			 */
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    BO_LOCKPTR(bo)) == ENOLCK)
 				goto restart_unlocked;
 
 			VNASSERT((bp->b_flags & B_DELWRI), vp,
 			    ("buf(%p) on dirty queue without DELWRI", bp));
 
 			bremfree(bp);
 			bawrite(bp);
 			BO_LOCK(bo);
 			goto restartsync;
 		}
 	}
 
 	/* Wait for outstanding writes, then shrink the pager's view. */
 	bufobj_wwait(bo, 0, 0);
 	BO_UNLOCK(bo);
 	vnode_pager_setsize(vp, length);
 
 	return (0);
 }
 
 /*
  * Throw away the cached contents of a file for the logical block range
  * [startlbn, endlbn): first the buffers, via v_inval_buf_range_locked()
  * under the bufobj lock, then the pages covering the matching byte range.
  *
  * The vnode must be locked and blksize must equal the bufobj's block
  * size.
  */
 void
 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
     int blksize)
 {
 	struct bufobj *bo;
 	off_t startoff, endoff;
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "v_inval_buf_range");
 
 	bo = &vp->v_bufobj;
 	BO_LOCK(bo);
 	MPASS(blksize == bo->bo_bsize);
 
 	/* Keep going for as long as the helper asks for a rescan. */
 	do {
 		error = v_inval_buf_range_locked(vp, bo, startlbn, endlbn);
 	} while (error == EAGAIN);
 
 	BO_UNLOCK(bo);
 
 	/* Convert the block range to byte offsets for the page removal. */
 	startoff = blksize * startlbn;
 	endoff = blksize * endlbn;
 	vn_pages_remove(vp, OFF_TO_IDX(startoff),
 	    OFF_TO_IDX(endoff + PAGE_SIZE - 1));
 }
 
 /*
  * Invalidate and release the clean and dirty buffers of bo whose logical
  * block number lies in [startlbn, endlbn).
  *
  * Must be entered with the vnode locked and the bufobj locked.  The
  * bufobj lock is handed to BUF_LOCK() as interlock for each buffer and
  * re-taken with BO_LOCK(), so the lists can change underneath the scan.
  *
  * Returns 0 once a full pass over both lists frees nothing, or EAGAIN
  * when a buffer lock attempt returned ENOLCK or the saved successor no
  * longer looks valid; callers loop until EAGAIN stops coming back.
  */
 static int
 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
     daddr_t startlbn, daddr_t endlbn)
 {
 	struct buf *bp, *nbp;
 	bool anyfreed;
 
 	ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked");
 	ASSERT_BO_LOCKED(bo);
 
 	do {
 		anyfreed = false;
 		/* Pass over the clean list. */
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
 				continue;
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    BO_LOCKPTR(bo)) == ENOLCK) {
 				BO_LOCK(bo);
 				return (EAGAIN);
 			}
 
 			bremfree(bp);
 			bp->b_flags |= B_INVAL | B_RELBUF;
 			bp->b_flags &= ~B_ASYNC;
 			brelse(bp);
 			anyfreed = true;
 
 			BO_LOCK(bo);
 			/*
 			 * The successor was sampled before the lock was
 			 * dropped; bail out if it has left the clean list,
 			 * changed vnode or become a delayed write.
 			 */
 			if (nbp != NULL &&
 			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
 			    nbp->b_vp != vp ||
 			    (nbp->b_flags & B_DELWRI) != 0))
 				return (EAGAIN);
 		}
 
 		/* Pass over the dirty list. */
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
 				continue;
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    BO_LOCKPTR(bo)) == ENOLCK) {
 				BO_LOCK(bo);
 				return (EAGAIN);
 			}
 			bremfree(bp);
 			bp->b_flags |= B_INVAL | B_RELBUF;
 			bp->b_flags &= ~B_ASYNC;
 			brelse(bp);
 			anyfreed = true;
 
 			BO_LOCK(bo);
 			/*
 			 * Same check as above for the dirty list: the
 			 * successor must still be dirty, ours and DELWRI.
 			 */
 			if (nbp != NULL &&
 			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
 			    (nbp->b_vp != vp) ||
 			    (nbp->b_flags & B_DELWRI) == 0))
 				return (EAGAIN);
 		}
 	} while (anyfreed);
 	return (0);
 }
 
 /*
  * Take a buffer off whichever of its bufobj's lists (clean or dirty) it
  * is currently on, removing it from both the trie and the tail queue and
  * clearing its BX_VNDIRTY/BX_VNCLEAN marker.
  *
  * The bufobj must be write-locked and the buffer must be on exactly one
  * of the two lists.
  */
 static void
 buf_vlist_remove(struct buf *bp)
 {
 	struct bufv *queue;
 	b_xflags_t onlist;
 
 	onlist = bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
 
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	ASSERT_BO_WLOCKED(bp->b_bufobj);
 	KASSERT(onlist != 0 && onlist != (BX_VNDIRTY | BX_VNCLEAN),
 	    ("%s: buffer %p has invalid queue state", __func__, bp));
 
 	queue = (onlist & BX_VNDIRTY) != 0 ?
 	    &bp->b_bufobj->bo_dirty : &bp->b_bufobj->bo_clean;
 
 	BUF_PCTRIE_REMOVE(&queue->bv_root, bp->b_lblkno);
 	TAILQ_REMOVE(&queue->bv_hd, bp, b_bobufs);
 	queue->bv_cnt--;
 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 }
 
 /*
  * Insert a buffer into the clean or dirty list of a bufobj, keeping the
  * list sorted by logical block number.
  *
  * xflags must be BX_VNCLEAN or BX_VNDIRTY and selects the list; it is
  * also recorded in the buffer's b_xflags.  The bufobj must be
  * write-locked and the buffer must not currently be on either list.
  * Panics if the trie insertion fails.
  */
 static void
 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
 {
 	struct bufv *queue;
 	struct buf *prev;
 
 	ASSERT_BO_WLOCKED(bo);
 	KASSERT((bo->bo_flag & BO_NOBUFS) == 0,
 	    ("buf_vlist_add: bo %p does not allow bufs", bo));
 	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
 	    ("dead bo %p", bo));
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
 	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
 
 	bp->b_xflags |= xflags;
 	queue = (xflags & BX_VNDIRTY) ? &bo->bo_dirty : &bo->bo_clean;
 
 	/*
 	 * Appending is the expected common case (empty list, or a block
 	 * number beyond the current tail), so test for it first.  Otherwise
 	 * ask the trie for the closest buffer at or below our block number
 	 * and link in right after it, or at the head if there is none.
 	 */
 	if (queue->bv_cnt == 0 ||
 	    bp->b_lblkno > TAILQ_LAST(&queue->bv_hd, buflists)->b_lblkno) {
 		TAILQ_INSERT_TAIL(&queue->bv_hd, bp, b_bobufs);
 	} else {
 		prev = BUF_PCTRIE_LOOKUP_LE(&queue->bv_root, bp->b_lblkno);
 		if (prev == NULL)
 			TAILQ_INSERT_HEAD(&queue->bv_hd, bp, b_bobufs);
 		else
 			TAILQ_INSERT_AFTER(&queue->bv_hd, prev, bp, b_bobufs);
 	}
 
 	if (BUF_PCTRIE_INSERT(&queue->bv_root, bp) != 0)
 		panic("buf_vlist_add:  Preallocated nodes insufficient.");
 	queue->bv_cnt++;
 }
 
 /*
  * Find the buffer for logical block lblkno in a bufobj.  The clean trie
  * is consulted first and the dirty trie second; NULL is returned when
  * neither holds the block.  The bufobj must be locked.
  */
 struct buf *
 gbincore(struct bufobj *bo, daddr_t lblkno)
 {
 	struct buf *found;
 
 	ASSERT_BO_LOCKED(bo);
 	found = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
 	if (found == NULL)
 		found = BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
 	return (found);
 }
 
 /*
  * Lockless counterpart of gbincore(): find the buffer for logical block
  * lblkno without holding the bufobj lock, checking the clean trie before
  * the dirty one.
  *
  * Safety of the lookup rests on SMR, and type stability of the result on
  * bufs living in a no-free zone.  As with any lockless lookup, the
  * buffer returned may already be invalid when the caller looks at it.
  */
 struct buf *
 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno)
 {
 	struct buf *found;
 
 	ASSERT_BO_UNLOCKED(bo);
 	found = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno);
 	if (found == NULL) {
 		found = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root,
 		    lblkno);
 	}
 	return (found);
 }
 
 /*
  * Associate a buffer with a vnode.
  *
  * The buffer must not belong to a vnode yet and the vnode's bufobj must
  * be write-locked.  A hold is taken on the vnode (released again by
  * brelvp()) and the buffer is placed on the bufobj's clean list.
  */
 void
 bgetvp(struct vnode *vp, struct buf *bp)
 {
 	struct bufobj *bo;
 
 	bo = &vp->v_bufobj;
 	ASSERT_BO_WLOCKED(bo);
 	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
 
 	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
 	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
 	    ("bgetvp: bp already attached! %p", bp));
 
 	vhold(vp);
 	bp->b_vp = vp;
 	bp->b_bufobj = bo;
 	/*
 	 * Insert onto list for new vnode.
 	 */
 	buf_vlist_add(bp, bo, BX_VNCLEAN);
 }
 
 /*
  * Disassociate a buffer from a vnode.
  *
  * Removes the buffer from its bufobj list, takes the bufobj off the
  * syncer worklist if that left it without dirty buffers, clears the
  * buffer's vnode and bufobj pointers and finally drops the hold on the
  * vnode.  The bufobj lock is taken and released internally.
  */
 void
 brelvp(struct buf *bp)
 {
 	struct bufobj *bo;
 	struct vnode *vp;
 
 	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 
 	/*
 	 * Delete from old vnode list, if on one.
 	 */
 	vp = bp->b_vp;		/* XXX */
 	bo = bp->b_bufobj;
 	BO_LOCK(bo);
 	buf_vlist_remove(bp);
 	/* Last dirty buffer gone: the syncer no longer needs this bufobj. */
 	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 		bo->bo_flag &= ~BO_ONWORKLST;
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(bo, bo_synclist);
 		syncer_worklist_len--;
 		mtx_unlock(&sync_mtx);
 	}
 	bp->b_vp = NULL;
 	bp->b_bufobj = NULL;
 	BO_UNLOCK(bo);
 	/* Only after unlocking: vp was saved above since b_vp is now NULL. */
 	vdrop(vp);
 }
 
 /*
  * Queue a bufobj on the syncer work list so that it is visited about
  * 'delay' slots from now.
  *
  * A bufobj that is already queued is moved to the new slot; otherwise it
  * is flagged BO_ONWORKLST and the work list length is bumped.  The delay
  * is capped at syncer_maxdelay - 2.  The bufobj must be write-locked;
  * sync_mtx is taken internally.
  */
 static void
 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
 {
 	int slot;
 
 	ASSERT_BO_WLOCKED(bo);
 
 	mtx_lock(&sync_mtx);
 	if ((bo->bo_flag & BO_ONWORKLST) == 0) {
 		bo->bo_flag |= BO_ONWORKLST;
 		syncer_worklist_len++;
 	} else {
 		/* Already queued: unlink before re-inserting below. */
 		LIST_REMOVE(bo, bo_synclist);
 	}
 
 	if (delay > syncer_maxdelay - 2)
 		delay = syncer_maxdelay - 2;
 	slot = (syncer_delayno + delay) & syncer_mask;
 	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
 	mtx_unlock(&sync_mtx);
 }
 
 /*
  * Handler for the read-only vfs.worklist_len sysctl: reports the syncer
  * work list length minus sync_vnode_count, sampled under sync_mtx.
  */
 static int
 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
 {
 	int pending;
 
 	mtx_lock(&sync_mtx);
 	pending = syncer_worklist_len - sync_vnode_count;
 	mtx_unlock(&sync_mtx);
 
 	return (SYSCTL_OUT(req, &pending, sizeof(pending)));
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len,
     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0,
     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
 
 /*
  * The syncer kernel process.  updateproc is the process pointer named in
  * the descriptor; the descriptor also carries the name "syncer" and the
  * sched_sync function, and SYSINIT passes it to kproc_start() at
  * SI_SUB_KTHREAD_UPDATE.
  */
 static struct proc *updateproc;
 static void sched_sync(void);
 static struct kproc_desc up_kp = {
 	"syncer",
 	sched_sync,
 	&updateproc
 };
 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
 
 /*
  * Lazily sync the vnode behind the first bufobj on work list slot slp.
  *
  * Called with sync_mtx held, as sched_sync() does; the mutex is dropped
  * while the vnode is being synced and held again on return.  *bo is set
  * to the bufobj that was examined, or NULL when the slot was empty.
  *
  * Returns 0 when the slot was empty or the vnode was processed, and 1
  * when the vnode is locked or its interlock could not be taken.  When a
  * write cannot be started, the result is whether *bo is still first on
  * the slot.  On a non-zero return sched_sync() moves *bo to the next slot.
  */
 static int
 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
 {
 	struct vnode *vp;
 	struct mount *mp;
 
 	*bo = LIST_FIRST(slp);
 	if (*bo == NULL)
 		return (0);
 	vp = bo2vnode(*bo);
 	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
 		return (1);
 	/*
 	 * We use vhold in case the vnode does not
 	 * successfully sync.  vhold prevents the vnode from
 	 * going away when we unlock the sync_mtx so that
 	 * we can acquire the vnode interlock.
 	 */
 	vholdl(vp);
 	mtx_unlock(&sync_mtx);
 	VI_UNLOCK(vp);
 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 		vdrop(vp);
 		mtx_lock(&sync_mtx);
 		return (*bo == LIST_FIRST(slp));
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/* The result is deliberately ignored; the bufobj is requeued below. */
 	(void) VOP_FSYNC(vp, MNT_LAZY, td);
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	BO_LOCK(*bo);
 	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
 		/*
 		 * Put us back on the worklist.  The worklist
 		 * routine will remove us from our current
 		 * position and then add us back in at a later
 		 * position.
 		 */
 		vn_syncer_add_to_worklist(*bo, syncdelay);
 	}
 	BO_UNLOCK(*bo);
 	vdrop(vp);
 	mtx_lock(&sync_mtx);
 	return (0);
 }
 
 /*
  * Non-zero until the "Syncing disks" banner has been printed during a
  * shutdown; syncer_resume() sets it back to 1.
  */
 static int first_printf = 1;
 
 /*
  * System filesystem synchronizer daemon.
  *
  * Main loop of the "syncer" kernel process.  Each iteration takes the
  * current work list slot, advances syncer_delayno, runs sync_vnode() on
  * the slot until it is empty and then either continues immediately (when
  * rushjob is set) or waits on sync_wakeup.  sync_mtx is held throughout
  * except where it is explicitly dropped.  The function never returns.
  */
 static void
 sched_sync(void)
 {
 	struct synclist *next, *slp;
 	struct bufobj *bo;
 	long starttime;
 	struct thread *td = curthread;
 	int last_work_seen;
 	int net_worklist_len;
 	int syncer_final_iter;
 	int error;
 
 	last_work_seen = 0;
 	syncer_final_iter = 0;
 	syncer_state = SYNCER_RUNNING;
 	starttime = time_uptime;
 	td->td_pflags |= TDP_NORUNNINGBUF;
 
 	/* Have syncer_shutdown() called, with our proc, before shutdown sync. */
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
 	    SHUTDOWN_PRI_LAST);
 
 	mtx_lock(&sync_mtx);
 	for (;;) {
 		/* Final delay has run out: allow the process to be suspended. */
 		if (syncer_state == SYNCER_FINAL_DELAY &&
 		    syncer_final_iter == 0) {
 			mtx_unlock(&sync_mtx);
 			kproc_suspend_check(td->td_proc);
 			mtx_lock(&sync_mtx);
 		}
 		net_worklist_len = syncer_worklist_len - sync_vnode_count;
 		/* While shutting down, report progress once per second. */
 		if (syncer_state != SYNCER_RUNNING &&
 		    starttime != time_uptime) {
 			if (first_printf) {
 				printf("\nSyncing disks, vnodes remaining... ");
 				first_printf = 0;
 			}
 			printf("%d ", net_worklist_len);
 		}
 		starttime = time_uptime;
 
 		/*
 		 * Push files whose dirty time has expired.  Be careful
 		 * of interrupt race on slp queue.
 		 *
 		 * Skip over empty worklist slots when shutting down.
 		 */
 		do {
 			slp = &syncer_workitem_pending[syncer_delayno];
 			syncer_delayno += 1;
 			if (syncer_delayno == syncer_maxdelay)
 				syncer_delayno = 0;
 			next = &syncer_workitem_pending[syncer_delayno];
 			/*
 			 * If the worklist has wrapped since it was
 			 * emptied of all but syncer vnodes,
 			 * switch to the FINAL_DELAY state and run
 			 * for one more second.
 			 */
 			if (syncer_state == SYNCER_SHUTTING_DOWN &&
 			    net_worklist_len == 0 &&
 			    last_work_seen == syncer_delayno) {
 				syncer_state = SYNCER_FINAL_DELAY;
 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
 			}
 		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
 		    syncer_worklist_len > 0);
 
 		/*
 		 * Keep track of the last time there was anything
 		 * on the worklist other than syncer vnodes.
 		 * Return to the SHUTTING_DOWN state if any
 		 * new work appears.
 		 */
 		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
 			last_work_seen = syncer_delayno;
 		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
 			syncer_state = SYNCER_SHUTTING_DOWN;
 		while (!LIST_EMPTY(slp)) {
 			error = sync_vnode(slp, &bo, td);
 			/* Could not sync it now: defer the bufobj to the next slot. */
 			if (error == 1) {
 				LIST_REMOVE(bo, bo_synclist);
 				LIST_INSERT_HEAD(next, bo, bo_synclist);
 				continue;
 			}
 
 			if (first_printf == 0) {
 				/*
 				 * Drop the sync mutex, because some watchdog
 				 * drivers need to sleep while patting
 				 */
 				mtx_unlock(&sync_mtx);
 				wdog_kern_pat(WD_LASTVAL);
 				mtx_lock(&sync_mtx);
 			}
 
 		}
 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
 			syncer_final_iter--;
 		/*
 		 * The variable rushjob allows the kernel to speed up the
 		 * processing of the filesystem syncer process. A rushjob
 		 * value of N tells the filesystem syncer to process the next
 		 * N seconds worth of work on its queue ASAP. Currently rushjob
 		 * is used by the soft update code to speed up the filesystem
 		 * syncer process when the incore state is getting so far
 		 * ahead of the disk that the kernel memory pool is being
 		 * threatened with exhaustion.
 		 */
 		if (rushjob > 0) {
 			rushjob -= 1;
 			continue;
 		}
 		/*
 		 * Just sleep for a short period of time between
 		 * iterations when shutting down to allow some I/O
 		 * to happen.
 		 *
 		 * If it has taken us less than a second to process the
 		 * current work, then wait. Otherwise start right over
 		 * again. We can still lose time if any single round
 		 * takes more than two seconds, but it does not really
 		 * matter as we are just trying to generally pace the
 		 * filesystem activity.
 		 */
 		if (syncer_state != SYNCER_RUNNING ||
 		    time_uptime == starttime) {
 			thread_lock(td);
 			sched_prio(td, PPAUSE);
 			thread_unlock(td);
 		}
 		if (syncer_state != SYNCER_RUNNING)
 			cv_timedwait(&sync_wakeup, &sync_mtx,
 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
 		else if (time_uptime == starttime)
 			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
 	}
 }
 
 /*
  * Request the syncer daemon to speed up its work.
  * A request is only accepted while rushjob is below half of syncdelay;
  * pushing the syncer harder than half of its normal turn time could let
  * it take over the cpu.  The syncer is woken up either way.
  *
  * Returns 1 if the request was accepted and 0 if it was refused.
  */
 int
 speedup_syncer(void)
 {
 	int accepted;
 
 	mtx_lock(&sync_mtx);
 	accepted = (rushjob < syncdelay / 2);
 	if (accepted) {
 		rushjob++;
 		stat_rush_requests++;
 	}
 	mtx_unlock(&sync_mtx);
 	cv_broadcast(&sync_wakeup);
 	return (accepted);
 }
 
 /*
  * Tell the syncer to speed up its work and run through its work
  * list several times, then tell it to shut down.
  *
  * Nothing is done when RB_NOSYNC is set in howto.  Otherwise the syncer
  * is moved to the SYNCER_SHUTTING_DOWN state with rushjob cleared, woken
  * up, and the request is passed on to kproc_shutdown().
  */
 static void
 syncer_shutdown(void *arg, int howto)
 {
 
 	if ((howto & RB_NOSYNC) == 0) {
 		mtx_lock(&sync_mtx);
 		rushjob = 0;
 		syncer_state = SYNCER_SHUTTING_DOWN;
 		mtx_unlock(&sync_mtx);
 		cv_broadcast(&sync_wakeup);
 		kproc_shutdown(arg, howto);
 	}
 }
 
 /*
  * Suspend the syncer by running the syncer shutdown path for updateproc
  * with no howto flags set.
  */
 void
 syncer_suspend(void)
 {
 
 	syncer_shutdown(updateproc, 0);
 }
 
 /*
  * Put the syncer back into the SYNCER_RUNNING state, wake it up and
  * resume updateproc.
  */
 void
 syncer_resume(void)
 {
 
 	mtx_lock(&sync_mtx);
 	/*
 	 * The syncer loop only pats the watchdog while first_printf is 0,
 	 * so setting it back to 1 turns that off again.
 	 */
 	first_printf = 1;
 	syncer_state = SYNCER_RUNNING;
 	mtx_unlock(&sync_mtx);
 	cv_broadcast(&sync_wakeup);
 	kproc_resume(updateproc);
 }
 
 /*
  * Move the buffer between the clean and dirty lists of its vnode.
  *
  * With the bufobj lock held, the buffer is taken off the list it is
  * currently on and re-inserted on the dirty list if B_DELWRI is set, or
  * on the clean list otherwise.  The bufobj is put on the syncer worklist
  * when a dirty buffer is added and it is not on the worklist yet, and is
  * taken off the worklist once no dirty buffers remain.
  *
  * Paging buffers (B_PAGING) must not be passed in.
  */
 void
 reassignbuf(struct buf *bp)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 	int delay;
 #ifdef INVARIANTS
 	struct bufv *bv;
 #endif
 
 	vp = bp->b_vp;
 	bo = bp->b_bufobj;
 
 	KASSERT((bp->b_flags & B_PAGING) == 0,
 	    ("%s: cannot reassign paging buffer %p", __func__, bp));
 
 	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 
 	BO_LOCK(bo);
 	buf_vlist_remove(bp);
 
 	/*
 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
 	 * of clean buffers.
 	 */
 	if (bp->b_flags & B_DELWRI) {
 		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
 			/*
 			 * Not on the syncer worklist yet: pick the sync delay
 			 * based on the vnode type.
 			 */
 			switch (vp->v_type) {
 			case VDIR:
 				delay = dirdelay;
 				break;
 			case VCHR:
 				delay = metadelay;
 				break;
 			default:
 				delay = filedelay;
 			}
 			vn_syncer_add_to_worklist(bo, delay);
 		}
 		buf_vlist_add(bp, bo, BX_VNDIRTY);
 	} else {
 		buf_vlist_add(bp, bo, BX_VNCLEAN);
 
 		/*
 		 * No dirty buffers are left, so the bufobj no longer needs
 		 * to be on the syncer worklist.
 		 */
 		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 			mtx_lock(&sync_mtx);
 			LIST_REMOVE(bo, bo_synclist);
 			syncer_worklist_len--;
 			mtx_unlock(&sync_mtx);
 			bo->bo_flag &= ~BO_ONWORKLST;
 		}
 	}
 #ifdef INVARIANTS
 	/*
 	 * Check that the first and the last buffer on both the clean and the
 	 * dirty list point back at this bufobj.  Note that bp is reused as a
 	 * scratch variable from here on.
 	 */
 	bv = &bo->bo_clean;
 	bp = TAILQ_FIRST(&bv->bv_hd);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	bv = &bo->bo_dirty;
 	bp = TAILQ_FIRST(&bv->bv_hd);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 #endif
 	BO_UNLOCK(bo);
 }
 
 /*
  * Initialize both the hold count and the use count of a vnode to 1.
  *
  * The vnode must not have been initialized yet (type VNON, no v_data and
  * no v_iflag bits set) and its interlock must not be held.
  */
 static void
 v_init_counters(struct vnode *vp)
 {
 
 	VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
 	    vp, ("%s called for an initialized vnode", __FUNCTION__));
 	ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
 
 	refcount_init(&vp->v_holdcnt, 1);
 	refcount_init(&vp->v_usecount, 1);
 }
 
 /*
  * Grab a particular vnode from the free list, increment its
  * reference count and lock it.  VIRF_DOOMED is set if the vnode
  * is being destroyed.  Only callers who specify LK_RETRY will
  * see doomed vnodes.  If inactive processing was delayed in
  * vput try to do it here.
  *
  * usecount is manipulated using atomics without holding any locks.
  *
  * holdcnt can be manipulated using atomics without holding any locks,
  * except when transitioning 1<->0, in which case the interlock is held.
  *
  * Consumers which don't guarantee liveness of the vnode can use SMR to
  * try to get a reference. Note this operation can fail since the vnode
  * may be awaiting getting freed by the time they get to it.
  */
 /*
  * Try to obtain a reference on a vnode from within a vfs SMR section.
  *
  * Returns VGET_USECOUNT if a use count was acquired, VGET_HOLDCNT if only
  * a hold count could be acquired and VGET_NONE if no reference could be
  * obtained at all.
  */
 enum vgetstate
 vget_prep_smr(struct vnode *vp)
 {
 
 	VFS_SMR_ASSERT_ENTERED();
 
 	if (refcount_acquire_if_not_zero(&vp->v_usecount))
 		return (VGET_USECOUNT);
 	if (vhold_smr(vp))
 		return (VGET_HOLDCNT);
 	return (VGET_NONE);
 }
 
 /*
  * Obtain a reference on a vnode: a use count if the vnode already has
  * one, a hold count otherwise.
  *
  * Returns VGET_USECOUNT or VGET_HOLDCNT to tell which one was taken.
  */
 enum vgetstate
 vget_prep(struct vnode *vp)
 {
 
 	if (refcount_acquire_if_not_zero(&vp->v_usecount))
 		return (VGET_USECOUNT);
 	vhold(vp);
 	return (VGET_HOLDCNT);
 }
 
 /*
  * Give back the reference described by vs: a use count is released with
  * vrele() and a hold count with vdrop().  No other state may be passed.
  */
 void
 vget_abort(struct vnode *vp, enum vgetstate vs)
 {
 
 	if (vs == VGET_USECOUNT)
 		vrele(vp);
 	else if (vs == VGET_HOLDCNT)
 		vdrop(vp);
 	else
 		__assert_unreachable();
 }
 
 /*
  * Obtain a reference on the vnode with vget_prep() and then lock it as
  * requested by flags with vget_finish().
  *
  * Returns whatever vget_finish() returns: 0 on success, an error
  * otherwise.
  */
 int
-vget(struct vnode *vp, int flags, struct thread *td)
+vget(struct vnode *vp, int flags)
 {
 	enum vgetstate vs;
 
-	MPASS(td == curthread);
-
 	vs = vget_prep(vp);
 	return (vget_finish(vp, flags, vs));
 }
 
 /*
  * Lock the vnode as requested by flags and turn the reference described
  * by vs (VGET_HOLDCNT or VGET_USECOUNT) into a use count.
  *
  * The vnode interlock must be held if and only if LK_INTERLOCK is passed
  * in flags.
  *
  * Returns 0 on success.  If locking fails the reference is given back
  * with vget_abort() and the error from vn_lock() is returned.
  */
 int
 vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
 {
 	int error;
 
 	if ((flags & LK_INTERLOCK) != 0)
 		ASSERT_VI_LOCKED(vp, __func__);
 	else
 		ASSERT_VI_UNLOCKED(vp, __func__);
 	VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
 	VNPASS(vp->v_holdcnt > 0, vp);
 	VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
 
 	error = vn_lock(vp, flags);
 	if (__predict_false(error != 0)) {
 		vget_abort(vp, vs);
 		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
 		    vp);
 		return (error);
 	}
 
 	vget_finish_ref(vp, vs);
 	return (0);
 }
 
 /*
  * Turn the reference described by vs into a use count without touching
  * the vnode lock.
  *
  * Nothing needs to be done for VGET_USECOUNT.  For VGET_HOLDCNT the use
  * count is bumped and, unless this was the 0->1 transition, the hold
  * count taken earlier is dropped again.
  */
 void
 vget_finish_ref(struct vnode *vp, enum vgetstate vs)
 {
 	int old;
 
 	VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
 	VNPASS(vp->v_holdcnt > 0, vp);
 	VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
 
 	if (vs == VGET_USECOUNT)
 		return;
 
 	/*
 	 * We hold the vnode. If the usecount is 0 it will be utilized to keep
 	 * the vnode around. Otherwise someone else lent their hold count and
 	 * we have to drop ours.
 	 */
 	old = atomic_fetchadd_int(&vp->v_usecount, 1);
 	VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old));
 	if (old != 0) {
 #ifdef INVARIANTS
 		/* Open-coded release so that the old value can be checked. */
 		old = atomic_fetchadd_int(&vp->v_holdcnt, -1);
 		VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old));
 #else
 		refcount_release(&vp->v_holdcnt);
 #endif
 	}
 }
 
 /*
  * Add a use count to the vnode.  The vnode lock is not touched.
  */
 void
 vref(struct vnode *vp)
 {
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	vget_finish_ref(vp, vget_prep(vp));
 }
 
 /*
  * Same as vref(), for callers which hold the vnode interlock.
  */
 void
 vrefl(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	vref(vp);
 }
 
 /*
  * Add a use count to a vnode which already has at least one.  With
  * INVARIANTS the previous count is checked to be greater than 0.
  */
 void
 vrefact(struct vnode *vp)
 {
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 #ifdef INVARIANTS
 	int old = atomic_fetchadd_int(&vp->v_usecount, 1);
 	VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old));
 #else
 	refcount_acquire(&vp->v_usecount);
 #endif
 }
 
 /*
  * Put a held vnode on the lazy list of its mount point, unless it is on
  * the list already or the vnode is doomed.
  */
 void
 vlazy(struct vnode *vp)
 {
 	struct mount *mp;
 
 	VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
 
 	/* Cheap check without the list lock; repeated below with it held. */
 	if ((vp->v_mflag & VMP_LAZYLIST) != 0)
 		return;
 	/*
 	 * We may get here for inactive routines after the vnode got doomed.
 	 */
 	if (VN_IS_DOOMED(vp))
 		return;
 	mp = vp->v_mount;
 	mtx_lock(&mp->mnt_listmtx);
 	if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
 		vp->v_mflag |= VMP_LAZYLIST;
 		TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist);
 		mp->mnt_lazyvnodelistsize++;
 	}
 	mtx_unlock(&mp->mnt_listmtx);
 }
 
 /*
  * Take the vnode off the lazy list of its mount point if it is on it.
  *
  * This routine is only meant to be called from vgonel prior to dooming
  * the vnode.  Both the vnode lock (exclusive) and the interlock must be
  * held.
  */
 static void
 vunlazy_gone(struct vnode *vp)
 {
 	struct mount *mp;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNPASS(!VN_IS_DOOMED(vp), vp);
 
 	if ((vp->v_mflag & VMP_LAZYLIST) == 0)
 		return;
 
 	mp = vp->v_mount;
 	mtx_lock(&mp->mnt_listmtx);
 	VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
 	vp->v_mflag &= ~VMP_LAZYLIST;
 	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
 	mp->mnt_lazyvnodelistsize--;
 	mtx_unlock(&mp->mnt_listmtx);
 }
 
 /*
  * Defer inactive processing of the vnode.
  *
  * Called with the interlock held and with a hold count owned by the
  * caller.  The interlock is released on return.  If processing does get
  * deferred, the vnode is put on the lazy list, marked VI_DEFINACT and the
  * hold count is left in place; in all other cases the hold count is
  * dropped.
  */
 static void
 vdefer_inactive(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNASSERT(vp->v_holdcnt > 0, vp,
 	    ("%s: vnode without hold count", __func__));
 	if (VN_IS_DOOMED(vp)) {
 		vdropl(vp);
 		return;
 	}
 	/* Already deferred; that deferral owns a hold count of its own. */
 	if (vp->v_iflag & VI_DEFINACT) {
 		VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
 		vdropl(vp);
 		return;
 	}
 	/* The vnode is in use again, inactive is no longer owed. */
 	if (vp->v_usecount > 0) {
 		vp->v_iflag &= ~VI_OWEINACT;
 		vdropl(vp);
 		return;
 	}
 	vlazy(vp);
 	vp->v_iflag |= VI_DEFINACT;
 	VI_UNLOCK(vp);
 	counter_u64_add(deferred_inact, 1);
 }
 
 /*
  * Variant of vdefer_inactive() for callers which do not hold the
  * interlock.  If inactive processing is not owed (VI_OWEINACT is clear)
  * the hold count is simply dropped.
  */
 static void
 vdefer_inactive_unlocked(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	if ((vp->v_iflag & VI_OWEINACT) != 0)
 		vdefer_inactive(vp);
 	else
 		vdropl(vp);
 }
 
 /*
  * Tells vput_final() which routine it is acting for: vrele() passes
  * VRELE, vput() passes VPUT and vunref() passes VUNREF.
  */
 enum vput_op { VRELE, VPUT, VUNREF };
 
 /*
  * Handle ->v_usecount transitioning to 0.
  *
  * By releasing the last usecount we take ownership of the hold count which
  * provides liveness of the vnode, meaning we have to vdrop.
  *
  * For all vnodes we may need to perform inactive processing. It requires an
  * exclusive lock on the vnode, while it is legal to call here with only a
  * shared lock (or no locks). If locking the vnode in an expected manner fails,
  * inactive processing gets deferred to the syncer.
  *
  * XXX Some filesystems pass in an exclusively locked vnode and strongly depend
  * on the lock being held all the way until VOP_INACTIVE. This in particular
  * happens with UFS which adds half-constructed vnodes to the hash, where they
  * can be found by other code.
  *
  * The func argument selects the locking contract of the calling routine
  * (vrele, vput or vunref).  The interlock is taken here and is released
  * again on all paths.
  */
 static void
 vput_final(struct vnode *vp, enum vput_op func)
 {
 	int error;
 	bool want_unlock;
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	VNPASS(vp->v_holdcnt > 0, vp);
 
 	VI_LOCK(vp);
 
 	/*
 	 * By the time we got here someone else might have transitioned
 	 * the count back to > 0.
 	 */
 	if (vp->v_usecount > 0)
 		goto out;
 
 	/*
 	 * If the vnode is doomed vgone already performed inactive processing
 	 * (if needed).
 	 */
 	if (VN_IS_DOOMED(vp))
 		goto out;
 
 	if (__predict_true(VOP_NEED_INACTIVE(vp) == 0))
 		goto out;
 
 	if (vp->v_iflag & VI_DOINGINACT)
 		goto out;
 
 	/*
 	 * Locking operations here will drop the interlock and possibly the
 	 * vnode lock, opening a window where the vnode can get doomed all the
 	 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to
 	 * perform inactive.
 	 */
 	vp->v_iflag |= VI_OWEINACT;
 	want_unlock = false;
 	error = 0;
 	switch (func) {
 	case VRELE:
 		/*
 		 * The vnode lock can be in any state here.  It is only
 		 * acquired (and later released) by us if the lock status is
 		 * LK_EXCLOTHER or 0.
 		 */
 		switch (VOP_ISLOCKED(vp)) {
 		case LK_EXCLUSIVE:
 			break;
 		case LK_EXCLOTHER:
 		case 0:
 			want_unlock = true;
 			error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
 			VI_LOCK(vp);
 			break;
 		default:
 			/*
 			 * The lock has at least one sharer, but we have no way
 			 * to conclude whether this is us. Play it safe and
 			 * defer processing.
 			 */
 			error = EAGAIN;
 			break;
 		}
 		break;
 	case VPUT:
 		/*
 		 * The lock is to be released once we are done.  If it is not
 		 * held exclusively, try to upgrade it without sleeping.
 		 */
 		want_unlock = true;
 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
 			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
 			    LK_NOWAIT);
 			VI_LOCK(vp);
 		}
 		break;
 	case VUNREF:
 		/*
 		 * The lock is left held (want_unlock stays false); only an
 		 * upgrade attempt is made if it is not exclusive.
 		 */
 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
 			error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
 			VI_LOCK(vp);
 		}
 		break;
 	}
 	if (error == 0) {
 		/* Run inactive processing now, then drop the hold count. */
 		vinactive(vp);
 		if (want_unlock)
 			VOP_UNLOCK(vp);
 		vdropl(vp);
 	} else {
 		vdefer_inactive(vp);
 	}
 	return;
 out:
 	/*
 	 * No inactive processing is needed.  Only the VPUT contract requires
 	 * the vnode lock to be released.
 	 */
 	if (func == VPUT)
 		VOP_UNLOCK(vp);
 	vdropl(vp);
 }
 
 /*
  * Decrement ->v_usecount for a vnode.
  *
  * Releasing the last use count requires additional processing, see vput_final
  * above for details.
  *
  * Comment above each variant denotes lock state on entry and exit.
  */
 
 /*
  * Release a use count.  The vnode lock may be in any state.
  *
  * in: any
  * out: same as passed in
  */
 void
 vrele(struct vnode *vp)
 {
 
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	if (refcount_release(&vp->v_usecount))
 		vput_final(vp, VRELE);
 }
 
 /*
  * Release a use count and unlock the vnode.
  *
  * in: locked
  * out: unlocked
  */
 void
 vput(struct vnode *vp)
 {
 
 	ASSERT_VOP_LOCKED(vp, __func__);
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	if (refcount_release(&vp->v_usecount)) {
 		/* Last use count is gone; vput_final deals with the lock. */
 		vput_final(vp, VPUT);
 		return;
 	}
 	VOP_UNLOCK(vp);
 }
 
 /*
  * Release a use count while keeping the vnode locked.
  *
  * in: locked
  * out: locked
  */
 void
 vunref(struct vnode *vp)
 {
 
 	ASSERT_VOP_LOCKED(vp, __func__);
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	if (refcount_release(&vp->v_usecount))
 		vput_final(vp, VUNREF);
 }
 
 /*
  * Acquire a hold count on the vnode.
  *
  * If this is the 0->1 transition, the per-CPU freevnodes counter is
  * decremented as well.
  */
 void
 vhold(struct vnode *vp)
 {
 	struct vdbatch *vd;
 	int old;
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
 	VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
 	    ("%s: wrong hold count %d", __func__, old));
 	if (old != 0)
 		return;
 	/* The per-CPU batch is only touched within a critical section. */
 	critical_enter();
 	vd = DPCPU_PTR(vd);
 	vd->freevnodes--;
 	critical_exit();
 }
 
 /*
  * Same as vhold(), for callers which hold the vnode interlock.
  */
 void
 vholdl(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	vhold(vp);
 }
 
 /*
  * Add a hold count to a vnode which is already held.  With INVARIANTS the
  * previous count is checked to be greater than 0 with no flags set.
  */
 void
 vholdnz(struct vnode *vp)
 {
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 #ifdef INVARIANTS
 	int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
 	VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
 	    ("%s: wrong hold count %d", __func__, old));
 #else
 	atomic_add_int(&vp->v_holdcnt, 1);
 #endif
 }
 
 /*
  * Grab a hold count unless the vnode is freed.
  *
  * Only use this routine if vfs smr is the only protection you have against
  * freeing the vnode.
  *
  * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag
  * is not set.  After the flag is set the vnode becomes immutable to anyone but
  * the thread which managed to set the flag.
  *
  * It may be tempting to replace the loop with:
  * count = atomic_fetchadd_int(&vp->v_holdcnt, 1);
  * if (count & VHOLD_NO_SMR) {
  *     backpedal and error out;
  * }
  *
  * However, while this is more performant, it hinders debugging by eliminating
  * the previously mentioned invariant.
  *
  * Returns true if a hold count was acquired and false if VHOLD_NO_SMR was
  * found to be set.
  */
 bool
 vhold_smr(struct vnode *vp)
 {
 	int count;
 
 	VFS_SMR_ASSERT_ENTERED();
 
 	count = atomic_load_int(&vp->v_holdcnt);
 	for (;;) {
 		if (count & VHOLD_NO_SMR) {
 			VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
 			    ("non-zero hold count with flags %d\n", count));
 			return (false);
 		}
 
 		VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
 		/*
 		 * On failure fcmpset stores the value it found in count, so
 		 * the flag check above is repeated against fresh data.
 		 */
 		if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1))
 			return (true);
 	}
 }
 
 /*
  * Process a full per-CPU batch of vnodes.
  *
  * The batch's freevnodes delta is added to the global freevnodes counter,
  * every vnode in the batch is moved to the tail of vnode_list and the
  * batch is reset to empty.
  *
  * The caller must own the batch lock, be pinned to the CPU and the batch
  * must hold exactly VDBATCH_SIZE vnodes.
  */
 static void __noinline
 vdbatch_process(struct vdbatch *vd)
 {
 	struct vnode *vp;
 	int i;
 
 	mtx_assert(&vd->lock, MA_OWNED);
 	MPASS(curthread->td_pinned > 0);
 	MPASS(vd->index == VDBATCH_SIZE);
 
 	mtx_lock(&vnode_list_mtx);
 	critical_enter();
 	freevnodes += vd->freevnodes;
 	for (i = 0; i < VDBATCH_SIZE; i++) {
 		vp = vd->tab[i];
 		/* Requeue the vnode at the tail of the global list. */
 		TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
 		TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
 		MPASS(vp->v_dbatchcpu != NOCPU);
 		vp->v_dbatchcpu = NOCPU;
 	}
 	mtx_unlock(&vnode_list_mtx);
 	vd->freevnodes = 0;
 	bzero(vd->tab, sizeof(vd->tab));
 	vd->index = 0;
 	critical_exit();
 }
 
 /*
  * Account for a vnode which just lost its last hold count and queue it
  * for a deferred requeue on vnode_list.
  *
  * The per-CPU freevnodes counter is always incremented.  If the vnode is
  * not in any batch yet, it is added to the batch of the current CPU,
  * which is processed right away once it becomes full.
  *
  * Called with the interlock held; the interlock is released here.
  */
 static void
 vdbatch_enqueue(struct vnode *vp)
 {
 	struct vdbatch *vd;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNASSERT(!VN_IS_DOOMED(vp), vp,
 	    ("%s: deferring requeue of a doomed vnode", __func__));
 
 	critical_enter();
 	vd = DPCPU_PTR(vd);
 	vd->freevnodes++;
 	/* Already sitting in some batch, nothing more to do. */
 	if (vp->v_dbatchcpu != NOCPU) {
 		VI_UNLOCK(vp);
 		critical_exit();
 		return;
 	}
 
 	/* Pin before leaving the critical section so that vd stays ours. */
 	sched_pin();
 	critical_exit();
 	mtx_lock(&vd->lock);
 	MPASS(vd->index < VDBATCH_SIZE);
 	MPASS(vd->tab[vd->index] == NULL);
 	/*
 	 * A hack: we depend on being pinned so that we know what to put in
 	 * ->v_dbatchcpu.
 	 */
 	vp->v_dbatchcpu = curcpu;
 	vd->tab[vd->index] = vp;
 	vd->index++;
 	VI_UNLOCK(vp);
 	if (vd->index == VDBATCH_SIZE)
 		vdbatch_process(vd);
 	mtx_unlock(&vd->lock);
 	sched_unpin();
 }
 
 /*
  * This routine must only be called for vnodes which are about to be
  * deallocated. Supporting dequeue for arbitrary vnodes would require
  * validating that the locked batch matches.
  *
  * If the vnode is recorded as sitting in a per-CPU batch, it is removed
  * from that batch and ->v_dbatchcpu is reset to NOCPU.
  */
 static void
 vdbatch_dequeue(struct vnode *vp)
 {
 	struct vdbatch *vd;
 	int i;
 	short cpu;
 
 	VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp,
 	    ("%s: called for a used vnode\n", __func__));
 
 	cpu = vp->v_dbatchcpu;
 	if (cpu == NOCPU)
 		return;
 
 	vd = DPCPU_ID_PTR(cpu, vd);
 	mtx_lock(&vd->lock);
 	for (i = 0; i < vd->index; i++) {
 		if (vd->tab[i] != vp)
 			continue;
 		vp->v_dbatchcpu = NOCPU;
 		/* Fill the hole with the last entry to keep the table dense. */
 		vd->index--;
 		vd->tab[i] = vd->tab[vd->index];
 		vd->tab[vd->index] = NULL;
 		break;
 	}
 	mtx_unlock(&vd->lock);
 	/*
 	 * Either we dequeued the vnode above or the target CPU beat us to it.
 	 */
 	MPASS(vp->v_dbatchcpu == NOCPU);
 }
 
 /*
  * Drop the hold count of the vnode.  If this is the last reference to
  * the vnode we place it on the free list unless it has been vgone'd
  * (marked VIRF_DOOMED) in which case we will free it.
  *
  * Because the vnode vm object keeps a hold reference on the vnode if
  * there is at least one resident non-cached page, the vnode cannot
  * leave the active list without the page cleanup done.
  *
  * This routine handles the non-doomed case once the hold count was
  * released: the vnode is taken off the lazy list if applicable and
  * handed to vdbatch_enqueue(), which also releases the interlock.
  */
 static void
 vdrop_deactivate(struct vnode *vp)
 {
 	struct mount *mp;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	/*
 	 * Mark a vnode as free: remove it from its active list
 	 * and put it up for recycling on the freelist.
 	 */
 	VNASSERT(!VN_IS_DOOMED(vp), vp,
 	    ("vdrop: returning doomed vnode"));
 	VNASSERT(vp->v_op != NULL, vp,
 	    ("vdrop: vnode already reclaimed."));
 	VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
 	    ("vnode with VI_OWEINACT set"));
 	VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp,
 	    ("vnode with VI_DEFINACT set"));
 	if (vp->v_mflag & VMP_LAZYLIST) {
 		mp = vp->v_mount;
 		mtx_lock(&mp->mnt_listmtx);
 		VNASSERT(vp->v_mflag & VMP_LAZYLIST, vp, ("lost VMP_LAZYLIST"));
 		/*
 		 * Don't remove the vnode from the lazy list if another thread
 		 * has increased the hold count. It may have re-enqueued the
 		 * vnode to the lazy list and is now responsible for its
 		 * removal.
 		 */
 		if (vp->v_holdcnt == 0) {
 			vp->v_mflag &= ~VMP_LAZYLIST;
 			TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
 			mp->mnt_lazyvnodelistsize--;
 		}
 		mtx_unlock(&mp->mnt_listmtx);
 	}
 	vdbatch_enqueue(vp);
 }
 
 /*
  * Drop a hold count.  The interlock must not be held by the caller.
  *
  * The interlock is only taken (and the work passed to vdropl()) if this
  * may be the last hold count.
  */
 void
 vdrop(struct vnode *vp)
 {
 
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	if (refcount_release_if_not_last(&vp->v_holdcnt))
 		return;
 	VI_LOCK(vp);
 	vdropl(vp);
 }
 
 /*
  * Drop a hold count with the interlock held.  The interlock is released
  * on return.
  *
  * When the last hold count goes away, a vnode which is not doomed is
  * passed to vdrop_deactivate(), while a doomed one is freed provided
  * that VHOLD_NO_SMR can be set.
  */
 void
 vdropl(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	if (!refcount_release(&vp->v_holdcnt)) {
 		VI_UNLOCK(vp);
 		return;
 	}
 	if (!VN_IS_DOOMED(vp)) {
 		vdrop_deactivate(vp);
 		/*
 		 * Also unlocks the interlock. We can't assert on it as we
 		 * released our hold and by now the vnode might have been
 		 * freed.
 		 */
 		return;
 	}
 	/*
 	 * Set the VHOLD_NO_SMR flag.
 	 *
 	 * We may be racing against vhold_smr. If they win we can just pretend
 	 * we never got this far, they will vdrop later.
 	 */
 	if (!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR)) {
 		VI_UNLOCK(vp);
 		/*
 		 * We lost the aforementioned race. Any subsequent access is
 		 * invalid as they might have managed to vdropl on their own.
 		 */
 		return;
 	}
 	freevnode(vp);
 }
 
 /*
  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
  *
  * The vnode must be exclusively locked and the interlock held.  The
  * interlock is dropped around the actual work and is held again on
  * return.
  */
 static void
 vinactivef(struct vnode *vp)
 {
 	struct vm_object *obj;
 
 	ASSERT_VOP_ELOCKED(vp, "vinactive");
 	ASSERT_VI_LOCKED(vp, "vinactive");
 	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
 	    ("vinactive: recursed on VI_DOINGINACT"));
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	vp->v_iflag |= VI_DOINGINACT;
 	vp->v_iflag &= ~VI_OWEINACT;
 	VI_UNLOCK(vp);
 	/*
 	 * Before moving off the active list, we must be sure that any
 	 * modified pages are converted into the vnode's dirty
 	 * buffers, since these will no longer be checked once the
 	 * vnode is on the inactive list.
 	 *
 	 * The write-out of the dirty pages is asynchronous.  At the
 	 * point that VOP_INACTIVE() is called, there could still be
 	 * pending I/O and dirty pages in the object.
 	 */
 	if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
 	    vm_object_mightbedirty(obj)) {
 		VM_OBJECT_WLOCK(obj);
 		vm_object_page_clean(obj, 0, 0, 0);
 		VM_OBJECT_WUNLOCK(obj);
 	}
 	VOP_INACTIVE(vp, curthread);
 	VI_LOCK(vp);
 	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
 	    ("vinactive: lost VI_DOINGINACT"));
 	vp->v_iflag &= ~VI_DOINGINACT;
 }
 
 /*
  * Perform inactive processing on the vnode if it is owed.
  *
  * Nothing is done if VI_OWEINACT is not set or if inactive processing is
  * already in progress (VI_DOINGINACT).  If the vnode got a use count in
  * the meantime, VI_OWEINACT is just cleared.
  *
  * The vnode must be exclusively locked and the interlock held.
  */
 void
 vinactive(struct vnode *vp)
 {
 
 	ASSERT_VOP_ELOCKED(vp, "vinactive");
 	ASSERT_VI_LOCKED(vp, "vinactive");
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 
 	if ((vp->v_iflag & VI_OWEINACT) == 0)
 		return;
 	if (vp->v_iflag & VI_DOINGINACT)
 		return;
 	if (vp->v_usecount > 0) {
 		vp->v_iflag &= ~VI_OWEINACT;
 		return;
 	}
 	vinactivef(vp);
 }
 
 /*
  * Remove any vnodes in the vnode table belonging to mount point mp.
  *
  * If FORCECLOSE is not specified, there should not be any active ones,
  * return error if any are found (nb: this is a user error, not a
  * system error). If FORCECLOSE is specified, detach any active vnodes
  * that are found.
  *
  * If WRITECLOSE is set, only flush out regular file vnodes open for
  * writing.
  *
  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
  *
  * `rootrefs' specifies the base reference count for the root vnode
  * of this filesystem. The root vnode is considered busy if its
  * v_usecount exceeds this value. On a successful return, vflush()
  * will call vrele() on the root vnode exactly rootrefs times.
  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
  * be zero.
  */
 #ifdef DIAGNOSTIC
 /*
  * Read-write debug sysctl; when non-zero, vflush() prints every vnode it
  * finds busy.
  */
 static int busyprt = 0;		/* print out busy vnodes */
 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
 #endif
 
 /*
  * Returns 0 on success, EBUSY if busy vnodes remain, or the error
  * reported by VFS_ROOT() or VOP_FSYNC().
  */
 int
 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
 {
 	struct vnode *vp, *mvp, *rootvp = NULL;
 	struct vattr vattr;
 	int busy = 0, error;
 
 	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
 	    rootrefs, flags);
 	if (rootrefs > 0) {
 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
 		    ("vflush: bad args"));
 		/*
 		 * Get the filesystem root vnode. We can vput() it
 		 * immediately, since with rootrefs > 0, it won't go away.
 		 */
 		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
 			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
 			    __func__, error);
 			return (error);
 		}
 		vput(rootvp);
 	}
 loop:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		/*
 		 * Hold the vnode across the lock acquisition.  If locking
 		 * fails, the scan is restarted from the beginning.
 		 */
 		vholdl(vp);
 		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
 		if (error) {
 			vdrop(vp);
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			goto loop;
 		}
 		/*
 		 * Skip over vnodes marked VV_SYSTEM.
 		 */
 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
 			VOP_UNLOCK(vp);
 			vdrop(vp);
 			continue;
 		}
 		/*
 		 * If WRITECLOSE is set, flush out unlinked but still open
 		 * files (even if open only for reading) and regular file
 		 * vnodes open for writing.
 		 */
 		if (flags & WRITECLOSE) {
 			if (vp->v_object != NULL) {
 				VM_OBJECT_WLOCK(vp->v_object);
 				vm_object_page_clean(vp->v_object, 0, 0, 0);
 				VM_OBJECT_WUNLOCK(vp->v_object);
 			}
 			error = VOP_FSYNC(vp, MNT_WAIT, td);
 			if (error != 0) {
 				VOP_UNLOCK(vp);
 				vdrop(vp);
 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 				return (error);
 			}
 			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
 			VI_LOCK(vp);
 
 			/*
 			 * Leave the vnode alone if it is of type VNON or still
 			 * has links, and is not a regular file open for
 			 * writing.
 			 */
 			if ((vp->v_type == VNON ||
 			    (error == 0 && vattr.va_nlink > 0)) &&
 			    (vp->v_writecount <= 0 || vp->v_type != VREG)) {
 				VOP_UNLOCK(vp);
 				vdropl(vp);
 				continue;
 			}
 		} else
 			VI_LOCK(vp);
 		/*
 		 * With v_usecount == 0, all we need to do is clear out the
 		 * vnode data structures and we are done.
 		 *
 		 * If FORCECLOSE is set, forcibly close the vnode.
 		 */
 		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
 			vgonel(vp);
 		} else {
 			busy++;
 #ifdef DIAGNOSTIC
 			if (busyprt)
 				vn_printf(vp, "vflush: busy vnode ");
 #endif
 		}
 		VOP_UNLOCK(vp);
 		vdropl(vp);
 	}
 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
 		/*
 		 * If just the root vnode is busy, and if its refcount
 		 * is equal to `rootrefs', then go ahead and kill it.
 		 */
 		VI_LOCK(rootvp);
 		KASSERT(busy > 0, ("vflush: not busy"));
 		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
 		    ("vflush: usecount %d < rootrefs %d",
 		     rootvp->v_usecount, rootrefs));
 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
 			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
 			vgone(rootvp);
 			VOP_UNLOCK(rootvp);
 			busy = 0;
 		} else
 			VI_UNLOCK(rootvp);
 	}
 	if (busy) {
 		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
 		    busy);
 		return (EBUSY);
 	}
 	/* Success: release the caller's rootrefs references on the root. */
 	for (; rootrefs > 0; rootrefs--)
 		vrele(rootvp);
 	return (0);
 }
 
 /*
  * Recycle an unused vnode to the front of the free list.
  *
  * Takes the interlock around a call to vrecyclel() and returns its
  * result: 1 if the vnode was recycled, 0 if it was not.
  */
 int
 vrecycle(struct vnode *vp)
 {
 	int recycled;
 
 	VI_LOCK(vp);
 	recycled = vrecyclel(vp);
 	VI_UNLOCK(vp);
 	return (recycled);
 }
 
 /*
  * vrecycle, with the vp interlock held.
  *
  * The vnode is only recycled (with vgonel) when its use count is 0.
  * Returns 1 if that happened and 0 otherwise.  The vnode must be
  * exclusively locked.
  */
 int
 vrecyclel(struct vnode *vp)
 {
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	ASSERT_VI_LOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	if (vp->v_usecount != 0)
 		return (0);
 	vgonel(vp);
 	return (1);
 }
 
 /*
  * Eliminate all activity associated with a vnode
  * in preparation for reuse.
  *
  * Wrapper around vgonel() which takes and releases the interlock.
  */
 void
 vgone(struct vnode *vp)
 {
 	VI_LOCK(vp);
 	vgonel(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * No-op lowervp notification handler.  Used as both the reclaim and the
  * unlink callback of the marker mount in vfs_notify_upper().
  */
 static void
 notify_lowervp_vfs_dummy(struct mount *mp __unused,
     struct vnode *lowervp __unused)
 {
 }
 
 /*
  * Notify upper mounts about reclaimed or unlinked vnode.
  *
  * Every mount on the mnt_uppers list of the vnode's mount point gets
  * VFS_RECLAIM_LOWERVP() or VFS_UNLINK_LOWERVP() called on it, depending
  * on event (VFS_NOTIFY_UPPER_RECLAIM or VFS_NOTIFY_UPPER_UNLINK).
  *
  * The mount interlock is dropped around each callback.  To keep the
  * position in the list, a temporary marker mount (MNTK_MARKER) is
  * inserted after the current entry and iteration continues from the
  * marker afterwards.
  */
 void
 vfs_notify_upper(struct vnode *vp, int event)
 {
 	static struct vfsops vgonel_vfsops = {
 		.vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
 		.vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
 	};
 	struct mount *mp, *ump, *mmp;
 
 	mp = vp->v_mount;
 	if (mp == NULL)
 		return;
 	if (TAILQ_EMPTY(&mp->mnt_uppers))
 		return;
 
 	mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
 	mmp->mnt_op = &vgonel_vfsops;
 	mmp->mnt_kern_flag |= MNTK_MARKER;
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
 	for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
 		/* Skip over marker entries found on the list. */
 		if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
 			ump = TAILQ_NEXT(ump, mnt_upper_link);
 			continue;
 		}
 		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
 		MNT_IUNLOCK(mp);
 		switch (event) {
 		case VFS_NOTIFY_UPPER_RECLAIM:
 			VFS_RECLAIM_LOWERVP(ump, vp);
 			break;
 		case VFS_NOTIFY_UPPER_UNLINK:
 			VFS_UNLINK_LOWERVP(ump, vp);
 			break;
 		default:
 			KASSERT(0, ("invalid event %d", event));
 			break;
 		}
 		MNT_ILOCK(mp);
 		/* Resume from our marker and take it off the list again. */
 		ump = TAILQ_NEXT(mmp, mnt_upper_link);
 		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
 	}
 	free(mmp, M_TEMP);
 	mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
 	/* Wake up whoever sleeps on mnt_uppers waiting for us to finish. */
 	if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
 		mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
 		wakeup(&mp->mnt_uppers);
 	}
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * vgone, with the vp interlock held.
  *
  * Dooms the vnode: it is marked VIRF_DOOMED, closed and deactivated as
  * needed, its buffers and VM object are disposed of, the filesystem is
  * asked to reclaim it and it is finally switched over to dead_vnodeops
  * with type VBAD.  Nothing is done if the vnode is doomed already.
  *
  * The vnode must be exclusively locked and held.  The interlock is
  * dropped while the work is being done and is held again on return.
  */
 static void
 vgonel(struct vnode *vp)
 {
 	struct thread *td;
 	struct mount *mp;
 	vm_object_t object;
 	bool active, oweinact;
 
 	ASSERT_VOP_ELOCKED(vp, "vgonel");
 	ASSERT_VI_LOCKED(vp, "vgonel");
 	VNASSERT(vp->v_holdcnt, vp,
 	    ("vgonel: vp %p has no reference.", vp));
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	td = curthread;
 
 	/*
 	 * Don't vgonel if we're already doomed.
 	 */
 	if (vp->v_irflag & VIRF_DOOMED)
 		return;
 	/*
 	 * Paired with freevnode.
 	 */
 	vn_seqc_write_begin_locked(vp);
 	vunlazy_gone(vp);
 	vp->v_irflag |= VIRF_DOOMED;
 
 	/*
 	 * Check to see if the vnode is in use.  If so, we have to call
 	 * VOP_CLOSE() and VOP_INACTIVE().
 	 */
 	active = vp->v_usecount > 0;
 	oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
 	/*
 	 * If we need to do inactive VI_OWEINACT will be set.
 	 */
 	if (vp->v_iflag & VI_DEFINACT) {
 		/*
 		 * Deferred inactive processing is cancelled; drop the hold
 		 * count which was kept for it.  This also releases the
 		 * interlock.
 		 */
 		VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
 		vp->v_iflag &= ~VI_DEFINACT;
 		vdropl(vp);
 	} else {
 		VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count"));
 		VI_UNLOCK(vp);
 	}
 	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
 
 	/*
 	 * If purging an active vnode, it must be closed and
 	 * deactivated before being reclaimed.
 	 */
 	if (active)
 		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
 	if (oweinact || active) {
 		VI_LOCK(vp);
 		vinactivef(vp);
 		VI_UNLOCK(vp);
 	}
 	if (vp->v_type == VSOCK)
 		vfs_unp_reclaim(vp);
 
 	/*
 	 * Clean out any buffers associated with the vnode.
 	 * If the flush fails, just toss the buffers.
 	 */
 	mp = NULL;
 	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
 		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
 	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
 		while (vinvalbuf(vp, 0, 0, 0) != 0)
 			;
 	}
 
 	BO_LOCK(&vp->v_bufobj);
 	KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
 	    vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
 	    TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
 	    vp->v_bufobj.bo_clean.bv_cnt == 0,
 	    ("vp %p bufobj not invalidated", vp));
 
 	/*
 	 * For VMIO bufobj, BO_DEAD is set later, or in
 	 * vm_object_terminate() after the object's page queue is
 	 * flushed.
 	 */
 	object = vp->v_bufobj.bo_object;
 	if (object == NULL)
 		vp->v_bufobj.bo_flag |= BO_DEAD;
 	BO_UNLOCK(&vp->v_bufobj);
 
 	/*
 	 * Handle the VM part.  Tmpfs handles v_object on its own (the
 	 * OBJT_VNODE check).  Nullfs or other bypassing filesystems
 	 * should not touch the object borrowed from the lower vnode
 	 * (the handle check).
 	 */
 	if (object != NULL && object->type == OBJT_VNODE &&
 	    object->handle == vp)
 		vnode_destroy_vobject(vp);
 
 	/*
 	 * Reclaim the vnode.
 	 */
 	if (VOP_RECLAIM(vp, td))
 		panic("vgone: cannot reclaim");
 	/* mp is only set if a secondary write was started above. */
 	if (mp != NULL)
 		vn_finished_secondary_write(mp);
 	VNASSERT(vp->v_object == NULL, vp,
 	    ("vop_reclaim left v_object vp=%p", vp));
 	/*
 	 * Clear the advisory locks and wake up waiting threads.
 	 */
 	(void)VOP_ADVLOCKPURGE(vp);
 	vp->v_lockf = NULL;
 	/*
 	 * Delete from old mount point vnode list.
 	 */
 	delmntque(vp);
 	cache_purge_vgone(vp);
 	/*
 	 * Done with purge, reset to the standard lock and invalidate
 	 * the vnode.
 	 */
 	VI_LOCK(vp);
 	vp->v_vnlock = &vp->v_lock;
 	vp->v_op = &dead_vnodeops;
 	vp->v_type = VBAD;
 }
 
 /*
  * Print out a description of a vnode.
  */
 /* Printable vnode type names; vn_printf indexes this table by v_type. */
 static const char * const typename[] =
 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
  "VMARKER"};
 
 /*
  * Break the build if a hold count flag other than VHOLD_NO_SMR shows up
  * in VHOLD_ALL_FLAGS, as a reminder to add it to vn_printf.
  */
 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0,
     "new hold count flag not added to vn_printf");
 
 /*
  * Print a description of a vnode.
  *
  * The caller's printf-style prefix (fmt and its arguments) is printed first,
  * followed by the vnode address, its type, its counters, a type-specific
  * pointer, the decoded hold count flags and the decoded v_irflag, v_vflag,
  * v_iflag and v_mflag bits.  Bits that have no name here are printed in hex
  * as "|VIRF(0x..)", "|VV(0x..)", "|VI(0x..)" and "|VMP(0x..)".  Finally the
  * backing VM object (if any), the vnode lock state and the filesystem's
  * VOP_PRINT() output (if v_data is set) are shown.
  *
  * Each flag string is built with a leading '|' per flag and printed from
  * buf + 1 to skip the first separator; buf[1] is cleared beforehand so an
  * empty flag set prints as an empty string.
  */
 void
 vn_printf(struct vnode *vp, const char *fmt, ...)
 {
 	va_list ap;
 	char buf[256], buf2[16];
 	u_long flags;
 	u_int holdcnt;
 
 	va_start(ap, fmt);
 	vprintf(fmt, ap);
 	va_end(ap);
 	printf("%p: ", (void *)vp);
 	printf("type %s\n", typename[vp->v_type]);
 	holdcnt = atomic_load_int(&vp->v_holdcnt);
 	printf("    usecount %d, writecount %d, refcount %d seqc users %d",
 	    vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS,
 	    vp->v_seqc_users);
 	switch (vp->v_type) {
 	case VDIR:
 		printf(" mountedhere %p\n", vp->v_mountedhere);
 		break;
 	case VCHR:
 		printf(" rdev %p\n", vp->v_rdev);
 		break;
 	case VSOCK:
 		printf(" socket %p\n", vp->v_unpcb);
 		break;
 	case VFIFO:
 		printf(" fifoinfo %p\n", vp->v_fifoinfo);
 		break;
 	default:
 		printf("\n");
 		break;
 	}
 	buf[0] = '\0';
 	buf[1] = '\0';
 	if (holdcnt & VHOLD_NO_SMR)
 		strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf));
 	printf("    hold count flags (%s)\n", buf + 1);
 
 	buf[0] = '\0';
 	buf[1] = '\0';
 	if (vp->v_irflag & VIRF_DOOMED)
 		strlcat(buf, "|VIRF_DOOMED", sizeof(buf));
 	flags = vp->v_irflag & ~(VIRF_DOOMED);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
 	if (vp->v_vflag & VV_ROOT)
 		strlcat(buf, "|VV_ROOT", sizeof(buf));
 	if (vp->v_vflag & VV_ISTTY)
 		strlcat(buf, "|VV_ISTTY", sizeof(buf));
 	if (vp->v_vflag & VV_NOSYNC)
 		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
 	if (vp->v_vflag & VV_ETERNALDEV)
 		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
 	if (vp->v_vflag & VV_CACHEDLABEL)
 		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
 	if (vp->v_vflag & VV_VMSIZEVNLOCK)
 		strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf));
 	if (vp->v_vflag & VV_COPYONWRITE)
 		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
 	if (vp->v_vflag & VV_SYSTEM)
 		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
 	if (vp->v_vflag & VV_PROCDEP)
 		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
 	if (vp->v_vflag & VV_NOKNOTE)
 		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
 	if (vp->v_vflag & VV_DELETED)
 		strlcat(buf, "|VV_DELETED", sizeof(buf));
 	if (vp->v_vflag & VV_MD)
 		strlcat(buf, "|VV_MD", sizeof(buf));
 	if (vp->v_vflag & VV_FORCEINSMQ)
 		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
 	if (vp->v_vflag & VV_READLINK)
 		strlcat(buf, "|VV_READLINK", sizeof(buf));
 	/*
 	 * Mask off every flag decoded by name above, so that only bits
 	 * unknown to this function end up in the hex remainder.  The mask
 	 * must list exactly the flags tested above.
 	 */
 	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
 	    VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM |
 	    VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ |
 	    VV_READLINK);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
 	if (vp->v_iflag & VI_TEXT_REF)
 		strlcat(buf, "|VI_TEXT_REF", sizeof(buf));
 	if (vp->v_iflag & VI_MOUNT)
 		strlcat(buf, "|VI_MOUNT", sizeof(buf));
 	if (vp->v_iflag & VI_DOINGINACT)
 		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
 	if (vp->v_iflag & VI_OWEINACT)
 		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
 	if (vp->v_iflag & VI_DEFINACT)
 		strlcat(buf, "|VI_DEFINACT", sizeof(buf));
 	flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_DOINGINACT |
 	    VI_OWEINACT | VI_DEFINACT);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
 	if (vp->v_mflag & VMP_LAZYLIST)
 		strlcat(buf, "|VMP_LAZYLIST", sizeof(buf));
 	flags = vp->v_mflag & ~(VMP_LAZYLIST);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
 	printf("    flags (%s)\n", buf + 1);
 	if (mtx_owned(VI_MTX(vp)))
 		printf(" VI_LOCKed");
 	if (vp->v_object != NULL)
 		printf("    v_object %p ref %d pages %d "
 		    "cleanbuf %d dirtybuf %d\n",
 		    vp->v_object, vp->v_object->ref_count,
 		    vp->v_object->resident_page_count,
 		    vp->v_bufobj.bo_clean.bv_cnt,
 		    vp->v_bufobj.bo_dirty.bv_cnt);
 	printf("    ");
 	lockmgr_printinfo(vp->v_vnlock);
 	if (vp->v_data != NULL)
 		VOP_PRINT(vp);
 }
 
 #ifdef DDB
 /*
  * DDB "show lockedvnods": walk every mount point on mountlist and print
  * each vnode for which VOP_ISLOCKED() reports a lock.
  */
 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
 {
 	struct vnode *vp;
 	struct mount *mp;
 
 	/*
 	 * We run from the debugger and therefore cannot honour the normal
 	 * locking rules for these lists.  They may be observed in an
 	 * inconsistent state and a bad pointer may be followed; there is
 	 * nothing to be done about that.
 	 */
 	db_printf("Locked vnodes\n");
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 			/* Skip VMARKER entries. */
 			if (vp->v_type == VMARKER)
 				continue;
 			if (!VOP_ISLOCKED(vp))
 				continue;
 			vn_printf(vp, "vnode ");
 		}
 	}
 }
 
 /*
  * DDB "show vnode <addr>": print the vnode found at the given address.
  * Nothing is printed when no address was supplied.
  */
 DB_SHOW_COMMAND(vnode, db_show_vnode)
 {
 
 	if (have_addr)
 		vn_printf((struct vnode *)addr, "vnode ");
 }
 
 /*
  * DDB "show mount [addr]": show details about the given mount point.
  *
  * Without an address, a one-line summary of every entry on mountlist is
  * printed.  With an address, the structure at that address is interpreted
  * as a struct mount and its decoded flags, options, statistics, credential,
  * counters and vnode lists are printed.
  */
 DB_SHOW_COMMAND(mount, db_show_mount)
 {
 	struct mount *mp;
 	struct vfsopt *opt;
 	struct statfs *sp;
 	struct vnode *vp;
 	char buf[512];
 	uint64_t mflags;
 	u_int flags;
 
 	if (!have_addr) {
 		/* No address given, print short info about all mount points. */
 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 			db_printf("%p %s on %s (%s)\n", mp,
 			    mp->mnt_stat.f_mntfromname,
 			    mp->mnt_stat.f_mntonname,
 			    mp->mnt_stat.f_fstypename);
 			if (db_pager_quit)
 				break;
 		}
 		db_printf("\nMore info: show mount <addr>\n");
 		return;
 	}
 
 	/* Interpret the supplied address as a struct mount pointer. */
 	mp = (struct mount *)addr;
 	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
 	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
 
 	buf[0] = '\0';
 	mflags = mp->mnt_flag;
 	/*
 	 * For each flag set in mflags, append its name to buf (skipping the
 	 * first four characters, i.e. the "MNT_" prefix) and clear the bit,
 	 * so that only bits without a name remain in mflags afterwards.
 	 */
 #define	MNT_FLAG(flag)	do {						\
 	if (mflags & (flag)) {						\
 		if (buf[0] != '\0')					\
 			strlcat(buf, ", ", sizeof(buf));		\
 		strlcat(buf, (#flag) + 4, sizeof(buf));			\
 		mflags &= ~(flag);					\
 	}								\
 } while (0)
 	MNT_FLAG(MNT_RDONLY);
 	MNT_FLAG(MNT_SYNCHRONOUS);
 	MNT_FLAG(MNT_NOEXEC);
 	MNT_FLAG(MNT_NOSUID);
 	MNT_FLAG(MNT_NFS4ACLS);
 	MNT_FLAG(MNT_UNION);
 	MNT_FLAG(MNT_ASYNC);
 	MNT_FLAG(MNT_SUIDDIR);
 	MNT_FLAG(MNT_SOFTDEP);
 	MNT_FLAG(MNT_NOSYMFOLLOW);
 	MNT_FLAG(MNT_GJOURNAL);
 	MNT_FLAG(MNT_MULTILABEL);
 	MNT_FLAG(MNT_ACLS);
 	MNT_FLAG(MNT_NOATIME);
 	MNT_FLAG(MNT_NOCLUSTERR);
 	MNT_FLAG(MNT_NOCLUSTERW);
 	MNT_FLAG(MNT_SUJ);
 	MNT_FLAG(MNT_EXRDONLY);
 	MNT_FLAG(MNT_EXPORTED);
 	MNT_FLAG(MNT_DEFEXPORTED);
 	MNT_FLAG(MNT_EXPORTANON);
 	MNT_FLAG(MNT_EXKERB);
 	MNT_FLAG(MNT_EXPUBLIC);
 	MNT_FLAG(MNT_LOCAL);
 	MNT_FLAG(MNT_QUOTA);
 	MNT_FLAG(MNT_ROOTFS);
 	MNT_FLAG(MNT_USER);
 	MNT_FLAG(MNT_IGNORE);
 	MNT_FLAG(MNT_UPDATE);
 	MNT_FLAG(MNT_DELEXPORT);
 	MNT_FLAG(MNT_RELOAD);
 	MNT_FLAG(MNT_FORCE);
 	MNT_FLAG(MNT_SNAPSHOT);
 	MNT_FLAG(MNT_BYFSID);
 #undef MNT_FLAG
 	/* Whatever is left has no name above; show it in hex. */
 	if (mflags != 0) {
 		if (buf[0] != '\0')
 			strlcat(buf, ", ", sizeof(buf));
 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
 		    "0x%016jx", mflags);
 	}
 	db_printf("    mnt_flag = %s\n", buf);
 
 	buf[0] = '\0';
 	flags = mp->mnt_kern_flag;
 	/*
 	 * Same scheme as MNT_FLAG above, applied to mnt_kern_flag; here the
 	 * first five characters (the "MNTK_" prefix) are skipped.
 	 */
 #define	MNT_KERN_FLAG(flag)	do {					\
 	if (flags & (flag)) {						\
 		if (buf[0] != '\0')					\
 			strlcat(buf, ", ", sizeof(buf));		\
 		strlcat(buf, (#flag) + 5, sizeof(buf));			\
 		flags &= ~(flag);					\
 	}								\
 } while (0)
 	MNT_KERN_FLAG(MNTK_UNMOUNTF);
 	MNT_KERN_FLAG(MNTK_ASYNC);
 	MNT_KERN_FLAG(MNTK_SOFTDEP);
 	MNT_KERN_FLAG(MNTK_DRAINING);
 	MNT_KERN_FLAG(MNTK_REFEXPIRE);
 	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
 	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
 	MNT_KERN_FLAG(MNTK_NO_IOPF);
 	MNT_KERN_FLAG(MNTK_VGONE_UPPER);
 	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
 	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
 	MNT_KERN_FLAG(MNTK_MARKER);
 	MNT_KERN_FLAG(MNTK_USES_BCACHE);
 	MNT_KERN_FLAG(MNTK_FPLOOKUP);
 	MNT_KERN_FLAG(MNTK_NOASYNC);
 	MNT_KERN_FLAG(MNTK_UNMOUNT);
 	MNT_KERN_FLAG(MNTK_MWAIT);
 	MNT_KERN_FLAG(MNTK_SUSPEND);
 	MNT_KERN_FLAG(MNTK_SUSPEND2);
 	MNT_KERN_FLAG(MNTK_SUSPENDED);
 	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
 	MNT_KERN_FLAG(MNTK_NOKNOTE);
 #undef MNT_KERN_FLAG
 	/* Whatever is left has no name above; show it in hex. */
 	if (flags != 0) {
 		if (buf[0] != '\0')
 			strlcat(buf, ", ", sizeof(buf));
 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
 		    "0x%08x", flags);
 	}
 	db_printf("    mnt_kern_flag = %s\n", buf);
 
 	/* Print the mount option names as a comma-separated list. */
 	db_printf("    mnt_opt = ");
 	opt = TAILQ_FIRST(mp->mnt_opt);
 	if (opt != NULL) {
 		db_printf("%s", opt->name);
 		opt = TAILQ_NEXT(opt, link);
 		while (opt != NULL) {
 			db_printf(", %s", opt->name);
 			opt = TAILQ_NEXT(opt, link);
 		}
 	}
 	db_printf("\n");
 
 	/* Every statfs field is cast explicitly to match its conversion. */
 	sp = &mp->mnt_stat;
 	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
 	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
 	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
 	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
 	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
 	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
 	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
 	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
 	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
 	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
 	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
 	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
 
 	db_printf("    mnt_cred = { uid=%u ruid=%u",
 	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
 	if (jailed(mp->mnt_cred))
 		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
 	db_printf(" }\n");
 	/*
 	 * For mnt_ref, mnt_writeopcount and mnt_lockref the value returned by
 	 * vfs_mount_fetch_counter() is shown together with the raw field
 	 * stored in the structure.
 	 */
 	db_printf("    mnt_ref = %d (with %d in the struct)\n",
 	    vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
 	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
 	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
 	db_printf("    mnt_lazyvnodelistsize = %d\n",
 	    mp->mnt_lazyvnodelistsize);
 	db_printf("    mnt_writeopcount = %d (with %d in the struct)\n",
 	    vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
 	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
 	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
 	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
 	db_printf("    mnt_lockref = %d (with %d in the struct)\n",
 	    vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref);
 	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
 	db_printf("    mnt_secondary_accwrites = %d\n",
 	    mp->mnt_secondary_accwrites);
 	db_printf("    mnt_gjprovider = %s\n",
 	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
 	db_printf("    mnt_vfs_ops = %d\n", mp->mnt_vfs_ops);
 
 	/*
 	 * Two passes over the mount's vnode list: vnodes with a non-zero
 	 * v_holdcnt are listed as active, the others as inactive.  VMARKER
 	 * entries are skipped in both passes.
 	 */
 	db_printf("\n\nList of active vnodes\n");
 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 		if (vp->v_type != VMARKER && vp->v_holdcnt > 0) {
 			vn_printf(vp, "vnode ");
 			if (db_pager_quit)
 				break;
 		}
 	}
 	db_printf("\n\nList of inactive vnodes\n");
 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 		if (vp->v_type != VMARKER && vp->v_holdcnt == 0) {
 			vn_printf(vp, "vnode ");
 			if (db_pager_quit)
 				break;
 		}
 	}
 }
 #endif	/* DDB */
 
 /*
  * Fill in a struct xvfsconf based on a struct vfsconf.
  */
 static int
 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
 {
 	struct xvfsconf xvfsp;
 
 	bzero(&xvfsp, sizeof(xvfsp));
 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
 	xvfsp.vfc_flags = vfsp->vfc_flags;
 	/*
 	 * These are unused in userland, we keep them
 	 * to not break binary compatibility.
 	 */
 	xvfsp.vfc_vfsops = NULL;
 	xvfsp.vfc_next = NULL;
 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 }
 
 #ifdef COMPAT_FREEBSD32
 /*
  * Layout copied out by vfsconf2x32() for requests flagged SCTL_MASK32.
  * vfc_vfsops and vfc_next are 32-bit integers here and are never filled in
  * by vfsconf2x32(), which leaves them zero.
  */
 struct xvfsconf32 {
 	uint32_t	vfc_vfsops;
 	char		vfc_name[MFSNAMELEN];
 	int32_t		vfc_typenum;
 	int32_t		vfc_refcount;
 	int32_t		vfc_flags;
 	uint32_t	vfc_next;
 };
 
 static int
 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
 {
 	struct xvfsconf32 xvfsp;
 
 	bzero(&xvfsp, sizeof(xvfsp));
 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
 	xvfsp.vfc_flags = vfsp->vfc_flags;
 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 }
 #endif
 
 /*
  * Top level filesystem related information gathering: copy out one record
  * per entry of the vfsconf list, stopping at the first error.  The list is
  * walked between vfsconf_slock() and vfsconf_sunlock().
  */
 static int
 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
 {
 	struct vfsconf *vfsp;
 	int error = 0;
 
 	vfsconf_slock();
 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 #ifdef COMPAT_FREEBSD32
 		/* Requests flagged SCTL_MASK32 get the 32-bit layout. */
 		if ((req->flags & SCTL_MASK32) != 0) {
 			error = vfsconf2x32(req, vfsp);
 		} else
 #endif
 		{
 			error = vfsconf2x(req, vfsp);
 		}
 		if (error != 0)
 			break;
 	}
 	vfsconf_sunlock();
 
 	return (error);
 }
 
 /*
  * Register sysctl_vfs_conflist() as the handler of the read-only, opaque
  * "conflist" entry under _vfs.
  */
 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
     "S,xvfsconf", "List of all configured filesystems");
 
 #ifndef BURN_BRIDGES
 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
 
 /*
  * Handler for the deprecated "generic" sysctl node under _vfs.  Every call
  * logs a warning asking for world to be rebuilt.
  *
  * Supported requests, selected by name[1]:
  *   VFS_MAXTYPENUM  - copies out maxvfsconf.
  *   VFS_CONF        - copies out the vfsconf entry whose vfc_typenum equals
  *                     name[2].
  * A one-element name is forwarded to sysctl_ovfs_conf().
  *
  * Returns ENOTDIR when the name has the wrong length for the request and
  * EOPNOTSUPP for unknown requests or an unknown type number.
  */
 static int
 vfs_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	/*
 	 * name is arg1 moved back by one element, so name[1] is arg1[0] and
 	 * name[2] is arg1[1]; namelen is bumped by one to match.
 	 */
 	int *name = (int *)arg1 - 1;	/* XXX */
 	u_int namelen = arg2 + 1;	/* XXX */
 	struct vfsconf *vfsp;
 
 	log(LOG_WARNING, "userland calling deprecated sysctl, "
 	    "please rebuild world\n");
 
 #if 1 || defined(COMPAT_PRELITE2)
 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
 	if (namelen == 1)
 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
 #endif
 
 	switch (name[1]) {
 	case VFS_MAXTYPENUM:
 		if (namelen != 2)
 			return (ENOTDIR);
 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
 	case VFS_CONF:
 		if (namelen != 3)
 			return (ENOTDIR);	/* overloaded */
 		/* Find the entry with the requested type number. */
 		vfsconf_slock();
 		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 			if (vfsp->vfc_typenum == name[2])
 				break;
 		}
 		vfsconf_sunlock();
 		/* vfsp is NULL when the loop ran to the end of the list. */
 		if (vfsp == NULL)
 			return (EOPNOTSUPP);
 		/*
 		 * NOTE(review): vfsp is dereferenced below after
 		 * vfsconf_sunlock() -- confirm that entries cannot go away
 		 * here.
 		 */
 #ifdef COMPAT_FREEBSD32
 		if (req->flags & SCTL_MASK32)
 			return (vfsconf2x32(req, vfsp));
 		else
 #endif
 			return (vfsconf2x(req, vfsp));
 	}
 	return (EOPNOTSUPP);
 }
 
 /*
  * Register vfs_sysctl() as the handler of the "generic" node (number
  * VFS_GENERIC) under _vfs.
  */
 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
     CTLFLAG_MPSAFE, vfs_sysctl,
     "Generic filesystem");
 
 #if 1 || defined(COMPAT_PRELITE2)
 
 static int
 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct vfsconf *vfsp;
 	struct ovfsconf ovfs;
 
 	vfsconf_slock();
 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 		bzero(&ovfs, sizeof(ovfs));
 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
 		ovfs.vfc_index = vfsp->vfc_typenum;
 		ovfs.vfc_refcount = vfsp->vfc_refcount;
 		ovfs.vfc_flags = vfsp->vfc_flags;
 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
 		if (error != 0) {
 			vfsconf_sunlock();
 			return (error);
 		}
 	}
 	vfsconf_sunlock();
 	return (0);
 }
 
 #endif /* 1 || COMPAT_PRELITE2 */
 #endif /* !BURN_BRIDGES */
 
 /* Extra entries added to numvnodes when sysctl_vnode() sizes its buffer. */
 #define KINFO_VNODESLOP		10
 #ifdef notyet
 /*
  * Dump vnode list (via sysctl).
  *
  * Without an old buffer, only a size estimate is reported.  Otherwise up
  * to numvnodes + KINFO_VNODESLOP struct xvnode records are collected from
  * the vnode lists of all mount points that can be busied without waiting,
  * and are then copied out in one go.  Vnodes of type VNON/VBAD (or any
  * unknown type) and device vnodes without v_rdev are not reported.
  */
 /* ARGSUSED */
 static int
 sysctl_vnode(SYSCTL_HANDLER_ARGS)
 {
 	struct xvnode *xvn;
 	struct mount *mp;
 	struct vnode *vp;
 	int error, len, maxn, n;
 
 	/*
 	 * Stale numvnodes access is not fatal here.
 	 */
 	req->lock = 0;
 	/*
 	 * maxn is the buffer capacity in records, len the same in bytes.
 	 * The fill loop below must be bounded by maxn: n counts records,
 	 * so comparing it against the byte size would let xvn[n] run past
 	 * the allocation.
 	 */
 	maxn = numvnodes + KINFO_VNODESLOP;
 	len = maxn * sizeof *xvn;
 	if (!req->oldptr)
 		/* Make an estimate */
 		return (SYSCTL_OUT(req, 0, len));
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
 	n = 0;
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
 			continue;
 		MNT_ILOCK(mp);
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 			if (n == maxn)
 				break;
 			vref(vp);
 			xvn[n].xv_size = sizeof *xvn;
 			xvn[n].xv_vnode = vp;
 			xvn[n].xv_id = 0;	/* XXX compat */
 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
 			XV_COPY(usecount);
 			XV_COPY(writecount);
 			XV_COPY(holdcnt);
 			XV_COPY(mount);
 			XV_COPY(numoutput);
 			XV_COPY(type);
 #undef XV_COPY
 			xvn[n].xv_flag = vp->v_vflag;
 
 			switch (vp->v_type) {
 			case VREG:
 			case VDIR:
 			case VLNK:
 				break;
 			case VBLK:
 			case VCHR:
 				if (vp->v_rdev == NULL) {
 					vrele(vp);
 					continue;
 				}
 				xvn[n].xv_dev = dev2udev(vp->v_rdev);
 				break;
 			case VSOCK:
 				xvn[n].xv_socket = vp->v_socket;
 				break;
 			case VFIFO:
 				xvn[n].xv_fifo = vp->v_fifoinfo;
 				break;
 			case VNON:
 			case VBAD:
 			default:
 				/* shouldn't happen? */
 				vrele(vp);
 				continue;
 			}
 			vrele(vp);
 			++n;
 		}
 		MNT_IUNLOCK(mp);
 		mtx_lock(&mountlist_mtx);
 		vfs_unbusy(mp);
 		if (n == maxn)
 			break;
 	}
 	mtx_unlock(&mountlist_mtx);
 
 	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
 	free(xvn, M_TEMP);
 	return (error);
 }
 
 /*
  * Register sysctl_vnode() as the handler of the read-only, opaque "vnode"
  * entry (number KERN_VNODE) under _kern.
  */
 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
     CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
     "");
 #endif
 
 /*
  * Forcibly unmount mp on behalf of curthread.  On failure a message naming
  * the mount point is printed, with the error shown as "BUSY" for EBUSY and
  * as a number otherwise.
  */
 static void
 unmount_or_warn(struct mount *mp)
 {
 	int rv;
 
 	rv = dounmount(mp, MNT_FORCE, curthread);
 	if (rv == 0)
 		return;
 
 	printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
 	if (rv != EBUSY)
 		printf("%d)\n", rv);
 	else
 		printf("BUSY)\n");
 }
 
 /*
  * Unmount all filesystems.  The mount list is walked in reverse order of
  * mounting to avoid dependencies; rootdevmp is left for last.
  */
 void
 vfs_unmountall(void)
 {
 	struct mount *mp, *prev;
 
 	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
 
 	/*
 	 * Since this only runs when rebooting, it is not interlocked.
 	 */
 	TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, prev) {
 		vfs_ref(mp);
 
 		/*
 		 * rootdevmp is only referenced here and unmounted after the
 		 * loop: forcibly unmounting "/dev" before "/" would prevent
 		 * clean unmount of the latter.
 		 */
 		if (mp != rootdevmp)
 			unmount_or_warn(mp);
 	}
 
 	if (rootdevmp != NULL)
 		unmount_or_warn(rootdevmp);
 }
 
 /*
  * Process a vnode whose VI_DEFINACT flag has just been cleared by the
  * caller.  Must be entered with the vnode interlock held.
  *
  * If VI_OWEINACT is not set, the vnode is simply dropped with vdropl().
  * Otherwise the vnode lock is acquired with lkflags; on success
  * vinactive() is run and the vnode is dropped, on failure the vnode is
  * handed to vdefer_inactive_unlocked().
  */
 static void
 vfs_deferred_inactive(struct vnode *vp, int lkflags)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set"));
 
 	/* Nothing owed: just drop the vnode. */
 	if ((vp->v_iflag & VI_OWEINACT) == 0) {
 		vdropl(vp);
 		return;
 	}
 
 	/* Could not get the vnode lock: defer the work again. */
 	if (vn_lock(vp, lkflags) != 0) {
 		vdefer_inactive_unlocked(vp);
 		return;
 	}
 
 	VI_LOCK(vp);
 	vinactive(vp);
 	VOP_UNLOCK(vp);
 	vdropl(vp);
 }
 
 /*
  * Filter for MNT_VNODE_FOREACH_LAZY: non-zero when the vnode has
  * VI_DEFINACT set.  arg is not used.
  */
 static int
 vfs_periodic_inactive_filter(struct vnode *vp, void *arg)
 {
 	int deferred;
 
 	deferred = vp->v_iflag & VI_DEFINACT;
 	return (deferred);
 }
 
 /*
  * Walk the lazy vnodes of mp that pass vfs_periodic_inactive_filter() and
  * run the deferred inactive processing on each vnode that still has
  * VI_DEFINACT set.  Unless flags is MNT_WAIT, vnode locks are requested
  * with LK_NOWAIT.
  */
 static void __noinline
 vfs_periodic_inactive(struct mount *mp, int flags)
 {
 	struct vnode *vp, *marker;
 	int lkflags;
 
 	lkflags = LK_EXCLUSIVE | LK_INTERLOCK |
 	    (flags != MNT_WAIT ? LK_NOWAIT : 0);
 
 	MNT_VNODE_FOREACH_LAZY(vp, mp, marker, vfs_periodic_inactive_filter,
 	    NULL) {
 		/* Re-check the flag now that we are looking at the vnode. */
 		if ((vp->v_iflag & VI_DEFINACT) != 0) {
 			vp->v_iflag &= ~VI_DEFINACT;
 			vfs_deferred_inactive(vp, lkflags);
 		} else {
 			VI_UNLOCK(vp);
 		}
 	}
 }
 
 /*
  * Tell whether the vnode's pages should be written back: true when
  * VV_NOSYNC is clear, the vnode has a VM object and
  * vm_object_mightbedirty() says so.
  */
 static inline bool
 vfs_want_msync(struct vnode *vp)
 {
 	struct vm_object *object;
 
 	/*
 	 * This test may be performed without any locks held.
 	 * We rely on vm_object's type stability.
 	 */
 	if ((vp->v_vflag & VV_NOSYNC) != 0)
 		return (false);
 	object = vp->v_object;
 	if (object == NULL)
 		return (false);
 	return (vm_object_mightbedirty(object));
 }
 
 /*
  * Filter for MNT_VNODE_FOREACH_LAZY: reject VV_NOSYNC vnodes, otherwise
  * accept a vnode that has VI_DEFINACT set or for which vfs_want_msync()
  * is true.
  */
 static int
 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused)
 {
 
 	if ((vp->v_vflag & VV_NOSYNC) != 0)
 		return (false);
 	return ((vp->v_iflag & VI_DEFINACT) != 0 || vfs_want_msync(vp));
 }
 
 /*
  * Walk the lazy vnodes of mp that pass vfs_periodic_msync_inactive_filter()
  * and, for each one, both handle a pending VI_DEFINACT and write back the
  * pages of its VM object.
  *
  * Unless flags is MNT_WAIT, vnode locks are requested with LK_NOWAIT and
  * the page cleaning is done with OBJPC_NOSYNC instead of OBJPC_SYNC.
  *
  * NOTE(review): td is assigned from curthread but is only referenced by the
  * vget() line that this diff removes.
  */
 static void __noinline
 vfs_periodic_msync_inactive(struct mount *mp, int flags)
 {
 	struct vnode *vp, *mvp;
 	struct vm_object *obj;
 	struct thread *td;
 	int lkflags, objflags;
 	bool seen_defer;
 
 	td = curthread;
 
 	lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
 	if (flags != MNT_WAIT) {
 		lkflags |= LK_NOWAIT;
 		objflags = OBJPC_NOSYNC;
 	} else {
 		objflags = OBJPC_SYNC;
 	}
 
 	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) {
 		/* Clear VI_DEFINACT and remember that it was set. */
 		seen_defer = false;
 		if (vp->v_iflag & VI_DEFINACT) {
 			vp->v_iflag &= ~VI_DEFINACT;
 			seen_defer = true;
 		}
 		/*
 		 * No msync wanted: either finish the deferred inactive
 		 * processing or just release the interlock.
 		 */
 		if (!vfs_want_msync(vp)) {
 			if (seen_defer)
 				vfs_deferred_inactive(vp, lkflags);
 			else
 				VI_UNLOCK(vp);
 			continue;
 		}
-		if (vget(vp, lkflags, td) == 0) {
+		if (vget(vp, lkflags) == 0) {
 			obj = vp->v_object;
 			/* Re-check the object and VV_NOSYNC after vget(). */
 			if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) {
 				VM_OBJECT_WLOCK(obj);
 				vm_object_page_clean(obj, 0, 0, objflags);
 				VM_OBJECT_WUNLOCK(obj);
 			}
 			vput(vp);
 			if (seen_defer)
 				vdrop(vp);
 		} else {
 			/* vget() failed: put the deferral back, if any. */
 			if (seen_defer)
 				vdefer_inactive_unlocked(vp);
 		}
 	}
 }
 
 /*
  * Periodic per-mount processing.  Mounts flagged MNTK_NOMSYNC only get the
  * deferred inactive pass; all others get the combined msync and inactive
  * pass.
  */
 void
 vfs_periodic(struct mount *mp, int flags)
 {
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 
 	if ((mp->mnt_kern_flag & MNTK_NOMSYNC) == 0)
 		vfs_periodic_msync_inactive(mp, flags);
 	else
 		vfs_periodic_inactive(mp, flags);
 }
 
 /*
  * Tear down a vpollinfo: destroy its knote list and its mutex, then
  * return the structure to vnodepoll_zone.
  */
 static void
 destroy_vpollinfo_free(struct vpollinfo *vi)
 {
 
 	knlist_destroy(&vi->vpi_selinfo.si_note);
 	mtx_destroy(&vi->vpi_lock);
 	uma_zfree(vnodepoll_zone, vi);
 }
 
 /*
  * Clear the knote list and drain the selinfo of a vpollinfo, then tear it
  * down and free it with destroy_vpollinfo_free().
  */
 static void
 destroy_vpollinfo(struct vpollinfo *vi)
 {
 
 	knlist_clear(&vi->vpi_selinfo.si_note, 1);
 	seldrain(&vi->vpi_selinfo);
 	destroy_vpollinfo_free(vi);
 }
 
 /*
  * Initialize per-vnode helper structure to hold poll-related state.
  *
  * Does nothing if the vnode already has one.  The new structure is
  * allocated and initialized without the vnode interlock held, so the
  * pointer is checked again under the interlock and the new structure is
  * thrown away if another one was installed in the meantime.
  */
 void
 v_addpollinfo(struct vnode *vp)
 {
 	struct vpollinfo *newvi;
 
 	if (vp->v_pollinfo != NULL)
 		return;
 
 	newvi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
 	mtx_init(&newvi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
 	knlist_init(&newvi->vpi_selinfo.si_note, vp, vfs_knllock,
 	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
 
 	VI_LOCK(vp);
 	if (vp->v_pollinfo == NULL) {
 		/* Ours gets installed; nothing is left to free. */
 		vp->v_pollinfo = newvi;
 		newvi = NULL;
 	}
 	VI_UNLOCK(vp);
 
 	if (newvi != NULL)
 		destroy_vpollinfo_free(newvi);
 }
 
 /*
  * Record a process's interest in events which might happen to
  * a vnode.  Because poll uses the historic select-style interface
  * internally, this routine serves as both the ``check for any
  * pending events'' and the ``record my interest in future events''
  * functions.  (These are done together, while the lock is held,
  * to avoid race conditions.)
  */
 int
 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
 {
 
 	v_addpollinfo(vp);
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
 	if (vp->v_pollinfo->vpi_revents & events) {
 		/*
 		 * This leaves events we are not interested
 		 * in available for the other process which
 		 * which presumably had requested them
 		 * (otherwise they would never have been
 		 * recorded).
 		 */
 		events &= vp->v_pollinfo->vpi_revents;
 		vp->v_pollinfo->vpi_revents &= ~events;
 
 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
 		return (events);
 	}
 	vp->v_pollinfo->vpi_events |= events;
 	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 	return (0);
 }
 
 /*
  * Routine to create and manage a filesystem syncer vnode.
  */
 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
 static int	sync_fsync(struct  vop_fsync_args *);
 static int	sync_inactive(struct  vop_inactive_args *);
 static int	sync_reclaim(struct  vop_reclaim_args *);
 
 /*
  * Operation vector of syncer vnodes; vfs_allocate_syncvnode() passes it to
  * getnewvnode().  vop_bypass is VOP_EOPNOTSUPP and close is a nullop (see
  * sync_close above).
  */
 static struct vop_vector sync_vnodeops = {
 	.vop_bypass =	VOP_EOPNOTSUPP,
 	.vop_close =	sync_close,		/* close */
 	.vop_fsync =	sync_fsync,		/* fsync */
 	.vop_inactive =	sync_inactive,	/* inactive */
 	.vop_need_inactive = vop_stdneed_inactive, /* need_inactive */
 	.vop_reclaim =	sync_reclaim,	/* reclaim */
 	.vop_lock1 =	vop_stdlock,	/* lock */
 	.vop_unlock =	vop_stdunlock,	/* unlock */
 	.vop_islocked =	vop_stdislocked,	/* islocked */
 };
 VFS_VOP_VECTOR_REGISTER(sync_vnodeops);
 
 /*
  * Create a new filesystem syncer vnode for the specified mount point.
  *
  * The vnode is allocated with sync_vnodeops, inserted into the mount's
  * vnode list and put on the syncer worklist.  It becomes mp->mnt_syncer
  * only if the mount has no syncer yet; otherwise the new vnode is
  * destroyed again.  Failure of getnewvnode() or insmntque() panics.
  */
 void
 vfs_allocate_syncvnode(struct mount *mp)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 	/* Worklist slot selection state, kept across calls. */
 	static long start, incr, next;
 	int error;
 
 	/* Allocate a new vnode */
 	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
 	if (error != 0)
 		panic("vfs_allocate_syncvnode: getnewvnode() failed");
 	vp->v_type = VNON;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/* VV_FORCEINSMQ is set only for the duration of insmntque(). */
 	vp->v_vflag |= VV_FORCEINSMQ;
 	error = insmntque(vp, mp);
 	if (error != 0)
 		panic("vfs_allocate_syncvnode: insmntque() failed");
 	vp->v_vflag &= ~VV_FORCEINSMQ;
 	VOP_UNLOCK(vp);
 	/*
 	 * Place the vnode onto the syncer worklist. We attempt to
 	 * scatter them about on the list so that they will go off
 	 * at evenly distributed times even if all the filesystems
 	 * are mounted at once.
 	 */
 	next += incr;
 	if (next == 0 || next > syncer_maxdelay) {
 		start /= 2;
 		incr /= 2;
 		if (start == 0) {
 			start = syncer_maxdelay / 2;
 			incr = syncer_maxdelay;
 		}
 		next = start;
 	}
 	bo = &vp->v_bufobj;
 	BO_LOCK(bo);
 	/* Guard the modulo against a syncdelay of zero or less. */
 	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
 	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
 	mtx_lock(&sync_mtx);
 	sync_vnode_count++;
 	/* vp is set to NULL when it was installed as the mount's syncer. */
 	if (mp->mnt_syncer == NULL) {
 		mp->mnt_syncer = vp;
 		vp = NULL;
 	}
 	mtx_unlock(&sync_mtx);
 	BO_UNLOCK(bo);
 	/* The mount already had a syncer: get rid of the one just made. */
 	if (vp != NULL) {
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		vgone(vp);
 		vput(vp);
 	}
 }
 
 /*
  * Detach the syncer vnode from the mount point, if it has one, and release
  * the reference to it.  mnt_syncer is read and cleared under sync_mtx; the
  * vrele() happens after the mutex has been dropped.
  */
 void
 vfs_deallocate_syncvnode(struct mount *mp)
 {
 	struct vnode *syncvp;
 
 	mtx_lock(&sync_mtx);
 	syncvp = mp->mnt_syncer;
 	if (syncvp != NULL)
 		mp->mnt_syncer = NULL;
 	mtx_unlock(&sync_mtx);
 
 	if (syncvp == NULL)
 		return;
 	vrele(syncvp);
 }
 
 /*
  * Do a lazy sync of the filesystem.
  *
  * Requests other than MNT_LAZY are a no-op.  Returns 0 as well when the
  * mount cannot be busied or a write cannot be started without waiting;
  * otherwise returns the result of VFS_SYNC().
  */
 static int
 sync_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *syncvp;
 	struct mount *mp;
 	struct bufobj *bo;
 	int error, save;
 
 	/*
 	 * We only need to do something if this is a lazy evaluation.
 	 */
 	if (ap->a_waitfor != MNT_LAZY)
 		return (0);
 
 	syncvp = ap->a_vp;
 	mp = syncvp->v_mount;
 
 	/*
 	 * Move ourselves to the back of the sync list.
 	 */
 	bo = &syncvp->v_bufobj;
 	BO_LOCK(bo);
 	vn_syncer_add_to_worklist(bo, syncdelay);
 	BO_UNLOCK(bo);
 
 	/*
 	 * Walk the list of vnodes pushing all that are dirty and
 	 * not already on the sync list.
 	 */
 	if (vfs_busy(mp, MBF_NOWAIT) != 0)
 		return (0);
 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
 		vfs_unbusy(mp);
 		return (0);
 	}
 
 	/* TDP_SYNCIO is set for the duration of the sync only. */
 	save = curthread_pflags_set(TDP_SYNCIO);
 	/*
 	 * The filesystem at hand may be idle with free vnodes stored in the
 	 * batch.  Return them instead of letting them stay there indefinitely.
 	 */
 	vfs_periodic(mp, MNT_NOWAIT);
 	error = VFS_SYNC(mp, MNT_LAZY);
 	curthread_pflags_restore(save);
 
 	vn_finished_write(mp);
 	vfs_unbusy(mp);
 	return (error);
 }
 
 /*
  * The syncer vnode is no longer referenced: get rid of it with vgone().
  * Always returns 0.
  */
 static int
 sync_inactive(struct vop_inactive_args *ap)
 {
 	struct vnode *vp;
 
 	vp = ap->a_vp;
 	vgone(vp);
 	return (0);
 }
 
 /*
  * The syncer vnode is no longer needed and is being decommissioned.
  * It is detached from its mount point if it is still that mount's syncer,
  * and taken off the syncer worklist if it is on it.  Always returns 0.
  *
  * Modifications to the worklist must be protected by sync_mtx.
  */
 static int
 sync_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 
 	vp = ap->a_vp;
 	bo = &vp->v_bufobj;
 
 	BO_LOCK(bo);
 	mtx_lock(&sync_mtx);
 	if (vp->v_mount->mnt_syncer == vp)
 		vp->v_mount->mnt_syncer = NULL;
 	if ((bo->bo_flag & BO_ONWORKLST) != 0) {
 		LIST_REMOVE(bo, bo_synclist);
 		bo->bo_flag &= ~BO_ONWORKLST;
 		syncer_worklist_len--;
 		sync_vnode_count--;
 	}
 	mtx_unlock(&sync_mtx);
 	BO_UNLOCK(bo);
 
 	return (0);
 }
 
 /*
  * Return 1 if the vnode has a VM object that might be dirty and
  * VV_NOSYNC is not set, 0 otherwise.  The vnode interlock must be owned
  * by the caller.
  */
 int
 vn_need_pageq_flush(struct vnode *vp)
 {
 	struct vm_object *object;
 
 	MPASS(mtx_owned(VI_MTX(vp)));
 
 	object = vp->v_object;
 	if (object == NULL)
 		return (0);
 	if ((vp->v_vflag & VV_NOSYNC) != 0)
 		return (0);
 	return (vm_object_mightbedirty(object) ? 1 : 0);
 }
 
 /*
  * Check if vnode represents a disk device.
  *
  * Returns non-zero if it does.  When errp is not NULL, *errp is set to 0
  * on success, to ENOTBLK if the vnode is not a VCHR vnode or its device
  * lacks D_DISK, and to ENXIO if the vnode has no device or the device has
  * no devsw.
  */
 int
 vn_isdisk(struct vnode *vp, int *errp)
 {
 	int error;
 
 	if (vp->v_type != VCHR) {
 		error = ENOTBLK;
 	} else {
 		error = 0;
 		/* The device fields are examined under dev_lock(). */
 		dev_lock();
 		if (vp->v_rdev == NULL || vp->v_rdev->si_devsw == NULL)
 			error = ENXIO;
 		else if ((vp->v_rdev->si_devsw->d_flags & D_DISK) == 0)
 			error = ENOTBLK;
 		dev_unlock();
 	}
 
 	if (errp != NULL)
 		*errp = error;
 	return (error == 0);
 }
 
 /*
  * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
  * the comment above cache_fplookup for details.
  *
  * Exactly one execute bit is consulted: the owner bit if cred's uid
  * matches file_uid, else the group bit if cred is a member of file_gid,
  * else the "other" bit.  Returns 0 if that bit is set.
  *
  * We never deny as priv_check_cred calls are not yet supported, see vaccess;
  * EAGAIN is returned instead.
  */
 int
 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred)
 {
 	mode_t xbit;
 
 	VFS_SMR_ASSERT_ENTERED();
 
 	if (cred->cr_uid == file_uid)
 		xbit = S_IXUSR;		/* owner */
 	else if (groupmember(file_gid, cred))
 		xbit = S_IXGRP;		/* group, first match */
 	else
 		xbit = S_IXOTH;		/* everyone else */
 
 	return ((file_mode & xbit) != 0 ? 0 : EAGAIN);
 }
 
 /*
  * Common filesystem object access control check routine.
  *
  *   type       - vnode type; only VDIR is treated specially (see below)
  *   file_mode  - permission bits of the object
  *   file_uid,
  *   file_gid   - owner and group of the object
  *   accmode    - requested access; may contain only VEXEC, VWRITE, VREAD,
  *                VADMIN and VAPPEND, and VAPPEND requires VWRITE (both
  *                conditions are asserted)
  *   cred       - credentials of the requester
  *
  * Exactly one permission class is consulted: owner if cr_uid matches
  * file_uid, else group if groupmember() succeeds, else other.  The owner is
  * additionally granted VADMIN.  Whatever the class does not grant may still
  * be granted by privilege via priv_check_cred().
  *
  * Returns 0 on success.  On failure returns EPERM if VADMIN was requested,
  * EACCES otherwise.
  */
 int
 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
     accmode_t accmode, struct ucred *cred)
 {
 	accmode_t dac_granted;
 	accmode_t priv_granted;
 
 	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
 	    ("invalid bit in accmode"));
 	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
 	    ("VAPPEND without VWRITE"));
 
 	/*
 	 * Look for a normal, non-privileged way to access the file/directory
 	 * as requested.  If it exists, go with that.
 	 */
 
 	dac_granted = 0;
 
 	/* Check the owner. */
 	if (cred->cr_uid == file_uid) {
 		dac_granted |= VADMIN;
 		if (file_mode & S_IXUSR)
 			dac_granted |= VEXEC;
 		if (file_mode & S_IRUSR)
 			dac_granted |= VREAD;
 		if (file_mode & S_IWUSR)
 			dac_granted |= (VWRITE | VAPPEND);
 
 		if ((accmode & dac_granted) == accmode)
 			return (0);
 
 		goto privcheck;
 	}
 
 	/* Otherwise, check the groups (first match) */
 	if (groupmember(file_gid, cred)) {
 		if (file_mode & S_IXGRP)
 			dac_granted |= VEXEC;
 		if (file_mode & S_IRGRP)
 			dac_granted |= VREAD;
 		if (file_mode & S_IWGRP)
 			dac_granted |= (VWRITE | VAPPEND);
 
 		if ((accmode & dac_granted) == accmode)
 			return (0);
 
 		goto privcheck;
 	}
 
 	/* Otherwise, check everyone else. */
 	if (file_mode & S_IXOTH)
 		dac_granted |= VEXEC;
 	if (file_mode & S_IROTH)
 		dac_granted |= VREAD;
 	if (file_mode & S_IWOTH)
 		dac_granted |= (VWRITE | VAPPEND);
 	if ((accmode & dac_granted) == accmode)
 		return (0);
 
 privcheck:
 	/*
 	 * Build a privilege mask to determine if the set of privileges
 	 * satisfies the requirements when combined with the granted mask
 	 * from above.  For each privilege, if the privilege is required,
 	 * bitwise or the request type onto the priv_granted mask.
 	 *
 	 * priv_check_cred() is only called for access bits that were
 	 * requested and not already granted above.
 	 */
 	priv_granted = 0;
 
 	if (type == VDIR) {
 		/*
 		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
 		 * requests, instead of PRIV_VFS_EXEC.
 		 */
 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 		    !priv_check_cred(cred, PRIV_VFS_LOOKUP))
 			priv_granted |= VEXEC;
 	} else {
 		/*
 		 * Ensure that at least one execute bit is on. Otherwise,
 		 * a privileged user will always succeed, and we don't want
 		 * this to happen unless the file really is executable.
 		 */
 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
 		    !priv_check_cred(cred, PRIV_VFS_EXEC))
 			priv_granted |= VEXEC;
 	}
 
 	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
 	    !priv_check_cred(cred, PRIV_VFS_READ))
 		priv_granted |= VREAD;
 
 	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
 	    !priv_check_cred(cred, PRIV_VFS_WRITE))
 		priv_granted |= (VWRITE | VAPPEND);
 
 	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
 	    !priv_check_cred(cred, PRIV_VFS_ADMIN))
 		priv_granted |= VADMIN;
 
 	if ((accmode & (priv_granted | dac_granted)) == accmode) {
 		return (0);
 	}
 
 	return ((accmode & VADMIN) ? EPERM : EACCES);
 }
 
 /*
  * Credential check based on process requesting service, and per-attribute
  * permissions.
  *
  * NOCRED always succeeds.  The system namespace requires
  * PRIV_VFS_EXTATTR_SYSTEM, the user namespace is subject to VOP_ACCESS()
  * with the given accmode, and any other namespace yields EPERM.
  */
 int
 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
     struct thread *td, accmode_t accmode)
 {
 
 	/*
 	 * Kernel-invoked always succeeds.
 	 */
 	if (cred == NOCRED)
 		return (0);
 
 	/*
 	 * Do not allow privileged processes in jail to directly manipulate
 	 * system attributes.
 	 */
 	if (attrnamespace == EXTATTR_NAMESPACE_SYSTEM) {
 		/* Potentially should be: return (EPERM); */
 		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
 	}
 	if (attrnamespace == EXTATTR_NAMESPACE_USER)
 		return (VOP_ACCESS(vp, accmode, cred, td));
 
 	return (EPERM);
 }
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * This only exists to suppress warnings from unlocked specfs accesses.  It is
  * no longer ok to have an unlocked VFS.
  *
  * Evaluates to true when the vnode lock assertions below should be skipped:
  * the kernel has panicked, vp is NULL, or vp is a VCHR or VBAD vnode.
  */
 #define	IGNORE_LOCK(vp) (KERNEL_PANICKED() || (vp) == NULL ||		\
 	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)
 
 /*
  * Tunables controlling what happens on a vnode lock violation.  All of them
  * default to 1 and are exported read-write under _debug.  vfs_badlock_mutex
  * gates the interlock assertions; the others select the actions taken by
  * vfs_badlock().
  */
 int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
     "Drop into debugger on lock violation");
 
 int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
     0, "Check for interlock across VOPs");
 
 int vfs_badlock_print = 1;	/* Print lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
     0, "Print lock violations");
 
 int vfs_badlock_vnode = 1;	/* Print vnode details on lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
     0, "Print vnode details on lock violations");
 
 #ifdef KDB
 int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
     &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
 #endif
 
 /*
  * Report a vnode lock violation.  Each step is gated by its own
  * vfs_badlock_* knob and they run in this order: backtrace (KDB kernels
  * only), vnode dump, a "<str>: <vp> <msg>" line, and entering the debugger.
  */
 static void
 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
 {
 
 #ifdef KDB
 	if (vfs_badlock_backtrace != 0)
 		kdb_backtrace();
 #endif
 	if (vfs_badlock_vnode != 0)
 		vn_printf(vp, "vnode ");
 	if (vfs_badlock_print != 0)
 		printf("%s: %p %s\n", str, (void *)vp, msg);
 	if (vfs_badlock_ddb != 0)
 		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
 }
 
 /*
  * Complain through vfs_badlock() when mtx_owned() says the interlock of 'vp'
  * is not held.  Nothing is checked while vfs_badlock_mutex is 0.
  */
 void
 assert_vi_locked(struct vnode *vp, const char *str)
 {
 
 	if (!vfs_badlock_mutex)
 		return;
 	if (!mtx_owned(VI_MTX(vp)))
 		vfs_badlock("interlock is not locked but should be", str, vp);
 }
 
 /*
  * Complain through vfs_badlock() when mtx_owned() says the interlock of 'vp'
  * is held.  Nothing is checked while vfs_badlock_mutex is 0.
  */
 void
 assert_vi_unlocked(struct vnode *vp, const char *str)
 {
 
 	if (!vfs_badlock_mutex)
 		return;
 	if (mtx_owned(VI_MTX(vp)))
 		vfs_badlock("interlock is locked but should not be", str, vp);
 }
 
 /*
  * Complain when the vnode lock of 'vp' is either not held at all or held
  * exclusively by somebody else (LK_EXCLOTHER).  Vnodes matched by
  * IGNORE_LOCK() are not checked.
  */
 void
 assert_vop_locked(struct vnode *vp, const char *str)
 {
 	int state;
 
 	if (IGNORE_LOCK(vp))
 		return;
 	state = VOP_ISLOCKED(vp);
 	if (state == 0 || state == LK_EXCLOTHER)
 		vfs_badlock("is not locked but should be", str, vp);
 }
 
 /*
  * Complain when VOP_ISLOCKED() reports LK_EXCLUSIVE for 'vp'.  Vnodes
  * matched by IGNORE_LOCK() are not checked.
  */
 void
 assert_vop_unlocked(struct vnode *vp, const char *str)
 {
 
 	if (IGNORE_LOCK(vp))
 		return;
 	if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
 		vfs_badlock("is locked but should not be", str, vp);
 }
 
 /*
  * Complain unless VOP_ISLOCKED() reports LK_EXCLUSIVE for 'vp'.  Vnodes
  * matched by IGNORE_LOCK() are not checked.
  */
 void
 assert_vop_elocked(struct vnode *vp, const char *str)
 {
 
 	if (IGNORE_LOCK(vp))
 		return;
 	if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 		vfs_badlock("is not exclusive locked but should be", str, vp);
 }
 #endif /* DEBUG_VFS_LOCKS */
 
 /*
  * Dispose of the four vnodes of a rename argument block: the target vnode
  * (when present) is vput(); the target directory is vput() too unless it is
  * the very same vnode as the target, in which case it is only vrele()d; both
  * source vnodes are vrele()d.
  */
 void
 vop_rename_fail(struct vop_rename_args *ap)
 {
 	struct vnode *tdvp, *tvp;
 
 	tvp = ap->a_tvp;
 	tdvp = ap->a_tdvp;
 
 	if (tvp != NULL)
 		vput(tvp);
 	if (tdvp != tvp)
 		vput(tdvp);
 	else
 		vrele(tdvp);
 	vrele(ap->a_fdvp);
 	vrele(ap->a_fvp);
 }
 
 /*
  * Rename pre-hook; 'ap' is a struct vop_rename_args.
  *
  * With DEBUG_VFS_LOCKS it first asserts that no interlock of the involved
  * vnodes is held, that the source vnodes are unlocked (skipped when a source
  * shares its v_vnlock with a target vnode) and that the target vnodes are
  * locked.
  *
  * It then takes hold references: on fdvp unless it is tdvp, on fvp unless it
  * is tvp, on tdvp, and on tvp when there is one.  vop_rename_post() drops
  * them using the same conditions.
  */
 void
 vop_rename_pre(void *ap)
 {
 	struct vop_rename_args *a = ap;
 
 #ifdef DEBUG_VFS_LOCKS
 	if (a->a_tvp)
 		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
 
 	/* Check the source (from). */
 	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
 	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
 	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
 
 	/* Check the target. */
 	if (a->a_tvp)
 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
 #endif
 	/*
 	 * It may be tempting to add vn_seqc_write_begin/end calls here and
 	 * in vop_rename_post but that's not going to work out since some
 	 * filesystems relookup vnodes mid-rename. This is probably a bug.
 	 *
 	 * For now filesystems are expected to do the relevant calls after they
 	 * decide what vnodes to operate on.
 	 */
 	if (a->a_tdvp != a->a_fdvp)
 		vhold(a->a_fdvp);
 	if (a->a_tvp != a->a_fvp)
 		vhold(a->a_fvp);
 	vhold(a->a_tdvp);
 	if (a->a_tvp)
 		vhold(a->a_tvp);
 }
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * Debug pre-hook for fplookup_vexec: only asserts VFS_SMR_ASSERT_ENTERED();
  * the argument block is not inspected.
  */
 void
 vop_fplookup_vexec_debugpre(void *ap __unused)
 {
 
 	VFS_SMR_ASSERT_ENTERED();
 }
 
 /*
  * Debug post-hook for fplookup_vexec: asserts VFS_SMR_ASSERT_ENTERED() again
  * after the operation; neither the argument block nor 'rc' is inspected.
  */
 void
 vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused)
 {
 
 	VFS_SMR_ASSERT_ENTERED();
 }
 
 /*
  * Debug pre-hook for the strategy operation: report (print and/or enter the
  * debugger, per the vfs_badlock_* knobs) when the buffer being submitted is
  * not locked.  Nothing is reported once the kernel has panicked.
  */
 void
 vop_strategy_debugpre(void *ap)
 {
 	struct vop_strategy_args *args = ap;
 	struct buf *bp = args->a_bp;
 
 	/*
 	 * Cluster ops lock their component buffers but not the IO container.
 	 */
 	if ((bp->b_flags & B_CLUSTER) != 0)
 		return;
 
 	if (KERNEL_PANICKED() || BUF_ISLOCKED(bp))
 		return;
 
 	if (vfs_badlock_print)
 		printf(
 		    "VOP_STRATEGY: bp is not locked but should be\n");
 	if (vfs_badlock_ddb)
 		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
 }
 
 /*
  * Debug pre-hook for the lock operation: the vnode interlock must be held
  * exactly when the caller passed LK_INTERLOCK.
  */
 void
 vop_lock_debugpre(void *ap)
 {
 	struct vop_lock1_args *args = ap;
 
 	if ((args->a_flags & LK_INTERLOCK) != 0)
 		ASSERT_VI_LOCKED(args->a_vp, "VOP_LOCK");
 	else
 		ASSERT_VI_UNLOCKED(args->a_vp, "VOP_LOCK");
 }
 
 /*
  * Debug post-hook for the lock operation: the interlock must be released on
  * return, and after a successful call without LK_EXCLOTHER in the flags the
  * vnode lock must be held.
  */
 void
 vop_lock_debugpost(void *ap, int rc)
 {
 	struct vop_lock1_args *args = ap;
 
 	ASSERT_VI_UNLOCKED(args->a_vp, "VOP_LOCK");
 	if (rc != 0)
 		return;
 	if ((args->a_flags & LK_EXCLOTHER) == 0)
 		ASSERT_VOP_LOCKED(args->a_vp, "VOP_LOCK");
 }
 
 /*
  * Debug pre-hook for the unlock operation: the vnode being unlocked must
  * currently be locked.
  */
 void
 vop_unlock_debugpre(void *ap)
 {
 	struct vop_unlock_args *args = ap;
 
 	ASSERT_VOP_LOCKED(args->a_vp, "VOP_UNLOCK");
 }
 
 /*
  * Debug pre-hook for the need_inactive operation: the vnode interlock must
  * be held on entry.
  */
 void
 vop_need_inactive_debugpre(void *ap)
 {
 	struct vop_need_inactive_args *args = ap;
 
 	ASSERT_VI_LOCKED(args->a_vp, "VOP_NEED_INACTIVE");
 }
 
 /*
  * Debug post-hook for the need_inactive operation: the vnode interlock must
  * still be held on return.  'rc' is not consulted.
  */
 void
 vop_need_inactive_debugpost(void *ap, int rc)
 {
 	struct vop_need_inactive_args *args = ap;
 
 	ASSERT_VI_LOCKED(args->a_vp, "VOP_NEED_INACTIVE");
 }
 #endif
 
 /*
  * vop_create pre-hook: enter a seqc write section on the directory vnode
  * (a_dvp).
  */
 void
 vop_create_pre(void *ap)
 {
 	struct vop_create_args *args = ap;
 
 	vn_seqc_write_begin(args->a_dvp);
 }
 
 /*
  * vop_create post-hook: leave the seqc write section on a_dvp and, if the
  * operation succeeded (rc == 0), post NOTE_WRITE on it.
  */
 void
 vop_create_post(void *ap, int rc)
 {
 	struct vop_create_args *args = ap;
 	struct vnode *dir = args->a_dvp;
 
 	vn_seqc_write_end(dir);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(dir, NOTE_WRITE);
 }
 
 /*
  * vop_whiteout pre-hook: enter a seqc write section on the directory vnode
  * (a_dvp).
  */
 void
 vop_whiteout_pre(void *ap)
 {
 	struct vop_whiteout_args *args = ap;
 
 	vn_seqc_write_begin(args->a_dvp);
 }
 
 /*
  * vop_whiteout post-hook: leave the seqc write section on a_dvp.  No knote
  * is posted and 'rc' is not consulted.
  */
 void
 vop_whiteout_post(void *ap, int rc)
 {
 	struct vop_whiteout_args *args = ap;
 
 	vn_seqc_write_end(args->a_dvp);
 }
 
 /*
  * vop_deleteextattr pre-hook: enter a seqc write section on the vnode
  * (a_vp).
  */
 void
 vop_deleteextattr_pre(void *ap)
 {
 	struct vop_deleteextattr_args *args = ap;
 
 	vn_seqc_write_begin(args->a_vp);
 }
 
 /*
  * vop_deleteextattr post-hook: leave the seqc write section on a_vp and, if
  * the operation succeeded (rc == 0), post NOTE_ATTRIB on it.
  */
 void
 vop_deleteextattr_post(void *ap, int rc)
 {
 	struct vop_deleteextattr_args *args = ap;
 
 	vn_seqc_write_end(args->a_vp);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(args->a_vp, NOTE_ATTRIB);
 }
 
 /*
  * vop_link pre-hook: enter seqc write sections on the vnode being linked
  * (a_vp) and then on the target directory (a_tdvp).
  */
 void
 vop_link_pre(void *ap)
 {
 	struct vop_link_args *args = ap;
 
 	vn_seqc_write_begin(args->a_vp);
 	vn_seqc_write_begin(args->a_tdvp);
 }
 
 /*
  * vop_link post-hook: leave the seqc write sections on a_vp and a_tdvp.  If
  * the operation succeeded (rc == 0), post NOTE_LINK on the vnode and
  * NOTE_WRITE on the target directory.
  */
 void
 vop_link_post(void *ap, int rc)
 {
 	struct vop_link_args *args = ap;
 	struct vnode *node = args->a_vp;
 	struct vnode *dir = args->a_tdvp;
 
 	vn_seqc_write_end(node);
 	vn_seqc_write_end(dir);
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(node, NOTE_LINK);
 	VFS_KNOTE_LOCKED(dir, NOTE_WRITE);
 }
 
 /*
  * vop_mkdir pre-hook: enter a seqc write section on the parent directory
  * vnode (a_dvp).
  */
 void
 vop_mkdir_pre(void *ap)
 {
 	struct vop_mkdir_args *args = ap;
 
 	vn_seqc_write_begin(args->a_dvp);
 }
 
 /*
  * vop_mkdir post-hook: leave the seqc write section on a_dvp and, if the
  * operation succeeded (rc == 0), post NOTE_WRITE | NOTE_LINK on it.
  */
 void
 vop_mkdir_post(void *ap, int rc)
 {
 	struct vop_mkdir_args *args = ap;
 	struct vnode *dir = args->a_dvp;
 
 	vn_seqc_write_end(dir);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(dir, NOTE_WRITE | NOTE_LINK);
 }
 
 /*
  * vop_mknod pre-hook: enter a seqc write section on the parent directory
  * vnode (a_dvp).
  */
 void
 vop_mknod_pre(void *ap)
 {
 	struct vop_mknod_args *args = ap;
 
 	vn_seqc_write_begin(args->a_dvp);
 }
 
 /*
  * vop_mknod post-hook: leave the seqc write section on a_dvp and, if the
  * operation succeeded (rc == 0), post NOTE_WRITE on it.
  */
 void
 vop_mknod_post(void *ap, int rc)
 {
 	struct vop_mknod_args *args = ap;
 	struct vnode *dir = args->a_dvp;
 
 	vn_seqc_write_end(dir);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(dir, NOTE_WRITE);
 }
 
 /*
  * vop_reclaim post-hook: assert ASSERT_VOP_IN_SEQC() on the vnode and, if
  * the operation succeeded (rc == 0), post NOTE_REVOKE on it.
  */
 void
 vop_reclaim_post(void *ap, int rc)
 {
 	struct vop_reclaim_args *args = ap;
 	struct vnode *node = args->a_vp;
 
 	ASSERT_VOP_IN_SEQC(node);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(node, NOTE_REVOKE);
 }
 
 /*
  * vop_remove pre-hook: enter seqc write sections on the parent directory
  * (a_dvp) and then on the vnode being removed (a_vp).
  */
 void
 vop_remove_pre(void *ap)
 {
 	struct vop_remove_args *args = ap;
 
 	vn_seqc_write_begin(args->a_dvp);
 	vn_seqc_write_begin(args->a_vp);
 }
 
 /*
  * vop_remove post-hook: leave the seqc write sections on a_dvp and a_vp.
  * If the operation succeeded (rc == 0), post NOTE_WRITE on the directory
  * and NOTE_DELETE on the removed vnode.
  */
 void
 vop_remove_post(void *ap, int rc)
 {
 	struct vop_remove_args *args = ap;
 	struct vnode *dir = args->a_dvp;
 	struct vnode *node = args->a_vp;
 
 	vn_seqc_write_end(dir);
 	vn_seqc_write_end(node);
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(dir, NOTE_WRITE);
 	VFS_KNOTE_LOCKED(node, NOTE_DELETE);
 }
 
 /*
  * Rename post-hook; 'ap' is a struct vop_rename_args.
  *
  * On success (rc == 0) it posts knotes on the directories, on the renamed
  * vnode (NOTE_RENAME) and on a replaced target (NOTE_DELETE).  Regardless of
  * rc it then drops the hold references, using the same conditions under
  * which vop_rename_pre() took them.
  */
 void
 vop_rename_post(void *ap, int rc)
 {
 	struct vop_rename_args *a = ap;
 	long hint;
 
 	if (!rc) {
 		hint = NOTE_WRITE;
 		if (a->a_fdvp == a->a_tdvp) {
 			/*
 			 * Same directory: NOTE_LINK is added only when an
 			 * existing target directory was replaced.  Both
 			 * pointers name the same vnode, which is notified
 			 * twice.
 			 */
 			if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
 				hint |= NOTE_LINK;
 			VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
 			VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
 		} else {
 			/*
 			 * Different directories: both get NOTE_EXTEND, plus
 			 * NOTE_LINK when the moved vnode is a directory.
 			 */
 			hint |= NOTE_EXTEND;
 			if (a->a_fvp->v_type == VDIR)
 				hint |= NOTE_LINK;
 			VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
 
 			/*
 			 * NOTE_LINK is withheld from the target directory
 			 * when a directory replaced another directory there.
 			 */
 			if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
 			    a->a_tvp->v_type == VDIR)
 				hint &= ~NOTE_LINK;
 			VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
 		}
 
 		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
 		if (a->a_tvp)
 			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
 	}
 	/* Undo the vhold() calls made by vop_rename_pre(). */
 	if (a->a_tdvp != a->a_fdvp)
 		vdrop(a->a_fdvp);
 	if (a->a_tvp != a->a_fvp)
 		vdrop(a->a_fvp);
 	vdrop(a->a_tdvp);
 	if (a->a_tvp)
 		vdrop(a->a_tvp);
 }
 
 /*
  * vop_rmdir pre-hook: enter seqc write sections on the parent directory
  * (a_dvp) and then on the directory being removed (a_vp).
  */
 void
 vop_rmdir_pre(void *ap)
 {
 	struct vop_rmdir_args *args = ap;
 
 	vn_seqc_write_begin(args->a_dvp);
 	vn_seqc_write_begin(args->a_vp);
 }
 
 /*
  * vop_rmdir post-hook: leave the seqc write sections on a_dvp and a_vp.  If
  * the operation succeeded (rc == 0), post NOTE_WRITE | NOTE_LINK on the
  * parent directory and NOTE_DELETE on the removed directory.
  */
 void
 vop_rmdir_post(void *ap, int rc)
 {
 	struct vop_rmdir_args *args = ap;
 	struct vnode *parent = args->a_dvp;
 	struct vnode *node = args->a_vp;
 
 	vn_seqc_write_end(parent);
 	vn_seqc_write_end(node);
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(parent, NOTE_WRITE | NOTE_LINK);
 	VFS_KNOTE_LOCKED(node, NOTE_DELETE);
 }
 
 /*
  * vop_setattr pre-hook: enter a seqc write section on the vnode (a_vp).
  */
 void
 vop_setattr_pre(void *ap)
 {
 	struct vop_setattr_args *args = ap;
 
 	vn_seqc_write_begin(args->a_vp);
 }
 
 /*
  * vop_setattr post-hook: leave the seqc write section on a_vp and, if the
  * operation succeeded (rc == 0), post NOTE_ATTRIB on it.
  */
 void
 vop_setattr_post(void *ap, int rc)
 {
 	struct vop_setattr_args *args = ap;
 	struct vnode *node = args->a_vp;
 
 	vn_seqc_write_end(node);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(node, NOTE_ATTRIB);
 }
 
 /*
  * vop_setacl pre-hook: enter a seqc write section on the vnode (a_vp).
  */
 void
 vop_setacl_pre(void *ap)
 {
 	struct vop_setacl_args *args = ap;
 
 	vn_seqc_write_begin(args->a_vp);
 }
 
 /*
  * vop_setacl post-hook: leave the seqc write section on a_vp.  No knote is
  * posted.
  */
 void
 vop_setacl_post(void *ap, int rc __unused)
 {
 	struct vop_setacl_args *args = ap;
 
 	vn_seqc_write_end(args->a_vp);
 }
 
 /*
  * vop_setextattr pre-hook: enter a seqc write section on the vnode (a_vp).
  */
 void
 vop_setextattr_pre(void *ap)
 {
 	struct vop_setextattr_args *args = ap;
 
 	vn_seqc_write_begin(args->a_vp);
 }
 
 /*
  * vop_setextattr post-hook: leave the seqc write section on a_vp and, if the
  * operation succeeded (rc == 0), post NOTE_ATTRIB on it.
  */
 void
 vop_setextattr_post(void *ap, int rc)
 {
 	struct vop_setextattr_args *args = ap;
 	struct vnode *node = args->a_vp;
 
 	vn_seqc_write_end(node);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(node, NOTE_ATTRIB);
 }
 
 /*
  * vop_symlink pre-hook: enter a seqc write section on the parent directory
  * vnode (a_dvp).
  */
 void
 vop_symlink_pre(void *ap)
 {
 	struct vop_symlink_args *args = ap;
 
 	vn_seqc_write_begin(args->a_dvp);
 }
 
 /*
  * vop_symlink post-hook: leave the seqc write section on a_dvp and, if the
  * operation succeeded (rc == 0), post NOTE_WRITE on it.
  */
 void
 vop_symlink_post(void *ap, int rc)
 {
 	struct vop_symlink_args *args = ap;
 	struct vnode *dir = args->a_dvp;
 
 	vn_seqc_write_end(dir);
 	if (rc == 0)
 		VFS_KNOTE_LOCKED(dir, NOTE_WRITE);
 }
 
 /*
  * vop_open post-hook: post NOTE_OPEN on the vnode when the open succeeded
  * (rc == 0).
  */
 void
 vop_open_post(void *ap, int rc)
 {
 	struct vop_open_args *args = ap;
 
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(args->a_vp, NOTE_OPEN);
 }
 
 /*
  * vop_close post-hook: after a successful close post NOTE_CLOSE_WRITE when
  * the file was open for writing (FWRITE in a_fflag) and NOTE_CLOSE
  * otherwise.  Closes carrying NOCRED on a doomed vnode are not reported.
  */
 void
 vop_close_post(void *ap, int rc)
 {
 	struct vop_close_args *args = ap;
 	int note;
 
 	if (rc != 0)
 		return;
 	/* filter out revokes */
 	if (args->a_cred == NOCRED && VN_IS_DOOMED(args->a_vp))
 		return;
 
 	note = (args->a_fflag & FWRITE) != 0 ? NOTE_CLOSE_WRITE : NOTE_CLOSE;
 	VFS_KNOTE_LOCKED(args->a_vp, note);
 }
 
 /*
  * vop_read post-hook: post NOTE_READ on the vnode when the read succeeded
  * (rc == 0).
  */
 void
 vop_read_post(void *ap, int rc)
 {
 	struct vop_read_args *args = ap;
 
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(args->a_vp, NOTE_READ);
 }
 
 /*
  * vop_readdir post-hook: post NOTE_READ on the directory vnode when the
  * operation succeeded (rc == 0).
  */
 void
 vop_readdir_post(void *ap, int rc)
 {
 	struct vop_readdir_args *args = ap;
 
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(args->a_vp, NOTE_READ);
 }
 
 /*
  * Single, file-global knote list: filt_fsattach() adds knotes to it and
  * vfs_event_signal() posts events on it.
  */
 static struct knlist fs_knlist;
 
 /*
  * SYSINIT callback that initializes fs_knlist, passing NULL as the mutex
  * argument of knlist_init_mtx().  'arg' is unused.
  */
 static void
 vfs_event_init(void *arg)
 {
 	knlist_init_mtx(&fs_knlist, NULL);
 }
 /* XXX - correct order? */
 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
 
 /*
  * Post 'event' as the hint to every knote on fs_knlist.  Neither 'fsid' nor
  * 'data' is consulted, so listeners cannot tell which filesystem the event
  * refers to.
  */
 void
 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
 {
 
 	KNOTE_UNLOCKED(&fs_knlist, event);
 }
 
 static int	filt_fsattach(struct knote *kn);
 static void	filt_fsdetach(struct knote *kn);
 static int	filt_fsevent(struct knote *kn, long hint);
 
 /*
  * Filter operations for the filesystem event list (fs_knlist).  f_isfd is 0:
  * these knotes are not tied to a file descriptor.
  */
 struct filterops fs_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_fsattach,
 	.f_detach = filt_fsdetach,
 	.f_event = filt_fsevent
 };
 
 /*
  * Attach a knote to fs_knlist.  EV_CLEAR is forced on the knote.  Always
  * returns 0.
  */
 static int
 filt_fsattach(struct knote *kn)
 {
 
 	kn->kn_flags |= EV_CLEAR;
 	knlist_add(&fs_knlist, kn, 0);
 	return (0);
 }
 
 /*
  * Detach a knote from fs_knlist.
  */
 static void
 filt_fsdetach(struct knote *kn)
 {
 
 	knlist_remove(&fs_knlist, kn, 0);
 }
 
 /*
  * Event callback for fs_knlist knotes: accumulate 'hint' into kn_fflags and
  * report the knote as active whenever any flag is set.
  */
 static int
 filt_fsevent(struct knote *kn, long hint)
 {
 
 	kn->kn_fflags |= hint;
 	return (kn->kn_fflags != 0);
 }
 
 /*
  * Handler for the write-only vfs.ctl sysctl ("Sysctl by fsid").
  *
  * The caller writes a struct vfsidctl.  The mount is looked up by its fsid
  * and the request is forwarded to that filesystem through VFS_SYSCTL().
  *
  * Returns the SYSCTL_IN() error, EINVAL for an unknown structure version or
  * a filesystem type name that neither is "*" nor matches the mount, ENOENT
  * when no mount has the given fsid, or whatever VFS_SYSCTL() returns.
  */
 static int
 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
 {
 	struct vfsidctl vc;
 	struct mount *mp;
 	const char *fstype;
 	int error;
 
 	error = SYSCTL_IN(req, &vc, sizeof(vc));
 	if (error != 0)
 		return (error);
 	if (vc.vc_vers != VFS_CTL_VERS1)
 		return (EINVAL);
 
 	mp = vfs_getvfs(&vc.vc_fsid);
 	if (mp == NULL)
 		return (ENOENT);
 
 	/* ensure that a specific sysctl goes to the right filesystem. */
 	fstype = vc.vc_fstypename;
 	if (strcmp(fstype, "*") == 0 ||
 	    strcmp(fstype, mp->mnt_vfc->vfc_name) == 0) {
 		VCTLTOREQ(&vc, req);
 		error = VFS_SYSCTL(mp, vc.vc_op, req);
 	} else {
 		error = EINVAL;
 	}
 
 	/* Drop the reference obtained from vfs_getvfs(). */
 	vfs_rel(mp);
 	return (error);
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR,
     NULL, 0, sysctl_vfs_ctl, "",
     "Sysctl by fsid");
 
 /*
  * Function to initialize a va_filerev field sensibly.
  * XXX: Wouldn't a random number make a lot more sense ??
  */
 u_quad_t
 init_va_filerev(void)
 {
 	struct bintime bt;
 
 	getbinuptime(&bt);
 	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
 }
 
 static int	filt_vfsread(struct knote *kn, long hint);
 static int	filt_vfswrite(struct knote *kn, long hint);
 static int	filt_vfsvnode(struct knote *kn, long hint);
 static void	filt_vfsdetach(struct knote *kn);
 /*
  * Filter operation tables installed by vfs_kqfilter() for EVFILT_READ,
  * EVFILT_WRITE and EVFILT_VNODE respectively.  None of them sets f_attach;
  * vfs_kqfilter() itself adds the knote to the vnode's list.  All three share
  * filt_vfsdetach().
  */
 static struct filterops vfsread_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_vfsdetach,
 	.f_event = filt_vfsread
 };
 static struct filterops vfswrite_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_vfsdetach,
 	.f_event = filt_vfswrite
 };
 static struct filterops vfsvnode_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_vfsdetach,
 	.f_event = filt_vfsvnode
 };
 
 /*
  * Lock callback operating on a vnode passed as 'arg': take the vnode lock
  * exclusively, retrying as needed (LK_RETRY).
  */
 static void
 vfs_knllock(void *arg)
 {
 	struct vnode *node;
 
 	node = arg;
 	vn_lock(node, LK_EXCLUSIVE | LK_RETRY);
 }
 
 /*
  * Unlock callback operating on a vnode passed as 'arg': release the vnode
  * lock.
  */
 static void
 vfs_knlunlock(void *arg)
 {
 	struct vnode *node;
 
 	node = arg;
 	VOP_UNLOCK(node);
 }
 
 /*
  * Assert that the vnode passed as 'arg' is locked.  Compiles to an empty
  * function unless DEBUG_VFS_LOCKS is defined.
  */
 static void
 vfs_knl_assert_locked(void *arg)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vnode *vp = arg;
 
 	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
 #endif
 }
 
 /*
  * Assert that the vnode passed as 'arg' is unlocked.  Compiles to an empty
  * function unless DEBUG_VFS_LOCKS is defined.
  */
 static void
 vfs_knl_assert_unlocked(void *arg)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vnode *vp = arg;
 
 	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
 #endif
 }
 
 /*
  * Generic kqfilter implementation for vnodes.
  *
  * Selects the filter operations matching kn_filter (read, write or vnode
  * events), remembers the vnode in kn_hook, makes sure the vnode has poll
  * info and finally adds the knote to the vnode's note list while taking a
  * hold reference on the vnode (dropped again in filt_vfsdetach()).
  *
  * Returns 0 on success, EINVAL for an unsupported filter and ENOMEM when no
  * poll info is available after v_addpollinfo().
  */
 int
 vfs_kqfilter(struct vop_kqfilter_args *ap)
 {
 	struct vnode *vp;
 	struct knote *kn;
 
 	vp = ap->a_vp;
 	kn = ap->a_kn;
 
 	if (kn->kn_filter == EVFILT_READ)
 		kn->kn_fop = &vfsread_filtops;
 	else if (kn->kn_filter == EVFILT_WRITE)
 		kn->kn_fop = &vfswrite_filtops;
 	else if (kn->kn_filter == EVFILT_VNODE)
 		kn->kn_fop = &vfsvnode_filtops;
 	else
 		return (EINVAL);
 
 	kn->kn_hook = (caddr_t)vp;
 
 	v_addpollinfo(vp);
 	if (vp->v_pollinfo == NULL)
 		return (ENOMEM);
 
 	vhold(vp);
 	knlist_add(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
 
 	return (0);
 }
 
 /*
  * Detach knote from vnode: remove it from the vnode's note list and drop the
  * hold reference taken in vfs_kqfilter().
  */
 static void
 filt_vfsdetach(struct knote *kn)
 {
 	struct vnode *node;
 
 	node = (struct vnode *)kn->kn_hook;
 	KASSERT(node->v_pollinfo != NULL, ("Missing v_pollinfo"));
 	knlist_remove(&node->v_pollinfo->vpi_selinfo.si_note, kn, 0);
 	vdrop(node);
 }
 
 /*ARGSUSED*/
 static int
 filt_vfsread(struct knote *kn, long hint)
 {
 	struct vnode *vp = (struct vnode *)kn->kn_hook;
 	struct vattr va;
 	int res;
 
 	/*
 	 * filesystem is gone, so set the EOF flag and schedule
 	 * the knote for deletion.
 	 */
 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
 		VI_LOCK(vp);
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 		VI_UNLOCK(vp);
 		return (1);
 	}
 
 	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
 		return (0);
 
 	VI_LOCK(vp);
 	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
 	res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
 	VI_UNLOCK(vp);
 	return (res);
 }
 
 /*
  * EVFILT_WRITE event callback for vnodes: the knote is always reported as
  * active with kn_data set to 0.
  */
 /*ARGSUSED*/
 static int
 filt_vfswrite(struct knote *kn, long hint)
 {
 	struct vnode *vp;
 	int gone;
 
 	vp = (struct vnode *)kn->kn_hook;
 
 	VI_LOCK(vp);
 
 	/*
 	 * filesystem is gone, so set the EOF flag and schedule
 	 * the knote for deletion.
 	 */
 	gone = (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD));
 	if (gone)
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 
 	kn->kn_data = 0;
 	VI_UNLOCK(vp);
 	return (1);
 }
 
 /*
  * EVFILT_VNODE event callback.
  *
  * A hint the listener subscribed to (kn_sfflags) is recorded in kn_fflags.
  * On revocation, or when polled (hint == 0) on a VBAD vnode, EV_EOF is set
  * and the knote is active; otherwise it is active when any recorded flag is
  * pending.
  */
 static int
 filt_vfsvnode(struct knote *kn, long hint)
 {
 	struct vnode *vp;
 	int active;
 
 	vp = (struct vnode *)kn->kn_hook;
 
 	VI_LOCK(vp);
 	if (kn->kn_sfflags & hint)
 		kn->kn_fflags |= hint;
 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
 		kn->kn_flags |= EV_EOF;
 		active = 1;
 	} else {
 		active = (kn->kn_fflags != 0);
 	}
 	VI_UNLOCK(vp);
 	return (active);
 }
 
 /*
  * Returns whether the directory is empty or not.
  * If it is empty, the return value is 0; otherwise
  * the return value is an error value (which may
  * be ENOTEMPTY).
  *
  * Whiteout entries and entries with an empty name are ignored.  Every other
  * entry must be a directory (or of unknown type) named exactly "." or "..";
  * anything else makes the directory non-empty.  The caller must hold the
  * vnode lock.
  */
 int
 vfs_emptydir(struct vnode *vp)
 {
 	struct uio uio;
 	struct iovec iov;
 	struct dirent *dirent, *dp, *endp;
 	int error, eof;
 
 	error = 0;
 	eof = 0;
 
 	ASSERT_VOP_LOCKED(vp, "vfs_emptydir");
 
 	dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK);
 
 	/* Fields that stay constant (or carry state) across the passes. */
 	uio.uio_offset = 0;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_READ;
 	uio.uio_td = curthread;
 
 	while (eof == 0 && error == 0) {
 		/*
 		 * Hand the whole buffer to VOP_READDIR on every pass.  The
 		 * previous call consumed the iovec, so the base, the lengths
 		 * and the residual count must all be rewound together;
 		 * only uio_offset (the directory position) is carried over.
 		 */
 		iov.iov_base = dirent;
 		iov.iov_len = sizeof(struct dirent);
 		uio.uio_iov = &iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_resid = sizeof(struct dirent);
 
 		error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof,
 		    NULL, NULL);
 		if (error != 0)
 			break;
 		/* End of the data actually produced by this pass. */
 		endp = (void *)((uint8_t *)dirent +
 		    sizeof(struct dirent) - uio.uio_resid);
 		for (dp = dirent; dp < endp;
 		     dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) {
 			if (dp->d_type == DT_WHT)
 				continue;
 			if (dp->d_namlen == 0)
 				continue;
 			if (dp->d_type != DT_DIR &&
 			    dp->d_type != DT_UNKNOWN) {
 				error = ENOTEMPTY;
 				break;
 			}
 			if (dp->d_namlen > 2) {
 				error = ENOTEMPTY;
 				break;
 			}
 			if (dp->d_namlen == 1 &&
 			    dp->d_name[0] != '.') {
 				error = ENOTEMPTY;
 				break;
 			}
 			/* Only "..", not any two-character name ending in '.'. */
 			if (dp->d_namlen == 2 &&
 			    (dp->d_name[0] != '.' || dp->d_name[1] != '.')) {
 				error = ENOTEMPTY;
 				break;
 			}
 		}
 	}
 	free(dirent, M_TEMP);
 	return (error);
 }
 
 /*
  * Helper for readdir implementations: copy one directory entry 'dp' to the
  * caller's uio and, when the caller asked for cookies, append 'off' to the
  * cookie array.
  *
  * Returns ENAMETOOLONG when the entry does not fit in the remaining space,
  * the uiomove() error when the copy fails, and 0 otherwise.  On a copy
  * failure the cookie array accumulated so far is freed, the caller's array
  * pointer is reset to NULL and the cookie count to 0.
  */
 int
 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
 {
 	int error;
 
 	if (dp->d_reclen > ap->a_uio->uio_resid)
 		return (ENAMETOOLONG);
 	error = uiomove(dp, dp->d_reclen, ap->a_uio);
 	if (error) {
 		if (ap->a_ncookies != NULL) {
 			/*
 			 * a_cookies points at the caller's array pointer; it
 			 * is the array itself that was allocated (by the
 			 * realloc() below) and must be released, not the
 			 * location holding the pointer.
 			 */
 			if (ap->a_cookies != NULL) {
 				free(*ap->a_cookies, M_TEMP);
 				*ap->a_cookies = NULL;
 			}
 			*ap->a_ncookies = 0;
 		}
 		return (error);
 	}
 	if (ap->a_ncookies == NULL)
 		return (0);
 
 	KASSERT(ap->a_cookies,
 	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
 
 	/* Grow the array by one slot and record this entry's offset. */
 	*ap->a_cookies = realloc(*ap->a_cookies,
 	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
 	(*ap->a_cookies)[*ap->a_ncookies] = off;
 	*ap->a_ncookies += 1;
 	return (0);
 }
 
 /*
  * The purpose of this routine is to remove granularity from accmode_t,
  * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
  * VADMIN and VAPPEND.
  *
  * If it returns 0, the caller is supposed to continue with the usual
  * access checks using 'accmode' as modified by this routine.  If it
  * returns nonzero value, the caller is supposed to return that value
  * as errno.
  *
  * Note that after this routine runs, accmode may be zero.
  */
 int
 vfs_unixify_accmode(accmode_t *accmode)
 {
 	accmode_t mode;
 
 	mode = *accmode;
 
 	/*
 	 * There is no way to specify explicit "deny" rule using
 	 * file mode or POSIX.1e ACLs.
 	 */
 	if ((mode & VEXPLICIT_DENY) != 0) {
 		*accmode = 0;
 		return (0);
 	}
 
 	/*
 	 * None of these can be translated into usual access bits.
 	 * Also, the common case for NFSv4 ACLs is to not contain
 	 * either of these bits. Caller should check for VWRITE
 	 * on the containing directory instead.  The caller's mode is
 	 * left untouched in this case.
 	 */
 	if ((mode & (VDELETE_CHILD | VDELETE)) != 0)
 		return (EPERM);
 
 	/* Collapse every administrative permission into plain VADMIN. */
 	if ((mode & VADMIN_PERMS) != 0)
 		mode = (mode & ~VADMIN_PERMS) | VADMIN;
 
 	/*
 	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
 	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
 	 */
 	mode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
 
 	*accmode = mode;
 	return (0);
 }
 
 /*
  * Clear out a doomed vnode (if any) and replace it with a new one as long
  * as the fs is not being unmounted. Return the root vnode to the caller.
  *
  * Slow path of vfs_cache_root().  Returns 0 with *vpp set, or the error
  * reported by VFS_CACHEDROOT().
  */
 static int __noinline
 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct vnode *vp;
 	int error;
 
 restart:
 	if (mp->mnt_rootvnode != NULL) {
 		/* Unlocked peek above; look again under the mount interlock. */
 		MNT_ILOCK(mp);
 		vp = mp->mnt_rootvnode;
 		if (vp != NULL) {
 			if (!VN_IS_DOOMED(vp)) {
 				/*
 				 * The cached vnode is usable: reference it,
 				 * then lock it without the interlock held.
 				 * If locking fails start over.
 				 */
 				vrefact(vp);
 				MNT_IUNLOCK(mp);
 				error = vn_lock(vp, flags);
 				if (error == 0) {
 					*vpp = vp;
 					return (0);
 				}
 				vrele(vp);
 				goto restart;
 			}
 			/*
 			 * Clear the old one.
 			 */
 			mp->mnt_rootvnode = NULL;
 		}
 		MNT_IUNLOCK(mp);
 		if (vp != NULL) {
 			/*
 			 * A doomed vnode was uncached; drop the cache's
 			 * reference once vfs_op_barrier_wait() returns.
 			 */
 			vfs_op_barrier_wait(mp);
 			vrele(vp);
 		}
 	}
 	error = VFS_CACHEDROOT(mp, flags, vpp);
 	if (error != 0)
 		return (error);
 	/*
 	 * Cache the result only while mnt_vfs_ops is 0; the counter is
 	 * rechecked once the interlock is held.
 	 */
 	if (mp->mnt_vfs_ops == 0) {
 		MNT_ILOCK(mp);
 		if (mp->mnt_vfs_ops != 0) {
 			MNT_IUNLOCK(mp);
 			return (0);
 		}
 		if (mp->mnt_rootvnode == NULL) {
 			/* The cache holds its own reference to the vnode. */
 			vrefact(*vpp);
 			mp->mnt_rootvnode = *vpp;
 		} else {
 			/*
 			 * Somebody else cached a vnode meanwhile; it has to
 			 * be the same one unless it is already doomed.
 			 */
 			if (mp->mnt_rootvnode != *vpp) {
 				if (!VN_IS_DOOMED(mp->mnt_rootvnode)) {
 					panic("%s: mismatch between vnode returned "
 					    " by VFS_CACHEDROOT and the one cached "
 					    " (%p != %p)",
 					    __func__, *vpp, mp->mnt_rootvnode);
 				}
 			}
 		}
 		MNT_IUNLOCK(mp);
 	}
 	return (0);
 }
 
 /*
  * Return the root vnode of 'mp' in *vpp, locked according to 'flags'.
  *
  * Fast path: inside a vfs_op_thread_enter()/exit() section the cached root
  * vnode is read and referenced without taking the mount interlock.  The
  * slow path, vfs_cache_root_fallback(), is used when the section cannot be
  * entered, when no usable (non-doomed) vnode is cached, or when locking the
  * vnode fails.
  */
 int
 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct vnode *vp;
 	int error;
 
 	if (!vfs_op_thread_enter(mp))
 		return (vfs_cache_root_fallback(mp, flags, vpp));
 	vp = atomic_load_ptr(&mp->mnt_rootvnode);
 	if (vp == NULL || VN_IS_DOOMED(vp)) {
 		vfs_op_thread_exit(mp);
 		return (vfs_cache_root_fallback(mp, flags, vpp));
 	}
 	vrefact(vp);
 	vfs_op_thread_exit(mp);
 	/* The lock is taken outside the section. */
 	error = vn_lock(vp, flags);
 	if (error != 0) {
 		vrele(vp);
 		return (vfs_cache_root_fallback(mp, flags, vpp));
 	}
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Detach the cached root vnode from 'mp' and hand it to the caller (NULL
  * when nothing was cached).  A seqc write section is entered on the returned
  * vnode; the cache's reference is not dropped here.
  */
 struct vnode *
 vfs_cache_root_clear(struct mount *mp)
 {
 	struct vnode *oldroot;
 
 	/*
 	 * ops > 0 guarantees there is nobody who can see this vnode
 	 */
 	MPASS(mp->mnt_vfs_ops > 0);
 
 	oldroot = mp->mnt_rootvnode;
 	if (oldroot != NULL)
 		vn_seqc_write_begin(oldroot);
 	mp->mnt_rootvnode = NULL;
 	return (oldroot);
 }
 
 /*
  * Install 'vp' as the cached root vnode of 'mp', taking a reference on it
  * for the cache.  Asserts that mnt_vfs_ops is positive.
  */
 void
 vfs_cache_root_set(struct mount *mp, struct vnode *vp)
 {
 
 	MPASS(mp->mnt_vfs_ops > 0);
 	vrefact(vp);
 	mp->mnt_rootvnode = vp;
 }
 
 /*
  * These are helper functions for filesystems to traverse all
  * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
  *
  * This interface replaces MNT_VNODE_FOREACH.
  */
 
 /*
  * Advance an iteration over all vnodes of 'mp'.
  *
  * Scans forward from the marker *mvp for the next vnode that is neither a
  * marker nor doomed.  That vnode is returned with its interlock held and the
  * marker is moved right behind it.  When the list is exhausted the marker is
  * freed through __mnt_vnode_markerfree_all() and NULL is returned.  The
  * mount interlock is not held on return.  The thread may yield the CPU
  * before the scan.
  */
 struct vnode *
 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
 {
 	struct vnode *vp;
 
 	if (should_yield())
 		kern_yield(PRI_USER);
 	MNT_ILOCK(mp);
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 	for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
 	    vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
 		/* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
 		if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
 			continue;
 		VI_LOCK(vp);
 		/* Recheck now that the interlock is held. */
 		if (VN_IS_DOOMED(vp)) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		break;
 	}
 	if (vp == NULL) {
 		__mnt_vnode_markerfree_all(mvp, mp);
 		/* MNT_IUNLOCK(mp); -- done in above function */
 		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
 		return (NULL);
 	}
 	/* Park the marker right after the vnode being returned. */
 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
 	MNT_IUNLOCK(mp);
 	return (vp);
 }
 
 /*
  * Start an iteration over all vnodes of 'mp'.
  *
  * Allocates the marker vnode (stored in *mvp), takes a mount reference and
  * returns the first vnode that is neither a marker nor doomed, with its
  * interlock held and the marker inserted right behind it.  When there is no
  * such vnode the reference is dropped, the marker is freed, *mvp is set to
  * NULL and NULL is returned.  The mount interlock is not held on return.
  */
 struct vnode *
 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
 {
 	struct vnode *vp;
 
 	*mvp = vn_alloc_marker(mp);
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 
 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 		/* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
 		if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
 			continue;
 		VI_LOCK(vp);
 		/* Recheck now that the interlock is held. */
 		if (VN_IS_DOOMED(vp)) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		break;
 	}
 	if (vp == NULL) {
 		/* Nothing to iterate over; the marker was never linked in. */
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		vn_free_marker(*mvp);
 		*mvp = NULL;
 		return (NULL);
 	}
 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
 	MNT_IUNLOCK(mp);
 	return (vp);
 }
 
 /*
  * Finish (or abort) an iteration over all vnodes of 'mp'.
  *
  * Must be called with the mount interlock held; it is released on return in
  * every case.  When a marker is still present it is unlinked from the vnode
  * list, the mount reference taken at the start of the iteration is dropped,
  * the marker is freed and *mvp is set to NULL.
  */
 void
 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
 {
 
 	if (*mvp == NULL) {
 		/* The iteration already cleaned up after itself. */
 		MNT_IUNLOCK(mp);
 		return;
 	}
 
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 	vn_free_marker(*mvp);
 	*mvp = NULL;
 }
 
 /*
  * These are helper functions for filesystems to traverse their
  * lazy vnodes.  See MNT_VNODE_FOREACH_LAZY() in sys/mount.h
  */
 /*
  * Common tail of the lazy-list iteration: drop the mount reference (under
  * the mount interlock), free the marker and set *mvp to NULL.  The marker is
  * not unlinked from the lazy list here; callers do that beforehand.
  */
 static void
 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
 {
 
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 	vn_free_marker(*mvp);
 	*mvp = NULL;
 }
 
 /*
  * Relock the mp mount vnode list lock with the vp vnode interlock in the
  * conventional lock order during mnt_vnode_next_lazy iteration.
  *
  * On entry, the mount vnode list lock is held and the vnode interlock is not.
  * The list lock is dropped and reacquired.  On success, both locks are held.
  * On failure, the mount vnode list lock is held but the vnode interlock is
  * not, and the procedure may have yielded.
  *
  * Before the list lock is dropped the marker is moved to just before vp,
  * and vp is held so that it cannot be freed while no lock is held.
  */
 static bool
 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp,
     struct vnode *vp)
 {
 
 	VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
 	    TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp,
 	    ("%s: bad marker", __func__));
 	VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
 	    ("%s: inappropriate vnode", __func__));
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
 
 	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist);
 	TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist);
 
 	/*
 	 * Note we may be racing against vdrop which transitioned the hold
 	 * count to 0 and now waits for the ->mnt_listmtx lock. This is fine,
 	 * if we are the only user after we get the interlock we will just
 	 * vdrop.
 	 */
 	vhold(vp);
 	mtx_unlock(&mp->mnt_listmtx);
 	VI_LOCK(vp);
 	if (VN_IS_DOOMED(vp)) {
 		VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
 		goto out_lost;
 	}
 	VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
 	/*
 	 * There is nothing to do if we are the last user.
 	 */
 	if (!refcount_release_if_not_last(&vp->v_holdcnt))
 		goto out_lost;
 	mtx_lock(&mp->mnt_listmtx);
 	return (true);
 out_lost:
 	/* vdropl() is entered with the interlock held. */
 	vdropl(vp);
 	maybe_yield();
 	mtx_lock(&mp->mnt_listmtx);
 	return (false);
 }
 
 /*
  * Core of the lazy-list iteration.
  *
  * Called with mnt_listmtx held; the lock is released before returning.
  * Scans forward from the marker *mvp, skipping other markers and vnodes for
  * which cb() returns false.  The first accepted vnode is returned with its
  * interlock held and the marker re-inserted right behind it.  When the list
  * is exhausted the marker is freed and NULL is returned.
  */
 static struct vnode *
 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
     void *cbarg)
 {
 	struct vnode *vp;
 
 	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 restart:
 	vp = TAILQ_NEXT(*mvp, v_lazylist);
 	while (vp != NULL) {
 		if (vp->v_type == VMARKER) {
 			vp = TAILQ_NEXT(vp, v_lazylist);
 			continue;
 		}
 		/*
 		 * See if we want to process the vnode. Note we may encounter a
 		 * long string of vnodes we don't care about and hog the list
 		 * as a result. Check for it and requeue the marker.
 		 */
 		VNPASS(!VN_IS_DOOMED(vp), vp);
 		if (!cb(vp, cbarg)) {
 			if (!should_yield()) {
 				vp = TAILQ_NEXT(vp, v_lazylist);
 				continue;
 			}
 			/*
 			 * Remember the position behind the rejected vnode,
 			 * yield without the list lock and rescan from there.
 			 */
 			TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp,
 			    v_lazylist);
 			TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp,
 			    v_lazylist);
 			mtx_unlock(&mp->mnt_listmtx);
 			kern_yield(PRI_USER);
 			mtx_lock(&mp->mnt_listmtx);
 			goto restart;
 		}
 		/*
 		 * Try-lock because this is the wrong lock order.
 		 */
 		if (!VI_TRYLOCK(vp) &&
 		    !mnt_vnode_next_lazy_relock(*mvp, mp, vp))
 			goto restart;
 		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
 		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
 		    ("alien vnode on the lazy list %p %p", vp, mp));
 		VNPASS(vp->v_mount == mp, vp);
 		VNPASS(!VN_IS_DOOMED(vp), vp);
 		break;
 	}
 	/* The marker is unlinked here and re-inserted below if needed. */
 	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
 
 	/* Check if we are done */
 	if (vp == NULL) {
 		mtx_unlock(&mp->mnt_listmtx);
 		mnt_vnode_markerfree_lazy(mvp, mp);
 		return (NULL);
 	}
 	TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist);
 	mtx_unlock(&mp->mnt_listmtx);
 	ASSERT_VI_LOCKED(vp, "lazy iter");
 	return (vp);
 }
 
 /*
  * Advance a lazy-list iteration: possibly yield the CPU, take the list lock
  * and continue in mnt_vnode_next_lazy(), which releases the lock again.
  */
 struct vnode *
 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
     void *cbarg)
 {
 
 	if (should_yield())
 		kern_yield(PRI_USER);
 	mtx_lock(&mp->mnt_listmtx);
 	return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
 }
 
 /*
  * Start an iteration over the lazy vnode list of 'mp'.
  *
  * Returns NULL right away, leaving *mvp untouched, when the list looks empty
  * in an unlocked check.  Otherwise a marker is allocated, a mount reference
  * is taken, the marker is inserted in front of the first list entry and the
  * scan continues in mnt_vnode_next_lazy().
  */
 struct vnode *
 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
     void *cbarg)
 {
 	struct vnode *vp;
 
 	if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist))
 		return (NULL);
 
 	*mvp = vn_alloc_marker(mp);
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 	MNT_IUNLOCK(mp);
 
 	mtx_lock(&mp->mnt_listmtx);
 	vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist);
 	if (vp == NULL) {
 		/* The list emptied since the unlocked check above. */
 		mtx_unlock(&mp->mnt_listmtx);
 		mnt_vnode_markerfree_lazy(mvp, mp);
 		return (NULL);
 	}
 	TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist);
 	return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
 }
 
 /*
  * Abort a lazy-list iteration early.  Does nothing when no marker is present
  * (*mvp == NULL); otherwise unlinks the marker from the lazy list under the
  * list lock and releases it through mnt_vnode_markerfree_lazy().
  */
 void
 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
 {
 
 	if (*mvp == NULL)
 		return;
 
 	mtx_lock(&mp->mnt_listmtx);
 	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
 	mtx_unlock(&mp->mnt_listmtx);
 	mnt_vnode_markerfree_lazy(mvp, mp);
 }
 
 /*
  * Check VEXEC access on directory 'vp' for the lookup described by 'cnp'.
  * When NOEXECCHECK is set in cn_flags the check is skipped once: the flag
  * is cleared and 0 is returned.
  */
 int
 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp)
 {
 
 	if ((cnp->cn_flags & NOEXECCHECK) == 0)
 		return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread));
 
 	cnp->cn_flags &= ~NOEXECCHECK;
 	return (0);
 }
 
 /*
  * Do not use this variant unless you have means other than the hold count
  * to prevent the vnode from getting freed.
  *
  * Registers one more seqc writer on 'vp' (interlock held by the caller); the
  * sequence counter itself is only moved by the first writer.
  */
 void
 vn_seqc_write_begin_unheld_locked(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNPASS(vp->v_seqc_users >= 0, vp);
 	if (++vp->v_seqc_users == 1)
 		seqc_sleepable_write_begin(&vp->v_seqc);
 }
 
 /*
  * Enter a seqc write section on 'vp'.  The caller holds the vnode interlock
  * and the vnode must be held (v_holdcnt > 0).
  */
 void
 vn_seqc_write_begin_locked(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNPASS(vp->v_holdcnt > 0, vp);
 	vn_seqc_write_begin_unheld_locked(vp);
 }
 
 /*
  * Enter a seqc write section on 'vp', taking and releasing the vnode
  * interlock around the update.  The vnode must be held.
  */
 void
 vn_seqc_write_begin(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	vn_seqc_write_begin_locked(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Like vn_seqc_write_begin(), but without the check that the vnode is held;
  * see vn_seqc_write_begin_unheld_locked() for when this is acceptable.
  */
 void
 vn_seqc_write_begin_unheld(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	vn_seqc_write_begin_unheld_locked(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Unregister one seqc writer from 'vp' (interlock held by the caller); the
  * write section on the sequence counter ends when the last writer leaves.
  */
 void
 vn_seqc_write_end_locked(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNPASS(vp->v_seqc_users > 0, vp);
 	if (--vp->v_seqc_users == 0)
 		seqc_sleepable_write_end(&vp->v_seqc);
 }
 
 /*
  * Leave a seqc write section on 'vp', taking and releasing the vnode
  * interlock around the update.
  */
 void
 vn_seqc_write_end(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	vn_seqc_write_end_locked(vp);
 	VI_UNLOCK(vp);
 }
Index: projects/clang1100-import/sys/modules/nvd/Makefile
===================================================================
--- projects/clang1100-import/sys/modules/nvd/Makefile	(revision 364278)
+++ projects/clang1100-import/sys/modules/nvd/Makefile	(revision 364279)
@@ -1,8 +1,8 @@
 # $FreeBSD$
 
 .PATH: ${SRCTOP}/sys/dev/nvd
 
 KMOD=	nvd
-SRCS=	nvd.c opt_geom.h device_if.h bus_if.h
+SRCS=	nvd.c opt_geom.h device_if.h bus_if.h pci_if.h
 
 .include <bsd.kmod.mk>
Index: projects/clang1100-import/sys/modules/usb/cp2112/Makefile
===================================================================
--- projects/clang1100-import/sys/modules/usb/cp2112/Makefile	(revision 364278)
+++ projects/clang1100-import/sys/modules/usb/cp2112/Makefile	(revision 364279)
@@ -1,37 +1,37 @@
 #
 # $FreeBSD$
 #
 # Copyright (c) Andriy Gapon <avg@FreeBSD.org>
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 # 1. Redistributions of source code must retain the above copyright
 #    notice, this list of conditions and the following disclaimer.
 # 2. Redistributions in binary form must reproduce the above copyright
 #    notice, this list of conditions and the following disclaimer in the
 #    documentation and/or other materials provided with the distribution.
 #
 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 #
 
 S=     ${SRCTOP}/sys
 
 .PATH: $S/dev/usb/misc
 
 KMOD=	cp2112
 SRCS=	cp2112.c
-SRCS+=	opt_bus.h opt_usb.h
+SRCS+=	opt_bus.h opt_platform.h opt_usb.h
 SRCS+=	device_if.h bus_if.h gpio_if.h iicbus_if.h usb_if.h usbdevs.h
 
 .include <bsd.kmod.mk>
Index: projects/clang1100-import/sys/netinet/sctp_input.c
===================================================================
--- projects/clang1100-import/sys/netinet/sctp_input.c	(revision 364278)
+++ projects/clang1100-import/sys/netinet/sctp_input.c	(revision 364279)
@@ -1,5809 +1,5808 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
  * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * a) Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
  *
  * b) Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the distribution.
  *
  * c) Neither the name of Cisco Systems, Inc. nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <netinet/sctp_os.h>
 #include <netinet/sctp_var.h>
 #include <netinet/sctp_sysctl.h>
 #include <netinet/sctp_pcb.h>
 #include <netinet/sctp_header.h>
 #include <netinet/sctputil.h>
 #include <netinet/sctp_output.h>
 #include <netinet/sctp_input.h>
 #include <netinet/sctp_auth.h>
 #include <netinet/sctp_indata.h>
 #include <netinet/sctp_asconf.h>
 #include <netinet/sctp_bsd_addr.h>
 #include <netinet/sctp_timer.h>
 #include <netinet/sctp_crc32.h>
 #include <netinet/sctp_kdtrace.h>
 #if defined(INET) || defined(INET6)
 #include <netinet/udp.h>
 #endif
 #include <sys/smp.h>
 
 /*
  * Stop the retransmission timer of every destination of the association
  * whose timer is currently of type COOKIE or INIT.  The TCB lock must be
  * held (asserted below).
  */
 static void
 sctp_stop_all_cookie_timers(struct sctp_tcb *stcb)
 {
 	struct sctp_nets *net;
 
 	/*
 	 * Besides the cookie timers this also stops any INIT timers, so
 	 * that the timers are stopped in all collision cases.
 	 */
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 		switch (net->rxt_timer.type) {
 		case SCTP_TIMER_TYPE_COOKIE:
 			sctp_timer_stop(SCTP_TIMER_TYPE_COOKIE, stcb->sctp_ep,
 			    stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_1);
 			break;
 		case SCTP_TIMER_TYPE_INIT:
 			sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep,
 			    stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_2);
 			break;
 		default:
 			/* Any other timer type is left running. */
 			break;
 		}
 	}
 }
 
 /*
  * INIT handler.
  *
  * Validates a received INIT chunk and answers it.  If no association
  * (stcb) exists yet, the endpoint is read-locked for the duration of the
  * call and unlocked again at "outnow".
  *
  * Every validation failure (short chunk, zero initiate tag, a_rwnd below
  * SCTP_MIN_RWND, zero inbound or outbound stream count, bad AUTH
  * parameters) is answered through sctp_abort_association(); when an stcb
  * was passed in, *abort_no_unlock is set to 1 in that case.
  *
  * Without an stcb the INIT is only accepted on a listening socket that is
  * not gone; otherwise an ABORT is sent unless the sctp_blackhole sysctl is
  * non-zero.  A valid INIT is answered with a SHUTDOWN-ACK if the existing
  * association is in SHUTDOWN-ACK-SENT state and with an INIT-ACK in all
  * other cases.
  */
 static void
 sctp_handle_init(struct mbuf *m, int iphlen, int offset,
     struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh,
     struct sctp_init_chunk *cp, struct sctp_inpcb *inp,
     struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_no_unlock,
     uint8_t mflowtype, uint32_t mflowid,
     uint32_t vrf_id, uint16_t port)
 {
 	struct sctp_init *init;
 	struct mbuf *op_err;
 
 	SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_init: handling INIT tcb:%p\n",
 	    (void *)stcb);
 	/* No association yet: hold the endpoint read lock instead. */
 	if (stcb == NULL) {
 		SCTP_INP_RLOCK(inp);
 	}
 	/* validate length */
 	if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_chunk)) {
 		op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
 		sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err,
 		    mflowtype, mflowid,
 		    vrf_id, port);
 		if (stcb)
 			*abort_no_unlock = 1;
 		goto outnow;
 	}
 	/* validate parameters */
 	init = &cp->init;
 	if (init->initiate_tag == 0) {
 		/* protocol error... send abort */
 		op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
 		sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err,
 		    mflowtype, mflowid,
 		    vrf_id, port);
 		if (stcb)
 			*abort_no_unlock = 1;
 		goto outnow;
 	}
 	if (ntohl(init->a_rwnd) < SCTP_MIN_RWND) {
 		/* invalid parameter... send abort */
 		op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
 		sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err,
 		    mflowtype, mflowid,
 		    vrf_id, port);
 		if (stcb)
 			*abort_no_unlock = 1;
 		goto outnow;
 	}
 	if (init->num_inbound_streams == 0) {
 		/* protocol error... send abort */
 		op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
 		sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err,
 		    mflowtype, mflowid,
 		    vrf_id, port);
 		if (stcb)
 			*abort_no_unlock = 1;
 		goto outnow;
 	}
 	if (init->num_outbound_streams == 0) {
 		/* protocol error... send abort */
 		op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
 		sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err,
 		    mflowtype, mflowid,
 		    vrf_id, port);
 		if (stcb)
 			*abort_no_unlock = 1;
 		goto outnow;
 	}
 	/* The parameter area runs from the end of the fixed part to chunk end. */
 	if (sctp_validate_init_auth_params(m, offset + sizeof(*cp),
 	    offset + ntohs(cp->ch.chunk_length))) {
 		/* auth parameter(s) error... send abort */
 		op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 		    "Problem with AUTH parameters");
 		sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err,
 		    mflowtype, mflowid,
 		    vrf_id, port);
 		if (stcb)
 			*abort_no_unlock = 1;
 		goto outnow;
 	}
 	/* We are only accepting if we have a listening socket. */
 	if ((stcb == NULL) &&
 	    ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
 	    (!SCTP_IS_LISTENING(inp)))) {
 		/*
 		 * FIX ME ?? What about TCP model and we have a
 		 * match/restart case? Actually no fix is needed. The lookup
 		 * will always find the existing assoc so stcb would not be
 		 * NULL. It may be questionable to do this since we COULD
 		 * just send back the INIT-ACK and hope that the app did
 		 * accept()'s by the time the COOKIE was sent. But there is
 		 * a price to pay for COOKIE generation and I don't want to
 		 * pay it on the chance that the app will actually do some
 		 * accepts(). The app just loses and should NOT be in this
 		 * state :-)
 		 */
 		/* With blackholing enabled the INIT is dropped silently. */
 		if (SCTP_BASE_SYSCTL(sctp_blackhole) == 0) {
 			op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 			    "No listener");
 			sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err,
 			    mflowtype, mflowid, inp->fibnum,
 			    vrf_id, port);
 		}
 		goto outnow;
 	}
 	if ((stcb != NULL) &&
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT)) {
 		SCTPDBG(SCTP_DEBUG_INPUT3, "sctp_handle_init: sending SHUTDOWN-ACK\n");
 		sctp_send_shutdown_ack(stcb, NULL);
 		sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED);
 	} else {
 		SCTPDBG(SCTP_DEBUG_INPUT3, "sctp_handle_init: sending INIT-ACK\n");
 		sctp_send_initiate_ack(inp, stcb, net, m, iphlen, offset,
 		    src, dst, sh, cp,
 		    mflowtype, mflowid,
 		    vrf_id, port);
 	}
 outnow:
 	/* Release the endpoint read lock taken at function entry. */
 	if (stcb == NULL) {
 		SCTP_INP_RUNLOCK(inp);
 	}
 }
 
 /*
  * process peer "INIT/INIT-ACK" chunk returns value < 0 on error
  */
 
 /*
  * Report whether any outgoing stream of the association still has unsent
  * data queued.
  *
  * The scan runs under the TCB send lock and only if the stream scheduler
  * reports a non-empty state.  While scanning, a fully sent message that is
  * still sitting at the head of a stream queue is removed and freed;
  * so_locked is passed through to sctp_free_a_strmoq() for that purpose.
  *
  * Returns 0 if nothing is pending and 1 otherwise (the scan stops at the
  * first stream found with pending data).
  */
 int
 sctp_is_there_unsent_data(struct sctp_tcb *stcb, int so_locked)
 {
 	int unsent_data;
 	unsigned int i;
 	struct sctp_stream_queue_pending *sp;
 	struct sctp_association *asoc;
 
 	/*
 	 * This function returns if any stream has true unsent data on it.
 	 * Note that as it looks through it will clean up any places that
 	 * have old data that has been sent but left at top of stream queue.
 	 */
 	asoc = &stcb->asoc;
 	unsent_data = 0;
 	SCTP_TCB_SEND_LOCK(stcb);
 	if (!stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) {
 		/* Check to see if some data queued */
 		for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
 			/* sa_ignore FREED_MEMORY */
 			sp = TAILQ_FIRST(&stcb->asoc.strmout[i].outqueue);
 			if (sp == NULL) {
 				continue;
 			}
 			if ((sp->msg_is_complete) &&
 			    (sp->length == 0) &&
 			    (sp->sender_all_done)) {
 				/*
 				 * We are doing deferred cleanup. Last time
 				 * through when we took all the data the
 				 * sender_all_done was not set.
 				 */
 				if (sp->put_last_out == 0) {
 					SCTP_PRINTF("Gak, put out entire msg with NO end!-1\n");
 					SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d\n",
 					    sp->sender_all_done,
 					    sp->length,
 					    sp->msg_is_complete,
 					    sp->put_last_out);
 				}
 				/* Unlink the stale entry and release what it holds. */
 				atomic_subtract_int(&stcb->asoc.stream_queue_cnt, 1);
 				TAILQ_REMOVE(&stcb->asoc.strmout[i].outqueue, sp, next);
 				stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, &asoc->strmout[i], sp, 1);
 				if (sp->net) {
 					sctp_free_remote_addr(sp->net);
 					sp->net = NULL;
 				}
 				if (sp->data) {
 					sctp_m_freem(sp->data);
 					sp->data = NULL;
 				}
 				sctp_free_a_strmoq(stcb, sp, so_locked);
 				/* Anything queued behind the stale entry is unsent. */
 				if (!TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) {
 					unsent_data++;
 				}
 			} else {
 				unsent_data++;
 			}
 			/* One stream with pending data is enough; stop scanning. */
 			if (unsent_data > 0) {
 				break;
 			}
 		}
 	}
 	SCTP_TCB_SEND_UNLOCK(stcb);
 	return (unsent_data);
 }
 
 /*
  * Apply the fixed parameters of a peer's INIT (or INIT-ACK, which the
  * caller casts to the same layout) to the association.
  *
  * Records the peer's verification tag, receive window and initial TSN,
  * seeds each destination's ssthresh with the peer's rwnd, trims the
  * outgoing streams to what the peer accepts (discarding data queued on
  * streams that no longer exist and notifying the ULP), and (re)allocates
  * the inbound stream array.
  *
  * Returns 0 on success and -1 if the inbound stream array could not be
  * allocated.
  */
 static int
 sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb)
 {
 	struct sctp_init *init;
 	struct sctp_association *asoc;
 	struct sctp_nets *lnet;
 	unsigned int i;
 
 	init = &cp->init;
 	asoc = &stcb->asoc;
 	/* save off parameters */
 	asoc->peer_vtag = ntohl(init->initiate_tag);
 	asoc->peers_rwnd = ntohl(init->a_rwnd);
 	/* init tsn's */
 	asoc->highest_tsn_inside_map = asoc->asconf_seq_in = ntohl(init->initial_tsn) - 1;
 
 	if (!TAILQ_EMPTY(&asoc->nets)) {
 		/* update any ssthresh's that may have a default */
 		TAILQ_FOREACH(lnet, &asoc->nets, sctp_next) {
 			lnet->ssthresh = asoc->peers_rwnd;
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_CWND_MONITOR_ENABLE | SCTP_CWND_LOGGING_ENABLE)) {
 				sctp_log_cwnd(stcb, lnet, 0, SCTP_CWND_INITIALIZATION);
 			}
 
 		}
 	}
 	SCTP_TCB_SEND_LOCK(stcb);
 	/* The peer accepts fewer inbound streams than we pre-opened. */
 	if (asoc->pre_open_streams > ntohs(init->num_inbound_streams)) {
 		unsigned int newcnt;
 		struct sctp_stream_out *outs;
 		struct sctp_stream_queue_pending *sp, *nsp;
 		struct sctp_tmit_chunk *chk, *nchk;
 
 		/* abandon the upper streams */
 		newcnt = ntohs(init->num_inbound_streams);
 		/* First drop chunks on the send queue that target those streams. */
 		TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) {
 			if (chk->rec.data.sid >= newcnt) {
 				TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next);
 				asoc->send_queue_cnt--;
 				if (asoc->strmout[chk->rec.data.sid].chunks_on_queues > 0) {
 					asoc->strmout[chk->rec.data.sid].chunks_on_queues--;
 #ifdef INVARIANTS
 				} else {
 					panic("No chunks on the queues for sid %u.", chk->rec.data.sid);
 #endif
 				}
 				if (chk->data != NULL) {
 					sctp_free_bufspace(stcb, asoc, chk, 1);
 					sctp_ulp_notify(SCTP_NOTIFY_UNSENT_DG_FAIL, stcb,
 					    0, chk, SCTP_SO_NOT_LOCKED);
 					/* Re-checked after the notification before freeing. */
 					if (chk->data) {
 						sctp_m_freem(chk->data);
 						chk->data = NULL;
 					}
 				}
 				sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
 				/* sa_ignore FREED_MEMORY */
 			}
 		}
 		/* Then drop messages still pending on the abandoned streams. */
 		if (asoc->strmout) {
 			for (i = newcnt; i < asoc->pre_open_streams; i++) {
 				outs = &asoc->strmout[i];
 				TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) {
 					atomic_subtract_int(&stcb->asoc.stream_queue_cnt, 1);
 					TAILQ_REMOVE(&outs->outqueue, sp, next);
 					stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, 1);
 					sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL,
 					    stcb, 0, sp, SCTP_SO_NOT_LOCKED);
 					if (sp->data) {
 						sctp_m_freem(sp->data);
 						sp->data = NULL;
 					}
 					if (sp->net) {
 						sctp_free_remote_addr(sp->net);
 						sp->net = NULL;
 					}
 					/* Free the chunk */
 					sctp_free_a_strmoq(stcb, sp, SCTP_SO_NOT_LOCKED);
 					/* sa_ignore FREED_MEMORY */
 				}
 				outs->state = SCTP_STREAM_CLOSED;
 			}
 		}
 		/* cut back the count */
 		asoc->pre_open_streams = newcnt;
 	}
 	SCTP_TCB_SEND_UNLOCK(stcb);
 	/* All remaining pre-opened outgoing streams are now open. */
 	asoc->streamoutcnt = asoc->pre_open_streams;
 	if (asoc->strmout) {
 		for (i = 0; i < asoc->streamoutcnt; i++) {
 			asoc->strmout[i].state = SCTP_STREAM_OPEN;
 		}
 	}
 	/* EY - nr_sack: initialize highest tsn in nr_mapping_array */
 	asoc->highest_tsn_inside_nr_map = asoc->highest_tsn_inside_map;
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
 		sctp_log_map(0, 5, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT);
 	}
 	/* This is the next one we expect */
 	asoc->str_reset_seq_in = asoc->asconf_seq_in + 1;
 
 	asoc->mapping_array_base_tsn = ntohl(init->initial_tsn);
 	asoc->tsn_last_delivered = asoc->cumulative_tsn = asoc->asconf_seq_in;
 
 	asoc->advanced_peer_ack_point = asoc->last_acked_seq;
 	/* open the requested streams */
 
 	if (asoc->strmin != NULL) {
 		/* Free the old ones */
 		for (i = 0; i < asoc->streamincnt; i++) {
 			sctp_clean_up_stream(stcb, &asoc->strmin[i].inqueue);
 			sctp_clean_up_stream(stcb, &asoc->strmin[i].uno_inqueue);
 		}
 		SCTP_FREE(asoc->strmin, SCTP_M_STRMI);
 	}
 	/* Inbound stream count: the smaller of our limit and the peer's offer. */
 	if (asoc->max_inbound_streams > ntohs(init->num_outbound_streams)) {
 		asoc->streamincnt = ntohs(init->num_outbound_streams);
 	} else {
 		asoc->streamincnt = asoc->max_inbound_streams;
 	}
 	SCTP_MALLOC(asoc->strmin, struct sctp_stream_in *, asoc->streamincnt *
 	    sizeof(struct sctp_stream_in), SCTP_M_STRMI);
 	if (asoc->strmin == NULL) {
 		/* we didn't get memory for the streams! */
 		SCTPDBG(SCTP_DEBUG_INPUT2, "process_init: couldn't get memory for the streams!\n");
 		return (-1);
 	}
 	for (i = 0; i < asoc->streamincnt; i++) {
 		asoc->strmin[i].sid = i;
 		asoc->strmin[i].last_mid_delivered = 0xffffffff;
 		TAILQ_INIT(&asoc->strmin[i].inqueue);
 		TAILQ_INIT(&asoc->strmin[i].uno_inqueue);
 		asoc->strmin[i].pd_api_started = 0;
 		asoc->strmin[i].delivery_started = 0;
 	}
 	/*
 	 * load_address_from_init will put the addresses into the
 	 * association when the COOKIE is processed or the INIT-ACK is
 	 * processed. Both types of COOKIE's existing and new call this
 	 * routine. It will remove addresses that are no longer in the
 	 * association (for the restarting case where addresses are
 	 * removed). Up front when the INIT arrives we will discard it if it
 	 * is a restart and new addresses have been added.
 	 */
 	/* sa_ignore MEMLEAK */
 	return (0);
 }
 
 /*
  * INIT-ACK message processing/consumption.
  *
  * Checks the INIT-ACK parameters, applies them to the association via
  * sctp_process_init(), loads the peer's addresses, stops the INIT timer on
  * the primary destination, updates the RTO and finally queues the
  * COOKIE-ECHO.
  *
  * Return value:
  *   -1  the association was aborted because of an illegal parameter or a
  *       problem with the address parameters (*abort_no_unlock set to 1);
  *   -3  the association was aborted because the mandatory state cookie
  *       parameter was missing (*abort_no_unlock set to 1);
  *   <0  any error returned by sctp_process_init() (no abort is sent);
  *   otherwise the result of sctp_send_cookie_echo().
  */
 static int
 sctp_process_init_ack(struct mbuf *m, int iphlen, int offset,
     struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh,
     struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb,
     struct sctp_nets *net, int *abort_no_unlock,
     uint8_t mflowtype, uint32_t mflowid,
     uint32_t vrf_id)
 {
 	struct sctp_association *asoc;
 	struct mbuf *op_err;
 	int retval, abort_flag, cookie_found;
 	int initack_limit;
 	int nat_friendly = 0;
 
 	/* First verify that we have no illegal param's */
 	abort_flag = 0;
 	cookie_found = 0;
 
 	op_err = sctp_arethere_unrecognized_parameters(m,
 	    (offset + sizeof(struct sctp_init_chunk)),
 	    &abort_flag, (struct sctp_chunkhdr *)cp,
 	    &nat_friendly, &cookie_found);
 	if (abort_flag) {
 		/* Send an abort and notify peer */
 		sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
 		*abort_no_unlock = 1;
 		return (-1);
 	}
 	if (!cookie_found) {
 		uint16_t len;
 
 		/* Only report the missing cookie parameter */
 		if (op_err != NULL) {
 			sctp_m_freem(op_err);
 		}
 		/* Cause header plus one 16-bit missing parameter type. */
 		len = (uint16_t)(sizeof(struct sctp_error_missing_param) + sizeof(uint16_t));
 		/* We abort with an error of missing mandatory param */
 		op_err = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA);
 		if (op_err != NULL) {
 			struct sctp_error_missing_param *cause;
 
 			SCTP_BUF_LEN(op_err) = len;
 			cause = mtod(op_err, struct sctp_error_missing_param *);
 			/* Fill in the missing mandatory parameter cause */
 			cause->cause.code = htons(SCTP_CAUSE_MISSING_PARAM);
 			cause->cause.length = htons(len);
 			cause->num_missing_params = htonl(1);
 			cause->type[0] = htons(SCTP_STATE_COOKIE);
 		}
 		/* The abort is sent even if no cause mbuf could be allocated. */
 		sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen,
 		    src, dst, sh, op_err,
 		    mflowtype, mflowid,
 		    vrf_id, net->port);
 		*abort_no_unlock = 1;
 		return (-3);
 	}
 	asoc = &stcb->asoc;
 	asoc->peer_supports_nat = (uint8_t)nat_friendly;
 	/* process the peer's parameters in the INIT-ACK */
 	retval = sctp_process_init((struct sctp_init_chunk *)cp, stcb);
 	if (retval < 0) {
 		if (op_err != NULL) {
 			sctp_m_freem(op_err);
 		}
 		return (retval);
 	}
 	/* Offset of the first byte after the INIT-ACK chunk. */
 	initack_limit = offset + ntohs(cp->ch.chunk_length);
 	/* load all addresses */
 	if ((retval = sctp_load_addresses_from_init(stcb, m,
 	    (offset + sizeof(struct sctp_init_chunk)), initack_limit,
 	    src, dst, NULL, stcb->asoc.port))) {
 		if (op_err != NULL) {
 			sctp_m_freem(op_err);
 		}
 		op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 		    "Problem with address parameters");
 		SCTPDBG(SCTP_DEBUG_INPUT1,
 		    "Load addresses from INIT causes an abort %d\n",
 		    retval);
 		sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen,
 		    src, dst, sh, op_err,
 		    mflowtype, mflowid,
 		    vrf_id, net->port);
 		*abort_no_unlock = 1;
 		return (-1);
 	}
 	/* if the peer doesn't support asconf, flush the asconf queue */
 	if (asoc->asconf_supported == 0) {
 		struct sctp_asconf_addr *param, *nparam;
 
 		TAILQ_FOREACH_SAFE(param, &asoc->asconf_queue, next, nparam) {
 			TAILQ_REMOVE(&asoc->asconf_queue, param, next);
 			SCTP_FREE(param, SCTP_M_ASC_ADDR);
 		}
 	}
 
 	stcb->asoc.peer_hmac_id = sctp_negotiate_hmacid(stcb->asoc.peer_hmacs,
 	    stcb->asoc.local_hmacs);
 	/* Report any unrecognized parameters collected above to the peer. */
 	if (op_err) {
 		sctp_queue_op_err(stcb, op_err);
 		/* queuing will steal away the mbuf chain to the out queue */
 		op_err = NULL;
 	}
 	/* extract the cookie and queue it to "echo" it back... */
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
 		sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
 		    stcb->asoc.overall_error_count,
 		    0,
 		    SCTP_FROM_SCTP_INPUT,
 		    __LINE__);
 	}
 	stcb->asoc.overall_error_count = 0;
 	net->error_count = 0;
 
 	/*
 	 * Cancel the INIT timer. We do this first before queueing the
 	 * cookie. We always cancel at the primary to assure that we are
 	 * canceling the timer started by the INIT which always goes to the
 	 * primary.
 	 */
 	sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, stcb,
 	    asoc->primary_destination, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3);
 
 	/* calculate the RTO */
 	sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered,
 	    SCTP_RTT_FROM_NON_DATA);
 	retval = sctp_send_cookie_echo(m, offset, initack_limit, stcb, net);
 	return (retval);
 }
 
 /*
  * Process a HEARTBEAT-ACK chunk.
  *
  * The address echoed back in the heartbeat info is rebuilt into a sockaddr
  * and looked up among the association's destinations (r_net).  Chunks with
  * an unexpected length, an unknown address family/length or an address
  * that is not part of the association are silently ignored.
  *
  * For a matching destination this confirms the address if the echoed
  * random values match, clears the error counters, updates the RTO from
  * the echoed time values, marks the destination reachable, leaves the PF
  * state, releases an alternate when the primary answered, and performs
  * the mobility handling when the destination had been requested as the
  * new primary.
  */
 static void
 sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp,
     struct sctp_tcb *stcb, struct sctp_nets *net)
 {
 	union sctp_sockstore store;
 	struct sctp_nets *r_net, *f_net;
 	struct timeval tv;
 	int req_prim = 0;
 	uint16_t old_error_counter;
 
 	if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_heartbeat_chunk)) {
 		/* Invalid length */
 		return;
 	}
 
 	/* Rebuild the destination address the heartbeat was sent to. */
 	memset(&store, 0, sizeof(store));
 	switch (cp->heartbeat.hb_info.addr_family) {
 #ifdef INET
 	case AF_INET:
 		if (cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in)) {
 			store.sin.sin_family = cp->heartbeat.hb_info.addr_family;
 			store.sin.sin_len = cp->heartbeat.hb_info.addr_len;
 			store.sin.sin_port = stcb->rport;
 			memcpy(&store.sin.sin_addr, cp->heartbeat.hb_info.address,
 			    sizeof(store.sin.sin_addr));
 		} else {
 			return;
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if (cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in6)) {
 			store.sin6.sin6_family = cp->heartbeat.hb_info.addr_family;
 			store.sin6.sin6_len = cp->heartbeat.hb_info.addr_len;
 			store.sin6.sin6_port = stcb->rport;
 			memcpy(&store.sin6.sin6_addr, cp->heartbeat.hb_info.address, sizeof(struct in6_addr));
 		} else {
 			return;
 		}
 		break;
 #endif
 	default:
 		return;
 	}
 	r_net = sctp_findnet(stcb, &store.sa);
 	if (r_net == NULL) {
 		SCTPDBG(SCTP_DEBUG_INPUT1, "Huh? I can't find the address I sent it to, discard\n");
 		return;
 	}
 	/*
 	 * NOTE(review): r_net cannot be NULL here (see the return above), so
 	 * the "r_net &&" part of the condition is redundant.
 	 */
 	if ((r_net && (r_net->dest_state & SCTP_ADDR_UNCONFIRMED)) &&
 	    (r_net->heartbeat_random1 == cp->heartbeat.hb_info.random_value1) &&
 	    (r_net->heartbeat_random2 == cp->heartbeat.hb_info.random_value2)) {
 		/*
 		 * If this is a HB-ACK and its random values are correct we
 		 * can confirm the destination.
 		 */
 		r_net->dest_state &= ~SCTP_ADDR_UNCONFIRMED;
 		if (r_net->dest_state & SCTP_ADDR_REQ_PRIMARY) {
 			stcb->asoc.primary_destination = r_net;
 			r_net->dest_state &= ~SCTP_ADDR_REQ_PRIMARY;
 			f_net = TAILQ_FIRST(&stcb->asoc.nets);
 			if (f_net != r_net) {
 				/*
 				 * first one on the list is NOT the primary
 				 * sctp_cmpaddr() is much more efficient if
 				 * the primary is the first on the list,
 				 * make it so.
 				 */
 				TAILQ_REMOVE(&stcb->asoc.nets, r_net, sctp_next);
 				TAILQ_INSERT_HEAD(&stcb->asoc.nets, r_net, sctp_next);
 			}
 			req_prim = 1;
 		}
 		sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED,
 		    stcb, 0, (void *)r_net, SCTP_SO_NOT_LOCKED);
 		/* Restart the heartbeat timer for the confirmed destination. */
 		sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb,
 		    r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_4);
 		sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net);
 	}
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
 		sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
 		    stcb->asoc.overall_error_count,
 		    0,
 		    SCTP_FROM_SCTP_INPUT,
 		    __LINE__);
 	}
 	/* The peer answered: clear association and path error counters. */
 	stcb->asoc.overall_error_count = 0;
 	old_error_counter = r_net->error_count;
 	r_net->error_count = 0;
 	r_net->hb_responded = 1;
 	/* Time values carried in the heartbeat info and echoed by the peer. */
 	tv.tv_sec = cp->heartbeat.hb_info.time_value_1;
 	tv.tv_usec = cp->heartbeat.hb_info.time_value_2;
 	/* Now lets do a RTO with this */
 	sctp_calculate_rto(stcb, &stcb->asoc, r_net, &tv,
 	    SCTP_RTT_FROM_NON_DATA);
 	if (!(r_net->dest_state & SCTP_ADDR_REACHABLE)) {
 		r_net->dest_state |= SCTP_ADDR_REACHABLE;
 		sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb,
 		    0, (void *)r_net, SCTP_SO_NOT_LOCKED);
 	}
 	if (r_net->dest_state & SCTP_ADDR_PF) {
 		r_net->dest_state &= ~SCTP_ADDR_PF;
 		/*
 		 * NOTE(review): the PF flag is cleared on r_net, but the
 		 * congestion control callback is invoked with net (the
 		 * destination passed in by the caller).  Confirm whether
 		 * r_net was intended here.
 		 */
 		stcb->asoc.cc_functions.sctp_cwnd_update_exit_pf(stcb, net);
 	}
 	/* The path had errors before: restart its heartbeat timer. */
 	if (old_error_counter > 0) {
 		sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep,
 		    stcb, r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_5);
 		sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net);
 	}
 	if (r_net == stcb->asoc.primary_destination) {
 		if (stcb->asoc.alternate) {
 			/* release the alternate, primary is good */
 			sctp_free_remote_addr(stcb->asoc.alternate);
 			stcb->asoc.alternate = NULL;
 		}
 	}
 	/* Mobility adaptation */
 	if (req_prim) {
 		if ((sctp_is_mobility_feature_on(stcb->sctp_ep,
 		    SCTP_MOBILITY_BASE) ||
 		    sctp_is_mobility_feature_on(stcb->sctp_ep,
 		    SCTP_MOBILITY_FASTHANDOFF)) &&
 		    sctp_is_mobility_feature_on(stcb->sctp_ep,
 		    SCTP_MOBILITY_PRIM_DELETED)) {
 
 			sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED,
 			    stcb->sctp_ep, stcb, NULL,
 			    SCTP_FROM_SCTP_INPUT + SCTP_LOC_6);
 			if (sctp_is_mobility_feature_on(stcb->sctp_ep,
 			    SCTP_MOBILITY_FASTHANDOFF)) {
 				sctp_assoc_immediate_retrans(stcb,
 				    stcb->asoc.primary_destination);
 			}
 			if (sctp_is_mobility_feature_on(stcb->sctp_ep,
 			    SCTP_MOBILITY_BASE)) {
 				sctp_move_chunks_from_net(stcb,
 				    stcb->asoc.deleted_primary);
 			}
 			sctp_delete_prim_timer(stcb->sctp_ep, stcb);
 		}
 	}
 }
 
 /*
  * React to an ABORT carrying the "NAT colliding state" cause.
  *
  * Only acts in COOKIE-WAIT or COOKIE-ECHOED state: a new verification tag
  * is selected, the association is re-hashed under that tag and a fresh
  * INIT is sent.  In COOKIE-ECHOED state the association is additionally
  * moved back to COOKIE-WAIT and the old cookie state is discarded.
  *
  * Returns 1 if the collision was handled (the caller must not abort the
  * association) and 0 if the caller should proceed with the abort.
  */
 static int
 sctp_handle_nat_colliding_state(struct sctp_tcb *stcb)
 {
 	/*
 	 * Return 0 means we want you to proceed with the abort non-zero
 	 * means no abort processing.
 	 */
 	uint32_t new_vtag;
 	struct sctpasochead *head;
 
 	if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) {
 		new_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1);
 		/*
 		 * The TCB lock is dropped so that the global INP-INFO write
 		 * lock can be taken before re-locking the TCB; the reference
 		 * count is raised across that window (presumably to keep
 		 * the stcb from being freed while it is unlocked).
 		 */
 		atomic_add_int(&stcb->asoc.refcnt, 1);
 		SCTP_TCB_UNLOCK(stcb);
 		SCTP_INP_INFO_WLOCK();
 		SCTP_TCB_LOCK(stcb);
 		atomic_subtract_int(&stcb->asoc.refcnt, 1);
 	} else {
 		return (0);
 	}
 	/* The state is re-read here because the TCB lock was dropped above. */
 	if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) {
 		/* generate a new vtag and send init */
 		LIST_REMOVE(stcb, sctp_asocs);
 		stcb->asoc.my_vtag = new_vtag;
 		head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))];
 		/*
 		 * put it in the bucket in the vtag hash of assoc's for the
 		 * system
 		 */
 		LIST_INSERT_HEAD(head, stcb, sctp_asocs);
 		SCTP_INP_INFO_WUNLOCK();
 		sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED);
 		return (1);
 	} else {
 		/*
 		 * treat like a case where the cookie expired i.e.: - dump
 		 * current cookie. - generate a new vtag. - resend init.
 		 */
 		/* generate a new vtag and send init */
 		LIST_REMOVE(stcb, sctp_asocs);
 		SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT);
 		sctp_stop_all_cookie_timers(stcb);
 		sctp_toss_old_cookies(stcb, &stcb->asoc);
 		stcb->asoc.my_vtag = new_vtag;
 		head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))];
 		/*
 		 * put it in the bucket in the vtag hash of assoc's for the
 		 * system
 		 */
 		LIST_INSERT_HEAD(head, stcb, sctp_asocs);
 		SCTP_INP_INFO_WUNLOCK();
 		sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED);
 		return (1);
 	}
 	/* Not reached: both branches above return. */
 	return (0);
 }
 
 /*
  * React to an ABORT carrying the "NAT missing state" cause.
  *
  * Returns 1 when a NAT state update was sent via ASCONF, in which case the
  * caller must skip the abort processing.  Returns 0 when the peer does not
  * support AUTH, so that the caller proceeds with the abort.
  */
 static int
 sctp_handle_nat_missing_state(struct sctp_tcb *stcb,
     struct sctp_nets *net)
 {
 	if (stcb->asoc.auth_supported != 0) {
 		/* AUTH is available, so the ASCONF can be sent. */
 		sctp_asconf_send_nat_state_update(stcb, net);
 		return (1);
 	}
 	SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_nat_missing_state: Peer does not support AUTH, cannot send an asconf\n");
 	return (0);
 }
 
 
 /*
  * Process a received ABORT chunk.
  *
  * If the chunk is long enough to carry an error cause, the first cause
  * code is inspected: the two NAT specific causes (colliding state, missing
  * state) may be handled without tearing the association down.  In every
  * other case the receive timer is stopped, the user is notified and the
  * association is freed.
  *
  * Returns 1 if the stcb was aborted, 0 otherwise (no stcb, or a NAT cause
  * that was handled).
  */
 static int
 sctp_handle_abort(struct sctp_abort_chunk *abort,
     struct sctp_tcb *stcb, struct sctp_nets *net)
 {
 	uint16_t len;
 	uint16_t error;
 
 	SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_abort: handling ABORT\n");
 	if (stcb == NULL)
 		return (0);
 
 	len = ntohs(abort->ch.chunk_length);
 	if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_error_cause)) {
 		/*
 		 * Need to check the cause codes for our two magic nat
 		 * aborts which don't kill the assoc necessarily.
 		 */
 		struct sctp_error_cause *cause;
 
 		/* The first error cause immediately follows the chunk header. */
 		cause = (struct sctp_error_cause *)(abort + 1);
 		error = ntohs(cause->code);
 		if (error == SCTP_CAUSE_NAT_COLLIDING_STATE) {
 			SCTPDBG(SCTP_DEBUG_INPUT2, "Received Colliding state, ABORT flags:%x\n",
 			    abort->ch.chunk_flags);
 			if (sctp_handle_nat_colliding_state(stcb)) {
 				return (0);
 			}
 		} else if (error == SCTP_CAUSE_NAT_MISSING_STATE) {
 			SCTPDBG(SCTP_DEBUG_INPUT2, "Received missing state, ABORT flags:%x\n",
 			    abort->ch.chunk_flags);
 			if (sctp_handle_nat_missing_state(stcb, net)) {
 				return (0);
 			}
 		}
 	} else {
 		/* No error cause present in the chunk. */
 		error = 0;
 	}
 	/* stop any receive timers */
 	sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL,
 	    SCTP_FROM_SCTP_INPUT + SCTP_LOC_7);
 	/* notify user of the abort and clean up... */
 	sctp_abort_notification(stcb, 1, error, abort, SCTP_SO_NOT_LOCKED);
 	/* free the tcb */
 	SCTP_STAT_INCR_COUNTER32(sctps_aborted);
 	/* Only established associations are counted in sctps_currestab. */
 	if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
 		SCTP_STAT_DECR_GAUGE32(sctps_currestab);
 	}
 #ifdef SCTP_ASOCLOG_OF_TSNS
 	sctp_print_out_track_log(stcb);
 #endif
-	SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_WAS_ABORTED);
 	(void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC,
 	    SCTP_FROM_SCTP_INPUT + SCTP_LOC_8);
 	SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_abort: finished\n");
 	return (1);
 }
 
 /*
  * Start the per-destination timers of an association.
  *
  * Every destination gets a path MTU raise timer and a heartbeat timer.
  * Unconfirmed destinations are additionally sent a heartbeat right away,
  * limited to sctp_hb_maxburst heartbeats per call.  If at least one
  * heartbeat was queued, the output routine is kicked once at the end.
  */
 static void
 sctp_start_net_timers(struct sctp_tcb *stcb)
 {
 	struct sctp_nets *net;
 	uint32_t hb_burst = 0;
 
 	TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 		sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, stcb->sctp_ep, stcb, net);
 		sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net);
 		/* Confirmed destinations need no immediate heartbeat. */
 		if ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0) {
 			continue;
 		}
 		/* Burst limit reached: rely on the heartbeat timer instead. */
 		if (hb_burst >= SCTP_BASE_SYSCTL(sctp_hb_maxburst)) {
 			continue;
 		}
 		sctp_send_hb(stcb, net, SCTP_SO_NOT_LOCKED);
 		hb_burst++;
 	}
 	if (hb_burst != 0) {
 		sctp_chunk_output(stcb->sctp_ep, stcb,
 		    SCTP_OUTPUT_FROM_COOKIE_ACK,
 		    SCTP_SO_NOT_LOCKED);
 	}
 }
 
 
 /*
  * Process a received SHUTDOWN chunk.
  *
  * cp:         the SHUTDOWN chunk.
  * stcb:       association the chunk arrived on; NULL is tolerated and
  *             makes the function return immediately.
  * net:        destination used for the SHUTDOWN timer stop, the
  *             SHUTDOWN-ACK and the SHUTDOWNACK timer.
  * abort_flag: handed to sctp_update_acked(); if it is set on return from
  *             that call, processing stops at once.
  *
  * The chunk is ignored in COOKIE-WAIT and COOKIE-ECHOED and when its
  * length is not exactly sizeof(struct sctp_shutdown_chunk).  Otherwise a
  * partial delivery in progress is terminated, the state moves to
  * SHUTDOWN-RECEIVED (when a socket is attached and no shutdown state has
  * been entered yet) and, once nothing is left to send, a SHUTDOWN-ACK is
  * sent and the state moves to SHUTDOWN-ACK-SENT.
  */
 static void
 sctp_handle_shutdown(struct sctp_shutdown_chunk *cp,
     struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_flag)
 {
 	struct sctp_association *asoc;
 	int some_on_streamwheel;
 	int old_state;
 
 	SCTPDBG(SCTP_DEBUG_INPUT2,
 	    "sctp_handle_shutdown: handling SHUTDOWN\n");
 	if (stcb == NULL)
 		return;
 	asoc = &stcb->asoc;
 	/* A SHUTDOWN is not acted upon before the association is set up. */
 	if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) {
 		return;
 	}
 	if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_shutdown_chunk)) {
 		/* Shutdown NOT the expected size */
 		return;
 	}
 	/*
 	 * Remember the state on entry; it is consulted at the very end to
 	 * decide whether a SHUTDOWN-ACK has to be sent again.
 	 */
 	old_state = SCTP_GET_STATE(stcb);
 	sctp_update_acked(stcb, cp, abort_flag);
 	if (*abort_flag) {
 		return;
 	}
 	if (asoc->control_pdapi) {
 		/*
 		 * With a normal shutdown we assume the end of last record.
 		 */
 		SCTP_INP_READ_LOCK(stcb->sctp_ep);
 		if (asoc->control_pdapi->on_strm_q) {
 			struct sctp_stream_in *strm;
 
 			/*
 			 * The entry is still linked on its inbound stream;
 			 * unlink it from the queue named by on_strm_q.
 			 */
 			strm = &asoc->strmin[asoc->control_pdapi->sinfo_stream];
 			if (asoc->control_pdapi->on_strm_q == SCTP_ON_UNORDERED) {
 				/* Unordered */
 				TAILQ_REMOVE(&strm->uno_inqueue, asoc->control_pdapi, next_instrm);
 				asoc->control_pdapi->on_strm_q = 0;
 			} else if (asoc->control_pdapi->on_strm_q == SCTP_ON_ORDERED) {
 				/* Ordered */
 				TAILQ_REMOVE(&strm->inqueue, asoc->control_pdapi, next_instrm);
 				asoc->control_pdapi->on_strm_q = 0;
 #ifdef INVARIANTS
 			} else {
 				panic("Unknown state on ctrl:%p on_strm_q:%d",
 				    asoc->control_pdapi,
 				    asoc->control_pdapi->on_strm_q);
 #endif
 			}
 		}
 		/* Mark the partial delivery as finished and aborted. */
 		asoc->control_pdapi->end_added = 1;
 		asoc->control_pdapi->pdapi_aborted = 1;
 		asoc->control_pdapi = NULL;
 		SCTP_INP_READ_UNLOCK(stcb->sctp_ep);
 		if (stcb->sctp_socket) {
 			sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket);
 		}
 	}
 	/* goto SHUTDOWN_RECEIVED state to block new requests */
 	if (stcb->sctp_socket) {
 		if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) &&
 		    (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT) &&
 		    (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT)) {
 			SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_RECEIVED);
 			/*
 			 * notify upper layer that peer has initiated a
 			 * shutdown
 			 */
 			sctp_ulp_notify(SCTP_NOTIFY_PEER_SHUTDOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
 
 			/* reset time */
 			(void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered);
 		}
 	}
 	if (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_SENT) {
 		/*
 		 * stop the shutdown timer, since we WILL move to
 		 * SHUTDOWN-ACK-SENT.
 		 */
 		sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb,
 		    net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_9);
 	}
 	/* Now is there unsent data on a stream somewhere? */
 	some_on_streamwheel = sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED);
 
 	if (!TAILQ_EMPTY(&asoc->send_queue) ||
 	    !TAILQ_EMPTY(&asoc->sent_queue) ||
 	    some_on_streamwheel) {
 		/* By returning we will push more data out */
 		return;
 	} else {
 		/* no outstanding data to send, so move on... */
 		/* send SHUTDOWN-ACK */
 		/* move to SHUTDOWN-ACK-SENT state */
 		/* Leaving an established state: drop the currestab gauge. */
 		if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) ||
 		    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
 			SCTP_STAT_DECR_GAUGE32(sctps_currestab);
 		}
 		if (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT) {
 			SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_ACK_SENT);
 			sctp_stop_timers_for_shutdown(stcb);
 			sctp_send_shutdown_ack(stcb, net);
 			sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK,
 			    stcb->sctp_ep, stcb, net);
 		} else if (old_state == SCTP_STATE_SHUTDOWN_ACK_SENT) {
 			/*
 			 * We were already in SHUTDOWN-ACK-SENT on entry:
 			 * just send the SHUTDOWN-ACK again, without touching
 			 * state or timers.
 			 */
 			sctp_send_shutdown_ack(stcb, net);
 		}
 	}
 }
 
 /*
  * Process a received SHUTDOWN-ACK chunk.
  *
  * cp:   the SHUTDOWN-ACK chunk; not inspected (marked SCTP_UNUSED).
  * stcb: association the chunk arrived on; NULL makes the function return
  *       immediately.
  * net:  destination used for the SHUTDOWN-COMPLETE and the timer stop.
  *
  * Every path with a non-NULL stcb either releases the TCB lock before
  * returning or ends in sctp_free_assoc().
  * NOTE(review): presumably the caller must not touch stcb after this call
  * -- confirm against the call site.
  */
 static void
 sctp_handle_shutdown_ack(struct sctp_shutdown_ack_chunk *cp SCTP_UNUSED,
     struct sctp_tcb *stcb,
     struct sctp_nets *net)
 {
 	struct sctp_association *asoc;
 
 	SCTPDBG(SCTP_DEBUG_INPUT2,
 	    "sctp_handle_shutdown_ack: handling SHUTDOWN ACK\n");
 	if (stcb == NULL)
 		return;
 
 	asoc = &stcb->asoc;
 	/* process according to association state */
 	if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) {
 		/* unexpected SHUTDOWN-ACK... do OOTB handling... */
 		/*
 		 * NOTE(review): the last argument is 1 here and 0 on the
 		 * regular path below; its meaning is defined by
 		 * sctp_send_shutdown_complete() -- confirm there.
 		 */
 		sctp_send_shutdown_complete(stcb, net, 1);
 		SCTP_TCB_UNLOCK(stcb);
 		return;
 	}
 	if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) &&
 	    (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) {
 		/* unexpected SHUTDOWN-ACK... so ignore... */
 		SCTP_TCB_UNLOCK(stcb);
 		return;
 	}
 	if (asoc->control_pdapi) {
 		/*
 		 * With a normal shutdown we assume the end of last record.
 		 */
 		SCTP_INP_READ_LOCK(stcb->sctp_ep);
 		asoc->control_pdapi->end_added = 1;
 		asoc->control_pdapi->pdapi_aborted = 1;
 		asoc->control_pdapi = NULL;
 		SCTP_INP_READ_UNLOCK(stcb->sctp_ep);
 		/*
 		 * NOTE(review): sctp_handle_shutdown() tests
 		 * stcb->sctp_socket before its wakeup; this call does not.
 		 * Confirm that sctp_sorwakeup() copes with a NULL socket.
 		 */
 		sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket);
 	}
 #ifdef INVARIANTS
 	/* All outbound queues are expected to be drained at this point. */
 	if (!TAILQ_EMPTY(&asoc->send_queue) ||
 	    !TAILQ_EMPTY(&asoc->sent_queue) ||
 	    sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED)) {
 		panic("Queues are not empty when handling SHUTDOWN-ACK");
 	}
 #endif
 	/* stop the timer */
 	sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net,
 	    SCTP_FROM_SCTP_INPUT + SCTP_LOC_10);
 	/* send SHUTDOWN-COMPLETE */
 	sctp_send_shutdown_complete(stcb, net, 0);
 	/* notify upper layer protocol */
 	if (stcb->sctp_socket) {
 		/* For TCP-style endpoints, zero the send buffer byte count. */
 		if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 		    (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
 			stcb->sctp_socket->so_snd.sb_cc = 0;
 		}
 		sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
 	}
 	SCTP_STAT_INCR_COUNTER32(sctps_shutdown);
 	/* free the association (TCB); the return value is ignored */
 	(void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC,
 	    SCTP_FROM_SCTP_INPUT + SCTP_LOC_11);
 }
 
 /*
  * React to the peer reporting that it does not recognize a chunk type.
  *
  * stcb:       association the report arrived on.
  * chunk_type: the chunk type the peer rejected.
  *
  * ASCONF / ASCONF-ACK trigger sctp_asconf_cleanup(); (I-)FORWARD-TSN
  * clears asoc.prsctp_supported.  Any other type is only logged.
  */
 static void
 sctp_process_unrecog_chunk(struct sctp_tcb *stcb, uint8_t chunk_type)
 {
 	switch (chunk_type) {
 	case SCTP_ASCONF_ACK:
 	case SCTP_ASCONF:
 		sctp_asconf_cleanup(stcb);
 		break;
 	case SCTP_IFORWARD_CUM_TSN:
 	case SCTP_FORWARD_CUM_TSN:
 		/* Peer rejected a FORWARD-TSN variant: turn PR-SCTP off. */
 		stcb->asoc.prsctp_supported = 0;
 		break;
 	default:
 		SCTPDBG(SCTP_DEBUG_INPUT2,
 		    "Peer does not support chunk type %d (0x%x).\n",
 		    chunk_type, chunk_type);
 		break;
 	}
 }
 
 /*
  * React to the peer reporting that it does not recognize a parameter.
  *
  * stcb:           association the report arrived on.
  * parameter_type: type of the rejected parameter, in host byte order (the
  *                 caller in this file converts it with ntohs()).
  *
  * The ASCONF parameters and the PR-SCTP parameter turn off the
  * corresponding feature on the association; unknown types are only
  * logged.
  * XXX: Is this the right thing to do?
  */
 static void
 sctp_process_unrecog_param(struct sctp_tcb *stcb, uint16_t parameter_type)
 {
 	switch (parameter_type) {
 		/* pr-sctp draft */
 	case SCTP_PRSCTP_SUPPORTED:
 		stcb->asoc.prsctp_supported = 0;
 		break;
 	case SCTP_SUPPORTED_CHUNK_EXT:
 		/* Nothing to turn off for this one. */
 		break;
 		/* draft-ietf-tsvwg-addip-sctp */
 	case SCTP_HAS_NAT_SUPPORT:
 		stcb->asoc.peer_supports_nat = 0;
 		break;
 	case SCTP_ADD_IP_ADDRESS:
 	case SCTP_DEL_IP_ADDRESS:
 	case SCTP_SET_PRIM_ADDR:
 		stcb->asoc.asconf_supported = 0;
 		break;
 	case SCTP_SUCCESS_REPORT:
 	case SCTP_ERROR_CAUSE_IND:
 		SCTPDBG(SCTP_DEBUG_INPUT2, "Huh, the peer does not support success? or error cause?\n");
 		SCTPDBG(SCTP_DEBUG_INPUT2,
 		    "Turning off ASCONF to this strange peer\n");
 		stcb->asoc.asconf_supported = 0;
 		break;
 	default:
 		SCTPDBG(SCTP_DEBUG_INPUT2,
 		    "Peer does not support param type %d (0x%x)??\n",
 		    parameter_type, parameter_type);
 		break;
 	}
 }
 
 /*
  * Process the error causes carried in a received ERROR chunk.
  *
  * ch:    the ERROR chunk header; the causes start right behind it.
  * stcb:  association the chunk arrived on.
  * net:   handed on to sctp_handle_nat_missing_state().
  * limit: upper bound on the number of chunk bytes that may be parsed; the
  *        chunk_length taken from the header is clamped to it.
  *
  * Returns -1 when the association was freed here (stale cookie count
  * exceeded max_init_times), 0 in all other cases.  The
  * SCTP_NOTIFY_REMOTE_ERROR notification, carrying the first cause code
  * seen, is only raised when the cause list was walked to its end.
  */
 static int
 sctp_handle_error(struct sctp_chunkhdr *ch,
     struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t limit)
 {
 	struct sctp_error_cause *cause;
 	struct sctp_association *asoc;
 	uint32_t remaining_length, adjust;
 	uint16_t code, cause_code, cause_length;
 
 	/* parse through all of the errors and process */
 	asoc = &stcb->asoc;
 	cause = (struct sctp_error_cause *)((caddr_t)ch +
 	    sizeof(struct sctp_chunkhdr));
 	remaining_length = ntohs(ch->chunk_length);
 	if (remaining_length > limit) {
 		remaining_length = limit;
 	}
 	/* Discount the chunk header without letting the length wrap. */
 	if (remaining_length >= sizeof(struct sctp_chunkhdr)) {
 		remaining_length -= sizeof(struct sctp_chunkhdr);
 	} else {
 		remaining_length = 0;
 	}
 	/* 0 means "no cause recorded yet". */
 	code = 0;
 	while (remaining_length >= sizeof(struct sctp_error_cause)) {
 		/* Process an Error Cause */
 		cause_code = ntohs(cause->code);
 		cause_length = ntohs(cause->length);
 		if ((cause_length > remaining_length) || (cause_length == 0)) {
 			/* Invalid cause length, possibly due to truncation. */
 			SCTPDBG(SCTP_DEBUG_INPUT1, "Bogus length in cause - bytes left: %u cause length: %u\n",
 			    remaining_length, cause_length);
 			return (0);
 		}
 		if (code == 0) {
 			/* report the first error cause */
 			code = cause_code;
 		}
 		switch (cause_code) {
 		case SCTP_CAUSE_INVALID_STREAM:
 		case SCTP_CAUSE_MISSING_PARAM:
 		case SCTP_CAUSE_INVALID_PARAM:
 		case SCTP_CAUSE_NO_USER_DATA:
 			SCTPDBG(SCTP_DEBUG_INPUT1, "Software error we got a %u back? We have a bug :/ (or do they?)\n",
 			    cause_code);
 			break;
 		case SCTP_CAUSE_NAT_COLLIDING_STATE:
 			SCTPDBG(SCTP_DEBUG_INPUT2, "Received Colliding state, ERROR flags: %x\n",
 			    ch->chunk_flags);
 			/* A non-zero result ends processing of this chunk. */
 			if (sctp_handle_nat_colliding_state(stcb)) {
 				return (0);
 			}
 			break;
 		case SCTP_CAUSE_NAT_MISSING_STATE:
 			SCTPDBG(SCTP_DEBUG_INPUT2, "Received missing state, ERROR flags: %x\n",
 			    ch->chunk_flags);
 			/* A non-zero result ends processing of this chunk. */
 			if (sctp_handle_nat_missing_state(stcb, net)) {
 				return (0);
 			}
 			break;
 		case SCTP_CAUSE_STALE_COOKIE:
 			/*
 			 * We only act if we have echoed a cookie and are
 			 * waiting.
 			 */
 			if ((cause_length >= sizeof(struct sctp_error_stale_cookie)) &&
 			    (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) {
 				struct sctp_error_stale_cookie *stale_cookie;
 
 				stale_cookie = (struct sctp_error_stale_cookie *)cause;
 				asoc->cookie_preserve_req = ntohl(stale_cookie->stale_time);
 				/* Double it to be more robust on RTX */
 				/* Saturate at UINT32_MAX instead of wrapping. */
 				if (asoc->cookie_preserve_req <= UINT32_MAX / 2) {
 					asoc->cookie_preserve_req *= 2;
 				} else {
 					asoc->cookie_preserve_req = UINT32_MAX;
 				}
 				asoc->stale_cookie_count++;
 				if (asoc->stale_cookie_count >
 				    asoc->max_init_times) {
 					sctp_abort_notification(stcb, 0, 0, NULL, SCTP_SO_NOT_LOCKED);
 					/* now free the asoc */
 					(void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC,
 					    SCTP_FROM_SCTP_INPUT + SCTP_LOC_12);
 					return (-1);
 				}
 				/* blast back to INIT state */
 				sctp_toss_old_cookies(stcb, &stcb->asoc);
 				SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT);
 				sctp_stop_all_cookie_timers(stcb);
 				sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED);
 			}
 			break;
 		case SCTP_CAUSE_UNRESOLVABLE_ADDR:
 			/*
 			 * Nothing we can do here, we don't do hostname
 			 * addresses so if the peer does not like my IPv6
 			 * (or IPv4 for that matter) it does not matter. If
 			 * they don't support that type of address, they can
 			 * NOT possibly get that packet type... i.e. with no
 			 * IPv6 you can't receive a IPv6 packet. so we can
 			 * safely ignore this one. If we ever added support
 			 * for HOSTNAME Addresses, then we would need to do
 			 * something here.
 			 */
 			break;
 		case SCTP_CAUSE_UNRECOG_CHUNK:
 			/* Only look at the chunk header if the cause holds one. */
 			if (cause_length >= sizeof(struct sctp_error_unrecognized_chunk)) {
 				struct sctp_error_unrecognized_chunk *unrec_chunk;
 
 				unrec_chunk = (struct sctp_error_unrecognized_chunk *)cause;
 				sctp_process_unrecog_chunk(stcb, unrec_chunk->ch.chunk_type);
 			}
 			break;
 		case SCTP_CAUSE_UNRECOG_PARAM:
 			/* XXX: We only consider the first parameter */
 			if (cause_length >= sizeof(struct sctp_error_cause) + sizeof(struct sctp_paramhdr)) {
 				struct sctp_paramhdr *unrec_parameter;
 
 				/* The parameter header follows the cause header. */
 				unrec_parameter = (struct sctp_paramhdr *)(cause + 1);
 				sctp_process_unrecog_param(stcb, ntohs(unrec_parameter->param_type));
 			}
 			break;
 		case SCTP_CAUSE_COOKIE_IN_SHUTDOWN:
 			/*
 			 * We ignore this since the timer will drive out a
 			 * new cookie anyway and their timer will drive us
 			 * to send a SHUTDOWN_COMPLETE. We can't send one
 			 * here since we don't have their tag.
 			 */
 			break;
 		case SCTP_CAUSE_DELETING_LAST_ADDR:
 		case SCTP_CAUSE_RESOURCE_SHORTAGE:
 		case SCTP_CAUSE_DELETING_SRC_ADDR:
 			/*
 			 * We should NOT get these here, but in a
 			 * ASCONF-ACK.
 			 */
 			SCTPDBG(SCTP_DEBUG_INPUT2, "Peer sends ASCONF errors in a error cause with code %u.\n",
 			    cause_code);
 			break;
 		case SCTP_CAUSE_OUT_OF_RESC:
 			/*
 			 * And what, pray tell do we do with the fact that
 			 * the peer is out of resources? Not really sure we
 			 * could do anything but abort. I suspect this
 			 * should have come WITH an abort instead of in a
 			 * OP-ERROR.
 			 */
 			break;
 		default:
 			SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_handle_error: unknown code 0x%x\n",
 			    cause_code);
 			break;
 		}
 		/*
 		 * Advance by the SCTP_SIZE32()-adjusted cause length; clamp
 		 * the remaining length at zero so the loop terminates.
 		 */
 		adjust = SCTP_SIZE32(cause_length);
 		if (remaining_length >= adjust) {
 			remaining_length -= adjust;
 		} else {
 			remaining_length = 0;
 		}
 		cause = (struct sctp_error_cause *)((caddr_t)cause + adjust);
 	}
 	sctp_ulp_notify(SCTP_NOTIFY_REMOTE_ERROR, stcb, code, ch, SCTP_SO_NOT_LOCKED);
 	return (0);
 }
 
 /*
  * Handle a received INIT-ACK chunk.
  *
  * The chunk is validated first: minimum length, non-zero initiate tag,
  * a_rwnd of at least SCTP_MIN_RWND and non-zero inbound/outbound stream
  * counts.  On any violation the association is aborted with an
  * SCTP_CAUSE_INVALID_PARAM cause, *abort_no_unlock is set to 1 and -1 is
  * returned.
  *
  * In COOKIE-WAIT the parameters are processed by sctp_process_init_ack(),
  * the state moves to COOKIE-ECHOED and the COOKIE timer is started.  In
  * SHUTDOWN-SENT, COOKIE-ECHOED and OPEN the chunk is discarded and 0 is
  * returned.
  *
  * Returns 0 on success or harmless discard; -1 when stcb is NULL, when
  * validation or sctp_process_init_ack() fails, or when the association is
  * in any other state.
  */
 static int
 sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset,
     struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh,
     struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb,
     struct sctp_nets *net, int *abort_no_unlock,
     uint8_t mflowtype, uint32_t mflowid,
     uint32_t vrf_id)
 {
 	struct sctp_init_ack *init_ack;
 	struct mbuf *op_err;
 
 	SCTPDBG(SCTP_DEBUG_INPUT2,
 	    "sctp_handle_init_ack: handling INIT-ACK\n");
 
 	if (stcb == NULL) {
 		SCTPDBG(SCTP_DEBUG_INPUT2,
 		    "sctp_handle_init_ack: TCB is null\n");
 		return (-1);
 	}
 	if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_ack_chunk)) {
 		/* Invalid length */
 		op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
 		sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen,
 		    src, dst, sh, op_err,
 		    mflowtype, mflowid,
 		    vrf_id, net->port);
 		*abort_no_unlock = 1;
 		return (-1);
 	}
 	init_ack = &cp->init;
 	/* validate parameters */
 	if (init_ack->initiate_tag == 0) {
 		/* protocol error... send an abort */
 		op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
 		sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen,
 		    src, dst, sh, op_err,
 		    mflowtype, mflowid,
 		    vrf_id, net->port);
 		*abort_no_unlock = 1;
 		return (-1);
 	}
 	if (ntohl(init_ack->a_rwnd) < SCTP_MIN_RWND) {
 		/* protocol error... send an abort */
 		op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
 		sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen,
 		    src, dst, sh, op_err,
 		    mflowtype, mflowid,
 		    vrf_id, net->port);
 		*abort_no_unlock = 1;
 		return (-1);
 	}
 	if (init_ack->num_inbound_streams == 0) {
 		/* protocol error... send an abort */
 		op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
 		sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen,
 		    src, dst, sh, op_err,
 		    mflowtype, mflowid,
 		    vrf_id, net->port);
 		*abort_no_unlock = 1;
 		return (-1);
 	}
 	if (init_ack->num_outbound_streams == 0) {
 		/* protocol error... send an abort */
 		op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
 		sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen,
 		    src, dst, sh, op_err,
 		    mflowtype, mflowid,
 		    vrf_id, net->port);
 		*abort_no_unlock = 1;
 		return (-1);
 	}
 	/* process according to association state... */
 	switch (SCTP_GET_STATE(stcb)) {
 	case SCTP_STATE_COOKIE_WAIT:
 		/* this is the expected state for this chunk */
 		/* process the INIT-ACK parameters */
 		if (stcb->asoc.primary_destination->dest_state &
 		    SCTP_ADDR_UNCONFIRMED) {
 			/*
 			 * The primary is where we sent the INIT, we can
 			 * always consider it confirmed when the INIT-ACK is
 			 * returned. Do this before we load addresses
 			 * though.
 			 */
 			stcb->asoc.primary_destination->dest_state &=
 			    ~SCTP_ADDR_UNCONFIRMED;
 			sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED,
 			    stcb, 0, (void *)stcb->asoc.primary_destination, SCTP_SO_NOT_LOCKED);
 		}
 		if (sctp_process_init_ack(m, iphlen, offset, src, dst, sh, cp, stcb,
 		    net, abort_no_unlock,
 		    mflowtype, mflowid,
 		    vrf_id) < 0) {
 			/* error in parsing parameters */
 			return (-1);
 		}
 		/* update our state */
 		SCTPDBG(SCTP_DEBUG_INPUT2, "moving to COOKIE-ECHOED state\n");
 		SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_ECHOED);
 
 		/* reset the RTO calc */
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
 			sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
 			    stcb->asoc.overall_error_count,
 			    0,
 			    SCTP_FROM_SCTP_INPUT,
 			    __LINE__);
 		}
 		stcb->asoc.overall_error_count = 0;
 		(void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered);
 		/*
 		 * collapse the init timer back in case of an exponential
 		 * backoff
 		 */
 		sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, stcb->sctp_ep,
 		    stcb, net);
 		/*
 		 * the send at the end of the inbound data processing will
 		 * cause the cookie to be sent
 		 */
 		break;
 	case SCTP_STATE_SHUTDOWN_SENT:
 		/* incorrect state... discard */
 		break;
 	case SCTP_STATE_COOKIE_ECHOED:
 		/* incorrect state... discard */
 		break;
 	case SCTP_STATE_OPEN:
 		/* incorrect state... discard */
 		break;
 	case SCTP_STATE_EMPTY:
 	case SCTP_STATE_INUSE:
 	default:
 		/* incorrect state... discard */
 		/* Unlike the cases above, this one reports failure. */
 		return (-1);
 		break;
 	}
 	SCTPDBG(SCTP_DEBUG_INPUT1, "Leaving handle-init-ack end\n");
 	return (0);
 }
 
 /*
  * Forward declaration: sctp_process_cookie_existing() below calls this
  * function (for peers with NAT support) ahead of its definition, which is
  * presumably further down in this file.
  */
 static struct sctp_tcb *
 sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
     struct sockaddr *src, struct sockaddr *dst,
     struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len,
     struct sctp_inpcb *inp, struct sctp_nets **netp,
     struct sockaddr *init_src, int *notification,
     int auth_skipped, uint32_t auth_offset, uint32_t auth_len,
     uint8_t mflowtype, uint32_t mflowid,
     uint32_t vrf_id, uint16_t port);
 
 
 /*
  * Handle a state cookie for an existing association.
  *
  * m:      input packet mbuf chain; assumes a pullup on the
  *         IP/SCTP/COOKIE-ECHO chunk.  Note: this is a "split" mbuf and the
  *         cookie signature does not exist.
  * offset: offset into the mbuf to the COOKIE-ECHO chunk.
  */
 static struct sctp_tcb *
 sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
     struct sockaddr *src, struct sockaddr *dst,
     struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len,
     struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets **netp,
     struct sockaddr *init_src, int *notification,
     int auth_skipped, uint32_t auth_offset, uint32_t auth_len,
     uint8_t mflowtype, uint32_t mflowid,
     uint32_t vrf_id, uint16_t port)
 {
 	struct sctp_association *asoc;
 	struct sctp_init_chunk *init_cp, init_buf;
 	struct sctp_init_ack_chunk *initack_cp, initack_buf;
 	struct sctp_nets *net;
 	struct mbuf *op_err;
 	struct timeval old;
 	int init_offset, initack_offset, i;
 	int retval;
 	int spec_flag = 0;
 	uint32_t how_indx;
 #if defined(SCTP_DETAILED_STR_STATS)
 	int j;
 #endif
 
 	net = *netp;
 	/* I know that the TCB is non-NULL from the caller */
 	asoc = &stcb->asoc;
 	for (how_indx = 0; how_indx < sizeof(asoc->cookie_how); how_indx++) {
 		if (asoc->cookie_how[how_indx] == 0)
 			break;
 	}
 	if (how_indx < sizeof(asoc->cookie_how)) {
 		asoc->cookie_how[how_indx] = 1;
 	}
 	if (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) {
 		/* SHUTDOWN came in after sending INIT-ACK */
 		sctp_send_shutdown_ack(stcb, stcb->asoc.primary_destination);
 		op_err = sctp_generate_cause(SCTP_CAUSE_COOKIE_IN_SHUTDOWN, "");
 		sctp_send_operr_to(src, dst, sh, cookie->peers_vtag, op_err,
 		    mflowtype, mflowid, inp->fibnum,
 		    vrf_id, net->port);
 		if (how_indx < sizeof(asoc->cookie_how))
 			asoc->cookie_how[how_indx] = 2;
 		return (NULL);
 	}
 	/*
 	 * find and validate the INIT chunk in the cookie (peer's info) the
 	 * INIT should start after the cookie-echo header struct (chunk
 	 * header, state cookie header struct)
 	 */
 	init_offset = offset += sizeof(struct sctp_cookie_echo_chunk);
 
 	init_cp = (struct sctp_init_chunk *)
 	    sctp_m_getptr(m, init_offset, sizeof(struct sctp_init_chunk),
 	    (uint8_t *)&init_buf);
 	if (init_cp == NULL) {
 		/* could not pull a INIT chunk in cookie */
 		return (NULL);
 	}
 	if (init_cp->ch.chunk_type != SCTP_INITIATION) {
 		return (NULL);
 	}
 	/*
 	 * find and validate the INIT-ACK chunk in the cookie (my info) the
 	 * INIT-ACK follows the INIT chunk
 	 */
 	initack_offset = init_offset + SCTP_SIZE32(ntohs(init_cp->ch.chunk_length));
 	initack_cp = (struct sctp_init_ack_chunk *)
 	    sctp_m_getptr(m, initack_offset, sizeof(struct sctp_init_ack_chunk),
 	    (uint8_t *)&initack_buf);
 	if (initack_cp == NULL) {
 		/* could not pull INIT-ACK chunk in cookie */
 		return (NULL);
 	}
 	if (initack_cp->ch.chunk_type != SCTP_INITIATION_ACK) {
 		return (NULL);
 	}
 	if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) &&
 	    (ntohl(init_cp->init.initiate_tag) == asoc->peer_vtag)) {
 		/*
 		 * case D in Section 5.2.4 Table 2: MMAA process accordingly
 		 * to get into the OPEN state
 		 */
 		if (ntohl(initack_cp->init.initial_tsn) != asoc->init_seq_number) {
 			/*-
 			 * Opps, this means that we somehow generated two vtag's
 			 * the same. I.e. we did:
 			 *  Us               Peer
 			 *   <---INIT(tag=a)------
 			 *   ----INIT-ACK(tag=t)-->
 			 *   ----INIT(tag=t)------> *1
 			 *   <---INIT-ACK(tag=a)---
                          *   <----CE(tag=t)------------- *2
 			 *
 			 * At point *1 we should be generating a different
 			 * tag t'. Which means we would throw away the CE and send
 			 * ours instead. Basically this is case C (throw away side).
 			 */
 			if (how_indx < sizeof(asoc->cookie_how))
 				asoc->cookie_how[how_indx] = 17;
 			return (NULL);
 
 		}
 		switch (SCTP_GET_STATE(stcb)) {
 		case SCTP_STATE_COOKIE_WAIT:
 		case SCTP_STATE_COOKIE_ECHOED:
 			/*
 			 * INIT was sent but got a COOKIE_ECHO with the
 			 * correct tags... just accept it...but we must
 			 * process the init so that we can make sure we have
 			 * the right seq no's.
 			 */
 			/* First we must process the INIT !! */
 			retval = sctp_process_init(init_cp, stcb);
 			if (retval < 0) {
 				if (how_indx < sizeof(asoc->cookie_how))
 					asoc->cookie_how[how_indx] = 3;
 				return (NULL);
 			}
 			/* we have already processed the INIT so no problem */
 			sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp,
 			    stcb, net,
 			    SCTP_FROM_SCTP_INPUT + SCTP_LOC_13);
 			sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp,
 			    stcb, net,
 			    SCTP_FROM_SCTP_INPUT + SCTP_LOC_14);
 			/* update current state */
 			if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)
 				SCTP_STAT_INCR_COUNTER32(sctps_activeestab);
 			else
 				SCTP_STAT_INCR_COUNTER32(sctps_collisionestab);
 
 			SCTP_SET_STATE(stcb, SCTP_STATE_OPEN);
 			if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) {
 				sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
 				    stcb->sctp_ep, stcb, NULL);
 			}
 			SCTP_STAT_INCR_GAUGE32(sctps_currestab);
 			sctp_stop_all_cookie_timers(stcb);
 			if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 			    (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) &&
 			    (!SCTP_IS_LISTENING(inp))) {
 				/*
 				 * Here is where collision would go if we
 				 * did a connect() and instead got a
 				 * init/init-ack/cookie done before the
 				 * init-ack came back..
 				 */
 				stcb->sctp_ep->sctp_flags |=
 				    SCTP_PCB_FLAGS_CONNECTED;
 				soisconnected(stcb->sctp_socket);
 			}
 			/* notify upper layer */
 			*notification = SCTP_NOTIFY_ASSOC_UP;
 			/*
 			 * since we did not send a HB make sure we don't
 			 * double things
 			 */
 			old.tv_sec = cookie->time_entered.tv_sec;
 			old.tv_usec = cookie->time_entered.tv_usec;
 			net->hb_responded = 1;
 			sctp_calculate_rto(stcb, asoc, net, &old,
 			    SCTP_RTT_FROM_NON_DATA);
 
 			if (stcb->asoc.sctp_autoclose_ticks &&
 			    (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE))) {
 				sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE,
 				    inp, stcb, NULL);
 			}
 			break;
 		default:
 			/*
 			 * we're in the OPEN state (or beyond), so peer must
 			 * have simply lost the COOKIE-ACK
 			 */
 			break;
 		}		/* end switch */
 		sctp_stop_all_cookie_timers(stcb);
 		/*
 		 * We ignore the return code here.. not sure if we should
 		 * somehow abort.. but we do have an existing asoc. This
 		 * really should not fail.
 		 */
 		if (sctp_load_addresses_from_init(stcb, m,
 		    init_offset + sizeof(struct sctp_init_chunk),
 		    initack_offset, src, dst, init_src, stcb->asoc.port)) {
 			if (how_indx < sizeof(asoc->cookie_how))
 				asoc->cookie_how[how_indx] = 4;
 			return (NULL);
 		}
 		/* respond with a COOKIE-ACK */
 		sctp_toss_old_cookies(stcb, asoc);
 		sctp_send_cookie_ack(stcb);
 		if (how_indx < sizeof(asoc->cookie_how))
 			asoc->cookie_how[how_indx] = 5;
 		return (stcb);
 	}
 
 	if (ntohl(initack_cp->init.initiate_tag) != asoc->my_vtag &&
 	    ntohl(init_cp->init.initiate_tag) == asoc->peer_vtag &&
 	    cookie->tie_tag_my_vtag == 0 &&
 	    cookie->tie_tag_peer_vtag == 0) {
 		/*
 		 * case C in Section 5.2.4 Table 2: XMOO silently discard
 		 */
 		if (how_indx < sizeof(asoc->cookie_how))
 			asoc->cookie_how[how_indx] = 6;
 		return (NULL);
 	}
 	/*
 	 * If nat support, and the below and stcb is established, send back
 	 * a ABORT(colliding state) if we are established.
 	 */
 	if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) &&
 	    (asoc->peer_supports_nat) &&
 	    ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) &&
 	    ((ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) ||
 	    (asoc->peer_vtag == 0)))) {
 		/*
 		 * Special case - Peer's support nat. We may have two init's
 		 * that we gave out the same tag on since one was not
 		 * established.. i.e. we get INIT from host-1 behind the nat
 		 * and we respond tag-a, we get a INIT from host-2 behind
 		 * the nat and we get tag-a again. Then we bring up host-1
 		 * (or 2's) assoc, Then comes the cookie from hsot-2 (or 1).
 		 * Now we have colliding state. We must send an abort here
 		 * with colliding state indication.
 		 */
 		op_err = sctp_generate_cause(SCTP_CAUSE_NAT_COLLIDING_STATE, "");
 		sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err,
 		    mflowtype, mflowid, inp->fibnum,
 		    vrf_id, port);
 		return (NULL);
 	}
 	if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) &&
 	    ((ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) ||
 	    (asoc->peer_vtag == 0))) {
 		/*
 		 * case B in Section 5.2.4 Table 2: MXAA or MOAA my info
 		 * should be ok, re-accept peer info
 		 */
 		if (ntohl(initack_cp->init.initial_tsn) != asoc->init_seq_number) {
 			/*
 			 * Extension of case C. If we hit this, then the
 			 * random number generator returned the same vtag
 			 * when we first sent our INIT-ACK and when we later
 			 * sent our INIT. The side with the seq numbers that
 			 * are different will be the one that normnally
 			 * would have hit case C. This in effect "extends"
 			 * our vtags in this collision case to be 64 bits.
 			 * The same collision could occur aka you get both
 			 * vtag and seq number the same twice in a row.. but
 			 * is much less likely. If it did happen then we
 			 * would proceed through and bring up the assoc.. we
 			 * may end up with the wrong stream setup however..
 			 * which would be bad.. but there is no way to
 			 * tell.. until we send on a stream that does not
 			 * exist :-)
 			 */
 			if (how_indx < sizeof(asoc->cookie_how))
 				asoc->cookie_how[how_indx] = 7;
 
 			return (NULL);
 		}
 		if (how_indx < sizeof(asoc->cookie_how))
 			asoc->cookie_how[how_indx] = 8;
 		sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net,
 		    SCTP_FROM_SCTP_INPUT + SCTP_LOC_15);
 		sctp_stop_all_cookie_timers(stcb);
 		/*
 		 * since we did not send a HB make sure we don't double
 		 * things
 		 */
 		net->hb_responded = 1;
 		if (stcb->asoc.sctp_autoclose_ticks &&
 		    sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) {
 			sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb,
 			    NULL);
 		}
 		asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd);
 		asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams);
 
 		if (ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) {
 			/*
 			 * Ok the peer probably discarded our data (if we
 			 * echoed a cookie+data). So anything on the
 			 * sent_queue should be marked for retransmit, we
 			 * may not get something to kick us so it COULD
 			 * still take a timeout to move these.. but it can't
 			 * hurt to mark them.
 			 */
 			struct sctp_tmit_chunk *chk;
 
 			TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) {
 				if (chk->sent < SCTP_DATAGRAM_RESEND) {
 					chk->sent = SCTP_DATAGRAM_RESEND;
 					sctp_flight_size_decrease(chk);
 					sctp_total_flight_decrease(stcb, chk);
 					sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
 					spec_flag++;
 				}
 			}
 
 		}
 		/* process the INIT info (peer's info) */
 		retval = sctp_process_init(init_cp, stcb);
 		if (retval < 0) {
 			if (how_indx < sizeof(asoc->cookie_how))
 				asoc->cookie_how[how_indx] = 9;
 			return (NULL);
 		}
 		if (sctp_load_addresses_from_init(stcb, m,
 		    init_offset + sizeof(struct sctp_init_chunk),
 		    initack_offset, src, dst, init_src, stcb->asoc.port)) {
 			if (how_indx < sizeof(asoc->cookie_how))
 				asoc->cookie_how[how_indx] = 10;
 			return (NULL);
 		}
 		if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) ||
 		    (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) {
 			*notification = SCTP_NOTIFY_ASSOC_UP;
 
 			if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 			    (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) &&
 			    (!SCTP_IS_LISTENING(inp))) {
 				stcb->sctp_ep->sctp_flags |=
 				    SCTP_PCB_FLAGS_CONNECTED;
 				soisconnected(stcb->sctp_socket);
 			}
 			if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)
 				SCTP_STAT_INCR_COUNTER32(sctps_activeestab);
 			else
 				SCTP_STAT_INCR_COUNTER32(sctps_collisionestab);
 			SCTP_STAT_INCR_GAUGE32(sctps_currestab);
 		} else if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) {
 			SCTP_STAT_INCR_COUNTER32(sctps_restartestab);
 		} else {
 			SCTP_STAT_INCR_COUNTER32(sctps_collisionestab);
 		}
 		SCTP_SET_STATE(stcb, SCTP_STATE_OPEN);
 		if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) {
 			sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
 			    stcb->sctp_ep, stcb, NULL);
 		}
 		sctp_stop_all_cookie_timers(stcb);
 		sctp_toss_old_cookies(stcb, asoc);
 		sctp_send_cookie_ack(stcb);
 		if (spec_flag) {
 			/*
 			 * only if we have retrans set do we do this. What
 			 * this call does is get only the COOKIE-ACK out and
 			 * then when we return the normal call to
 			 * sctp_chunk_output will get the retrans out behind
 			 * this.
 			 */
 			sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_COOKIE_ACK, SCTP_SO_NOT_LOCKED);
 		}
 		if (how_indx < sizeof(asoc->cookie_how))
 			asoc->cookie_how[how_indx] = 11;
 
 		return (stcb);
 	}
 	if ((ntohl(initack_cp->init.initiate_tag) != asoc->my_vtag &&
 	    ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) &&
 	    cookie->tie_tag_my_vtag == asoc->my_vtag_nonce &&
 	    cookie->tie_tag_peer_vtag == asoc->peer_vtag_nonce &&
 	    cookie->tie_tag_peer_vtag != 0) {
 		struct sctpasochead *head;
 
 		if (asoc->peer_supports_nat) {
 			/*
 			 * This is a gross gross hack. Just call the
 			 * cookie_new code since we are allowing a duplicate
 			 * association. I hope this works...
 			 */
 			return (sctp_process_cookie_new(m, iphlen, offset, src, dst,
 			    sh, cookie, cookie_len,
 			    inp, netp, init_src, notification,
 			    auth_skipped, auth_offset, auth_len,
 			    mflowtype, mflowid,
 			    vrf_id, port));
 		}
 		/*
 		 * case A in Section 5.2.4 Table 2: XXMM (peer restarted)
 		 */
 		/* temp code */
 		if (how_indx < sizeof(asoc->cookie_how))
 			asoc->cookie_how[how_indx] = 12;
 		sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net,
 		    SCTP_FROM_SCTP_INPUT + SCTP_LOC_16);
 		sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net,
 		    SCTP_FROM_SCTP_INPUT + SCTP_LOC_17);
 
 		/* notify upper layer */
 		*notification = SCTP_NOTIFY_ASSOC_RESTART;
 		atomic_add_int(&stcb->asoc.refcnt, 1);
 		if ((SCTP_GET_STATE(stcb) != SCTP_STATE_OPEN) &&
 		    (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) &&
 		    (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT)) {
 			SCTP_STAT_INCR_GAUGE32(sctps_currestab);
 		}
 		if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) {
 			SCTP_STAT_INCR_GAUGE32(sctps_restartestab);
 		} else if (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) {
 			SCTP_STAT_INCR_GAUGE32(sctps_collisionestab);
 		}
 		if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) {
 			SCTP_SET_STATE(stcb, SCTP_STATE_OPEN);
 			sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
 			    stcb->sctp_ep, stcb, NULL);
 
 		} else if (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) {
 			/* move to OPEN state, if not in SHUTDOWN_SENT */
 			SCTP_SET_STATE(stcb, SCTP_STATE_OPEN);
 		}
 		asoc->pre_open_streams =
 		    ntohs(initack_cp->init.num_outbound_streams);
 		asoc->init_seq_number = ntohl(initack_cp->init.initial_tsn);
 		asoc->sending_seq = asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number;
 		asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1;
 
 		asoc->asconf_seq_in = asoc->last_acked_seq = asoc->init_seq_number - 1;
 
 		asoc->str_reset_seq_in = asoc->init_seq_number;
 
 		asoc->advanced_peer_ack_point = asoc->last_acked_seq;
 		if (asoc->mapping_array) {
 			memset(asoc->mapping_array, 0,
 			    asoc->mapping_array_size);
 		}
 		if (asoc->nr_mapping_array) {
 			memset(asoc->nr_mapping_array, 0,
 			    asoc->mapping_array_size);
 		}
 		SCTP_TCB_UNLOCK(stcb);
 		SCTP_INP_INFO_WLOCK();
 		SCTP_INP_WLOCK(stcb->sctp_ep);
 		SCTP_TCB_LOCK(stcb);
 		atomic_add_int(&stcb->asoc.refcnt, -1);
 		/* send up all the data */
 		SCTP_TCB_SEND_LOCK(stcb);
 
-		sctp_report_all_outbound(stcb, 0, 1, SCTP_SO_LOCKED);
+		sctp_report_all_outbound(stcb, 0, SCTP_SO_LOCKED);
 		for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
 			stcb->asoc.strmout[i].chunks_on_queues = 0;
 #if defined(SCTP_DETAILED_STR_STATS)
 			for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) {
 				asoc->strmout[i].abandoned_sent[j] = 0;
 				asoc->strmout[i].abandoned_unsent[j] = 0;
 			}
 #else
 			asoc->strmout[i].abandoned_sent[0] = 0;
 			asoc->strmout[i].abandoned_unsent[0] = 0;
 #endif
 			stcb->asoc.strmout[i].sid = i;
 			stcb->asoc.strmout[i].next_mid_ordered = 0;
 			stcb->asoc.strmout[i].next_mid_unordered = 0;
 			stcb->asoc.strmout[i].last_msg_incomplete = 0;
 		}
 		/* process the INIT-ACK info (my info) */
 		asoc->my_vtag = ntohl(initack_cp->init.initiate_tag);
 		asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd);
 
 		/* pull from vtag hash */
 		LIST_REMOVE(stcb, sctp_asocs);
 		/* re-insert to new vtag position */
 		head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag,
 		    SCTP_BASE_INFO(hashasocmark))];
 		/*
 		 * put it in the bucket in the vtag hash of assoc's for the
 		 * system
 		 */
 		LIST_INSERT_HEAD(head, stcb, sctp_asocs);
 
 		SCTP_TCB_SEND_UNLOCK(stcb);
 		SCTP_INP_WUNLOCK(stcb->sctp_ep);
 		SCTP_INP_INFO_WUNLOCK();
 		asoc->total_flight = 0;
 		asoc->total_flight_count = 0;
 		/* process the INIT info (peer's info) */
 		retval = sctp_process_init(init_cp, stcb);
 		if (retval < 0) {
 			if (how_indx < sizeof(asoc->cookie_how))
 				asoc->cookie_how[how_indx] = 13;
 
 			return (NULL);
 		}
 		/*
 		 * since we did not send a HB make sure we don't double
 		 * things
 		 */
 		net->hb_responded = 1;
 
 		if (sctp_load_addresses_from_init(stcb, m,
 		    init_offset + sizeof(struct sctp_init_chunk),
 		    initack_offset, src, dst, init_src, stcb->asoc.port)) {
 			if (how_indx < sizeof(asoc->cookie_how))
 				asoc->cookie_how[how_indx] = 14;
 
 			return (NULL);
 		}
 		/* respond with a COOKIE-ACK */
 		sctp_stop_all_cookie_timers(stcb);
 		sctp_toss_old_cookies(stcb, asoc);
 		sctp_send_cookie_ack(stcb);
 		if (how_indx < sizeof(asoc->cookie_how))
 			asoc->cookie_how[how_indx] = 15;
 
 		return (stcb);
 	}
 	if (how_indx < sizeof(asoc->cookie_how))
 		asoc->cookie_how[how_indx] = 16;
 	/* all other cases... */
 	return (NULL);
 }
 
 
 /*
  * Handle a state cookie for a new association, i.e. build a TCB from the
  * INIT and INIT-ACK chunks embedded in a received COOKIE-ECHO.
  *
  * m:            input packet mbuf chain -- assumes a pullup on the
  *               IP/SCTP/COOKIE-ECHO chunk. Note: this is a "split" mbuf and
  *               the cookie signature does not exist in it.
  * offset:       offset into the mbuf chain of the COOKIE-ECHO chunk.
  * cookie_len:   length of the cookie chunk; offset + cookie_len is used as
  *               the end of the INIT-ACK parameters (initack_limit).
  * init_src:     where the INIT was from; used as the peer address when the
  *               association is allocated.
  * netp:         if non-NULL, *netp is set to the sctp_nets entry matching
  *               init_src in the new association.
  * notification: set to SCTP_NOTIFY_ASSOC_UP once the association is OPEN.
  *
  * Returns the new TCB, or NULL on failure. Every failure after the TCB was
  * allocated releases it with sctp_free_assoc() before returning.
  */
 static struct sctp_tcb *
 sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
     struct sockaddr *src, struct sockaddr *dst,
     struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len,
     struct sctp_inpcb *inp, struct sctp_nets **netp,
     struct sockaddr *init_src, int *notification,
     int auth_skipped, uint32_t auth_offset, uint32_t auth_len,
     uint8_t mflowtype, uint32_t mflowid,
     uint32_t vrf_id, uint16_t port)
 {
 	struct sctp_tcb *stcb;
 	struct sctp_init_chunk *init_cp, init_buf;
 	struct sctp_init_ack_chunk *initack_cp, initack_buf;
 	union sctp_sockstore store;
 	struct sctp_association *asoc;
 	int init_offset, initack_offset, initack_limit;
 	int retval;
 	int error = 0;
 	uint8_t auth_chunk_buf[SCTP_CHUNK_BUFFER_SIZE];
 
 	/*
 	 * find and validate the INIT chunk in the cookie (peer's info) the
 	 * INIT should start after the cookie-echo header struct (chunk
 	 * header, state cookie header struct)
 	 */
 	init_offset = offset + sizeof(struct sctp_cookie_echo_chunk);
 	init_cp = (struct sctp_init_chunk *)
 	    sctp_m_getptr(m, init_offset, sizeof(struct sctp_init_chunk),
 	    (uint8_t *)&init_buf);
 	if (init_cp == NULL) {
 		/* could not pull a INIT chunk in cookie */
 		SCTPDBG(SCTP_DEBUG_INPUT1,
 		    "process_cookie_new: could not pull INIT chunk hdr\n");
 		return (NULL);
 	}
 	if (init_cp->ch.chunk_type != SCTP_INITIATION) {
 		SCTPDBG(SCTP_DEBUG_INPUT1, "HUH? process_cookie_new: could not find INIT chunk!\n");
 		return (NULL);
 	}
 	initack_offset = init_offset + SCTP_SIZE32(ntohs(init_cp->ch.chunk_length));
 	/*
 	 * find and validate the INIT-ACK chunk in the cookie (my info) the
 	 * INIT-ACK follows the INIT chunk
 	 */
 	initack_cp = (struct sctp_init_ack_chunk *)
 	    sctp_m_getptr(m, initack_offset, sizeof(struct sctp_init_ack_chunk),
 	    (uint8_t *)&initack_buf);
 	if (initack_cp == NULL) {
 		/* could not pull INIT-ACK chunk in cookie */
 		SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: could not pull INIT-ACK chunk hdr\n");
 		return (NULL);
 	}
 	if (initack_cp->ch.chunk_type != SCTP_INITIATION_ACK) {
 		return (NULL);
 	}
 	/*
 	 * NOTE: We can't use the INIT_ACK's chk_length to determine the
 	 * "initack_limit" value.  This is because the chk_length field
 	 * includes the length of the cookie, but the cookie is omitted when
 	 * the INIT and INIT_ACK are tacked onto the cookie...
 	 */
 	initack_limit = offset + cookie_len;
 
 	/*
 	 * now that we know the INIT/INIT-ACK are in place, create a new TCB
 	 * and populate
 	 */
 
 	/*
 	 * Here we do a trick, we set in NULL for the proc/thread argument.
 	 * We do this since in effect we only use the p argument when the
 	 * socket is unbound and we must do an implicit bind. Since we are
 	 * getting a cookie, we cannot be unbound.
 	 */
 	stcb = sctp_aloc_assoc(inp, init_src, &error,
 	    ntohl(initack_cp->init.initiate_tag), vrf_id,
 	    ntohs(initack_cp->init.num_outbound_streams),
 	    port,
 	    (struct thread *)NULL,
 	    SCTP_DONT_INITIALIZE_AUTH_PARAMS);
 	if (stcb == NULL) {
 		struct mbuf *op_err;
 
 		/* memory problem? */
 		SCTPDBG(SCTP_DEBUG_INPUT1,
 		    "process_cookie_new: no room for another TCB!\n");
 		op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, "");
 		sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen,
 		    src, dst, sh, op_err,
 		    mflowtype, mflowid,
 		    vrf_id, port);
 		return (NULL);
 	}
 	/* get the correct sctp_nets */
 	if (netp)
 		*netp = sctp_findnet(stcb, init_src);
 
 	asoc = &stcb->asoc;
 	/* get scope variables out of cookie */
 	asoc->scope.ipv4_local_scope = cookie->ipv4_scope;
 	asoc->scope.site_scope = cookie->site_scope;
 	asoc->scope.local_scope = cookie->local_scope;
 	asoc->scope.loopback_scope = cookie->loopback_scope;
 
 	if ((asoc->scope.ipv4_addr_legal != cookie->ipv4_addr_legal) ||
 	    (asoc->scope.ipv6_addr_legal != cookie->ipv6_addr_legal)) {
 		struct mbuf *op_err;
 
 		/*
 		 * Houston we have a problem. The EP changed while the
 		 * cookie was in flight. Only recourse is to abort the
 		 * association.
 		 */
 		op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, "");
 		sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen,
 		    src, dst, sh, op_err,
 		    mflowtype, mflowid,
 		    vrf_id, port);
 		(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 		    SCTP_FROM_SCTP_INPUT + SCTP_LOC_18);
 		return (NULL);
 	}
 	/* process the INIT-ACK info (my info) */
 	asoc->my_vtag = ntohl(initack_cp->init.initiate_tag);
 	asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd);
 	asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams);
 	asoc->init_seq_number = ntohl(initack_cp->init.initial_tsn);
 	asoc->sending_seq = asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number;
 	asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1;
 	asoc->asconf_seq_in = asoc->last_acked_seq = asoc->init_seq_number - 1;
 	asoc->str_reset_seq_in = asoc->init_seq_number;
 
 	asoc->advanced_peer_ack_point = asoc->last_acked_seq;
 
 	/*
 	 * process the INIT info (peer's info)
 	 *
 	 * NOTE(review): sctp_process_init() is skipped entirely when netp is
 	 * NULL; the reason is not evident from this function -- confirm
 	 * against the callers.
 	 */
 	if (netp)
 		retval = sctp_process_init(init_cp, stcb);
 	else
 		retval = 0;
 	if (retval < 0) {
 		(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 		    SCTP_FROM_SCTP_INPUT + SCTP_LOC_19);
 		return (NULL);
 	}
 	/* load all addresses */
 	if (sctp_load_addresses_from_init(stcb, m,
 	    init_offset + sizeof(struct sctp_init_chunk), initack_offset,
 	    src, dst, init_src, port)) {
 		(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 		    SCTP_FROM_SCTP_INPUT + SCTP_LOC_20);
 		return (NULL);
 	}
 	/*
 	 * verify any preceding AUTH chunk that was skipped
 	 */
 	/* pull the local authentication parameters from the cookie/init-ack */
 	sctp_auth_get_cookie_params(stcb, m,
 	    initack_offset + sizeof(struct sctp_init_ack_chunk),
 	    initack_limit - (initack_offset + sizeof(struct sctp_init_ack_chunk)));
 	if (auth_skipped) {
 		struct sctp_auth_chunk *auth;
 
 		/* only copy the AUTH chunk if it fits the on-stack buffer */
 		if (auth_len <= SCTP_CHUNK_BUFFER_SIZE) {
 			auth = (struct sctp_auth_chunk *)sctp_m_getptr(m, auth_offset, auth_len, auth_chunk_buf);
 		} else {
 			auth = NULL;
 		}
 		if ((auth == NULL) || sctp_handle_auth(stcb, auth, m, auth_offset)) {
 			/* auth HMAC failed, dump the assoc and packet */
 			SCTPDBG(SCTP_DEBUG_AUTH1,
 			    "COOKIE-ECHO: AUTH failed\n");
 			(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 			    SCTP_FROM_SCTP_INPUT + SCTP_LOC_21);
 			return (NULL);
 		} else {
 			/* remaining chunks checked... good to go */
 			stcb->asoc.authenticated = 1;
 		}
 	}
 
 	/*
 	 * if we're doing ASCONFs, check to see if we have any new local
 	 * addresses that need to get added to the peer (eg. addresses
 	 * changed while cookie echo in flight).  This needs to be done
 	 * after we go to the OPEN state to do the correct asconf
 	 * processing. else, make sure we have the correct addresses in our
 	 * lists
 	 */
 
 	/*
 	 * Pull in local_address (our "from" address) from the cookie into
 	 * the 'store' union; it is handed to sctp_check_address_list() at
 	 * the end of this function.
 	 */
 	switch (cookie->laddr_type) {
 #ifdef INET
 	case SCTP_IPV4_ADDRESS:
 		/* source addr is IPv4 */
 		memset(&store.sin, 0, sizeof(struct sockaddr_in));
 		store.sin.sin_family = AF_INET;
 		store.sin.sin_len = sizeof(struct sockaddr_in);
 		store.sin.sin_addr.s_addr = cookie->laddress[0];
 		break;
 #endif
 #ifdef INET6
 	case SCTP_IPV6_ADDRESS:
 		/* source addr is IPv6 */
 		memset(&store.sin6, 0, sizeof(struct sockaddr_in6));
 		store.sin6.sin6_family = AF_INET6;
 		store.sin6.sin6_len = sizeof(struct sockaddr_in6);
 		store.sin6.sin6_scope_id = cookie->scope_id;
 		memcpy(&store.sin6.sin6_addr, cookie->laddress, sizeof(struct in6_addr));
 		break;
 #endif
 	default:
 		/* unknown/unsupported local address type: drop the new assoc */
 		(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 		    SCTP_FROM_SCTP_INPUT + SCTP_LOC_22);
 		return (NULL);
 	}
 
 	/* update current state */
 	SCTPDBG(SCTP_DEBUG_INPUT2, "moving to OPEN state\n");
 	SCTP_SET_STATE(stcb, SCTP_STATE_OPEN);
 	if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) {
 		sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
 		    stcb->sctp_ep, stcb, NULL);
 	}
 	sctp_stop_all_cookie_timers(stcb);
 	SCTP_STAT_INCR_COUNTER32(sctps_passiveestab);
 	SCTP_STAT_INCR_GAUGE32(sctps_currestab);
 
 	/* set up to notify upper layer */
 	*notification = SCTP_NOTIFY_ASSOC_UP;
 	if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 	    (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) &&
 	    (!SCTP_IS_LISTENING(inp))) {
 		/*
 		 * This is an endpoint that called connect() how it got a
 		 * cookie that is NEW is a bit of a mystery. It must be that
 		 * the INIT was sent, but before it got there.. a complete
 		 * INIT/INIT-ACK/COOKIE arrived. But of course then it
 		 * should have went to the other code.. not here.. oh well..
 		 * a bit of protection is worth having..
 		 */
 		stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED;
 		soisconnected(stcb->sctp_socket);
 	} else if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
 	    (SCTP_IS_LISTENING(inp))) {
 		/*
 		 * We don't want to do anything with this one. Since it is
 		 * the listening guy. The timer will get started for
 		 * accepted connections in the caller.
 		 */
 		;
 	}
 	/* since we did not send a HB make sure we don't double things */
 	if ((netp) && (*netp))
 		(*netp)->hb_responded = 1;
 
 	if (stcb->asoc.sctp_autoclose_ticks &&
 	    sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) {
 		sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL);
 	}
 	(void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered);
 	if ((netp != NULL) && (*netp != NULL)) {
 		struct timeval old;
 
 		/*
 		 * calculate the RTT and set the encaps port; the start time
 		 * is the timestamp stored in the cookie
 		 */
 		old.tv_sec = cookie->time_entered.tv_sec;
 		old.tv_usec = cookie->time_entered.tv_usec;
 		sctp_calculate_rto(stcb, asoc, *netp, &old, SCTP_RTT_FROM_NON_DATA);
 	}
 	/* respond with a COOKIE-ACK */
 	sctp_send_cookie_ack(stcb);
 
 	/*
 	 * check the address lists for any ASCONFs that need to be sent
 	 * AFTER the cookie-ack is sent
 	 */
 	sctp_check_address_list(stcb, m,
 	    initack_offset + sizeof(struct sctp_init_ack_chunk),
 	    initack_limit - (initack_offset + sizeof(struct sctp_init_ack_chunk)),
 	    &store.sa, cookie->local_scope, cookie->site_scope,
 	    cookie->ipv4_scope, cookie->loopback_scope);
 
 
 	return (stcb);
 }
 
 /*
  * CODE LIKE THIS NEEDS TO RUN IF the peer supports the NAT extension, i.e.
  * we NEED to make sure we are not already using the vtag. If so, we
  * need to send back an ABORT-TRY-AGAIN-WITH-NEW-TAG. No middle box bit!
 	head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(tag,
 							    SCTP_BASE_INFO(hashasocmark))];
 	LIST_FOREACH(stcb, head, sctp_asocs) {
 	        if ((stcb->asoc.my_vtag == tag) && (stcb->rport == rport) && (inp == stcb->sctp_ep))  {
 		       -- SEND ABORT - TRY AGAIN --
 		}
 	}
 */
 
 /*
  * Handle a COOKIE-ECHO chunk.
  *
  * The state cookie carried in the chunk is validated (minimum length,
  * ports and verification tag against the SCTP header, HMAC signature,
  * lifetime). A valid cookie is then processed either as a new association
  * (sctp_process_cookie_new) or against an existing TCB
  * (sctp_process_cookie_existing). For endpoints flagged
  * SCTP_PCB_FLAGS_TCPTYPE a new socket is created with sonewconn() and the
  * association is moved onto it.
  *
  * stcb:       in/out; modified to either a new TCB or left as the existing
  *             (non-NULL) TCB.
  * inp_p:      in/out; switched to the new socket's endpoint when one is
  *             created. A NULL inp_p makes the function return NULL.
  * locked_tcb: in/out; set to the TCB found by address lookup when it was
  *             NULL on entry (collision case).
  *
  * Returns m on success and NULL when the cookie is rejected or an error
  * occurs.
  */
 static struct mbuf *
 sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
     struct sockaddr *src, struct sockaddr *dst,
     struct sctphdr *sh, struct sctp_cookie_echo_chunk *cp,
     struct sctp_inpcb **inp_p, struct sctp_tcb **stcb, struct sctp_nets **netp,
     int auth_skipped, uint32_t auth_offset, uint32_t auth_len,
     struct sctp_tcb **locked_tcb,
     uint8_t mflowtype, uint32_t mflowid,
     uint32_t vrf_id, uint16_t port)
 {
 	struct sctp_state_cookie *cookie;
 	struct sctp_tcb *l_stcb = *stcb;
 	struct sctp_inpcb *l_inp;
 	struct sockaddr *to;
 	struct sctp_pcb *ep;
 	struct mbuf *m_sig;
 	uint8_t calc_sig[SCTP_SIGNATURE_SIZE], tmp_sig[SCTP_SIGNATURE_SIZE];
 	uint8_t *sig;
 	uint8_t cookie_ok = 0;
 	unsigned int sig_offset, cookie_offset;
 	unsigned int cookie_len;
 	struct timeval now;
 	struct timeval time_expires;
 	int notification = 0;
 	struct sctp_nets *netl;
 	int had_a_existing_tcb = 0;
 	int send_int_conf = 0;
 #ifdef INET
 	struct sockaddr_in sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 sin6;
 #endif
 
 	SCTPDBG(SCTP_DEBUG_INPUT2,
 	    "sctp_handle_cookie: handling COOKIE-ECHO\n");
 
 	if (inp_p == NULL) {
 		return (NULL);
 	}
 	cookie = &cp->cookie;
 	cookie_offset = offset + sizeof(struct sctp_chunkhdr);
 	cookie_len = ntohs(cp->ch.chunk_length);
 
 	/*
 	 * The chunk must at least hold the cookie-echo header, an INIT, an
 	 * INIT-ACK and the trailing signature.
 	 */
 	if (cookie_len < sizeof(struct sctp_cookie_echo_chunk) +
 	    sizeof(struct sctp_init_chunk) +
 	    sizeof(struct sctp_init_ack_chunk) + SCTP_SIGNATURE_SIZE) {
 		/* cookie too small */
 		return (NULL);
 	}
 	if ((cookie->peerport != sh->src_port) ||
 	    (cookie->myport != sh->dest_port) ||
 	    (cookie->my_vtag != sh->v_tag)) {
 		/*
 		 * invalid ports or bad tag.  Note that we always leave the
 		 * v_tag in the header in network order and when we stored
 		 * it in the my_vtag slot we also left it in network order.
 		 * This maintains the match even though it may be in the
 		 * opposite byte order of the machine :->
 		 */
 		return (NULL);
 	}
 	/*
 	 * split off the signature into its own mbuf (since it should not be
 	 * calculated in the sctp_hmac_m() call).
 	 */
 	sig_offset = offset + cookie_len - SCTP_SIGNATURE_SIZE;
 	m_sig = m_split(m, sig_offset, M_NOWAIT);
 	if (m_sig == NULL) {
 		/* out of memory or ?? */
 		return (NULL);
 	}
 #ifdef SCTP_MBUF_LOGGING
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
 		sctp_log_mbc(m_sig, SCTP_MBUF_SPLIT);
 	}
 #endif
 
 	/*
 	 * compute the signature/digest for the cookie
 	 */
 	ep = &(*inp_p)->sctp_ep;
 	l_inp = *inp_p;
 	/*
 	 * The TCB lock (if held) is dropped around taking the INP read lock
 	 * and then re-taken -- presumably to respect the INP-before-TCB lock
 	 * order; confirm against the locking documentation.
 	 */
 	if (l_stcb) {
 		SCTP_TCB_UNLOCK(l_stcb);
 	}
 	SCTP_INP_RLOCK(l_inp);
 	if (l_stcb) {
 		SCTP_TCB_LOCK(l_stcb);
 	}
 	/* which cookie is it? */
 	if ((cookie->time_entered.tv_sec < (long)ep->time_of_secret_change) &&
 	    (ep->current_secret_number != ep->last_secret_number)) {
 		/* it's the old cookie */
 		(void)sctp_hmac_m(SCTP_HMAC,
 		    (uint8_t *)ep->secret_key[(int)ep->last_secret_number],
 		    SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0);
 	} else {
 		/* it's the current cookie */
 		(void)sctp_hmac_m(SCTP_HMAC,
 		    (uint8_t *)ep->secret_key[(int)ep->current_secret_number],
 		    SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0);
 	}
 	/* get the signature */
 	SCTP_INP_RUNLOCK(l_inp);
 	sig = (uint8_t *)sctp_m_getptr(m_sig, 0, SCTP_SIGNATURE_SIZE, (uint8_t *)&tmp_sig);
 	if (sig == NULL) {
 		/*
 		 * couldn't find signature
 		 *
 		 * NOTE(review): this path frees m_sig and returns without
 		 * re-attaching it, so m stays truncated at sig_offset.
 		 */
 		sctp_m_freem(m_sig);
 		return (NULL);
 	}
 	/* compare the received digest with the computed digest */
 	if (timingsafe_bcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) != 0) {
 		/*
 		 * try the old cookie?
 		 *
 		 * NOTE(review): the ep secret fields below are read after
 		 * SCTP_INP_RUNLOCK() above, i.e. without the INP read lock.
 		 */
 		if ((cookie->time_entered.tv_sec == (long)ep->time_of_secret_change) &&
 		    (ep->current_secret_number != ep->last_secret_number)) {
 			/* compute digest with old */
 			(void)sctp_hmac_m(SCTP_HMAC,
 			    (uint8_t *)ep->secret_key[(int)ep->last_secret_number],
 			    SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0);
 			/* compare */
 			if (timingsafe_bcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) == 0)
 				cookie_ok = 1;
 		}
 	} else {
 		cookie_ok = 1;
 	}
 
 	/*
 	 * Now before we continue we must reconstruct our mbuf so that
 	 * normal processing of any other chunks will work.
 	 */
 	{
 		struct mbuf *m_at;
 
 		/* walk to the last mbuf of m and append the signature mbuf */
 		m_at = m;
 		while (SCTP_BUF_NEXT(m_at) != NULL) {
 			m_at = SCTP_BUF_NEXT(m_at);
 		}
 		SCTP_BUF_NEXT(m_at) = m_sig;
 	}
 
 	if (cookie_ok == 0) {
 		SCTPDBG(SCTP_DEBUG_INPUT2, "handle_cookie_echo: cookie signature validation failed!\n");
 		SCTPDBG(SCTP_DEBUG_INPUT2,
 		    "offset = %u, cookie_offset = %u, sig_offset = %u\n",
 		    (uint32_t)offset, cookie_offset, sig_offset);
 		return (NULL);
 	}
 
 	/*
 	 * check the cookie timestamps to be sure it's not stale
 	 */
 	(void)SCTP_GETTIME_TIMEVAL(&now);
 	/* Expire time is in Ticks, so we convert to seconds */
 	time_expires.tv_sec = cookie->time_entered.tv_sec + sctp_ticks_to_secs(cookie->cookie_life);
 	time_expires.tv_usec = cookie->time_entered.tv_usec;
 	if (timevalcmp(&now, &time_expires, >)) {
 		/* cookie is stale! */
 		struct mbuf *op_err;
 		struct sctp_error_stale_cookie *cause;
 		struct timeval diff;
 		uint32_t staleness;
 
 		op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_error_stale_cookie),
 		    0, M_NOWAIT, 1, MT_DATA);
 		if (op_err == NULL) {
 			/* FOOBAR */
 			return (NULL);
 		}
 		/* Set the len */
 		SCTP_BUF_LEN(op_err) = sizeof(struct sctp_error_stale_cookie);
 		cause = mtod(op_err, struct sctp_error_stale_cookie *);
 		cause->cause.code = htons(SCTP_CAUSE_STALE_COOKIE);
 		cause->cause.length = htons((sizeof(struct sctp_paramhdr) +
 		    (sizeof(uint32_t))));
 		/*
 		 * staleness = (now - time_expires) in microseconds,
 		 * saturated at UINT32_MAX instead of wrapping
 		 */
 		diff = now;
 		timevalsub(&diff, &time_expires);
 		if ((uint32_t)diff.tv_sec > UINT32_MAX / 1000000) {
 			staleness = UINT32_MAX;
 		} else {
 			staleness = diff.tv_sec * 1000000;
 		}
 		if (UINT32_MAX - staleness >= (uint32_t)diff.tv_usec) {
 			staleness += diff.tv_usec;
 		} else {
 			staleness = UINT32_MAX;
 		}
 		cause->stale_time = htonl(staleness);
 		sctp_send_operr_to(src, dst, sh, cookie->peers_vtag, op_err,
 		    mflowtype, mflowid, l_inp->fibnum,
 		    vrf_id, port);
 		return (NULL);
 	}
 	/*
 	 * Now we must see with the lookup address if we have an existing
 	 * asoc. This will only happen if we were in the COOKIE-WAIT state
 	 * and a INIT collided with us and somewhere the peer sent the
 	 * cookie on another address besides the single address our assoc
 	 * had for him. In this case we will have one of the tie-tags set at
 	 * least AND the address field in the cookie can be used to look it
 	 * up.
 	 */
 	to = NULL;
 	switch (cookie->addr_type) {
 #ifdef INET6
 	case SCTP_IPV6_ADDRESS:
 		memset(&sin6, 0, sizeof(sin6));
 		sin6.sin6_family = AF_INET6;
 		sin6.sin6_len = sizeof(sin6);
 		sin6.sin6_port = sh->src_port;
 		sin6.sin6_scope_id = cookie->scope_id;
 		memcpy(&sin6.sin6_addr.s6_addr, cookie->address,
 		    sizeof(sin6.sin6_addr.s6_addr));
 		to = (struct sockaddr *)&sin6;
 		break;
 #endif
 #ifdef INET
 	case SCTP_IPV4_ADDRESS:
 		memset(&sin, 0, sizeof(sin));
 		sin.sin_family = AF_INET;
 		sin.sin_len = sizeof(sin);
 		sin.sin_port = sh->src_port;
 		sin.sin_addr.s_addr = cookie->address[0];
 		to = (struct sockaddr *)&sin;
 		break;
 #endif
 	default:
 		/* This should not happen */
 		return (NULL);
 	}
 	if (*stcb == NULL) {
 		/* Yep, lets check */
 		*stcb = sctp_findassociation_ep_addr(inp_p, to, netp, dst, NULL);
 		if (*stcb == NULL) {
 			/*
 			 * We should have only got back the same inp. If we
 			 * got back a different ep we have a problem. The
 			 * original findep got back l_inp and now
 			 */
 			if (l_inp != *inp_p) {
 				SCTP_PRINTF("Bad problem find_ep got a diff inp then special_locate?\n");
 			}
 		} else {
 			if (*locked_tcb == NULL) {
 				/*
 				 * In this case we found the assoc only
 				 * after we locked the create lock. This
 				 * means we are in a colliding case and we
 				 * must make sure that we unlock the tcb if
 				 * its one of the cases where we throw away
 				 * the incoming packets.
 				 */
 				*locked_tcb = *stcb;
 
 				/*
 				 * We must also increment the inp ref count
 				 * since the ref_count flags was set when we
 				 * did not find the TCB, now we found it
 				 * which reduces the refcount.. we must
 				 * raise it back out to balance it all :-)
 				 */
 				SCTP_INP_INCR_REF((*stcb)->sctp_ep);
 				if ((*stcb)->sctp_ep != l_inp) {
 					SCTP_PRINTF("Huh? ep:%p diff then l_inp:%p?\n",
 					    (void *)(*stcb)->sctp_ep, (void *)l_inp);
 				}
 			}
 		}
 	}
 
 	/* from here on cookie_len no longer includes the signature */
 	cookie_len -= SCTP_SIGNATURE_SIZE;
 	if (*stcb == NULL) {
 		/* this is the "normal" case... get a new TCB */
 		*stcb = sctp_process_cookie_new(m, iphlen, offset, src, dst, sh,
 		    cookie, cookie_len, *inp_p,
 		    netp, to, &notification,
 		    auth_skipped, auth_offset, auth_len,
 		    mflowtype, mflowid,
 		    vrf_id, port);
 	} else {
 		/* this is abnormal... cookie-echo on existing TCB */
 		had_a_existing_tcb = 1;
 		*stcb = sctp_process_cookie_existing(m, iphlen, offset,
 		    src, dst, sh,
 		    cookie, cookie_len, *inp_p, *stcb, netp, to,
 		    &notification, auth_skipped, auth_offset, auth_len,
 		    mflowtype, mflowid,
 		    vrf_id, port);
 	}
 
 	if (*stcb == NULL) {
 		/* still no TCB... must be bad cookie-echo */
 		return (NULL);
 	}
 	/* record the flow information of this packet on the path it used */
 	if (*netp != NULL) {
 		(*netp)->flowtype = mflowtype;
 		(*netp)->flowid = mflowid;
 	}
 	/*
 	 * Ok, we built an association so confirm the address we sent the
 	 * INIT-ACK to.
 	 */
 	netl = sctp_findnet(*stcb, to);
 	/*
 	 * This code should in theory NOT run but
 	 */
 	if (netl == NULL) {
 		/* TSNH! Huh, why do I need to add this address here? */
 		if (sctp_add_remote_addr(*stcb, to, NULL, port,
 		    SCTP_DONOT_SETSCOPE, SCTP_IN_COOKIE_PROC)) {
 			return (NULL);
 		}
 		netl = sctp_findnet(*stcb, to);
 	}
 	if (netl) {
 		if (netl->dest_state & SCTP_ADDR_UNCONFIRMED) {
 			/*
 			 * mark the address confirmed, make it primary and
 			 * remember to tell the ULP once notifications go out
 			 */
 			netl->dest_state &= ~SCTP_ADDR_UNCONFIRMED;
 			(void)sctp_set_primary_addr((*stcb), (struct sockaddr *)NULL,
 			    netl);
 			send_int_conf = 1;
 		}
 	}
 	sctp_start_net_timers(*stcb);
 	if ((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) {
 		if (!had_a_existing_tcb ||
 		    (((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0)) {
 			/*
 			 * If we have a NEW cookie or the connect never
 			 * reached the connected state during collision we
 			 * must do the TCP accept thing.
 			 */
 			struct socket *so, *oso;
 			struct sctp_inpcb *inp;
 
 			if (notification == SCTP_NOTIFY_ASSOC_RESTART) {
 				/*
 				 * For a restart we will keep the same
 				 * socket, no need to do anything. I THINK!!
 				 */
 				sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
 				if (send_int_conf) {
 					sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED,
 					    (*stcb), 0, (void *)netl, SCTP_SO_NOT_LOCKED);
 				}
 				return (m);
 			}
 			oso = (*inp_p)->sctp_socket;
 			/*
 			 * Hold a reference on the association while the TCB
 			 * lock is dropped around sonewconn().
 			 */
 			atomic_add_int(&(*stcb)->asoc.refcnt, 1);
 			SCTP_TCB_UNLOCK((*stcb));
 			CURVNET_SET(oso->so_vnet);
 			so = sonewconn(oso, 0
 			    );
 			CURVNET_RESTORE();
 			SCTP_TCB_LOCK((*stcb));
 			atomic_subtract_int(&(*stcb)->asoc.refcnt, 1);
 
 			if (so == NULL) {
 				struct mbuf *op_err;
 
 				/* Too many sockets */
 				SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: no room for another socket!\n");
 				op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, "");
 				sctp_abort_association(*inp_p, NULL, m, iphlen,
 				    src, dst, sh, op_err,
 				    mflowtype, mflowid,
 				    vrf_id, port);
 				(void)sctp_free_assoc(*inp_p, *stcb, SCTP_NORMAL_PROC,
 				    SCTP_FROM_SCTP_INPUT + SCTP_LOC_23);
 				return (NULL);
 			}
 			inp = (struct sctp_inpcb *)so->so_pcb;
 			SCTP_INP_INCR_REF(inp);
 			/*
 			 * We add the unbound flag here so that if we get an
 			 * soabort() before we get the move_pcb done, we
 			 * will properly cleanup.
 			 */
 			inp->sctp_flags = (SCTP_PCB_FLAGS_TCPTYPE |
 			    SCTP_PCB_FLAGS_CONNECTED |
 			    SCTP_PCB_FLAGS_IN_TCPPOOL |
 			    SCTP_PCB_FLAGS_UNBOUND |
 			    (SCTP_PCB_COPY_FLAGS & (*inp_p)->sctp_flags) |
 			    SCTP_PCB_FLAGS_DONT_WAKE);
 			/* inherit the listening endpoint's settings */
 			inp->sctp_features = (*inp_p)->sctp_features;
 			inp->sctp_mobility_features = (*inp_p)->sctp_mobility_features;
 			inp->sctp_socket = so;
 			inp->sctp_frag_point = (*inp_p)->sctp_frag_point;
 			inp->max_cwnd = (*inp_p)->max_cwnd;
 			inp->sctp_cmt_on_off = (*inp_p)->sctp_cmt_on_off;
 			inp->ecn_supported = (*inp_p)->ecn_supported;
 			inp->prsctp_supported = (*inp_p)->prsctp_supported;
 			inp->auth_supported = (*inp_p)->auth_supported;
 			inp->asconf_supported = (*inp_p)->asconf_supported;
 			inp->reconfig_supported = (*inp_p)->reconfig_supported;
 			inp->nrsack_supported = (*inp_p)->nrsack_supported;
 			inp->pktdrop_supported = (*inp_p)->pktdrop_supported;
 			inp->partial_delivery_point = (*inp_p)->partial_delivery_point;
 			inp->sctp_context = (*inp_p)->sctp_context;
 			inp->local_strreset_support = (*inp_p)->local_strreset_support;
 			inp->fibnum = (*inp_p)->fibnum;
 			inp->inp_starting_point_for_iterator = NULL;
 			/*
 			 * copy in the authentication parameters from the
 			 * original endpoint
 			 */
 			if (inp->sctp_ep.local_hmacs)
 				sctp_free_hmaclist(inp->sctp_ep.local_hmacs);
 			inp->sctp_ep.local_hmacs =
 			    sctp_copy_hmaclist((*inp_p)->sctp_ep.local_hmacs);
 			if (inp->sctp_ep.local_auth_chunks)
 				sctp_free_chunklist(inp->sctp_ep.local_auth_chunks);
 			inp->sctp_ep.local_auth_chunks =
 			    sctp_copy_chunklist((*inp_p)->sctp_ep.local_auth_chunks);
 
 			/*
 			 * Now we must move it from one hash table to
 			 * another and get the tcb in the right place.
 			 */
 
 			/*
 			 * This is where the one-2-one socket is put into
 			 * the accept state waiting for the accept!
 			 */
 			if (*stcb) {
 				SCTP_ADD_SUBSTATE(*stcb, SCTP_STATE_IN_ACCEPT_QUEUE);
 			}
 			sctp_move_pcb_and_assoc(*inp_p, inp, *stcb);
 
 			atomic_add_int(&(*stcb)->asoc.refcnt, 1);
 			SCTP_TCB_UNLOCK((*stcb));
 
 			sctp_pull_off_control_to_new_inp((*inp_p), inp, *stcb,
 			    0);
 			SCTP_TCB_LOCK((*stcb));
 			atomic_subtract_int(&(*stcb)->asoc.refcnt, 1);
 
 
 			/*
 			 * now we must check to see if we were aborted while
 			 * the move was going on and the lock/unlock
 			 * happened.
 			 */
 			if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
 				/*
 				 * yep it was, we leave the assoc attached
 				 * to the socket since the sctp_inpcb_free()
 				 * call will send an abort for us.
 				 */
 				SCTP_INP_DECR_REF(inp);
 				return (NULL);
 			}
 			SCTP_INP_DECR_REF(inp);
 			/* Switch over to the new guy */
 			*inp_p = inp;
 			sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
 			if (send_int_conf) {
 				sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED,
 				    (*stcb), 0, (void *)netl, SCTP_SO_NOT_LOCKED);
 			}
 
 			/*
 			 * Pull it from the incomplete queue and wake the
 			 * guy
 			 */
 			soisconnected(so);
 			return (m);
 		}
 	}
 	/* no new socket was needed: just deliver pending notifications */
 	if (notification) {
 		sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
 	}
 	if (send_int_conf) {
 		sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED,
 		    (*stcb), 0, (void *)netl, SCTP_SO_NOT_LOCKED);
 	}
 	return (m);
 }
 
 /*
  * Handle a COOKIE-ACK.
  *
  * cp is never dereferenced: callers may invoke this without an actual
  * COOKIE-ACK chunk. Nothing is done when stcb or net is NULL.
  *
  * If the association is in COOKIE-ECHOED it is moved to OPEN, the upper
  * layer is notified and the follow-up timers/ASCONF are kicked off (the
  * latter only while the socket has not been closed). In every state the
  * error counter is cleared, cookie timers are stopped, old cookies are
  * tossed and the SEND timer is restarted if sent data is outstanding.
  */
 static void
 sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp SCTP_UNUSED,
     struct sctp_tcb *stcb, struct sctp_nets *net)
 {
 	struct sctp_association *asoc;
 	struct sctp_tmit_chunk *pending;
 
 	SCTPDBG(SCTP_DEBUG_INPUT2,
 	    "sctp_handle_cookie_ack: handling COOKIE-ACK\n");
 	if (stcb == NULL || net == NULL) {
 		return;
 	}
 
 	asoc = &stcb->asoc;
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
 		sctp_misc_ints(SCTP_THRESHOLD_CLEAR, asoc->overall_error_count,
 		    0, SCTP_FROM_SCTP_INPUT, __LINE__);
 	}
 	asoc->overall_error_count = 0;
 	sctp_stop_all_cookie_timers(stcb);
 
 	/* the state transition only applies when we are in COOKIE-ECHOED */
 	if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) {
 		SCTPDBG(SCTP_DEBUG_INPUT2, "moving to OPEN state\n");
 		SCTP_SET_STATE(stcb, SCTP_STATE_OPEN);
 		sctp_start_net_timers(stcb);
 		if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) {
 			sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
 			    stcb->sctp_ep, stcb, NULL);
 		}
 
 		/* statistics, then RTO update from the handshake */
 		SCTP_STAT_INCR_COUNTER32(sctps_activeestab);
 		SCTP_STAT_INCR_GAUGE32(sctps_currestab);
 		if (asoc->overall_error_count == 0) {
 			sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered,
 			    SCTP_RTT_FROM_NON_DATA);
 		}
 		(void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered);
 		sctp_ulp_notify(SCTP_NOTIFY_ASSOC_UP, stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
 
 		/* one-to-one style endpoints become connected now */
 		if (stcb->sctp_ep->sctp_flags &
 		    (SCTP_PCB_FLAGS_TCPTYPE | SCTP_PCB_FLAGS_IN_TCPPOOL)) {
 			stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED;
 			if (!(asoc->state & SCTP_STATE_CLOSED_SOCKET)) {
 				soisconnected(stcb->sctp_socket);
 			}
 		}
 
 		/* no HB was sent, so make sure we don't double things */
 		net->hb_responded = 1;
 
 		/*
 		 * Heartbeat, autoclose and ASCONF handling are pointless
 		 * once the socket is closed, so only do them otherwise.
 		 */
 		if ((asoc->state & SCTP_STATE_CLOSED_SOCKET) == 0) {
 			sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT,
 			    stcb->sctp_ep, stcb, net);
 
 			if (asoc->sctp_autoclose_ticks &&
 			    sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_AUTOCLOSE)) {
 				sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE,
 				    stcb->sctp_ep, stcb, NULL);
 			}
 
 			/*
 			 * Send an ASCONF if parameters are queued and ASCONF
 			 * is permitted (e.g. addresses changed while the
 			 * INIT/COOKIE-ECHO were in flight).
 			 */
 			if (sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_DO_ASCONF) &&
 			    asoc->asconf_supported == 1 &&
 			    !TAILQ_EMPTY(&asoc->asconf_queue)) {
 #ifdef SCTP_TIMER_BASED_ASCONF
 				sctp_timer_start(SCTP_TIMER_TYPE_ASCONF,
 				    stcb->sctp_ep, stcb,
 				    asoc->primary_destination);
 #else
 				sctp_send_asconf(stcb, asoc->primary_destination,
 				    SCTP_ADDR_NOT_LOCKED);
 #endif
 			}
 		}
 	}
 
 	/* Toss the cookie if I can */
 	sctp_toss_old_cookies(stcb, asoc);
 
 	/*
 	 * Restart the SEND timer for the first sent chunk that has a
 	 * destination, if there is one.
 	 */
 	TAILQ_FOREACH(pending, &asoc->sent_queue, sctp_next) {
 		if (pending->whoTo != NULL) {
 			sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
 			    stcb, pending->whoTo);
 			break;
 		}
 	}
 }
 
 static void
 sctp_handle_ecn_echo(struct sctp_ecne_chunk *cp,
     struct sctp_tcb *stcb)
 {
 	struct sctp_nets *net;
 	struct sctp_tmit_chunk *lchk;
 	struct sctp_ecne_chunk bkup;
 	uint8_t override_bit;
 	uint32_t tsn, window_data_tsn;
 	int len;
 	unsigned int pkt_cnt;
 
 	len = ntohs(cp->ch.chunk_length);
 	if ((len != sizeof(struct sctp_ecne_chunk)) &&
 	    (len != sizeof(struct old_sctp_ecne_chunk))) {
 		return;
 	}
 	if (len == sizeof(struct old_sctp_ecne_chunk)) {
 		/* Its the old format */
 		memcpy(&bkup, cp, sizeof(struct old_sctp_ecne_chunk));
 		bkup.num_pkts_since_cwr = htonl(1);
 		cp = &bkup;
 	}
 	SCTP_STAT_INCR(sctps_recvecne);
 	tsn = ntohl(cp->tsn);
 	pkt_cnt = ntohl(cp->num_pkts_since_cwr);
 	lchk = TAILQ_LAST(&stcb->asoc.send_queue, sctpchunk_listhead);
 	if (lchk == NULL) {
 		window_data_tsn = stcb->asoc.sending_seq - 1;
 	} else {
 		window_data_tsn = lchk->rec.data.tsn;
 	}
 
 	/* Find where it was sent to if possible. */
 	net = NULL;
 	TAILQ_FOREACH(lchk, &stcb->asoc.sent_queue, sctp_next) {
 		if (lchk->rec.data.tsn == tsn) {
 			net = lchk->whoTo;
 			net->ecn_prev_cwnd = lchk->rec.data.cwnd_at_send;
 			break;
 		}
 		if (SCTP_TSN_GT(lchk->rec.data.tsn, tsn)) {
 			break;
 		}
 	}
 	if (net == NULL) {
 		/*
 		 * What to do. A previous send of a CWR was possibly lost.
 		 * See how old it is, we may have it marked on the actual
 		 * net.
 		 */
 		TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 			if (tsn == net->last_cwr_tsn) {
 				/* Found him, send it off */
 				break;
 			}
 		}
 		if (net == NULL) {
 			/*
 			 * If we reach here, we need to send a special CWR
 			 * that says hey, we did this a long time ago and
 			 * you lost the response.
 			 */
 			net = TAILQ_FIRST(&stcb->asoc.nets);
 			if (net == NULL) {
 				/* TSNH */
 				return;
 			}
 			override_bit = SCTP_CWR_REDUCE_OVERRIDE;
 		} else {
 			override_bit = 0;
 		}
 	} else {
 		override_bit = 0;
 	}
 	if (SCTP_TSN_GT(tsn, net->cwr_window_tsn) &&
 	    ((override_bit & SCTP_CWR_REDUCE_OVERRIDE) == 0)) {
 		/*
 		 * JRS - Use the congestion control given in the pluggable
 		 * CC module
 		 */
 		stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo(stcb, net, 0, pkt_cnt);
 		/*
 		 * We reduce once every RTT. So we will only lower cwnd at
 		 * the next sending seq i.e. the window_data_tsn
 		 */
 		net->cwr_window_tsn = window_data_tsn;
 		net->ecn_ce_pkt_cnt += pkt_cnt;
 		net->lost_cnt = pkt_cnt;
 		net->last_cwr_tsn = tsn;
 	} else {
 		override_bit |= SCTP_CWR_IN_SAME_WINDOW;
 		if (SCTP_TSN_GT(tsn, net->last_cwr_tsn) &&
 		    ((override_bit & SCTP_CWR_REDUCE_OVERRIDE) == 0)) {
 			/*
 			 * Another loss in the same window update how many
 			 * marks/packets lost we have had.
 			 */
 			int cnt = 1;
 
 			if (pkt_cnt > net->lost_cnt) {
 				/* Should be the case */
 				cnt = (pkt_cnt - net->lost_cnt);
 				net->ecn_ce_pkt_cnt += cnt;
 			}
 			net->lost_cnt = pkt_cnt;
 			net->last_cwr_tsn = tsn;
 			/*
 			 * Most CC functions will ignore this call, since we
 			 * are in-window yet of the initial CE the peer saw.
 			 */
 			stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo(stcb, net, 1, cnt);
 		}
 	}
 	/*
 	 * We always send a CWR this way if our previous one was lost our
 	 * peer will get an update, or if it is not time again to reduce we
 	 * still get the cwr to the peer. Note we set the override when we
 	 * could not find the TSN on the chunk or the destination network.
 	 */
 	sctp_send_cwr(stcb, net, net->last_cwr_tsn, override_bit);
 }
 
 static void
 sctp_handle_ecn_cwr(struct sctp_cwr_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net)
 {
 	/*
 	 * Here we get a CWR from the peer. We must look in the outqueue and
 	 * make sure that we have a covered ECNE in the control chunk part.
 	 * If so remove it.
 	 */
 	struct sctp_tmit_chunk *chk, *nchk;
 	struct sctp_ecne_chunk *ecne;
 	int override;
 	uint32_t cwr_tsn;
 
 	cwr_tsn = ntohl(cp->tsn);
 	override = cp->ch.chunk_flags & SCTP_CWR_REDUCE_OVERRIDE;
 	TAILQ_FOREACH_SAFE(chk, &stcb->asoc.control_send_queue, sctp_next, nchk) {
 		if (chk->rec.chunk_id.id != SCTP_ECN_ECHO) {
 			continue;
 		}
 		if ((override == 0) && (chk->whoTo != net)) {
 			/* Must be from the right src unless override is set */
 			continue;
 		}
 		ecne = mtod(chk->data, struct sctp_ecne_chunk *);
 		if (SCTP_TSN_GE(cwr_tsn, ntohl(ecne->tsn))) {
 			/* this covers this ECNE, we can remove it */
 			stcb->asoc.ecn_echo_cnt_onq--;
 			TAILQ_REMOVE(&stcb->asoc.control_send_queue, chk,
 			    sctp_next);
 			stcb->asoc.ctrl_queue_cnt--;
 			sctp_m_freem(chk->data);
 			chk->data = NULL;
 			sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
 			if (override == 0) {
 				break;
 			}
 		}
 	}
 }
 
 /*
  * Handle a SHUTDOWN-COMPLETE chunk for association stcb.
  *
  * A NULL stcb is ignored. If the association is not in the
  * SHUTDOWN-ACK-SENT state the chunk is ignored and the TCB is unlocked.
  * Otherwise the ULP is notified (when a socket is attached), the
  * SHUTDOWN-ACK timer is stopped and the association is freed.
  */
 static void
 sctp_handle_shutdown_complete(struct sctp_shutdown_complete_chunk *cp SCTP_UNUSED,
     struct sctp_tcb *stcb, struct sctp_nets *net)
 {
 	SCTPDBG(SCTP_DEBUG_INPUT2,
 	    "sctp_handle_shutdown_complete: handling SHUTDOWN-COMPLETE\n");
 	if (stcb == NULL) {
 		return;
 	}
 
 	if (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT) {
 		/* Not expecting a SHUTDOWN-COMPLETE in this state, drop it. */
 		SCTPDBG(SCTP_DEBUG_INPUT2,
 		    "sctp_handle_shutdown_complete: not in SCTP_STATE_SHUTDOWN_ACK_SENT --- ignore\n");
 		SCTP_TCB_UNLOCK(stcb);
 		return;
 	}
 
 	/* Tell the upper layer, provided there still is a socket. */
 	if (stcb->sctp_socket != NULL) {
 		sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
 	}
 #ifdef INVARIANTS
 	if (!TAILQ_EMPTY(&stcb->asoc.send_queue) ||
 	    !TAILQ_EMPTY(&stcb->asoc.sent_queue) ||
 	    sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED)) {
 		panic("Queues are not empty when handling SHUTDOWN-COMPLETE");
 	}
 #endif
 	/* The SHUTDOWN-ACK retransmission timer is no longer needed. */
 	sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, net,
 	    SCTP_FROM_SCTP_INPUT + SCTP_LOC_24);
 	SCTP_STAT_INCR_COUNTER32(sctps_shutdown);
 
 	/* Finally release the association. */
 	SCTPDBG(SCTP_DEBUG_INPUT2,
 	    "sctp_handle_shutdown_complete: calls free-asoc\n");
 	(void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC,
 	    SCTP_FROM_SCTP_INPUT + SCTP_LOC_25);
 }
 
 /*
  * React to a single chunk that has been reported as dropped.
  * (Presumably called while processing a PACKET-DROPPED report -- confirm
  * against the caller.)
  *
  * stcb - association the dropped chunk belongs to
  * desc - descriptor of the dropped chunk: its type, for DATA/I-DATA the
  *        TSN (network byte order) and the leading payload bytes
  * net  - destination used for the timer / resend calls below
  * flg  - report flags; SCTP_BADCRC and SCTP_FROM_MIDDLE_BOX are tested
  *
  * Depending on desc->chunk_type the matching chunk is marked for
  * retransmission or the control chunk is sent again; several types are
  * deliberately ignored.
  *
  * Returns -1 when the queued DATA chunk is too short or its payload does
  * not match the bytes in desc, and 0 in every other case.
  */
 static int
 process_chunk_drop(struct sctp_tcb *stcb, struct sctp_chunk_desc *desc,
     struct sctp_nets *net, uint8_t flg)
 {
 	switch (desc->chunk_type) {
 	case SCTP_DATA:
 	case SCTP_IDATA:
 		/* find the tsn to resend (possibly) */
 		{
 			uint32_t tsn;
 			struct sctp_tmit_chunk *tp1;
 
 			tsn = ntohl(desc->tsn_ifany);
 			/* First pass: stop as soon as a larger TSN is seen. */
 			TAILQ_FOREACH(tp1, &stcb->asoc.sent_queue, sctp_next) {
 				if (tp1->rec.data.tsn == tsn) {
 					/* found it */
 					break;
 				}
 				if (SCTP_TSN_GT(tp1->rec.data.tsn, tsn)) {
 					/* not found */
 					tp1 = NULL;
 					break;
 				}
 			}
 			if (tp1 == NULL) {
 				/*
 				 * Do it the other way , aka without paying
 				 * attention to queue seq order.
 				 */
 				SCTP_STAT_INCR(sctps_pdrpdnfnd);
 				TAILQ_FOREACH(tp1, &stcb->asoc.sent_queue, sctp_next) {
 					if (tp1->rec.data.tsn == tsn) {
 						/* found it */
 						break;
 					}
 				}
 			}
 			if (tp1 == NULL) {
 				SCTP_STAT_INCR(sctps_pdrptsnnf);
 			}
 			/* Only chunks that are not yet acked are considered. */
 			if ((tp1) && (tp1->sent < SCTP_DATAGRAM_ACKED)) {
 				/*
 				 * Neither a bad CRC nor a middle-box report:
 				 * nothing to do. Note that the early returns
 				 * below also skip the audit block.
 				 */
 				if (((flg & SCTP_BADCRC) == 0) &&
 				    ((flg & SCTP_FROM_MIDDLE_BOX) == 0)) {
 					return (0);
 				}
 				/*
 				 * With a zero peer rwnd nothing is marked; the
 				 * two cases differ only in the statistic.
 				 */
 				if ((stcb->asoc.peers_rwnd == 0) &&
 				    ((flg & SCTP_FROM_MIDDLE_BOX) == 0)) {
 					SCTP_STAT_INCR(sctps_pdrpdiwnp);
 					return (0);
 				}
 				if (stcb->asoc.peers_rwnd == 0 &&
 				    (flg & SCTP_FROM_MIDDLE_BOX)) {
 					SCTP_STAT_INCR(sctps_pdrpdizrw);
 					return (0);
 				}
 				/*
 				 * The first mbuf must hold the chunk header
 				 * plus the bytes that are about to be compared.
 				 */
 				if ((uint32_t)SCTP_BUF_LEN(tp1->data) <
 				    SCTP_DATA_CHUNK_OVERHEAD(stcb) + SCTP_NUM_DB_TO_VERIFY) {
 					/* Payload not matching. */
 					SCTP_STAT_INCR(sctps_pdrpbadd);
 					return (-1);
 				}
 				/* Compare the reported payload bytes with ours. */
 				if (memcmp(mtod(tp1->data, caddr_t)+SCTP_DATA_CHUNK_OVERHEAD(stcb),
 				    desc->data_bytes, SCTP_NUM_DB_TO_VERIFY) != 0) {
 					/* Payload not matching. */
 					SCTP_STAT_INCR(sctps_pdrpbadd);
 					return (-1);
 				}
 				if (tp1->do_rtt) {
 					/*
 					 * this guy had a RTO calculation
 					 * pending on it, cancel it
 					 */
 					if (tp1->whoTo->rto_needed == 0) {
 						tp1->whoTo->rto_needed = 1;
 					}
 					tp1->do_rtt = 0;
 				}
 				SCTP_STAT_INCR(sctps_pdrpmark);
 				/* Count it once; it may already be marked for resend. */
 				if (tp1->sent != SCTP_DATAGRAM_RESEND)
 					sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
 				/*
 				 * mark it as if we were doing a FR, since
 				 * we will be getting gap ack reports behind
 				 * the info from the router.
 				 */
 				tp1->rec.data.doing_fast_retransmit = 1;
 				/*
 				 * mark the tsn with what sequences can
 				 * cause a new FR.
 				 */
 				if (TAILQ_EMPTY(&stcb->asoc.send_queue)) {
 					tp1->rec.data.fast_retran_tsn = stcb->asoc.sending_seq;
 				} else {
 					tp1->rec.data.fast_retran_tsn = (TAILQ_FIRST(&stcb->asoc.send_queue))->rec.data.tsn;
 				}
 
 				/* restart the timer */
 				sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
 				    stcb, tp1->whoTo,
 				    SCTP_FROM_SCTP_INPUT + SCTP_LOC_26);
 				sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
 				    stcb, tp1->whoTo);
 
 				/* fix counts and things */
 				if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
 					sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PDRP,
 					    tp1->whoTo->flight_size,
 					    tp1->book_size,
 					    (uint32_t)(uintptr_t)stcb,
 					    tp1->rec.data.tsn);
 				}
 				/* Take it out of flight before marking it for resend. */
 				if (tp1->sent < SCTP_DATAGRAM_RESEND) {
 					sctp_flight_size_decrease(tp1);
 					sctp_total_flight_decrease(stcb, tp1);
 				}
 				tp1->sent = SCTP_DATAGRAM_RESEND;
 			} {
 				/*
 				 * This is a separate block, not an else: it
 				 * runs whether or not a chunk was marked above.
 				 */
 				/* audit code */
 				unsigned int audit;
 
 				audit = 0;
 				TAILQ_FOREACH(tp1, &stcb->asoc.sent_queue, sctp_next) {
 					if (tp1->sent == SCTP_DATAGRAM_RESEND)
 						audit++;
 				}
 				TAILQ_FOREACH(tp1, &stcb->asoc.control_send_queue,
 				    sctp_next) {
 					if (tp1->sent == SCTP_DATAGRAM_RESEND)
 						audit++;
 				}
 				if (audit != stcb->asoc.sent_queue_retran_cnt) {
 					SCTP_PRINTF("**Local Audit finds cnt:%d asoc cnt:%d\n",
 					    audit, stcb->asoc.sent_queue_retran_cnt);
 					/* Without the auditing option, repair the counter here. */
 #ifndef SCTP_AUDITING_ENABLED
 					stcb->asoc.sent_queue_retran_cnt = audit;
 #endif
 				}
 			}
 		}
 		break;
 	case SCTP_ASCONF:
 		{
 			struct sctp_tmit_chunk *asconf;
 
 			/* Find the first queued ASCONF and mark it for resend. */
 			TAILQ_FOREACH(asconf, &stcb->asoc.control_send_queue,
 			    sctp_next) {
 				if (asconf->rec.chunk_id.id == SCTP_ASCONF) {
 					break;
 				}
 			}
 			if (asconf) {
 				if (asconf->sent != SCTP_DATAGRAM_RESEND)
 					sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
 				asconf->sent = SCTP_DATAGRAM_RESEND;
 				/* The dropped transmission is taken back off the send count. */
 				asconf->snd_count--;
 			}
 		}
 		break;
 	case SCTP_INITIATION:
 		/* resend the INIT */
 		stcb->asoc.dropped_special_cnt++;
 		if (stcb->asoc.dropped_special_cnt < SCTP_RETRY_DROPPED_THRESH) {
 			/*
 			 * If we can get it in, in a few attempts we do
 			 * this, otherwise we let the timer fire.
 			 */
 			sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep,
 			    stcb, net,
 			    SCTP_FROM_SCTP_INPUT + SCTP_LOC_27);
 			sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED);
 		}
 		break;
 	case SCTP_SELECTIVE_ACK:
 	case SCTP_NR_SELECTIVE_ACK:
 		/* resend the sack */
 		sctp_send_sack(stcb, SCTP_SO_NOT_LOCKED);
 		break;
 	case SCTP_HEARTBEAT_REQUEST:
 		/* resend a demand HB */
 		if ((stcb->asoc.overall_error_count + 3) < stcb->asoc.max_send_times) {
 			/*
 			 * Only retransmit if we KNOW we wont destroy the
 			 * tcb
 			 */
 			sctp_send_hb(stcb, net, SCTP_SO_NOT_LOCKED);
 		}
 		break;
 	case SCTP_SHUTDOWN:
 		sctp_send_shutdown(stcb, net);
 		break;
 	case SCTP_SHUTDOWN_ACK:
 		sctp_send_shutdown_ack(stcb, net);
 		break;
 	case SCTP_COOKIE_ECHO:
 		{
 			struct sctp_tmit_chunk *cookie;
 
 			/*
 			 * NOTE(review): TAILQ_FOREACH assigns cookie itself,
 			 * so this initialization looks redundant.
 			 */
 			cookie = NULL;
 			TAILQ_FOREACH(cookie, &stcb->asoc.control_send_queue,
 			    sctp_next) {
 				if (cookie->rec.chunk_id.id == SCTP_COOKIE_ECHO) {
 					break;
 				}
 			}
 			/* Mark the queued COOKIE-ECHO for resend and stop its timers. */
 			if (cookie) {
 				if (cookie->sent != SCTP_DATAGRAM_RESEND)
 					sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
 				cookie->sent = SCTP_DATAGRAM_RESEND;
 				sctp_stop_all_cookie_timers(stcb);
 			}
 		}
 		break;
 	case SCTP_COOKIE_ACK:
 		sctp_send_cookie_ack(stcb);
 		break;
 	case SCTP_ASCONF_ACK:
 		/* resend last asconf ack */
 		sctp_send_asconf_ack(stcb);
 		break;
 	case SCTP_IFORWARD_CUM_TSN:
 	case SCTP_FORWARD_CUM_TSN:
 		send_forward_tsn(stcb, &stcb->asoc);
 		break;
 		/* can't do anything with these */
 	case SCTP_PACKET_DROPPED:
 	case SCTP_INITIATION_ACK:	/* this should not happen */
 	case SCTP_HEARTBEAT_ACK:
 	case SCTP_ABORT_ASSOCIATION:
 	case SCTP_OPERATION_ERROR:
 	case SCTP_SHUTDOWN_COMPLETE:
 	case SCTP_ECN_ECHO:
 	case SCTP_ECN_CWR:
 	default:
 		break;
 	}
 	return (0);
 }
 
 void
 sctp_reset_in_stream(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t *list)
 {
 	uint32_t i;
 	uint16_t temp;
 
 	/*
 	 * We set things to 0xffffffff since this is the last delivered
 	 * sequence and we will be sending in 0 after the reset.
 	 */
 
 	if (number_entries) {
 		for (i = 0; i < number_entries; i++) {
 			temp = ntohs(list[i]);
 			if (temp >= stcb->asoc.streamincnt) {
 				continue;
 			}
 			stcb->asoc.strmin[temp].last_mid_delivered = 0xffffffff;
 		}
 	} else {
 		list = NULL;
 		for (i = 0; i < stcb->asoc.streamincnt; i++) {
 			stcb->asoc.strmin[i].last_mid_delivered = 0xffffffff;
 		}
 	}
 	sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_RECV, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED);
 }
 
 static void
 sctp_reset_out_streams(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t *list)
 {
 	uint32_t i;
 	uint16_t temp;
 
 	if (number_entries > 0) {
 		for (i = 0; i < number_entries; i++) {
 			temp = ntohs(list[i]);
 			if (temp >= stcb->asoc.streamoutcnt) {
 				/* no such stream */
 				continue;
 			}
 			stcb->asoc.strmout[temp].next_mid_ordered = 0;
 			stcb->asoc.strmout[temp].next_mid_unordered = 0;
 		}
 	} else {
 		for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
 			stcb->asoc.strmout[i].next_mid_ordered = 0;
 			stcb->asoc.strmout[i].next_mid_unordered = 0;
 		}
 	}
 	sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_SEND, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED);
 }
 
 static void
 sctp_reset_clear_pending(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t *list)
 {
 	uint32_t i;
 	uint16_t temp;
 
 	if (number_entries > 0) {
 		for (i = 0; i < number_entries; i++) {
 			temp = ntohs(list[i]);
 			if (temp >= stcb->asoc.streamoutcnt) {
 				/* no such stream */
 				continue;
 			}
 			stcb->asoc.strmout[temp].state = SCTP_STREAM_OPEN;
 		}
 	} else {
 		for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
 			stcb->asoc.strmout[i].state = SCTP_STREAM_OPEN;
 		}
 	}
 }
 
 
 struct sctp_stream_reset_request *
 sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chunk **bchk)
 {
 	struct sctp_association *asoc;
 	struct sctp_chunkhdr *ch;
 	struct sctp_stream_reset_request *r;
 	struct sctp_tmit_chunk *chk;
 	int len, clen;
 
 	asoc = &stcb->asoc;
 	if (TAILQ_EMPTY(&stcb->asoc.control_send_queue)) {
 		asoc->stream_reset_outstanding = 0;
 		return (NULL);
 	}
 	if (stcb->asoc.str_reset == NULL) {
 		asoc->stream_reset_outstanding = 0;
 		return (NULL);
 	}
 	chk = stcb->asoc.str_reset;
 	if (chk->data == NULL) {
 		return (NULL);
 	}
 	if (bchk) {
 		/* he wants a copy of the chk pointer */
 		*bchk = chk;
 	}
 	clen = chk->send_size;
 	ch = mtod(chk->data, struct sctp_chunkhdr *);
 	r = (struct sctp_stream_reset_request *)(ch + 1);
 	if (ntohl(r->request_seq) == seq) {
 		/* found it */
 		return (r);
 	}
 	len = SCTP_SIZE32(ntohs(r->ph.param_length));
 	if (clen > (len + (int)sizeof(struct sctp_chunkhdr))) {
 		/* move to the next one, there can only be a max of two */
 		r = (struct sctp_stream_reset_request *)((caddr_t)r + len);
 		if (ntohl(r->request_seq) == seq) {
 			return (r);
 		}
 	}
 	/* that seq is not here */
 	return (NULL);
 }
 
 /*
  * Drop the queued stream reset chunk of the association, if there is one:
  * stop the stream reset timer, unlink the chunk from the control send
  * queue and free both its data and the chunk itself.
  */
 static void
 sctp_clean_up_stream_reset(struct sctp_tcb *stcb)
 {
 	struct sctp_association *asoc = &stcb->asoc;
 	struct sctp_tmit_chunk *reset_chk = asoc->str_reset;
 
 	if (reset_chk == NULL) {
 		/* No reset chunk queued, nothing to clean up. */
 		return;
 	}
 	asoc->str_reset = NULL;
 	sctp_timer_stop(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb,
 	    NULL, SCTP_FROM_SCTP_INPUT + SCTP_LOC_28);
 	TAILQ_REMOVE(&asoc->control_send_queue, reset_chk, sctp_next);
 	asoc->ctrl_queue_cnt--;
 	if (reset_chk->data != NULL) {
 		sctp_m_freem(reset_chk->data);
 		reset_chk->data = NULL;
 	}
 	sctp_free_a_chunk(stcb, reset_chk, SCTP_SO_NOT_LOCKED);
 }
 
 
 /*
  * Process the peer's response to one of our stream reset requests.
  *
  * stcb   - association the response arrived on
  * seq    - sequence number carried in the response
  * action - result code reported by the peer
  * respin - the response parameter itself; only dereferenced for a TSN
  *          (association reset) request
  *
  * Nothing is done when no request is outstanding (duplicate response).
  * If seq matches the next expected outgoing sequence number and the
  * matching request is still queued, the request is completed according
  * to its type and to action, and the ULP is notified. Once no request is
  * outstanding any more the queued reset chunk is released and a further
  * outgoing reset is sent if one is possible.
  *
  * Returns 1 if handling the implied FORWARD-TSN set the abort flag, and 0
  * otherwise.
  */
 static int
 sctp_handle_stream_reset_response(struct sctp_tcb *stcb,
     uint32_t seq, uint32_t action,
     struct sctp_stream_reset_response *respin)
 {
 	uint16_t type;
 	int lparam_len;
 	struct sctp_association *asoc = &stcb->asoc;
 	struct sctp_tmit_chunk *chk;
 	struct sctp_stream_reset_request *req_param;
 	struct sctp_stream_reset_out_request *req_out_param;
 	struct sctp_stream_reset_in_request *req_in_param;
 	uint32_t number_entries;
 
 	if (asoc->stream_reset_outstanding == 0) {
 		/* duplicate */
 		return (0);
 	}
 	if (seq == stcb->asoc.str_reset_seq_out) {
 		req_param = sctp_find_stream_reset(stcb, seq, &chk);
 		if (req_param != NULL) {
 			stcb->asoc.str_reset_seq_out++;
 			type = ntohs(req_param->ph.param_type);
 			lparam_len = ntohs(req_param->ph.param_length);
 			if (type == SCTP_STR_RESET_OUT_REQUEST) {
 				int no_clear = 0;
 
 				req_out_param = (struct sctp_stream_reset_out_request *)req_param;
 				/* Stream ids fill the rest of the parameter. */
 				number_entries = (lparam_len - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t);
 				asoc->stream_reset_out_is_outstanding = 0;
 				if (asoc->stream_reset_outstanding)
 					asoc->stream_reset_outstanding--;
 				if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) {
 					/* do it */
 					sctp_reset_out_streams(stcb, number_entries, req_out_param->list_of_streams);
 				} else if (action == SCTP_STREAM_RESET_RESULT_DENIED) {
 					sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_DENIED_OUT, stcb, number_entries, req_out_param->list_of_streams, SCTP_SO_NOT_LOCKED);
 				} else if (action == SCTP_STREAM_RESET_RESULT_IN_PROGRESS) {
 					/*
 					 * Set it up so we don't stop
 					 * retransmitting: undo the
 					 * bookkeeping done above.
 					 */
 					asoc->stream_reset_outstanding++;
 					stcb->asoc.str_reset_seq_out--;
 					asoc->stream_reset_out_is_outstanding = 1;
 					no_clear = 1;
 				} else {
 					sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_OUT, stcb, number_entries, req_out_param->list_of_streams, SCTP_SO_NOT_LOCKED);
 				}
 				/* Streams stay pending while the peer is still working. */
 				if (no_clear == 0) {
 					sctp_reset_clear_pending(stcb, number_entries, req_out_param->list_of_streams);
 				}
 			} else if (type == SCTP_STR_RESET_IN_REQUEST) {
 				req_in_param = (struct sctp_stream_reset_in_request *)req_param;
 				number_entries = (lparam_len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t);
 				if (asoc->stream_reset_outstanding)
 					asoc->stream_reset_outstanding--;
 				/* A successful result needs no notification here. */
 				if (action == SCTP_STREAM_RESET_RESULT_DENIED) {
 					sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_DENIED_IN, stcb,
 					    number_entries, req_in_param->list_of_streams, SCTP_SO_NOT_LOCKED);
 				} else if (action != SCTP_STREAM_RESET_RESULT_PERFORMED) {
 					sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_IN, stcb,
 					    number_entries, req_in_param->list_of_streams, SCTP_SO_NOT_LOCKED);
 				}
 			} else if (type == SCTP_STR_RESET_ADD_OUT_STREAMS) {
 				/* Ok we now may have more streams */
 				int num_stream;
 
 				num_stream = stcb->asoc.strm_pending_add_size;
 				/* Never go beyond the allocated stream array. */
 				if (num_stream > (stcb->asoc.strm_realoutsize - stcb->asoc.streamoutcnt)) {
 					/* TSNH */
 					num_stream = stcb->asoc.strm_realoutsize - stcb->asoc.streamoutcnt;
 				}
 				stcb->asoc.strm_pending_add_size = 0;
 				if (asoc->stream_reset_outstanding)
 					asoc->stream_reset_outstanding--;
 				if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) {
 					/* Put the new streams into effect */
 					int i;
 
 					for (i = asoc->streamoutcnt; i < (asoc->streamoutcnt + num_stream); i++) {
 						asoc->strmout[i].state = SCTP_STREAM_OPEN;
 					}
 					asoc->streamoutcnt += num_stream;
 					sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, 0);
 				} else if (action == SCTP_STREAM_RESET_RESULT_DENIED) {
 					sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt,
 					    SCTP_STREAM_CHANGE_DENIED);
 				} else {
 					sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt,
 					    SCTP_STREAM_CHANGE_FAILED);
 				}
 			} else if (type == SCTP_STR_RESET_ADD_IN_STREAMS) {
 				if (asoc->stream_reset_outstanding)
 					asoc->stream_reset_outstanding--;
 				if (action == SCTP_STREAM_RESET_RESULT_DENIED) {
 					sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt,
 					    SCTP_STREAM_CHANGE_DENIED);
 				} else if (action != SCTP_STREAM_RESET_RESULT_PERFORMED) {
 					sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt,
 					    SCTP_STREAM_CHANGE_FAILED);
 				}
 			} else if (type == SCTP_STR_RESET_TSN_REQUEST) {
 				/**
 				 * a) Adopt the new in tsn.
 				 * b) reset the map
 				 * c) Adopt the new out-tsn
 				 */
 				struct sctp_stream_reset_response_tsn *resp;
 				struct sctp_forward_tsn_chunk fwdtsn;
 				int abort_flag = 0;
 
 				if (respin == NULL) {
 					/* huh ? */
 					return (0);
 				}
 				/* The response must be long enough to carry both TSNs. */
 				if (ntohs(respin->ph.param_length) < sizeof(struct sctp_stream_reset_response_tsn)) {
 					return (0);
 				}
 				if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) {
 					resp = (struct sctp_stream_reset_response_tsn *)respin;
 					asoc->stream_reset_outstanding--;
 					/*
 					 * Build a local FORWARD-TSN that moves
 					 * the cumulative TSN to just before
 					 * the peer's next TSN. All header
 					 * fields are set so that no stack
 					 * garbage is handed to the callee.
 					 */
 					fwdtsn.ch.chunk_length = htons(sizeof(struct sctp_forward_tsn_chunk));
 					fwdtsn.ch.chunk_type = SCTP_FORWARD_CUM_TSN;
 					fwdtsn.ch.chunk_flags = 0;
 					fwdtsn.new_cumulative_tsn = htonl(ntohl(resp->senders_next_tsn) - 1);
 					sctp_handle_forward_tsn(stcb, &fwdtsn, &abort_flag, NULL, 0);
 					if (abort_flag) {
 						return (1);
 					}
 					stcb->asoc.highest_tsn_inside_map = (ntohl(resp->senders_next_tsn) - 1);
 					if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
 						sctp_log_map(0, 7, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT);
 					}
 
 					/* Restart both mapping arrays at the peer's next TSN. */
 					stcb->asoc.tsn_last_delivered = stcb->asoc.cumulative_tsn = stcb->asoc.highest_tsn_inside_map;
 					stcb->asoc.mapping_array_base_tsn = ntohl(resp->senders_next_tsn);
 					memset(stcb->asoc.mapping_array, 0, stcb->asoc.mapping_array_size);
 
 					stcb->asoc.highest_tsn_inside_nr_map = stcb->asoc.highest_tsn_inside_map;
 					memset(stcb->asoc.nr_mapping_array, 0, stcb->asoc.mapping_array_size);
 
 					/* Adopt the TSN the peer expects from us next. */
 					stcb->asoc.sending_seq = ntohl(resp->receivers_next_tsn);
 					stcb->asoc.last_acked_seq = stcb->asoc.cumulative_tsn;
 
 					sctp_reset_out_streams(stcb, 0, (uint16_t *)NULL);
 					sctp_reset_in_stream(stcb, 0, (uint16_t *)NULL);
 					sctp_notify_stream_reset_tsn(stcb, stcb->asoc.sending_seq, (stcb->asoc.mapping_array_base_tsn + 1), 0);
 				} else if (action == SCTP_STREAM_RESET_RESULT_DENIED) {
 					sctp_notify_stream_reset_tsn(stcb, stcb->asoc.sending_seq, (stcb->asoc.mapping_array_base_tsn + 1),
 					    SCTP_ASSOC_RESET_DENIED);
 				} else {
 					sctp_notify_stream_reset_tsn(stcb, stcb->asoc.sending_seq, (stcb->asoc.mapping_array_base_tsn + 1),
 					    SCTP_ASSOC_RESET_FAILED);
 				}
 			}
 			/* get rid of the request and get the request flags */
 			if (asoc->stream_reset_outstanding == 0) {
 				sctp_clean_up_stream_reset(stcb);
 			}
 		}
 	}
 	/* With nothing outstanding a deferred outgoing reset can go out. */
 	if (asoc->stream_reset_outstanding == 0) {
 		sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_NOT_LOCKED);
 	}
 	return (0);
 }
 
 /*
  * Handle an incoming-stream reset request from the peer, i.e. the peer
  * asks us to reset (some of) our outgoing streams.
  *
  * stcb  - association the request arrived on
  * chk   - response chunk the result parameter is appended to
  * req   - the request parameter; its stream list is converted to host
  *         byte order in place while it is validated
  * trunc - non-zero if the request exceeded our buffer size
  *
  * A request carrying the expected sequence number is evaluated and the
  * result remembered; for the two preceding sequence numbers the
  * remembered result is repeated; anything else is answered with
  * SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO. In all cases an outgoing reset
  * is then sent if that is possible.
  */
 static void
 sctp_handle_str_reset_request_in(struct sctp_tcb *stcb,
     struct sctp_tmit_chunk *chk,
     struct sctp_stream_reset_in_request *req, int trunc)
 {
 	uint32_t seq;
 	int len, i;
 	int number_entries;
 	uint16_t temp;
 
 	/*
 	 * peer wants me to send a str-reset to him for my outgoing seq's if
 	 * seq_in is right.
 	 */
 	struct sctp_association *asoc = &stcb->asoc;
 
 	seq = ntohl(req->request_seq);
 	if (asoc->str_reset_seq_in == seq) {
 		/* Keep the previous result around for retransmitted requests. */
 		asoc->last_reset_action[1] = asoc->last_reset_action[0];
 		if (!(asoc->local_strreset_support & SCTP_ENABLE_RESET_STREAM_REQ)) {
 			asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED;
 		} else if (trunc) {
 			/* Can't do it, since they exceeded our buffer size  */
 			asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED;
 		} else if (stcb->asoc.stream_reset_out_is_outstanding == 0) {
 			len = ntohs(req->ph.param_length);
 			/*
 			 * NOTE(review): assumes len is at least the fixed
 			 * size of the request -- presumably validated by the
 			 * caller, confirm.
 			 */
 			number_entries = ((len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t));
 			if (number_entries) {
 				/*
 				 * First pass: validate every stream id and
 				 * store it back in host byte order. One bad
 				 * id denies the whole request.
 				 */
 				for (i = 0; i < number_entries; i++) {
 					temp = ntohs(req->list_of_streams[i]);
 					if (temp >= stcb->asoc.streamoutcnt) {
 						asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED;
 						goto bad_boy;
 					}
 					req->list_of_streams[i] = temp;
 				}
 				/* Second pass: mark the open streams as reset pending. */
 				for (i = 0; i < number_entries; i++) {
 					if (stcb->asoc.strmout[req->list_of_streams[i]].state == SCTP_STREAM_OPEN) {
 						stcb->asoc.strmout[req->list_of_streams[i]].state = SCTP_STREAM_RESET_PENDING;
 					}
 				}
 			} else {
 				/* Its all */
 				for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
 					if (stcb->asoc.strmout[i].state == SCTP_STREAM_OPEN)
 						stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_PENDING;
 				}
 			}
 			asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED;
 		} else {
 			/* Can't do it, since we have sent one out */
 			asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_ERR_IN_PROGRESS;
 		}
 		/* Every outcome for the expected sequence number ends up here. */
 bad_boy:
 		sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
 		asoc->str_reset_seq_in++;
 	} else if (asoc->str_reset_seq_in - 1 == seq) {
 		/* Retransmission of the last request: repeat the last result. */
 		sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
 	} else if (asoc->str_reset_seq_in - 2 == seq) {
 		/* Retransmission of the request before that one. */
 		sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]);
 	} else {
 		sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO);
 	}
 	sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_NOT_LOCKED);
 }
 
 /*
  * Handle a TSN (association) reset request from the peer.
  *
  * stcb - association the request arrived on
  * chk  - response chunk the result parameter is appended to
  * req  - the request parameter
  *
  * A request carrying the expected sequence number is either denied (the
  * feature is not enabled locally) or performed: the receive side is
  * advanced via a locally built FORWARD-TSN, both mapping arrays are
  * restarted, the sending sequence is bumped and all streams are reset.
  * For the two preceding sequence numbers the remembered result is
  * repeated; anything else is answered with
  * SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO.
  *
  * Returns 1 if handling the FORWARD-TSN set the abort flag, and 0
  * otherwise.
  */
 static int
 sctp_handle_str_reset_request_tsn(struct sctp_tcb *stcb,
     struct sctp_tmit_chunk *chk,
     struct sctp_stream_reset_tsn_request *req)
 {
 	/* reset all in and out and update the tsn */
 	/*
 	 * A) reset my str-seq's on in and out. B) Select a receive next,
 	 * and set cum-ack to it. Also process this selected number as a
 	 * fwd-tsn as well. C) set in the response my next sending seq.
 	 */
 	struct sctp_forward_tsn_chunk fwdtsn;
 	struct sctp_association *asoc = &stcb->asoc;
 	int abort_flag = 0;
 	uint32_t seq;
 
 	seq = ntohl(req->request_seq);
 	if (asoc->str_reset_seq_in == seq) {
 		/* Keep the previous result around for retransmitted requests. */
 		asoc->last_reset_action[1] = stcb->asoc.last_reset_action[0];
 		if (!(asoc->local_strreset_support & SCTP_ENABLE_CHANGE_ASSOC_REQ)) {
 			asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED;
 		} else {
 			/* Local FORWARD-TSN to one past the highest TSN seen. */
 			fwdtsn.ch.chunk_length = htons(sizeof(struct sctp_forward_tsn_chunk));
 			fwdtsn.ch.chunk_type = SCTP_FORWARD_CUM_TSN;
 			fwdtsn.ch.chunk_flags = 0;
 			fwdtsn.new_cumulative_tsn = htonl(stcb->asoc.highest_tsn_inside_map + 1);
 			sctp_handle_forward_tsn(stcb, &fwdtsn, &abort_flag, NULL, 0);
 			if (abort_flag) {
 				return (1);
 			}
 			/* Jump the receive side ahead by the reset delta. */
 			asoc->highest_tsn_inside_map += SCTP_STREAM_RESET_TSN_DELTA;
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
 				sctp_log_map(0, 10, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT);
 			}
 			/* Restart both mapping arrays right after the new cum-ack. */
 			asoc->tsn_last_delivered = asoc->cumulative_tsn = asoc->highest_tsn_inside_map;
 			asoc->mapping_array_base_tsn = asoc->highest_tsn_inside_map + 1;
 			memset(asoc->mapping_array, 0, asoc->mapping_array_size);
 			asoc->highest_tsn_inside_nr_map = asoc->highest_tsn_inside_map;
 			memset(asoc->nr_mapping_array, 0, asoc->mapping_array_size);
 			atomic_add_int(&asoc->sending_seq, 1);
 			/* save off historical data for retrans */
 			asoc->last_sending_seq[1] = asoc->last_sending_seq[0];
 			asoc->last_sending_seq[0] = asoc->sending_seq;
 			asoc->last_base_tsnsent[1] = asoc->last_base_tsnsent[0];
 			asoc->last_base_tsnsent[0] = asoc->mapping_array_base_tsn;
 			sctp_reset_out_streams(stcb, 0, (uint16_t *)NULL);
 			sctp_reset_in_stream(stcb, 0, (uint16_t *)NULL);
 			asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED;
 			sctp_notify_stream_reset_tsn(stcb, asoc->sending_seq, (asoc->mapping_array_base_tsn + 1), 0);
 		}
 		/*
 		 * The TSN-style result is added on the denied path as well; in
 		 * that case it carries the previously saved values.
 		 */
 		sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[0],
 		    asoc->last_sending_seq[0], asoc->last_base_tsnsent[0]);
 		asoc->str_reset_seq_in++;
 	} else if (asoc->str_reset_seq_in - 1 == seq) {
 		/* Retransmission of the last request: repeat the last result. */
 		sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[0],
 		    asoc->last_sending_seq[0], asoc->last_base_tsnsent[0]);
 	} else if (asoc->str_reset_seq_in - 2 == seq) {
 		/* Retransmission of the request before that one. */
 		sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[1],
 		    asoc->last_sending_seq[1], asoc->last_base_tsnsent[1]);
 	} else {
 		sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO);
 	}
 	return (0);
 }
 
 static void
 sctp_handle_str_reset_request_out(struct sctp_tcb *stcb,
     struct sctp_tmit_chunk *chk,
     struct sctp_stream_reset_out_request *req, int trunc)
 {
 	uint32_t seq, tsn;
 	int number_entries, len;
 	struct sctp_association *asoc = &stcb->asoc;
 
 	seq = ntohl(req->request_seq);
 
 	/* now if its not a duplicate we process it */
 	if (asoc->str_reset_seq_in == seq) {
 		len = ntohs(req->ph.param_length);
 		number_entries = ((len - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t));
 		/*
 		 * the sender is resetting, handle the list issue.. we must
 		 * a) verify if we can do the reset, if so no problem b) If
 		 * we can't do the reset we must copy the request. c) queue
 		 * it, and setup the data in processor to trigger it off
 		 * when needed and dequeue all the queued data.
 		 */
 		tsn = ntohl(req->send_reset_at_tsn);
 
 		/* move the reset action back one */
 		asoc->last_reset_action[1] = asoc->last_reset_action[0];
 		if (!(asoc->local_strreset_support & SCTP_ENABLE_RESET_STREAM_REQ)) {
 			asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED;
 		} else if (trunc) {
 			asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED;
 		} else if (SCTP_TSN_GE(asoc->cumulative_tsn, tsn)) {
 			/* we can do it now */
 			sctp_reset_in_stream(stcb, number_entries, req->list_of_streams);
 			asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED;
 		} else {
 			/*
 			 * we must queue it up and thus wait for the TSN's
 			 * to arrive that are at or before tsn
 			 */
 			struct sctp_stream_reset_list *liste;
 			int siz;
 
 			siz = sizeof(struct sctp_stream_reset_list) + (number_entries * sizeof(uint16_t));
 			SCTP_MALLOC(liste, struct sctp_stream_reset_list *,
 			    siz, SCTP_M_STRESET);
 			if (liste == NULL) {
 				/* gak out of memory */
 				asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED;
 				sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
 				return;
 			}
 			liste->seq = seq;
 			liste->tsn = tsn;
 			liste->number_entries = number_entries;
 			memcpy(&liste->list_of_streams, req->list_of_streams, number_entries * sizeof(uint16_t));
 			TAILQ_INSERT_TAIL(&asoc->resetHead, liste, next_resp);
 			asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_IN_PROGRESS;
 		}
 		sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
 		asoc->str_reset_seq_in++;
 	} else if ((asoc->str_reset_seq_in - 1) == seq) {
 		/*
 		 * one seq back, just echo back last action since my
 		 * response was lost.
 		 */
 		sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
 	} else if ((asoc->str_reset_seq_in - 2) == seq) {
 		/*
 		 * two seq back, just echo back last action since my
 		 * response was lost.
 		 */
 		sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]);
 	} else {
 		sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO);
 	}
 }
 
 static void
 sctp_handle_str_reset_add_strm(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk,
     struct sctp_stream_reset_add_strm *str_add)
 {
 	/*
 	 * Peer is requesting to add more streams. If its within our
 	 * max-streams we will allow it.
 	 */
 	uint32_t num_stream, i;
 	uint32_t seq;
 	struct sctp_association *asoc = &stcb->asoc;
 	struct sctp_queued_to_read *ctl, *nctl;
 
 	/* Get the number. */
 	seq = ntohl(str_add->request_seq);
 	num_stream = ntohs(str_add->number_of_streams);
 	/* Now what would be the new total? */
 	if (asoc->str_reset_seq_in == seq) {
 		num_stream += stcb->asoc.streamincnt;
 		stcb->asoc.last_reset_action[1] = stcb->asoc.last_reset_action[0];
 		if (!(asoc->local_strreset_support & SCTP_ENABLE_CHANGE_ASSOC_REQ)) {
 			asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED;
 		} else if ((num_stream > stcb->asoc.max_inbound_streams) ||
 		    (num_stream > 0xffff)) {
 			/* We must reject it they ask for to many */
 	denied:
 			stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED;
 		} else {
 			/* Ok, we can do that :-) */
 			struct sctp_stream_in *oldstrm;
 
 			/* save off the old */
 			oldstrm = stcb->asoc.strmin;
 			SCTP_MALLOC(stcb->asoc.strmin, struct sctp_stream_in *,
 			    (num_stream * sizeof(struct sctp_stream_in)),
 			    SCTP_M_STRMI);
 			if (stcb->asoc.strmin == NULL) {
 				stcb->asoc.strmin = oldstrm;
 				goto denied;
 			}
 			/* copy off the old data */
 			for (i = 0; i < stcb->asoc.streamincnt; i++) {
 				TAILQ_INIT(&stcb->asoc.strmin[i].inqueue);
 				TAILQ_INIT(&stcb->asoc.strmin[i].uno_inqueue);
 				stcb->asoc.strmin[i].sid = i;
 				stcb->asoc.strmin[i].last_mid_delivered = oldstrm[i].last_mid_delivered;
 				stcb->asoc.strmin[i].delivery_started = oldstrm[i].delivery_started;
 				stcb->asoc.strmin[i].pd_api_started = oldstrm[i].pd_api_started;
 				/* now anything on those queues? */
 				TAILQ_FOREACH_SAFE(ctl, &oldstrm[i].inqueue, next_instrm, nctl) {
 					TAILQ_REMOVE(&oldstrm[i].inqueue, ctl, next_instrm);
 					TAILQ_INSERT_TAIL(&stcb->asoc.strmin[i].inqueue, ctl, next_instrm);
 				}
 				TAILQ_FOREACH_SAFE(ctl, &oldstrm[i].uno_inqueue, next_instrm, nctl) {
 					TAILQ_REMOVE(&oldstrm[i].uno_inqueue, ctl, next_instrm);
 					TAILQ_INSERT_TAIL(&stcb->asoc.strmin[i].uno_inqueue, ctl, next_instrm);
 				}
 			}
 			/* Init the new streams */
 			for (i = stcb->asoc.streamincnt; i < num_stream; i++) {
 				TAILQ_INIT(&stcb->asoc.strmin[i].inqueue);
 				TAILQ_INIT(&stcb->asoc.strmin[i].uno_inqueue);
 				stcb->asoc.strmin[i].sid = i;
 				stcb->asoc.strmin[i].last_mid_delivered = 0xffffffff;
 				stcb->asoc.strmin[i].pd_api_started = 0;
 				stcb->asoc.strmin[i].delivery_started = 0;
 			}
 			SCTP_FREE(oldstrm, SCTP_M_STRMI);
 			/* update the size */
 			stcb->asoc.streamincnt = num_stream;
 			stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED;
 			sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, 0);
 		}
 		sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
 		asoc->str_reset_seq_in++;
 	} else if ((asoc->str_reset_seq_in - 1) == seq) {
 		/*
 		 * one seq back, just echo back last action since my
 		 * response was lost.
 		 */
 		sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
 	} else if ((asoc->str_reset_seq_in - 2) == seq) {
 		/*
 		 * two seq back, just echo back last action since my
 		 * response was lost.
 		 */
 		sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]);
 	} else {
 		sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO);
 
 	}
 }
 
 static void
 sctp_handle_str_reset_add_out_strm(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk,
     struct sctp_stream_reset_add_strm *str_add)
 {
 	/*
 	 * Peer is requesting to add more streams. If its within our
 	 * max-streams we will allow it.
 	 */
 	uint16_t num_stream;
 	uint32_t seq;
 	struct sctp_association *asoc = &stcb->asoc;
 
 	/* Get the number. */
 	seq = ntohl(str_add->request_seq);
 	num_stream = ntohs(str_add->number_of_streams);
 	/* Now what would be the new total? */
 	if (asoc->str_reset_seq_in == seq) {
 		stcb->asoc.last_reset_action[1] = stcb->asoc.last_reset_action[0];
 		if (!(asoc->local_strreset_support & SCTP_ENABLE_CHANGE_ASSOC_REQ)) {
 			asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED;
 		} else if (stcb->asoc.stream_reset_outstanding) {
 			/* We must reject it we have something pending */
 			stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_ERR_IN_PROGRESS;
 		} else {
 			/* Ok, we can do that :-) */
 			int mychk;
 
 			mychk = stcb->asoc.streamoutcnt;
 			mychk += num_stream;
 			if (mychk < 0x10000) {
 				stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED;
 				if (sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, 1, num_stream, 0, 1)) {
 					stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED;
 				}
 			} else {
 				stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED;
 			}
 		}
 		sctp_add_stream_reset_result(chk, seq, stcb->asoc.last_reset_action[0]);
 		asoc->str_reset_seq_in++;
 	} else if ((asoc->str_reset_seq_in - 1) == seq) {
 		/*
 		 * one seq back, just echo back last action since my
 		 * response was lost.
 		 */
 		sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
 	} else if ((asoc->str_reset_seq_in - 2) == seq) {
 		/*
 		 * two seq back, just echo back last action since my
 		 * response was lost.
 		 */
 		sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]);
 	} else {
 		sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO);
 	}
 }
 
 /*
  * Process a received stream reset chunk.
  *
  * The parameters of the chunk are walked one by one.  Request parameters
  * are dispatched to the matching handler, which appends its result to a
  * response chunk built here; response parameters are handed to
  * sctp_handle_stream_reset_response().  If at least one request was seen
  * (num_req != 0) the response chunk is appended to the association's
  * control_send_queue, otherwise it is freed again.
  *
  * stcb:   association the chunk was received on.
  * m:      mbuf chain holding the packet.
  * offset: offset of the chunk header within m.
  * ch_req: header of the received chunk; only chunk_length is read here.
  *
  * Returns 0 normally (including when the response chunk or its mbuf could
  * not be allocated) and 1 when sctp_handle_str_reset_request_tsn() or
  * sctp_handle_stream_reset_response() returned non-zero; in that case any
  * response built so far is discarded.
  * NOTE(review): what a return of 1 means to the caller is not visible in
  * this function -- confirm at the call site.
  */
 #ifdef __GNUC__
 __attribute__((noinline))
 #endif
 static int
 sctp_handle_stream_reset(struct sctp_tcb *stcb, struct mbuf *m, int offset,
     struct sctp_chunkhdr *ch_req)
 {
 	uint16_t remaining_length, param_len, ptype;
 	struct sctp_paramhdr pstore;
 	uint8_t cstore[SCTP_CHUNK_BUFFER_SIZE];
 	uint32_t seq = 0;
 	int num_req = 0;
 	int trunc = 0;
 	struct sctp_tmit_chunk *chk;
 	struct sctp_chunkhdr *ch;
 	struct sctp_paramhdr *ph;
 	int ret_code = 0;
 	int num_param = 0;
 
 	/* now it may be a reset or a reset-response */
 	/* Bytes of parameters following the chunk header. */
 	remaining_length = ntohs(ch_req->chunk_length) - sizeof(struct sctp_chunkhdr);
 
 	/* setup for adding the response */
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		return (ret_code);
 	}
 	chk->copy_by_ref = 0;
 	chk->rec.chunk_id.id = SCTP_STREAM_RESET;
 	chk->rec.chunk_id.can_take_data = 0;
 	chk->flags = 0;
 	chk->asoc = &stcb->asoc;
 	chk->no_fr_allowed = 0;
 	chk->book_size = chk->send_size = sizeof(struct sctp_chunkhdr);
 	chk->book_size_scale = 0;
 	chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
 	if (chk->data == NULL) {
 		/*
 		 * Common bail-out path.  It is entered here when the mbuf
 		 * allocation failed, and by goto from further down, where
 		 * chk->data is set and therefore has to be released.
 		 */
 strres_nochunk:
 		if (chk->data) {
 			sctp_m_freem(chk->data);
 			chk->data = NULL;
 		}
 		sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
 		return (ret_code);
 	}
 	SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
 
 	/* setup chunk parameters */
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 	chk->snd_count = 0;
 	chk->whoTo = NULL;
 
 	/* Start the response with a bare chunk header. */
 	ch = mtod(chk->data, struct sctp_chunkhdr *);
 	ch->chunk_type = SCTP_STREAM_RESET;
 	ch->chunk_flags = 0;
 	ch->chunk_length = htons(chk->send_size);
 	SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size);
 	/* Step over the received chunk header to the first parameter. */
 	offset += sizeof(struct sctp_chunkhdr);
 	while (remaining_length >= sizeof(struct sctp_paramhdr)) {
 		/* First fetch only the parameter header to learn its length. */
 		ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(pstore), (uint8_t *)&pstore);
 		if (ph == NULL) {
 			/* TSNH */
 			break;
 		}
 		param_len = ntohs(ph->param_length);
 		if ((param_len > remaining_length) ||
 		    (param_len < (sizeof(struct sctp_paramhdr) + sizeof(uint32_t)))) {
 			/* bad parameter length */
 			break;
 		}
 		/* Now fetch as much of the parameter as fits into cstore. */
 		ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, min(param_len, sizeof(cstore)),
 		    (uint8_t *)&cstore);
 		if (ph == NULL) {
 			/* TSNH */
 			break;
 		}
 		ptype = ntohs(ph->param_type);
 		num_param++;
 		/* Remember whether the parameter was larger than cstore. */
 		if (param_len > sizeof(cstore)) {
 			trunc = 1;
 		} else {
 			trunc = 0;
 		}
 		if (num_param > SCTP_MAX_RESET_PARAMS) {
 			/* hit the max of parameters already sorry.. */
 			break;
 		}
 		if (ptype == SCTP_STR_RESET_OUT_REQUEST) {
 			struct sctp_stream_reset_out_request *req_out;
 
 			if (param_len < sizeof(struct sctp_stream_reset_out_request)) {
 				break;
 			}
 			req_out = (struct sctp_stream_reset_out_request *)ph;
 			num_req++;
 			if (stcb->asoc.stream_reset_outstanding) {
 				seq = ntohl(req_out->response_seq);
 				if (seq == stcb->asoc.str_reset_seq_out) {
 					/* implicit ack */
 					(void)sctp_handle_stream_reset_response(stcb, seq, SCTP_STREAM_RESET_RESULT_PERFORMED, NULL);
 				}
 			}
 			sctp_handle_str_reset_request_out(stcb, chk, req_out, trunc);
 		} else if (ptype == SCTP_STR_RESET_ADD_OUT_STREAMS) {
 			struct sctp_stream_reset_add_strm *str_add;
 
 			if (param_len < sizeof(struct sctp_stream_reset_add_strm)) {
 				break;
 			}
 			str_add = (struct sctp_stream_reset_add_strm *)ph;
 			num_req++;
 			/*
 			 * The peer adds outgoing streams; the handler called
 			 * here grows our inbound stream array.
 			 */
 			sctp_handle_str_reset_add_strm(stcb, chk, str_add);
 		} else if (ptype == SCTP_STR_RESET_ADD_IN_STREAMS) {
 			struct sctp_stream_reset_add_strm *str_add;
 
 			if (param_len < sizeof(struct sctp_stream_reset_add_strm)) {
 				break;
 			}
 			str_add = (struct sctp_stream_reset_add_strm *)ph;
 			num_req++;
 			/*
 			 * The peer wants more incoming streams; the handler
 			 * called here works on our outbound stream count.
 			 */
 			sctp_handle_str_reset_add_out_strm(stcb, chk, str_add);
 		} else if (ptype == SCTP_STR_RESET_IN_REQUEST) {
 			struct sctp_stream_reset_in_request *req_in;
 
 			num_req++;
 			req_in = (struct sctp_stream_reset_in_request *)ph;
 			sctp_handle_str_reset_request_in(stcb, chk, req_in, trunc);
 		} else if (ptype == SCTP_STR_RESET_TSN_REQUEST) {
 			struct sctp_stream_reset_tsn_request *req_tsn;
 
 			num_req++;
 			req_tsn = (struct sctp_stream_reset_tsn_request *)ph;
 			if (sctp_handle_str_reset_request_tsn(stcb, chk, req_tsn)) {
 				/* Drop the response chunk and report 1. */
 				ret_code = 1;
 				goto strres_nochunk;
 			}
 			/* no more */
 			break;
 		} else if (ptype == SCTP_STR_RESET_RESPONSE) {
 			struct sctp_stream_reset_response *resp;
 			uint32_t result;
 
 			if (param_len < sizeof(struct sctp_stream_reset_response)) {
 				break;
 			}
 			resp = (struct sctp_stream_reset_response *)ph;
 			seq = ntohl(resp->response_seq);
 			result = ntohl(resp->result);
 			if (sctp_handle_stream_reset_response(stcb, seq, result, resp)) {
 				/* Drop the response chunk and report 1. */
 				ret_code = 1;
 				goto strres_nochunk;
 			}
 		} else {
 			/* Unknown parameter type: stop parsing. */
 			break;
 		}
 		/* Advance by the parameter length rounded up by SCTP_SIZE32(). */
 		offset += SCTP_SIZE32(param_len);
 		if (remaining_length >= SCTP_SIZE32(param_len)) {
 			remaining_length -= SCTP_SIZE32(param_len);
 		} else {
 			/* Padding of the last parameter is missing. */
 			remaining_length = 0;
 		}
 	}
 	if (num_req == 0) {
 		/* we have no response free the stuff */
 		goto strres_nochunk;
 	}
 	/* ok we have a chunk to link in */
 	TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue,
 	    chk,
 	    sctp_next);
 	stcb->asoc.ctrl_queue_cnt++;
 	return (ret_code);
 }
 
 /*
  * Handle a router's or endpoint's report of a packet loss. There are two
  * ways to handle this: either we get the whole packet and must dissect it
  * ourselves (possibly with truncation and/or corruption), or it is a summary
  * from a middle box that did the dissecting for us.
  *
  * cp:    the received packet-dropped chunk; cp->data holds the reported
  *        packet (or what is left of it).
  * stcb:  association the report was received on.
  * net:   passed through to process_chunk_drop() and to the congestion
  *        control callback.
  * limit: number of bytes of the chunk that may be accessed; it is asserted
  *        to be at least sizeof(struct sctp_pktdrop_chunk) and at most the
  *        chunk length.
  *
  * Each chunk found in the reported packet is described in a
  * struct sctp_chunk_desc and handed to process_chunk_drop().  Afterwards,
  * for reports not flagged SCTP_FROM_MIDDLE_BOX, peers_rwnd may be
  * recomputed from the bottle_bw/current_onq fields; for middle box reports
  * on satellite networks the congestion control module is notified.
  */
 static void
 sctp_handle_packet_dropped(struct sctp_pktdrop_chunk *cp,
     struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t limit)
 {
 	struct sctp_chunk_desc desc;
 	struct sctp_chunkhdr *chk_hdr;
 	struct sctp_data_chunk *data_chunk;
 	struct sctp_idata_chunk *idata_chunk;
 	uint32_t bottle_bw, on_queue;
 	uint32_t offset, chk_len;
 	uint16_t trunc_len;
 	uint16_t pktdrp_len;
 	uint8_t pktdrp_flags;
 
 	KASSERT(sizeof(struct sctp_pktdrop_chunk) <= limit,
 	    ("PKTDROP chunk too small"));
 	pktdrp_flags = cp->ch.chunk_flags;
 	pktdrp_len = ntohs(cp->ch.chunk_length);
 	KASSERT(limit <= pktdrp_len, ("Inconsistent limit"));
 	if (pktdrp_flags & SCTP_PACKET_TRUNCATED) {
 		/*
 		 * trunc_len is only used for this sanity check: a packet
 		 * flagged as truncated must claim more bytes than the chunk
 		 * actually carries after its header.
 		 */
 		trunc_len = ntohs(cp->trunc_len);
 		if (trunc_len <= pktdrp_len - sizeof(struct sctp_pktdrop_chunk)) {
 			/* The peer plays games with us. */
 			return;
 		}
 	} else {
 		trunc_len = 0;
 	}
 	/* From here on limit counts the bytes available in cp->data. */
 	limit -= sizeof(struct sctp_pktdrop_chunk);
 	offset = 0;
 	if (offset == limit) {
 		/* No packet data at all was included. */
 		if (pktdrp_flags & SCTP_FROM_MIDDLE_BOX) {
 			SCTP_STAT_INCR(sctps_pdrpbwrpt);
 		}
 	} else if (offset + sizeof(struct sctphdr) > limit) {
 		/* Only a partial SCTP common header. */
 		SCTP_STAT_INCR(sctps_pdrpcrupt);
 		offset = limit;
 	} else {
 		/* XXX: Check embedded SCTP common header. */
 		offset += sizeof(struct sctphdr);
 	}
 	/* Now parse through the chunks themselves. */
 	while (offset < limit) {
 		if (offset + sizeof(struct sctp_chunkhdr) > limit) {
 			/* Not even a complete chunk header is left. */
 			SCTP_STAT_INCR(sctps_pdrpcrupt);
 			break;
 		}
 		chk_hdr = (struct sctp_chunkhdr *)(cp->data + offset);
 		desc.chunk_type = chk_hdr->chunk_type;
 		/* get amount we need to move */
 		chk_len = (uint32_t)ntohs(chk_hdr->chunk_length);
 		if (chk_len < sizeof(struct sctp_chunkhdr)) {
 			/* Someone is lying... */
 			break;
 		}
 		if (desc.chunk_type == SCTP_DATA) {
 			if (stcb->asoc.idata_supported) {
 				/* Someone is playing games with us. */
 				break;
 			}
 			if (chk_len <= sizeof(struct sctp_data_chunk)) {
 				/* Someone is playing games with us. */
 				break;
 			}
 			if (chk_len < sizeof(struct sctp_data_chunk) + SCTP_NUM_DB_TO_VERIFY) {
 				/*
 				 * Not enough data bytes available in the
 				 * chunk.
 				 */
 				SCTP_STAT_INCR(sctps_pdrpnedat);
 				goto next_chunk;
 			}
 			if (offset + sizeof(struct sctp_data_chunk) + SCTP_NUM_DB_TO_VERIFY > limit) {
 				/* Not enough data in buffer. */
 				break;
 			}
 			data_chunk = (struct sctp_data_chunk *)(cp->data + offset);
 			/* Keep the first user data bytes and the TSN (as received). */
 			memcpy(desc.data_bytes, data_chunk + 1, SCTP_NUM_DB_TO_VERIFY);
 			desc.tsn_ifany = data_chunk->dp.tsn;
 			if (pktdrp_flags & SCTP_FROM_MIDDLE_BOX) {
 				SCTP_STAT_INCR(sctps_pdrpmbda);
 			}
 		} else if (desc.chunk_type == SCTP_IDATA) {
 			if (!stcb->asoc.idata_supported) {
 				/* Someone is playing games with us. */
 				break;
 			}
 			if (chk_len <= sizeof(struct sctp_idata_chunk)) {
 				/* Someone is playing games with us. */
 				break;
 			}
 			if (chk_len < sizeof(struct sctp_idata_chunk) + SCTP_NUM_DB_TO_VERIFY) {
 				/*
 				 * Not enough data bytes available in the
 				 * chunk.
 				 */
 				SCTP_STAT_INCR(sctps_pdrpnedat);
 				goto next_chunk;
 			}
 			if (offset + sizeof(struct sctp_idata_chunk) + SCTP_NUM_DB_TO_VERIFY > limit) {
 				/* Not enough data in buffer. */
 				break;
 			}
 			idata_chunk = (struct sctp_idata_chunk *)(cp->data + offset);
 			/* Keep the first user data bytes and the TSN (as received). */
 			memcpy(desc.data_bytes, idata_chunk + 1, SCTP_NUM_DB_TO_VERIFY);
 			desc.tsn_ifany = idata_chunk->dp.tsn;
 			if (pktdrp_flags & SCTP_FROM_MIDDLE_BOX) {
 				SCTP_STAT_INCR(sctps_pdrpmbda);
 			}
 		} else {
 			/*
 			 * Any other chunk type: only desc.chunk_type has
 			 * been filled in for this chunk.
 			 */
 			if (pktdrp_flags & SCTP_FROM_MIDDLE_BOX) {
 				SCTP_STAT_INCR(sctps_pdrpmbct);
 			}
 		}
 		if (process_chunk_drop(stcb, &desc, net, pktdrp_flags)) {
 			SCTP_STAT_INCR(sctps_pdrppdbrk);
 			break;
 		}
 next_chunk:
 		/* Chunks are padded; step by the SCTP_SIZE32() rounded length. */
 		offset += SCTP_SIZE32(chk_len);
 	}
 	/* Now update any rwnd --- possibly */
 	if ((pktdrp_flags & SCTP_FROM_MIDDLE_BOX) == 0) {
 		/* From a peer, we get a rwnd report */
 		uint32_t a_rwnd;
 
 		SCTP_STAT_INCR(sctps_pdrpfehos);
 
 		bottle_bw = ntohl(cp->bottle_bw);
 		on_queue = ntohl(cp->current_onq);
 		if (bottle_bw && on_queue) {
 			/* a rwnd report is in here */
 			if (bottle_bw > on_queue)
 				a_rwnd = bottle_bw - on_queue;
 			else
 				a_rwnd = 0;
 
 			if (a_rwnd == 0)
 				stcb->asoc.peers_rwnd = 0;
 			else {
 				/* Discount what we already have in flight. */
 				if (a_rwnd > stcb->asoc.total_flight) {
 					stcb->asoc.peers_rwnd =
 					    a_rwnd - stcb->asoc.total_flight;
 				} else {
 					stcb->asoc.peers_rwnd = 0;
 				}
 				if (stcb->asoc.peers_rwnd <
 				    stcb->sctp_ep->sctp_ep.sctp_sws_sender) {
 					/* SWS sender side engages */
 					stcb->asoc.peers_rwnd = 0;
 				}
 			}
 		}
 	} else {
 		SCTP_STAT_INCR(sctps_pdrpfmbox);
 	}
 
 	/* now middle boxes in sat networks get a cwnd bump */
 	if ((pktdrp_flags & SCTP_FROM_MIDDLE_BOX) &&
 	    (stcb->asoc.sat_t3_loss_recovery == 0) &&
 	    (stcb->asoc.sat_network)) {
 		/*
 		 * This is debatable, but for sat networks it makes sense.
 		 * Note that if a T3 timer has gone off, we will prohibit any
 		 * changes to cwnd until we exit the t3 loss recovery.
 		 *
 		 * NOTE(review): on this path bottle_bw and on_queue have
 		 * not been assigned in this function; presumably the
 		 * callback fills them in through the pointers -- confirm.
 		 */
 		stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped(stcb,
 		    net, cp, &bottle_bw, &on_queue);
 	}
 }
 
 /*
  * handles all control chunks in a packet inputs: - m: mbuf chain, assumed to
  * still contain IP/SCTP header - stcb: is the tcb found for this packet -
  * offset: offset into the mbuf chain to first chunkhdr - length: is the
  * length of the complete packet outputs: - length: modified to remaining
  * length after control processing - netp: modified to new sctp_nets after
  * cookie-echo processing - return NULL to discard the packet (ie. no asoc,
  * bad packet,...) otherwise return the tcb for this packet
  */
 #ifdef __GNUC__
 __attribute__((noinline))
 #endif
 static struct sctp_tcb *
 sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length,
     struct sockaddr *src, struct sockaddr *dst,
     struct sctphdr *sh, struct sctp_chunkhdr *ch, struct sctp_inpcb *inp,
     struct sctp_tcb *stcb, struct sctp_nets **netp, int *fwd_tsn_seen,
     uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum,
     uint32_t vrf_id, uint16_t port)
 {
 	struct sctp_association *asoc;
 	struct mbuf *op_err;
 	char msg[SCTP_DIAG_INFO_LEN];
 	uint32_t vtag_in;
 	int num_chunks = 0;	/* number of control chunks processed */
 	uint32_t chk_length, contiguous;
 	int ret;
 	int abort_no_unlock = 0;
 	int ecne_seen = 0;
 
 	/*
 	 * How big should this be, and should it be alloc'd? Lets try the
 	 * d-mtu-ceiling for now (2k) and that should hopefully work ...
 	 * until we get into jumbo grams and such..
 	 */
 	uint8_t chunk_buf[SCTP_CHUNK_BUFFER_SIZE];
 	int got_auth = 0;
 	uint32_t auth_offset = 0, auth_len = 0;
 	int auth_skipped = 0;
 	int asconf_cnt = 0;
 
 	SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_process_control: iphlen=%u, offset=%u, length=%u stcb:%p\n",
 	    iphlen, *offset, length, (void *)stcb);
 
 	if (stcb) {
 		SCTP_TCB_LOCK_ASSERT(stcb);
 	}
 	/* validate chunk header length... */
 	if (ntohs(ch->chunk_length) < sizeof(*ch)) {
 		SCTPDBG(SCTP_DEBUG_INPUT1, "Invalid header length %d\n",
 		    ntohs(ch->chunk_length));
 		*offset = length;
 		return (stcb);
 	}
 	/*
 	 * validate the verification tag
 	 */
 	vtag_in = ntohl(sh->v_tag);
 
 	if (ch->chunk_type == SCTP_INITIATION) {
 		SCTPDBG(SCTP_DEBUG_INPUT1, "Its an INIT of len:%d vtag:%x\n",
 		    ntohs(ch->chunk_length), vtag_in);
 		if (vtag_in != 0) {
 			/* protocol error- silently discard... */
 			SCTP_STAT_INCR(sctps_badvtag);
 			if (stcb != NULL) {
 				SCTP_TCB_UNLOCK(stcb);
 			}
 			return (NULL);
 		}
 	} else if (ch->chunk_type != SCTP_COOKIE_ECHO) {
 		/*
 		 * If there is no stcb, skip the AUTH chunk and process
 		 * later after a stcb is found (to validate the lookup was
 		 * valid.
 		 */
 		if ((ch->chunk_type == SCTP_AUTHENTICATION) &&
 		    (stcb == NULL) &&
 		    (inp->auth_supported == 1)) {
 			/* save this chunk for later processing */
 			auth_skipped = 1;
 			auth_offset = *offset;
 			auth_len = ntohs(ch->chunk_length);
 
 			/* (temporarily) move past this chunk */
 			*offset += SCTP_SIZE32(auth_len);
 			if (*offset >= length) {
 				/* no more data left in the mbuf chain */
 				*offset = length;
 				return (NULL);
 			}
 			ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset,
 			    sizeof(struct sctp_chunkhdr), chunk_buf);
 		}
 		if (ch == NULL) {
 			/* Help */
 			*offset = length;
 			return (stcb);
 		}
 		if (ch->chunk_type == SCTP_COOKIE_ECHO) {
 			goto process_control_chunks;
 		}
 		/*
 		 * first check if it's an ASCONF with an unknown src addr we
 		 * need to look inside to find the association
 		 */
 		if (ch->chunk_type == SCTP_ASCONF && stcb == NULL) {
 			struct sctp_chunkhdr *asconf_ch = ch;
 			uint32_t asconf_offset = 0, asconf_len = 0;
 
 			/* inp's refcount may be reduced */
 			SCTP_INP_INCR_REF(inp);
 
 			asconf_offset = *offset;
 			do {
 				asconf_len = ntohs(asconf_ch->chunk_length);
 				if (asconf_len < sizeof(struct sctp_asconf_paramhdr))
 					break;
 				stcb = sctp_findassociation_ep_asconf(m,
 				    *offset,
 				    dst,
 				    sh, &inp, netp, vrf_id);
 				if (stcb != NULL)
 					break;
 				asconf_offset += SCTP_SIZE32(asconf_len);
 				asconf_ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, asconf_offset,
 				    sizeof(struct sctp_chunkhdr), chunk_buf);
 			} while (asconf_ch != NULL && asconf_ch->chunk_type == SCTP_ASCONF);
 			if (stcb == NULL) {
 				/*
 				 * reduce inp's refcount if not reduced in
 				 * sctp_findassociation_ep_asconf().
 				 */
 				SCTP_INP_DECR_REF(inp);
 			}
 
 			/* now go back and verify any auth chunk to be sure */
 			if (auth_skipped && (stcb != NULL)) {
 				struct sctp_auth_chunk *auth;
 
 				if (auth_len <= SCTP_CHUNK_BUFFER_SIZE) {
 					auth = (struct sctp_auth_chunk *)sctp_m_getptr(m, auth_offset, auth_len, chunk_buf);
 					got_auth = 1;
 					auth_skipped = 0;
 				} else {
 					auth = NULL;
 				}
 				if ((auth == NULL) || sctp_handle_auth(stcb, auth, m,
 				    auth_offset)) {
 					/* auth HMAC failed so dump it */
 					*offset = length;
 					return (stcb);
 				} else {
 					/* remaining chunks are HMAC checked */
 					stcb->asoc.authenticated = 1;
 				}
 			}
 		}
 		if (stcb == NULL) {
 			SCTP_SNPRINTF(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__);
 			op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 			    msg);
 			/* no association, so it's out of the blue... */
 			sctp_handle_ootb(m, iphlen, *offset, src, dst, sh, inp, op_err,
 			    mflowtype, mflowid, inp->fibnum,
 			    vrf_id, port);
 			*offset = length;
 			return (NULL);
 		}
 		asoc = &stcb->asoc;
 		/* ABORT and SHUTDOWN can use either v_tag... */
 		if ((ch->chunk_type == SCTP_ABORT_ASSOCIATION) ||
 		    (ch->chunk_type == SCTP_SHUTDOWN_COMPLETE) ||
 		    (ch->chunk_type == SCTP_PACKET_DROPPED)) {
 			/* Take the T-bit always into account. */
 			if ((((ch->chunk_flags & SCTP_HAD_NO_TCB) == 0) &&
 			    (vtag_in == asoc->my_vtag)) ||
 			    (((ch->chunk_flags & SCTP_HAD_NO_TCB) == SCTP_HAD_NO_TCB) &&
 			    (asoc->peer_vtag != htonl(0)) &&
 			    (vtag_in == asoc->peer_vtag))) {
 				/* this is valid */
 			} else {
 				/* drop this packet... */
 				SCTP_STAT_INCR(sctps_badvtag);
 				if (stcb != NULL) {
 					SCTP_TCB_UNLOCK(stcb);
 				}
 				return (NULL);
 			}
 		} else if (ch->chunk_type == SCTP_SHUTDOWN_ACK) {
 			if (vtag_in != asoc->my_vtag) {
 				/*
 				 * this could be a stale SHUTDOWN-ACK or the
 				 * peer never got the SHUTDOWN-COMPLETE and
 				 * is still hung; we have started a new asoc
 				 * but it won't complete until the shutdown
 				 * is completed
 				 */
 				if (stcb != NULL) {
 					SCTP_TCB_UNLOCK(stcb);
 				}
 				SCTP_SNPRINTF(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__);
 				op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 				    msg);
 				sctp_handle_ootb(m, iphlen, *offset, src, dst,
 				    sh, inp, op_err,
 				    mflowtype, mflowid, fibnum,
 				    vrf_id, port);
 				return (NULL);
 			}
 		} else {
 			/* for all other chunks, vtag must match */
 			if (vtag_in != asoc->my_vtag) {
 				/* invalid vtag... */
 				SCTPDBG(SCTP_DEBUG_INPUT3,
 				    "invalid vtag: %xh, expect %xh\n",
 				    vtag_in, asoc->my_vtag);
 				SCTP_STAT_INCR(sctps_badvtag);
 				if (stcb != NULL) {
 					SCTP_TCB_UNLOCK(stcb);
 				}
 				*offset = length;
 				return (NULL);
 			}
 		}
 	}			/* end if !SCTP_COOKIE_ECHO */
 	/*
 	 * process all control chunks...
 	 */
 	if (((ch->chunk_type == SCTP_SELECTIVE_ACK) ||
 	    (ch->chunk_type == SCTP_NR_SELECTIVE_ACK) ||
 	    (ch->chunk_type == SCTP_HEARTBEAT_REQUEST)) &&
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) {
 		/* implied cookie-ack.. we must have lost the ack */
 		sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb,
 		    *netp);
 	}
 
 process_control_chunks:
 	while (IS_SCTP_CONTROL(ch)) {
 		/* validate chunk length */
 		chk_length = ntohs(ch->chunk_length);
 		SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_process_control: processing a chunk type=%u, len=%u\n",
 		    ch->chunk_type, chk_length);
 		SCTP_LTRACE_CHK(inp, stcb, ch->chunk_type, chk_length);
 		if (chk_length < sizeof(*ch) ||
 		    (*offset + (int)chk_length) > length) {
 			*offset = length;
 			return (stcb);
 		}
 		SCTP_STAT_INCR_COUNTER64(sctps_incontrolchunks);
 		/*
 		 * INIT and INIT-ACK only gets the init ack "header" portion
 		 * only because we don't have to process the peer's COOKIE.
 		 * All others get a complete chunk.
 		 */
 		switch (ch->chunk_type) {
 		case SCTP_INITIATION:
 			contiguous = sizeof(struct sctp_init_chunk);
 			break;
 		case SCTP_INITIATION_ACK:
 			contiguous = sizeof(struct sctp_init_ack_chunk);
 			break;
 		default:
 			contiguous = min(chk_length, sizeof(chunk_buf));
 			break;
 		}
 		ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset,
 		    contiguous,
 		    chunk_buf);
 		if (ch == NULL) {
 			*offset = length;
 			if (stcb != NULL) {
 				SCTP_TCB_UNLOCK(stcb);
 			}
 			return (NULL);
 		}
 
 		num_chunks++;
 		/* Save off the last place we got a control from */
 		if (stcb != NULL) {
 			if (((netp != NULL) && (*netp != NULL)) || (ch->chunk_type == SCTP_ASCONF)) {
 				/*
 				 * allow last_control to be NULL if
 				 * ASCONF... ASCONF processing will find the
 				 * right net later
 				 */
 				if ((netp != NULL) && (*netp != NULL))
 					stcb->asoc.last_control_chunk_from = *netp;
 			}
 		}
 #ifdef SCTP_AUDITING_ENABLED
 		sctp_audit_log(0xB0, ch->chunk_type);
 #endif
 
 		/* check to see if this chunk required auth, but isn't */
 		if ((stcb != NULL) &&
 		    sctp_auth_is_required_chunk(ch->chunk_type, stcb->asoc.local_auth_chunks) &&
 		    !stcb->asoc.authenticated) {
 			/* "silently" ignore */
 			SCTP_STAT_INCR(sctps_recvauthmissing);
 			goto next_chunk;
 		}
 		switch (ch->chunk_type) {
 		case SCTP_INITIATION:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_INIT\n");
 			/* The INIT chunk must be the only chunk. */
 			if ((num_chunks > 1) ||
 			    (length - *offset > (int)SCTP_SIZE32(chk_length))) {
 				/* RFC 4960 requires that no ABORT is sent */
 				*offset = length;
 				if (stcb != NULL) {
 					SCTP_TCB_UNLOCK(stcb);
 				}
 				return (NULL);
 			}
 			/* Honor our resource limit. */
 			if (chk_length > SCTP_LARGEST_INIT_ACCEPTED) {
 				op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, "");
 				sctp_abort_association(inp, stcb, m, iphlen,
 				    src, dst, sh, op_err,
 				    mflowtype, mflowid,
 				    vrf_id, port);
 				*offset = length;
 				return (NULL);
 			}
 			sctp_handle_init(m, iphlen, *offset, src, dst, sh,
 			    (struct sctp_init_chunk *)ch, inp,
 			    stcb, *netp, &abort_no_unlock,
 			    mflowtype, mflowid,
 			    vrf_id, port);
 			*offset = length;
 			if ((!abort_no_unlock) && (stcb != NULL)) {
 				SCTP_TCB_UNLOCK(stcb);
 			}
 			return (NULL);
 			break;
 		case SCTP_PAD_CHUNK:
 			break;
 		case SCTP_INITIATION_ACK:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_INIT_ACK\n");
 			if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
 				/* We are not interested anymore */
 				if ((stcb != NULL) && (stcb->asoc.total_output_queue_size)) {
 					;
 				} else {
 					*offset = length;
 					if (stcb != NULL) {
 						(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 						    SCTP_FROM_SCTP_INPUT + SCTP_LOC_29);
 					}
 					return (NULL);
 				}
 			}
 			/* The INIT-ACK chunk must be the only chunk. */
 			if ((num_chunks > 1) ||
 			    (length - *offset > (int)SCTP_SIZE32(chk_length))) {
 				*offset = length;
 				return (stcb);
 			}
 			if ((netp != NULL) && (*netp != NULL)) {
 				ret = sctp_handle_init_ack(m, iphlen, *offset,
 				    src, dst, sh,
 				    (struct sctp_init_ack_chunk *)ch,
 				    stcb, *netp,
 				    &abort_no_unlock,
 				    mflowtype, mflowid,
 				    vrf_id);
 			} else {
 				ret = -1;
 			}
 			*offset = length;
 			if (abort_no_unlock) {
 				return (NULL);
 			}
 			/*
 			 * Special case, I must call the output routine to
 			 * get the cookie echoed
 			 */
 			if ((stcb != NULL) && (ret == 0)) {
 				sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED);
 			}
 			return (stcb);
 			break;
 		case SCTP_SELECTIVE_ACK:
 		case SCTP_NR_SELECTIVE_ACK:
 			{
 				int abort_now = 0;
 				uint32_t a_rwnd, cum_ack;
 				uint16_t num_seg, num_nr_seg, num_dup;
 				uint8_t flags;
 				int offset_seg, offset_dup;
 
 				SCTPDBG(SCTP_DEBUG_INPUT3, "%s\n",
 				    ch->chunk_type == SCTP_SELECTIVE_ACK ? "SCTP_SACK" : "SCTP_NR_SACK");
 				SCTP_STAT_INCR(sctps_recvsacks);
 				if (stcb == NULL) {
 					SCTPDBG(SCTP_DEBUG_INDATA1, "No stcb when processing %s chunk\n",
 					    (ch->chunk_type == SCTP_SELECTIVE_ACK) ? "SCTP_SACK" : "SCTP_NR_SACK");
 					break;
 				}
 				if (ch->chunk_type == SCTP_SELECTIVE_ACK) {
 					if (chk_length < sizeof(struct sctp_sack_chunk)) {
 						SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size on SACK chunk, too small\n");
 						break;
 					}
 				} else {
 					if (stcb->asoc.nrsack_supported == 0) {
 						goto unknown_chunk;
 					}
 					if (chk_length < sizeof(struct sctp_nr_sack_chunk)) {
 						SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size on NR_SACK chunk, too small\n");
 						break;
 					}
 				}
 				if (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) {
 					/*-
 					 * If we have sent a shutdown-ack, we will pay no
 					 * attention to a sack sent in to us since
 					 * we don't care anymore.
 					 */
 					break;
 				}
 				flags = ch->chunk_flags;
 				if (ch->chunk_type == SCTP_SELECTIVE_ACK) {
 					struct sctp_sack_chunk *sack;
 
 					sack = (struct sctp_sack_chunk *)ch;
 					cum_ack = ntohl(sack->sack.cum_tsn_ack);
 					num_seg = ntohs(sack->sack.num_gap_ack_blks);
 					num_nr_seg = 0;
 					num_dup = ntohs(sack->sack.num_dup_tsns);
 					a_rwnd = ntohl(sack->sack.a_rwnd);
 					if (sizeof(struct sctp_sack_chunk) +
 					    num_seg * sizeof(struct sctp_gap_ack_block) +
 					    num_dup * sizeof(uint32_t) != chk_length) {
 						SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size of SACK chunk\n");
 						break;
 					}
 					offset_seg = *offset + sizeof(struct sctp_sack_chunk);
 					offset_dup = offset_seg + num_seg * sizeof(struct sctp_gap_ack_block);
 				} else {
 					struct sctp_nr_sack_chunk *nr_sack;
 
 					nr_sack = (struct sctp_nr_sack_chunk *)ch;
 					cum_ack = ntohl(nr_sack->nr_sack.cum_tsn_ack);
 					num_seg = ntohs(nr_sack->nr_sack.num_gap_ack_blks);
 					num_nr_seg = ntohs(nr_sack->nr_sack.num_nr_gap_ack_blks);
 					num_dup = ntohs(nr_sack->nr_sack.num_dup_tsns);
 					a_rwnd = ntohl(nr_sack->nr_sack.a_rwnd);
 					if (sizeof(struct sctp_nr_sack_chunk) +
 					    (num_seg + num_nr_seg) * sizeof(struct sctp_gap_ack_block) +
 					    num_dup * sizeof(uint32_t) != chk_length) {
 						SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size of NR_SACK chunk\n");
 						break;
 					}
 					offset_seg = *offset + sizeof(struct sctp_nr_sack_chunk);
 					offset_dup = offset_seg + (num_seg + num_nr_seg) * sizeof(struct sctp_gap_ack_block);
 				}
 				SCTPDBG(SCTP_DEBUG_INPUT3, "%s process cum_ack:%x num_seg:%d a_rwnd:%d\n",
 				    (ch->chunk_type == SCTP_SELECTIVE_ACK) ? "SCTP_SACK" : "SCTP_NR_SACK",
 				    cum_ack, num_seg, a_rwnd);
 				stcb->asoc.seen_a_sack_this_pkt = 1;
 				if ((stcb->asoc.pr_sctp_cnt == 0) &&
 				    (num_seg == 0) && (num_nr_seg == 0) &&
 				    SCTP_TSN_GE(cum_ack, stcb->asoc.last_acked_seq) &&
 				    (stcb->asoc.saw_sack_with_frags == 0) &&
 				    (stcb->asoc.saw_sack_with_nr_frags == 0) &&
 				    (!TAILQ_EMPTY(&stcb->asoc.sent_queue))) {
 					/*
 					 * We have a SIMPLE sack having no
 					 * prior segments and data on sent
 					 * queue to be acked. Use the faster
 					 * path sack processing. We also
 					 * allow window update sacks with no
 					 * missing segments to go this way
 					 * too.
 					 */
 					sctp_express_handle_sack(stcb, cum_ack, a_rwnd,
 					    &abort_now, ecne_seen);
 				} else {
 					if ((netp != NULL) && (*netp != NULL)) {
 						sctp_handle_sack(m, offset_seg, offset_dup, stcb,
 						    num_seg, num_nr_seg, num_dup, &abort_now, flags,
 						    cum_ack, a_rwnd, ecne_seen);
 					}
 				}
 				if (abort_now) {
 					/* ABORT signal from sack processing */
 					*offset = length;
 					return (NULL);
 				}
 				if (TAILQ_EMPTY(&stcb->asoc.send_queue) &&
 				    TAILQ_EMPTY(&stcb->asoc.sent_queue) &&
 				    (stcb->asoc.stream_queue_cnt == 0)) {
 					sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
 				}
 				break;
 			}
 		case SCTP_HEARTBEAT_REQUEST:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_HEARTBEAT\n");
 			if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) {
 				SCTP_STAT_INCR(sctps_recvheartbeat);
 				sctp_send_heartbeat_ack(stcb, m, *offset,
 				    chk_length, *netp);
 			}
 			break;
 		case SCTP_HEARTBEAT_ACK:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_HEARTBEAT_ACK\n");
 			if ((stcb == NULL) || (chk_length != sizeof(struct sctp_heartbeat_chunk))) {
 				/* Its not ours */
 				*offset = length;
 				return (stcb);
 			}
 			SCTP_STAT_INCR(sctps_recvheartbeatack);
 			if ((netp != NULL) && (*netp != NULL)) {
 				sctp_handle_heartbeat_ack((struct sctp_heartbeat_chunk *)ch,
 				    stcb, *netp);
 			}
 			break;
 		case SCTP_ABORT_ASSOCIATION:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ABORT, stcb %p\n",
 			    (void *)stcb);
 			*offset = length;
 			if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) {
 				if (sctp_handle_abort((struct sctp_abort_chunk *)ch, stcb, *netp)) {
 					return (NULL);
 				} else {
 					return (stcb);
 				}
 			} else {
 				return (NULL);
 			}
 			break;
 		case SCTP_SHUTDOWN:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN, stcb %p\n",
 			    (void *)stcb);
 			if ((stcb == NULL) || (chk_length != sizeof(struct sctp_shutdown_chunk))) {
 				*offset = length;
 				return (stcb);
 			}
 			if ((netp != NULL) && (*netp != NULL)) {
 				int abort_flag = 0;
 
 				sctp_handle_shutdown((struct sctp_shutdown_chunk *)ch,
 				    stcb, *netp, &abort_flag);
 				if (abort_flag) {
 					*offset = length;
 					return (NULL);
 				}
 			}
 			break;
 		case SCTP_SHUTDOWN_ACK:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN_ACK, stcb %p\n", (void *)stcb);
 			if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) {
 				sctp_handle_shutdown_ack((struct sctp_shutdown_ack_chunk *)ch, stcb, *netp);
 			}
 			*offset = length;
 			return (NULL);
 			break;
 		case SCTP_OPERATION_ERROR:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_OP_ERR\n");
 			if ((stcb != NULL) && (netp != NULL) && (*netp != NULL) &&
 			    sctp_handle_error(ch, stcb, *netp, contiguous) < 0) {
 				*offset = length;
 				return (NULL);
 			}
 			break;
 		case SCTP_COOKIE_ECHO:
 			SCTPDBG(SCTP_DEBUG_INPUT3,
 			    "SCTP_COOKIE_ECHO, stcb %p\n", (void *)stcb);
 			if ((stcb != NULL) && (stcb->asoc.total_output_queue_size > 0)) {
 				;
 			} else {
 				if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
 					/* We are not interested anymore */
 			abend:
 					if (stcb != NULL) {
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					*offset = length;
 					return (NULL);
 				}
 			}
 			/*-
 			 * First are we accepting? We do this again here
 			 * since it is possible that a previous endpoint WAS
 			 * listening responded to a INIT-ACK and then
 			 * closed. We opened and bound.. and are now no
 			 * longer listening.
 			 *
 			 * XXXGL: notes on checking listen queue length.
 			 * 1) SCTP_IS_LISTENING() doesn't necessarily mean
 			 *    SOLISTENING(), because a listening "UDP type"
 			 *    socket isn't listening in terms of the socket
 			 *    layer.  It is a normal data flow socket, that
 			 *    can fork off new connections.  Thus, we should
 			 *    look into sol_qlen only in case we are !UDP.
 			 * 2) Checking sol_qlen in general requires locking
 			 *    the socket, and this code lacks that.
 			 */
 			if ((stcb == NULL) &&
 			    (!SCTP_IS_LISTENING(inp) ||
 			    (!(inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 			    inp->sctp_socket->sol_qlen >= inp->sctp_socket->sol_qlimit))) {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
 				    (SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit))) {
 					op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, "");
 					sctp_abort_association(inp, stcb, m, iphlen,
 					    src, dst, sh, op_err,
 					    mflowtype, mflowid,
 					    vrf_id, port);
 				}
 				*offset = length;
 				return (NULL);
 			} else {
 				struct mbuf *ret_buf;
 				struct sctp_inpcb *linp;
 				struct sctp_tmit_chunk *chk;
 
 				if (stcb) {
 					linp = NULL;
 				} else {
 					linp = inp;
 				}
 
 				if (linp != NULL) {
 					SCTP_ASOC_CREATE_LOCK(linp);
 					if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
 					    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) {
 						SCTP_ASOC_CREATE_UNLOCK(linp);
 						goto abend;
 					}
 				}
 
 				if (netp != NULL) {
 					struct sctp_tcb *locked_stcb;
 
 					locked_stcb = stcb;
 					ret_buf =
 					    sctp_handle_cookie_echo(m, iphlen,
 					    *offset,
 					    src, dst,
 					    sh,
 					    (struct sctp_cookie_echo_chunk *)ch,
 					    &inp, &stcb, netp,
 					    auth_skipped,
 					    auth_offset,
 					    auth_len,
 					    &locked_stcb,
 					    mflowtype,
 					    mflowid,
 					    vrf_id,
 					    port);
 					if ((locked_stcb != NULL) && (locked_stcb != stcb)) {
 						SCTP_TCB_UNLOCK(locked_stcb);
 					}
 					if (stcb != NULL) {
 						SCTP_TCB_LOCK_ASSERT(stcb);
 					}
 				} else {
 					ret_buf = NULL;
 				}
 				if (linp != NULL) {
 					SCTP_ASOC_CREATE_UNLOCK(linp);
 				}
 				if (ret_buf == NULL) {
 					if (stcb != NULL) {
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTPDBG(SCTP_DEBUG_INPUT3,
 					    "GAK, null buffer\n");
 					*offset = length;
 					return (NULL);
 				}
 				/* if AUTH skipped, see if it verified... */
 				if (auth_skipped) {
 					got_auth = 1;
 					auth_skipped = 0;
 				}
 				/* Restart the timer if we have pending data */
 				TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) {
 					if (chk->whoTo != NULL) {
 						break;
 					}
 				}
 				if (chk != NULL) {
 					sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, chk->whoTo);
 				}
 			}
 			break;
 		case SCTP_COOKIE_ACK:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_COOKIE_ACK, stcb %p\n", (void *)stcb);
 			if ((stcb == NULL) || chk_length != sizeof(struct sctp_cookie_ack_chunk)) {
 				return (stcb);
 			}
 			if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
 				/* We are not interested anymore */
 				if ((stcb) && (stcb->asoc.total_output_queue_size)) {
 					;
 				} else if (stcb) {
 					(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 					    SCTP_FROM_SCTP_INPUT + SCTP_LOC_30);
 					*offset = length;
 					return (NULL);
 				}
 			}
 			if ((netp != NULL) && (*netp != NULL)) {
 				sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, *netp);
 			}
 			break;
 		case SCTP_ECN_ECHO:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN_ECHO\n");
 			if ((stcb == NULL) || (chk_length != sizeof(struct sctp_ecne_chunk))) {
 				/* Its not ours */
 				*offset = length;
 				return (stcb);
 			}
 			if (stcb->asoc.ecn_supported == 0) {
 				goto unknown_chunk;
 			}
 			sctp_handle_ecn_echo((struct sctp_ecne_chunk *)ch, stcb);
 			ecne_seen = 1;
 			break;
 		case SCTP_ECN_CWR:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN_CWR\n");
 			if ((stcb == NULL) || (chk_length != sizeof(struct sctp_cwr_chunk))) {
 				*offset = length;
 				return (stcb);
 			}
 			if (stcb->asoc.ecn_supported == 0) {
 				goto unknown_chunk;
 			}
 			sctp_handle_ecn_cwr((struct sctp_cwr_chunk *)ch, stcb, *netp);
 			break;
 		case SCTP_SHUTDOWN_COMPLETE:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN_COMPLETE, stcb %p\n", (void *)stcb);
 			/* must be first and only chunk */
 			if ((num_chunks > 1) ||
 			    (length - *offset > (int)SCTP_SIZE32(chk_length))) {
 				*offset = length;
 				return (stcb);
 			}
 			if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) {
 				sctp_handle_shutdown_complete((struct sctp_shutdown_complete_chunk *)ch,
 				    stcb, *netp);
 			}
 			*offset = length;
 			return (NULL);
 			break;
 		case SCTP_ASCONF:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF\n");
 			if (stcb != NULL) {
 				if (stcb->asoc.asconf_supported == 0) {
 					goto unknown_chunk;
 				}
 				sctp_handle_asconf(m, *offset, src,
 				    (struct sctp_asconf_chunk *)ch, stcb, asconf_cnt == 0);
 				asconf_cnt++;
 			}
 			break;
 		case SCTP_ASCONF_ACK:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF_ACK\n");
 			if (chk_length < sizeof(struct sctp_asconf_ack_chunk)) {
 				/* Its not ours */
 				*offset = length;
 				return (stcb);
 			}
 			if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) {
 				if (stcb->asoc.asconf_supported == 0) {
 					goto unknown_chunk;
 				}
 				/* He's alive so give him credit */
 				if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
 					sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
 					    stcb->asoc.overall_error_count,
 					    0,
 					    SCTP_FROM_SCTP_INPUT,
 					    __LINE__);
 				}
 				stcb->asoc.overall_error_count = 0;
 				sctp_handle_asconf_ack(m, *offset,
 				    (struct sctp_asconf_ack_chunk *)ch, stcb, *netp, &abort_no_unlock);
 				if (abort_no_unlock)
 					return (NULL);
 			}
 			break;
 		case SCTP_FORWARD_CUM_TSN:
 		case SCTP_IFORWARD_CUM_TSN:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "%s\n",
 			    ch->chunk_type == SCTP_FORWARD_CUM_TSN ? "FORWARD_TSN" : "I_FORWARD_TSN");
 			if (chk_length < sizeof(struct sctp_forward_tsn_chunk)) {
 				/* Its not ours */
 				*offset = length;
 				return (stcb);
 			}
 
 			if (stcb != NULL) {
 				int abort_flag = 0;
 
 				if (stcb->asoc.prsctp_supported == 0) {
 					goto unknown_chunk;
 				}
 				if (((stcb->asoc.idata_supported == 1) && (ch->chunk_type == SCTP_FORWARD_CUM_TSN)) ||
 				    ((stcb->asoc.idata_supported == 0) && (ch->chunk_type == SCTP_IFORWARD_CUM_TSN))) {
 					if (ch->chunk_type == SCTP_FORWARD_CUM_TSN) {
 						SCTP_SNPRINTF(msg, sizeof(msg), "%s", "FORWARD-TSN chunk received when I-FORWARD-TSN was negotiated");
 					} else {
 						SCTP_SNPRINTF(msg, sizeof(msg), "%s", "I-FORWARD-TSN chunk received when FORWARD-TSN was negotiated");
 					}
 					op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
 					sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED);
 					*offset = length;
 					return (NULL);
 				}
 				*fwd_tsn_seen = 1;
 				if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
 					/* We are not interested anymore */
 					(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 					    SCTP_FROM_SCTP_INPUT + SCTP_LOC_31);
 					*offset = length;
 					return (NULL);
 				}
 				/*
 				 * For sending a SACK this looks like DATA
 				 * chunks.
 				 */
 				stcb->asoc.last_data_chunk_from = stcb->asoc.last_control_chunk_from;
 				sctp_handle_forward_tsn(stcb,
 				    (struct sctp_forward_tsn_chunk *)ch, &abort_flag, m, *offset);
 				if (abort_flag) {
 					*offset = length;
 					return (NULL);
 				}
 			}
 			break;
 		case SCTP_STREAM_RESET:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_STREAM_RESET\n");
 			if ((stcb == NULL) || (chk_length < sizeof(struct sctp_stream_reset_tsn_req))) {
 				/* Its not ours */
 				*offset = length;
 				return (stcb);
 			}
 			if (stcb->asoc.reconfig_supported == 0) {
 				goto unknown_chunk;
 			}
 			if (sctp_handle_stream_reset(stcb, m, *offset, ch)) {
 				/* stop processing */
 				*offset = length;
 				return (NULL);
 			}
 			break;
 		case SCTP_PACKET_DROPPED:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_PACKET_DROPPED\n");
 			/* re-get it all please */
 			if (chk_length < sizeof(struct sctp_pktdrop_chunk)) {
 				/* Its not ours */
 				*offset = length;
 				return (stcb);
 			}
 
 			if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) {
 				if (stcb->asoc.pktdrop_supported == 0) {
 					goto unknown_chunk;
 				}
 				sctp_handle_packet_dropped((struct sctp_pktdrop_chunk *)ch,
 				    stcb, *netp,
 				    min(chk_length, contiguous));
 			}
 			break;
 		case SCTP_AUTHENTICATION:
 			SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_AUTHENTICATION\n");
 			if (stcb == NULL) {
 				/* save the first AUTH for later processing */
 				if (auth_skipped == 0) {
 					auth_offset = *offset;
 					auth_len = chk_length;
 					auth_skipped = 1;
 				}
 				/* skip this chunk (temporarily) */
 				goto next_chunk;
 			}
 			if (stcb->asoc.auth_supported == 0) {
 				goto unknown_chunk;
 			}
 			if ((chk_length < (sizeof(struct sctp_auth_chunk))) ||
 			    (chk_length > (sizeof(struct sctp_auth_chunk) +
 			    SCTP_AUTH_DIGEST_LEN_MAX))) {
 				/* Its not ours */
 				*offset = length;
 				return (stcb);
 			}
 			if (got_auth == 1) {
 				/* skip this chunk... it's already auth'd */
 				goto next_chunk;
 			}
 			got_auth = 1;
 			if (sctp_handle_auth(stcb, (struct sctp_auth_chunk *)ch, m, *offset)) {
 				/* auth HMAC failed so dump the packet */
 				*offset = length;
 				return (stcb);
 			} else {
 				/* remaining chunks are HMAC checked */
 				stcb->asoc.authenticated = 1;
 			}
 			break;
 
 		default:
 	unknown_chunk:
 			/* it's an unknown chunk! */
 			if ((ch->chunk_type & 0x40) &&
 			    (stcb != NULL) &&
 			    (SCTP_GET_STATE(stcb) != SCTP_STATE_EMPTY) &&
 			    (SCTP_GET_STATE(stcb) != SCTP_STATE_INUSE) &&
 			    (SCTP_GET_STATE(stcb) != SCTP_STATE_COOKIE_WAIT)) {
 				struct sctp_gen_error_cause *cause;
 				int len;
 
 				op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_gen_error_cause),
 				    0, M_NOWAIT, 1, MT_DATA);
 				if (op_err != NULL) {
 					len = min(SCTP_SIZE32(chk_length), (uint32_t)(length - *offset));
 					cause = mtod(op_err, struct sctp_gen_error_cause *);
 					cause->code = htons(SCTP_CAUSE_UNRECOG_CHUNK);
 					cause->length = htons((uint16_t)(len + sizeof(struct sctp_gen_error_cause)));
 					SCTP_BUF_LEN(op_err) = sizeof(struct sctp_gen_error_cause);
 					SCTP_BUF_NEXT(op_err) = SCTP_M_COPYM(m, *offset, len, M_NOWAIT);
 					if (SCTP_BUF_NEXT(op_err) != NULL) {
 #ifdef SCTP_MBUF_LOGGING
 						if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
 							sctp_log_mbc(SCTP_BUF_NEXT(op_err), SCTP_MBUF_ICOPY);
 						}
 #endif
 						sctp_queue_op_err(stcb, op_err);
 					} else {
 						sctp_m_freem(op_err);
 					}
 				}
 			}
 			if ((ch->chunk_type & 0x80) == 0) {
 				/* discard this packet */
 				*offset = length;
 				return (stcb);
 			}	/* else skip this bad chunk and continue... */
 			break;
 		}		/* switch (ch->chunk_type) */
 
 
 next_chunk:
 		/* get the next chunk */
 		*offset += SCTP_SIZE32(chk_length);
 		if (*offset >= length) {
 			/* no more data left in the mbuf chain */
 			break;
 		}
 		ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset,
 		    sizeof(struct sctp_chunkhdr), chunk_buf);
 		if (ch == NULL) {
 			*offset = length;
 			return (stcb);
 		}
 	}			/* while */
 
 	if ((asconf_cnt > 0) && (stcb != NULL)) {
 		sctp_send_asconf_ack(stcb);
 	}
 	return (stcb);
 }
 
 
 /*
  * Common input processing for one received SCTP packet (shared by the v4
  * and v6 input paths).
  *
  * In order, this routine:
  *  - optionally verifies the packet checksum (compute_crc != 0); on a
  *    mismatch it sends a packet-dropped report if an association was
  *    found, and gives up on the packet;
  *  - gives up on packets whose destination port is 0;
  *  - looks up the endpoint (inp) and association (stcb) for the packet and
  *    tracks a change of the UDP encapsulation port on the matched path;
  *  - handles packets for which no endpoint was found ("out of the blue");
  *  - runs control chunk processing when the first chunk is a control
  *    chunk, otherwise checks that a DATA-only packet is acceptable;
  *  - processes the DATA chunks, ECN marks and FORWARD-TSN driven SACKs;
  *  - finally calls the output routine when something is queued.
  *
  * Parameters:
  *  mm           address of the packet's mbuf chain pointer; handed on to
  *               sctp_process_data().
  *  iphlen       length of the IP header preceding the SCTP common header.
  *  offset       offset of the first chunk header within the mbuf chain.
  *  length       total packet length used as the end bound for chunk
  *               processing.
  *  src, dst     source and destination addresses of the packet.
  *  sh           SCTP common header.
  *  ch           first chunk header.
  *  compute_crc  non-zero if the checksum has to be verified in software.
  *  ecn_bits     ECN information of the packet (the v4 caller passes the
  *               IP TOS byte).
  *  mflowtype, mflowid
  *               flow information of the packet, recorded on the matched
  *               path.
  *  fibnum       FIB number used for replies sent without an endpoint.
  *  vrf_id       VRF the packet was received in.
  *  port         UDP encapsulation port of the packet, compared against
  *               the port stored on the matched path.
  *
  * This function does not free the mbuf chain.  On every exit path through
  * 'out' the stcb, if still set, is unlocked and the endpoint reference
  * remembered in inp_decr is dropped.
  *
  * NOTE(review): the unlock at 'out' implies that
  * sctp_findassociation_addr() and sctp_process_control() hand back a locked
  * stcb, and that the lookup holds a reference on inp when no stcb is
  * returned -- confirm against those routines.
  */
 void
 sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int length,
     struct sockaddr *src, struct sockaddr *dst,
     struct sctphdr *sh, struct sctp_chunkhdr *ch,
     uint8_t compute_crc,
     uint8_t ecn_bits,
     uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum,
     uint32_t vrf_id, uint16_t port)
 {
 	uint32_t high_tsn;
 	int fwd_tsn_seen = 0, data_processed = 0;
 	struct mbuf *m = *mm, *op_err;
 	char msg[SCTP_DIAG_INFO_LEN];
 	int un_sent;
 	int cnt_ctrl_ready = 0;
 	struct sctp_inpcb *inp = NULL, *inp_decr = NULL;
 	struct sctp_tcb *stcb = NULL;
 	struct sctp_nets *net = NULL;
 
 	SCTP_STAT_INCR(sctps_recvdatagrams);
 #ifdef SCTP_AUDITING_ENABLED
 	sctp_audit_log(0xE0, 1);
 	sctp_auditing(0, inp, stcb, net);
 #endif
 	if (compute_crc != 0) {
 		uint32_t check, calc_check;
 
 		/*
 		 * The checksum is computed with the checksum field zeroed;
 		 * the received value is restored afterwards.
 		 */
 		check = sh->checksum;
 		sh->checksum = 0;
 		calc_check = sctp_calculate_cksum(m, iphlen);
 		sh->checksum = check;
 		if (calc_check != check) {
 			SCTPDBG(SCTP_DEBUG_INPUT1, "Bad CSUM on SCTP packet calc_check:%x check:%x  m:%p mlen:%d iphlen:%d\n",
 			    calc_check, check, (void *)m, length, iphlen);
 			/*
 			 * Still look up the association so that the drop
 			 * can be reported to the peer.
 			 */
 			stcb = sctp_findassociation_addr(m, offset, src, dst,
 			    sh, ch, &inp, &net, vrf_id);
 #if defined(INET) || defined(INET6)
 			if ((ch->chunk_type != SCTP_INITIATION) &&
 			    (net != NULL) && (net->port != port)) {
 				if (net->port == 0) {
 					/* UDP encapsulation turned on. */
 					net->mtu -= sizeof(struct udphdr);
 					if (stcb->asoc.smallest_mtu > net->mtu) {
 						sctp_pathmtu_adjustment(stcb, net->mtu);
 					}
 				} else if (port == 0) {
 					/* UDP encapsulation turned off. */
 					net->mtu += sizeof(struct udphdr);
 					/* XXX Update smallest_mtu */
 				}
 				net->port = port;
 			}
 #endif
 			if (net != NULL) {
 				net->flowtype = mflowtype;
 				net->flowid = mflowid;
 			}
 			SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh);
 			if ((inp != NULL) && (stcb != NULL)) {
 				sctp_send_packet_dropped(stcb, net, m, length, iphlen, 1);
 				sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_INPUT_ERROR, SCTP_SO_NOT_LOCKED);
 			} else if ((inp != NULL) && (stcb == NULL)) {
 				/* Endpoint only: drop its reference at 'out'. */
 				inp_decr = inp;
 			}
 			SCTP_STAT_INCR(sctps_badsum);
 			SCTP_STAT_INCR_COUNTER32(sctps_checksumerrors);
 			goto out;
 		}
 	}
 	/* Destination port of 0 is illegal, based on RFC4960. */
 	if (sh->dest_port == 0) {
 		SCTP_STAT_INCR(sctps_hdrops);
 		goto out;
 	}
 	stcb = sctp_findassociation_addr(m, offset, src, dst,
 	    sh, ch, &inp, &net, vrf_id);
 #if defined(INET) || defined(INET6)
 	/*
 	 * Track a change of the UDP encapsulation port on the matched path
 	 * and account for the UDP header in the path MTU.
 	 */
 	if ((ch->chunk_type != SCTP_INITIATION) &&
 	    (net != NULL) && (net->port != port)) {
 		if (net->port == 0) {
 			/* UDP encapsulation turned on. */
 			net->mtu -= sizeof(struct udphdr);
 			if (stcb->asoc.smallest_mtu > net->mtu) {
 				sctp_pathmtu_adjustment(stcb, net->mtu);
 			}
 		} else if (port == 0) {
 			/* UDP encapsulation turned off. */
 			net->mtu += sizeof(struct udphdr);
 			/* XXX Update smallest_mtu */
 		}
 		net->port = port;
 	}
 #endif
 	if (net != NULL) {
 		net->flowtype = mflowtype;
 		net->flowid = mflowid;
 	}
 	if (inp == NULL) {
 		/* No endpoint at all: out-of-the-blue handling. */
 		SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh);
 		SCTP_STAT_INCR(sctps_noport);
 		if (badport_bandlim(BANDLIM_SCTP_OOTB) < 0) {
 			goto out;
 		}
 		if (ch->chunk_type == SCTP_SHUTDOWN_ACK) {
 			sctp_send_shutdown_complete2(src, dst, sh,
 			    mflowtype, mflowid, fibnum,
 			    vrf_id, port);
 			goto out;
 		}
 		if (ch->chunk_type == SCTP_SHUTDOWN_COMPLETE) {
 			goto out;
 		}
 		if (ch->chunk_type != SCTP_ABORT_ASSOCIATION) {
 			/*
 			 * Answer with an ABORT unless blackholing forbids
 			 * it: level 0 always answers, level 1 stays silent
 			 * for INIT chunks only.  The chunk type is compared
 			 * against SCTP_INITIATION, the chunk type constant
 			 * used by every other check in this function.
 			 */
 			if ((SCTP_BASE_SYSCTL(sctp_blackhole) == 0) ||
 			    ((SCTP_BASE_SYSCTL(sctp_blackhole) == 1) &&
 			    (ch->chunk_type != SCTP_INITIATION))) {
 				op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 				    "Out of the blue");
 				sctp_send_abort(m, iphlen, src, dst,
 				    sh, 0, op_err,
 				    mflowtype, mflowid, fibnum,
 				    vrf_id, port);
 			}
 		}
 		goto out;
 	} else if (stcb == NULL) {
 		/* Endpoint without association: drop its reference at 'out'. */
 		inp_decr = inp;
 	}
 	SCTPDBG(SCTP_DEBUG_INPUT1, "Ok, Common input processing called, m:%p iphlen:%d offset:%d length:%d stcb:%p\n",
 	    (void *)m, iphlen, offset, length, (void *)stcb);
 	if (stcb) {
 		/* always clear this before beginning a packet */
 		stcb->asoc.authenticated = 0;
 		stcb->asoc.seen_a_sack_this_pkt = 0;
 		SCTPDBG(SCTP_DEBUG_INPUT1, "stcb:%p state:%x\n",
 		    (void *)stcb, stcb->asoc.state);
 
 		if ((stcb->asoc.state & SCTP_STATE_WAS_ABORTED) ||
 		    (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED)) {
 			/*-
 			 * If we hit here, we had a ref count
 			 * up when the assoc was aborted and the
 			 * timer is clearing out the assoc, we should
 			 * NOT respond to any packet.. its OOTB.
 			 */
 			SCTP_TCB_UNLOCK(stcb);
 			stcb = NULL;
 			SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh);
 			SCTP_SNPRINTF(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__);
 			op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 			    msg);
 			sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err,
 			    mflowtype, mflowid, inp->fibnum,
 			    vrf_id, port);
 			goto out;
 		}
 	}
 	if (IS_SCTP_CONTROL(ch)) {
 		/* process the control portion of the SCTP packet */
 		/* sa_ignore NO_NULL_CHK */
 		stcb = sctp_process_control(m, iphlen, &offset, length,
 		    src, dst, sh, ch,
 		    inp, stcb, &net, &fwd_tsn_seen,
 		    mflowtype, mflowid, fibnum,
 		    vrf_id, port);
 		if (stcb) {
 			/*
 			 * This covers us if the cookie-echo was there and
 			 * it changes our INP.
 			 */
 			inp = stcb->sctp_ep;
 #if defined(INET) || defined(INET6)
 			/*
 			 * Control processing may have changed stcb or net,
 			 * so repeat the encapsulation port check.
 			 */
 			if ((ch->chunk_type != SCTP_INITIATION) &&
 			    (net != NULL) && (net->port != port)) {
 				if (net->port == 0) {
 					/* UDP encapsulation turned on. */
 					net->mtu -= sizeof(struct udphdr);
 					if (stcb->asoc.smallest_mtu > net->mtu) {
 						sctp_pathmtu_adjustment(stcb, net->mtu);
 					}
 				} else if (port == 0) {
 					/* UDP encapsulation turned off. */
 					net->mtu += sizeof(struct udphdr);
 					/* XXX Update smallest_mtu */
 				}
 				net->port = port;
 			}
 #endif
 		}
 	} else {
 		/*
 		 * no control chunks, so pre-process DATA chunks (these
 		 * checks are taken care of by control processing)
 		 */
 
 		/*
 		 * if DATA only packet, and auth is required, then punt...
 		 * can't have authenticated without any AUTH (control)
 		 * chunks
 		 */
 		if ((stcb != NULL) &&
 		    sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks)) {
 			/* "silently" ignore */
 			SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh);
 			SCTP_STAT_INCR(sctps_recvauthmissing);
 			goto out;
 		}
 		if (stcb == NULL) {
 			/* out of the blue DATA chunk */
 			SCTP_PROBE5(receive, NULL, NULL, m, NULL, sh);
 			SCTP_SNPRINTF(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__);
 			op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 			    msg);
 			sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err,
 			    mflowtype, mflowid, fibnum,
 			    vrf_id, port);
 			goto out;
 		}
 		if (stcb->asoc.my_vtag != ntohl(sh->v_tag)) {
 			/* v_tag mismatch! */
 			SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh);
 			SCTP_STAT_INCR(sctps_badvtag);
 			goto out;
 		}
 	}
 
 	SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh);
 	if (stcb == NULL) {
 		/*
 		 * no valid TCB for this packet, or we found it's a bad
 		 * packet while processing control, or we're done with this
 		 * packet (done or skip rest of data), so we drop it...
 		 */
 		goto out;
 	}
 
 	/*
 	 * DATA chunk processing
 	 */
 	/* plow through the data chunks while length > offset */
 
 	/*
 	 * Rest should be DATA only.  Check authentication state if AUTH for
 	 * DATA is required.
 	 */
 	if ((length > offset) &&
 	    (stcb != NULL) &&
 	    sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks) &&
 	    !stcb->asoc.authenticated) {
 		/* "silently" ignore */
 		SCTP_STAT_INCR(sctps_recvauthmissing);
 		SCTPDBG(SCTP_DEBUG_AUTH1,
 		    "Data chunk requires AUTH, skipped\n");
 		goto trigger_send;
 	}
 	if (length > offset) {
 		int retval;
 
 		/*
 		 * First check to make sure our state is correct. We would
 		 * not get here unless we really did have a tag, so we don't
 		 * abort if this happens, just dump the chunk silently.
 		 */
 		switch (SCTP_GET_STATE(stcb)) {
 		case SCTP_STATE_COOKIE_ECHOED:
 			/*
 			 * we consider data with valid tags in this state
 			 * shows us the cookie-ack was lost. Imply it was
 			 * there.
 			 */
 			sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, net);
 			break;
 		case SCTP_STATE_COOKIE_WAIT:
 			/*
 			 * We consider OOTB any data sent during asoc setup.
 			 */
 			SCTP_SNPRINTF(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__);
 			op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 			    msg);
 			sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err,
 			    mflowtype, mflowid, inp->fibnum,
 			    vrf_id, port);
 			goto out;
 			/* sa_ignore NOTREACHED */
 			break;
 		case SCTP_STATE_EMPTY:	/* should not happen */
 		case SCTP_STATE_INUSE:	/* should not happen */
 		case SCTP_STATE_SHUTDOWN_RECEIVED:	/* This is a peer error */
 		case SCTP_STATE_SHUTDOWN_ACK_SENT:
 		default:
 			goto out;
 			/* sa_ignore NOTREACHED */
 			break;
 		case SCTP_STATE_OPEN:
 		case SCTP_STATE_SHUTDOWN_SENT:
 			break;
 		}
 		/* plow through the data chunks while length > offset */
 		retval = sctp_process_data(mm, iphlen, &offset, length,
 		    inp, stcb, net, &high_tsn);
 		if (retval == 2) {
 			/*
 			 * The association aborted, NO UNLOCK needed since
 			 * the association is destroyed.
 			 */
 			stcb = NULL;
 			goto out;
 		}
 		data_processed = 1;
 		/*
 		 * Anything important needs to have been m_copy'ed in
 		 * process_data
 		 */
 	}
 
 	/*
 	 * take care of ecn; high_tsn is only read here, i.e. only after
 	 * sctp_process_data() has been called with its address.
 	 */
 	if ((data_processed == 1) &&
 	    (stcb->asoc.ecn_supported == 1) &&
 	    ((ecn_bits & SCTP_CE_BITS) == SCTP_CE_BITS)) {
 		/* Yep, we need to add a ECNE */
 		sctp_send_ecn_echo(stcb, net, high_tsn);
 	}
 
 	if ((data_processed == 0) && (fwd_tsn_seen)) {
 		int was_a_gap;
 		uint32_t highest_tsn;
 
 		/*
 		 * A FORWARD-TSN arrived without DATA: run the SACK check
 		 * here since no data processing did it.
 		 */
 		if (SCTP_TSN_GT(stcb->asoc.highest_tsn_inside_nr_map, stcb->asoc.highest_tsn_inside_map)) {
 			highest_tsn = stcb->asoc.highest_tsn_inside_nr_map;
 		} else {
 			highest_tsn = stcb->asoc.highest_tsn_inside_map;
 		}
 		was_a_gap = SCTP_TSN_GT(highest_tsn, stcb->asoc.cumulative_tsn);
 		stcb->asoc.send_sack = 1;
 		sctp_sack_check(stcb, was_a_gap);
 	} else if (fwd_tsn_seen) {
 		stcb->asoc.send_sack = 1;
 	}
 	/* trigger send of any chunks in queue... */
 trigger_send:
 #ifdef SCTP_AUDITING_ENABLED
 	sctp_audit_log(0xE0, 2);
 	sctp_auditing(1, inp, stcb, net);
 #endif
 	SCTPDBG(SCTP_DEBUG_INPUT1,
 	    "Check for chunk output prw:%d tqe:%d tf=%d\n",
 	    stcb->asoc.peers_rwnd,
 	    TAILQ_EMPTY(&stcb->asoc.control_send_queue),
 	    stcb->asoc.total_flight);
 	un_sent = (stcb->asoc.total_output_queue_size - stcb->asoc.total_flight);
 	if (!TAILQ_EMPTY(&stcb->asoc.control_send_queue)) {
 		/* Queued ECN echoes alone do not count as ready control. */
 		cnt_ctrl_ready = stcb->asoc.ctrl_queue_cnt - stcb->asoc.ecn_echo_cnt_onq;
 	}
 	if (!TAILQ_EMPTY(&stcb->asoc.asconf_send_queue) ||
 	    cnt_ctrl_ready ||
 	    stcb->asoc.trigger_reset ||
 	    ((un_sent > 0) &&
 	    (stcb->asoc.peers_rwnd > 0 || stcb->asoc.total_flight == 0))) {
 		SCTPDBG(SCTP_DEBUG_INPUT3, "Calling chunk OUTPUT\n");
 		sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED);
 		SCTPDBG(SCTP_DEBUG_INPUT3, "chunk OUTPUT returns\n");
 	}
 #ifdef SCTP_AUDITING_ENABLED
 	sctp_audit_log(0xE0, 3);
 	sctp_auditing(2, inp, stcb, net);
 #endif
 out:
 	if (stcb != NULL) {
 		SCTP_TCB_UNLOCK(stcb);
 	}
 	if (inp_decr != NULL) {
 		/* reduce ref-count */
 		SCTP_INP_WLOCK(inp_decr);
 		SCTP_INP_DECR_REF(inp_decr);
 		SCTP_INP_WUNLOCK(inp_decr);
 	}
 	return;
 }
 
 #ifdef INET
 /*
  * IPv4 input path for one SCTP packet.
  *
  * Pulls the IP header, SCTP common header and first chunk header into the
  * first mbuf, builds the source and destination socket addresses from
  * them, rejects packets whose mbuf chain length differs from the IP length
  * as well as multicast and broadcast destinations, decides whether the
  * checksum still has to be verified in software and then hands the packet
  * to sctp_common_input_processing().
  *
  * i_pak is the received packet, off the length of its IP header and port
  * the UDP encapsulation port passed through to the common input routine.
  * The mbuf chain is freed here unless m_pullup() failed.
  */
 void
 sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port)
 {
 	struct sockaddr_in from, to;
 	struct sctp_chunkhdr *first_chunk;
 	struct sctphdr *sctp_hdr;
 	struct ip *ip_hdr;
 	struct mbuf *m;
 	uint32_t vrf_id = 0;
 	uint32_t flow_id;
 	uint16_t fib;
 	uint8_t flow_type, tos, need_sw_crc;
 	int hdrs_len, chunk_off, ip_len;
 
 	if (SCTP_GET_PKT_VRFID(i_pak, vrf_id)) {
 		SCTP_RELEASE_PKT(i_pak);
 		return;
 	}
 	m = SCTP_HEADER_TO_CHAIN(i_pak);
 #ifdef SCTP_MBUF_LOGGING
 	/* Log in any input mbufs */
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
 		sctp_log_mbc(m, SCTP_MBUF_INPUT);
 	}
 #endif
 #ifdef SCTP_PACKET_LOGGING
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) {
 		sctp_packet_log(m);
 	}
 #endif
 	SCTPDBG(SCTP_DEBUG_CRCOFFLOAD,
 	    "sctp_input(): Packet of length %d received on %s with csum_flags 0x%b.\n",
 	    m->m_pkthdr.len,
 	    if_name(m->m_pkthdr.rcvif),
 	    (int)m->m_pkthdr.csum_flags, CSUM_BITS);
 	/* Remember the flow information before the chain is touched. */
 	flow_id = m->m_pkthdr.flowid;
 	flow_type = M_HASHTYPE_GET(m);
 	fib = M_GETFIB(m);
 	SCTP_STAT_INCR(sctps_recvpackets);
 	SCTP_STAT_INCR_COUNTER64(sctps_inpackets);
 
 	/*
 	 * The IP header, the SCTP common header and the first chunk header
 	 * must be contiguous in the first mbuf.
 	 */
 	hdrs_len = off + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
 	if ((SCTP_BUF_LEN(m) < hdrs_len) &&
 	    ((m = m_pullup(m, hdrs_len)) == NULL)) {
 		SCTP_STAT_INCR(sctps_hdrops);
 		return;
 	}
 	ip_hdr = mtod(m, struct ip *);
 	sctp_hdr = (struct sctphdr *)((caddr_t)ip_hdr + off);
 	first_chunk = (struct sctp_chunkhdr *)((caddr_t)sctp_hdr + sizeof(struct sctphdr));
 	/* Chunk processing starts right behind the common header. */
 	chunk_off = off + sizeof(struct sctphdr);
 
 	memset(&from, 0, sizeof(struct sockaddr_in));
 	memset(&to, 0, sizeof(struct sockaddr_in));
 	from.sin_len = sizeof(struct sockaddr_in);
 	from.sin_family = AF_INET;
 	from.sin_addr = ip_hdr->ip_src;
 	from.sin_port = sctp_hdr->src_port;
 	to.sin_len = sizeof(struct sockaddr_in);
 	to.sin_family = AF_INET;
 	to.sin_addr = ip_hdr->ip_dst;
 	to.sin_port = sctp_hdr->dest_port;
 
 	ip_len = ntohs(ip_hdr->ip_len);
 	/* Validate mbuf chain length with IP payload length. */
 	if (SCTP_HEADER_LEN(m) != ip_len) {
 		SCTPDBG(SCTP_DEBUG_INPUT1,
 		    "sctp_input() length:%d reported length:%d\n", ip_len, SCTP_HEADER_LEN(m));
 		SCTP_STAT_INCR(sctps_hdrops);
 		goto drop;
 	}
 	/* SCTP does not allow broadcasts or multicasts */
 	if (IN_MULTICAST(ntohl(to.sin_addr.s_addr)) ||
 	    SCTP_IS_IT_BROADCAST(to.sin_addr, m)) {
 		goto drop;
 	}
 	tos = ip_hdr->ip_tos;
 	/* Only verify the checksum here if the lower layers did not. */
 	if ((m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) == 0) {
 		SCTP_STAT_INCR(sctps_recvswcrc);
 		need_sw_crc = 1;
 	} else {
 		SCTP_STAT_INCR(sctps_recvhwcrc);
 		need_sw_crc = 0;
 	}
 	sctp_common_input_processing(&m, off, chunk_off, ip_len,
 	    (struct sockaddr *)&from,
 	    (struct sockaddr *)&to,
 	    sctp_hdr, first_chunk,
 	    need_sw_crc,
 	    tos,
 	    flow_type, flow_id, fib,
 	    vrf_id, port);
 drop:
 	if (m != NULL) {
 		sctp_m_freem(m);
 	}
 	return;
 }
 
 #if defined(SCTP_MCORE_INPUT) && defined(SMP)
 extern int *sctp_cpuarry;
 #endif
 
 /*
  * Protocol input hook for SCTP over IPv4.
  *
  * With SCTP_MCORE_INPUT on an SMP kernel running more than one CPU, the
  * packet is queued to the CPU selected by its flow id; a flow id is built
  * from the verification tag and the ports when the packet carries no hash
  * type.  Otherwise the packet is processed directly with an encapsulation
  * port of 0.  Always returns IPPROTO_DONE.
  */
 int
 sctp_input(struct mbuf **mp, int *offp, int proto SCTP_UNUSED)
 {
 	struct mbuf *m = *mp;
 	int off = *offp;
 
 #if defined(SCTP_MCORE_INPUT) && defined(SMP)
 	if (mp_ncpus > 1) {
 		struct sctphdr *sh;
 		uint32_t flowid, vtag;
 		int hdr_end;
 		int target_cpu;
 
 		if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE) {
 			/*
 			 * No flow id built by lower layers fix it so we
 			 * create one.
 			 */
 			hdr_end = off + sizeof(struct sctphdr);
 			if ((SCTP_BUF_LEN(m) < hdr_end) &&
 			    ((m = m_pullup(m, hdr_end)) == NULL)) {
 				SCTP_STAT_INCR(sctps_hdrops);
 				return (IPPROTO_DONE);
 			}
 			sh = (struct sctphdr *)(mtod(m, caddr_t) + off);
 			vtag = htonl(sh->v_tag);
 			flowid = vtag ^ ntohs(sh->dest_port) ^ ntohs(sh->src_port);
 			m->m_pkthdr.flowid = flowid;
 			M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE_HASH);
 		} else {
 			/* The lower layers already supplied a flow id. */
 			flowid = m->m_pkthdr.flowid;
 		}
 		target_cpu = sctp_cpuarry[flowid % mp_ncpus];
 		sctp_queue_to_mcore(m, off, target_cpu);
 		return (IPPROTO_DONE);
 	}
 #endif
 	sctp_input_with_port(m, off, 0);
 	return (IPPROTO_DONE);
 }
 #endif
Index: projects/clang1100-import/sys/netinet/sctp_output.c
===================================================================
--- projects/clang1100-import/sys/netinet/sctp_output.c	(revision 364278)
+++ projects/clang1100-import/sys/netinet/sctp_output.c	(revision 364279)
@@ -1,13818 +1,13830 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
  * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * a) Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
  *
  * b) Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the distribution.
  *
  * c) Neither the name of Cisco Systems, Inc. nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <netinet/sctp_os.h>
 #include <sys/proc.h>
 #include <netinet/sctp_var.h>
 #include <netinet/sctp_sysctl.h>
 #include <netinet/sctp_header.h>
 #include <netinet/sctp_pcb.h>
 #include <netinet/sctputil.h>
 #include <netinet/sctp_output.h>
 #include <netinet/sctp_uio.h>
 #include <netinet/sctputil.h>
 #include <netinet/sctp_auth.h>
 #include <netinet/sctp_timer.h>
 #include <netinet/sctp_asconf.h>
 #include <netinet/sctp_indata.h>
 #include <netinet/sctp_bsd_addr.h>
 #include <netinet/sctp_input.h>
 #include <netinet/sctp_crc32.h>
 #include <netinet/sctp_kdtrace.h>
 #if defined(INET) || defined(INET6)
 #include <netinet/udp.h>
 #endif
 #include <netinet/udp_var.h>
 #include <machine/in_cksum.h>
 
 
 /*
  * One entry of the sack_array lookup table: the runs of consecutive set
  * bits found in a single byte, expressed as gap ack blocks.  A byte can hold
  * at most SCTP_MAX_GAPS_INARRAY (4) separate runs (bit pattern 0x55/0xaa).
  */
 #define SCTP_MAX_GAPS_INARRAY 4
 struct sack_track {
 	uint8_t right_edge;	/* mergeable on the right edge (bit 0 set) */
 	uint8_t left_edge;	/* mergeable on the left edge (bit 7 set) */
 	uint8_t num_entries;	/* number of valid entries in gaps[] */
 	uint8_t spare;		/* always 0 in sack_array */
 	struct sctp_gap_ack_block gaps[SCTP_MAX_GAPS_INARRAY];
 };
 
 /*
  * Per-byte run table, indexed by the byte value (0x00 - 0xff).
  *
  * Each entry is { right_edge, left_edge, num_entries, spare, gaps[4] }.
  * gaps[] lists, from the lowest bit upwards, the {first, last} bit positions
  * of every run of consecutive set bits in the index byte; unused slots are
  * {0, 0}.  right_edge is 1 when bit 0 is set and left_edge is 1 when bit 7
  * is set.
  */
 const struct sack_track sack_array[256] = {
 	{0, 0, 0, 0, {{0, 0}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x00 */
 	{1, 0, 1, 0, {{0, 0}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x01 */
 	{0, 0, 1, 0, {{1, 1}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x02 */
 	{1, 0, 1, 0, {{0, 1}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x03 */
 	{0, 0, 1, 0, {{2, 2}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x04 */
 	{1, 0, 2, 0, {{0, 0}, {2, 2}, {0, 0}, {0, 0}}},	/* 0x05 */
 	{0, 0, 1, 0, {{1, 2}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x06 */
 	{1, 0, 1, 0, {{0, 2}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x07 */
 	{0, 0, 1, 0, {{3, 3}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x08 */
 	{1, 0, 2, 0, {{0, 0}, {3, 3}, {0, 0}, {0, 0}}},	/* 0x09 */
 	{0, 0, 2, 0, {{1, 1}, {3, 3}, {0, 0}, {0, 0}}},	/* 0x0a */
 	{1, 0, 2, 0, {{0, 1}, {3, 3}, {0, 0}, {0, 0}}},	/* 0x0b */
 	{0, 0, 1, 0, {{2, 3}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x0c */
 	{1, 0, 2, 0, {{0, 0}, {2, 3}, {0, 0}, {0, 0}}},	/* 0x0d */
 	{0, 0, 1, 0, {{1, 3}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x0e */
 	{1, 0, 1, 0, {{0, 3}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x0f */
 	{0, 0, 1, 0, {{4, 4}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x10 */
 	{1, 0, 2, 0, {{0, 0}, {4, 4}, {0, 0}, {0, 0}}},	/* 0x11 */
 	{0, 0, 2, 0, {{1, 1}, {4, 4}, {0, 0}, {0, 0}}},	/* 0x12 */
 	{1, 0, 2, 0, {{0, 1}, {4, 4}, {0, 0}, {0, 0}}},	/* 0x13 */
 	{0, 0, 2, 0, {{2, 2}, {4, 4}, {0, 0}, {0, 0}}},	/* 0x14 */
 	{1, 0, 3, 0, {{0, 0}, {2, 2}, {4, 4}, {0, 0}}},	/* 0x15 */
 	{0, 0, 2, 0, {{1, 2}, {4, 4}, {0, 0}, {0, 0}}},	/* 0x16 */
 	{1, 0, 2, 0, {{0, 2}, {4, 4}, {0, 0}, {0, 0}}},	/* 0x17 */
 	{0, 0, 1, 0, {{3, 4}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x18 */
 	{1, 0, 2, 0, {{0, 0}, {3, 4}, {0, 0}, {0, 0}}},	/* 0x19 */
 	{0, 0, 2, 0, {{1, 1}, {3, 4}, {0, 0}, {0, 0}}},	/* 0x1a */
 	{1, 0, 2, 0, {{0, 1}, {3, 4}, {0, 0}, {0, 0}}},	/* 0x1b */
 	{0, 0, 1, 0, {{2, 4}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x1c */
 	{1, 0, 2, 0, {{0, 0}, {2, 4}, {0, 0}, {0, 0}}},	/* 0x1d */
 	{0, 0, 1, 0, {{1, 4}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x1e */
 	{1, 0, 1, 0, {{0, 4}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x1f */
 	{0, 0, 1, 0, {{5, 5}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x20 */
 	{1, 0, 2, 0, {{0, 0}, {5, 5}, {0, 0}, {0, 0}}},	/* 0x21 */
 	{0, 0, 2, 0, {{1, 1}, {5, 5}, {0, 0}, {0, 0}}},	/* 0x22 */
 	{1, 0, 2, 0, {{0, 1}, {5, 5}, {0, 0}, {0, 0}}},	/* 0x23 */
 	{0, 0, 2, 0, {{2, 2}, {5, 5}, {0, 0}, {0, 0}}},	/* 0x24 */
 	{1, 0, 3, 0, {{0, 0}, {2, 2}, {5, 5}, {0, 0}}},	/* 0x25 */
 	{0, 0, 2, 0, {{1, 2}, {5, 5}, {0, 0}, {0, 0}}},	/* 0x26 */
 	{1, 0, 2, 0, {{0, 2}, {5, 5}, {0, 0}, {0, 0}}},	/* 0x27 */
 	{0, 0, 2, 0, {{3, 3}, {5, 5}, {0, 0}, {0, 0}}},	/* 0x28 */
 	{1, 0, 3, 0, {{0, 0}, {3, 3}, {5, 5}, {0, 0}}},	/* 0x29 */
 	{0, 0, 3, 0, {{1, 1}, {3, 3}, {5, 5}, {0, 0}}},	/* 0x2a */
 	{1, 0, 3, 0, {{0, 1}, {3, 3}, {5, 5}, {0, 0}}},	/* 0x2b */
 	{0, 0, 2, 0, {{2, 3}, {5, 5}, {0, 0}, {0, 0}}},	/* 0x2c */
 	{1, 0, 3, 0, {{0, 0}, {2, 3}, {5, 5}, {0, 0}}},	/* 0x2d */
 	{0, 0, 2, 0, {{1, 3}, {5, 5}, {0, 0}, {0, 0}}},	/* 0x2e */
 	{1, 0, 2, 0, {{0, 3}, {5, 5}, {0, 0}, {0, 0}}},	/* 0x2f */
 	{0, 0, 1, 0, {{4, 5}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x30 */
 	{1, 0, 2, 0, {{0, 0}, {4, 5}, {0, 0}, {0, 0}}},	/* 0x31 */
 	{0, 0, 2, 0, {{1, 1}, {4, 5}, {0, 0}, {0, 0}}},	/* 0x32 */
 	{1, 0, 2, 0, {{0, 1}, {4, 5}, {0, 0}, {0, 0}}},	/* 0x33 */
 	{0, 0, 2, 0, {{2, 2}, {4, 5}, {0, 0}, {0, 0}}},	/* 0x34 */
 	{1, 0, 3, 0, {{0, 0}, {2, 2}, {4, 5}, {0, 0}}},	/* 0x35 */
 	{0, 0, 2, 0, {{1, 2}, {4, 5}, {0, 0}, {0, 0}}},	/* 0x36 */
 	{1, 0, 2, 0, {{0, 2}, {4, 5}, {0, 0}, {0, 0}}},	/* 0x37 */
 	{0, 0, 1, 0, {{3, 5}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x38 */
 	{1, 0, 2, 0, {{0, 0}, {3, 5}, {0, 0}, {0, 0}}},	/* 0x39 */
 	{0, 0, 2, 0, {{1, 1}, {3, 5}, {0, 0}, {0, 0}}},	/* 0x3a */
 	{1, 0, 2, 0, {{0, 1}, {3, 5}, {0, 0}, {0, 0}}},	/* 0x3b */
 	{0, 0, 1, 0, {{2, 5}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x3c */
 	{1, 0, 2, 0, {{0, 0}, {2, 5}, {0, 0}, {0, 0}}},	/* 0x3d */
 	{0, 0, 1, 0, {{1, 5}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x3e */
 	{1, 0, 1, 0, {{0, 5}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x3f */
 	{0, 0, 1, 0, {{6, 6}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x40 */
 	{1, 0, 2, 0, {{0, 0}, {6, 6}, {0, 0}, {0, 0}}},	/* 0x41 */
 	{0, 0, 2, 0, {{1, 1}, {6, 6}, {0, 0}, {0, 0}}},	/* 0x42 */
 	{1, 0, 2, 0, {{0, 1}, {6, 6}, {0, 0}, {0, 0}}},	/* 0x43 */
 	{0, 0, 2, 0, {{2, 2}, {6, 6}, {0, 0}, {0, 0}}},	/* 0x44 */
 	{1, 0, 3, 0, {{0, 0}, {2, 2}, {6, 6}, {0, 0}}},	/* 0x45 */
 	{0, 0, 2, 0, {{1, 2}, {6, 6}, {0, 0}, {0, 0}}},	/* 0x46 */
 	{1, 0, 2, 0, {{0, 2}, {6, 6}, {0, 0}, {0, 0}}},	/* 0x47 */
 	{0, 0, 2, 0, {{3, 3}, {6, 6}, {0, 0}, {0, 0}}},	/* 0x48 */
 	{1, 0, 3, 0, {{0, 0}, {3, 3}, {6, 6}, {0, 0}}},	/* 0x49 */
 	{0, 0, 3, 0, {{1, 1}, {3, 3}, {6, 6}, {0, 0}}},	/* 0x4a */
 	{1, 0, 3, 0, {{0, 1}, {3, 3}, {6, 6}, {0, 0}}},	/* 0x4b */
 	{0, 0, 2, 0, {{2, 3}, {6, 6}, {0, 0}, {0, 0}}},	/* 0x4c */
 	{1, 0, 3, 0, {{0, 0}, {2, 3}, {6, 6}, {0, 0}}},	/* 0x4d */
 	{0, 0, 2, 0, {{1, 3}, {6, 6}, {0, 0}, {0, 0}}},	/* 0x4e */
 	{1, 0, 2, 0, {{0, 3}, {6, 6}, {0, 0}, {0, 0}}},	/* 0x4f */
 	{0, 0, 2, 0, {{4, 4}, {6, 6}, {0, 0}, {0, 0}}},	/* 0x50 */
 	{1, 0, 3, 0, {{0, 0}, {4, 4}, {6, 6}, {0, 0}}},	/* 0x51 */
 	{0, 0, 3, 0, {{1, 1}, {4, 4}, {6, 6}, {0, 0}}},	/* 0x52 */
 	{1, 0, 3, 0, {{0, 1}, {4, 4}, {6, 6}, {0, 0}}},	/* 0x53 */
 	{0, 0, 3, 0, {{2, 2}, {4, 4}, {6, 6}, {0, 0}}},	/* 0x54 */
 	{1, 0, 4, 0, {{0, 0}, {2, 2}, {4, 4}, {6, 6}}},	/* 0x55 */
 	{0, 0, 3, 0, {{1, 2}, {4, 4}, {6, 6}, {0, 0}}},	/* 0x56 */
 	{1, 0, 3, 0, {{0, 2}, {4, 4}, {6, 6}, {0, 0}}},	/* 0x57 */
 	{0, 0, 2, 0, {{3, 4}, {6, 6}, {0, 0}, {0, 0}}},	/* 0x58 */
 	{1, 0, 3, 0, {{0, 0}, {3, 4}, {6, 6}, {0, 0}}},	/* 0x59 */
 	{0, 0, 3, 0, {{1, 1}, {3, 4}, {6, 6}, {0, 0}}},	/* 0x5a */
 	{1, 0, 3, 0, {{0, 1}, {3, 4}, {6, 6}, {0, 0}}},	/* 0x5b */
 	{0, 0, 2, 0, {{2, 4}, {6, 6}, {0, 0}, {0, 0}}},	/* 0x5c */
 	{1, 0, 3, 0, {{0, 0}, {2, 4}, {6, 6}, {0, 0}}},	/* 0x5d */
 	{0, 0, 2, 0, {{1, 4}, {6, 6}, {0, 0}, {0, 0}}},	/* 0x5e */
 	{1, 0, 2, 0, {{0, 4}, {6, 6}, {0, 0}, {0, 0}}},	/* 0x5f */
 	{0, 0, 1, 0, {{5, 6}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x60 */
 	{1, 0, 2, 0, {{0, 0}, {5, 6}, {0, 0}, {0, 0}}},	/* 0x61 */
 	{0, 0, 2, 0, {{1, 1}, {5, 6}, {0, 0}, {0, 0}}},	/* 0x62 */
 	{1, 0, 2, 0, {{0, 1}, {5, 6}, {0, 0}, {0, 0}}},	/* 0x63 */
 	{0, 0, 2, 0, {{2, 2}, {5, 6}, {0, 0}, {0, 0}}},	/* 0x64 */
 	{1, 0, 3, 0, {{0, 0}, {2, 2}, {5, 6}, {0, 0}}},	/* 0x65 */
 	{0, 0, 2, 0, {{1, 2}, {5, 6}, {0, 0}, {0, 0}}},	/* 0x66 */
 	{1, 0, 2, 0, {{0, 2}, {5, 6}, {0, 0}, {0, 0}}},	/* 0x67 */
 	{0, 0, 2, 0, {{3, 3}, {5, 6}, {0, 0}, {0, 0}}},	/* 0x68 */
 	{1, 0, 3, 0, {{0, 0}, {3, 3}, {5, 6}, {0, 0}}},	/* 0x69 */
 	{0, 0, 3, 0, {{1, 1}, {3, 3}, {5, 6}, {0, 0}}},	/* 0x6a */
 	{1, 0, 3, 0, {{0, 1}, {3, 3}, {5, 6}, {0, 0}}},	/* 0x6b */
 	{0, 0, 2, 0, {{2, 3}, {5, 6}, {0, 0}, {0, 0}}},	/* 0x6c */
 	{1, 0, 3, 0, {{0, 0}, {2, 3}, {5, 6}, {0, 0}}},	/* 0x6d */
 	{0, 0, 2, 0, {{1, 3}, {5, 6}, {0, 0}, {0, 0}}},	/* 0x6e */
 	{1, 0, 2, 0, {{0, 3}, {5, 6}, {0, 0}, {0, 0}}},	/* 0x6f */
 	{0, 0, 1, 0, {{4, 6}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x70 */
 	{1, 0, 2, 0, {{0, 0}, {4, 6}, {0, 0}, {0, 0}}},	/* 0x71 */
 	{0, 0, 2, 0, {{1, 1}, {4, 6}, {0, 0}, {0, 0}}},	/* 0x72 */
 	{1, 0, 2, 0, {{0, 1}, {4, 6}, {0, 0}, {0, 0}}},	/* 0x73 */
 	{0, 0, 2, 0, {{2, 2}, {4, 6}, {0, 0}, {0, 0}}},	/* 0x74 */
 	{1, 0, 3, 0, {{0, 0}, {2, 2}, {4, 6}, {0, 0}}},	/* 0x75 */
 	{0, 0, 2, 0, {{1, 2}, {4, 6}, {0, 0}, {0, 0}}},	/* 0x76 */
 	{1, 0, 2, 0, {{0, 2}, {4, 6}, {0, 0}, {0, 0}}},	/* 0x77 */
 	{0, 0, 1, 0, {{3, 6}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x78 */
 	{1, 0, 2, 0, {{0, 0}, {3, 6}, {0, 0}, {0, 0}}},	/* 0x79 */
 	{0, 0, 2, 0, {{1, 1}, {3, 6}, {0, 0}, {0, 0}}},	/* 0x7a */
 	{1, 0, 2, 0, {{0, 1}, {3, 6}, {0, 0}, {0, 0}}},	/* 0x7b */
 	{0, 0, 1, 0, {{2, 6}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x7c */
 	{1, 0, 2, 0, {{0, 0}, {2, 6}, {0, 0}, {0, 0}}},	/* 0x7d */
 	{0, 0, 1, 0, {{1, 6}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x7e */
 	{1, 0, 1, 0, {{0, 6}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x7f */
 	{0, 1, 1, 0, {{7, 7}, {0, 0}, {0, 0}, {0, 0}}},	/* 0x80 */
 	{1, 1, 2, 0, {{0, 0}, {7, 7}, {0, 0}, {0, 0}}},	/* 0x81 */
 	{0, 1, 2, 0, {{1, 1}, {7, 7}, {0, 0}, {0, 0}}},	/* 0x82 */
 	{1, 1, 2, 0, {{0, 1}, {7, 7}, {0, 0}, {0, 0}}},	/* 0x83 */
 	{0, 1, 2, 0, {{2, 2}, {7, 7}, {0, 0}, {0, 0}}},	/* 0x84 */
 	{1, 1, 3, 0, {{0, 0}, {2, 2}, {7, 7}, {0, 0}}},	/* 0x85 */
 	{0, 1, 2, 0, {{1, 2}, {7, 7}, {0, 0}, {0, 0}}},	/* 0x86 */
 	{1, 1, 2, 0, {{0, 2}, {7, 7}, {0, 0}, {0, 0}}},	/* 0x87 */
 	{0, 1, 2, 0, {{3, 3}, {7, 7}, {0, 0}, {0, 0}}},	/* 0x88 */
 	{1, 1, 3, 0, {{0, 0}, {3, 3}, {7, 7}, {0, 0}}},	/* 0x89 */
 	{0, 1, 3, 0, {{1, 1}, {3, 3}, {7, 7}, {0, 0}}},	/* 0x8a */
 	{1, 1, 3, 0, {{0, 1}, {3, 3}, {7, 7}, {0, 0}}},	/* 0x8b */
 	{0, 1, 2, 0, {{2, 3}, {7, 7}, {0, 0}, {0, 0}}},	/* 0x8c */
 	{1, 1, 3, 0, {{0, 0}, {2, 3}, {7, 7}, {0, 0}}},	/* 0x8d */
 	{0, 1, 2, 0, {{1, 3}, {7, 7}, {0, 0}, {0, 0}}},	/* 0x8e */
 	{1, 1, 2, 0, {{0, 3}, {7, 7}, {0, 0}, {0, 0}}},	/* 0x8f */
 	{0, 1, 2, 0, {{4, 4}, {7, 7}, {0, 0}, {0, 0}}},	/* 0x90 */
 	{1, 1, 3, 0, {{0, 0}, {4, 4}, {7, 7}, {0, 0}}},	/* 0x91 */
 	{0, 1, 3, 0, {{1, 1}, {4, 4}, {7, 7}, {0, 0}}},	/* 0x92 */
 	{1, 1, 3, 0, {{0, 1}, {4, 4}, {7, 7}, {0, 0}}},	/* 0x93 */
 	{0, 1, 3, 0, {{2, 2}, {4, 4}, {7, 7}, {0, 0}}},	/* 0x94 */
 	{1, 1, 4, 0, {{0, 0}, {2, 2}, {4, 4}, {7, 7}}},	/* 0x95 */
 	{0, 1, 3, 0, {{1, 2}, {4, 4}, {7, 7}, {0, 0}}},	/* 0x96 */
 	{1, 1, 3, 0, {{0, 2}, {4, 4}, {7, 7}, {0, 0}}},	/* 0x97 */
 	{0, 1, 2, 0, {{3, 4}, {7, 7}, {0, 0}, {0, 0}}},	/* 0x98 */
 	{1, 1, 3, 0, {{0, 0}, {3, 4}, {7, 7}, {0, 0}}},	/* 0x99 */
 	{0, 1, 3, 0, {{1, 1}, {3, 4}, {7, 7}, {0, 0}}},	/* 0x9a */
 	{1, 1, 3, 0, {{0, 1}, {3, 4}, {7, 7}, {0, 0}}},	/* 0x9b */
 	{0, 1, 2, 0, {{2, 4}, {7, 7}, {0, 0}, {0, 0}}},	/* 0x9c */
 	{1, 1, 3, 0, {{0, 0}, {2, 4}, {7, 7}, {0, 0}}},	/* 0x9d */
 	{0, 1, 2, 0, {{1, 4}, {7, 7}, {0, 0}, {0, 0}}},	/* 0x9e */
 	{1, 1, 2, 0, {{0, 4}, {7, 7}, {0, 0}, {0, 0}}},	/* 0x9f */
 	{0, 1, 2, 0, {{5, 5}, {7, 7}, {0, 0}, {0, 0}}},	/* 0xa0 */
 	{1, 1, 3, 0, {{0, 0}, {5, 5}, {7, 7}, {0, 0}}},	/* 0xa1 */
 	{0, 1, 3, 0, {{1, 1}, {5, 5}, {7, 7}, {0, 0}}},	/* 0xa2 */
 	{1, 1, 3, 0, {{0, 1}, {5, 5}, {7, 7}, {0, 0}}},	/* 0xa3 */
 	{0, 1, 3, 0, {{2, 2}, {5, 5}, {7, 7}, {0, 0}}},	/* 0xa4 */
 	{1, 1, 4, 0, {{0, 0}, {2, 2}, {5, 5}, {7, 7}}},	/* 0xa5 */
 	{0, 1, 3, 0, {{1, 2}, {5, 5}, {7, 7}, {0, 0}}},	/* 0xa6 */
 	{1, 1, 3, 0, {{0, 2}, {5, 5}, {7, 7}, {0, 0}}},	/* 0xa7 */
 	{0, 1, 3, 0, {{3, 3}, {5, 5}, {7, 7}, {0, 0}}},	/* 0xa8 */
 	{1, 1, 4, 0, {{0, 0}, {3, 3}, {5, 5}, {7, 7}}},	/* 0xa9 */
 	{0, 1, 4, 0, {{1, 1}, {3, 3}, {5, 5}, {7, 7}}},	/* 0xaa */
 	{1, 1, 4, 0, {{0, 1}, {3, 3}, {5, 5}, {7, 7}}},	/* 0xab */
 	{0, 1, 3, 0, {{2, 3}, {5, 5}, {7, 7}, {0, 0}}},	/* 0xac */
 	{1, 1, 4, 0, {{0, 0}, {2, 3}, {5, 5}, {7, 7}}},	/* 0xad */
 	{0, 1, 3, 0, {{1, 3}, {5, 5}, {7, 7}, {0, 0}}},	/* 0xae */
 	{1, 1, 3, 0, {{0, 3}, {5, 5}, {7, 7}, {0, 0}}},	/* 0xaf */
 	{0, 1, 2, 0, {{4, 5}, {7, 7}, {0, 0}, {0, 0}}},	/* 0xb0 */
 	{1, 1, 3, 0, {{0, 0}, {4, 5}, {7, 7}, {0, 0}}},	/* 0xb1 */
 	{0, 1, 3, 0, {{1, 1}, {4, 5}, {7, 7}, {0, 0}}},	/* 0xb2 */
 	{1, 1, 3, 0, {{0, 1}, {4, 5}, {7, 7}, {0, 0}}},	/* 0xb3 */
 	{0, 1, 3, 0, {{2, 2}, {4, 5}, {7, 7}, {0, 0}}},	/* 0xb4 */
 	{1, 1, 4, 0, {{0, 0}, {2, 2}, {4, 5}, {7, 7}}},	/* 0xb5 */
 	{0, 1, 3, 0, {{1, 2}, {4, 5}, {7, 7}, {0, 0}}},	/* 0xb6 */
 	{1, 1, 3, 0, {{0, 2}, {4, 5}, {7, 7}, {0, 0}}},	/* 0xb7 */
 	{0, 1, 2, 0, {{3, 5}, {7, 7}, {0, 0}, {0, 0}}},	/* 0xb8 */
 	{1, 1, 3, 0, {{0, 0}, {3, 5}, {7, 7}, {0, 0}}},	/* 0xb9 */
 	{0, 1, 3, 0, {{1, 1}, {3, 5}, {7, 7}, {0, 0}}},	/* 0xba */
 	{1, 1, 3, 0, {{0, 1}, {3, 5}, {7, 7}, {0, 0}}},	/* 0xbb */
 	{0, 1, 2, 0, {{2, 5}, {7, 7}, {0, 0}, {0, 0}}},	/* 0xbc */
 	{1, 1, 3, 0, {{0, 0}, {2, 5}, {7, 7}, {0, 0}}},	/* 0xbd */
 	{0, 1, 2, 0, {{1, 5}, {7, 7}, {0, 0}, {0, 0}}},	/* 0xbe */
 	{1, 1, 2, 0, {{0, 5}, {7, 7}, {0, 0}, {0, 0}}},	/* 0xbf */
 	{0, 1, 1, 0, {{6, 7}, {0, 0}, {0, 0}, {0, 0}}},	/* 0xc0 */
 	{1, 1, 2, 0, {{0, 0}, {6, 7}, {0, 0}, {0, 0}}},	/* 0xc1 */
 	{0, 1, 2, 0, {{1, 1}, {6, 7}, {0, 0}, {0, 0}}},	/* 0xc2 */
 	{1, 1, 2, 0, {{0, 1}, {6, 7}, {0, 0}, {0, 0}}},	/* 0xc3 */
 	{0, 1, 2, 0, {{2, 2}, {6, 7}, {0, 0}, {0, 0}}},	/* 0xc4 */
 	{1, 1, 3, 0, {{0, 0}, {2, 2}, {6, 7}, {0, 0}}},	/* 0xc5 */
 	{0, 1, 2, 0, {{1, 2}, {6, 7}, {0, 0}, {0, 0}}},	/* 0xc6 */
 	{1, 1, 2, 0, {{0, 2}, {6, 7}, {0, 0}, {0, 0}}},	/* 0xc7 */
 	{0, 1, 2, 0, {{3, 3}, {6, 7}, {0, 0}, {0, 0}}},	/* 0xc8 */
 	{1, 1, 3, 0, {{0, 0}, {3, 3}, {6, 7}, {0, 0}}},	/* 0xc9 */
 	{0, 1, 3, 0, {{1, 1}, {3, 3}, {6, 7}, {0, 0}}},	/* 0xca */
 	{1, 1, 3, 0, {{0, 1}, {3, 3}, {6, 7}, {0, 0}}},	/* 0xcb */
 	{0, 1, 2, 0, {{2, 3}, {6, 7}, {0, 0}, {0, 0}}},	/* 0xcc */
 	{1, 1, 3, 0, {{0, 0}, {2, 3}, {6, 7}, {0, 0}}},	/* 0xcd */
 	{0, 1, 2, 0, {{1, 3}, {6, 7}, {0, 0}, {0, 0}}},	/* 0xce */
 	{1, 1, 2, 0, {{0, 3}, {6, 7}, {0, 0}, {0, 0}}},	/* 0xcf */
 	{0, 1, 2, 0, {{4, 4}, {6, 7}, {0, 0}, {0, 0}}},	/* 0xd0 */
 	{1, 1, 3, 0, {{0, 0}, {4, 4}, {6, 7}, {0, 0}}},	/* 0xd1 */
 	{0, 1, 3, 0, {{1, 1}, {4, 4}, {6, 7}, {0, 0}}},	/* 0xd2 */
 	{1, 1, 3, 0, {{0, 1}, {4, 4}, {6, 7}, {0, 0}}},	/* 0xd3 */
 	{0, 1, 3, 0, {{2, 2}, {4, 4}, {6, 7}, {0, 0}}},	/* 0xd4 */
 	{1, 1, 4, 0, {{0, 0}, {2, 2}, {4, 4}, {6, 7}}},	/* 0xd5 */
 	{0, 1, 3, 0, {{1, 2}, {4, 4}, {6, 7}, {0, 0}}},	/* 0xd6 */
 	{1, 1, 3, 0, {{0, 2}, {4, 4}, {6, 7}, {0, 0}}},	/* 0xd7 */
 	{0, 1, 2, 0, {{3, 4}, {6, 7}, {0, 0}, {0, 0}}},	/* 0xd8 */
 	{1, 1, 3, 0, {{0, 0}, {3, 4}, {6, 7}, {0, 0}}},	/* 0xd9 */
 	{0, 1, 3, 0, {{1, 1}, {3, 4}, {6, 7}, {0, 0}}},	/* 0xda */
 	{1, 1, 3, 0, {{0, 1}, {3, 4}, {6, 7}, {0, 0}}},	/* 0xdb */
 	{0, 1, 2, 0, {{2, 4}, {6, 7}, {0, 0}, {0, 0}}},	/* 0xdc */
 	{1, 1, 3, 0, {{0, 0}, {2, 4}, {6, 7}, {0, 0}}},	/* 0xdd */
 	{0, 1, 2, 0, {{1, 4}, {6, 7}, {0, 0}, {0, 0}}},	/* 0xde */
 	{1, 1, 2, 0, {{0, 4}, {6, 7}, {0, 0}, {0, 0}}},	/* 0xdf */
 	{0, 1, 1, 0, {{5, 7}, {0, 0}, {0, 0}, {0, 0}}},	/* 0xe0 */
 	{1, 1, 2, 0, {{0, 0}, {5, 7}, {0, 0}, {0, 0}}},	/* 0xe1 */
 	{0, 1, 2, 0, {{1, 1}, {5, 7}, {0, 0}, {0, 0}}},	/* 0xe2 */
 	{1, 1, 2, 0, {{0, 1}, {5, 7}, {0, 0}, {0, 0}}},	/* 0xe3 */
 	{0, 1, 2, 0, {{2, 2}, {5, 7}, {0, 0}, {0, 0}}},	/* 0xe4 */
 	{1, 1, 3, 0, {{0, 0}, {2, 2}, {5, 7}, {0, 0}}},	/* 0xe5 */
 	{0, 1, 2, 0, {{1, 2}, {5, 7}, {0, 0}, {0, 0}}},	/* 0xe6 */
 	{1, 1, 2, 0, {{0, 2}, {5, 7}, {0, 0}, {0, 0}}},	/* 0xe7 */
 	{0, 1, 2, 0, {{3, 3}, {5, 7}, {0, 0}, {0, 0}}},	/* 0xe8 */
 	{1, 1, 3, 0, {{0, 0}, {3, 3}, {5, 7}, {0, 0}}},	/* 0xe9 */
 	{0, 1, 3, 0, {{1, 1}, {3, 3}, {5, 7}, {0, 0}}},	/* 0xea */
 	{1, 1, 3, 0, {{0, 1}, {3, 3}, {5, 7}, {0, 0}}},	/* 0xeb */
 	{0, 1, 2, 0, {{2, 3}, {5, 7}, {0, 0}, {0, 0}}},	/* 0xec */
 	{1, 1, 3, 0, {{0, 0}, {2, 3}, {5, 7}, {0, 0}}},	/* 0xed */
 	{0, 1, 2, 0, {{1, 3}, {5, 7}, {0, 0}, {0, 0}}},	/* 0xee */
 	{1, 1, 2, 0, {{0, 3}, {5, 7}, {0, 0}, {0, 0}}},	/* 0xef */
 	{0, 1, 1, 0, {{4, 7}, {0, 0}, {0, 0}, {0, 0}}},	/* 0xf0 */
 	{1, 1, 2, 0, {{0, 0}, {4, 7}, {0, 0}, {0, 0}}},	/* 0xf1 */
 	{0, 1, 2, 0, {{1, 1}, {4, 7}, {0, 0}, {0, 0}}},	/* 0xf2 */
 	{1, 1, 2, 0, {{0, 1}, {4, 7}, {0, 0}, {0, 0}}},	/* 0xf3 */
 	{0, 1, 2, 0, {{2, 2}, {4, 7}, {0, 0}, {0, 0}}},	/* 0xf4 */
 	{1, 1, 3, 0, {{0, 0}, {2, 2}, {4, 7}, {0, 0}}},	/* 0xf5 */
 	{0, 1, 2, 0, {{1, 2}, {4, 7}, {0, 0}, {0, 0}}},	/* 0xf6 */
 	{1, 1, 2, 0, {{0, 2}, {4, 7}, {0, 0}, {0, 0}}},	/* 0xf7 */
 	{0, 1, 1, 0, {{3, 7}, {0, 0}, {0, 0}, {0, 0}}},	/* 0xf8 */
 	{1, 1, 2, 0, {{0, 0}, {3, 7}, {0, 0}, {0, 0}}},	/* 0xf9 */
 	{0, 1, 2, 0, {{1, 1}, {3, 7}, {0, 0}, {0, 0}}},	/* 0xfa */
 	{1, 1, 2, 0, {{0, 1}, {3, 7}, {0, 0}, {0, 0}}},	/* 0xfb */
 	{0, 1, 1, 0, {{2, 7}, {0, 0}, {0, 0}, {0, 0}}},	/* 0xfc */
 	{1, 1, 2, 0, {{0, 0}, {2, 7}, {0, 0}, {0, 0}}},	/* 0xfd */
 	{0, 1, 1, 0, {{1, 7}, {0, 0}, {0, 0}, {0, 0}}},	/* 0xfe */
 	{1, 1, 1, 0, {{0, 7}, {0, 0}, {0, 0}, {0, 0}}}	/* 0xff */
 };
 
 
 /*
  * Decide whether the local address 'ifa' may be used under 'scope'.
  *
  * Returns 1 when the address is in scope and 0 otherwise.  For IPv6
  * addresses, a non-zero 'do_update' refreshes the cached interface address
  * flags via sctp_gather_internal_ifa_flags() before they are examined.
  */
 int
 sctp_is_address_in_scope(struct sctp_ifa *ifa,
     struct sctp_scoping *scope,
     int do_update)
 {
 	/* Addresses on loopback interfaces need loopback scope. */
 	if ((scope->loopback_scope == 0) &&
 	    (ifa->ifn_p) && SCTP_IFN_IS_IFT_LOOP(ifa->ifn_p)) {
 		return (0);
 	}
 	switch (ifa->address.sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		{
 			struct sockaddr_in *sin;
 
 			if (scope->ipv4_addr_legal == 0) {
 				/* IPv4 is not allowed at all */
 				return (0);
 			}
 			sin = &ifa->address.sin;
 			if (sin->sin_addr.s_addr == 0) {
 				/* unspecified address is never in scope */
 				return (0);
 			}
 			if ((scope->ipv4_local_scope == 0) &&
 			    (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) {
 				/* private address without local scope */
 				return (0);
 			}
 			break;
 		}
 #endif
 #ifdef INET6
 	case AF_INET6:
 		{
 			struct sockaddr_in6 *sin6;
 
 			if (scope->ipv6_addr_legal == 0) {
 				/* IPv6 is not allowed at all */
 				return (0);
 			}
 			/*
 			 * The flags must be refreshed before they are
 			 * tested, which means any IFA locking has to be
 			 * in effect at this point.
 			 */
 			if (do_update) {
 				sctp_gather_internal_ifa_flags(ifa);
 			}
 			if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
 				return (0);
 			}
 			/* ok to use deprecated addresses? */
 			sin6 = &ifa->address.sin6;
 			/*
 			 * Unspecified addresses are skipped, and link-local
 			 * addresses are rejected regardless of scope.
 			 */
 			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
 			    IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
 				return (0);
 			}
 			if ((scope->site_scope == 0) &&
 			    (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) {
 				return (0);
 			}
 			break;
 		}
 #endif
 	default:
 		/* unsupported address family */
 		return (0);
 	}
 	return (1);
 }
 
 /*
  * Append an IPv4 or IPv6 address parameter describing 'ifa' to mbuf 'm'.
  *
  * The parameter is written into the trailing space of 'm' when it fits;
  * otherwise a fresh mbuf is linked onto the end of the chain and the
  * parameter is written there.  When 'len' is non-NULL it is advanced by the
  * parameter length.  Returns the mbuf that received the parameter, or 'm'
  * unchanged when the address family is unsupported or no mbuf could be
  * allocated.
  */
 static struct mbuf *
 sctp_add_addr_to_mbuf(struct mbuf *m, struct sctp_ifa *ifa, uint16_t *len)
 {
 #if defined(INET) || defined(INET6)
 	struct sctp_paramhdr *paramh;
 	struct mbuf *mret;
 	uint16_t plen;
 #endif
 
 	/* Size of the parameter depends on the address family. */
 	switch (ifa->address.sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		plen = (uint16_t)sizeof(struct sctp_ipv4addr_param);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		plen = (uint16_t)sizeof(struct sctp_ipv6addr_param);
 		break;
 #endif
 	default:
 		return (m);
 	}
 #if defined(INET) || defined(INET6)
 	mret = m;
 	if (M_TRAILINGSPACE(m) < plen) {
 		struct mbuf *fresh;
 
 		/* Not enough room: hang a new mbuf off the chain's tail. */
 		while (SCTP_BUF_NEXT(mret) != NULL) {
 			mret = SCTP_BUF_NEXT(mret);
 		}
 		fresh = sctp_get_mbuf_for_msg(plen, 0, M_NOWAIT, 1, MT_DATA);
 		SCTP_BUF_NEXT(mret) = fresh;
 		if (fresh == NULL) {
 			/* Allocation failed, no more addresses can be added */
 			return (m);
 		}
 		mret = fresh;
 		paramh = mtod(mret, struct sctp_paramhdr *);
 	} else {
 		/* It fits: place it right after the current data. */
 		paramh = (struct sctp_paramhdr *)(SCTP_BUF_AT(m, SCTP_BUF_LEN(m)));
 	}
 	/* Fill in the parameter itself. */
 	switch (ifa->address.sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		{
 			struct sctp_ipv4addr_param *ipv4p;
 
 			ipv4p = (struct sctp_ipv4addr_param *)paramh;
 			paramh->param_type = htons(SCTP_IPV4_ADDRESS);
 			paramh->param_length = htons(plen);
 			ipv4p->addr = ifa->address.sin.sin_addr.s_addr;
 			break;
 		}
 #endif
 #ifdef INET6
 	case AF_INET6:
 		{
 			struct sctp_ipv6addr_param *ipv6p;
 
 			ipv6p = (struct sctp_ipv6addr_param *)paramh;
 			paramh->param_type = htons(SCTP_IPV6_ADDRESS);
 			paramh->param_length = htons(plen);
 			memcpy(ipv6p->addr, &ifa->address.sin6.sin6_addr,
 			    sizeof(ipv6p->addr));
 			/* clear embedded scope in the address */
 			in6_clearscope((struct in6_addr *)ipv6p->addr);
 			break;
 		}
 #endif
 	default:
 		return (m);
 	}
 	/* Account for the bytes just written. */
 	SCTP_BUF_LEN(mret) += plen;
 	if (len != NULL) {
 		*len += plen;
 	}
 	return (mret);
 #endif
 }
 
 
 /*
  * Append address parameters for the endpoint's usable local addresses to
  * the mbuf chain ending at 'm_at'.
  *
  * inp          - endpoint; supplies the default VRF id, the bound-all flag,
  *                the jail credentials and (when not bound-all) the bound
  *                address list.
  * stcb         - association, passed to sctp_is_addr_restricted().
  * scope        - scoping rules each candidate address is checked against.
  * m_at         - mbuf to append to.
  * cnt_inits_to - starting value for the address count.
  * padding_len  - optional; pending padding that is zero-filled into m_at
  *                and folded into *chunk_len before an address is added.
  * chunk_len    - optional; running chunk length, advanced as bytes are
  *                added.
  *
  * Addresses are only listed when the count exceeds one.  The address read
  * lock (SCTP_IPI_ADDR_RLOCK) is held for the whole walk.  Returns the mbuf
  * that received the last parameter, or 'm_at' when nothing was added.
  */
 struct mbuf *
 sctp_add_addresses_to_i_ia(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
     struct sctp_scoping *scope,
     struct mbuf *m_at, int cnt_inits_to,
     uint16_t *padding_len, uint16_t *chunk_len)
 {
 	struct sctp_vrf *vrf = NULL;
 	int cnt, limit_out = 0, total_count;
 	uint32_t vrf_id;
 
 	vrf_id = inp->def_vrf_id;
 	SCTP_IPI_ADDR_RLOCK();
 	vrf = sctp_find_vrf(vrf_id);
 	if (vrf == NULL) {
 		/* Unknown VRF: nothing to list. */
 		SCTP_IPI_ADDR_RUNLOCK();
 		return (m_at);
 	}
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
 		struct sctp_ifa *sctp_ifap;
 		struct sctp_ifn *sctp_ifnp;
 
 		cnt = cnt_inits_to;
 		if (vrf->total_ifa_count > SCTP_COUNT_LIMIT) {
 			/*
 			 * Too many addresses to count individually: assume
 			 * the limit is reached and restrict what is listed.
 			 */
 			limit_out = 1;
 			cnt = SCTP_ADDRESS_LIMIT;
 			goto skip_count;
 		}
 		/* First pass: count the addresses that would be listed. */
 		LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) {
 			if ((scope->loopback_scope == 0) &&
 			    SCTP_IFN_IS_IFT_LOOP(sctp_ifnp)) {
 				/*
 				 * Skip loopback devices if loopback_scope
 				 * not set
 				 */
 				continue;
 			}
 			LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) {
 #ifdef INET
 				/* Skip addresses the jail check rejects. */
 				if ((sctp_ifap->address.sa.sa_family == AF_INET) &&
 				    (prison_check_ip4(inp->ip_inp.inp.inp_cred,
 				    &sctp_ifap->address.sin.sin_addr) != 0)) {
 					continue;
 				}
 #endif
 #ifdef INET6
 				if ((sctp_ifap->address.sa.sa_family == AF_INET6) &&
 				    (prison_check_ip6(inp->ip_inp.inp.inp_cred,
 				    &sctp_ifap->address.sin6.sin6_addr) != 0)) {
 					continue;
 				}
 #endif
 				if (sctp_is_addr_restricted(stcb, sctp_ifap)) {
 					continue;
 				}
 				/* do_update = 1: refresh IPv6 flags while counting */
 				if (sctp_is_address_in_scope(sctp_ifap, scope, 1) == 0) {
 					continue;
 				}
 				cnt++;
 				if (cnt > SCTP_ADDRESS_LIMIT) {
 					break;
 				}
 			}
 			if (cnt > SCTP_ADDRESS_LIMIT) {
 				break;
 			}
 		}
 skip_count:
 		/* Second pass: emit the addresses, but only if more than one. */
 		if (cnt > 1) {
 			total_count = 0;
 			LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) {
 				/* cnt now counts addresses per interface */
 				cnt = 0;
 				if ((scope->loopback_scope == 0) &&
 				    SCTP_IFN_IS_IFT_LOOP(sctp_ifnp)) {
 					/*
 					 * Skip loopback devices if
 					 * loopback_scope not set
 					 */
 					continue;
 				}
 				LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) {
 #ifdef INET
 					if ((sctp_ifap->address.sa.sa_family == AF_INET) &&
 					    (prison_check_ip4(inp->ip_inp.inp.inp_cred,
 					    &sctp_ifap->address.sin.sin_addr) != 0)) {
 						continue;
 					}
 #endif
 #ifdef INET6
 					if ((sctp_ifap->address.sa.sa_family == AF_INET6) &&
 					    (prison_check_ip6(inp->ip_inp.inp.inp_cred,
 					    &sctp_ifap->address.sin6.sin6_addr) != 0)) {
 						continue;
 					}
 #endif
 					if (sctp_is_addr_restricted(stcb, sctp_ifap)) {
 						continue;
 					}
 					if (sctp_is_address_in_scope(sctp_ifap,
 					    scope, 0) == 0) {
 						continue;
 					}
 					/*
 					 * Flush pending padding: zero it in
 					 * place at offset *chunk_len and make
 					 * it part of the chunk before the
 					 * address is appended.
 					 */
 					if ((chunk_len != NULL) &&
 					    (padding_len != NULL) &&
 					    (*padding_len > 0)) {
 						memset(mtod(m_at, caddr_t)+*chunk_len, 0, *padding_len);
 						SCTP_BUF_LEN(m_at) += *padding_len;
 						*chunk_len += *padding_len;
 						*padding_len = 0;
 					}
 					m_at = sctp_add_addr_to_mbuf(m_at, sctp_ifap, chunk_len);
 					if (limit_out) {
 						cnt++;
 						total_count++;
 						if (cnt >= 2) {
 							/*
 							 * at most two
 							 * addresses from each
 							 * interface
 							 */
 							break;
 						}
 						/*
 						 * NOTE(review): this break only
 						 * leaves the inner loop; the
 						 * interface loop continues.
 						 * Confirm that is intended.
 						 */
 						if (total_count > SCTP_ADDRESS_LIMIT) {
 							/* No more addresses */
 							break;
 						}
 					}
 				}
 			}
 		}
 	} else {
 		struct sctp_laddr *laddr;
 
 		cnt = cnt_inits_to;
 		/* First, how many ? */
 		LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
 			if (laddr->ifa == NULL) {
 				continue;
 			}
 			if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED)
 				/*
 				 * Address being deleted by the system, don't
 				 * list.
 				 */
 				continue;
 			if (laddr->action == SCTP_DEL_IP_ADDRESS) {
 				/*
 				 * Address being deleted on this ep, don't
 				 * list.
 				 */
 				continue;
 			}
 			if (sctp_is_address_in_scope(laddr->ifa,
 			    scope, 1) == 0) {
 				continue;
 			}
 			cnt++;
 		}
 		/*
 		 * To get through a NAT we only list addresses if we have
 		 * more than one. That way if you just bind a single address
 		 * we let the source of the init dictate our address.
 		 */
 		if (cnt > 1) {
 			cnt = cnt_inits_to;
 			/*
 			 * NOTE(review): unlike the counting loop above, this
 			 * loop does not skip entries whose action is
 			 * SCTP_DEL_IP_ADDRESS.  Confirm that is intended.
 			 */
 			LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
 				if (laddr->ifa == NULL) {
 					continue;
 				}
 				if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) {
 					continue;
 				}
 				if (sctp_is_address_in_scope(laddr->ifa,
 				    scope, 0) == 0) {
 					continue;
 				}
 				/* Flush pending padding before appending. */
 				if ((chunk_len != NULL) &&
 				    (padding_len != NULL) &&
 				    (*padding_len > 0)) {
 					memset(mtod(m_at, caddr_t)+*chunk_len, 0, *padding_len);
 					SCTP_BUF_LEN(m_at) += *padding_len;
 					*chunk_len += *padding_len;
 					*padding_len = 0;
 				}
 				m_at = sctp_add_addr_to_mbuf(m_at, laddr->ifa, chunk_len);
 				cnt++;
 				if (cnt >= SCTP_ADDRESS_LIMIT) {
 					break;
 				}
 			}
 		}
 	}
 	SCTP_IPI_ADDR_RUNLOCK();
 	return (m_at);
 }
 
 /*
  * Decide whether `ifa' is a "preferred" source address for a destination
  * whose scope is described by dest_is_loop / dest_is_priv and whose
  * address family is `fam'.
  *
  * A destination that is neither loopback nor private is treated as global.
  * The decision table implemented below (L = loopback, P = private,
  * G = global):
  *
  *    src | dest | result
  *    ----+------+----------------
  *     L  |  L   | yes
  *     P  |  L   | yes-v4, no-v6
  *     G  |  L   | yes-v4, no-v6
  *     L  |  P   | no
  *     P  |  P   | yes
  *     G  |  P   | no
  *     L  |  G   | no
  *     P  |  G   | no
  *     G  |  G   | yes
  *
  * Returns `ifa' itself when it is preferred, NULL otherwise.  No reference
  * is taken on the returned address here.
  */
 static struct sctp_ifa *
 sctp_is_ifa_addr_preferred(struct sctp_ifa *ifa,
     uint8_t dest_is_loop,
     uint8_t dest_is_priv,
     sa_family_t fam)
 {
 	uint8_t dst_global;
 
 	/* A source of the wrong address family can never be used. */
 	if (fam != ifa->address.sa.sa_family) {
 		return (NULL);
 	}
 	/* Not loopback and not private means the destination is global. */
 	dst_global = ((dest_is_loop == 0) && (dest_is_priv == 0)) ? 1 : 0;
 
 	SCTPDBG(SCTP_DEBUG_OUTPUT2, "Is destination preferred:");
 	SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &ifa->address.sa);
 #ifdef INET6
 	if (fam == AF_INET6) {
 		/* Addresses flagged unusable are never preferred. */
 		if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
 			SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:1\n");
 			return (NULL);
 		}
 		if (dest_is_loop) {
 			/* v6: private (non-loopback) source to loopback. */
 			if (ifa->src_is_priv && !ifa->src_is_loop) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:2\n");
 				return (NULL);
 			}
 			/* v6: global source to loopback. */
 			if (ifa->src_is_glob) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:3\n");
 				return (NULL);
 			}
 		}
 	}
 #endif
 	SCTPDBG(SCTP_DEBUG_OUTPUT3, "src_loop:%d src_priv:%d src_glob:%d\n",
 	    ifa->src_is_loop, ifa->src_is_priv, ifa->src_is_glob);
 	SCTPDBG(SCTP_DEBUG_OUTPUT3, "dest_loop:%d dest_priv:%d dest_glob:%d\n",
 	    dest_is_loop, dest_is_priv, dst_global);
 
 	/*
 	 * Family independent part of the table, grouped by destination
 	 * scope.  A private and a global destination are mutually
 	 * exclusive, so at most one of the two groups below can reject.
 	 */
 	if (dest_is_priv) {
 		if (ifa->src_is_loop) {
 			SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:4\n");
 			return (NULL);
 		}
 		if (ifa->src_is_glob) {
 			SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:5\n");
 			return (NULL);
 		}
 	}
 	if (dst_global) {
 		if (ifa->src_is_loop) {
 			SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:6\n");
 			return (NULL);
 		}
 		if (ifa->src_is_priv) {
 			SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:7\n");
 			return (NULL);
 		}
 	}
 	/* Nothing in the table rejected it: it is a preferred address. */
 	SCTPDBG(SCTP_DEBUG_OUTPUT3, "YES\n");
 	return (ifa);
 }
 
 /*
  * Decide whether `ifa' is an "acceptable" source address for a destination
  * whose scope is described by dest_is_loop / dest_is_priv and whose
  * address family is `fam'.
  *
  * This is a looser test than the "preferred" one: to allow for NAT a
  * private source towards a global destination (and a global source
  * towards a private destination) is accepted, although it may not work.
  *
  * L = loopback, P = private, G = global:
  *
  *    src | dest | result
  *    ----+------+---------------------
  *     L  |  L   | yes
  *     P  |  L   | yes-v4, no-v6
  *     G  |  L   | yes
  *     L  |  P   | no
  *     P  |  P   | yes
  *     G  |  P   | yes - may not work
  *     L  |  G   | no
  *     P  |  G   | yes - may not work
  *     G  |  G   | yes
  *
  * Returns `ifa' itself when it is acceptable, NULL otherwise.  No
  * reference is taken on the returned address here.
  */
 static struct sctp_ifa *
 sctp_is_ifa_addr_acceptable(struct sctp_ifa *ifa,
     uint8_t dest_is_loop,
     uint8_t dest_is_priv,
     sa_family_t fam)
 {
 	uint8_t dst_global;
 
 	if (fam != ifa->address.sa.sa_family) {
 		/* A source of the wrong address family is never usable. */
 		SCTPDBG(SCTP_DEBUG_OUTPUT3, "ifa_fam:%d fam:%d\n",
 		    ifa->address.sa.sa_family, fam);
 		return (NULL);
 	}
 	SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT3, &ifa->address.sa);
 	SCTPDBG(SCTP_DEBUG_OUTPUT3, "dst_is_loop:%d dest_is_priv:%d\n",
 	    dest_is_loop, dest_is_priv);
 	/* Not loopback and not private means the destination is global. */
 	dst_global = ((dest_is_priv == 0) && (dest_is_loop == 0)) ? 1 : 0;
 #ifdef INET6
 	if (fam == AF_INET6) {
 		/* Addresses flagged unusable are rejected outright. */
 		if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
 			return (NULL);
 		}
 		/* Special case: private (link-local) source to loopback. */
 		if (dest_is_loop && ifa->src_is_priv) {
 			return (NULL);
 		}
 	}
 #endif
 	/*
 	 * Family independent part of the table: a loopback source is only
 	 * good for a loopback destination.
 	 */
 	SCTPDBG(SCTP_DEBUG_OUTPUT3, "ifa->src_is_loop:%d dest_is_priv:%d\n",
 	    ifa->src_is_loop,
 	    dest_is_priv);
 	if (dest_is_priv && (ifa->src_is_loop == 1)) {
 		return (NULL);
 	}
 	SCTPDBG(SCTP_DEBUG_OUTPUT3, "ifa->src_is_loop:%d dest_is_glob:%d\n",
 	    ifa->src_is_loop,
 	    dst_global);
 	if (dst_global && (ifa->src_is_loop == 1)) {
 		return (NULL);
 	}
 	/* Nothing rejected it: it is an acceptable address. */
 	SCTPDBG(SCTP_DEBUG_OUTPUT3, "address is acceptable\n");
 	return (ifa);
 }
 
 /*
  * Report whether `ifa' appears on the restricted address list of the
  * association `stcb'.
  *
  * Returns 1 when an entry of stcb->asoc.sctp_restricted_addrs points at
  * `ifa', 0 otherwise.  Without a TCB (stcb == NULL) nothing is restricted
  * and 0 is returned.  List entries whose ifa pointer is NULL are reported
  * through the debug facility and skipped.
  */
 int
 sctp_is_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa)
 {
 	struct sctp_laddr *entry;
 	int restricted = 0;
 
 	if (stcb != NULL) {
 		LIST_FOREACH(entry, &stcb->asoc.sctp_restricted_addrs, sctp_nxt_addr) {
 			if (entry->ifa == NULL) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n",
 				    __func__);
 			} else if (entry->ifa == ifa) {
 				/* Found it on the list, no need to look further. */
 				restricted = 1;
 				break;
 			}
 		}
 	}
 	return (restricted);
 }
 
 
 /*
  * Report whether `ifa' is one of the addresses on the endpoint's
  * inp->sctp_addr_list.
  *
  * Only list entries that point at the very same sctp_ifa object and whose
  * `action' field is 0 count as a match.  Returns 1 on a match, 0 when
  * there is none or when `ifa' is NULL.  List entries whose ifa pointer is
  * NULL are reported through the debug facility and skipped.
  */
 int
 sctp_is_addr_in_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa)
 {
 	struct sctp_laddr *entry;
 
 	if (ifa != NULL) {
 		LIST_FOREACH(entry, &inp->sctp_addr_list, sctp_nxt_addr) {
 			if (entry->ifa == NULL) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n",
 				    __func__);
 				continue;
 			}
 			if ((entry->action == 0) && (entry->ifa == ifa)) {
 				/* Same pointer and no action recorded on it. */
 				return (1);
 			}
 		}
 	}
 	return (0);
 }
 
 
 
 /*
  * Pick a source address for an endpoint that is bound to specific
  * addresses, when only the endpoint (no association) is available.
  *
  * Search order:
  *   1. a preferred address on the interface the route emits on that is
  *      also on the endpoint's address list;
  *   2. the first preferred address on the endpoint's address list,
  *      searching from inp->next_addr_touse and wrapping to the list head;
  *   3. the first acceptable address on the endpoint's address list,
  *      searched the same way.
  *
  * Parameters:
  *   inp              - endpoint whose sctp_addr_list is searched.
  *   ro               - route; supplies the emit interface and its index.
  *   vrf_id           - VRF to look up; NULL is returned if it is unknown.
  *   non_asoc_addr_ok - when 0, addresses flagged SCTP_ADDR_DEFER_USE are
  *                      skipped in step 1.
  *   dest_is_priv,
  *   dest_is_loop     - scope of the destination.
  *   fam              - address family required for the source.
  *
  * Returns the chosen address with its refcount incremented by one, or
  * NULL when no address qualifies.
  *
  * Side effect: inp->next_addr_touse may be rewritten while searching
  * (set to the list head or to NULL); it is not advanced to the entry
  * that was selected.
  */
 static struct sctp_ifa *
 sctp_choose_boundspecific_inp(struct sctp_inpcb *inp,
     sctp_route_t *ro,
     uint32_t vrf_id,
     int non_asoc_addr_ok,
     uint8_t dest_is_priv,
     uint8_t dest_is_loop,
     sa_family_t fam)
 {
 	struct sctp_laddr *laddr, *starting_point;
 	void *ifn;
 	int resettotop = 0;
 	struct sctp_ifn *sctp_ifn;
 	struct sctp_ifa *sctp_ifa, *sifa;
 	struct sctp_vrf *vrf;
 	uint32_t ifn_index;
 
 	/* The VRF is only used as an existence check in this function. */
 	vrf = sctp_find_vrf(vrf_id);
 	if (vrf == NULL)
 		return (NULL);
 
 	ifn = SCTP_GET_IFN_VOID_FROM_ROUTE(ro);
 	ifn_index = SCTP_GET_IF_INDEX_FROM_ROUTE(ro);
 	sctp_ifn = sctp_find_ifn(ifn, ifn_index);
 	/*
 	 * First question: is the interface we will emit on known to us?
 	 * If so, we want a preferred address from it that is also bound
 	 * to this endpoint.
 	 */
 	if (sctp_ifn) {
 		/* is a preferred one on the interface we route out? */
 		LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
 #ifdef INET
 			/* Skip IPv4 addresses the prison check rejects. */
 			if ((sctp_ifa->address.sa.sa_family == AF_INET) &&
 			    (prison_check_ip4(inp->ip_inp.inp.inp_cred,
 			    &sctp_ifa->address.sin.sin_addr) != 0)) {
 				continue;
 			}
 #endif
 #ifdef INET6
 			/* Skip IPv6 addresses the prison check rejects. */
 			if ((sctp_ifa->address.sa.sa_family == AF_INET6) &&
 			    (prison_check_ip6(inp->ip_inp.inp.inp_cred,
 			    &sctp_ifa->address.sin6.sin6_addr) != 0)) {
 				continue;
 			}
 #endif
 			/* Deferred addresses need non_asoc_addr_ok. */
 			if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) &&
 			    (non_asoc_addr_ok == 0))
 				continue;
 			sifa = sctp_is_ifa_addr_preferred(sctp_ifa,
 			    dest_is_loop,
 			    dest_is_priv, fam);
 			if (sifa == NULL)
 				continue;
 			/* Only usable if it is actually bound to the ep. */
 			if (sctp_is_addr_in_ep(inp, sifa)) {
 				atomic_add_int(&sifa->refcount, 1);
 				return (sifa);
 			}
 		}
 	}
 	/*
 	 * We could not get an address on the emitting interface, so we now
 	 * need to find one on the endpoint's list of addresses: first a
 	 * preferred one, failing that an acceptable one, otherwise we
 	 * return NULL.
 	 */
 	starting_point = inp->next_addr_touse;
 once_again:
 	if (inp->next_addr_touse == NULL) {
 		/* Start (or restart) from the head of the bound list. */
 		inp->next_addr_touse = LIST_FIRST(&inp->sctp_addr_list);
 		resettotop = 1;
 	}
 	for (laddr = inp->next_addr_touse; laddr;
 	    laddr = LIST_NEXT(laddr, sctp_nxt_addr)) {
 		if (laddr->ifa == NULL) {
 			/* address has been removed */
 			continue;
 		}
 		if (laddr->action == SCTP_DEL_IP_ADDRESS) {
 			/* address is being deleted */
 			continue;
 		}
 		sifa = sctp_is_ifa_addr_preferred(laddr->ifa, dest_is_loop,
 		    dest_is_priv, fam);
 		if (sifa == NULL)
 			continue;
 		atomic_add_int(&sifa->refcount, 1);
 		return (sifa);
 	}
 	if (resettotop == 0) {
 		/* Searched only the tail so far; wrap around to the head. */
 		inp->next_addr_touse = NULL;
 		goto once_again;
 	}
 
 	/* No preferred address; rewind and repeat for acceptable ones. */
 	inp->next_addr_touse = starting_point;
 	resettotop = 0;
 once_again_too:
 	if (inp->next_addr_touse == NULL) {
 		inp->next_addr_touse = LIST_FIRST(&inp->sctp_addr_list);
 		resettotop = 1;
 	}
 
 	/* ok, what about an acceptable address in the inp */
 	for (laddr = inp->next_addr_touse; laddr;
 	    laddr = LIST_NEXT(laddr, sctp_nxt_addr)) {
 		if (laddr->ifa == NULL) {
 			/* address has been removed */
 			continue;
 		}
 		if (laddr->action == SCTP_DEL_IP_ADDRESS) {
 			/* address is being deleted */
 			continue;
 		}
 		sifa = sctp_is_ifa_addr_acceptable(laddr->ifa, dest_is_loop,
 		    dest_is_priv, fam);
 		if (sifa == NULL)
 			continue;
 		atomic_add_int(&sifa->refcount, 1);
 		return (sifa);
 	}
 	if (resettotop == 0) {
 		/* Searched only the tail so far; wrap around to the head. */
 		inp->next_addr_touse = NULL;
 		goto once_again_too;
 	}
 
 	/*
 	 * No bound address can be a source for this destination; we are
 	 * in trouble.
 	 */
 	return (NULL);
 }
 
 
 
 /*
  * Pick a source address for an association whose endpoint is bound to
  * specific addresses.
  *
  * Search order:
  *   1. a preferred address on the emit interface that is bound to the
  *      endpoint;
  *   2. an acceptable address on the emit interface that is bound to the
  *      endpoint;
  *   3. the first preferred address on the endpoint's address list,
  *      searching from stcb->asoc.last_used_address and wrapping to the
  *      list head;
  *   4. the first acceptable address on the endpoint's address list,
  *      searched the same way.
  *
  * In every step an address on the association's restricted list is
  * rejected, unless non_asoc_addr_ok is set and the address is pending
  * (sctp_is_addr_pending()).
  *
  * Parameters:
  *   inp              - endpoint whose sctp_addr_list is searched.
  *   stcb             - association; dereferenced unconditionally, so it
  *                      must not be NULL.
  *   ro               - route; supplies the emit interface and its index.
  *   vrf_id           - VRF to look up; NULL is returned if it is unknown.
  *   dest_is_priv,
  *   dest_is_loop     - scope of the destination.
  *   non_asoc_addr_ok - when 0, addresses flagged SCTP_ADDR_DEFER_USE are
  *                      skipped in steps 1 and 2; also relaxes the
  *                      restricted check as described above.
  *   fam              - address family required for the source.
  *
  * Returns the chosen address with its refcount incremented by one, or
  * NULL when no address qualifies.
  *
  * Side effect: stcb->asoc.last_used_address is rewritten while searching
  * and is set to the selected list entry when steps 3 or 4 succeed.
  */
 static struct sctp_ifa *
 sctp_choose_boundspecific_stcb(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     sctp_route_t *ro,
     uint32_t vrf_id,
     uint8_t dest_is_priv,
     uint8_t dest_is_loop,
     int non_asoc_addr_ok,
     sa_family_t fam)
 {
 	struct sctp_laddr *laddr, *starting_point;
 	void *ifn;
 	struct sctp_ifn *sctp_ifn;
 	struct sctp_ifa *sctp_ifa, *sifa;
 	uint8_t start_at_beginning = 0;
 	struct sctp_vrf *vrf;
 	uint32_t ifn_index;
 
 	/* The VRF is only used as an existence check in this function. */
 	vrf = sctp_find_vrf(vrf_id);
 	if (vrf == NULL)
 		return (NULL);
 
 	ifn = SCTP_GET_IFN_VOID_FROM_ROUTE(ro);
 	ifn_index = SCTP_GET_IF_INDEX_FROM_ROUTE(ro);
 	sctp_ifn = sctp_find_ifn(ifn, ifn_index);
 
 	/*
 	 * first question, is the ifn we will emit on in our list?  If so,
 	 * we want that one. First we look for a preferred. Second, we go
 	 * for an acceptable.
 	 */
 	if (sctp_ifn) {
 		/* first try for a preferred address on the ep */
 		LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
 #ifdef INET
 			/* Skip IPv4 addresses the prison check rejects. */
 			if ((sctp_ifa->address.sa.sa_family == AF_INET) &&
 			    (prison_check_ip4(inp->ip_inp.inp.inp_cred,
 			    &sctp_ifa->address.sin.sin_addr) != 0)) {
 				continue;
 			}
 #endif
 #ifdef INET6
 			/* Skip IPv6 addresses the prison check rejects. */
 			if ((sctp_ifa->address.sa.sa_family == AF_INET6) &&
 			    (prison_check_ip6(inp->ip_inp.inp.inp_cred,
 			    &sctp_ifa->address.sin6.sin6_addr) != 0)) {
 				continue;
 			}
 #endif
 			/* Deferred addresses need non_asoc_addr_ok. */
 			if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0))
 				continue;
 			if (sctp_is_addr_in_ep(inp, sctp_ifa)) {
 				sifa = sctp_is_ifa_addr_preferred(sctp_ifa, dest_is_loop, dest_is_priv, fam);
 				if (sifa == NULL)
 					continue;
 				/*
 				 * Reject restricted addresses, except a
 				 * pending one when non_asoc_addr_ok is set.
 				 */
 				if (((non_asoc_addr_ok == 0) &&
 				    (sctp_is_addr_restricted(stcb, sifa))) ||
 				    (non_asoc_addr_ok &&
 				    (sctp_is_addr_restricted(stcb, sifa)) &&
 				    (!sctp_is_addr_pending(stcb, sifa)))) {
 					/* on the no-no list */
 					continue;
 				}
 				atomic_add_int(&sifa->refcount, 1);
 				return (sifa);
 			}
 		}
 		/* next try for an acceptable address on the ep */
 		LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
 #ifdef INET
 			/* Skip IPv4 addresses the prison check rejects. */
 			if ((sctp_ifa->address.sa.sa_family == AF_INET) &&
 			    (prison_check_ip4(inp->ip_inp.inp.inp_cred,
 			    &sctp_ifa->address.sin.sin_addr) != 0)) {
 				continue;
 			}
 #endif
 #ifdef INET6
 			/* Skip IPv6 addresses the prison check rejects. */
 			if ((sctp_ifa->address.sa.sa_family == AF_INET6) &&
 			    (prison_check_ip6(inp->ip_inp.inp.inp_cred,
 			    &sctp_ifa->address.sin6.sin6_addr) != 0)) {
 				continue;
 			}
 #endif
 			/* Deferred addresses need non_asoc_addr_ok. */
 			if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0))
 				continue;
 			if (sctp_is_addr_in_ep(inp, sctp_ifa)) {
 				sifa = sctp_is_ifa_addr_acceptable(sctp_ifa, dest_is_loop, dest_is_priv, fam);
 				if (sifa == NULL)
 					continue;
 				/*
 				 * Reject restricted addresses, except a
 				 * pending one when non_asoc_addr_ok is set.
 				 */
 				if (((non_asoc_addr_ok == 0) &&
 				    (sctp_is_addr_restricted(stcb, sifa))) ||
 				    (non_asoc_addr_ok &&
 				    (sctp_is_addr_restricted(stcb, sifa)) &&
 				    (!sctp_is_addr_pending(stcb, sifa)))) {
 					/* on the no-no list */
 					continue;
 				}
 				atomic_add_int(&sifa->refcount, 1);
 				return (sifa);
 			}
 		}
 
 	}
 	/*
 	 * If we can't find one like that then we must look at all bound
 	 * addresses and pick one: first a preferred, secondly an
 	 * acceptable one.
 	 */
 	starting_point = stcb->asoc.last_used_address;
 sctp_from_the_top:
 	if (stcb->asoc.last_used_address == NULL) {
 		/* Start (or restart) from the head of the bound list. */
 		start_at_beginning = 1;
 		stcb->asoc.last_used_address = LIST_FIRST(&inp->sctp_addr_list);
 	}
 	/* search beginning with the last used address */
 	for (laddr = stcb->asoc.last_used_address; laddr;
 	    laddr = LIST_NEXT(laddr, sctp_nxt_addr)) {
 		if (laddr->ifa == NULL) {
 			/* address has been removed */
 			continue;
 		}
 		if (laddr->action == SCTP_DEL_IP_ADDRESS) {
 			/* address is being deleted */
 			continue;
 		}
 		sifa = sctp_is_ifa_addr_preferred(laddr->ifa, dest_is_loop, dest_is_priv, fam);
 		if (sifa == NULL)
 			continue;
 		if (((non_asoc_addr_ok == 0) &&
 		    (sctp_is_addr_restricted(stcb, sifa))) ||
 		    (non_asoc_addr_ok &&
 		    (sctp_is_addr_restricted(stcb, sifa)) &&
 		    (!sctp_is_addr_pending(stcb, sifa)))) {
 			/* on the no-no list */
 			continue;
 		}
 		/* Remember where we stopped for the next selection. */
 		stcb->asoc.last_used_address = laddr;
 		atomic_add_int(&sifa->refcount, 1);
 		return (sifa);
 	}
 	if (start_at_beginning == 0) {
 		/* Searched only the tail so far; wrap around to the head. */
 		stcb->asoc.last_used_address = NULL;
 		goto sctp_from_the_top;
 	}
 	/* now try for any higher scope than the destination */
 	stcb->asoc.last_used_address = starting_point;
 	start_at_beginning = 0;
 sctp_from_the_top2:
 	if (stcb->asoc.last_used_address == NULL) {
 		start_at_beginning = 1;
 		stcb->asoc.last_used_address = LIST_FIRST(&inp->sctp_addr_list);
 	}
 	/* search beginning with the last used address */
 	for (laddr = stcb->asoc.last_used_address; laddr;
 	    laddr = LIST_NEXT(laddr, sctp_nxt_addr)) {
 		if (laddr->ifa == NULL) {
 			/* address has been removed */
 			continue;
 		}
 		if (laddr->action == SCTP_DEL_IP_ADDRESS) {
 			/* address is being deleted */
 			continue;
 		}
 		sifa = sctp_is_ifa_addr_acceptable(laddr->ifa, dest_is_loop,
 		    dest_is_priv, fam);
 		if (sifa == NULL)
 			continue;
 		if (((non_asoc_addr_ok == 0) &&
 		    (sctp_is_addr_restricted(stcb, sifa))) ||
 		    (non_asoc_addr_ok &&
 		    (sctp_is_addr_restricted(stcb, sifa)) &&
 		    (!sctp_is_addr_pending(stcb, sifa)))) {
 			/* on the no-no list */
 			continue;
 		}
 		/* Remember where we stopped for the next selection. */
 		stcb->asoc.last_used_address = laddr;
 		atomic_add_int(&sifa->refcount, 1);
 		return (sifa);
 	}
 	if (start_at_beginning == 0) {
 		/* Searched only the tail so far; wrap around to the head. */
 		stcb->asoc.last_used_address = NULL;
 		goto sctp_from_the_top2;
 	}
 	/* Nothing bound to the endpoint can be used for this destination. */
 	return (NULL);
 }
 
 /*
  * Return the addr_wanted-th (0 is the first) eligible preferred source
  * address on interface `ifn' for a bound-all endpoint.
  *
  * An address on ifn->ifalist is eligible when it:
  *   - passes the prison check for the endpoint's credentials;
  *   - is not flagged SCTP_ADDR_DEFER_USE, unless non_asoc_addr_ok is set;
  *   - is preferred for the destination scope and family
  *     (sctp_is_ifa_addr_preferred());
  *   - for IPv6, passes the loopback and link-local checks below;
  *   - with an association and SCTP_MOBILITY_BASE enabled, matches the
  *     next hop of the route;
  *   - with an association, is in the association's scope and is not
  *     restricted (a restricted but pending address is allowed when
  *     non_asoc_addr_ok is set).
  *
  * `stcb' may be NULL, in which case the association specific checks are
  * skipped.
  *
  * Returns the selected address, or NULL when fewer than addr_wanted + 1
  * addresses are eligible.  No reference is taken here; that is left to
  * the caller.
  */
 static struct sctp_ifa *
 sctp_select_nth_preferred_addr_from_ifn_boundall(struct sctp_ifn *ifn,
     struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     int non_asoc_addr_ok,
     uint8_t dest_is_loop,
     uint8_t dest_is_priv,
     int addr_wanted,
     sa_family_t fam,
     sctp_route_t *ro
 )
 {
 	struct sctp_ifa *ifa, *sifa;
 	int num_eligible_addr = 0;
 #ifdef INET6
 	struct sockaddr_in6 sin6, lsa6;
 
 	/*
 	 * Work on a copy of the destination with its scope recovered; it
 	 * is only filled in (and only read below) when fam is AF_INET6.
 	 */
 	if (fam == AF_INET6) {
 		memcpy(&sin6, &ro->ro_dst, sizeof(struct sockaddr_in6));
 		(void)sa6_recoverscope(&sin6);
 	}
 #endif				/* INET6 */
 	LIST_FOREACH(ifa, &ifn->ifalist, next_ifa) {
 #ifdef INET
 		/* Skip IPv4 addresses the prison check rejects. */
 		if ((ifa->address.sa.sa_family == AF_INET) &&
 		    (prison_check_ip4(inp->ip_inp.inp.inp_cred,
 		    &ifa->address.sin.sin_addr) != 0)) {
 			continue;
 		}
 #endif
 #ifdef INET6
 		/* Skip IPv6 addresses the prison check rejects. */
 		if ((ifa->address.sa.sa_family == AF_INET6) &&
 		    (prison_check_ip6(inp->ip_inp.inp.inp_cred,
 		    &ifa->address.sin6.sin6_addr) != 0)) {
 			continue;
 		}
 #endif
 		/* Deferred addresses need non_asoc_addr_ok. */
 		if ((ifa->localifa_flags & SCTP_ADDR_DEFER_USE) &&
 		    (non_asoc_addr_ok == 0))
 			continue;
 		sifa = sctp_is_ifa_addr_preferred(ifa, dest_is_loop,
 		    dest_is_priv, fam);
 		if (sifa == NULL)
 			continue;
 #ifdef INET6
 		if (fam == AF_INET6 &&
 		    dest_is_loop &&
 		    sifa->src_is_loop && sifa->src_is_priv) {
 			/*
 			 * don't allow fe80::1 to be a src on loop ::1, we
 			 * don't list it to the peer so we will get an
 			 * abort.
 			 */
 			continue;
 		}
 		if (fam == AF_INET6 &&
 		    IN6_IS_ADDR_LINKLOCAL(&sifa->address.sin6.sin6_addr) &&
 		    IN6_IS_ADDR_LINKLOCAL(&sin6.sin6_addr)) {
 			/*
 			 * link-local <-> link-local must belong to the same
 			 * scope.
 			 */
 			memcpy(&lsa6, &sifa->address.sin6, sizeof(struct sockaddr_in6));
 			(void)sa6_recoverscope(&lsa6);
 			if (sin6.sin6_scope_id != lsa6.sin6_scope_id) {
 				continue;
 			}
 		}
 #endif				/* INET6 */
 
 		/*
 		 * Check if the IPv6 address matches the next hop. In the
 		 * mobile case, an old IPv6 address may not have been
 		 * deleted from the interface, so the interface has both
 		 * previous and new addresses.  We should use the one
 		 * corresponding to the next hop.  (by micchie)
 		 */
 #ifdef INET6
 		if (stcb && fam == AF_INET6 &&
 		    sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE)) {
 			if (sctp_v6src_match_nexthop(&sifa->address.sin6, ro)
 			    == 0) {
 				continue;
 			}
 		}
 #endif
 #ifdef INET
 		/* Avoid topologically incorrect IPv4 address */
 		if (stcb && fam == AF_INET &&
 		    sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE)) {
 			if (sctp_v4src_match_nexthop(sifa, ro) == 0) {
 				continue;
 			}
 		}
 #endif
 		if (stcb) {
 			/* Must be within the scope of the association. */
 			if (sctp_is_address_in_scope(ifa, &stcb->asoc.scope, 0) == 0) {
 				continue;
 			}
 			if (((non_asoc_addr_ok == 0) &&
 			    (sctp_is_addr_restricted(stcb, sifa))) ||
 			    (non_asoc_addr_ok &&
 			    (sctp_is_addr_restricted(stcb, sifa)) &&
 			    (!sctp_is_addr_pending(stcb, sifa)))) {
 				/*
 				 * It is restricted for some reason..
 				 * probably not yet added.
 				 */
 				continue;
 			}
 		}
 		/*
 		 * This address is eligible.  Return it once addr_wanted
 		 * eligible addresses have already been passed over.
 		 */
 		if (num_eligible_addr >= addr_wanted) {
 			return (sifa);
 		}
 		num_eligible_addr++;
 	}
 	return (NULL);
 }
 
 
 /*
  * Count the preferred source addresses on interface `ifn' that a
  * bound-all endpoint may use for a destination of the given scope and
  * family.
  *
  * An address on ifn->ifalist is counted when it:
  *   - passes the prison check for the endpoint's credentials;
  *   - is not flagged SCTP_ADDR_DEFER_USE, unless non_asoc_addr_ok is set;
  *   - is preferred for the destination (sctp_is_ifa_addr_preferred());
  *   - with an association, is in the association's scope and is not
  *     restricted (a restricted but pending address is allowed when
  *     non_asoc_addr_ok is set).
  *
  * `stcb' may be NULL, in which case the association specific checks are
  * skipped.
  *
  * The prison checks are applied whether or not an association exists,
  * exactly as sctp_select_nth_preferred_addr_from_ifn_boundall() does, so
  * that addresses the selector would reject on jail grounds are not
  * counted.
  *
  * Returns the number of eligible addresses (0 if there are none).
  */
 static int
 sctp_count_num_preferred_boundall(struct sctp_ifn *ifn,
     struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     int non_asoc_addr_ok,
     uint8_t dest_is_loop,
     uint8_t dest_is_priv,
     sa_family_t fam)
 {
 	struct sctp_ifa *ifa, *sifa;
 	int num_eligible_addr = 0;
 
 	LIST_FOREACH(ifa, &ifn->ifalist, next_ifa) {
 #ifdef INET
 		/* Skip IPv4 addresses the prison check rejects. */
 		if ((ifa->address.sa.sa_family == AF_INET) &&
 		    (prison_check_ip4(inp->ip_inp.inp.inp_cred,
 		    &ifa->address.sin.sin_addr) != 0)) {
 			continue;
 		}
 #endif
 #ifdef INET6
 		/*
 		 * Skip IPv6 addresses the prison check rejects.  This must
 		 * not depend on stcb: the IPv4 check above and the selector
 		 * apply it unconditionally.
 		 */
 		if ((ifa->address.sa.sa_family == AF_INET6) &&
 		    (prison_check_ip6(inp->ip_inp.inp.inp_cred,
 		    &ifa->address.sin6.sin6_addr) != 0)) {
 			continue;
 		}
 #endif
 		/* Deferred addresses need non_asoc_addr_ok. */
 		if ((ifa->localifa_flags & SCTP_ADDR_DEFER_USE) &&
 		    (non_asoc_addr_ok == 0)) {
 			continue;
 		}
 		sifa = sctp_is_ifa_addr_preferred(ifa, dest_is_loop,
 		    dest_is_priv, fam);
 		if (sifa == NULL) {
 			continue;
 		}
 		if (stcb) {
 			/* Must be within the scope of the association. */
 			if (sctp_is_address_in_scope(ifa, &stcb->asoc.scope, 0) == 0) {
 				continue;
 			}
 			if (((non_asoc_addr_ok == 0) &&
 			    (sctp_is_addr_restricted(stcb, sifa))) ||
 			    (non_asoc_addr_ok &&
 			    (sctp_is_addr_restricted(stcb, sifa)) &&
 			    (!sctp_is_addr_pending(stcb, sifa)))) {
 				/*
 				 * It is restricted for some reason..
 				 * probably not yet added.
 				 */
 				continue;
 			}
 		}
 		num_eligible_addr++;
 	}
 	return (num_eligible_addr);
 }
 
 /*
  * Pick a source address for an endpoint that is bound to all addresses.
  *
  * Search order:
  *   plan A - the cur_addr_num-th preferred address on the interface the
  *            route emits on;
  *   plan B - a preferred address on any other interface of the VRF;
  *   plan C - an acceptable address on the emit interface;
  *   plan D - an acceptable address on any interface of the VRF.
  * With INET and an association whose scope does not yet allow private
  * IPv4 addresses, plans C and D are retried once with
  * stcb->asoc.scope.ipv4_local_scope switched on.  If the retry finds an
  * address, every other acceptable private IPv4 address is put on the
  * association's restricted list; if it does not, ipv4_local_scope is
  * switched off again.
  *
  * Parameters:
  *   inp              - endpoint; its credentials drive the prison checks.
  *   stcb             - association, may be NULL.
  *   net              - destination, may be NULL; when set its
  *                      indx_of_eligible_next_to_use seeds and records the
  *                      rotation through preferred addresses.
  *   ro               - route; supplies the emit interface and its index.
  *   vrf_id           - VRF to look up; NULL is returned if it is unknown.
  *   dest_is_priv,
  *   dest_is_loop     - scope of the destination.
  *   non_asoc_addr_ok - allows addresses flagged SCTP_ADDR_DEFER_USE and
  *                      restricted-but-pending addresses.
  *   fam              - address family required for the source.
  *
  * Returns the chosen address with its refcount incremented by exactly
  * one, whichever plan produced it, or NULL when no address qualifies.
  */
 static struct sctp_ifa *
 sctp_choose_boundall(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     struct sctp_nets *net,
     sctp_route_t *ro,
     uint32_t vrf_id,
     uint8_t dest_is_priv,
     uint8_t dest_is_loop,
     int non_asoc_addr_ok,
     sa_family_t fam)
 {
 	int cur_addr_num = 0, num_preferred = 0;
 	void *ifn;
 	struct sctp_ifn *sctp_ifn, *looked_at = NULL, *emit_ifn;
 	struct sctp_ifa *sctp_ifa, *sifa;
 	uint32_t ifn_index;
 	struct sctp_vrf *vrf;
 #ifdef INET
 	int retried = 0;
 #endif
 
 	/*-
 	 * For boundall we can use any address in the association.
 	 * If non_asoc_addr_ok is set we can use any address (at least in
 	 * theory). So we look for preferred addresses first. If we find one,
 	 * we use it. Otherwise we next try to get an address on the
 	 * interface, which we should be able to do (unless non_asoc_addr_ok
 	 * is false and we are routed out that way). In these cases where we
 	 * can't use the address of the interface we go through all the
 	 * ifn's looking for an address we can use and fill that in. Punting
 	 * means we send back address 0, which will probably cause problems
 	 * actually since then IP will fill in the address of the route ifn,
 	 * which means we probably already rejected it.. i.e. here comes an
 	 * abort :-<.
 	 */
 	vrf = sctp_find_vrf(vrf_id);
 	if (vrf == NULL)
 		return (NULL);
 
 	ifn = SCTP_GET_IFN_VOID_FROM_ROUTE(ro);
 	ifn_index = SCTP_GET_IF_INDEX_FROM_ROUTE(ro);
 	SCTPDBG(SCTP_DEBUG_OUTPUT2, "ifn from route:%p ifn_index:%d\n", ifn, ifn_index);
 	emit_ifn = looked_at = sctp_ifn = sctp_find_ifn(ifn, ifn_index);
 	if (sctp_ifn == NULL) {
 		/* ?? We don't have this guy ?? */
 		SCTPDBG(SCTP_DEBUG_OUTPUT2, "No ifn emit interface?\n");
 		goto bound_all_plan_b;
 	}
 	SCTPDBG(SCTP_DEBUG_OUTPUT2, "ifn_index:%d name:%s is emit interface\n",
 	    ifn_index, sctp_ifn->ifn_name);
 
 	if (net) {
 		/* Continue the rotation where this destination left off. */
 		cur_addr_num = net->indx_of_eligible_next_to_use;
 	}
 	num_preferred = sctp_count_num_preferred_boundall(sctp_ifn,
 	    inp, stcb,
 	    non_asoc_addr_ok,
 	    dest_is_loop,
 	    dest_is_priv, fam);
 	SCTPDBG(SCTP_DEBUG_OUTPUT2, "Found %d preferred source addresses for intf:%s\n",
 	    num_preferred, sctp_ifn->ifn_name);
 	if (num_preferred == 0) {
 		/*
 		 * no eligible addresses, we must use some other interface
 		 * address if we can find one.
 		 */
 		goto bound_all_plan_b;
 	}
 	/*
 	 * Ok we have num_preferred set with how many we can use, this
 	 * may vary from call to call due to addresses being deprecated
 	 * etc..
 	 */
 	if (cur_addr_num >= num_preferred) {
 		cur_addr_num = 0;
 	}
 	/*
 	 * select the nth address from the list (where cur_addr_num is the
 	 * nth) and 0 is the first one, 1 is the second one etc...
 	 */
 	SCTPDBG(SCTP_DEBUG_OUTPUT2, "cur_addr_num:%d\n", cur_addr_num);
 
 	sctp_ifa = sctp_select_nth_preferred_addr_from_ifn_boundall(sctp_ifn, inp, stcb, non_asoc_addr_ok, dest_is_loop,
 	    dest_is_priv, cur_addr_num, fam, ro);
 
 	/* if sctp_ifa is NULL something changed??, fall to plan b. */
 	if (sctp_ifa) {
 		atomic_add_int(&sctp_ifa->refcount, 1);
 		if (net) {
 			/* save off where the next one we will want */
 			net->indx_of_eligible_next_to_use = cur_addr_num + 1;
 		}
 		return (sctp_ifa);
 	}
 	/*
 	 * plan_b: Look at all interfaces and find a preferred address. If
 	 * no preferred fall through to plan_c.
 	 */
 bound_all_plan_b:
 	SCTPDBG(SCTP_DEBUG_OUTPUT2, "Trying Plan B\n");
 	LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
 		SCTPDBG(SCTP_DEBUG_OUTPUT2, "Examine interface %s\n",
 		    sctp_ifn->ifn_name);
 		if (dest_is_loop == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) {
 			/* wrong base scope */
 			SCTPDBG(SCTP_DEBUG_OUTPUT2, "skip\n");
 			continue;
 		}
 		if ((sctp_ifn == looked_at) && looked_at) {
 			/* already looked at this guy */
 			SCTPDBG(SCTP_DEBUG_OUTPUT2, "already seen\n");
 			continue;
 		}
 		num_preferred = sctp_count_num_preferred_boundall(sctp_ifn, inp, stcb, non_asoc_addr_ok,
 		    dest_is_loop, dest_is_priv, fam);
 		SCTPDBG(SCTP_DEBUG_OUTPUT2,
 		    "Found ifn:%p %d preferred source addresses\n",
 		    ifn, num_preferred);
 		if (num_preferred == 0) {
 			/* None on this interface. */
 			SCTPDBG(SCTP_DEBUG_OUTPUT2, "No preferred -- skipping to next\n");
 			continue;
 		}
 		SCTPDBG(SCTP_DEBUG_OUTPUT2,
 		    "num preferred:%d on interface:%p cur_addr_num:%d\n",
 		    num_preferred, (void *)sctp_ifn, cur_addr_num);
 
 		/*
 		 * Ok we have num_preferred set with how many we can
 		 * use, this may vary from call to call due to addresses
 		 * being deprecated etc..
 		 */
 		if (cur_addr_num >= num_preferred) {
 			cur_addr_num = 0;
 		}
 		sifa = sctp_select_nth_preferred_addr_from_ifn_boundall(sctp_ifn, inp, stcb, non_asoc_addr_ok, dest_is_loop,
 		    dest_is_priv, cur_addr_num, fam, ro);
 		if (sifa == NULL)
 			continue;
 		if (net) {
 			net->indx_of_eligible_next_to_use = cur_addr_num + 1;
 			SCTPDBG(SCTP_DEBUG_OUTPUT2, "we selected %d\n",
 			    cur_addr_num);
 			SCTPDBG(SCTP_DEBUG_OUTPUT2, "Source:");
 			SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &sifa->address.sa);
 			SCTPDBG(SCTP_DEBUG_OUTPUT2, "Dest:");
 			SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &net->ro._l_addr.sa);
 		}
 		atomic_add_int(&sifa->refcount, 1);
 		return (sifa);
 	}
 #ifdef INET
 again_with_private_addresses_allowed:
 #endif
 	/* plan_c: do we have an acceptable address on the emit interface */
 	sifa = NULL;
 	SCTPDBG(SCTP_DEBUG_OUTPUT2, "Trying Plan C: find acceptable on interface\n");
 	if (emit_ifn == NULL) {
 		SCTPDBG(SCTP_DEBUG_OUTPUT2, "Jump to Plan D - no emit_ifn\n");
 		goto plan_d;
 	}
 	LIST_FOREACH(sctp_ifa, &emit_ifn->ifalist, next_ifa) {
 		SCTPDBG(SCTP_DEBUG_OUTPUT2, "ifa:%p\n", (void *)sctp_ifa);
 #ifdef INET
 		if ((sctp_ifa->address.sa.sa_family == AF_INET) &&
 		    (prison_check_ip4(inp->ip_inp.inp.inp_cred,
 		    &sctp_ifa->address.sin.sin_addr) != 0)) {
 			SCTPDBG(SCTP_DEBUG_OUTPUT2, "Jailed\n");
 			continue;
 		}
 #endif
 #ifdef INET6
 		if ((sctp_ifa->address.sa.sa_family == AF_INET6) &&
 		    (prison_check_ip6(inp->ip_inp.inp.inp_cred,
 		    &sctp_ifa->address.sin6.sin6_addr) != 0)) {
 			SCTPDBG(SCTP_DEBUG_OUTPUT2, "Jailed\n");
 			continue;
 		}
 #endif
 		if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) &&
 		    (non_asoc_addr_ok == 0)) {
 			SCTPDBG(SCTP_DEBUG_OUTPUT2, "Defer\n");
 			continue;
 		}
 		sifa = sctp_is_ifa_addr_acceptable(sctp_ifa, dest_is_loop,
 		    dest_is_priv, fam);
 		if (sifa == NULL) {
 			SCTPDBG(SCTP_DEBUG_OUTPUT2, "IFA not acceptable\n");
 			continue;
 		}
 		if (stcb) {
 			if (sctp_is_address_in_scope(sifa, &stcb->asoc.scope, 0) == 0) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT2, "NOT in scope\n");
 				sifa = NULL;
 				continue;
 			}
 			if (((non_asoc_addr_ok == 0) &&
 			    (sctp_is_addr_restricted(stcb, sifa))) ||
 			    (non_asoc_addr_ok &&
 			    (sctp_is_addr_restricted(stcb, sifa)) &&
 			    (!sctp_is_addr_pending(stcb, sifa)))) {
 				/*
 				 * It is restricted for some reason..
 				 * probably not yet added.
 				 */
 				SCTPDBG(SCTP_DEBUG_OUTPUT2, "Its restricted\n");
 				sifa = NULL;
 				continue;
 			}
 		}
 		/*
 		 * Found one.  The reference is taken once at "out", shared
 		 * with plan D; taking it here as well would count it twice.
 		 */
 		goto out;
 	}
 plan_d:
 	/*
 	 * plan_d: We are in trouble. No preferred address on the emit
 	 * interface. And not even a preferred address on all interfaces. Go
 	 * out and see if we can find an acceptable address somewhere
 	 * amongst all interfaces.
 	 */
 	SCTPDBG(SCTP_DEBUG_OUTPUT2, "Trying Plan D looked_at is %p\n", (void *)looked_at);
 	LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
 		if (dest_is_loop == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) {
 			/* wrong base scope */
 			continue;
 		}
 		LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
 #ifdef INET
 			if ((sctp_ifa->address.sa.sa_family == AF_INET) &&
 			    (prison_check_ip4(inp->ip_inp.inp.inp_cred,
 			    &sctp_ifa->address.sin.sin_addr) != 0)) {
 				continue;
 			}
 #endif
 #ifdef INET6
 			if ((sctp_ifa->address.sa.sa_family == AF_INET6) &&
 			    (prison_check_ip6(inp->ip_inp.inp.inp_cred,
 			    &sctp_ifa->address.sin6.sin6_addr) != 0)) {
 				continue;
 			}
 #endif
 			if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) &&
 			    (non_asoc_addr_ok == 0))
 				continue;
 			sifa = sctp_is_ifa_addr_acceptable(sctp_ifa,
 			    dest_is_loop,
 			    dest_is_priv, fam);
 			if (sifa == NULL)
 				continue;
 			if (stcb) {
 				if (sctp_is_address_in_scope(sifa, &stcb->asoc.scope, 0) == 0) {
 					sifa = NULL;
 					continue;
 				}
 				if (((non_asoc_addr_ok == 0) &&
 				    (sctp_is_addr_restricted(stcb, sifa))) ||
 				    (non_asoc_addr_ok &&
 				    (sctp_is_addr_restricted(stcb, sifa)) &&
 				    (!sctp_is_addr_pending(stcb, sifa)))) {
 					/*
 					 * It is restricted for some
 					 * reason.. probably not yet added.
 					 */
 					sifa = NULL;
 					continue;
 				}
 			}
 			/* Found one; the reference is taken at "out". */
 			goto out;
 		}
 	}
 #ifdef INET
 	/*
 	 * Nothing acceptable was found (sifa is NULL here).  If private
 	 * IPv4 addresses were not in scope, allow them and run plans C and
 	 * D once more; if that retry failed too, undo the scope change.
 	 */
 	if (stcb) {
 		if ((retried == 0) && (stcb->asoc.scope.ipv4_local_scope == 0)) {
 			stcb->asoc.scope.ipv4_local_scope = 1;
 			retried = 1;
 			goto again_with_private_addresses_allowed;
 		} else if (retried == 1) {
 			stcb->asoc.scope.ipv4_local_scope = 0;
 		}
 	}
 #endif
 out:
 	if (sifa) {
 #ifdef INET
 		if (retried == 1) {
 			/*
 			 * The address was only found after private IPv4
 			 * addresses were allowed.  Walk all interfaces and
 			 * restrict every other usable private IPv4 address
 			 * for this association (retried is only ever set
 			 * when stcb is not NULL).
 			 */
 			LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
 				if (dest_is_loop == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) {
 					/* wrong base scope */
 					continue;
 				}
 				LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
 					struct sctp_ifa *tmp_sifa;
 
 					if ((sctp_ifa->address.sa.sa_family == AF_INET) &&
 					    (prison_check_ip4(inp->ip_inp.inp.inp_cred,
 					    &sctp_ifa->address.sin.sin_addr) != 0)) {
 						continue;
 					}
 #ifdef INET6
 					if ((sctp_ifa->address.sa.sa_family == AF_INET6) &&
 					    (prison_check_ip6(inp->ip_inp.inp.inp_cred,
 					    &sctp_ifa->address.sin6.sin6_addr) != 0)) {
 						continue;
 					}
 #endif
 					if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) &&
 					    (non_asoc_addr_ok == 0))
 						continue;
 					tmp_sifa = sctp_is_ifa_addr_acceptable(sctp_ifa,
 					    dest_is_loop,
 					    dest_is_priv, fam);
 					if (tmp_sifa == NULL) {
 						continue;
 					}
 					if (tmp_sifa == sifa) {
 						/* Never restrict the one we chose. */
 						continue;
 					}
 					if (stcb) {
 						if (sctp_is_address_in_scope(tmp_sifa,
 						    &stcb->asoc.scope, 0) == 0) {
 							continue;
 						}
 						if (((non_asoc_addr_ok == 0) &&
 						    (sctp_is_addr_restricted(stcb, tmp_sifa))) ||
 						    (non_asoc_addr_ok &&
 						    (sctp_is_addr_restricted(stcb, tmp_sifa)) &&
 						    (!sctp_is_addr_pending(stcb, tmp_sifa)))) {
 							/*
 							 * It is restricted
 							 * for some reason..
 							 * probably not yet
 							 * added.
 							 */
 							continue;
 						}
 					}
 					if ((tmp_sifa->address.sin.sin_family == AF_INET) &&
 					    (IN4_ISPRIVATE_ADDRESS(&(tmp_sifa->address.sin.sin_addr)))) {
 						sctp_add_local_addr_restricted(stcb, tmp_sifa);
 					}
 				}
 			}
 		}
 #endif
 		/*
 		 * Take the single reference handed to the caller.  This is
 		 * deliberately outside the INET conditional so that plan C
 		 * and plan D results are referenced in every configuration.
 		 */
 		atomic_add_int(&sifa->refcount, 1);
 	}
 	return (sifa);
 }
 
 
 
 /*
  * Choose the local address to use as source for traffic sent along route
  * ro. Both stcb and net may be NULL.
  *
  * Returns the sctp_ifa picked by the bound-all or bound-specific helper, or
  * NULL when no next hop for the destination can be obtained.
  */
 struct sctp_ifa *
 sctp_source_address_selection(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     sctp_route_t *ro,
     struct sctp_nets *net,
     int non_asoc_addr_ok, uint32_t vrf_id)
 {
 	struct sctp_ifa *chosen;
 	uint8_t dest_is_priv, dest_is_loop;
 	sa_family_t fam;
 #ifdef INET
 	struct sockaddr_in *dst4 = (struct sockaddr_in *)&ro->ro_dst;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)&ro->ro_dst;
 #endif
 
 	/**
 	 * Rules:
 	 * - Find the route if needed and cache it when possible.
 	 * - Look at the interface address in the route. If it is in the
 	 *   bound list we have the best source.
 	 * - If not, we must rotate amongst the addresses.
 	 *
 	 * Caveats and issues
 	 *
 	 * Do we need to pay attention to scope? We can have a private address
 	 * or a global address we are sourcing or sending to. So if we draw
 	 * it out:
 	 * zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
 	 * For V4
 	 * ------------------------------------------
 	 *      source     *      dest  *  result
 	 * -----------------------------------------
 	 * <a>  Private    *    Global  *  NAT
 	 * -----------------------------------------
 	 * <b>  Private    *    Private *  No problem
 	 * -----------------------------------------
 	 * <c>  Global     *    Private *  Huh, how will this work?
 	 * -----------------------------------------
 	 * <d>  Global     *    Global  *  No problem
 	 *------------------------------------------
 	 * zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
 	 * For V6
 	 *------------------------------------------
 	 *      source     *      dest  *  result
 	 * -----------------------------------------
 	 * <a>  Linklocal  *    Global  *
 	 * -----------------------------------------
 	 * <b>  Linklocal  * Linklocal  *  No problem
 	 * -----------------------------------------
 	 * <c>  Global     * Linklocal  *  Huh, how will this work?
 	 * -----------------------------------------
 	 * <d>  Global     *    Global  *  No problem
 	 *------------------------------------------
 	 * zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
 	 *
 	 * On top of that there may be multiple addresses assigned to an
 	 * interface. Remember the ifa on an ifn is a linked list of
 	 * addresses, so one interface can have more than one IP address.
 	 * What happens if we have both a private and a global address? Do
 	 * we then use the context of the destination to sort out which one
 	 * is best? And what about NATs: sending P->G may get you a NAT
 	 * translation, or should you prefer the G that is on the interface?
 	 *
 	 * Decisions:
 	 *
 	 * - Count the number of addresses on the interface.
 	 * - If it is one, no problem except case <c>.
 	 *   For <a> we will assume a NAT out there.
 	 * - If there is more than one, we need to worry about scope P or G.
 	 *   We should prefer G -> G and P -> P if possible, then as a
 	 *   secondary fall back to mixed types, G->P being a last-ditch one.
 	 * - The above all works for bound all. For bound specific we use the
 	 *   same concept but only consider the bound addresses. If the
 	 *   bound set is NOT assigned to the interface then we must use
 	 *   rotation amongst the bound addresses.
 	 */
 	if (ro->ro_nh == NULL) {
 		/* Nothing cached yet: try to obtain a route to cache. */
 		SCTP_RTALLOC(ro, vrf_id, inp->fibnum);
 		if (ro->ro_nh == NULL) {
 			/* Still no next hop, so there is nothing to select. */
 			return (NULL);
 		}
 	}
 
 	/* Classify the destination as loopback, private/link-local or neither. */
 	fam = ro->ro_dst.sa_family;
 	dest_is_priv = 0;
 	dest_is_loop = 0;
 	switch (fam) {
 #ifdef INET
 	case AF_INET:
 		/* Scope based on outbound address */
 		if (IN4_ISLOOPBACK_ADDRESS(&dst4->sin_addr)) {
 			dest_is_loop = 1;
 			if (net != NULL) {
 				/* remember that this destination is local */
 				net->addr_is_local = 1;
 			}
 		} else if ((IN4_ISPRIVATE_ADDRESS(&dst4->sin_addr))) {
 			dest_is_priv = 1;
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		/* Scope based on outbound address */
 		if (IN6_IS_ADDR_LOOPBACK(&dst6->sin6_addr) ||
 		    SCTP_ROUTE_IS_REAL_LOOP(ro)) {
 			/*
 			 * A loopback address is either "::1" or
 			 * "fe80::1%lo0"; both give loopback scope.
 			 * dest_is_priv (link local) is deliberately not
 			 * set in this case.
 			 */
 			dest_is_loop = 1;
 			if (net != NULL) {
 				/* remember that this destination is local */
 				net->addr_is_local = 1;
 			}
 		} else if (IN6_IS_ADDR_LINKLOCAL(&dst6->sin6_addr)) {
 			dest_is_priv = 1;
 		}
 		break;
 #endif
 	}
 	SCTPDBG(SCTP_DEBUG_OUTPUT2, "Select source addr for:");
 	SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)&ro->ro_dst);
 
 	/* The address list read lock is held across whichever chooser runs. */
 	SCTP_IPI_ADDR_RLOCK();
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
 		/* Endpoint is bound to all addresses. */
 		chosen = sctp_choose_boundall(inp, stcb, net, ro, vrf_id,
 		    dest_is_priv, dest_is_loop,
 		    non_asoc_addr_ok, fam);
 	} else if (stcb) {
 		/* Subset bound, with an association. */
 		chosen = sctp_choose_boundspecific_stcb(inp, stcb, ro,
 		    vrf_id, dest_is_priv,
 		    dest_is_loop,
 		    non_asoc_addr_ok, fam);
 	} else {
 		/* Subset bound, endpoint only. */
 		chosen = sctp_choose_boundspecific_inp(inp, ro, vrf_id,
 		    non_asoc_addr_ok,
 		    dest_is_priv,
 		    dest_is_loop, fam);
 	}
 	SCTP_IPI_ADDR_RUNLOCK();
 	return (chosen);
 }
 
 /*
  * Scan the control mbuf for an IPPROTO_SCTP cmsg and copy its payload out.
  *
  * c_type  - cmsg type being looked for.
  * data    - destination buffer, at least cpsize bytes.
  * control - mbuf holding the cmsg records.
  * cpsize  - number of bytes to copy for an exact type match.
  *
  * An exact match on c_type copies cpsize bytes to data and returns 1 at
  * once. When c_type is SCTP_SNDRCV, any SCTP_SNDINFO, SCTP_PRINFO and
  * SCTP_AUTHINFO records found are folded into the struct sctp_sndrcvinfo
  * that data points to (zeroed before the first one is applied), and the
  * scan continues. Returns 1 if anything was found, 0 otherwise; a malformed
  * or truncated record ends the scan and returns what was found so far.
  */
 static int
 sctp_find_cmsg(int c_type, void *data, struct mbuf *control, size_t cpsize)
 {
 	struct sctp_sndrcvinfo *sndrcvinfo;
 	struct cmsghdr cmh;
 	int tot_len, rem_len, cmsg_data_len, cmsg_data_off, off;
 	int found;
 
 	/* Only dereferenced on the SCTP_SNDRCV aggregation path below. */
 	sndrcvinfo = (struct sctp_sndrcvinfo *)data;
 	found = 0;
 	tot_len = SCTP_BUF_LEN(control);
 	for (off = 0; off < tot_len; off += CMSG_ALIGN(cmh.cmsg_len)) {
 		rem_len = tot_len - off;
 		if (rem_len < (int)CMSG_ALIGN(sizeof(cmh))) {
 			/* Not enough room left for another header. */
 			return (found);
 		}
 		m_copydata(control, off, sizeof(cmh), (caddr_t)&cmh);
 		if ((cmh.cmsg_len < CMSG_ALIGN(sizeof(cmh))) ||
 		    (cmh.cmsg_len > INT_MAX) ||
 		    ((int)cmh.cmsg_len > rem_len)) {
 			/* Header is incomplete or the record is truncated. */
 			return (found);
 		}
 		if (cmh.cmsg_level != IPPROTO_SCTP) {
 			continue;
 		}
 		cmsg_data_len = (int)cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh));
 		cmsg_data_off = off + CMSG_ALIGN(sizeof(cmh));
 
 		if (c_type == cmh.cmsg_type) {
 			/* Exactly the requested type: copy it out and stop. */
 			if ((cpsize > INT_MAX) ||
 			    (cmsg_data_len < (int)cpsize)) {
 				return (found);
 			}
 			m_copydata(control, cmsg_data_off, (int)cpsize, (caddr_t)data);
 			return (1);
 		}
 		if ((c_type != SCTP_SNDRCV) ||
 		    ((cmh.cmsg_type != SCTP_SNDINFO) &&
 		    (cmh.cmsg_type != SCTP_PRINFO) &&
 		    (cmh.cmsg_type != SCTP_AUTHINFO))) {
 			/* Neither requested nor foldable into sndrcvinfo. */
 			continue;
 		}
 
 		if (found == 0) {
 			/* First foldable record: start from a zeroed result. */
 			if (cpsize < sizeof(struct sctp_sndrcvinfo)) {
 				return (found);
 			}
 			memset(sndrcvinfo, 0, sizeof(struct sctp_sndrcvinfo));
 		}
 		if (cmh.cmsg_type == SCTP_SNDINFO) {
 			struct sctp_sndinfo sndinfo;
 
 			if (cmsg_data_len < (int)sizeof(struct sctp_sndinfo)) {
 				return (found);
 			}
 			m_copydata(control, cmsg_data_off, sizeof(struct sctp_sndinfo), (caddr_t)&sndinfo);
 			sndrcvinfo->sinfo_stream = sndinfo.snd_sid;
 			sndrcvinfo->sinfo_flags = sndinfo.snd_flags;
 			sndrcvinfo->sinfo_ppid = sndinfo.snd_ppid;
 			sndrcvinfo->sinfo_context = sndinfo.snd_context;
 			sndrcvinfo->sinfo_assoc_id = sndinfo.snd_assoc_id;
 		} else if (cmh.cmsg_type == SCTP_PRINFO) {
 			struct sctp_prinfo prinfo;
 
 			if (cmsg_data_len < (int)sizeof(struct sctp_prinfo)) {
 				return (found);
 			}
 			m_copydata(control, cmsg_data_off, sizeof(struct sctp_prinfo), (caddr_t)&prinfo);
 			/* The value only applies when a policy is selected. */
 			if (prinfo.pr_policy == SCTP_PR_SCTP_NONE) {
 				sndrcvinfo->sinfo_timetolive = 0;
 			} else {
 				sndrcvinfo->sinfo_timetolive = prinfo.pr_value;
 			}
 			sndrcvinfo->sinfo_flags |= prinfo.pr_policy;
 		} else {
 			/* SCTP_AUTHINFO, the only type left after the filter. */
 			struct sctp_authinfo authinfo;
 
 			if (cmsg_data_len < (int)sizeof(struct sctp_authinfo)) {
 				return (found);
 			}
 			m_copydata(control, cmsg_data_off, sizeof(struct sctp_authinfo), (caddr_t)&authinfo);
 			sndrcvinfo->sinfo_keynumber_valid = 1;
 			sndrcvinfo->sinfo_keynumber = authinfo.auth_keynumber;
 		}
 		found = 1;
 	}
 	return (found);
 }
 
 /*
  * Apply the IPPROTO_SCTP cmsgs found in the control mbuf to an association
  * that is being set up.
  *
  * Handled types:
  * - SCTP_INIT: non-zero fields of the sctp_initmsg override the
  *   association's init attempt limit, outbound/inbound stream counts and
  *   maximum init RTO; the outbound stream array is reallocated when more
  *   streams are requested than are currently present.
  * - SCTP_DSTADDRV4 / SCTP_DSTADDRV6: the address is added to the association
  *   as a confirmed remote address, using stcb->rport as the port.
  * Other types, and cmsgs of other levels, are skipped.
  *
  * Returns 0 on success. Returns 1 on failure with *error set to EINVAL
  * (malformed cmsg or unacceptable address) or ENOBUFS (the remote address
  * could not be added).
  */
 static int
 sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *error)
 {
 	struct cmsghdr cmh;
 	struct sctp_initmsg initmsg;
 #ifdef INET
 	struct sockaddr_in sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 sin6;
 #endif
 	int tot_len, rem_len, cmsg_data_len, cmsg_data_off, off;
 
 	tot_len = SCTP_BUF_LEN(control);
 	/* Walk the cmsg records; each step advances by the aligned record length. */
 	for (off = 0; off < tot_len; off += CMSG_ALIGN(cmh.cmsg_len)) {
 		rem_len = tot_len - off;
 		if (rem_len < (int)CMSG_ALIGN(sizeof(cmh))) {
 			/* There is not enough room for one more. */
 			*error = EINVAL;
 			return (1);
 		}
 		m_copydata(control, off, sizeof(cmh), (caddr_t)&cmh);
 		if (cmh.cmsg_len < CMSG_ALIGN(sizeof(cmh))) {
 			/* We don't have a complete CMSG header. */
 			*error = EINVAL;
 			return (1);
 		}
 		if ((cmh.cmsg_len > INT_MAX) || ((int)cmh.cmsg_len > rem_len)) {
 			/* We don't have the complete CMSG. */
 			*error = EINVAL;
 			return (1);
 		}
 		/* Payload length and offset, i.e. what follows the aligned header. */
 		cmsg_data_len = (int)cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh));
 		cmsg_data_off = off + CMSG_ALIGN(sizeof(cmh));
 		if (cmh.cmsg_level == IPPROTO_SCTP) {
 			switch (cmh.cmsg_type) {
 			case SCTP_INIT:
 				if (cmsg_data_len < (int)sizeof(struct sctp_initmsg)) {
 					*error = EINVAL;
 					return (1);
 				}
 				m_copydata(control, cmsg_data_off, sizeof(struct sctp_initmsg), (caddr_t)&initmsg);
 				/* A zero field leaves the association's current value alone. */
 				if (initmsg.sinit_max_attempts)
 					stcb->asoc.max_init_times = initmsg.sinit_max_attempts;
 				if (initmsg.sinit_num_ostreams)
 					stcb->asoc.pre_open_streams = initmsg.sinit_num_ostreams;
 				if (initmsg.sinit_max_instreams)
 					stcb->asoc.max_inbound_streams = initmsg.sinit_max_instreams;
 				if (initmsg.sinit_max_init_timeo)
 					stcb->asoc.initial_init_rto_max = initmsg.sinit_max_init_timeo;
 				/* More outbound streams requested than currently exist. */
 				if (stcb->asoc.streamoutcnt < stcb->asoc.pre_open_streams) {
 					struct sctp_stream_out *tmp_str;
 					unsigned int i;
 #if defined(SCTP_DETAILED_STR_STATS)
 					int j;
 #endif
 
 					/* Default is NOT correct */
 					SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, default:%d pre_open:%d\n",
 					    stcb->asoc.streamoutcnt, stcb->asoc.pre_open_streams);
 					/*
 					 * The TCB lock is released for the duration of
 					 * the allocation and re-taken afterwards.
 					 * NOTE(review): presumably because the
 					 * allocation may block - confirm.
 					 */
 					SCTP_TCB_UNLOCK(stcb);
 					SCTP_MALLOC(tmp_str,
 					    struct sctp_stream_out *,
 					    (stcb->asoc.pre_open_streams * sizeof(struct sctp_stream_out)),
 					    SCTP_M_STRMO);
 					SCTP_TCB_LOCK(stcb);
 					if (tmp_str != NULL) {
 						/* Swap in the larger array and adopt the new count. */
 						SCTP_FREE(stcb->asoc.strmout, SCTP_M_STRMO);
 						stcb->asoc.strmout = tmp_str;
 						stcb->asoc.strm_realoutsize = stcb->asoc.streamoutcnt = stcb->asoc.pre_open_streams;
 					} else {
 						/* Allocation failed: fall back to the existing stream count. */
 						stcb->asoc.pre_open_streams = stcb->asoc.streamoutcnt;
 					}
 					/* (Re)initialize every outbound stream, in either case. */
 					for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
 						TAILQ_INIT(&stcb->asoc.strmout[i].outqueue);
 						stcb->asoc.strmout[i].chunks_on_queues = 0;
 						stcb->asoc.strmout[i].next_mid_ordered = 0;
 						stcb->asoc.strmout[i].next_mid_unordered = 0;
 #if defined(SCTP_DETAILED_STR_STATS)
 						for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) {
 							stcb->asoc.strmout[i].abandoned_sent[j] = 0;
 							stcb->asoc.strmout[i].abandoned_unsent[j] = 0;
 						}
 #else
 						stcb->asoc.strmout[i].abandoned_sent[0] = 0;
 						stcb->asoc.strmout[i].abandoned_unsent[0] = 0;
 #endif
 						stcb->asoc.strmout[i].sid = i;
 						stcb->asoc.strmout[i].last_msg_incomplete = 0;
 						stcb->asoc.strmout[i].state = SCTP_STREAM_OPENING;
 						stcb->asoc.ss_functions.sctp_ss_init_stream(stcb, &stcb->asoc.strmout[i], NULL);
 					}
 				}
 				break;
 #ifdef INET
 			case SCTP_DSTADDRV4:
 				if (cmsg_data_len < (int)sizeof(struct in_addr)) {
 					*error = EINVAL;
 					return (1);
 				}
 				/* Build a sockaddr_in from the raw in_addr payload. */
 				memset(&sin, 0, sizeof(struct sockaddr_in));
 				sin.sin_family = AF_INET;
 				sin.sin_len = sizeof(struct sockaddr_in);
 				sin.sin_port = stcb->rport;
 				m_copydata(control, cmsg_data_off, sizeof(struct in_addr), (caddr_t)&sin.sin_addr);
 				/* Wildcard, broadcast and multicast destinations are rejected. */
 				if ((sin.sin_addr.s_addr == INADDR_ANY) ||
 				    (sin.sin_addr.s_addr == INADDR_BROADCAST) ||
 				    IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
 					*error = EINVAL;
 					return (1);
 				}
 				if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, NULL, stcb->asoc.port,
 				    SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) {
 					*error = ENOBUFS;
 					return (1);
 				}
 				break;
 #endif
 #ifdef INET6
 			case SCTP_DSTADDRV6:
 				if (cmsg_data_len < (int)sizeof(struct in6_addr)) {
 					*error = EINVAL;
 					return (1);
 				}
 				/* Build a sockaddr_in6 from the raw in6_addr payload. */
 				memset(&sin6, 0, sizeof(struct sockaddr_in6));
 				sin6.sin6_family = AF_INET6;
 				sin6.sin6_len = sizeof(struct sockaddr_in6);
 				sin6.sin6_port = stcb->rport;
 				m_copydata(control, cmsg_data_off, sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr);
 				/* Unspecified and multicast destinations are rejected. */
 				if (IN6_IS_ADDR_UNSPECIFIED(&sin6.sin6_addr) ||
 				    IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr)) {
 					*error = EINVAL;
 					return (1);
 				}
 #ifdef INET
 				/* A v4-mapped address is converted and handled as IPv4. */
 				if (IN6_IS_ADDR_V4MAPPED(&sin6.sin6_addr)) {
 					in6_sin6_2_sin(&sin, &sin6);
 					if ((sin.sin_addr.s_addr == INADDR_ANY) ||
 					    (sin.sin_addr.s_addr == INADDR_BROADCAST) ||
 					    IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
 						*error = EINVAL;
 						return (1);
 					}
 					if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, NULL, stcb->asoc.port,
 					    SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) {
 						*error = ENOBUFS;
 						return (1);
 					}
 				} else
 #endif
 					/*
 					 * With INET defined this "if" is the body of the
 					 * "else" above; without INET it stands alone.
 					 */
 					if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin6, NULL, stcb->asoc.port,
 				    SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) {
 					*error = ENOBUFS;
 					return (1);
 				}
 				break;
 #endif
 			default:
 				/* Other SCTP cmsg types are not handled here. */
 				break;
 			}
 		}
 	}
 	return (0);
 }
 
 #if defined(INET) || defined(INET6)
 /*
  * Look for an existing association matching any destination address
  * carried in the SCTP_DSTADDRV4 / SCTP_DSTADDRV6 cmsgs of the control mbuf.
  *
  * Each address found is combined with port and passed to
  * sctp_findassociation_ep_addr() together with inp_p and net_p; the first
  * association that call returns is returned to the caller.
  *
  * Returns NULL when no association matches. A malformed cmsg also returns
  * NULL, with *error set to EINVAL; *error is not written otherwise.
  */
 static struct sctp_tcb *
 sctp_findassociation_cmsgs(struct sctp_inpcb **inp_p,
     uint16_t port,
     struct mbuf *control,
     struct sctp_nets **net_p,
     int *error)
 {
 	struct cmsghdr cmh;
 	struct sctp_tcb *stcb;
 	struct sockaddr *addr;
 #ifdef INET
 	struct sockaddr_in sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 sin6;
 #endif
 	int tot_len, rem_len, cmsg_data_len, cmsg_data_off, off;
 
 	tot_len = SCTP_BUF_LEN(control);
 	for (off = 0; off < tot_len; off += CMSG_ALIGN(cmh.cmsg_len)) {
 		rem_len = tot_len - off;
 		if (rem_len < (int)CMSG_ALIGN(sizeof(cmh))) {
 			/* Not enough room left for another header. */
 			goto invalid;
 		}
 		m_copydata(control, off, sizeof(cmh), (caddr_t)&cmh);
 		if (cmh.cmsg_len < CMSG_ALIGN(sizeof(cmh))) {
 			/* The CMSG header is incomplete. */
 			goto invalid;
 		}
 		if ((cmh.cmsg_len > INT_MAX) || ((int)cmh.cmsg_len > rem_len)) {
 			/* The CMSG is truncated. */
 			goto invalid;
 		}
 		if (cmh.cmsg_level != IPPROTO_SCTP) {
 			continue;
 		}
 		cmsg_data_len = (int)cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh));
 		cmsg_data_off = off + CMSG_ALIGN(sizeof(cmh));
 
 		/* Stays NULL unless this record carries a destination address. */
 		addr = NULL;
 		switch (cmh.cmsg_type) {
 #ifdef INET
 		case SCTP_DSTADDRV4:
 			if (cmsg_data_len < (int)sizeof(struct in_addr)) {
 				goto invalid;
 			}
 			memset(&sin, 0, sizeof(struct sockaddr_in));
 			sin.sin_family = AF_INET;
 			sin.sin_len = sizeof(struct sockaddr_in);
 			sin.sin_port = port;
 			m_copydata(control, cmsg_data_off, sizeof(struct in_addr), (caddr_t)&sin.sin_addr);
 			addr = (struct sockaddr *)&sin;
 			break;
 #endif
 #ifdef INET6
 		case SCTP_DSTADDRV6:
 			if (cmsg_data_len < (int)sizeof(struct in6_addr)) {
 				goto invalid;
 			}
 			memset(&sin6, 0, sizeof(struct sockaddr_in6));
 			sin6.sin6_family = AF_INET6;
 			sin6.sin6_len = sizeof(struct sockaddr_in6);
 			sin6.sin6_port = port;
 			m_copydata(control, cmsg_data_off, sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr);
 #ifdef INET
 			if (IN6_IS_ADDR_V4MAPPED(&sin6.sin6_addr)) {
 				/* Look up v4-mapped addresses in their IPv4 form. */
 				in6_sin6_2_sin(&sin, &sin6);
 				addr = (struct sockaddr *)&sin;
 				break;
 			}
 #endif
 			addr = (struct sockaddr *)&sin6;
 			break;
 #endif
 		default:
 			break;
 		}
 		if (addr == NULL) {
 			continue;
 		}
 		stcb = sctp_findassociation_ep_addr(inp_p, addr, net_p, NULL, NULL);
 		if (stcb != NULL) {
 			return (stcb);
 		}
 	}
 	return (NULL);
 
 invalid:
 	*error = EINVAL;
 	return (NULL);
 }
 #endif
 
 /*
  * Build a STATE-COOKIE parameter as an mbuf chain.
  *
  * The chain consists of a parameter header followed by a copy of *stc_in,
  * a copy of the INIT taken from init at init_offset, a copy of the INIT-ACK
  * taken from initack at initack_offset, and a trailing mbuf of
  * SCTP_SIGNATURE_SIZE zeroed bytes. *signature is set to point at those
  * bytes. The parameter length field covers the sum of all pieces.
  *
  * Returns the head of the chain, or NULL if any allocation or copy fails,
  * in which case everything built so far has been freed.
  */
 static struct mbuf *
 sctp_add_cookie(struct mbuf *init, int init_offset,
     struct mbuf *initack, int initack_offset, struct sctp_state_cookie *stc_in, uint8_t **signature)
 {
 	struct mbuf *copy_init, *copy_initack, *tail, *sig, *mret;
 	struct sctp_paramhdr *ph;
 	uint16_t cookie_sz;
 
 	mret = sctp_get_mbuf_for_msg((sizeof(struct sctp_state_cookie) +
 	    sizeof(struct sctp_paramhdr)), 0,
 	    M_NOWAIT, 1, MT_DATA);
 	if (mret == NULL) {
 		return (NULL);
 	}
 	copy_init = SCTP_M_COPYM(init, init_offset, M_COPYALL, M_NOWAIT);
 	if (copy_init == NULL) {
 		sctp_m_freem(mret);
 		return (NULL);
 	}
 #ifdef SCTP_MBUF_LOGGING
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
 		sctp_log_mbc(copy_init, SCTP_MBUF_ICOPY);
 	}
 #endif
 	copy_initack = SCTP_M_COPYM(initack, initack_offset, M_COPYALL,
 	    M_NOWAIT);
 	if (copy_initack == NULL) {
 		/* The pieces are not linked yet, so free each on its own. */
 		sctp_m_freem(mret);
 		sctp_m_freem(copy_init);
 		return (NULL);
 	}
 #ifdef SCTP_MBUF_LOGGING
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
 		sctp_log_mbc(copy_initack, SCTP_MBUF_ICOPY);
 	}
 #endif
 	/* Parameter header first, with the state cookie right behind it. */
 	ph = mtod(mret, struct sctp_paramhdr *);
 	SCTP_BUF_LEN(mret) = sizeof(struct sctp_state_cookie) +
 	    sizeof(struct sctp_paramhdr);
 	ph->param_type = htons(SCTP_STATE_COOKIE);
 	ph->param_length = 0;	/* patched once the total size is known */
 	memcpy((caddr_t)ph + sizeof(struct sctp_paramhdr), stc_in,
 	    sizeof(struct sctp_state_cookie));
 
 	/*
 	 * Walk each piece to its last mbuf, adding up the lengths on the
 	 * way, and hang the next piece off that last mbuf.
 	 */
 	cookie_sz = 0;
 	tail = mret;
 	while (SCTP_BUF_NEXT(tail) != NULL) {
 		cookie_sz += SCTP_BUF_LEN(tail);
 		tail = SCTP_BUF_NEXT(tail);
 	}
 	cookie_sz += SCTP_BUF_LEN(tail);
 	SCTP_BUF_NEXT(tail) = copy_init;
 
 	tail = copy_init;
 	while (SCTP_BUF_NEXT(tail) != NULL) {
 		cookie_sz += SCTP_BUF_LEN(tail);
 		tail = SCTP_BUF_NEXT(tail);
 	}
 	cookie_sz += SCTP_BUF_LEN(tail);
 	SCTP_BUF_NEXT(tail) = copy_initack;
 
 	tail = copy_initack;
 	while (SCTP_BUF_NEXT(tail) != NULL) {
 		cookie_sz += SCTP_BUF_LEN(tail);
 		tail = SCTP_BUF_NEXT(tail);
 	}
 	cookie_sz += SCTP_BUF_LEN(tail);
 
 	sig = sctp_get_mbuf_for_msg(SCTP_SIGNATURE_SIZE, 0, M_NOWAIT, 1, MT_DATA);
 	if (sig == NULL) {
 		/* Everything is linked under mret now; one free drops it all. */
 		sctp_m_freem(mret);
 		return (NULL);
 	}
 	SCTP_BUF_NEXT(tail) = sig;
 	SCTP_BUF_LEN(sig) = SCTP_SIGNATURE_SIZE;
 	cookie_sz += SCTP_SIGNATURE_SIZE;
 	ph->param_length = htons(cookie_sz);
 	/* Hand back a zeroed signature area for the caller to fill in. */
 	*signature = (uint8_t *)mtod(sig, caddr_t);
 	memset(*signature, 0, SCTP_SIGNATURE_SIZE);
 	return (mret);
 }
 
 /*
  * Return the ECT bits to place in the outgoing IP header: SCTP_ECT0_BIT
  * when an association is given and its ecn_supported field equals 1,
  * otherwise 0. stcb may be NULL.
  */
 static uint8_t
 sctp_get_ect(struct sctp_tcb *stcb)
 {
 	if ((stcb == NULL) || (stcb->asoc.ecn_supported != 1)) {
 		return (0);
 	}
 	return (SCTP_ECT0_BIT);
 }
 
 #if defined(INET) || defined(INET6)
 /*
  * React to a packet that had to be dropped because no usable source
  * address / route was available for net. stcb and net may each be NULL.
  *
  * With both present:
  * - a confirmed, reachable destination is reported to the upper layer via
  *   SCTP_NOTIFY_INTERFACE_DOWN and loses its REACHABLE and PF state bits;
  * - if net is the primary destination and sctp_find_alternate_net() yields
  *   a different net, that net becomes the association's alternate (taking
  *   a reference) and net's cached source address is released.
  */
 static void
 sctp_handle_no_route(struct sctp_tcb *stcb,
     struct sctp_nets *net,
     int so_locked)
 {
 	struct sctp_nets *alt;
 
 	SCTPDBG(SCTP_DEBUG_OUTPUT1, "dropped packet - no valid source addr\n");
 	if (net == NULL) {
 		return;
 	}
 	SCTPDBG(SCTP_DEBUG_OUTPUT1, "Destination was ");
 	SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT1, &net->ro._l_addr.sa);
 	if (stcb == NULL) {
 		/* Without an association there is nothing further to update. */
 		return;
 	}
 
 	if ((net->dest_state & SCTP_ADDR_CONFIRMED) &&
 	    (net->dest_state & SCTP_ADDR_REACHABLE)) {
 		SCTPDBG(SCTP_DEBUG_OUTPUT1, "no route takes interface %p down\n", (void *)net);
 		sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN,
 		    stcb, 0,
 		    (void *)net,
 		    so_locked);
 		net->dest_state &= ~SCTP_ADDR_REACHABLE;
 		net->dest_state &= ~SCTP_ADDR_PF;
 	}
 
 	if (net != stcb->asoc.primary_destination) {
 		return;
 	}
 	/* The primary is unusable; see whether another net can stand in. */
 	alt = sctp_find_alternate_net(stcb, net, 0);
 	if (alt == net) {
 		return;
 	}
 	if (stcb->asoc.alternate) {
 		/* Drop the reference held on the previous alternate. */
 		sctp_free_remote_addr(stcb->asoc.alternate);
 	}
 	stcb->asoc.alternate = alt;
 	atomic_add_int(&alt->ref_count, 1);
 	if (net->ro._s_addr) {
 		sctp_free_ifa(net->ro._s_addr);
 		net->ro._s_addr = NULL;
 	}
 	/* Force a fresh source address selection on the next send. */
 	net->src_addr_selected = 0;
 }
 #endif
 
 static int
 sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,	/* may be NULL */
     struct sctp_nets *net,
     struct sockaddr *to,
     struct mbuf *m,
     uint32_t auth_offset,
     struct sctp_auth_chunk *auth,
     uint16_t auth_keyid,
     int nofragment_flag,
     int ecn_ok,
     int out_of_asoc_ok,
     uint16_t src_port,
     uint16_t dest_port,
     uint32_t v_tag,
     uint16_t port,
     union sctp_sockstore *over_addr,
     uint8_t mflowtype, uint32_t mflowid,
     int so_locked)
 {
 /* nofragment_flag to tell if IP_DF should be set (IPv4 only) */
 	/**
 	 * Given a mbuf chain (via SCTP_BUF_NEXT()) that holds a packet header
 	 * WITH an SCTPHDR but no IP header, endpoint inp and sa structure:
 	 * - fill in the HMAC digest of any AUTH chunk in the packet.
 	 * - calculate and fill in the SCTP checksum.
 	 * - prepend an IP address header.
 	 * - if boundall use INADDR_ANY.
 	 * - if boundspecific do source address selection.
 	 * - set fragmentation option for ipV4.
 	 * - On return from IP output, check/adjust mtu size of output
 	 *   interface and smallest_mtu size as well.
 	 */
 	/* Will need ifdefs around this */
 	struct mbuf *newm;
 	struct sctphdr *sctphdr;
 	int packet_length;
 	int ret;
 #if defined(INET) || defined(INET6)
 	uint32_t vrf_id;
 #endif
 #if defined(INET) || defined(INET6)
 	struct mbuf *o_pak;
 	sctp_route_t *ro = NULL;
 	struct udphdr *udp = NULL;
 #endif
 	uint8_t tos_value;
 
 	if ((net) && (net->dest_state & SCTP_ADDR_OUT_OF_SCOPE)) {
 		SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT);
 		sctp_m_freem(m);
 		return (EFAULT);
 	}
 #if defined(INET) || defined(INET6)
 	if (stcb) {
 		vrf_id = stcb->asoc.vrf_id;
 	} else {
 		vrf_id = inp->def_vrf_id;
 	}
 #endif
 	/* fill in the HMAC digest for any AUTH chunk in the packet */
 	if ((auth != NULL) && (stcb != NULL)) {
 		sctp_fill_hmac_digest_m(m, auth_offset, auth, stcb, auth_keyid);
 	}
 
 	if (net) {
 		tos_value = net->dscp;
 	} else if (stcb) {
 		tos_value = stcb->asoc.default_dscp;
 	} else {
 		tos_value = inp->sctp_ep.default_dscp;
 	}
 
 	switch (to->sa_family) {
 #ifdef INET
 	case AF_INET:
 		{
 			struct ip *ip = NULL;
 			sctp_route_t iproute;
 			int len;
 
 			len = SCTP_MIN_V4_OVERHEAD;
 			if (port) {
 				len += sizeof(struct udphdr);
 			}
 			newm = sctp_get_mbuf_for_msg(len, 1, M_NOWAIT, 1, MT_DATA);
 			if (newm == NULL) {
 				sctp_m_freem(m);
 				SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 				return (ENOMEM);
 			}
 			SCTP_ALIGN_TO_END(newm, len);
 			SCTP_BUF_LEN(newm) = len;
 			SCTP_BUF_NEXT(newm) = m;
 			m = newm;
 			if (net != NULL) {
 				m->m_pkthdr.flowid = net->flowid;
 				M_HASHTYPE_SET(m, net->flowtype);
 			} else {
 				m->m_pkthdr.flowid = mflowid;
 				M_HASHTYPE_SET(m, mflowtype);
 			}
 			packet_length = sctp_calculate_len(m);
 			ip = mtod(m, struct ip *);
 			ip->ip_v = IPVERSION;
 			ip->ip_hl = (sizeof(struct ip) >> 2);
 			if (tos_value == 0) {
 				/*
 				 * This means especially, that it is not set
 				 * at the SCTP layer. So use the value from
 				 * the IP layer.
 				 */
 				tos_value = inp->ip_inp.inp.inp_ip_tos;
 			}
 			tos_value &= 0xfc;
 			if (ecn_ok) {
 				tos_value |= sctp_get_ect(stcb);
 			}
 			if ((nofragment_flag) && (port == 0)) {
 				ip->ip_off = htons(IP_DF);
 			} else {
 				ip->ip_off = htons(0);
 			}
 			/* FreeBSD has a function for ip_id's */
 			ip_fillid(ip);
 
 			ip->ip_ttl = inp->ip_inp.inp.inp_ip_ttl;
 			ip->ip_len = htons(packet_length);
 			ip->ip_tos = tos_value;
 			if (port) {
 				ip->ip_p = IPPROTO_UDP;
 			} else {
 				ip->ip_p = IPPROTO_SCTP;
 			}
 			ip->ip_sum = 0;
 			if (net == NULL) {
 				ro = &iproute;
 				memset(&iproute, 0, sizeof(iproute));
 				memcpy(&ro->ro_dst, to, to->sa_len);
 			} else {
 				ro = (sctp_route_t *)&net->ro;
 			}
 			/* Now the address selection part */
 			ip->ip_dst.s_addr = ((struct sockaddr_in *)to)->sin_addr.s_addr;
 
 			/* call the routine to select the src address */
 			if (net && out_of_asoc_ok == 0) {
 				if (net->ro._s_addr && (net->ro._s_addr->localifa_flags & (SCTP_BEING_DELETED | SCTP_ADDR_IFA_UNUSEABLE))) {
 					sctp_free_ifa(net->ro._s_addr);
 					net->ro._s_addr = NULL;
 					net->src_addr_selected = 0;
 					RO_NHFREE(ro);
 				}
 				if (net->src_addr_selected == 0) {
 					/* Cache the source address */
 					net->ro._s_addr = sctp_source_address_selection(inp, stcb,
 					    ro, net, 0,
 					    vrf_id);
 					net->src_addr_selected = 1;
 				}
 				if (net->ro._s_addr == NULL) {
 					/* No route to host */
 					net->src_addr_selected = 0;
 					sctp_handle_no_route(stcb, net, so_locked);
 					SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH);
 					sctp_m_freem(m);
 					return (EHOSTUNREACH);
 				}
 				ip->ip_src = net->ro._s_addr->address.sin.sin_addr;
 			} else {
 				if (over_addr == NULL) {
 					struct sctp_ifa *_lsrc;
 
 					_lsrc = sctp_source_address_selection(inp, stcb, ro,
 					    net,
 					    out_of_asoc_ok,
 					    vrf_id);
 					if (_lsrc == NULL) {
 						sctp_handle_no_route(stcb, net, so_locked);
 						SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH);
 						sctp_m_freem(m);
 						return (EHOSTUNREACH);
 					}
 					ip->ip_src = _lsrc->address.sin.sin_addr;
 					sctp_free_ifa(_lsrc);
 				} else {
 					ip->ip_src = over_addr->sin.sin_addr;
 					SCTP_RTALLOC(ro, vrf_id, inp->fibnum);
 				}
 			}
 			if (port) {
 				if (htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) == 0) {
 					sctp_handle_no_route(stcb, net, so_locked);
 					SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH);
 					sctp_m_freem(m);
 					return (EHOSTUNREACH);
 				}
 				udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
 				udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port));
 				udp->uh_dport = port;
 				udp->uh_ulen = htons((uint16_t)(packet_length - sizeof(struct ip)));
 				if (V_udp_cksum) {
 					udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP));
 				} else {
 					udp->uh_sum = 0;
 				}
 				sctphdr = (struct sctphdr *)((caddr_t)udp + sizeof(struct udphdr));
 			} else {
 				sctphdr = (struct sctphdr *)((caddr_t)ip + sizeof(struct ip));
 			}
 
 			sctphdr->src_port = src_port;
 			sctphdr->dest_port = dest_port;
 			sctphdr->v_tag = v_tag;
 			sctphdr->checksum = 0;
 
 			/*
 			 * If source address selection fails and we find no
 			 * route then the ip_output should fail as well with
 			 * a NO_ROUTE_TO_HOST type error. We probably should
 			 * catch that somewhere and abort the association
 			 * right away (assuming this is an INIT being sent).
 			 */
 			if (ro->ro_nh == NULL) {
 				/*
 				 * src addr selection failed to find a route
 				 * (or valid source addr), so we can't get
 				 * there from here (yet)!
 				 */
 				sctp_handle_no_route(stcb, net, so_locked);
 				SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH);
 				sctp_m_freem(m);
 				return (EHOSTUNREACH);
 			}
 			if (ro != &iproute) {
 				memcpy(&iproute, ro, sizeof(*ro));
 			}
 			SCTPDBG(SCTP_DEBUG_OUTPUT3, "Calling ipv4 output routine from low level src addr:%x\n",
 			    (uint32_t)(ntohl(ip->ip_src.s_addr)));
 			SCTPDBG(SCTP_DEBUG_OUTPUT3, "Destination is %x\n",
 			    (uint32_t)(ntohl(ip->ip_dst.s_addr)));
 			SCTPDBG(SCTP_DEBUG_OUTPUT3, "RTP route is %p through\n",
 			    (void *)ro->ro_nh);
 
 			if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) {
 				/* failed to prepend data, give up */
 				SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 				sctp_m_freem(m);
 				return (ENOMEM);
 			}
 			SCTP_ATTACH_CHAIN(o_pak, m, packet_length);
 			if (port) {
 				sctphdr->checksum = sctp_calculate_cksum(m, sizeof(struct ip) + sizeof(struct udphdr));
 				SCTP_STAT_INCR(sctps_sendswcrc);
 				if (V_udp_cksum) {
 					SCTP_ENABLE_UDP_CSUM(o_pak);
 				}
 			} else {
 				m->m_pkthdr.csum_flags = CSUM_SCTP;
 				m->m_pkthdr.csum_data = offsetof(struct sctphdr, checksum);
 				SCTP_STAT_INCR(sctps_sendhwcrc);
 			}
 #ifdef SCTP_PACKET_LOGGING
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING)
 				sctp_packet_log(o_pak);
 #endif
 			/* send it out.  table id is taken from stcb */
 			SCTP_PROBE5(send, NULL, stcb, ip, stcb, sctphdr);
 			SCTP_IP_OUTPUT(ret, o_pak, ro, stcb, vrf_id);
 			if (port) {
 				UDPSTAT_INC(udps_opackets);
 			}
 			SCTP_STAT_INCR(sctps_sendpackets);
 			SCTP_STAT_INCR_COUNTER64(sctps_outpackets);
 			if (ret)
 				SCTP_STAT_INCR(sctps_senderrors);
 
 			SCTPDBG(SCTP_DEBUG_OUTPUT3, "IP output returns %d\n", ret);
 			if (net == NULL) {
 				/* free tempy routes */
 				RO_NHFREE(ro);
 			} else {
 				if ((ro->ro_nh != NULL) && (net->ro._s_addr) &&
 				    ((net->dest_state & SCTP_ADDR_NO_PMTUD) == 0)) {
 					uint32_t mtu;
 
 					mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_nh);
 					if (mtu > 0) {
 						if (net->port) {
 							mtu -= sizeof(struct udphdr);
 						}
 						if (mtu < net->mtu) {
 							if ((stcb != NULL) && (stcb->asoc.smallest_mtu > mtu)) {
 								sctp_mtu_size_reset(inp, &stcb->asoc, mtu);
 							}
 							net->mtu = mtu;
 						}
 					}
 				} else if (ro->ro_nh == NULL) {
 					/* route was freed */
 					if (net->ro._s_addr &&
 					    net->src_addr_selected) {
 						sctp_free_ifa(net->ro._s_addr);
 						net->ro._s_addr = NULL;
 					}
 					net->src_addr_selected = 0;
 				}
 			}
 			return (ret);
 		}
 #endif
 #ifdef INET6
 	case AF_INET6:
 		{
 			uint32_t flowlabel, flowinfo;
 			struct ip6_hdr *ip6h;
 			struct route_in6 ip6route;
 			struct ifnet *ifp;
 			struct sockaddr_in6 *sin6, tmp, *lsa6, lsa6_tmp;
 			int prev_scope = 0;
 			struct sockaddr_in6 lsa6_storage;
 			int error;
 			u_short prev_port = 0;
 			int len;
 
 			if (net) {
 				flowlabel = net->flowlabel;
 			} else if (stcb) {
 				flowlabel = stcb->asoc.default_flowlabel;
 			} else {
 				flowlabel = inp->sctp_ep.default_flowlabel;
 			}
 			if (flowlabel == 0) {
 				/*
 				 * This means especially, that it is not set
 				 * at the SCTP layer. So use the value from
 				 * the IP layer.
 				 */
 				flowlabel = ntohl(((struct inpcb *)inp)->inp_flow);
 			}
 			flowlabel &= 0x000fffff;
 			len = SCTP_MIN_OVERHEAD;
 			if (port) {
 				len += sizeof(struct udphdr);
 			}
 			newm = sctp_get_mbuf_for_msg(len, 1, M_NOWAIT, 1, MT_DATA);
 			if (newm == NULL) {
 				sctp_m_freem(m);
 				SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 				return (ENOMEM);
 			}
 			SCTP_ALIGN_TO_END(newm, len);
 			SCTP_BUF_LEN(newm) = len;
 			SCTP_BUF_NEXT(newm) = m;
 			m = newm;
 			if (net != NULL) {
 				m->m_pkthdr.flowid = net->flowid;
 				M_HASHTYPE_SET(m, net->flowtype);
 			} else {
 				m->m_pkthdr.flowid = mflowid;
 				M_HASHTYPE_SET(m, mflowtype);
 			}
 			packet_length = sctp_calculate_len(m);
 
 			ip6h = mtod(m, struct ip6_hdr *);
 			/* protect *sin6 from overwrite */
 			sin6 = (struct sockaddr_in6 *)to;
 			tmp = *sin6;
 			sin6 = &tmp;
 
 			/* KAME hack: embed scopeid */
 			if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) {
 				SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 				sctp_m_freem(m);
 				return (EINVAL);
 			}
 			if (net == NULL) {
 				memset(&ip6route, 0, sizeof(ip6route));
 				ro = (sctp_route_t *)&ip6route;
 				memcpy(&ro->ro_dst, sin6, sin6->sin6_len);
 			} else {
 				ro = (sctp_route_t *)&net->ro;
 			}
 			/*
 			 * We assume here that inp_flow is in host byte
 			 * order within the TCB!
 			 */
 			if (tos_value == 0) {
 				/*
 				 * This means especially, that it is not set
 				 * at the SCTP layer. So use the value from
 				 * the IP layer.
 				 */
 				tos_value = (ntohl(((struct inpcb *)inp)->inp_flow) >> 20) & 0xff;
 			}
 			tos_value &= 0xfc;
 			if (ecn_ok) {
 				tos_value |= sctp_get_ect(stcb);
 			}
 			flowinfo = 0x06;
 			flowinfo <<= 8;
 			flowinfo |= tos_value;
 			flowinfo <<= 20;
 			flowinfo |= flowlabel;
 			ip6h->ip6_flow = htonl(flowinfo);
 			if (port) {
 				ip6h->ip6_nxt = IPPROTO_UDP;
 			} else {
 				ip6h->ip6_nxt = IPPROTO_SCTP;
 			}
 			ip6h->ip6_plen = htons((uint16_t)(packet_length - sizeof(struct ip6_hdr)));
 			ip6h->ip6_dst = sin6->sin6_addr;
 
 			/*
 			 * Add SRC address selection here: we can only reuse
 			 * to a limited degree the kame src-addr-sel, since
 			 * we can try their selection but it may not be
 			 * bound.
 			 */
 			memset(&lsa6_tmp, 0, sizeof(lsa6_tmp));
 			lsa6_tmp.sin6_family = AF_INET6;
 			lsa6_tmp.sin6_len = sizeof(lsa6_tmp);
 			lsa6 = &lsa6_tmp;
 			if (net && out_of_asoc_ok == 0) {
 				if (net->ro._s_addr && (net->ro._s_addr->localifa_flags & (SCTP_BEING_DELETED | SCTP_ADDR_IFA_UNUSEABLE))) {
 					sctp_free_ifa(net->ro._s_addr);
 					net->ro._s_addr = NULL;
 					net->src_addr_selected = 0;
 					RO_NHFREE(ro);
 				}
 				if (net->src_addr_selected == 0) {
 					sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
 					/* KAME hack: embed scopeid */
 					if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) {
 						SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 						sctp_m_freem(m);
 						return (EINVAL);
 					}
 					/* Cache the source address */
 					net->ro._s_addr = sctp_source_address_selection(inp,
 					    stcb,
 					    ro,
 					    net,
 					    0,
 					    vrf_id);
 					(void)sa6_recoverscope(sin6);
 					net->src_addr_selected = 1;
 				}
 				if (net->ro._s_addr == NULL) {
 					SCTPDBG(SCTP_DEBUG_OUTPUT3, "V6:No route to host\n");
 					net->src_addr_selected = 0;
 					sctp_handle_no_route(stcb, net, so_locked);
 					SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH);
 					sctp_m_freem(m);
 					return (EHOSTUNREACH);
 				}
 				lsa6->sin6_addr = net->ro._s_addr->address.sin6.sin6_addr;
 			} else {
 				sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
 				/* KAME hack: embed scopeid */
 				if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) {
 					SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 					sctp_m_freem(m);
 					return (EINVAL);
 				}
 				if (over_addr == NULL) {
 					struct sctp_ifa *_lsrc;
 
 					_lsrc = sctp_source_address_selection(inp, stcb, ro,
 					    net,
 					    out_of_asoc_ok,
 					    vrf_id);
 					if (_lsrc == NULL) {
 						sctp_handle_no_route(stcb, net, so_locked);
 						SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH);
 						sctp_m_freem(m);
 						return (EHOSTUNREACH);
 					}
 					lsa6->sin6_addr = _lsrc->address.sin6.sin6_addr;
 					sctp_free_ifa(_lsrc);
 				} else {
 					lsa6->sin6_addr = over_addr->sin6.sin6_addr;
 					SCTP_RTALLOC(ro, vrf_id, inp->fibnum);
 				}
 				(void)sa6_recoverscope(sin6);
 			}
 			lsa6->sin6_port = inp->sctp_lport;
 
 			if (ro->ro_nh == NULL) {
 				/*
 				 * src addr selection failed to find a route
 				 * (or valid source addr), so we can't get
 				 * there from here!
 				 */
 				sctp_handle_no_route(stcb, net, so_locked);
 				SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH);
 				sctp_m_freem(m);
 				return (EHOSTUNREACH);
 			}
 			/*
 			 * XXX: sa6 may not have a valid sin6_scope_id in
 			 * the non-SCOPEDROUTING case.
 			 */
 			memset(&lsa6_storage, 0, sizeof(lsa6_storage));
 			lsa6_storage.sin6_family = AF_INET6;
 			lsa6_storage.sin6_len = sizeof(lsa6_storage);
 			lsa6_storage.sin6_addr = lsa6->sin6_addr;
 			if ((error = sa6_recoverscope(&lsa6_storage)) != 0) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT3, "recover scope fails error %d\n", error);
 				sctp_m_freem(m);
 				return (error);
 			}
 			/* XXX */
 			lsa6_storage.sin6_addr = lsa6->sin6_addr;
 			lsa6_storage.sin6_port = inp->sctp_lport;
 			lsa6 = &lsa6_storage;
 			ip6h->ip6_src = lsa6->sin6_addr;
 
 			if (port) {
 				if (htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) == 0) {
 					sctp_handle_no_route(stcb, net, so_locked);
 					SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH);
 					sctp_m_freem(m);
 					return (EHOSTUNREACH);
 				}
 				udp = (struct udphdr *)((caddr_t)ip6h + sizeof(struct ip6_hdr));
 				udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port));
 				udp->uh_dport = port;
 				udp->uh_ulen = htons((uint16_t)(packet_length - sizeof(struct ip6_hdr)));
 				udp->uh_sum = 0;
 				sctphdr = (struct sctphdr *)((caddr_t)udp + sizeof(struct udphdr));
 			} else {
 				sctphdr = (struct sctphdr *)((caddr_t)ip6h + sizeof(struct ip6_hdr));
 			}
 
 			sctphdr->src_port = src_port;
 			sctphdr->dest_port = dest_port;
 			sctphdr->v_tag = v_tag;
 			sctphdr->checksum = 0;
 
 			/*
 			 * We set the hop limit now since there is a good
 			 * chance that our ro pointer is now filled
 			 */
 			ip6h->ip6_hlim = SCTP_GET_HLIM(inp, ro);
 			ifp = SCTP_GET_IFN_VOID_FROM_ROUTE(ro);
 
 #ifdef SCTP_DEBUG
 			/* Copy to be sure something bad is not happening */
 			sin6->sin6_addr = ip6h->ip6_dst;
 			lsa6->sin6_addr = ip6h->ip6_src;
 #endif
 
 			SCTPDBG(SCTP_DEBUG_OUTPUT3, "Calling ipv6 output routine from low level\n");
 			SCTPDBG(SCTP_DEBUG_OUTPUT3, "src: ");
 			SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT3, (struct sockaddr *)lsa6);
 			SCTPDBG(SCTP_DEBUG_OUTPUT3, "dst: ");
 			SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT3, (struct sockaddr *)sin6);
 			if (net) {
 				sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
 				/*
 				 * preserve the port and scope for link
 				 * local send
 				 */
 				prev_scope = sin6->sin6_scope_id;
 				prev_port = sin6->sin6_port;
 			}
 
 			if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) {
 				/* failed to prepend data, give up */
 				sctp_m_freem(m);
 				SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 				return (ENOMEM);
 			}
 			SCTP_ATTACH_CHAIN(o_pak, m, packet_length);
 			if (port) {
 				sctphdr->checksum = sctp_calculate_cksum(m, sizeof(struct ip6_hdr) + sizeof(struct udphdr));
 				SCTP_STAT_INCR(sctps_sendswcrc);
 				if ((udp->uh_sum = in6_cksum(o_pak, IPPROTO_UDP, sizeof(struct ip6_hdr), packet_length - sizeof(struct ip6_hdr))) == 0) {
 					udp->uh_sum = 0xffff;
 				}
 			} else {
 				m->m_pkthdr.csum_flags = CSUM_SCTP_IPV6;
 				m->m_pkthdr.csum_data = offsetof(struct sctphdr, checksum);
 				SCTP_STAT_INCR(sctps_sendhwcrc);
 			}
 			/* send it out. table id is taken from stcb */
 #ifdef SCTP_PACKET_LOGGING
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING)
 				sctp_packet_log(o_pak);
 #endif
 			SCTP_PROBE5(send, NULL, stcb, ip6h, stcb, sctphdr);
 			SCTP_IP6_OUTPUT(ret, o_pak, (struct route_in6 *)ro, &ifp, stcb, vrf_id);
 			if (net) {
 				/* for link local this must be done */
 				sin6->sin6_scope_id = prev_scope;
 				sin6->sin6_port = prev_port;
 			}
 			SCTPDBG(SCTP_DEBUG_OUTPUT3, "return from send is %d\n", ret);
 			if (port) {
 				UDPSTAT_INC(udps_opackets);
 			}
 			SCTP_STAT_INCR(sctps_sendpackets);
 			SCTP_STAT_INCR_COUNTER64(sctps_outpackets);
 			if (ret) {
 				SCTP_STAT_INCR(sctps_senderrors);
 			}
 			if (net == NULL) {
 				/* Now if we had a temp route free it */
 				RO_NHFREE(ro);
 			} else {
 				/*
 				 * PMTU check versus smallest asoc MTU goes
 				 * here
 				 */
 				if (ro->ro_nh == NULL) {
 					/* Route was freed */
 					if (net->ro._s_addr &&
 					    net->src_addr_selected) {
 						sctp_free_ifa(net->ro._s_addr);
 						net->ro._s_addr = NULL;
 					}
 					net->src_addr_selected = 0;
 				}
 				if ((ro->ro_nh != NULL) && (net->ro._s_addr) &&
 				    ((net->dest_state & SCTP_ADDR_NO_PMTUD) == 0)) {
 					uint32_t mtu;
 
 					mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_nh);
 					if (mtu > 0) {
 						if (net->port) {
 							mtu -= sizeof(struct udphdr);
 						}
 						if (mtu < net->mtu) {
 							if ((stcb != NULL) && (stcb->asoc.smallest_mtu > mtu)) {
 								sctp_mtu_size_reset(inp, &stcb->asoc, mtu);
 							}
 							net->mtu = mtu;
 						}
 					}
 				} else if (ifp) {
 					if (ND_IFINFO(ifp)->linkmtu &&
 					    (stcb->asoc.smallest_mtu > ND_IFINFO(ifp)->linkmtu)) {
 						sctp_mtu_size_reset(inp,
 						    &stcb->asoc,
 						    ND_IFINFO(ifp)->linkmtu);
 					}
 				}
 			}
 			return (ret);
 		}
 #endif
 	default:
 		SCTPDBG(SCTP_DEBUG_OUTPUT1, "Unknown protocol (TSNH) type %d\n",
 		    ((struct sockaddr *)to)->sa_family);
 		sctp_m_freem(m);
 		SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT);
 		return (EFAULT);
 	}
 }
 
 
 /*
  * Build an INIT chunk for the association and pass it to
  * sctp_lowlevel_chunk_output().
  *
  * The destination is asoc.primary_destination; if none is set, the first
  * entry of asoc.nets is used and made primary. The function returns without
  * sending when there is no destination, when the destination's rxt timer is
  * already pending, or when no mbuf can be obtained (the INIT timer, which is
  * started before the allocation, is left running in that case).
  *
  * Optional parameters are appended to the fixed INIT header in this order:
  * adaptation layer indication, ECN, PR-SCTP, NAT friendly, supported
  * extensions, AUTH (RANDOM, HMAC list, chunk list), cookie preserve,
  * supported address types, and finally the addresses added by
  * sctp_add_addresses_to_i_ia().
  *
  * Padding bookkeeping: padding_len holds the pad owed by the parameter that
  * was written last. It is zero-filled and added to chunk_len only right
  * before the next parameter is written, so the chunk_length stored in the
  * header does not include padding after the final parameter; that padding is
  * appended separately with sctp_add_pad_tombuf().
  *
  * inp       - endpoint; source of the local port, adaptation indicator and
  *             the receive buffer limit used for a_rwnd.
  * stcb      - association the INIT is sent for.
  * so_locked - passed through unchanged to sctp_lowlevel_chunk_output().
  */
 void
 sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked)
 {
 	struct mbuf *m, *m_last;
 	struct sctp_nets *net;
 	struct sctp_init_chunk *init;
 	struct sctp_supported_addr_param *sup_addr;
 	struct sctp_adaptation_layer_indication *ali;
 	struct sctp_supported_chunk_types_param *pr_supported;
 	struct sctp_paramhdr *ph;
 	int cnt_inits_to = 0;
 	int error;
 	uint16_t num_ext, chunk_len, padding_len, parameter_len;
 
 	/* INIT's always go to the primary (and usually ONLY address) */
 	net = stcb->asoc.primary_destination;
 	if (net == NULL) {
 		/* No primary yet: fall back to the first known destination. */
 		net = TAILQ_FIRST(&stcb->asoc.nets);
 		if (net == NULL) {
 			/* TSNH */
 			return;
 		}
 		/* we confirm any address we send an INIT to */
 		net->dest_state &= ~SCTP_ADDR_UNCONFIRMED;
 		(void)sctp_set_primary_addr(stcb, NULL, net);
 	} else {
 		/* we confirm any address we send an INIT to */
 		net->dest_state &= ~SCTP_ADDR_UNCONFIRMED;
 	}
 	SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT\n");
 #ifdef INET6
 	if (net->ro._l_addr.sa.sa_family == AF_INET6) {
 		/*
 		 * special hook, if we are sending to link local it will not
 		 * show up in our private address count.
 		 */
 		if (IN6_IS_ADDR_LINKLOCAL(&net->ro._l_addr.sin6.sin6_addr))
 			cnt_inits_to = 1;
 	}
 #endif
 	if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
 		/* This case should not happen */
 		SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - failed timer?\n");
 		return;
 	}
 	/* start the INIT timer */
 	sctp_timer_start(SCTP_TIMER_TYPE_INIT, inp, stcb, net);
 
 	m = sctp_get_mbuf_for_msg(MCLBYTES, 1, M_NOWAIT, 1, MT_DATA);
 	if (m == NULL) {
 		/* No memory, INIT timer will re-attempt. */
 		SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - mbuf?\n");
 		return;
 	}
 	/* chunk_len is the running write offset into m and the chunk length. */
 	chunk_len = (uint16_t)sizeof(struct sctp_init_chunk);
 	padding_len = 0;
 	/* Now lets put the chunk header in place */
 	init = mtod(m, struct sctp_init_chunk *);
 	/* now the chunk header */
 	init->ch.chunk_type = SCTP_INITIATION;
 	init->ch.chunk_flags = 0;
 	/* fill in later from mbuf we build */
 	init->ch.chunk_length = 0;
 	/* place in my tag */
 	init->init.initiate_tag = htonl(stcb->asoc.my_vtag);
 	/* set up some of the credits. */
 	init->init.a_rwnd = htonl(max(inp->sctp_socket ? SCTP_SB_LIMIT_RCV(inp->sctp_socket) : 0,
 	    SCTP_MINIMAL_RWND));
 	init->init.num_outbound_streams = htons(stcb->asoc.pre_open_streams);
 	init->init.num_inbound_streams = htons(stcb->asoc.max_inbound_streams);
 	init->init.initial_tsn = htonl(stcb->asoc.init_seq_number);
 
 	/*
 	 * NOTE(review): the next four parameters do not update padding_len,
 	 * so their sizes are presumably already multiples of 4 - confirm
 	 * against the structure definitions.
 	 */
 	/* Adaptation layer indication parameter */
 	if (inp->sctp_ep.adaptation_layer_indicator_provided) {
 		parameter_len = (uint16_t)sizeof(struct sctp_adaptation_layer_indication);
 		ali = (struct sctp_adaptation_layer_indication *)(mtod(m, caddr_t)+chunk_len);
 		ali->ph.param_type = htons(SCTP_ULP_ADAPTATION);
 		ali->ph.param_length = htons(parameter_len);
 		ali->indication = htonl(inp->sctp_ep.adaptation_layer_indicator);
 		chunk_len += parameter_len;
 	}
 
 	/* ECN parameter */
 	if (stcb->asoc.ecn_supported == 1) {
 		parameter_len = (uint16_t)sizeof(struct sctp_paramhdr);
 		ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len);
 		ph->param_type = htons(SCTP_ECN_CAPABLE);
 		ph->param_length = htons(parameter_len);
 		chunk_len += parameter_len;
 	}
 
 	/* PR-SCTP supported parameter */
 	if (stcb->asoc.prsctp_supported == 1) {
 		parameter_len = (uint16_t)sizeof(struct sctp_paramhdr);
 		ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len);
 		ph->param_type = htons(SCTP_PRSCTP_SUPPORTED);
 		ph->param_length = htons(parameter_len);
 		chunk_len += parameter_len;
 	}
 
 	/* Add NAT friendly parameter. */
 	if (SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly)) {
 		parameter_len = (uint16_t)sizeof(struct sctp_paramhdr);
 		ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len);
 		ph->param_type = htons(SCTP_HAS_NAT_SUPPORT);
 		ph->param_length = htons(parameter_len);
 		chunk_len += parameter_len;
 	}
 
 	/* And now tell the peer which extensions we support */
 	/*
 	 * The chunk type list is filled in first; the parameter header is
 	 * written, and the space claimed in chunk_len, only if at least one
 	 * extension ended up in the list.
 	 */
 	num_ext = 0;
 	pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+chunk_len);
 	if (stcb->asoc.prsctp_supported == 1) {
 		pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN;
 		if (stcb->asoc.idata_supported) {
 			pr_supported->chunk_types[num_ext++] = SCTP_IFORWARD_CUM_TSN;
 		}
 	}
 	if (stcb->asoc.auth_supported == 1) {
 		pr_supported->chunk_types[num_ext++] = SCTP_AUTHENTICATION;
 	}
 	if (stcb->asoc.asconf_supported == 1) {
 		pr_supported->chunk_types[num_ext++] = SCTP_ASCONF;
 		pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK;
 	}
 	if (stcb->asoc.reconfig_supported == 1) {
 		pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET;
 	}
 	if (stcb->asoc.idata_supported) {
 		pr_supported->chunk_types[num_ext++] = SCTP_IDATA;
 	}
 	if (stcb->asoc.nrsack_supported == 1) {
 		pr_supported->chunk_types[num_ext++] = SCTP_NR_SELECTIVE_ACK;
 	}
 	if (stcb->asoc.pktdrop_supported == 1) {
 		pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED;
 	}
 	if (num_ext > 0) {
 		parameter_len = (uint16_t)sizeof(struct sctp_supported_chunk_types_param) + num_ext;
 		pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT);
 		pr_supported->ph.param_length = htons(parameter_len);
 		/* Remember the pad owed; it is emitted before the next parameter. */
 		padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
 		chunk_len += parameter_len;
 	}
 	/* add authentication parameters */
 	if (stcb->asoc.auth_supported) {
 		/* attach RANDOM parameter, if available */
 		if (stcb->asoc.authinfo.random != NULL) {
 			struct sctp_auth_random *randp;
 
 			/* Flush the pad owed by the previous parameter. */
 			if (padding_len > 0) {
 				memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
 				chunk_len += padding_len;
 				padding_len = 0;
 			}
 			randp = (struct sctp_auth_random *)(mtod(m, caddr_t)+chunk_len);
 			parameter_len = (uint16_t)sizeof(struct sctp_auth_random) + stcb->asoc.authinfo.random_len;
 			/* random key already contains the header */
 			memcpy(randp, stcb->asoc.authinfo.random->key, parameter_len);
 			padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
 			chunk_len += parameter_len;
 		}
 		/* add HMAC_ALGO parameter */
 		if (stcb->asoc.local_hmacs != NULL) {
 			struct sctp_auth_hmac_algo *hmacs;
 
 			/* Flush the pad owed by the previous parameter. */
 			if (padding_len > 0) {
 				memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
 				chunk_len += padding_len;
 				padding_len = 0;
 			}
 			hmacs = (struct sctp_auth_hmac_algo *)(mtod(m, caddr_t)+chunk_len);
 			parameter_len = (uint16_t)(sizeof(struct sctp_auth_hmac_algo) +
 			    stcb->asoc.local_hmacs->num_algo * sizeof(uint16_t));
 			hmacs->ph.param_type = htons(SCTP_HMAC_LIST);
 			hmacs->ph.param_length = htons(parameter_len);
 			sctp_serialize_hmaclist(stcb->asoc.local_hmacs, (uint8_t *)hmacs->hmac_ids);
 			padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
 			chunk_len += parameter_len;
 		}
 		/* add CHUNKS parameter */
 		if (stcb->asoc.local_auth_chunks != NULL) {
 			struct sctp_auth_chunk_list *chunks;
 
 			/* Flush the pad owed by the previous parameter. */
 			if (padding_len > 0) {
 				memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
 				chunk_len += padding_len;
 				padding_len = 0;
 			}
 			chunks = (struct sctp_auth_chunk_list *)(mtod(m, caddr_t)+chunk_len);
 			parameter_len = (uint16_t)(sizeof(struct sctp_auth_chunk_list) +
 			    sctp_auth_get_chklist_size(stcb->asoc.local_auth_chunks));
 			chunks->ph.param_type = htons(SCTP_CHUNK_LIST);
 			chunks->ph.param_length = htons(parameter_len);
 			sctp_serialize_auth_chunks(stcb->asoc.local_auth_chunks, chunks->chunk_types);
 			padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
 			chunk_len += parameter_len;
 		}
 	}
 
 	/* now any cookie time extensions */
 	if (stcb->asoc.cookie_preserve_req) {
 		struct sctp_cookie_perserve_param *cookie_preserve;
 
 		/* Flush the pad owed by the previous parameter. */
 		if (padding_len > 0) {
 			memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
 			chunk_len += padding_len;
 			padding_len = 0;
 		}
 		parameter_len = (uint16_t)sizeof(struct sctp_cookie_perserve_param);
 		cookie_preserve = (struct sctp_cookie_perserve_param *)(mtod(m, caddr_t)+chunk_len);
 		cookie_preserve->ph.param_type = htons(SCTP_COOKIE_PRESERVE);
 		cookie_preserve->ph.param_length = htons(parameter_len);
 		cookie_preserve->time = htonl(stcb->asoc.cookie_preserve_req);
 		/* The request is consumed by this INIT; clear it. */
 		stcb->asoc.cookie_preserve_req = 0;
 		chunk_len += parameter_len;
 	}
 
 	/* Supported address types parameter, if any family is in scope. */
 	if (stcb->asoc.scope.ipv4_addr_legal || stcb->asoc.scope.ipv6_addr_legal) {
 		uint8_t i;
 
 		/* Flush the pad owed by the previous parameter. */
 		if (padding_len > 0) {
 			memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
 			chunk_len += padding_len;
 			padding_len = 0;
 		}
 		/* Header plus one 16-bit type code per legal address family. */
 		parameter_len = (uint16_t)sizeof(struct sctp_paramhdr);
 		if (stcb->asoc.scope.ipv4_addr_legal) {
 			parameter_len += (uint16_t)sizeof(uint16_t);
 		}
 		if (stcb->asoc.scope.ipv6_addr_legal) {
 			parameter_len += (uint16_t)sizeof(uint16_t);
 		}
 		sup_addr = (struct sctp_supported_addr_param *)(mtod(m, caddr_t)+chunk_len);
 		sup_addr->ph.param_type = htons(SCTP_SUPPORTED_ADDRTYPE);
 		sup_addr->ph.param_length = htons(parameter_len);
 		i = 0;
 		if (stcb->asoc.scope.ipv4_addr_legal) {
 			sup_addr->addr_type[i++] = htons(SCTP_IPV4_ADDRESS);
 		}
 		if (stcb->asoc.scope.ipv6_addr_legal) {
 			sup_addr->addr_type[i++] = htons(SCTP_IPV6_ADDRESS);
 		}
 		/* 2 bytes of pad when one type is listed, none when both are. */
 		padding_len = 4 - 2 * i;
 		chunk_len += parameter_len;
 	}
 
 	/* Everything written so far lives in the first mbuf. */
 	SCTP_BUF_LEN(m) = chunk_len;
 	/* now the addresses */
 	/*
 	 * To optimize this we could put the scoping stuff into a structure
 	 * and remove the individual uint8's from the assoc structure. Then
 	 * we could just sifa in the address within the stcb. But for now
 	 * this is a quick hack to get the address stuff teased apart.
 	 */
 	/*
 	 * padding_len and chunk_len are passed by reference and may be
 	 * updated by the callee; the returned mbuf is the one any trailing
 	 * pad is appended to below.
 	 */
 	m_last = sctp_add_addresses_to_i_ia(inp, stcb, &stcb->asoc.scope,
 	    m, cnt_inits_to,
 	    &padding_len, &chunk_len);
 
 	/* The chunk length does not include the pad after the last parameter. */
 	init->ch.chunk_length = htons(chunk_len);
 	if (padding_len > 0) {
 		if (sctp_add_pad_tombuf(m_last, padding_len) == NULL) {
 			sctp_m_freem(m);
 			return;
 		}
 	}
 	SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - calls lowlevel_output\n");
 	/* The INIT is sent with a verification tag of 0. */
 	if ((error = sctp_lowlevel_chunk_output(inp, stcb, net,
 	    (struct sockaddr *)&net->ro._l_addr,
 	    m, 0, NULL, 0, 0, 0, 0,
 	    inp->sctp_lport, stcb->rport, htonl(0),
 	    net->port, NULL,
 	    0, 0,
 	    so_locked))) {
 		SCTPDBG(SCTP_DEBUG_OUTPUT4, "Gak send error %d\n", error);
 		if (error == ENOBUFS) {
 			stcb->asoc.ifp_had_enobuf = 1;
 			SCTP_STAT_INCR(sctps_lowlevelerr);
 		}
 	} else {
 		stcb->asoc.ifp_had_enobuf = 0;
 	}
 	/* Counted and time-stamped whether or not the send succeeded. */
 	SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
 	(void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time);
 }
 
 /*
  * Walk the parameters of an INIT or INIT-ACK chunk and validate them.
  *
  * in_initpkt       - mbuf chain holding the packet.
  * param_offset     - offset into in_initpkt at which the parameters begin.
  * abort_processing - out: set to 1 when a parameter has an invalid size or
  *                    a hostname address parameter is present, else 0. Must
  *                    not be NULL.
  * cp               - chunk header; chunk_length bounds the walk.
  * nat_friendly     - out: set to 1 when SCTP_HAS_NAT_SUPPORT is seen; it is
  *                    never cleared here. Must not be NULL.
  * cookie_found     - out, may be NULL: set to 1 when a state cookie
  *                    parameter is seen, else 0.
  *
  * Returns an mbuf chain with the error report to send (space for the IP,
  * SCTP and chunk headers is reserved in front of it), or NULL when there is
  * nothing to report or no memory was available to build the report.
  */
 struct mbuf *
 sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt,
     int param_offset, int *abort_processing,
     struct sctp_chunkhdr *cp,
     int *nat_friendly,
     int *cookie_found)
 {
 	/*
 	 * Given a mbuf containing an INIT or INIT-ACK with the param_offset
 	 * being equal to the beginning of the params i.e. (iphlen +
 	 * sizeof(struct sctp_init_msg) parse through the parameters to the
 	 * end of the mbuf verifying that all parameters are known.
 	 *
 	 * For unknown parameters build and return a mbuf with
 	 * UNRECOGNIZED_PARAMETER errors. If the flags indicate to stop
 	 * processing this chunk stop, and set *abort_processing to 1.
 	 *
 	 * By having param_offset be pre-set to where parameters begin it is
 	 * hoped that this routine may be reused in the future by new
 	 * features.
 	 */
 	struct sctp_paramhdr *phdr, params;
 
 	struct mbuf *mat, *m_tmp, *op_err, *op_err_last;
 	int at, limit, pad_needed;
 	uint16_t ptype, plen, padded_size;
 
 	*abort_processing = 0;
 	if (cookie_found != NULL) {
 		*cookie_found = 0;
 	}
 	mat = in_initpkt;
 	/* Number of parameter bytes the chunk header claims to carry. */
 	limit = ntohs(cp->chunk_length) - sizeof(struct sctp_init_chunk);
 	at = param_offset;
 	op_err = NULL;
 	op_err_last = NULL;
 	/* Pad owed by the last parameter copied into the op_err chain. */
 	pad_needed = 0;
 	SCTPDBG(SCTP_DEBUG_OUTPUT1, "Check for unrecognized param's\n");
 	phdr = sctp_get_next_param(mat, at, &params, sizeof(params));
 	while ((phdr != NULL) && ((size_t)limit >= sizeof(struct sctp_paramhdr))) {
 		ptype = ntohs(phdr->param_type);
 		plen = ntohs(phdr->param_length);
 		if ((plen > limit) || (plen < sizeof(struct sctp_paramhdr))) {
 			/* wacked parameter */
 			SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error %d\n", plen);
 			goto invalid_size;
 		}
 		limit -= SCTP_SIZE32(plen);
 		/*-
 		 * All parameters for all chunks that we know/understand are
 		 * listed here. We process them other places and make
 		 * appropriate stop actions per the upper bits. However this
 		 * is the generic routine processor's can call to get back
 		 * an operr.. to either incorporate (init-ack) or send.
 		 */
 		padded_size = SCTP_SIZE32(plen);
 		switch (ptype) {
 			/* Param's with variable size */
 		case SCTP_HEARTBEAT_INFO:
 		case SCTP_UNRECOG_PARAM:
 		case SCTP_ERROR_CAUSE_IND:
 			/* ok skip fwd */
 			at += padded_size;
 			break;
 		case SCTP_STATE_COOKIE:
 			if (cookie_found != NULL) {
 				*cookie_found = 1;
 			}
 			at += padded_size;
 			break;
 			/* Param's with variable size within a range */
 		case SCTP_CHUNK_LIST:
 		case SCTP_SUPPORTED_CHUNK_EXT:
 			if (padded_size > (sizeof(struct sctp_supported_chunk_types_param) + (sizeof(uint8_t) * SCTP_MAX_SUPPORTED_EXT))) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error chklist %d\n", plen);
 				goto invalid_size;
 			}
 			at += padded_size;
 			break;
 		case SCTP_SUPPORTED_ADDRTYPE:
 			if (padded_size > SCTP_MAX_ADDR_PARAMS_SIZE) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error supaddrtype %d\n", plen);
 				goto invalid_size;
 			}
 			at += padded_size;
 			break;
 		case SCTP_RANDOM:
 			if (padded_size > (sizeof(struct sctp_auth_random) + SCTP_RANDOM_MAX_SIZE)) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error random %d\n", plen);
 				goto invalid_size;
 			}
 			at += padded_size;
 			break;
 		case SCTP_SET_PRIM_ADDR:
 		case SCTP_DEL_IP_ADDRESS:
 		case SCTP_ADD_IP_ADDRESS:
 			if ((padded_size != sizeof(struct sctp_asconf_addrv4_param)) &&
 			    (padded_size != sizeof(struct sctp_asconf_addr_param))) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error setprim %d\n", plen);
 				goto invalid_size;
 			}
 			at += padded_size;
 			break;
 			/* Param's with a fixed size */
 		case SCTP_IPV4_ADDRESS:
 			if (padded_size != sizeof(struct sctp_ipv4addr_param)) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ipv4 addr %d\n", plen);
 				goto invalid_size;
 			}
 			at += padded_size;
 			break;
 		case SCTP_IPV6_ADDRESS:
 			if (padded_size != sizeof(struct sctp_ipv6addr_param)) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ipv6 addr %d\n", plen);
 				goto invalid_size;
 			}
 			at += padded_size;
 			break;
 		case SCTP_COOKIE_PRESERVE:
 			if (padded_size != sizeof(struct sctp_cookie_perserve_param)) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error cookie-preserve %d\n", plen);
 				goto invalid_size;
 			}
 			at += padded_size;
 			break;
 		case SCTP_HAS_NAT_SUPPORT:
 			*nat_friendly = 1;
 			/* fall through */
 		case SCTP_PRSCTP_SUPPORTED:
 			if (padded_size != sizeof(struct sctp_paramhdr)) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error prsctp/nat support %d\n", plen);
 				goto invalid_size;
 			}
 			at += padded_size;
 			break;
 		case SCTP_ECN_CAPABLE:
 			if (padded_size != sizeof(struct sctp_paramhdr)) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ecn %d\n", plen);
 				goto invalid_size;
 			}
 			at += padded_size;
 			break;
 		case SCTP_ULP_ADAPTATION:
 			if (padded_size != sizeof(struct sctp_adaptation_layer_indication)) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error adapatation %d\n", plen);
 				goto invalid_size;
 			}
 			at += padded_size;
 			break;
 		case SCTP_SUCCESS_REPORT:
 			if (padded_size != sizeof(struct sctp_asconf_paramhdr)) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error success %d\n", plen);
 				goto invalid_size;
 			}
 			at += padded_size;
 			break;
 		case SCTP_HOSTNAME_ADDRESS:
 			{
 				/* Hostname parameters are deprecated. */
 				struct sctp_gen_error_cause *cause;
 				int l_len;
 
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "Can't handle hostname addresses.. abort processing\n");
 				*abort_processing = 1;
 				/* Drop anything collected so far; only this cause is reported. */
 				sctp_m_freem(op_err);
 				op_err = NULL;
 				op_err_last = NULL;
 #ifdef INET6
 				l_len = SCTP_MIN_OVERHEAD;
 #else
 				l_len = SCTP_MIN_V4_OVERHEAD;
 #endif
 				l_len += sizeof(struct sctp_chunkhdr);
 				l_len += sizeof(struct sctp_gen_error_cause);
 				op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA);
 				if (op_err != NULL) {
 					/*
 					 * Pre-reserve space for IP, SCTP,
 					 * and chunk header.
 					 */
 #ifdef INET6
 					SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr));
 #else
 					SCTP_BUF_RESV_UF(op_err, sizeof(struct ip));
 #endif
 					SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr));
 					SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr));
 					SCTP_BUF_LEN(op_err) = sizeof(struct sctp_gen_error_cause);
 					cause = mtod(op_err, struct sctp_gen_error_cause *);
 					cause->code = htons(SCTP_CAUSE_UNRESOLVABLE_ADDR);
 					cause->length = htons((uint16_t)(sizeof(struct sctp_gen_error_cause) + plen));
 					/* The offending parameter follows the cause header. */
 					SCTP_BUF_NEXT(op_err) = SCTP_M_COPYM(mat, at, plen, M_NOWAIT);
 					if (SCTP_BUF_NEXT(op_err) == NULL) {
 						sctp_m_freem(op_err);
 						op_err = NULL;
 						op_err_last = NULL;
 					}
 				}
 				return (op_err);
 			}
 		default:
 			/*
 			 * we do not recognize the parameter figure out what
 			 * we do.
 			 */
 			SCTPDBG(SCTP_DEBUG_OUTPUT1, "Hit default param %x\n", ptype);
 			if ((ptype & 0x4000) == 0x4000) {
 				/* Report bit is set?? */
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "report op err\n");
 				if (op_err == NULL) {
 					int l_len;
 
 					/* Ok need to try to get an mbuf */
 #ifdef INET6
 					l_len = SCTP_MIN_OVERHEAD;
 #else
 					l_len = SCTP_MIN_V4_OVERHEAD;
 #endif
 					l_len += sizeof(struct sctp_chunkhdr);
 					l_len += sizeof(struct sctp_paramhdr);
 					op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA);
 					if (op_err) {
 						SCTP_BUF_LEN(op_err) = 0;
 #ifdef INET6
 						SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr));
 #else
 						SCTP_BUF_RESV_UF(op_err, sizeof(struct ip));
 #endif
 						SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr));
 						SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr));
 						op_err_last = op_err;
 					}
 				}
 				if (op_err != NULL) {
 					/* If we have space */
 					struct sctp_paramhdr *param;
 
 					/* Pad out the previously reported parameter first. */
 					if (pad_needed > 0) {
 						op_err_last = sctp_add_pad_tombuf(op_err_last, pad_needed);
 					}
 					/*
 					 * Whenever the chain is discarded below,
 					 * pad_needed is reset as well: the pad
 					 * belonged to the discarded chain and
 					 * must not be written in front of the
 					 * first parameter of a chain started
 					 * later.
 					 */
 					if (op_err_last == NULL) {
 						sctp_m_freem(op_err);
 						op_err = NULL;
 						op_err_last = NULL;
 						pad_needed = 0;
 						goto more_processing;
 					}
 					if (M_TRAILINGSPACE(op_err_last) < (int)sizeof(struct sctp_paramhdr)) {
 						/* No room for the header: chain a new mbuf. */
 						m_tmp = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_NOWAIT, 1, MT_DATA);
 						if (m_tmp == NULL) {
 							sctp_m_freem(op_err);
 							op_err = NULL;
 							op_err_last = NULL;
 							pad_needed = 0;
 							goto more_processing;
 						}
 						SCTP_BUF_LEN(m_tmp) = 0;
 						SCTP_BUF_NEXT(m_tmp) = NULL;
 						SCTP_BUF_NEXT(op_err_last) = m_tmp;
 						op_err_last = m_tmp;
 					}
 					param = (struct sctp_paramhdr *)(mtod(op_err_last, caddr_t)+SCTP_BUF_LEN(op_err_last));
 					param->param_type = htons(SCTP_UNRECOG_PARAM);
 					param->param_length = htons((uint16_t)sizeof(struct sctp_paramhdr) + plen);
 					SCTP_BUF_LEN(op_err_last) += sizeof(struct sctp_paramhdr);
 					/* Append a copy of the unrecognized parameter itself. */
 					SCTP_BUF_NEXT(op_err_last) = SCTP_M_COPYM(mat, at, plen, M_NOWAIT);
 					if (SCTP_BUF_NEXT(op_err_last) == NULL) {
 						sctp_m_freem(op_err);
 						op_err = NULL;
 						op_err_last = NULL;
 						pad_needed = 0;
 						goto more_processing;
 					} else {
 						/* Advance to the tail of the copied chain. */
 						while (SCTP_BUF_NEXT(op_err_last) != NULL) {
 							op_err_last = SCTP_BUF_NEXT(op_err_last);
 						}
 					}
 					/* The pad is written only if another parameter follows. */
 					if (plen % 4 != 0) {
 						pad_needed = 4 - (plen % 4);
 					} else {
 						pad_needed = 0;
 					}
 				}
 			}
 	more_processing:
 			/* 0x8000 clear: stop here; set: skip and keep going. */
 			if ((ptype & 0x8000) == 0x0000) {
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "stop proc\n");
 				return (op_err);
 			} else {
 				/* skip this chunk and continue processing */
 				SCTPDBG(SCTP_DEBUG_OUTPUT1, "move on\n");
 				at += SCTP_SIZE32(plen);
 			}
 			break;
 
 		}
 		phdr = sctp_get_next_param(mat, at, &params, sizeof(params));
 	}
 	return (op_err);
 invalid_size:
 	/*
 	 * A parameter had an unacceptable length: discard whatever was
 	 * collected and report a protocol violation that carries the header
 	 * (type and length) of the offending parameter.
 	 */
 	SCTPDBG(SCTP_DEBUG_OUTPUT1, "abort flag set\n");
 	*abort_processing = 1;
 	sctp_m_freem(op_err);
 	op_err = NULL;
 	op_err_last = NULL;
 	if (phdr != NULL) {
 		struct sctp_paramhdr *param;
 		int l_len;
 #ifdef INET6
 		l_len = SCTP_MIN_OVERHEAD;
 #else
 		l_len = SCTP_MIN_V4_OVERHEAD;
 #endif
 		l_len += sizeof(struct sctp_chunkhdr);
 		l_len += (2 * sizeof(struct sctp_paramhdr));
 		op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA);
 		if (op_err) {
 			SCTP_BUF_LEN(op_err) = 0;
 #ifdef INET6
 			SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr));
 #else
 			SCTP_BUF_RESV_UF(op_err, sizeof(struct ip));
 #endif
 			SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr));
 			SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr));
 			SCTP_BUF_LEN(op_err) = 2 * sizeof(struct sctp_paramhdr);
 			param = mtod(op_err, struct sctp_paramhdr *);
 			param->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
 			param->param_length = htons(2 * sizeof(struct sctp_paramhdr));
 			param++;
 			param->param_type = htons(ptype);
 			param->param_length = htons(plen);
 		}
 	}
 	return (op_err);
 }
 
 static int
 sctp_are_there_new_addresses(struct sctp_association *asoc,
     struct mbuf *in_initpkt, int offset, struct sockaddr *src)
 {
 	/*
 	 * Given a INIT packet, look through the packet to verify that there
 	 * are NO new addresses. As we go through the parameters add reports
 	 * of any un-understood parameters that require an error.  Also we
 	 * must return (1) to drop the packet if we see a un-understood
 	 * parameter that tells us to drop the chunk.
 	 */
 	struct sockaddr *sa_touse;
 	struct sockaddr *sa;
 	struct sctp_paramhdr *phdr, params;
 	uint16_t ptype, plen;
 	uint8_t fnd;
 	struct sctp_nets *net;
 	int check_src;
 #ifdef INET
 	struct sockaddr_in sin4, *sa4;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 sin6, *sa6;
 #endif
 
 #ifdef INET
 	memset(&sin4, 0, sizeof(sin4));
 	sin4.sin_family = AF_INET;
 	sin4.sin_len = sizeof(sin4);
 #endif
 #ifdef INET6
 	memset(&sin6, 0, sizeof(sin6));
 	sin6.sin6_family = AF_INET6;
 	sin6.sin6_len = sizeof(sin6);
 #endif
 	/* First what about the src address of the pkt ? */
 	check_src = 0;
 	switch (src->sa_family) {
 #ifdef INET
 	case AF_INET:
 		if (asoc->scope.ipv4_addr_legal) {
 			check_src = 1;
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if (asoc->scope.ipv6_addr_legal) {
 			check_src = 1;
 		}
 		break;
 #endif
 	default:
 		/* TSNH */
 		break;
 	}
 	if (check_src) {
 		fnd = 0;
 		TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
 			sa = (struct sockaddr *)&net->ro._l_addr;
 			if (sa->sa_family == src->sa_family) {
 #ifdef INET
 				if (sa->sa_family == AF_INET) {
 					struct sockaddr_in *src4;
 
 					sa4 = (struct sockaddr_in *)sa;
 					src4 = (struct sockaddr_in *)src;
 					if (sa4->sin_addr.s_addr == src4->sin_addr.s_addr) {
 						fnd = 1;
 						break;
 					}
 				}
 #endif
 #ifdef INET6
 				if (sa->sa_family == AF_INET6) {
 					struct sockaddr_in6 *src6;
 
 					sa6 = (struct sockaddr_in6 *)sa;
 					src6 = (struct sockaddr_in6 *)src;
 					if (SCTP6_ARE_ADDR_EQUAL(sa6, src6)) {
 						fnd = 1;
 						break;
 					}
 				}
 #endif
 			}
 		}
 		if (fnd == 0) {
 			/* New address added! no need to look further. */
 			return (1);
 		}
 	}
 	/* Ok so far lets munge through the rest of the packet */
 	offset += sizeof(struct sctp_init_chunk);
 	phdr = sctp_get_next_param(in_initpkt, offset, &params, sizeof(params));
 	while (phdr) {
 		sa_touse = NULL;
 		ptype = ntohs(phdr->param_type);
 		plen = ntohs(phdr->param_length);
 		switch (ptype) {
 #ifdef INET
 		case SCTP_IPV4_ADDRESS:
 			{
 				struct sctp_ipv4addr_param *p4, p4_buf;
 
 				if (plen != sizeof(struct sctp_ipv4addr_param)) {
 					return (1);
 				}
 				phdr = sctp_get_next_param(in_initpkt, offset,
 				    (struct sctp_paramhdr *)&p4_buf, sizeof(p4_buf));
 				if (phdr == NULL) {
 					return (1);
 				}
 				if (asoc->scope.ipv4_addr_legal) {
 					p4 = (struct sctp_ipv4addr_param *)phdr;
 					sin4.sin_addr.s_addr = p4->addr;
 					sa_touse = (struct sockaddr *)&sin4;
 				}
 				break;
 			}
 #endif
 #ifdef INET6
 		case SCTP_IPV6_ADDRESS:
 			{
 				struct sctp_ipv6addr_param *p6, p6_buf;
 
 				if (plen != sizeof(struct sctp_ipv6addr_param)) {
 					return (1);
 				}
 				phdr = sctp_get_next_param(in_initpkt, offset,
 				    (struct sctp_paramhdr *)&p6_buf, sizeof(p6_buf));
 				if (phdr == NULL) {
 					return (1);
 				}
 				if (asoc->scope.ipv6_addr_legal) {
 					p6 = (struct sctp_ipv6addr_param *)phdr;
 					memcpy((caddr_t)&sin6.sin6_addr, p6->addr,
 					    sizeof(p6->addr));
 					sa_touse = (struct sockaddr *)&sin6;
 				}
 				break;
 			}
 #endif
 		default:
 			sa_touse = NULL;
 			break;
 		}
 		if (sa_touse) {
 			/* ok, sa_touse points to one to check */
 			fnd = 0;
 			TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
 				sa = (struct sockaddr *)&net->ro._l_addr;
 				if (sa->sa_family != sa_touse->sa_family) {
 					continue;
 				}
 #ifdef INET
 				if (sa->sa_family == AF_INET) {
 					sa4 = (struct sockaddr_in *)sa;
 					if (sa4->sin_addr.s_addr ==
 					    sin4.sin_addr.s_addr) {
 						fnd = 1;
 						break;
 					}
 				}
 #endif
 #ifdef INET6
 				if (sa->sa_family == AF_INET6) {
 					sa6 = (struct sockaddr_in6 *)sa;
 					if (SCTP6_ARE_ADDR_EQUAL(
 					    sa6, &sin6)) {
 						fnd = 1;
 						break;
 					}
 				}
 #endif
 			}
 			if (!fnd) {
 				/* New addr added! no need to look further */
 				return (1);
 			}
 		}
 		offset += SCTP_SIZE32(plen);
 		phdr = sctp_get_next_param(in_initpkt, offset, &params, sizeof(params));
 	}
 	return (0);
 }
 
 /*
  * Given a MBUF chain that was sent into us containing an INIT. Build a
  * INIT-ACK with COOKIE and send back. We assume that the in_initpkt has done
  * a pullup to include IPv6/4header, SCTP header and initial part of INIT
  * message (i.e. the struct sctp_init_msg).
  */
 void
 sctp_send_initiate_ack(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
     struct sctp_nets *src_net, struct mbuf *init_pkt,
     int iphlen, int offset,
     struct sockaddr *src, struct sockaddr *dst,
     struct sctphdr *sh, struct sctp_init_chunk *init_chk,
     uint8_t mflowtype, uint32_t mflowid,
     uint32_t vrf_id, uint16_t port)
 {
 	struct sctp_association *asoc;
 	struct mbuf *m, *m_tmp, *m_last, *m_cookie, *op_err;
 	struct sctp_init_ack_chunk *initack;
 	struct sctp_adaptation_layer_indication *ali;
 	struct sctp_supported_chunk_types_param *pr_supported;
 	struct sctp_paramhdr *ph;
 	union sctp_sockstore *over_addr;
 	struct sctp_scoping scp;
 	struct timeval now;
 #ifdef INET
 	struct sockaddr_in *dst4 = (struct sockaddr_in *)dst;
 	struct sockaddr_in *src4 = (struct sockaddr_in *)src;
 	struct sockaddr_in *sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)dst;
 	struct sockaddr_in6 *src6 = (struct sockaddr_in6 *)src;
 	struct sockaddr_in6 *sin6;
 #endif
 	struct sockaddr *to;
 	struct sctp_state_cookie stc;
 	struct sctp_nets *net = NULL;
 	uint8_t *signature = NULL;
 	int cnt_inits_to = 0;
 	uint16_t his_limit, i_want;
 	int abort_flag;
 	int nat_friendly = 0;
 	int error;
 	struct socket *so;
 	uint16_t num_ext, chunk_len, padding_len, parameter_len;
 
 	if (stcb) {
 		asoc = &stcb->asoc;
 	} else {
 		asoc = NULL;
 	}
 	if ((asoc != NULL) &&
 	    (SCTP_GET_STATE(stcb) != SCTP_STATE_COOKIE_WAIT)) {
 		if (sctp_are_there_new_addresses(asoc, init_pkt, offset, src)) {
 			/*
 			 * new addresses, out of here in non-cookie-wait
 			 * states
 			 *
 			 * Send an ABORT, without the new address error
 			 * cause. This looks no different than if no
 			 * listener was present.
 			 */
 			op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 			    "Address added");
 			sctp_send_abort(init_pkt, iphlen, src, dst, sh, 0, op_err,
 			    mflowtype, mflowid, inp->fibnum,
 			    vrf_id, port);
 			return;
 		}
 		if (src_net != NULL && (src_net->port != port)) {
 			/*
 			 * change of remote encapsulation port, out of here
 			 * in non-cookie-wait states
 			 *
 			 * Send an ABORT, without an specific error cause.
 			 * This looks no different than if no listener was
 			 * present.
 			 */
 			op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 			    "Remote encapsulation port changed");
 			sctp_send_abort(init_pkt, iphlen, src, dst, sh, 0, op_err,
 			    mflowtype, mflowid, inp->fibnum,
 			    vrf_id, port);
 			return;
 		}
 	}
 	abort_flag = 0;
 	op_err = sctp_arethere_unrecognized_parameters(init_pkt,
 	    (offset + sizeof(struct sctp_init_chunk)),
 	    &abort_flag,
 	    (struct sctp_chunkhdr *)init_chk,
 	    &nat_friendly, NULL);
 	if (abort_flag) {
 do_a_abort:
 		if (op_err == NULL) {
 			char msg[SCTP_DIAG_INFO_LEN];
 
 			SCTP_SNPRINTF(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__);
 			op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 			    msg);
 		}
 		sctp_send_abort(init_pkt, iphlen, src, dst, sh,
 		    init_chk->init.initiate_tag, op_err,
 		    mflowtype, mflowid, inp->fibnum,
 		    vrf_id, port);
 		return;
 	}
 	m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
 	if (m == NULL) {
 		/* No memory, INIT timer will re-attempt. */
 		sctp_m_freem(op_err);
 		return;
 	}
 	chunk_len = (uint16_t)sizeof(struct sctp_init_ack_chunk);
 	padding_len = 0;
 
 	/*
 	 * We might not overwrite the identification[] completely and on
 	 * some platforms time_entered will contain some padding. Therefore
 	 * zero out the cookie to avoid putting uninitialized memory on the
 	 * wire.
 	 */
 	memset(&stc, 0, sizeof(struct sctp_state_cookie));
 
 	/* the time I built cookie */
 	(void)SCTP_GETTIME_TIMEVAL(&now);
 	stc.time_entered.tv_sec = now.tv_sec;
 	stc.time_entered.tv_usec = now.tv_usec;
 
 	/* populate any tie tags */
 	if (asoc != NULL) {
 		/* unlock before tag selections */
 		stc.tie_tag_my_vtag = asoc->my_vtag_nonce;
 		stc.tie_tag_peer_vtag = asoc->peer_vtag_nonce;
 		stc.cookie_life = asoc->cookie_life;
 		net = asoc->primary_destination;
 	} else {
 		stc.tie_tag_my_vtag = 0;
 		stc.tie_tag_peer_vtag = 0;
 		/* life I will award this cookie */
 		stc.cookie_life = inp->sctp_ep.def_cookie_life;
 	}
 
 	/* copy in the ports for later check */
 	stc.myport = sh->dest_port;
 	stc.peerport = sh->src_port;
 
 	/*
 	 * If we wanted to honor cookie life extensions, we would add to
 	 * stc.cookie_life. For now we should NOT honor any extension
 	 */
 	stc.site_scope = stc.local_scope = stc.loopback_scope = 0;
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
 		stc.ipv6_addr_legal = 1;
 		if (SCTP_IPV6_V6ONLY(inp)) {
 			stc.ipv4_addr_legal = 0;
 		} else {
 			stc.ipv4_addr_legal = 1;
 		}
 	} else {
 		stc.ipv6_addr_legal = 0;
 		stc.ipv4_addr_legal = 1;
 	}
 	stc.ipv4_scope = 0;
 	if (net == NULL) {
 		to = src;
 		switch (dst->sa_family) {
 #ifdef INET
 		case AF_INET:
 			{
 				/* lookup address */
 				stc.address[0] = src4->sin_addr.s_addr;
 				stc.address[1] = 0;
 				stc.address[2] = 0;
 				stc.address[3] = 0;
 				stc.addr_type = SCTP_IPV4_ADDRESS;
 				/* local from address */
 				stc.laddress[0] = dst4->sin_addr.s_addr;
 				stc.laddress[1] = 0;
 				stc.laddress[2] = 0;
 				stc.laddress[3] = 0;
 				stc.laddr_type = SCTP_IPV4_ADDRESS;
 				/* scope_id is only for v6 */
 				stc.scope_id = 0;
 				if ((IN4_ISPRIVATE_ADDRESS(&src4->sin_addr)) ||
 				    (IN4_ISPRIVATE_ADDRESS(&dst4->sin_addr))) {
 					stc.ipv4_scope = 1;
 				}
 				/* Must use the address in this case */
 				if (sctp_is_address_on_local_host(src, vrf_id)) {
 					stc.loopback_scope = 1;
 					stc.ipv4_scope = 1;
 					stc.site_scope = 1;
 					stc.local_scope = 0;
 				}
 				break;
 			}
 #endif
 #ifdef INET6
 		case AF_INET6:
 			{
 				stc.addr_type = SCTP_IPV6_ADDRESS;
 				memcpy(&stc.address, &src6->sin6_addr, sizeof(struct in6_addr));
 				stc.scope_id = ntohs(in6_getscope(&src6->sin6_addr));
 				if (sctp_is_address_on_local_host(src, vrf_id)) {
 					stc.loopback_scope = 1;
 					stc.local_scope = 0;
 					stc.site_scope = 1;
 					stc.ipv4_scope = 1;
 				} else if (IN6_IS_ADDR_LINKLOCAL(&src6->sin6_addr) ||
 				    IN6_IS_ADDR_LINKLOCAL(&dst6->sin6_addr)) {
 					/*
 					 * If the new destination or source
 					 * is a LINK_LOCAL we must have
 					 * common both site and local scope.
 					 * Don't set local scope though
 					 * since we must depend on the
 					 * source to be added implicitly. We
 					 * cannot assure just because we
 					 * share one link that all links are
 					 * common.
 					 */
 					stc.local_scope = 0;
 					stc.site_scope = 1;
 					stc.ipv4_scope = 1;
 					/*
 					 * we start counting for the private
 					 * address stuff at 1. since the
 					 * link local we source from won't
 					 * show up in our scoped count.
 					 */
 					cnt_inits_to = 1;
 					/*
 					 * pull out the scope_id from
 					 * incoming pkt
 					 */
 				} else if (IN6_IS_ADDR_SITELOCAL(&src6->sin6_addr) ||
 				    IN6_IS_ADDR_SITELOCAL(&dst6->sin6_addr)) {
 					/*
 					 * If the new destination or source
 					 * is SITE_LOCAL then we must have
 					 * site scope in common.
 					 */
 					stc.site_scope = 1;
 				}
 				memcpy(&stc.laddress, &dst6->sin6_addr, sizeof(struct in6_addr));
 				stc.laddr_type = SCTP_IPV6_ADDRESS;
 				break;
 			}
 #endif
 		default:
 			/* TSNH */
 			goto do_a_abort;
 			break;
 		}
 	} else {
 		/* set the scope per the existing tcb */
 
 #ifdef INET6
 		struct sctp_nets *lnet;
 #endif
 
 		stc.loopback_scope = asoc->scope.loopback_scope;
 		stc.ipv4_scope = asoc->scope.ipv4_local_scope;
 		stc.site_scope = asoc->scope.site_scope;
 		stc.local_scope = asoc->scope.local_scope;
 #ifdef INET6
 		/* Why do we not consider IPv4 LL addresses? */
 		TAILQ_FOREACH(lnet, &asoc->nets, sctp_next) {
 			if (lnet->ro._l_addr.sin6.sin6_family == AF_INET6) {
 				if (IN6_IS_ADDR_LINKLOCAL(&lnet->ro._l_addr.sin6.sin6_addr)) {
 					/*
 					 * if we have a LL address, start
 					 * counting at 1.
 					 */
 					cnt_inits_to = 1;
 				}
 			}
 		}
 #endif
 		/* use the net pointer */
 		to = (struct sockaddr *)&net->ro._l_addr;
 		switch (to->sa_family) {
 #ifdef INET
 		case AF_INET:
 			sin = (struct sockaddr_in *)to;
 			stc.address[0] = sin->sin_addr.s_addr;
 			stc.address[1] = 0;
 			stc.address[2] = 0;
 			stc.address[3] = 0;
 			stc.addr_type = SCTP_IPV4_ADDRESS;
 			if (net->src_addr_selected == 0) {
 				/*
 				 * strange case here, the INIT should have
 				 * did the selection.
 				 */
 				net->ro._s_addr = sctp_source_address_selection(inp,
 				    stcb, (sctp_route_t *)&net->ro,
 				    net, 0, vrf_id);
 				if (net->ro._s_addr == NULL) {
 					sctp_m_freem(op_err);
 					sctp_m_freem(m);
 					return;
 				}
 
 				net->src_addr_selected = 1;
 
 			}
 			stc.laddress[0] = net->ro._s_addr->address.sin.sin_addr.s_addr;
 			stc.laddress[1] = 0;
 			stc.laddress[2] = 0;
 			stc.laddress[3] = 0;
 			stc.laddr_type = SCTP_IPV4_ADDRESS;
 			/* scope_id is only for v6 */
 			stc.scope_id = 0;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			sin6 = (struct sockaddr_in6 *)to;
 			memcpy(&stc.address, &sin6->sin6_addr,
 			    sizeof(struct in6_addr));
 			stc.addr_type = SCTP_IPV6_ADDRESS;
 			stc.scope_id = sin6->sin6_scope_id;
 			if (net->src_addr_selected == 0) {
 				/*
 				 * strange case here, the INIT should have
 				 * done the selection.
 				 */
 				net->ro._s_addr = sctp_source_address_selection(inp,
 				    stcb, (sctp_route_t *)&net->ro,
 				    net, 0, vrf_id);
 				if (net->ro._s_addr == NULL) {
 					sctp_m_freem(op_err);
 					sctp_m_freem(m);
 					return;
 				}
 
 				net->src_addr_selected = 1;
 			}
 			memcpy(&stc.laddress, &net->ro._s_addr->address.sin6.sin6_addr,
 			    sizeof(struct in6_addr));
 			stc.laddr_type = SCTP_IPV6_ADDRESS;
 			break;
 #endif
 		}
 	}
 	/* Now lets put the SCTP header in place */
 	initack = mtod(m, struct sctp_init_ack_chunk *);
 	/* Save it off for quick ref */
 	stc.peers_vtag = ntohl(init_chk->init.initiate_tag);
 	/* who are we */
 	memcpy(stc.identification, SCTP_VERSION_STRING,
 	    min(strlen(SCTP_VERSION_STRING), sizeof(stc.identification)));
 	memset(stc.reserved, 0, SCTP_RESERVE_SPACE);
 	/* now the chunk header */
 	initack->ch.chunk_type = SCTP_INITIATION_ACK;
 	initack->ch.chunk_flags = 0;
 	/* fill in later from mbuf we build */
 	initack->ch.chunk_length = 0;
 	/* place in my tag */
 	if ((asoc != NULL) &&
 	    ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_INUSE) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED))) {
 		/* re-use the v-tags and init-seq here */
 		initack->init.initiate_tag = htonl(asoc->my_vtag);
 		initack->init.initial_tsn = htonl(asoc->init_seq_number);
 	} else {
 		uint32_t vtag, itsn;
 
 		if (asoc) {
 			atomic_add_int(&asoc->refcnt, 1);
 			SCTP_TCB_UNLOCK(stcb);
 	new_tag:
 			vtag = sctp_select_a_tag(inp, inp->sctp_lport, sh->src_port, 1);
 			if ((asoc->peer_supports_nat) && (vtag == asoc->my_vtag)) {
 				/*
 				 * Got a duplicate vtag on some guy behind a
 				 * nat make sure we don't use it.
 				 */
 				goto new_tag;
 			}
 			initack->init.initiate_tag = htonl(vtag);
 			/* get a TSN to use too */
 			itsn = sctp_select_initial_TSN(&inp->sctp_ep);
 			initack->init.initial_tsn = htonl(itsn);
 			SCTP_TCB_LOCK(stcb);
 			atomic_add_int(&asoc->refcnt, -1);
 		} else {
 			SCTP_INP_INCR_REF(inp);
 			SCTP_INP_RUNLOCK(inp);
 			vtag = sctp_select_a_tag(inp, inp->sctp_lport, sh->src_port, 1);
 			initack->init.initiate_tag = htonl(vtag);
 			/* get a TSN to use too */
 			initack->init.initial_tsn = htonl(sctp_select_initial_TSN(&inp->sctp_ep));
 			SCTP_INP_RLOCK(inp);
 			SCTP_INP_DECR_REF(inp);
 		}
 	}
 	/* save away my tag to */
 	stc.my_vtag = initack->init.initiate_tag;
 
 	/* set up some of the credits. */
 	so = inp->sctp_socket;
 	if (so == NULL) {
 		/* memory problem */
 		sctp_m_freem(op_err);
 		sctp_m_freem(m);
 		return;
 	} else {
 		initack->init.a_rwnd = htonl(max(SCTP_SB_LIMIT_RCV(so), SCTP_MINIMAL_RWND));
 	}
 	/* set what I want */
 	his_limit = ntohs(init_chk->init.num_inbound_streams);
 	/* choose what I want */
 	if (asoc != NULL) {
 		if (asoc->streamoutcnt > asoc->pre_open_streams) {
 			i_want = asoc->streamoutcnt;
 		} else {
 			i_want = asoc->pre_open_streams;
 		}
 	} else {
 		i_want = inp->sctp_ep.pre_open_stream_count;
 	}
 	if (his_limit < i_want) {
 		/* I Want more :< */
 		initack->init.num_outbound_streams = init_chk->init.num_inbound_streams;
 	} else {
 		/* I can have what I want :> */
 		initack->init.num_outbound_streams = htons(i_want);
 	}
 	/* tell him his limit. */
 	initack->init.num_inbound_streams =
 	    htons(inp->sctp_ep.max_open_streams_intome);
 
 	/* adaptation layer indication parameter */
 	if (inp->sctp_ep.adaptation_layer_indicator_provided) {
 		parameter_len = (uint16_t)sizeof(struct sctp_adaptation_layer_indication);
 		ali = (struct sctp_adaptation_layer_indication *)(mtod(m, caddr_t)+chunk_len);
 		ali->ph.param_type = htons(SCTP_ULP_ADAPTATION);
 		ali->ph.param_length = htons(parameter_len);
 		ali->indication = htonl(inp->sctp_ep.adaptation_layer_indicator);
 		chunk_len += parameter_len;
 	}
 
 	/* ECN parameter */
 	if (((asoc != NULL) && (asoc->ecn_supported == 1)) ||
 	    ((asoc == NULL) && (inp->ecn_supported == 1))) {
 		parameter_len = (uint16_t)sizeof(struct sctp_paramhdr);
 		ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len);
 		ph->param_type = htons(SCTP_ECN_CAPABLE);
 		ph->param_length = htons(parameter_len);
 		chunk_len += parameter_len;
 	}
 
 	/* PR-SCTP supported parameter */
 	if (((asoc != NULL) && (asoc->prsctp_supported == 1)) ||
 	    ((asoc == NULL) && (inp->prsctp_supported == 1))) {
 		parameter_len = (uint16_t)sizeof(struct sctp_paramhdr);
 		ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len);
 		ph->param_type = htons(SCTP_PRSCTP_SUPPORTED);
 		ph->param_length = htons(parameter_len);
 		chunk_len += parameter_len;
 	}
 
 	/* Add NAT friendly parameter */
 	if (nat_friendly) {
 		parameter_len = (uint16_t)sizeof(struct sctp_paramhdr);
 		ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len);
 		ph->param_type = htons(SCTP_HAS_NAT_SUPPORT);
 		ph->param_length = htons(parameter_len);
 		chunk_len += parameter_len;
 	}
 
 	/* And now tell the peer which extensions we support */
 	num_ext = 0;
 	pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+chunk_len);
 	if (((asoc != NULL) && (asoc->prsctp_supported == 1)) ||
 	    ((asoc == NULL) && (inp->prsctp_supported == 1))) {
 		pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN;
 		if (((asoc != NULL) && (asoc->idata_supported == 1)) ||
 		    ((asoc == NULL) && (inp->idata_supported == 1))) {
 			pr_supported->chunk_types[num_ext++] = SCTP_IFORWARD_CUM_TSN;
 		}
 	}
 	if (((asoc != NULL) && (asoc->auth_supported == 1)) ||
 	    ((asoc == NULL) && (inp->auth_supported == 1))) {
 		pr_supported->chunk_types[num_ext++] = SCTP_AUTHENTICATION;
 	}
 	if (((asoc != NULL) && (asoc->asconf_supported == 1)) ||
 	    ((asoc == NULL) && (inp->asconf_supported == 1))) {
 		pr_supported->chunk_types[num_ext++] = SCTP_ASCONF;
 		pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK;
 	}
 	if (((asoc != NULL) && (asoc->reconfig_supported == 1)) ||
 	    ((asoc == NULL) && (inp->reconfig_supported == 1))) {
 		pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET;
 	}
 	if (((asoc != NULL) && (asoc->idata_supported == 1)) ||
 	    ((asoc == NULL) && (inp->idata_supported == 1))) {
 		pr_supported->chunk_types[num_ext++] = SCTP_IDATA;
 	}
 	if (((asoc != NULL) && (asoc->nrsack_supported == 1)) ||
 	    ((asoc == NULL) && (inp->nrsack_supported == 1))) {
 		pr_supported->chunk_types[num_ext++] = SCTP_NR_SELECTIVE_ACK;
 	}
 	if (((asoc != NULL) && (asoc->pktdrop_supported == 1)) ||
 	    ((asoc == NULL) && (inp->pktdrop_supported == 1))) {
 		pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED;
 	}
 	if (num_ext > 0) {
 		parameter_len = (uint16_t)sizeof(struct sctp_supported_chunk_types_param) + num_ext;
 		pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT);
 		pr_supported->ph.param_length = htons(parameter_len);
 		padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
 		chunk_len += parameter_len;
 	}
 
 	/* add authentication parameters */
 	if (((asoc != NULL) && (asoc->auth_supported == 1)) ||
 	    ((asoc == NULL) && (inp->auth_supported == 1))) {
 		struct sctp_auth_random *randp;
 		struct sctp_auth_hmac_algo *hmacs;
 		struct sctp_auth_chunk_list *chunks;
 
 		if (padding_len > 0) {
 			memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
 			chunk_len += padding_len;
 			padding_len = 0;
 		}
 		/* generate and add RANDOM parameter */
 		randp = (struct sctp_auth_random *)(mtod(m, caddr_t)+chunk_len);
 		parameter_len = (uint16_t)sizeof(struct sctp_auth_random) +
 		    SCTP_AUTH_RANDOM_SIZE_DEFAULT;
 		randp->ph.param_type = htons(SCTP_RANDOM);
 		randp->ph.param_length = htons(parameter_len);
 		SCTP_READ_RANDOM(randp->random_data, SCTP_AUTH_RANDOM_SIZE_DEFAULT);
 		padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
 		chunk_len += parameter_len;
 
 		if (padding_len > 0) {
 			memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
 			chunk_len += padding_len;
 			padding_len = 0;
 		}
 		/* add HMAC_ALGO parameter */
 		hmacs = (struct sctp_auth_hmac_algo *)(mtod(m, caddr_t)+chunk_len);
 		parameter_len = (uint16_t)sizeof(struct sctp_auth_hmac_algo) +
 		    sctp_serialize_hmaclist(inp->sctp_ep.local_hmacs,
 		    (uint8_t *)hmacs->hmac_ids);
 		hmacs->ph.param_type = htons(SCTP_HMAC_LIST);
 		hmacs->ph.param_length = htons(parameter_len);
 		padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
 		chunk_len += parameter_len;
 
 		if (padding_len > 0) {
 			memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
 			chunk_len += padding_len;
 			padding_len = 0;
 		}
 		/* add CHUNKS parameter */
 		chunks = (struct sctp_auth_chunk_list *)(mtod(m, caddr_t)+chunk_len);
 		parameter_len = (uint16_t)sizeof(struct sctp_auth_chunk_list) +
 		    sctp_serialize_auth_chunks(inp->sctp_ep.local_auth_chunks,
 		    chunks->chunk_types);
 		chunks->ph.param_type = htons(SCTP_CHUNK_LIST);
 		chunks->ph.param_length = htons(parameter_len);
 		padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
 		chunk_len += parameter_len;
 	}
 	SCTP_BUF_LEN(m) = chunk_len;
 	m_last = m;
 	/* now the addresses */
 	/*
 	 * To optimize this we could put the scoping stuff into a structure
 	 * and remove the individual uint8's from the stc structure. Then we
 	 * could just sifa in the address within the stc.. but for now this
 	 * is a quick hack to get the address stuff teased apart.
 	 */
 	scp.ipv4_addr_legal = stc.ipv4_addr_legal;
 	scp.ipv6_addr_legal = stc.ipv6_addr_legal;
 	scp.loopback_scope = stc.loopback_scope;
 	scp.ipv4_local_scope = stc.ipv4_scope;
 	scp.local_scope = stc.local_scope;
 	scp.site_scope = stc.site_scope;
 	m_last = sctp_add_addresses_to_i_ia(inp, stcb, &scp, m_last,
 	    cnt_inits_to,
 	    &padding_len, &chunk_len);
 	/* padding_len can only be positive, if no addresses have been added */
 	if (padding_len > 0) {
 		memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
 		chunk_len += padding_len;
 		SCTP_BUF_LEN(m) += padding_len;
 		padding_len = 0;
 	}
 
 	/* tack on the operational error if present */
 	if (op_err) {
 		parameter_len = 0;
 		for (m_tmp = op_err; m_tmp != NULL; m_tmp = SCTP_BUF_NEXT(m_tmp)) {
 			parameter_len += SCTP_BUF_LEN(m_tmp);
 		}
 		padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
 		SCTP_BUF_NEXT(m_last) = op_err;
 		while (SCTP_BUF_NEXT(m_last) != NULL) {
 			m_last = SCTP_BUF_NEXT(m_last);
 		}
 		chunk_len += parameter_len;
 	}
 	if (padding_len > 0) {
 		m_last = sctp_add_pad_tombuf(m_last, padding_len);
 		if (m_last == NULL) {
 			/* Houston we have a problem, no space */
 			sctp_m_freem(m);
 			return;
 		}
 		chunk_len += padding_len;
 		padding_len = 0;
 	}
 	/* Now we must build a cookie */
 	m_cookie = sctp_add_cookie(init_pkt, offset, m, 0, &stc, &signature);
 	if (m_cookie == NULL) {
 		/* memory problem */
 		sctp_m_freem(m);
 		return;
 	}
 	/* Now append the cookie to the end and update the space/size */
 	SCTP_BUF_NEXT(m_last) = m_cookie;
 	parameter_len = 0;
 	for (m_tmp = m_cookie; m_tmp != NULL; m_tmp = SCTP_BUF_NEXT(m_tmp)) {
 		parameter_len += SCTP_BUF_LEN(m_tmp);
 		if (SCTP_BUF_NEXT(m_tmp) == NULL) {
 			m_last = m_tmp;
 		}
 	}
 	padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
 	chunk_len += parameter_len;
 
 	/*
 	 * Place in the size, but we don't include the last pad (if any) in
 	 * the INIT-ACK.
 	 */
 	initack->ch.chunk_length = htons(chunk_len);
 
 	/*
 	 * Time to sign the cookie, we don't sign over the cookie signature
 	 * though thus we set trailer.
 	 */
 	(void)sctp_hmac_m(SCTP_HMAC,
 	    (uint8_t *)inp->sctp_ep.secret_key[(int)(inp->sctp_ep.current_secret_number)],
 	    SCTP_SECRET_SIZE, m_cookie, sizeof(struct sctp_paramhdr),
 	    (uint8_t *)signature, SCTP_SIGNATURE_SIZE);
 	/*
 	 * We sifa 0 here to NOT set IP_DF if its IPv4, we ignore the return
 	 * here since the timer will drive a retranmission.
 	 */
 	if (padding_len > 0) {
 		if (sctp_add_pad_tombuf(m_last, padding_len) == NULL) {
 			sctp_m_freem(m);
 			return;
 		}
 	}
 	if (stc.loopback_scope) {
 		over_addr = (union sctp_sockstore *)dst;
 	} else {
 		over_addr = NULL;
 	}
 
 	if ((error = sctp_lowlevel_chunk_output(inp, NULL, NULL, to, m, 0, NULL, 0, 0,
 	    0, 0,
 	    inp->sctp_lport, sh->src_port, init_chk->init.initiate_tag,
 	    port, over_addr,
 	    mflowtype, mflowid,
 	    SCTP_SO_NOT_LOCKED))) {
 		SCTPDBG(SCTP_DEBUG_OUTPUT4, "Gak send error %d\n", error);
 		if (error == ENOBUFS) {
 			if (asoc != NULL) {
 				asoc->ifp_had_enobuf = 1;
 			}
 			SCTP_STAT_INCR(sctps_lowlevelerr);
 		}
 	} else {
 		if (asoc != NULL) {
 			asoc->ifp_had_enobuf = 0;
 		}
 	}
 	SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
 }
 
 
 /*
  * Release queued chunks that use the buffer-limited PR-SCTP policy and whose
  * timetodrop.tv_sec value is larger than srcv->sinfo_timetolive.  The sent
  * queue is walked first, then the send queue.  The values returned by
  * sctp_release_pr_sctp_chunk() are summed and the walk stops as soon as the
  * sum reaches dataout.  Nothing is done unless the association supports
  * PR-SCTP and has removable chunks on its sent queue.
  */
 static void
 sctp_prune_prsctp(struct sctp_tcb *stcb,
     struct sctp_association *asoc,
     struct sctp_sndrcvinfo *srcv,
     int dataout)
 {
 	struct sctp_tmit_chunk *chk, *nchk;
 	int reclaimed;
 
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	if (!(asoc->prsctp_supported) ||
 	    !(asoc->sent_queue_cnt_removeable > 0)) {
 		/* Nothing that could be pruned. */
 		return;
 	}
 	reclaimed = 0;
 
 	/* Pass 1: chunks sitting on the sent queue. */
 	TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
 		/* Only the buffer-limited PR-SCTP policy qualifies. */
 		if (!(PR_SCTP_BUF_ENABLED(chk->flags))) {
 			continue;
 		}
 		/*
 		 * Lower numbers mean higher priority; skip chunks that are
 		 * not of lower priority than the message being sent.
 		 */
 		if (!(chk->rec.data.timetodrop.tv_sec > (long)srcv->sinfo_timetolive)) {
 			continue;
 		}
 		/* Space is only released when the mbuf is still attached. */
 		if (!(chk->data)) {
 			continue;
 		}
 		reclaimed += sctp_release_pr_sctp_chunk(stcb, chk,
 		    (chk->sent > SCTP_DATAGRAM_UNSENT) ? 1 : 0,
 		    SCTP_SO_LOCKED);
 		if (reclaimed >= dataout) {
 			return;
 		}
 	}
 
 	/* Pass 2: chunks still waiting on the send queue. */
 	TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) {
 		if (!(PR_SCTP_BUF_ENABLED(chk->flags))) {
 			continue;
 		}
 		if (!(chk->rec.data.timetodrop.tv_sec > (long)srcv->sinfo_timetolive)) {
 			continue;
 		}
 		if (!(chk->data)) {
 			continue;
 		}
 		reclaimed += sctp_release_pr_sctp_chunk(stcb, chk,
 		    0, SCTP_SO_LOCKED);
 		if (reclaimed >= dataout) {
 			return;
 		}
 	}
 }
 
 /*
  * Compute the fragmentation point for the association: the smaller of the
  * configured sctp_frag_point and the smallest MTU, minus the header
  * overhead, minus the AUTH chunk length when DATA chunks must be
  * authenticated, with the remainder modulo 4 stripped off.
  */
 int
 sctp_get_frag_point(struct sctp_tcb *stcb,
     struct sctp_association *asoc)
 {
 	int siz, ovh;
 
 	/*
 	 * An endpoint bound to v6 has to leave room for the IPv6 header;
 	 * a v4-only endpoint gets away with the smaller v4 overhead.
 	 */
 	ovh = (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) ?
 	    SCTP_MIN_OVERHEAD : SCTP_MIN_V4_OVERHEAD;
 	ovh += SCTP_DATA_CHUNK_OVERHEAD(stcb);
 
 	/* Start from whichever limit is tighter. */
 	if (asoc->smallest_mtu < stcb->asoc.sctp_frag_point) {
 		siz = asoc->smallest_mtu - ovh;
 	} else {
 		siz = (stcb->asoc.sctp_frag_point - ovh);
 	}
 
 	/* adjust for an AUTH chunk if DATA requires auth */
 	if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) {
 		siz -= sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id);
 	}
 
 	/* Drop the remainder so the result is a multiple of four. */
 	siz -= (siz % 4);
 	return (siz);
 }
 
 /*
  * Derive the PR-SCTP policy for a pending message and fill in sp->ts
  * accordingly.
  *
  * If sinfo_flags already selects a PR-SCTP policy it is copied into
  * act_flags.  Otherwise a positive timetolive selects the TTL policy.  If
  * neither applies the message is left untouched.
  *
  * sp->ts then holds, depending on the policy:
  *   BUF: timetolive in tv_sec (used as a priority), tv_usec = 0
  *   TTL: current time plus timetolive, with timetolive taken as milliseconds
  *   RTX: timetolive in tv_sec (number of retransmissions), tv_usec = 0
  */
 static void
 sctp_set_prsctp_policy(struct sctp_stream_queue_pending *sp)
 {
 	/*
 	 * We assume that the user wants PR_SCTP_TTL if the user provides a
 	 * positive lifetime but does not specify any PR_SCTP policy.
 	 */
 	if (PR_SCTP_ENABLED(sp->sinfo_flags)) {
 		sp->act_flags |= PR_SCTP_POLICY(sp->sinfo_flags);
 	} else if (sp->timetolive > 0) {
 		sp->sinfo_flags |= SCTP_PR_SCTP_TTL;
 		sp->act_flags |= PR_SCTP_POLICY(sp->sinfo_flags);
 	} else {
 		return;
 	}
 	switch (PR_SCTP_POLICY(sp->sinfo_flags)) {
 	case CHUNK_FLAGS_PR_SCTP_BUF:
 		/*
 		 * Time to live is a priority stored in tv_sec when doing
 		 * the buffer drop thing.
 		 */
 		sp->ts.tv_sec = sp->timetolive;
 		sp->ts.tv_usec = 0;
 		break;
 	case CHUNK_FLAGS_PR_SCTP_TTL:
 		{
 			struct timeval tv;
 
 			(void)SCTP_GETTIME_TIMEVAL(&sp->ts);
 			tv.tv_sec = sp->timetolive / 1000;
 			/*
 			 * Take the sub-second part first and scale it
 			 * afterwards.  This is equal to
 			 * (timetolive * 1000) % 1000000 but the product
 			 * cannot wrap for large lifetimes.
 			 */
 			tv.tv_usec = (sp->timetolive % 1000) * 1000;
 			/*
 			 * TODO sctp_constants.h needs alternative time
 			 * macros when _KERNEL is undefined.
 			 */
 			timevaladd(&sp->ts, &tv);
 		}
 		break;
 	case CHUNK_FLAGS_PR_SCTP_RTX:
 		/*
 		 * Time to live is the number of retransmissions stored in
 		 * tv_sec.
 		 */
 		sp->ts.tv_sec = sp->timetolive;
 		sp->ts.tv_usec = 0;
 		break;
 	default:
 		SCTPDBG(SCTP_DEBUG_USRREQ1,
 		    "Unknown PR_SCTP policy %u.\n",
 		    PR_SCTP_POLICY(sp->sinfo_flags));
 		break;
 	}
 }
 
 /*
  * Queue one complete user message (the mbuf chain m) on the outgoing stream
  * named by srcv->sinfo_stream of the association stcb.
  *
  * The chain is consumed on every path: on success it becomes the data of a
  * newly allocated stream-queue entry, on failure it is freed at out_now.
  *
  * net is only recorded (and referenced) when SCTP_ADDR_OVER is set in
  * srcv->sinfo_flags.  When hold_stcb_lock is zero the TCB send lock is taken
  * around the enqueue; when it is non-zero no send lock is taken here
  * (presumably the caller already holds it -- confirm against callers).
  *
  * Returns 0 on success, EINVAL for a bad or locked-out stream, ECONNRESET
  * when the association is shutting down, ENOMEM when no queue entry could be
  * allocated.
  */
 static int
 sctp_msg_append(struct sctp_tcb *stcb,
     struct sctp_nets *net,
     struct mbuf *m,
     struct sctp_sndrcvinfo *srcv, int hold_stcb_lock)
 {
 	int error = 0;
 	struct mbuf *at;
 	struct sctp_stream_queue_pending *sp = NULL;
 	struct sctp_stream_out *strm;
 
 	/*
 	 * Given an mbuf chain, put it into the association send queue and
 	 * place it on the wheel
 	 */
 	if (srcv->sinfo_stream >= stcb->asoc.streamoutcnt) {
 		/* Invalid stream number */
 		SCTP_LTRACE_ERR_RET_PKT(m, NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 		error = EINVAL;
 		goto out_now;
 	}
 	/* The association is locked on a different stream than the one requested. */
 	if ((stcb->asoc.stream_locked) &&
 	    (stcb->asoc.stream_locked_on != srcv->sinfo_stream)) {
 		SCTP_LTRACE_ERR_RET_PKT(m, NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 		error = EINVAL;
 		goto out_now;
 	}
 	strm = &stcb->asoc.strmout[srcv->sinfo_stream];
 	/* Now can we send this? */
 	if ((SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_SENT) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED) ||
 	    (stcb->asoc.state & SCTP_STATE_SHUTDOWN_PENDING)) {
 		/* got data while shutting down */
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET);
 		error = ECONNRESET;
 		goto out_now;
 	}
 	sctp_alloc_a_strmoq(stcb, sp);
 	if (sp == NULL) {
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 		error = ENOMEM;
 		goto out_now;
 	}
 	/* Copy the per-message send parameters into the queue entry. */
 	sp->sinfo_flags = srcv->sinfo_flags;
 	sp->timetolive = srcv->sinfo_timetolive;
 	sp->ppid = srcv->sinfo_ppid;
 	sp->context = srcv->sinfo_context;
 	sp->fsn = 0;
 	if (sp->sinfo_flags & SCTP_ADDR_OVER) {
 		/* Remember the requested destination and hold a reference on it. */
 		sp->net = net;
 		atomic_add_int(&sp->net->ref_count, 1);
 	} else {
 		sp->net = NULL;
 	}
 	(void)SCTP_GETTIME_TIMEVAL(&sp->ts);
 	sp->sid = srcv->sinfo_stream;
 	/* The whole message is handed over in one call. */
 	sp->msg_is_complete = 1;
 	sp->sender_all_done = 1;
 	sp->some_taken = 0;
 	sp->data = m;
 	sp->tail_mbuf = NULL;
 	sctp_set_prsctp_policy(sp);
 	/*
 	 * We could in theory (for sendall) pass the length in, but we would
 	 * still have to walk the chain since we need to set up the
 	 * tail_mbuf
 	 */
 	sp->length = 0;
 	for (at = m; at; at = SCTP_BUF_NEXT(at)) {
 		if (SCTP_BUF_NEXT(at) == NULL)
 			sp->tail_mbuf = at;
 		sp->length += SCTP_BUF_LEN(at);
 	}
 	/* Use the caller-supplied key number if valid, else the active key. */
 	if (srcv->sinfo_keynumber_valid) {
 		sp->auth_keyid = srcv->sinfo_keynumber;
 	} else {
 		sp->auth_keyid = stcb->asoc.authinfo.active_keyid;
 	}
 	if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) {
 		sctp_auth_key_acquire(stcb, sp->auth_keyid);
 		sp->holds_key_ref = 1;
 	}
 	if (hold_stcb_lock == 0) {
 		SCTP_TCB_SEND_LOCK(stcb);
 	}
 	/* Account for the data and hang the entry on the stream's out queue. */
 	sctp_snd_sb_alloc(stcb, sp->length);
 	atomic_add_int(&stcb->asoc.stream_queue_cnt, 1);
 	TAILQ_INSERT_TAIL(&strm->outqueue, sp, next);
 	stcb->asoc.ss_functions.sctp_ss_add_to_stream(stcb, &stcb->asoc, strm, sp, 1);
 	/* The chain now belongs to sp; do not free it below. */
 	m = NULL;
 	if (hold_stcb_lock == 0) {
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 out_now:
 	if (m) {
 		sctp_m_freem(m);
 	}
 	return (error);
 }
 
 
 /*
  * Append the contents of clonechain to outchain and return the head of the
  * resulting chain, or NULL on failure.
  *
  * - can_take_mbuf != 0: clonechain itself is linked onto the end of
  *   outchain (no copy).
  * - otherwise, if copy_by_ref is 0 and sizeofcpy does not exceed the
  *   sctp_mbuf_threshold_count based limit, the bytes are copied with
  *   m_copydata() into the trailing space of *endofchain, allocating a new
  *   MCLBYTES-sized mbuf when there is no chain yet or the tail is full.
  * - otherwise clonechain is duplicated with SCTP_M_COPYM() and the copy is
  *   linked on.
  *
  * *endofchain is updated to point at the last mbuf of the result.  On any
  * failure outchain (if non-NULL) is freed; clonechain is not freed here.
  * endofchain itself must not be NULL.
  */
 static struct mbuf *
 sctp_copy_mbufchain(struct mbuf *clonechain,
     struct mbuf *outchain,
     struct mbuf **endofchain,
     int can_take_mbuf,
     int sizeofcpy,
     uint8_t copy_by_ref)
 {
 	struct mbuf *m;
 	struct mbuf *appendchain;
 	caddr_t cp;
 	int len;
 
 	if (endofchain == NULL) {
 		/* error */
 		/* Shared failure exit: also reached by goto from the copy path below. */
 error_out:
 		if (outchain)
 			sctp_m_freem(outchain);
 		return (NULL);
 	}
 	if (can_take_mbuf) {
 		/* Caller gives up clonechain; link it on directly. */
 		appendchain = clonechain;
 	} else {
 		if (!copy_by_ref &&
 		    (sizeofcpy <= (int)((((SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count) - 1) * MLEN) + MHLEN)))) {
 			/* Its not in a cluster */
 			if (*endofchain == NULL) {
 				/* lets get a mbuf cluster */
 				if (outchain == NULL) {
 					/* This is the general case */
 			new_mbuf:
 					outchain = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_HEADER);
 					if (outchain == NULL) {
 						goto error_out;
 					}
 					SCTP_BUF_LEN(outchain) = 0;
 					*endofchain = outchain;
 					/* get the prepend space */
 					SCTP_BUF_RESV_UF(outchain, (SCTP_FIRST_MBUF_RESV + 4));
 				} else {
 					/*
 					 * We really should not get a NULL
 					 * in endofchain
 					 */
 					/* find end */
 					m = outchain;
 					while (m) {
 						if (SCTP_BUF_NEXT(m) == NULL) {
 							*endofchain = m;
 							break;
 						}
 						m = SCTP_BUF_NEXT(m);
 					}
 					/* sanity */
 					if (*endofchain == NULL) {
 						/*
 						 * huh, TSNH XXX maybe we
 						 * should panic
 						 */
 						sctp_m_freem(outchain);
 						goto new_mbuf;
 					}
 				}
 				/* get the new end of length */
 				len = (int)M_TRAILINGSPACE(*endofchain);
 			} else {
 				/* how much is left at the end? */
 				len = (int)M_TRAILINGSPACE(*endofchain);
 			}
 			/* Find the end of the data, for appending */
 			cp = (mtod((*endofchain), caddr_t)+SCTP_BUF_LEN((*endofchain)));
 
 			/* Now lets copy it out */
 			if (len >= sizeofcpy) {
 				/* It all fits, copy it in */
 				m_copydata(clonechain, 0, sizeofcpy, cp);
 				SCTP_BUF_LEN((*endofchain)) += sizeofcpy;
 			} else {
 				/* fill up the end of the chain */
 				if (len > 0) {
 					m_copydata(clonechain, 0, len, cp);
 					SCTP_BUF_LEN((*endofchain)) += len;
 					/* now we need another one */
 					sizeofcpy -= len;
 				}
 				m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_HEADER);
 				if (m == NULL) {
 					/* We failed */
 					goto error_out;
 				}
 				/*
 				 * NOTE(review): the length of the new mbuf is only
 				 * incremented below, never reset here; this relies on
 				 * sctp_get_mbuf_for_msg() returning a zero-length
 				 * mbuf -- confirm.
 				 */
 				SCTP_BUF_NEXT((*endofchain)) = m;
 				*endofchain = m;
 				cp = mtod((*endofchain), caddr_t);
 				/* Copy the remainder, starting after the bytes placed above. */
 				m_copydata(clonechain, len, sizeofcpy, cp);
 				SCTP_BUF_LEN((*endofchain)) += sizeofcpy;
 			}
 			return (outchain);
 		} else {
 			/* copy the old fashion way */
 			appendchain = SCTP_M_COPYM(clonechain, 0, M_COPYALL, M_NOWAIT);
 #ifdef SCTP_MBUF_LOGGING
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
 				sctp_log_mbc(appendchain, SCTP_MBUF_ICOPY);
 			}
 #endif
 		}
 	}
 	if (appendchain == NULL) {
 		/* error */
 		if (outchain)
 			sctp_m_freem(outchain);
 		return (NULL);
 	}
 	if (outchain) {
 		/* tack on to the end */
 		if (*endofchain != NULL) {
 			SCTP_BUF_NEXT(((*endofchain))) = appendchain;
 		} else {
 			/* No cached tail: walk outchain to find its last mbuf. */
 			m = outchain;
 			while (m) {
 				if (SCTP_BUF_NEXT(m) == NULL) {
 					SCTP_BUF_NEXT(m) = appendchain;
 					break;
 				}
 				m = SCTP_BUF_NEXT(m);
 			}
 		}
 		/*
 		 * save off the end and update the end-chain position
 		 */
 		m = appendchain;
 		while (m) {
 			if (SCTP_BUF_NEXT(m) == NULL) {
 				*endofchain = m;
 				break;
 			}
 			m = SCTP_BUF_NEXT(m);
 		}
 		return (outchain);
 	} else {
 		/* save off the end and update the end-chain position */
 		m = appendchain;
 		while (m) {
 			if (SCTP_BUF_NEXT(m) == NULL) {
 				*endofchain = m;
 				break;
 			}
 			m = SCTP_BUF_NEXT(m);
 		}
 		return (appendchain);
 	}
 }
 
 /*
  * Forward declaration: sctp_sendall_iterator() below calls this function
  * before its definition appears.
  */
 static int
 sctp_med_chunk_output(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     struct sctp_association *asoc,
     int *num_out,
     int *reason_code,
     int control_only, int from_where,
     struct timeval *now, int *now_filled, int frag_point, int so_locked);
 
 /*
  * Per-association callback used by sctp_sendall(): apply the request saved
  * in the sctp_copy_all context (ptr) to the association stcb.
  *
  * If the context carries data (sndlen > 0) a private copy of ca->m is made
  * for this association.  Then, depending on ca->sndrcv.sinfo_flags:
  *  - SCTP_ABORT: the association is aborted, using the copy (prefixed with a
  *    parameter header) as the user-initiated abort cause;
  *  - otherwise the copy is queued with sctp_msg_append() and, if SCTP_EOF is
  *    set, shutdown is started or marked pending.
  * Finally output is kicked unless it was suppressed above, and the context's
  * cnt_sent / cnt_failed counters are updated.
  *
  * The TCB lock is asserted to be held on entry.
  */
 static void
 sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr,
     uint32_t val SCTP_UNUSED)
 {
 	struct sctp_copy_all *ca;
 	struct mbuf *m;
 	int ret = 0;
 	int added_control = 0;
 	int un_sent, do_chunk_output = 1;
 	struct sctp_association *asoc;
 	struct sctp_nets *net;
 
 	ca = (struct sctp_copy_all *)ptr;
 	if (ca->m == NULL) {
 		return;
 	}
 	if (ca->inp != inp) {
 		/* TSNH */
 		return;
 	}
 	if (ca->sndlen > 0) {
 		/* Each association gets its own copy of the message. */
 		m = SCTP_M_COPYM(ca->m, 0, M_COPYALL, M_NOWAIT);
 		if (m == NULL) {
 			/* can't copy so we are done */
 			ca->cnt_failed++;
 			return;
 		}
 #ifdef SCTP_MBUF_LOGGING
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
 			sctp_log_mbc(m, SCTP_MBUF_ICOPY);
 		}
 #endif
 	} else {
 		m = NULL;
 	}
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	if (stcb->asoc.alternate) {
 		net = stcb->asoc.alternate;
 	} else {
 		net = stcb->asoc.primary_destination;
 	}
 	if (ca->sndrcv.sinfo_flags & SCTP_ABORT) {
 		/* Abort this assoc with m as the user defined reason */
 		if (m != NULL) {
 			SCTP_BUF_PREPEND(m, sizeof(struct sctp_paramhdr), M_NOWAIT);
 		} else {
 			m = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr),
 			    0, M_NOWAIT, 1, MT_DATA);
 			/*
 			 * The allocation is M_NOWAIT and may fail; only touch
 			 * the mbuf when we actually got one.
 			 */
 			if (m != NULL) {
 				SCTP_BUF_LEN(m) = sizeof(struct sctp_paramhdr);
 			}
 		}
 		if (m != NULL) {
 			struct sctp_paramhdr *ph;
 
 			ph = mtod(m, struct sctp_paramhdr *);
 			ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT);
 			ph->param_length = htons((uint16_t)(sizeof(struct sctp_paramhdr) + ca->sndlen));
 		}
 		/*
 		 * We add one here to keep the assoc from dis-appearing on
 		 * us.
 		 */
 		atomic_add_int(&stcb->asoc.refcnt, 1);
 		sctp_abort_an_association(inp, stcb, m, SCTP_SO_NOT_LOCKED);
 		/*
 		 * sctp_abort_an_association calls sctp_free_asoc() free
 		 * association will NOT free it since we incremented the
 		 * refcnt .. we do this to prevent it being freed and things
 		 * getting tricky since we could end up (from free_asoc)
 		 * calling inpcb_free which would get a recursive lock call
 		 * to the iterator lock.. But as a consequence of that the
 		 * stcb will return to us un-locked.. since free_asoc
 		 * returns with either no TCB or the TCB unlocked, we must
 		 * relock.. to unlock in the iterator timer :-0
 		 */
 		SCTP_TCB_LOCK(stcb);
 		atomic_add_int(&stcb->asoc.refcnt, -1);
 		goto no_chunk_output;
 	} else {
 		if (m) {
 			/* The send lock is not taken by sctp_msg_append (last arg 1). */
 			ret = sctp_msg_append(stcb, net, m,
 			    &ca->sndrcv, 1);
 		}
 		asoc = &stcb->asoc;
 		if (ca->sndrcv.sinfo_flags & SCTP_EOF) {
 			/* shutdown this assoc */
 			if (TAILQ_EMPTY(&asoc->send_queue) &&
 			    TAILQ_EMPTY(&asoc->sent_queue) &&
 			    sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED) == 0) {
 				if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
 					goto abort_anyway;
 				}
 				/*
 				 * there is nothing queued to send, so I'm
 				 * done...
 				 */
 				if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) &&
 				    (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) &&
 				    (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) {
 					/*
 					 * only send SHUTDOWN the first time
 					 * through
 					 */
 					if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) {
 						SCTP_STAT_DECR_GAUGE32(sctps_currestab);
 					}
 					SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_SENT);
 					sctp_stop_timers_for_shutdown(stcb);
 					sctp_send_shutdown(stcb, net);
 					sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb,
 					    net);
 					sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb,
 					    NULL);
 					/* Only the control chunk needs to go out. */
 					added_control = 1;
 					do_chunk_output = 0;
 				}
 			} else {
 				/*
 				 * we still got (or just got) data to send,
 				 * so set SHUTDOWN_PENDING
 				 */
 				/*
 				 * XXX sockets draft says that SCTP_EOF
 				 * should be sent with no data.  currently,
 				 * we will allow user data to be sent first
 				 * and move to SHUTDOWN-PENDING
 				 */
 				if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) &&
 				    (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) &&
 				    (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) {
 					if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
 						SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_PARTIAL_MSG_LEFT);
 					}
 					SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_SHUTDOWN_PENDING);
 					if (TAILQ_EMPTY(&asoc->send_queue) &&
 					    TAILQ_EMPTY(&asoc->sent_queue) &&
 					    (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) {
 						struct mbuf *op_err;
 						char msg[SCTP_DIAG_INFO_LEN];
 
 				abort_anyway:
 						SCTP_SNPRINTF(msg, sizeof(msg),
 						    "%s:%d at %s", __FILE__, __LINE__, __func__);
 						op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 						    msg);
 						atomic_add_int(&stcb->asoc.refcnt, 1);
 						sctp_abort_an_association(stcb->sctp_ep, stcb,
 						    op_err, SCTP_SO_NOT_LOCKED);
 						atomic_add_int(&stcb->asoc.refcnt, -1);
 						goto no_chunk_output;
 					}
 					sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb,
 					    NULL);
 				}
 			}
 
 		}
 	}
 	un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) +
 	    (stcb->asoc.stream_queue_cnt * SCTP_DATA_CHUNK_OVERHEAD(stcb)));
 
 	/*
 	 * With NODELAY off, data in flight and less than one MTU's worth
 	 * waiting, hold the output back.
 	 */
 	if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) &&
 	    (stcb->asoc.total_flight > 0) &&
 	    (un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD))) {
 		do_chunk_output = 0;
 	}
 	if (do_chunk_output)
 		sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_NOT_LOCKED);
 	else if (added_control) {
 		int num_out, reason, now_filled = 0;
 		struct timeval now;
 		int frag_point;
 
 		frag_point = sctp_get_frag_point(stcb, &stcb->asoc);
 		(void)sctp_med_chunk_output(inp, stcb, &stcb->asoc, &num_out,
 		    &reason, 1, 1, &now, &now_filled, frag_point, SCTP_SO_NOT_LOCKED);
 	}
 no_chunk_output:
 	if (ret) {
 		ca->cnt_failed++;
 	} else {
 		ca->cnt_sent++;
 	}
 }
 
 /*
  * Completion callback for a sendall run: releases the sctp_copy_all context
  * passed as ptr.
  *
  * Clears SCTP_PCB_FLAGS_SND_ITERATOR_UP on the owning endpoint (if one is
  * recorded) so that another sendall may start, then frees the saved mbuf
  * chain and the context itself.  No notification is raised for failed
  * sends: the only failures are memory shortages, in which case a
  * notification mbuf could not be obtained either.
  */
 static void
 sctp_sendall_completes(void *ptr, uint32_t val SCTP_UNUSED)
 {
 	struct sctp_copy_all *copy_ctx = (struct sctp_copy_all *)ptr;
 
 	if (copy_ctx->inp != NULL) {
 		/* Let the next sendall on this endpoint proceed. */
 		copy_ctx->inp->sctp_flags &= ~SCTP_PCB_FLAGS_SND_ITERATOR_UP;
 	}
 	/* Drop the message template, then the context that held it. */
 	sctp_m_freem(copy_ctx->m);
 	SCTP_FREE(copy_ctx, SCTP_M_COPYAL);
 }
 
 /*
  * Copy len bytes described by uio into a freshly allocated mbuf chain.
  *
  * The first mbuf is requested with MCLBYTES of room; each further mbuf is
  * requested for the number of bytes still outstanding, and every mbuf is
  * filled up to its trailing space.
  *
  * Returns the head of the new chain, or NULL if an mbuf could not be
  * obtained or uiomove() reported an error.  On failure the whole partially
  * built chain is released.
  */
 static struct mbuf *
 sctp_copy_out_all(struct uio *uio, ssize_t len)
 {
 	struct mbuf *ret, *at;
 	ssize_t left, willcpy, cancpy, error;
 
 	ret = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_WAITOK, 1, MT_DATA);
 	if (ret == NULL) {
 		/* TSNH */
 		return (NULL);
 	}
 	left = len;
 	SCTP_BUF_LEN(ret) = 0;
 	/* How much fits into the first mbuf? */
 	cancpy = (int)M_TRAILINGSPACE(ret);
 	willcpy = min(cancpy, left);
 	at = ret;
 	while (left > 0) {
 		/* Fill the current mbuf from the start of its data area. */
 		error = uiomove(mtod(at, caddr_t), (int)willcpy, uio);
 		if (error) {
 			goto err_out_now;
 		}
 		SCTP_BUF_LEN(at) = (int)willcpy;
 		SCTP_BUF_NEXT_PKT(at) = SCTP_BUF_NEXT(at) = 0;
 		left -= willcpy;
 		if (left > 0) {
 			SCTP_BUF_NEXT(at) = sctp_get_mbuf_for_msg((unsigned int)left, 0, M_WAITOK, 1, MT_DATA);
 			if (SCTP_BUF_NEXT(at) == NULL) {
 				goto err_out_now;
 			}
 			at = SCTP_BUF_NEXT(at);
 			SCTP_BUF_LEN(at) = 0;
 			cancpy = (int)M_TRAILINGSPACE(at);
 			willcpy = min(cancpy, left);
 		}
 	}
 	return (ret);
 
 err_out_now:
 	/*
 	 * Free from the head: "at" is linked into the chain that starts at
 	 * "ret", so freeing only "at" would leak every earlier mbuf and leave
 	 * the last of them pointing at freed memory.
 	 */
 	sctp_m_freem(ret);
 	return (NULL);
 }
 
 /*
  * Start a "send to all associations" operation on the endpoint inp.
  *
  * The send parameters from srcv (with SCTP_SENDALL cleared) and a copy of
  * the user data described by uio are stored in a sctp_copy_all context, and
  * an iterator is started that runs sctp_sendall_iterator() on each
  * association and sctp_sendall_completes() at the end.
  *
  * Returns 0 when the iterator was started, EBUSY if a sendall is already in
  * progress on this endpoint, EMSGSIZE if the data exceeds the
  * sctp_sendall_limit sysctl, ENOMEM on allocation/copy failure, and EFAULT
  * if the iterator could not be started.
  *
  * NOTE(review): when uio is NULL only the length of m is computed; m is
  * never stored in the context, so the iterator (which returns early when
  * ca->m is NULL) will not send it.  Ownership of m on the early-return paths
  * also differs (freed on the first ENOMEM, untouched otherwise) -- confirm
  * the intended contract against the caller.
  */
 static int
 sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m,
     struct sctp_sndrcvinfo *srcv)
 {
 	int ret;
 	struct sctp_copy_all *ca;
 
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_SND_ITERATOR_UP) {
 		/* There is another. */
 		return (EBUSY);
 	}
 	/*
 	 * uio is treated as optional further down, so it must not be
 	 * dereferenced unconditionally here.
 	 */
 	if ((uio != NULL) &&
 	    (uio->uio_resid > (ssize_t)SCTP_BASE_SYSCTL(sctp_sendall_limit))) {
 		/* You must not be larger than the limit! */
 		return (EMSGSIZE);
 	}
 	SCTP_MALLOC(ca, struct sctp_copy_all *, sizeof(struct sctp_copy_all),
 	    SCTP_M_COPYAL);
 	if (ca == NULL) {
 		sctp_m_freem(m);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 		return (ENOMEM);
 	}
 	memset(ca, 0, sizeof(struct sctp_copy_all));
 
 	ca->inp = inp;
 	if (srcv) {
 		memcpy(&ca->sndrcv, srcv, sizeof(struct sctp_nonpad_sndrcvinfo));
 	}
 	/*
 	 * take off the sendall flag, it would be bad if we failed to do
 	 * this :-0
 	 */
 	ca->sndrcv.sinfo_flags &= ~SCTP_SENDALL;
 	/* get length and mbuf chain */
 	if (uio) {
 		ca->sndlen = uio->uio_resid;
 		ca->m = sctp_copy_out_all(uio, ca->sndlen);
 		if (ca->m == NULL) {
 			SCTP_FREE(ca, SCTP_M_COPYAL);
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 			return (ENOMEM);
 		}
 	} else {
 		/* Gather the length of the send */
 		struct mbuf *mat;
 
 		ca->sndlen = 0;
 		for (mat = m; mat; mat = SCTP_BUF_NEXT(mat)) {
 			ca->sndlen += SCTP_BUF_LEN(mat);
 		}
 	}
 	/* Mark the endpoint busy; cleared by the completion callback. */
 	inp->sctp_flags |= SCTP_PCB_FLAGS_SND_ITERATOR_UP;
 	ret = sctp_initiate_iterator(NULL, sctp_sendall_iterator, NULL,
 	    SCTP_PCB_ANY_FLAGS, SCTP_PCB_ANY_FEATURES,
 	    SCTP_ASOC_ANY_STATE,
 	    (void *)ca, 0,
 	    sctp_sendall_completes, inp, 1);
 	if (ret) {
 		inp->sctp_flags &= ~SCTP_PCB_FLAGS_SND_ITERATOR_UP;
 		/*
 		 * The completion callback will not run, so the chain copied
 		 * above must be released here along with the context.
 		 */
 		if (ca->m != NULL) {
 			sctp_m_freem(ca->m);
 		}
 		SCTP_FREE(ca, SCTP_M_COPYAL);
 		SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EFAULT);
 		return (EFAULT);
 	}
 	return (0);
 }
 
 
 /*
  * Remove every COOKIE-ECHO chunk from the association's control send queue,
  * releasing the chunk's data and the chunk itself and keeping
  * ctrl_queue_cnt in step.
  */
 void
 sctp_toss_old_cookies(struct sctp_tcb *stcb, struct sctp_association *asoc)
 {
 	struct sctp_tmit_chunk *cur, *following;
 
 	TAILQ_FOREACH_SAFE(cur, &asoc->control_send_queue, sctp_next, following) {
 		if (cur->rec.chunk_id.id != SCTP_COOKIE_ECHO) {
 			/* Not a cookie; leave it queued. */
 			continue;
 		}
 		TAILQ_REMOVE(&asoc->control_send_queue, cur, sctp_next);
 		asoc->ctrl_queue_cnt--;
 		if (cur->data != NULL) {
 			sctp_m_freem(cur->data);
 			cur->data = NULL;
 		}
 		sctp_free_a_chunk(stcb, cur, SCTP_SO_NOT_LOCKED);
 	}
 }
 
 /*
  * Walk the ASCONF send queue from the front and discard ASCONF chunks until
  * one is found whose serial number is greater than asconf_seq_out_acked
  * (i.e. not acknowledged yet); the walk stops there.  Chunks of other types
  * are skipped.  An ASCONF chunk without data is discarded as well.
  */
 void
 sctp_toss_old_asconf(struct sctp_tcb *stcb)
 {
 	struct sctp_association *assoc = &stcb->asoc;
 	struct sctp_tmit_chunk *cur, *following;
 	struct sctp_asconf_chunk *hdr;
 
 	TAILQ_FOREACH_SAFE(cur, &assoc->asconf_send_queue, sctp_next, following) {
 		if (cur->rec.chunk_id.id != SCTP_ASCONF) {
 			continue;
 		}
 		if (cur->data != NULL) {
 			hdr = mtod(cur->data, struct sctp_asconf_chunk *);
 			if (SCTP_TSN_GT(ntohl(hdr->serial_number), assoc->asconf_seq_out_acked)) {
 				/* First chunk still awaiting its ack: done. */
 				break;
 			}
 		}
 		TAILQ_REMOVE(&assoc->asconf_send_queue, cur, sctp_next);
 		assoc->ctrl_queue_cnt--;
 		if (cur->data != NULL) {
 			sctp_m_freem(cur->data);
 			cur->data = NULL;
 		}
 		sctp_free_a_chunk(stcb, cur, SCTP_SO_NOT_LOCKED);
 	}
 }
 
 
 /*
  * Move the first bundle_at chunks of data_list from the association's send
  * queue to its sent queue after they have been transmitted to net.
  *
  * For each chunk this records the send time and congestion window, assigns
  * net as destination if none was set (taking a reference), inserts the chunk
  * into the sent queue in TSN order, marks it SCTP_DATAGRAM_SENT, updates the
  * flight-size accounting and reduces the peer's receive window estimate.
  * Afterwards the congestion-control module's "packet transmitted" hook is
  * invoked if one is installed.
  */
 static void
 sctp_clean_up_datalist(struct sctp_tcb *stcb,
     struct sctp_association *asoc,
     struct sctp_tmit_chunk **data_list,
     int bundle_at,
     struct sctp_nets *net)
 {
 	int i;
 	struct sctp_tmit_chunk *tp1;
 
 	for (i = 0; i < bundle_at; i++) {
 		/* off of the send queue */
 		TAILQ_REMOVE(&asoc->send_queue, data_list[i], sctp_next);
 		asoc->send_queue_cnt--;
 		if (i > 0) {
 			/*
 			 * Only chunk 0 may keep its do_rtt setting (decided
 			 * elsewhere, depending on whether an RTO measurement
 			 * is needed); clear it for every other chunk.
 			 */
 			data_list[i]->do_rtt = 0;
 		}
 		/* record time */
 		data_list[i]->sent_rcv_time = net->last_sent_time;
 		data_list[i]->rec.data.cwnd_at_send = net->cwnd;
 		data_list[i]->rec.data.fast_retran_tsn = data_list[i]->rec.data.tsn;
 		if (data_list[i]->whoTo == NULL) {
 			data_list[i]->whoTo = net;
 			atomic_add_int(&net->ref_count, 1);
 		}
 		/* on to the sent queue */
 		tp1 = TAILQ_LAST(&asoc->sent_queue, sctpchunk_listhead);
 		if ((tp1) && SCTP_TSN_GT(tp1->rec.data.tsn, data_list[i]->rec.data.tsn)) {
 			struct sctp_tmit_chunk *tpp;
 
 			/*
 			 * The tail has a larger TSN: walk backwards until an
 			 * entry that is not greater is found, or the head
 			 * is reached.
 			 */
 	back_up_more:
 			tpp = TAILQ_PREV(tp1, sctpchunk_listhead, sctp_next);
 			if (tpp == NULL) {
 				/* Reached the head; the chunk goes first. */
 				TAILQ_INSERT_BEFORE(tp1, data_list[i], sctp_next);
 				goto all_done;
 			}
 			tp1 = tpp;
 			if (SCTP_TSN_GT(tp1->rec.data.tsn, data_list[i]->rec.data.tsn)) {
 				goto back_up_more;
 			}
 			TAILQ_INSERT_AFTER(&asoc->sent_queue, tp1, data_list[i], sctp_next);
 		} else {
 			TAILQ_INSERT_TAIL(&asoc->sent_queue,
 			    data_list[i],
 			    sctp_next);
 		}
 all_done:
 		/* This does not lower until the cum-ack passes it */
 		asoc->sent_queue_cnt++;
 		if ((asoc->peers_rwnd <= 0) &&
 		    (asoc->total_flight == 0) &&
 		    (bundle_at == 1)) {
 			/* Count this transmission as a window probe */
 			SCTP_STAT_INCR(sctps_windowprobed);
 		}
 #ifdef SCTP_AUDITING_ENABLED
 		sctp_audit_log(0xC2, 3);
 #endif
 		data_list[i]->sent = SCTP_DATAGRAM_SENT;
 		data_list[i]->snd_count = 1;
 		data_list[i]->rec.data.chunk_was_revoked = 0;
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
 			sctp_misc_ints(SCTP_FLIGHT_LOG_UP,
 			    data_list[i]->whoTo->flight_size,
 			    data_list[i]->book_size,
 			    (uint32_t)(uintptr_t)data_list[i]->whoTo,
 			    data_list[i]->rec.data.tsn);
 		}
 		sctp_flight_size_increase(data_list[i]);
 		sctp_total_flight_increase(stcb, data_list[i]);
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) {
 			sctp_log_rwnd(SCTP_DECREASE_PEER_RWND,
 			    asoc->peers_rwnd, data_list[i]->send_size, SCTP_BASE_SYSCTL(sctp_peer_chunk_oh));
 		}
 		/* Charge the chunk plus per-chunk overhead against the peer's window. */
 		asoc->peers_rwnd = sctp_sbspace_sub(asoc->peers_rwnd,
 		    (uint32_t)(data_list[i]->send_size + SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)));
 		if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) {
 			/* SWS sender side engages */
 			asoc->peers_rwnd = 0;
 		}
 	}
 	if (asoc->cc_functions.sctp_cwnd_update_packet_transmitted) {
 		(*asoc->cc_functions.sctp_cwnd_update_packet_transmitted) (stcb, net);
 	}
 }
 
 /*
  * Purge stray chunks from the association's control send queue.
  *
  * Chunks of the types listed in the switch below are always removed.  A
  * STREAM-RESET chunk is removed unless it is the one the association
  * currently tracks in asoc->str_reset.  All other chunk types are left
  * alone.  Removal frees the chunk's data and the chunk, and keeps
  * ctrl_queue_cnt (and fwd_tsn_cnt for FORWARD-TSN chunks) in step.
  */
 static void
 sctp_clean_up_ctl(struct sctp_tcb *stcb, struct sctp_association *asoc, int so_locked)
 {
 	struct sctp_tmit_chunk *cur, *following;
 	int discard;
 
 	TAILQ_FOREACH_SAFE(cur, &asoc->control_send_queue, sctp_next, following) {
 		switch (cur->rec.chunk_id.id) {
 		case SCTP_SELECTIVE_ACK:
 		case SCTP_NR_SELECTIVE_ACK:
 		case SCTP_HEARTBEAT_REQUEST:
 		case SCTP_HEARTBEAT_ACK:
 		case SCTP_FORWARD_CUM_TSN:
 		case SCTP_SHUTDOWN:
 		case SCTP_SHUTDOWN_ACK:
 		case SCTP_OPERATION_ERROR:
 		case SCTP_PACKET_DROPPED:
 		case SCTP_COOKIE_ACK:
 		case SCTP_ECN_CWR:
 		case SCTP_ASCONF_ACK:
 			/* Stray chunks must be cleaned up */
 			discard = 1;
 			break;
 		case SCTP_STREAM_RESET:
 			/* Keep only the reset request currently being tracked. */
 			discard = (cur != asoc->str_reset);
 			break;
 		default:
 			discard = 0;
 			break;
 		}
 		if (!discard) {
 			continue;
 		}
 		TAILQ_REMOVE(&asoc->control_send_queue, cur, sctp_next);
 		asoc->ctrl_queue_cnt--;
 		if (cur->data != NULL) {
 			sctp_m_freem(cur->data);
 			cur->data = NULL;
 		}
 		if (cur->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) {
 			asoc->fwd_tsn_cnt--;
 		}
 		sctp_free_a_chunk(stcb, cur, so_locked);
 	}
 }
 
 /*
  * Decide how many bytes of a not-yet-complete message may be taken now.
  *
  * length is the number of bytes currently queued for the message,
  * space_left the room available and frag_point the fragmentation point.
  * Returns the number of bytes to take, or 0 to take nothing for now.
  *
  * With explicit EOR (eeor_on): if everything queued fits, it is taken only
  * when nothing is in flight (otherwise wait for more data; the next SACK
  * gives another chance); if it does not fit, the remaining space is filled.
  *
  * Without explicit EOR: if the socket send buffer limit is below the
  * fragmentation point the message can never be queued whole, so all of it
  * is taken.  Otherwise nothing is taken when it fits entirely or when the
  * part left behind would be smaller than the sctp_min_residual sysctl; a
  * split is allowed only when the available space reaches the smaller of
  * the sctp_min_split_point sysctl and frag_point.
  */
 static uint32_t
 sctp_can_we_split_this(struct sctp_tcb *stcb, uint32_t length,
     uint32_t space_left, uint32_t frag_point, int eeor_on)
 {
 	if (eeor_on) {
 		if (space_left < length) {
 			/* You can fill the rest */
 			return (space_left);
 		}
 		if (stcb->asoc.total_flight == 0) {
 			/* Nothing outstanding: send what the user gave us. */
 			return (length);
 		}
 		return (0);
 	}
 	if (SCTP_SB_LIMIT_SND(stcb->sctp_socket) < frag_point) {
 		/* Send buffer smaller than a fragment: splitting is unavoidable. */
 		return (length);
 	}
 	if (length <= space_left) {
 		return (0);
 	}
 	if ((length - space_left) < SCTP_BASE_SYSCTL(sctp_min_residual)) {
 		/* The leftover would be too small to be worth a split. */
 		return (0);
 	}
 	/* From here on length exceeds space_left. */
 	if (space_left < min(SCTP_BASE_SYSCTL(sctp_min_split_point), frag_point)) {
 		/* Nope, can't split */
 		return (0);
 	}
 	/* Its ok to split it */
 	return (min(space_left, frag_point));
 }
 
 static uint32_t
 sctp_move_to_outqueue(struct sctp_tcb *stcb,
     struct sctp_stream_out *strq,
     uint32_t space_left,
     uint32_t frag_point,
     int *giveup,
     int eeor_mode,
     int *bail,
     int so_locked)
 {
 	/* Move from the stream to the send_queue keeping track of the total */
 	struct sctp_association *asoc;
 	struct sctp_stream_queue_pending *sp;
 	struct sctp_tmit_chunk *chk;
 	struct sctp_data_chunk *dchkh = NULL;
 	struct sctp_idata_chunk *ndchkh = NULL;
 	uint32_t to_move, length;
 	int leading;
 	uint8_t rcv_flags = 0;
 	uint8_t some_taken;
 	uint8_t send_lock_up = 0;
 
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	asoc = &stcb->asoc;
 one_more_time:
 	/* sa_ignore FREED_MEMORY */
 	sp = TAILQ_FIRST(&strq->outqueue);
 	if (sp == NULL) {
 		if (send_lock_up == 0) {
 			SCTP_TCB_SEND_LOCK(stcb);
 			send_lock_up = 1;
 		}
 		sp = TAILQ_FIRST(&strq->outqueue);
 		if (sp) {
 			goto one_more_time;
 		}
 		if ((sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_EXPLICIT_EOR) == 0) &&
 		    (stcb->asoc.idata_supported == 0) &&
 		    (strq->last_msg_incomplete)) {
 			SCTP_PRINTF("Huh? Stream:%d lm_in_c=%d but queue is NULL\n",
 			    strq->sid,
 			    strq->last_msg_incomplete);
 			strq->last_msg_incomplete = 0;
 		}
 		to_move = 0;
 		if (send_lock_up) {
 			SCTP_TCB_SEND_UNLOCK(stcb);
 			send_lock_up = 0;
 		}
 		goto out_of;
 	}
 	if ((sp->msg_is_complete) && (sp->length == 0)) {
 		if (sp->sender_all_done) {
 			/*
 			 * We are doing deferred cleanup. Last time through
 			 * when we took all the data the sender_all_done was
 			 * not set.
 			 */
 			if ((sp->put_last_out == 0) && (sp->discard_rest == 0)) {
 				SCTP_PRINTF("Gak, put out entire msg with NO end!-1\n");
 				SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d send_lock:%d\n",
 				    sp->sender_all_done,
 				    sp->length,
 				    sp->msg_is_complete,
 				    sp->put_last_out,
 				    send_lock_up);
 			}
 			if ((TAILQ_NEXT(sp, next) == NULL) && (send_lock_up == 0)) {
 				SCTP_TCB_SEND_LOCK(stcb);
 				send_lock_up = 1;
 			}
 			atomic_subtract_int(&asoc->stream_queue_cnt, 1);
 			TAILQ_REMOVE(&strq->outqueue, sp, next);
 			stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, strq, sp, send_lock_up);
 			if ((strq->state == SCTP_STREAM_RESET_PENDING) &&
 			    (strq->chunks_on_queues == 0) &&
 			    TAILQ_EMPTY(&strq->outqueue)) {
 				stcb->asoc.trigger_reset = 1;
 			}
 			if (sp->net) {
 				sctp_free_remote_addr(sp->net);
 				sp->net = NULL;
 			}
 			if (sp->data) {
 				sctp_m_freem(sp->data);
 				sp->data = NULL;
 			}
 			sctp_free_a_strmoq(stcb, sp, so_locked);
 			/* we can't be locked to it */
 			if (send_lock_up) {
 				SCTP_TCB_SEND_UNLOCK(stcb);
 				send_lock_up = 0;
 			}
 			/* back to get the next msg */
 			goto one_more_time;
 		} else {
 			/*
 			 * sender just finished this but still holds a
 			 * reference
 			 */
 			*giveup = 1;
 			to_move = 0;
 			goto out_of;
 		}
 	} else {
 		/* is there some to get */
 		if (sp->length == 0) {
 			/* no */
 			*giveup = 1;
 			to_move = 0;
 			goto out_of;
 		} else if (sp->discard_rest) {
 			if (send_lock_up == 0) {
 				SCTP_TCB_SEND_LOCK(stcb);
 				send_lock_up = 1;
 			}
 			/* Whack down the size */
 			atomic_subtract_int(&stcb->asoc.total_output_queue_size, sp->length);
 			if ((stcb->sctp_socket != NULL) &&
 			    ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 			    (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) {
 				atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_cc, sp->length);
 			}
 			if (sp->data) {
 				sctp_m_freem(sp->data);
 				sp->data = NULL;
 				sp->tail_mbuf = NULL;
 			}
 			sp->length = 0;
 			sp->some_taken = 1;
 			*giveup = 1;
 			to_move = 0;
 			goto out_of;
 		}
 	}
 	some_taken = sp->some_taken;
 re_look:
 	length = sp->length;
 	if (sp->msg_is_complete) {
 		/* The message is complete */
 		to_move = min(length, frag_point);
 		if (to_move == length) {
 			/* All of it fits in the MTU */
 			if (sp->some_taken) {
 				rcv_flags |= SCTP_DATA_LAST_FRAG;
 			} else {
 				rcv_flags |= SCTP_DATA_NOT_FRAG;
 			}
 			sp->put_last_out = 1;
 			if (sp->sinfo_flags & SCTP_SACK_IMMEDIATELY) {
 				rcv_flags |= SCTP_DATA_SACK_IMMEDIATELY;
 			}
 		} else {
 			/* Not all of it fits, we fragment */
 			if (sp->some_taken == 0) {
 				rcv_flags |= SCTP_DATA_FIRST_FRAG;
 			}
 			sp->some_taken = 1;
 		}
 	} else {
 		to_move = sctp_can_we_split_this(stcb, length, space_left, frag_point, eeor_mode);
 		if (to_move) {
 			/*-
 			 * We use a snapshot of length in case it
 			 * is expanding during the compare.
 			 */
 			uint32_t llen;
 
 			llen = length;
 			if (to_move >= llen) {
 				to_move = llen;
 				if (send_lock_up == 0) {
 					/*-
 					 * We are taking all of an incomplete msg
 					 * thus we need a send lock.
 					 */
 					SCTP_TCB_SEND_LOCK(stcb);
 					send_lock_up = 1;
 					if (sp->msg_is_complete) {
 						/*
 						 * the sender finished the
 						 * msg
 						 */
 						goto re_look;
 					}
 				}
 			}
 			if (sp->some_taken == 0) {
 				rcv_flags |= SCTP_DATA_FIRST_FRAG;
 				sp->some_taken = 1;
 			}
 		} else {
 			/* Nothing to take. */
 			*giveup = 1;
 			to_move = 0;
 			goto out_of;
 		}
 	}
 
 	/* If we reach here, we can copy out a chunk */
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		/* No chunk memory */
 		*giveup = 1;
 		to_move = 0;
 		goto out_of;
 	}
 	/*
 	 * Setup for unordered if needed by looking at the user sent info
 	 * flags.
 	 */
 	if (sp->sinfo_flags & SCTP_UNORDERED) {
 		rcv_flags |= SCTP_DATA_UNORDERED;
 	}
 	if (SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) &&
 	    (sp->sinfo_flags & SCTP_EOF) == SCTP_EOF) {
 		rcv_flags |= SCTP_DATA_SACK_IMMEDIATELY;
 	}
 	/* clear out the chunk before setting up */
 	memset(chk, 0, sizeof(*chk));
 	chk->rec.data.rcv_flags = rcv_flags;
 
 	if (to_move >= length) {
 		/* we think we can steal the whole thing */
 		if ((sp->sender_all_done == 0) && (send_lock_up == 0)) {
 			SCTP_TCB_SEND_LOCK(stcb);
 			send_lock_up = 1;
 		}
 		if (to_move < sp->length) {
 			/* bail, it changed */
 			goto dont_do_it;
 		}
 		chk->data = sp->data;
 		chk->last_mbuf = sp->tail_mbuf;
 		/* register the stealing */
 		sp->data = sp->tail_mbuf = NULL;
 	} else {
 		struct mbuf *m;
 
 dont_do_it:
 		chk->data = SCTP_M_COPYM(sp->data, 0, to_move, M_NOWAIT);
 		chk->last_mbuf = NULL;
 		if (chk->data == NULL) {
 			sp->some_taken = some_taken;
 			sctp_free_a_chunk(stcb, chk, so_locked);
 			*bail = 1;
 			to_move = 0;
 			goto out_of;
 		}
 #ifdef SCTP_MBUF_LOGGING
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
 			sctp_log_mbc(chk->data, SCTP_MBUF_ICOPY);
 		}
 #endif
 		/* Pull off the data */
 		m_adj(sp->data, to_move);
 		/* Now lets work our way down and compact it */
 		m = sp->data;
 		while (m && (SCTP_BUF_LEN(m) == 0)) {
 			sp->data = SCTP_BUF_NEXT(m);
 			SCTP_BUF_NEXT(m) = NULL;
 			if (sp->tail_mbuf == m) {
 				/*-
 				 * Freeing tail? TSNH since
 				 * we supposedly were taking less
 				 * than the sp->length.
 				 */
 #ifdef INVARIANTS
 				panic("Huh, freing tail? - TSNH");
 #else
 				SCTP_PRINTF("Huh, freeing tail? - TSNH\n");
 				sp->tail_mbuf = sp->data = NULL;
 				sp->length = 0;
 #endif
 
 			}
 			sctp_m_free(m);
 			m = sp->data;
 		}
 	}
 	if (SCTP_BUF_IS_EXTENDED(chk->data)) {
 		chk->copy_by_ref = 1;
 	} else {
 		chk->copy_by_ref = 0;
 	}
 	/*
 	 * get last_mbuf and counts of mb usage This is ugly but hopefully
 	 * its only one mbuf.
 	 */
 	if (chk->last_mbuf == NULL) {
 		chk->last_mbuf = chk->data;
 		while (SCTP_BUF_NEXT(chk->last_mbuf) != NULL) {
 			chk->last_mbuf = SCTP_BUF_NEXT(chk->last_mbuf);
 		}
 	}
 
 	if (to_move > length) {
 		/*- This should not happen either
 		 * since we always lower to_move to the size
 		 * of sp->length if its larger.
 		 */
 #ifdef INVARIANTS
 		panic("Huh, how can to_move be larger?");
 #else
 		SCTP_PRINTF("Huh, how can to_move be larger?\n");
 		sp->length = 0;
 #endif
 	} else {
 		atomic_subtract_int(&sp->length, to_move);
 	}
 	leading = SCTP_DATA_CHUNK_OVERHEAD(stcb);
 	if (M_LEADINGSPACE(chk->data) < leading) {
 		/* Not enough room for a chunk header, get some */
 		struct mbuf *m;
 
 		m = sctp_get_mbuf_for_msg(1, 0, M_NOWAIT, 1, MT_DATA);
 		if (m == NULL) {
 			/*
 			 * we're in trouble here. _PREPEND below will free
 			 * all the data if there is no leading space, so we
 			 * must put the data back and restore.
 			 */
 			if (send_lock_up == 0) {
 				SCTP_TCB_SEND_LOCK(stcb);
 				send_lock_up = 1;
 			}
 			if (sp->data == NULL) {
 				/* unsteal the data */
 				sp->data = chk->data;
 				sp->tail_mbuf = chk->last_mbuf;
 			} else {
 				struct mbuf *m_tmp;
 
 				/* reassemble the data */
 				m_tmp = sp->data;
 				sp->data = chk->data;
 				SCTP_BUF_NEXT(chk->last_mbuf) = m_tmp;
 			}
 			sp->some_taken = some_taken;
 			atomic_add_int(&sp->length, to_move);
 			chk->data = NULL;
 			*bail = 1;
 			sctp_free_a_chunk(stcb, chk, so_locked);
 			to_move = 0;
 			goto out_of;
 		} else {
 			SCTP_BUF_LEN(m) = 0;
 			SCTP_BUF_NEXT(m) = chk->data;
 			chk->data = m;
 			M_ALIGN(chk->data, 4);
 		}
 	}
 	SCTP_BUF_PREPEND(chk->data, SCTP_DATA_CHUNK_OVERHEAD(stcb), M_NOWAIT);
 	if (chk->data == NULL) {
 		/* HELP, TSNH since we assured it would not above? */
 #ifdef INVARIANTS
 		panic("prepend failes HELP?");
 #else
 		SCTP_PRINTF("prepend fails HELP?\n");
 		sctp_free_a_chunk(stcb, chk, so_locked);
 #endif
 		*bail = 1;
 		to_move = 0;
 		goto out_of;
 	}
 	sctp_snd_sb_alloc(stcb, SCTP_DATA_CHUNK_OVERHEAD(stcb));
 	chk->book_size = chk->send_size = (uint16_t)(to_move + SCTP_DATA_CHUNK_OVERHEAD(stcb));
 	chk->book_size_scale = 0;
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 
 	chk->flags = 0;
 	chk->asoc = &stcb->asoc;
 	chk->pad_inplace = 0;
 	chk->no_fr_allowed = 0;
 	if (stcb->asoc.idata_supported == 0) {
 		if (rcv_flags & SCTP_DATA_UNORDERED) {
 			/* Just use 0. The receiver ignores the values. */
 			chk->rec.data.mid = 0;
 		} else {
 			chk->rec.data.mid = strq->next_mid_ordered;
 			if (rcv_flags & SCTP_DATA_LAST_FRAG) {
 				strq->next_mid_ordered++;
 			}
 		}
 	} else {
 		if (rcv_flags & SCTP_DATA_UNORDERED) {
 			chk->rec.data.mid = strq->next_mid_unordered;
 			if (rcv_flags & SCTP_DATA_LAST_FRAG) {
 				strq->next_mid_unordered++;
 			}
 		} else {
 			chk->rec.data.mid = strq->next_mid_ordered;
 			if (rcv_flags & SCTP_DATA_LAST_FRAG) {
 				strq->next_mid_ordered++;
 			}
 		}
 	}
 	chk->rec.data.sid = sp->sid;
 	chk->rec.data.ppid = sp->ppid;
 	chk->rec.data.context = sp->context;
 	chk->rec.data.doing_fast_retransmit = 0;
 
 	chk->rec.data.timetodrop = sp->ts;
 	chk->flags = sp->act_flags;
 
 	if (sp->net) {
 		chk->whoTo = sp->net;
 		atomic_add_int(&chk->whoTo->ref_count, 1);
 	} else
 		chk->whoTo = NULL;
 
 	if (sp->holds_key_ref) {
 		chk->auth_keyid = sp->auth_keyid;
 		sctp_auth_key_acquire(stcb, chk->auth_keyid);
 		chk->holds_key_ref = 1;
 	}
 	chk->rec.data.tsn = atomic_fetchadd_int(&asoc->sending_seq, 1);
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_AT_SEND_2_OUTQ) {
 		sctp_misc_ints(SCTP_STRMOUT_LOG_SEND,
 		    (uint32_t)(uintptr_t)stcb, sp->length,
 		    (uint32_t)((chk->rec.data.sid << 16) | (0x0000ffff & chk->rec.data.mid)),
 		    chk->rec.data.tsn);
 	}
 	if (stcb->asoc.idata_supported == 0) {
 		dchkh = mtod(chk->data, struct sctp_data_chunk *);
 	} else {
 		ndchkh = mtod(chk->data, struct sctp_idata_chunk *);
 	}
 	/*
 	 * Put the rest of the things in place now. Size was done earlier in
 	 * previous loop prior to padding.
 	 */
 
 #ifdef SCTP_ASOCLOG_OF_TSNS
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	if (asoc->tsn_out_at >= SCTP_TSN_LOG_SIZE) {
 		asoc->tsn_out_at = 0;
 		asoc->tsn_out_wrapped = 1;
 	}
 	asoc->out_tsnlog[asoc->tsn_out_at].tsn = chk->rec.data.tsn;
 	asoc->out_tsnlog[asoc->tsn_out_at].strm = chk->rec.data.sid;
 	asoc->out_tsnlog[asoc->tsn_out_at].seq = chk->rec.data.mid;
 	asoc->out_tsnlog[asoc->tsn_out_at].sz = chk->send_size;
 	asoc->out_tsnlog[asoc->tsn_out_at].flgs = chk->rec.data.rcv_flags;
 	asoc->out_tsnlog[asoc->tsn_out_at].stcb = (void *)stcb;
 	asoc->out_tsnlog[asoc->tsn_out_at].in_pos = asoc->tsn_out_at;
 	asoc->out_tsnlog[asoc->tsn_out_at].in_out = 2;
 	asoc->tsn_out_at++;
 #endif
 	if (stcb->asoc.idata_supported == 0) {
 		dchkh->ch.chunk_type = SCTP_DATA;
 		dchkh->ch.chunk_flags = chk->rec.data.rcv_flags;
 		dchkh->dp.tsn = htonl(chk->rec.data.tsn);
 		dchkh->dp.sid = htons(strq->sid);
 		dchkh->dp.ssn = htons((uint16_t)chk->rec.data.mid);
 		dchkh->dp.ppid = chk->rec.data.ppid;
 		dchkh->ch.chunk_length = htons(chk->send_size);
 	} else {
 		ndchkh->ch.chunk_type = SCTP_IDATA;
 		ndchkh->ch.chunk_flags = chk->rec.data.rcv_flags;
 		ndchkh->dp.tsn = htonl(chk->rec.data.tsn);
 		ndchkh->dp.sid = htons(strq->sid);
 		ndchkh->dp.reserved = htons(0);
 		ndchkh->dp.mid = htonl(chk->rec.data.mid);
 		if (sp->fsn == 0)
 			ndchkh->dp.ppid_fsn.ppid = chk->rec.data.ppid;
 		else
 			ndchkh->dp.ppid_fsn.fsn = htonl(sp->fsn);
 		sp->fsn++;
 		ndchkh->ch.chunk_length = htons(chk->send_size);
 	}
 	/* Now advance the chk->send_size by the actual pad needed. */
 	if (chk->send_size < SCTP_SIZE32(chk->book_size)) {
 		/* need a pad */
 		struct mbuf *lm;
 		int pads;
 
 		pads = SCTP_SIZE32(chk->book_size) - chk->send_size;
 		lm = sctp_pad_lastmbuf(chk->data, pads, chk->last_mbuf);
 		if (lm != NULL) {
 			chk->last_mbuf = lm;
 			chk->pad_inplace = 1;
 		}
 		chk->send_size += pads;
 	}
 	if (PR_SCTP_ENABLED(chk->flags)) {
 		asoc->pr_sctp_cnt++;
 	}
 	if (sp->msg_is_complete && (sp->length == 0) && (sp->sender_all_done)) {
 		/* All done pull and kill the message */
 		if (sp->put_last_out == 0) {
 			SCTP_PRINTF("Gak, put out entire msg with NO end!-2\n");
 			SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d send_lock:%d\n",
 			    sp->sender_all_done,
 			    sp->length,
 			    sp->msg_is_complete,
 			    sp->put_last_out,
 			    send_lock_up);
 		}
 		if ((send_lock_up == 0) && (TAILQ_NEXT(sp, next) == NULL)) {
 			SCTP_TCB_SEND_LOCK(stcb);
 			send_lock_up = 1;
 		}
 		atomic_subtract_int(&asoc->stream_queue_cnt, 1);
 		TAILQ_REMOVE(&strq->outqueue, sp, next);
 		stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, strq, sp, send_lock_up);
 		if ((strq->state == SCTP_STREAM_RESET_PENDING) &&
 		    (strq->chunks_on_queues == 0) &&
 		    TAILQ_EMPTY(&strq->outqueue)) {
 			stcb->asoc.trigger_reset = 1;
 		}
 		if (sp->net) {
 			sctp_free_remote_addr(sp->net);
 			sp->net = NULL;
 		}
 		if (sp->data) {
 			sctp_m_freem(sp->data);
 			sp->data = NULL;
 		}
 		sctp_free_a_strmoq(stcb, sp, so_locked);
 	}
 	asoc->chunks_on_out_queue++;
 	strq->chunks_on_queues++;
 	TAILQ_INSERT_TAIL(&asoc->send_queue, chk, sctp_next);
 	asoc->send_queue_cnt++;
 out_of:
 	if (send_lock_up) {
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 	return (to_move);
 }
 
 
 static void
 sctp_fill_outqueue(struct sctp_tcb *stcb,
     struct sctp_nets *net, int frag_point, int eeor_mode, int *quit_now, int so_locked)
 {
 	struct sctp_association *asoc;
 	struct sctp_stream_out *strq;
 	uint32_t space_left, moved, total_moved;
 	int bail, giveup;
 
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	asoc = &stcb->asoc;
 	total_moved = 0;
 	switch (net->ro._l_addr.sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		space_left = net->mtu - SCTP_MIN_V4_OVERHEAD;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		space_left = net->mtu - SCTP_MIN_OVERHEAD;
 		break;
 #endif
 	default:
 		/* TSNH */
 		space_left = net->mtu;
 		break;
 	}
 	/* Need an allowance for the data chunk header too */
 	space_left -= SCTP_DATA_CHUNK_OVERHEAD(stcb);
 
 	/* must make even word boundary */
 	space_left &= 0xfffffffc;
 	strq = stcb->asoc.ss_functions.sctp_ss_select_stream(stcb, net, asoc);
 	giveup = 0;
 	bail = 0;
 	while ((space_left > 0) && (strq != NULL)) {
 		moved = sctp_move_to_outqueue(stcb, strq, space_left, frag_point,
 		    &giveup, eeor_mode, &bail, so_locked);
 		stcb->asoc.ss_functions.sctp_ss_scheduled(stcb, net, asoc, strq, moved);
 		if ((giveup != 0) || (bail != 0)) {
 			break;
 		}
 		strq = stcb->asoc.ss_functions.sctp_ss_select_stream(stcb, net, asoc);
 		total_moved += moved;
 		if (space_left >= moved) {
 			space_left -= moved;
 		} else {
 			space_left = 0;
 		}
 		if (space_left >= SCTP_DATA_CHUNK_OVERHEAD(stcb)) {
 			space_left -= SCTP_DATA_CHUNK_OVERHEAD(stcb);
 		} else {
 			space_left = 0;
 		}
 		space_left &= 0xfffffffc;
 	}
 	if (bail != 0)
 		*quit_now = 1;
 
 	stcb->asoc.ss_functions.sctp_ss_packet_done(stcb, net, asoc);
 
 	if (total_moved == 0) {
 		if ((stcb->asoc.sctp_cmt_on_off == 0) &&
 		    (net == stcb->asoc.primary_destination)) {
 			/* ran dry for primary network net */
 			SCTP_STAT_INCR(sctps_primary_randry);
 		} else if (stcb->asoc.sctp_cmt_on_off > 0) {
 			/* ran dry with CMT on */
 			SCTP_STAT_INCR(sctps_cmt_randry);
 		}
 	}
 }
 
 /*
  * Walk the association's control send queue and flag every ECN-ECHO chunk
  * found there as SCTP_DATAGRAM_UNSENT.  Chunks of any other type are left
  * alone.
  */
 void
 sctp_fix_ecn_echo(struct sctp_association *asoc)
 {
 	struct sctp_tmit_chunk *ctl;
 
 	TAILQ_FOREACH(ctl, &asoc->control_send_queue, sctp_next) {
 		if (ctl->rec.chunk_id.id != SCTP_ECN_ECHO) {
 			continue;
 		}
 		ctl->sent = SCTP_DATAGRAM_UNSENT;
 	}
 }
 
 /*
  * Unbind not-yet-transmitted data from destination "net".
  *
  * Each pending message on the per-stream out queues and each chunk on the
  * send queue that currently points at "net" has its destination pointer
  * cleared, after the old pointer is passed to sctp_free_remote_addr()
  * (presumably to release the reference held on the net - confirm against
  * that macro).  A NULL "net" makes the call a no-op.
  */
 void
 sctp_move_chunks_from_net(struct sctp_tcb *stcb, struct sctp_nets *net)
 {
 	struct sctp_association *asoc;
 	struct sctp_stream_queue_pending *sp;
 	struct sctp_tmit_chunk *chk;
 	unsigned int strm;
 
 	if (net == NULL) {
 		return;
 	}
 	asoc = &stcb->asoc;
 	/* First the messages still waiting on the stream out queues. */
 	for (strm = 0; strm < asoc->streamoutcnt; strm++) {
 		TAILQ_FOREACH(sp, &asoc->strmout[strm].outqueue, next) {
 			if (sp->net != net) {
 				continue;
 			}
 			sctp_free_remote_addr(sp->net);
 			sp->net = NULL;
 		}
 	}
 	/* Then the chunks already placed on the send queue. */
 	TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) {
 		if (chk->whoTo != net) {
 			continue;
 		}
 		sctp_free_remote_addr(chk->whoTo);
 		chk->whoTo = NULL;
 	}
 }
 
 int
 sctp_med_chunk_output(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     struct sctp_association *asoc,
     int *num_out,
     int *reason_code,
     int control_only, int from_where,
     struct timeval *now, int *now_filled, int frag_point, int so_locked)
 {
 	/**
 	 * Ok this is the generic chunk service queue. We must do the
 	 * following:
 	 * - Service the stream queue that is next, moving any
 	 *   message (note I must get a complete message i.e. FIRST/MIDDLE and
 	 *   LAST to the out queue in one pass) and assigning TSN's. This
 	 *   only applies though if the peer does not support NDATA. For NDATA
 	 *   chunks it's ok to not send the entire message ;-)
 	 * - Check to see if the cwnd/rwnd allows any output, if so we go ahead and
 	 *   formulate and send the low level chunks. Making sure to combine
 	 *   any control in the control chunk queue also.
 	 */
 	struct sctp_nets *net, *start_at, *sack_goes_to = NULL, *old_start_at = NULL;
 	struct mbuf *outchain, *endoutchain;
 	struct sctp_tmit_chunk *chk, *nchk;
 
 	/* temp arrays for unlinking */
 	struct sctp_tmit_chunk *data_list[SCTP_MAX_DATA_BUNDLING];
 	int no_fragmentflg, error;
 	unsigned int max_rwnd_per_dest, max_send_per_dest;
 	int one_chunk, hbflag, skip_data_for_this_net;
 	int asconf, cookie, no_out_cnt;
 	int bundle_at, ctl_cnt, no_data_chunks, eeor_mode;
 	unsigned int mtu, r_mtu, omtu, mx_mtu, to_out;
 	int tsns_sent = 0;
 	uint32_t auth_offset;
 	struct sctp_auth_chunk *auth;
 	uint16_t auth_keyid;
 	int override_ok = 1;
 	int skip_fill_up = 0;
 	int data_auth_reqd = 0;
 
 	/*
 	 * JRS 5/14/07 - Add flag for whether a heartbeat is sent to the
 	 * destination.
 	 */
 	int quit_now = 0;
 
 	*num_out = 0;
 	*reason_code = 0;
 	auth_keyid = stcb->asoc.authinfo.active_keyid;
 	if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED) ||
 	    (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR))) {
 		eeor_mode = 1;
 	} else {
 		eeor_mode = 0;
 	}
 	ctl_cnt = no_out_cnt = asconf = cookie = 0;
 	/*
 	 * First lets prime the pump. For each destination, if there is room
 	 * in the flight size, attempt to pull an MTU's worth out of the
 	 * stream queues into the general send_queue
 	 */
 #ifdef SCTP_AUDITING_ENABLED
 	sctp_audit_log(0xC2, 2);
 #endif
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	hbflag = 0;
 	if (control_only)
 		no_data_chunks = 1;
 	else
 		no_data_chunks = 0;
 
 	/* Nothing to possible to send? */
 	if ((TAILQ_EMPTY(&asoc->control_send_queue) ||
 	    (asoc->ctrl_queue_cnt == stcb->asoc.ecn_echo_cnt_onq)) &&
 	    TAILQ_EMPTY(&asoc->asconf_send_queue) &&
 	    TAILQ_EMPTY(&asoc->send_queue) &&
 	    sctp_is_there_unsent_data(stcb, so_locked) == 0) {
 nothing_to_send:
 		*reason_code = 9;
 		return (0);
 	}
 	if (asoc->peers_rwnd == 0) {
 		/* No room in peers rwnd */
 		*reason_code = 1;
 		if (asoc->total_flight > 0) {
 			/* we are allowed one chunk in flight */
 			no_data_chunks = 1;
 		}
 	}
 	if (stcb->asoc.ecn_echo_cnt_onq) {
 		/* Record where a sack goes, if any */
 		if (no_data_chunks &&
 		    (asoc->ctrl_queue_cnt == stcb->asoc.ecn_echo_cnt_onq)) {
 			/* Nothing but ECNe to send - we don't do that */
 			goto nothing_to_send;
 		}
 		TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
 			if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) ||
 			    (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK)) {
 				sack_goes_to = chk->whoTo;
 				break;
 			}
 		}
 	}
 	max_rwnd_per_dest = ((asoc->peers_rwnd + asoc->total_flight) / asoc->numnets);
 	if (stcb->sctp_socket)
 		max_send_per_dest = SCTP_SB_LIMIT_SND(stcb->sctp_socket) / asoc->numnets;
 	else
 		max_send_per_dest = 0;
 	if (no_data_chunks == 0) {
 		/* How many non-directed chunks are there? */
 		TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) {
 			if (chk->whoTo == NULL) {
 				/*
 				 * We already have non-directed chunks on
 				 * the queue, no need to do a fill-up.
 				 */
 				skip_fill_up = 1;
 				break;
 			}
 		}
 
 	}
 	if ((no_data_chunks == 0) &&
 	    (skip_fill_up == 0) &&
 	    (!stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc))) {
 		TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
 			/*
 			 * This for loop we are in takes in each net, if
 			 * its's got space in cwnd and has data sent to it
 			 * (when CMT is off) then it calls
 			 * sctp_fill_outqueue for the net. This gets data on
 			 * the send queue for that network.
 			 *
 			 * In sctp_fill_outqueue TSN's are assigned and data
 			 * is copied out of the stream buffers. Note mostly
 			 * copy by reference (we hope).
 			 */
 			net->window_probe = 0;
 			if ((net != stcb->asoc.alternate) &&
 			    ((net->dest_state & SCTP_ADDR_PF) ||
 			    (!(net->dest_state & SCTP_ADDR_REACHABLE)) ||
 			    (net->dest_state & SCTP_ADDR_UNCONFIRMED))) {
 				if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
 					sctp_log_cwnd(stcb, net, 1,
 					    SCTP_CWND_LOG_FILL_OUTQ_CALLED);
 				}
 				continue;
 			}
 			if ((stcb->asoc.cc_functions.sctp_cwnd_new_transmission_begins) &&
 			    (net->flight_size == 0)) {
 				(*stcb->asoc.cc_functions.sctp_cwnd_new_transmission_begins) (stcb, net);
 			}
 			if (net->flight_size >= net->cwnd) {
 				/* skip this network, no room - can't fill */
 				if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
 					sctp_log_cwnd(stcb, net, 3,
 					    SCTP_CWND_LOG_FILL_OUTQ_CALLED);
 				}
 				continue;
 			}
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
 				sctp_log_cwnd(stcb, net, 4, SCTP_CWND_LOG_FILL_OUTQ_CALLED);
 			}
 			sctp_fill_outqueue(stcb, net, frag_point, eeor_mode, &quit_now, so_locked);
 			if (quit_now) {
 				/* memory alloc failure */
 				no_data_chunks = 1;
 				break;
 			}
 		}
 	}
 	/* now service each destination and send out what we can for it */
 	/* Nothing to send? */
 	if (TAILQ_EMPTY(&asoc->control_send_queue) &&
 	    TAILQ_EMPTY(&asoc->asconf_send_queue) &&
 	    TAILQ_EMPTY(&asoc->send_queue)) {
 		*reason_code = 8;
 		return (0);
 	}
 
 	if (asoc->sctp_cmt_on_off > 0) {
 		/* get the last start point */
 		start_at = asoc->last_net_cmt_send_started;
 		if (start_at == NULL) {
 			/* null so to beginning */
 			start_at = TAILQ_FIRST(&asoc->nets);
 		} else {
 			start_at = TAILQ_NEXT(asoc->last_net_cmt_send_started, sctp_next);
 			if (start_at == NULL) {
 				start_at = TAILQ_FIRST(&asoc->nets);
 			}
 		}
 		asoc->last_net_cmt_send_started = start_at;
 	} else {
 		start_at = TAILQ_FIRST(&asoc->nets);
 	}
 	TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
 		if (chk->whoTo == NULL) {
 			if (asoc->alternate) {
 				chk->whoTo = asoc->alternate;
 			} else {
 				chk->whoTo = asoc->primary_destination;
 			}
 			atomic_add_int(&chk->whoTo->ref_count, 1);
 		}
 	}
 	old_start_at = NULL;
 again_one_more_time:
 	for (net = start_at; net != NULL; net = TAILQ_NEXT(net, sctp_next)) {
 		/* how much can we send? */
 		/* SCTPDBG("Examine for sending net:%x\n", (uint32_t)net); */
 		if (old_start_at && (old_start_at == net)) {
 			/* through list ocmpletely. */
 			break;
 		}
 		tsns_sent = 0xa;
 		if (TAILQ_EMPTY(&asoc->control_send_queue) &&
 		    TAILQ_EMPTY(&asoc->asconf_send_queue) &&
 		    (net->flight_size >= net->cwnd)) {
 			/*
 			 * Nothing on control or asconf and flight is full,
 			 * we can skip even in the CMT case.
 			 */
 			continue;
 		}
 		bundle_at = 0;
 		endoutchain = outchain = NULL;
 		auth = NULL;
 		auth_offset = 0;
 		no_fragmentflg = 1;
 		one_chunk = 0;
 		if (net->dest_state & SCTP_ADDR_UNCONFIRMED) {
 			skip_data_for_this_net = 1;
 		} else {
 			skip_data_for_this_net = 0;
 		}
 		switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) {
 #ifdef INET
 		case AF_INET:
 			mtu = net->mtu - SCTP_MIN_V4_OVERHEAD;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			mtu = net->mtu - SCTP_MIN_OVERHEAD;
 			break;
 #endif
 		default:
 			/* TSNH */
 			mtu = net->mtu;
 			break;
 		}
 		mx_mtu = mtu;
 		to_out = 0;
 		if (mtu > asoc->peers_rwnd) {
 			if (asoc->total_flight > 0) {
 				/* We have a packet in flight somewhere */
 				r_mtu = asoc->peers_rwnd;
 			} else {
 				/* We are always allowed to send one MTU out */
 				one_chunk = 1;
 				r_mtu = mtu;
 			}
 		} else {
 			r_mtu = mtu;
 		}
 		error = 0;
 		/************************/
 		/* ASCONF transmission */
 		/************************/
 		/* Now first lets go through the asconf queue */
 		TAILQ_FOREACH_SAFE(chk, &asoc->asconf_send_queue, sctp_next, nchk) {
 			if (chk->rec.chunk_id.id != SCTP_ASCONF) {
 				continue;
 			}
 			if (chk->whoTo == NULL) {
 				if (asoc->alternate == NULL) {
 					if (asoc->primary_destination != net) {
 						break;
 					}
 				} else {
 					if (asoc->alternate != net) {
 						break;
 					}
 				}
 			} else {
 				if (chk->whoTo != net) {
 					break;
 				}
 			}
 			if (chk->data == NULL) {
 				break;
 			}
 			if (chk->sent != SCTP_DATAGRAM_UNSENT &&
 			    chk->sent != SCTP_DATAGRAM_RESEND) {
 				break;
 			}
 			/*
 			 * if no AUTH is yet included and this chunk
 			 * requires it, make sure to account for it.  We
 			 * don't apply the size until the AUTH chunk is
 			 * actually added below in case there is no room for
 			 * this chunk. NOTE: we overload the use of "omtu"
 			 * here
 			 */
 			if ((auth == NULL) &&
 			    sctp_auth_is_required_chunk(chk->rec.chunk_id.id,
 			    stcb->asoc.peer_auth_chunks)) {
 				omtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id);
 			} else
 				omtu = 0;
 			/* Here we do NOT factor the r_mtu */
 			if ((chk->send_size < (int)(mtu - omtu)) ||
 			    (chk->flags & CHUNK_FLAGS_FRAGMENT_OK)) {
 				/*
 				 * We probably should glom the mbuf chain
 				 * from the chk->data for control but the
 				 * problem is it becomes yet one more level
 				 * of tracking to do if for some reason
 				 * output fails. Then I have got to
 				 * reconstruct the merged control chain.. el
 				 * yucko.. for now we take the easy way and
 				 * do the copy
 				 */
 				/*
 				 * Add an AUTH chunk, if chunk requires it
 				 * save the offset into the chain for AUTH
 				 */
 				if ((auth == NULL) &&
 				    (sctp_auth_is_required_chunk(chk->rec.chunk_id.id,
 				    stcb->asoc.peer_auth_chunks))) {
 					outchain = sctp_add_auth_chunk(outchain,
 					    &endoutchain,
 					    &auth,
 					    &auth_offset,
 					    stcb,
 					    chk->rec.chunk_id.id);
 					SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
 				}
 				outchain = sctp_copy_mbufchain(chk->data, outchain, &endoutchain,
 				    (int)chk->rec.chunk_id.can_take_data,
 				    chk->send_size, chk->copy_by_ref);
 				if (outchain == NULL) {
 					*reason_code = 8;
 					SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 					return (ENOMEM);
 				}
 				SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
 				/* update our MTU size */
 				if (mtu > (chk->send_size + omtu))
 					mtu -= (chk->send_size + omtu);
 				else
 					mtu = 0;
 				to_out += (chk->send_size + omtu);
 				/* Do clear IP_DF ? */
 				if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) {
 					no_fragmentflg = 0;
 				}
 				if (chk->rec.chunk_id.can_take_data)
 					chk->data = NULL;
 				/*
 				 * set hb flag since we can use these for
 				 * RTO
 				 */
 				hbflag = 1;
 				asconf = 1;
 				/*
 				 * should sysctl this: don't bundle data
 				 * with ASCONF since it requires AUTH
 				 */
 				no_data_chunks = 1;
 				chk->sent = SCTP_DATAGRAM_SENT;
 				if (chk->whoTo == NULL) {
 					chk->whoTo = net;
 					atomic_add_int(&net->ref_count, 1);
 				}
 				chk->snd_count++;
 				if (mtu == 0) {
 					/*
 					 * Ok we are out of room but we can
 					 * output without effecting the
 					 * flight size since this little guy
 					 * is a control only packet.
 					 */
 					sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, net);
 					/*
 					 * do NOT clear the asconf flag as
 					 * it is used to do appropriate
 					 * source address selection.
 					 */
 					if (*now_filled == 0) {
 						(void)SCTP_GETTIME_TIMEVAL(now);
 						*now_filled = 1;
 					}
 					net->last_sent_time = *now;
 					hbflag = 0;
 					if ((error = sctp_lowlevel_chunk_output(inp, stcb, net,
 					    (struct sockaddr *)&net->ro._l_addr,
 					    outchain, auth_offset, auth,
 					    stcb->asoc.authinfo.active_keyid,
 					    no_fragmentflg, 0, asconf,
 					    inp->sctp_lport, stcb->rport,
 					    htonl(stcb->asoc.peer_vtag),
 					    net->port, NULL,
 					    0, 0,
 					    so_locked))) {
 						/*
 						 * error, we could not
 						 * output
 						 */
 						SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error);
 						if (from_where == 0) {
 							SCTP_STAT_INCR(sctps_lowlevelerrusr);
 						}
 						if (error == ENOBUFS) {
 							asoc->ifp_had_enobuf = 1;
 							SCTP_STAT_INCR(sctps_lowlevelerr);
 						}
 						/* error, could not output */
 						if (error == EHOSTUNREACH) {
 							/*
 							 * Destination went
 							 * unreachable
 							 * during this send
 							 */
 							sctp_move_chunks_from_net(stcb, net);
 						}
 						*reason_code = 7;
 						break;
 					} else {
 						asoc->ifp_had_enobuf = 0;
 					}
 					/*
 					 * increase the number we sent, if a
 					 * cookie is sent we don't tell them
 					 * any was sent out.
 					 */
 					outchain = endoutchain = NULL;
 					auth = NULL;
 					auth_offset = 0;
 					if (!no_out_cnt)
 						*num_out += ctl_cnt;
 					/* recalc a clean slate and setup */
 					switch (net->ro._l_addr.sa.sa_family) {
 #ifdef INET
 					case AF_INET:
 						mtu = net->mtu - SCTP_MIN_V4_OVERHEAD;
 						break;
 #endif
 #ifdef INET6
 					case AF_INET6:
 						mtu = net->mtu - SCTP_MIN_OVERHEAD;
 						break;
 #endif
 					default:
 						/* TSNH */
 						mtu = net->mtu;
 						break;
 					}
 					to_out = 0;
 					no_fragmentflg = 1;
 				}
 			}
 		}
 		if (error != 0) {
 			/* try next net */
 			continue;
 		}
 		/************************/
 		/* Control transmission */
 		/************************/
 		/* Now first lets go through the control queue */
 		TAILQ_FOREACH_SAFE(chk, &asoc->control_send_queue, sctp_next, nchk) {
 			if ((sack_goes_to) &&
 			    (chk->rec.chunk_id.id == SCTP_ECN_ECHO) &&
 			    (chk->whoTo != sack_goes_to)) {
 				/*
 				 * if we have a sack in queue, and we are
 				 * looking at an ecn echo that is NOT queued
 				 * to where the sack is going..
 				 */
 				if (chk->whoTo == net) {
 					/*
 					 * Don't transmit it to where its
 					 * going (current net)
 					 */
 					continue;
 				} else if (sack_goes_to == net) {
 					/*
 					 * But do transmit it to this
 					 * address
 					 */
 					goto skip_net_check;
 				}
 			}
 			if (chk->whoTo == NULL) {
 				if (asoc->alternate == NULL) {
 					if (asoc->primary_destination != net) {
 						continue;
 					}
 				} else {
 					if (asoc->alternate != net) {
 						continue;
 					}
 				}
 			} else {
 				if (chk->whoTo != net) {
 					continue;
 				}
 			}
 	skip_net_check:
 			if (chk->data == NULL) {
 				continue;
 			}
 			if (chk->sent != SCTP_DATAGRAM_UNSENT) {
 				/*
 				 * It must be unsent. Cookies and ASCONF's
 				 * hang around but there timers will force
 				 * when marked for resend.
 				 */
 				continue;
 			}
 			/*
 			 * if no AUTH is yet included and this chunk
 			 * requires it, make sure to account for it.  We
 			 * don't apply the size until the AUTH chunk is
 			 * actually added below in case there is no room for
 			 * this chunk. NOTE: we overload the use of "omtu"
 			 * here
 			 */
 			if ((auth == NULL) &&
 			    sctp_auth_is_required_chunk(chk->rec.chunk_id.id,
 			    stcb->asoc.peer_auth_chunks)) {
 				omtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id);
 			} else
 				omtu = 0;
 			/* Here we do NOT factor the r_mtu */
 			if ((chk->send_size <= (int)(mtu - omtu)) ||
 			    (chk->flags & CHUNK_FLAGS_FRAGMENT_OK)) {
 				/*
 				 * We probably should glom the mbuf chain
 				 * from the chk->data for control but the
 				 * problem is it becomes yet one more level
 				 * of tracking to do if for some reason
 				 * output fails. Then I have got to
 				 * reconstruct the merged control chain.. el
 				 * yucko.. for now we take the easy way and
 				 * do the copy
 				 */
 				/*
 				 * Add an AUTH chunk, if chunk requires it
 				 * save the offset into the chain for AUTH
 				 */
 				if ((auth == NULL) &&
 				    (sctp_auth_is_required_chunk(chk->rec.chunk_id.id,
 				    stcb->asoc.peer_auth_chunks))) {
 					outchain = sctp_add_auth_chunk(outchain,
 					    &endoutchain,
 					    &auth,
 					    &auth_offset,
 					    stcb,
 					    chk->rec.chunk_id.id);
 					SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
 				}
 				outchain = sctp_copy_mbufchain(chk->data, outchain, &endoutchain,
 				    (int)chk->rec.chunk_id.can_take_data,
 				    chk->send_size, chk->copy_by_ref);
 				if (outchain == NULL) {
 					*reason_code = 8;
 					SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 					return (ENOMEM);
 				}
 				SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
 				/* update our MTU size */
 				if (mtu > (chk->send_size + omtu))
 					mtu -= (chk->send_size + omtu);
 				else
 					mtu = 0;
 				to_out += (chk->send_size + omtu);
 				/* Do clear IP_DF ? */
 				if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) {
 					no_fragmentflg = 0;
 				}
 				if (chk->rec.chunk_id.can_take_data)
 					chk->data = NULL;
 				/* Mark things to be removed, if needed */
 				if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) ||
 				    (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK) ||	/* EY */
 				    (chk->rec.chunk_id.id == SCTP_HEARTBEAT_REQUEST) ||
 				    (chk->rec.chunk_id.id == SCTP_HEARTBEAT_ACK) ||
 				    (chk->rec.chunk_id.id == SCTP_SHUTDOWN) ||
 				    (chk->rec.chunk_id.id == SCTP_SHUTDOWN_ACK) ||
 				    (chk->rec.chunk_id.id == SCTP_OPERATION_ERROR) ||
 				    (chk->rec.chunk_id.id == SCTP_COOKIE_ACK) ||
 				    (chk->rec.chunk_id.id == SCTP_ECN_CWR) ||
 				    (chk->rec.chunk_id.id == SCTP_PACKET_DROPPED) ||
 				    (chk->rec.chunk_id.id == SCTP_ASCONF_ACK)) {
 					if (chk->rec.chunk_id.id == SCTP_HEARTBEAT_REQUEST) {
 						hbflag = 1;
 					}
 					/* remove these chunks at the end */
 					if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) ||
 					    (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK)) {
 						/* turn off the timer */
 						if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) {
 							sctp_timer_stop(SCTP_TIMER_TYPE_RECV,
 							    inp, stcb, NULL,
 							    SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_1);
 						}
 					}
 					ctl_cnt++;
 				} else {
 					/*
 					 * Other chunks, since they have
 					 * timers running (i.e. COOKIE) we
 					 * just "trust" that it gets sent or
 					 * retransmitted.
 					 */
 					ctl_cnt++;
 					if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) {
 						cookie = 1;
 						no_out_cnt = 1;
 					} else if (chk->rec.chunk_id.id == SCTP_ECN_ECHO) {
 						/*
 						 * Increment ecne send count
 						 * here this means we may be
 						 * over-zealous in our
 						 * counting if the send
 						 * fails, but its the best
 						 * place to do it (we used
 						 * to do it in the queue of
 						 * the chunk, but that did
 						 * not tell how many times
 						 * it was sent.
 						 */
 						SCTP_STAT_INCR(sctps_sendecne);
 					}
 					chk->sent = SCTP_DATAGRAM_SENT;
 					if (chk->whoTo == NULL) {
 						chk->whoTo = net;
 						atomic_add_int(&net->ref_count, 1);
 					}
 					chk->snd_count++;
 				}
 				if (mtu == 0) {
 					/*
 					 * Ok we are out of room but we can
 					 * output without effecting the
 					 * flight size since this little guy
 					 * is a control only packet.
 					 */
 					if (asconf) {
 						sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, net);
 						/*
 						 * do NOT clear the asconf
 						 * flag as it is used to do
 						 * appropriate source
 						 * address selection.
 						 */
 					}
 					if (cookie) {
 						sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, net);
 						cookie = 0;
 					}
 					/* Only HB or ASCONF advances time */
 					if (hbflag) {
 						if (*now_filled == 0) {
 							(void)SCTP_GETTIME_TIMEVAL(now);
 							*now_filled = 1;
 						}
 						net->last_sent_time = *now;
 						hbflag = 0;
 					}
 					if ((error = sctp_lowlevel_chunk_output(inp, stcb, net,
 					    (struct sockaddr *)&net->ro._l_addr,
 					    outchain,
 					    auth_offset, auth,
 					    stcb->asoc.authinfo.active_keyid,
 					    no_fragmentflg, 0, asconf,
 					    inp->sctp_lport, stcb->rport,
 					    htonl(stcb->asoc.peer_vtag),
 					    net->port, NULL,
 					    0, 0,
 					    so_locked))) {
 						/*
 						 * error, we could not
 						 * output
 						 */
 						SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error);
 						if (from_where == 0) {
 							SCTP_STAT_INCR(sctps_lowlevelerrusr);
 						}
 						if (error == ENOBUFS) {
 							asoc->ifp_had_enobuf = 1;
 							SCTP_STAT_INCR(sctps_lowlevelerr);
 						}
 						if (error == EHOSTUNREACH) {
 							/*
 							 * Destination went
 							 * unreachable
 							 * during this send
 							 */
 							sctp_move_chunks_from_net(stcb, net);
 						}
 						*reason_code = 7;
 						break;
 					} else {
 						asoc->ifp_had_enobuf = 0;
 					}
 					/*
 					 * increase the number we sent, if a
 					 * cookie is sent we don't tell them
 					 * any was sent out.
 					 */
 					outchain = endoutchain = NULL;
 					auth = NULL;
 					auth_offset = 0;
 					if (!no_out_cnt)
 						*num_out += ctl_cnt;
 					/* recalc a clean slate and setup */
 					switch (net->ro._l_addr.sa.sa_family) {
 #ifdef INET
 					case AF_INET:
 						mtu = net->mtu - SCTP_MIN_V4_OVERHEAD;
 						break;
 #endif
 #ifdef INET6
 					case AF_INET6:
 						mtu = net->mtu - SCTP_MIN_OVERHEAD;
 						break;
 #endif
 					default:
 						/* TSNH */
 						mtu = net->mtu;
 						break;
 					}
 					to_out = 0;
 					no_fragmentflg = 1;
 				}
 			}
 		}
 		if (error != 0) {
 			/* try next net */
 			continue;
 		}
 		/* JRI: if dest is in PF state, do not send data to it */
 		if ((asoc->sctp_cmt_on_off > 0) &&
 		    (net != stcb->asoc.alternate) &&
 		    (net->dest_state & SCTP_ADDR_PF)) {
 			goto no_data_fill;
 		}
 		if (net->flight_size >= net->cwnd) {
 			goto no_data_fill;
 		}
 		if ((asoc->sctp_cmt_on_off > 0) &&
 		    (SCTP_BASE_SYSCTL(sctp_buffer_splitting) & SCTP_RECV_BUFFER_SPLITTING) &&
 		    (net->flight_size > max_rwnd_per_dest)) {
 			goto no_data_fill;
 		}
 		/*
 		 * We need a specific accounting for the usage of the send
 		 * buffer. We also need to check the number of messages per
 		 * net. For now, this is better than nothing and it disabled
 		 * by default...
 		 */
 		if ((asoc->sctp_cmt_on_off > 0) &&
 		    (SCTP_BASE_SYSCTL(sctp_buffer_splitting) & SCTP_SEND_BUFFER_SPLITTING) &&
 		    (max_send_per_dest > 0) &&
 		    (net->flight_size > max_send_per_dest)) {
 			goto no_data_fill;
 		}
 		/*********************/
 		/* Data transmission */
 		/*********************/
 		/*
 		 * if AUTH for DATA is required and no AUTH has been added
 		 * yet, account for this in the mtu now... if no data can be
 		 * bundled, this adjustment won't matter anyways since the
 		 * packet will be going out...
 		 */
 		data_auth_reqd = sctp_auth_is_required_chunk(SCTP_DATA,
 		    stcb->asoc.peer_auth_chunks);
 		if (data_auth_reqd && (auth == NULL)) {
 			mtu -= sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id);
 		}
 		/* now lets add any data within the MTU constraints */
 		switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) {
 #ifdef INET
 		case AF_INET:
 			if (net->mtu > SCTP_MIN_V4_OVERHEAD)
 				omtu = net->mtu - SCTP_MIN_V4_OVERHEAD;
 			else
 				omtu = 0;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			if (net->mtu > SCTP_MIN_OVERHEAD)
 				omtu = net->mtu - SCTP_MIN_OVERHEAD;
 			else
 				omtu = 0;
 			break;
 #endif
 		default:
 			/* TSNH */
 			omtu = 0;
 			break;
 		}
 		if ((((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) ||
 		    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) &&
 		    (skip_data_for_this_net == 0)) ||
 		    (cookie)) {
 			TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) {
 				if (no_data_chunks) {
 					/* let only control go out */
 					*reason_code = 1;
 					break;
 				}
 				if (net->flight_size >= net->cwnd) {
 					/* skip this net, no room for data */
 					*reason_code = 2;
 					break;
 				}
 				if ((chk->whoTo != NULL) &&
 				    (chk->whoTo != net)) {
 					/* Don't send the chunk on this net */
 					continue;
 				}
 
 				if (asoc->sctp_cmt_on_off == 0) {
 					if ((asoc->alternate) &&
 					    (asoc->alternate != net) &&
 					    (chk->whoTo == NULL)) {
 						continue;
 					} else if ((net != asoc->primary_destination) &&
 						    (asoc->alternate == NULL) &&
 					    (chk->whoTo == NULL)) {
 						continue;
 					}
 				}
 				if ((chk->send_size > omtu) && ((chk->flags & CHUNK_FLAGS_FRAGMENT_OK) == 0)) {
 					/*-
 					 * strange, we have a chunk that is
 					 * to big for its destination and
 					 * yet no fragment ok flag.
 					 * Something went wrong when the
 					 * PMTU changed...we did not mark
 					 * this chunk for some reason?? I
 					 * will fix it here by letting IP
 					 * fragment it for now and printing
 					 * a warning. This really should not
 					 * happen ...
 					 */
 					SCTP_PRINTF("Warning chunk of %d bytes > mtu:%d and yet PMTU disc missed\n",
 					    chk->send_size, mtu);
 					chk->flags |= CHUNK_FLAGS_FRAGMENT_OK;
 				}
 				if (SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) &&
 				    (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) {
 					struct sctp_data_chunk *dchkh;
 
 					dchkh = mtod(chk->data, struct sctp_data_chunk *);
 					dchkh->ch.chunk_flags |= SCTP_DATA_SACK_IMMEDIATELY;
 				}
 				if (((chk->send_size <= mtu) && (chk->send_size <= r_mtu)) ||
 				    ((chk->flags & CHUNK_FLAGS_FRAGMENT_OK) && (chk->send_size <= asoc->peers_rwnd))) {
 					/* ok we will add this one */
 
 					/*
 					 * Add an AUTH chunk, if chunk
 					 * requires it, save the offset into
 					 * the chain for AUTH
 					 */
 					if (data_auth_reqd) {
 						if (auth == NULL) {
 							outchain = sctp_add_auth_chunk(outchain,
 							    &endoutchain,
 							    &auth,
 							    &auth_offset,
 							    stcb,
 							    SCTP_DATA);
 							auth_keyid = chk->auth_keyid;
 							override_ok = 0;
 							SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
 						} else if (override_ok) {
 							/*
 							 * use this data's
 							 * keyid
 							 */
 							auth_keyid = chk->auth_keyid;
 							override_ok = 0;
 						} else if (auth_keyid != chk->auth_keyid) {
 							/*
 							 * different keyid,
 							 * so done bundling
 							 */
 							break;
 						}
 					}
 					outchain = sctp_copy_mbufchain(chk->data, outchain, &endoutchain, 0,
 					    chk->send_size, chk->copy_by_ref);
 					if (outchain == NULL) {
 						SCTPDBG(SCTP_DEBUG_OUTPUT3, "No memory?\n");
 						if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
 							sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net);
 						}
 						*reason_code = 3;
 						SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 						return (ENOMEM);
 					}
 					/* upate our MTU size */
 					/* Do clear IP_DF ? */
 					if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) {
 						no_fragmentflg = 0;
 					}
 					/* unsigned subtraction of mtu */
 					if (mtu > chk->send_size)
 						mtu -= chk->send_size;
 					else
 						mtu = 0;
 					/* unsigned subtraction of r_mtu */
 					if (r_mtu > chk->send_size)
 						r_mtu -= chk->send_size;
 					else
 						r_mtu = 0;
 
 					to_out += chk->send_size;
 					if ((to_out > mx_mtu) && no_fragmentflg) {
 #ifdef INVARIANTS
 						panic("Exceeding mtu of %d out size is %d", mx_mtu, to_out);
 #else
 						SCTP_PRINTF("Exceeding mtu of %d out size is %d\n",
 						    mx_mtu, to_out);
 #endif
 					}
 					chk->window_probe = 0;
 					data_list[bundle_at++] = chk;
 					if (bundle_at >= SCTP_MAX_DATA_BUNDLING) {
 						break;
 					}
 					if (chk->sent == SCTP_DATAGRAM_UNSENT) {
 						if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0) {
 							SCTP_STAT_INCR_COUNTER64(sctps_outorderchunks);
 						} else {
 							SCTP_STAT_INCR_COUNTER64(sctps_outunorderchunks);
 						}
 						if (((chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) == SCTP_DATA_LAST_FRAG) &&
 						    ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0))
 							/*
 							 * Count number of
 							 * user msg's that
 							 * were fragmented
 							 * we do this by
 							 * counting when we
 							 * see a LAST
 							 * fragment only.
 							 */
 							SCTP_STAT_INCR_COUNTER64(sctps_fragusrmsgs);
 					}
 					if ((mtu == 0) || (r_mtu == 0) || (one_chunk)) {
 						if ((one_chunk) && (stcb->asoc.total_flight == 0)) {
 							data_list[0]->window_probe = 1;
 							net->window_probe = 1;
 						}
 						break;
 					}
 				} else {
 					/*
 					 * Must be sent in order of the
 					 * TSN's (on a network)
 					 */
 					break;
 				}
 			}	/* for (chunk gather loop for this net) */
 		}		/* if asoc.state OPEN */
 no_data_fill:
 		/* Is there something to send for this destination? */
 		if (outchain) {
 			/* We may need to start a control timer or two */
 			if (asconf) {
 				sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp,
 				    stcb, net);
 				/*
 				 * do NOT clear the asconf flag as it is
 				 * used to do appropriate source address
 				 * selection.
 				 */
 			}
 			if (cookie) {
 				sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, net);
 				cookie = 0;
 			}
 			/* must start a send timer if data is being sent */
 			if (bundle_at && (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer))) {
 				/*
 				 * no timer running on this destination
 				 * restart it.
 				 */
 				sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net);
 			}
 			if (bundle_at || hbflag) {
 				/* For data/asconf and hb set time */
 				if (*now_filled == 0) {
 					(void)SCTP_GETTIME_TIMEVAL(now);
 					*now_filled = 1;
 				}
 				net->last_sent_time = *now;
 			}
 			/* Now send it, if there is anything to send :> */
 			if ((error = sctp_lowlevel_chunk_output(inp,
 			    stcb,
 			    net,
 			    (struct sockaddr *)&net->ro._l_addr,
 			    outchain,
 			    auth_offset,
 			    auth,
 			    auth_keyid,
 			    no_fragmentflg,
 			    bundle_at,
 			    asconf,
 			    inp->sctp_lport, stcb->rport,
 			    htonl(stcb->asoc.peer_vtag),
 			    net->port, NULL,
 			    0, 0,
 			    so_locked))) {
 				/* error, we could not output */
 				SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error);
 				if (from_where == 0) {
 					SCTP_STAT_INCR(sctps_lowlevelerrusr);
 				}
 				if (error == ENOBUFS) {
 					asoc->ifp_had_enobuf = 1;
 					SCTP_STAT_INCR(sctps_lowlevelerr);
 				}
 				if (error == EHOSTUNREACH) {
 					/*
 					 * Destination went unreachable
 					 * during this send
 					 */
 					sctp_move_chunks_from_net(stcb, net);
 				}
 				*reason_code = 6;
 				/*-
 				 * I add this line to be paranoid. As far as
 				 * I can tell the continue, takes us back to
 				 * the top of the for, but just to make sure
 				 * I will reset these again here.
 				 */
 				ctl_cnt = bundle_at = 0;
 				continue;	/* This takes us back to the
 						 * for() for the nets. */
 			} else {
 				asoc->ifp_had_enobuf = 0;
 			}
 			endoutchain = NULL;
 			auth = NULL;
 			auth_offset = 0;
 			if (!no_out_cnt) {
 				*num_out += (ctl_cnt + bundle_at);
 			}
 			if (bundle_at) {
 				/* setup for a RTO measurement */
 				tsns_sent = data_list[0]->rec.data.tsn;
 				/* fill time if not already filled */
 				if (*now_filled == 0) {
 					(void)SCTP_GETTIME_TIMEVAL(&asoc->time_last_sent);
 					*now_filled = 1;
 					*now = asoc->time_last_sent;
 				} else {
 					asoc->time_last_sent = *now;
 				}
 				if (net->rto_needed) {
 					data_list[0]->do_rtt = 1;
 					net->rto_needed = 0;
 				}
 				SCTP_STAT_INCR_BY(sctps_senddata, bundle_at);
 				sctp_clean_up_datalist(stcb, asoc, data_list, bundle_at, net);
 			}
 			if (one_chunk) {
 				break;
 			}
 		}
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
 			sctp_log_cwnd(stcb, net, tsns_sent, SCTP_CWND_LOG_FROM_SEND);
 		}
 	}
 	if (old_start_at == NULL) {
 		old_start_at = start_at;
 		start_at = TAILQ_FIRST(&asoc->nets);
 		if (old_start_at)
 			goto again_one_more_time;
 	}
 
 	/*
 	 * At the end there should be no NON timed chunks hanging on this
 	 * queue.
 	 */
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
 		sctp_log_cwnd(stcb, net, *num_out, SCTP_CWND_LOG_FROM_SEND);
 	}
 	if ((*num_out == 0) && (*reason_code == 0)) {
 		*reason_code = 4;
 	} else {
 		*reason_code = 5;
 	}
 	sctp_clean_up_ctl(stcb, asoc, so_locked);
 	return (0);
 }
 
 void
 sctp_queue_op_err(struct sctp_tcb *stcb, struct mbuf *op_err)
 {
 	/*-
 	 * Prepend a OPERATIONAL_ERROR chunk header and put on the end of
 	 * the control chunk queue.
 	 */
 	struct sctp_chunkhdr *hdr;
 	struct sctp_tmit_chunk *chk;
 	struct mbuf *mat, *last_mbuf;
 	uint32_t chunk_length;
 	uint16_t padding_length;
 
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	SCTP_BUF_PREPEND(op_err, sizeof(struct sctp_chunkhdr), M_NOWAIT);
 	if (op_err == NULL) {
 		return;
 	}
 	last_mbuf = NULL;
 	chunk_length = 0;
 	for (mat = op_err; mat != NULL; mat = SCTP_BUF_NEXT(mat)) {
 		chunk_length += SCTP_BUF_LEN(mat);
 		if (SCTP_BUF_NEXT(mat) == NULL) {
 			last_mbuf = mat;
 		}
 	}
 	if (chunk_length > SCTP_MAX_CHUNK_LENGTH) {
 		sctp_m_freem(op_err);
 		return;
 	}
 	padding_length = chunk_length % 4;
 	if (padding_length != 0) {
 		padding_length = 4 - padding_length;
 	}
 	if (padding_length != 0) {
 		if (sctp_add_pad_tombuf(last_mbuf, padding_length) == NULL) {
 			sctp_m_freem(op_err);
 			return;
 		}
 	}
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		/* no memory */
 		sctp_m_freem(op_err);
 		return;
 	}
 	chk->copy_by_ref = 0;
 	chk->rec.chunk_id.id = SCTP_OPERATION_ERROR;
 	chk->rec.chunk_id.can_take_data = 0;
 	chk->flags = 0;
 	chk->send_size = (uint16_t)chunk_length;
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 	chk->snd_count = 0;
 	chk->asoc = &stcb->asoc;
 	chk->data = op_err;
 	chk->whoTo = NULL;
 	hdr = mtod(op_err, struct sctp_chunkhdr *);
 	hdr->chunk_type = SCTP_OPERATION_ERROR;
 	hdr->chunk_flags = 0;
 	hdr->chunk_length = htons(chk->send_size);
 	TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next);
 	chk->asoc->ctrl_queue_cnt++;
 }
 
 int
 sctp_send_cookie_echo(struct mbuf *m,
     int offset, int limit,
     struct sctp_tcb *stcb,
     struct sctp_nets *net)
 {
 	/*-
 	 * pull out the cookie and put it at the front of the control chunk
 	 * queue.
 	 *
 	 * The parameters are walked starting sizeof(struct sctp_init_chunk)
 	 * bytes after offset in m. The first SCTP_STATE_COOKIE parameter
 	 * found is copied, padded to a multiple of 4 bytes, relabeled as a
 	 * COOKIE-ECHO chunk and queued for net.
 	 *
 	 * Returns 0 on success and a negative value on failure:
 	 *   -2  the cookie could not be copied out of m
 	 *   -3  no further parameter could be read before a cookie was found
 	 *   -5  no transmit chunk could be allocated
 	 *   -6  a parameter claims to be shorter than a parameter header
 	 *   -7  the cookie extends beyond limit
 	 *   -8  the copied cookie could not be padded
 	 * On failure nothing is queued and any copy of the cookie is freed.
 	 */
 	int at;
 	struct mbuf *cookie;
 	struct sctp_paramhdr param, *phdr;
 	struct sctp_chunkhdr *hdr;
 	struct sctp_tmit_chunk *chk;
 	uint16_t ptype, plen;
 
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	/* First find the cookie in the param area */
 	cookie = NULL;
 	at = offset + sizeof(struct sctp_init_chunk);
 	for (;;) {
 		phdr = sctp_get_next_param(m, at, &param, sizeof(param));
 		if (phdr == NULL) {
 			return (-3);
 		}
 		ptype = ntohs(phdr->param_type);
 		plen = ntohs(phdr->param_length);
 		if (plen < sizeof(struct sctp_paramhdr)) {
 			return (-6);
 		}
 		if (ptype == SCTP_STATE_COOKIE) {
 			int pad;
 
 			/* found the cookie */
 			if (at + plen > limit) {
 				return (-7);
 			}
 			cookie = SCTP_M_COPYM(m, at, plen, M_NOWAIT);
 			if (cookie == NULL) {
 				/* No memory */
 				return (-2);
 			}
 			if ((pad = (plen % 4)) > 0) {
 				pad = 4 - pad;
 			}
 			if (pad > 0) {
 				if (sctp_pad_lastmbuf(cookie, pad, NULL) == NULL) {
 					/*
 					 * The copy is ours; release it
 					 * instead of leaking it.
 					 */
 					sctp_m_freem(cookie);
 					return (-8);
 				}
 			}
 #ifdef SCTP_MBUF_LOGGING
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
 				sctp_log_mbc(cookie, SCTP_MBUF_ICOPY);
 			}
 #endif
 			break;
 		}
 		/* Not the cookie: step over this parameter and its padding. */
 		at += SCTP_SIZE32(plen);
 	}
 	/* ok, we got the cookie lets change it into a cookie echo chunk */
 	/* first the change from param to cookie */
 	hdr = mtod(cookie, struct sctp_chunkhdr *);
 	hdr->chunk_type = SCTP_COOKIE_ECHO;
 	hdr->chunk_flags = 0;
 	/* get the chunk stuff now and place it in the FRONT of the queue */
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		/* no memory */
 		sctp_m_freem(cookie);
 		return (-5);
 	}
 	chk->copy_by_ref = 0;
 	chk->rec.chunk_id.id = SCTP_COOKIE_ECHO;
 	chk->rec.chunk_id.can_take_data = 0;
 	chk->flags = CHUNK_FLAGS_FRAGMENT_OK;
 	/* The recorded size includes the padding added above. */
 	chk->send_size = SCTP_SIZE32(plen);
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 	chk->snd_count = 0;
 	chk->asoc = &stcb->asoc;
 	chk->data = cookie;
 	chk->whoTo = net;
 	/* The queued chunk holds a reference on its destination. */
 	atomic_add_int(&chk->whoTo->ref_count, 1);
 	TAILQ_INSERT_HEAD(&chk->asoc->control_send_queue, chk, sctp_next);
 	chk->asoc->ctrl_queue_cnt++;
 	return (0);
 }
 
 void
 sctp_send_heartbeat_ack(struct sctp_tcb *stcb,
     struct mbuf *m,
     int offset,
     int chk_length,
     struct sctp_nets *net)
 {
 	/*
 	 * take a HB request and make it into a HB ack and send it.
 	 *
 	 * The chk_length bytes found at offset in m are copied, the copy is
 	 * relabeled as a HEARTBEAT-ACK, padded to a multiple of 4 bytes and
 	 * queued for net at the tail of the control chunk queue. Nothing is
 	 * queued if net is NULL or if the copy, the padding or the chunk
 	 * allocation fails.
 	 */
 	struct mbuf *outchain;
 	struct sctp_chunkhdr *chdr;
 	struct sctp_tmit_chunk *chk;
 
 	if (net == NULL) {
 		/* must have a net pointer */
 		return;
 	}
 
 	outchain = SCTP_M_COPYM(m, offset, chk_length, M_NOWAIT);
 	if (outchain == NULL) {
 		/* gak out of memory */
 		return;
 	}
 #ifdef SCTP_MBUF_LOGGING
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
 		sctp_log_mbc(outchain, SCTP_MBUF_ICOPY);
 	}
 #endif
 	chdr = mtod(outchain, struct sctp_chunkhdr *);
 	chdr->chunk_type = SCTP_HEARTBEAT_ACK;
 	chdr->chunk_flags = 0;
 	if (chk_length % 4 != 0) {
 		if (sctp_pad_lastmbuf(outchain, 4 - (chk_length % 4), NULL) == NULL) {
 			/*
 			 * Could not pad; drop the ack rather than queue a
 			 * chunk that is not a multiple of 4 bytes long.
 			 */
 			sctp_m_freem(outchain);
 			return;
 		}
 	}
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		/* no memory */
 		sctp_m_freem(outchain);
 		return;
 	}
 	chk->copy_by_ref = 0;
 	chk->rec.chunk_id.id = SCTP_HEARTBEAT_ACK;
 	chk->rec.chunk_id.can_take_data = 1;
 	chk->flags = 0;
 	/* The recorded size is the unpadded length of the request. */
 	chk->send_size = chk_length;
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 	chk->snd_count = 0;
 	chk->asoc = &stcb->asoc;
 	chk->data = outchain;
 	chk->whoTo = net;
 	/* The queued chunk holds a reference on its destination. */
 	atomic_add_int(&chk->whoTo->ref_count, 1);
 	TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next);
 	chk->asoc->ctrl_queue_cnt++;
 }
 
 void
 sctp_send_cookie_ack(struct sctp_tcb *stcb)
 {
 	/* formulate and queue a cookie-ack back to sender */
 	struct mbuf *cookie_ack;
 	struct sctp_chunkhdr *hdr;
 	struct sctp_tmit_chunk *chk;
 
 	SCTP_TCB_LOCK_ASSERT(stcb);
 
 	cookie_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_NOWAIT, 1, MT_HEADER);
 	if (cookie_ack == NULL) {
 		/* no mbuf's */
 		return;
 	}
 	SCTP_BUF_RESV_UF(cookie_ack, SCTP_MIN_OVERHEAD);
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		/* no memory */
 		sctp_m_freem(cookie_ack);
 		return;
 	}
 	chk->copy_by_ref = 0;
 	chk->rec.chunk_id.id = SCTP_COOKIE_ACK;
 	chk->rec.chunk_id.can_take_data = 1;
 	chk->flags = 0;
 	chk->send_size = sizeof(struct sctp_chunkhdr);
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 	chk->snd_count = 0;
 	chk->asoc = &stcb->asoc;
 	chk->data = cookie_ack;
 	if (chk->asoc->last_control_chunk_from != NULL) {
 		chk->whoTo = chk->asoc->last_control_chunk_from;
 		atomic_add_int(&chk->whoTo->ref_count, 1);
 	} else {
 		chk->whoTo = NULL;
 	}
 	hdr = mtod(cookie_ack, struct sctp_chunkhdr *);
 	hdr->chunk_type = SCTP_COOKIE_ACK;
 	hdr->chunk_flags = 0;
 	hdr->chunk_length = htons(chk->send_size);
 	SCTP_BUF_LEN(cookie_ack) = chk->send_size;
 	TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next);
 	chk->asoc->ctrl_queue_cnt++;
 	return;
 }
 
 
 void
 sctp_send_shutdown_ack(struct sctp_tcb *stcb, struct sctp_nets *net)
 {
 	/* formulate and queue a SHUTDOWN-ACK back to the sender */
 	struct mbuf *m_shutdown_ack;
 	struct sctp_shutdown_ack_chunk *ack_cp;
 	struct sctp_tmit_chunk *chk;
 
 	m_shutdown_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_ack_chunk), 0, M_NOWAIT, 1, MT_HEADER);
 	if (m_shutdown_ack == NULL) {
 		/* no mbuf's */
 		return;
 	}
 	SCTP_BUF_RESV_UF(m_shutdown_ack, SCTP_MIN_OVERHEAD);
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		/* no memory */
 		sctp_m_freem(m_shutdown_ack);
 		return;
 	}
 	chk->copy_by_ref = 0;
 	chk->rec.chunk_id.id = SCTP_SHUTDOWN_ACK;
 	chk->rec.chunk_id.can_take_data = 1;
 	chk->flags = 0;
 	chk->send_size = sizeof(struct sctp_chunkhdr);
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 	chk->snd_count = 0;
 	chk->asoc = &stcb->asoc;
 	chk->data = m_shutdown_ack;
 	chk->whoTo = net;
 	if (chk->whoTo) {
 		atomic_add_int(&chk->whoTo->ref_count, 1);
 	}
 	ack_cp = mtod(m_shutdown_ack, struct sctp_shutdown_ack_chunk *);
 	ack_cp->ch.chunk_type = SCTP_SHUTDOWN_ACK;
 	ack_cp->ch.chunk_flags = 0;
 	ack_cp->ch.chunk_length = htons(chk->send_size);
 	SCTP_BUF_LEN(m_shutdown_ack) = chk->send_size;
 	TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next);
 	chk->asoc->ctrl_queue_cnt++;
 	return;
 }
 
 void
 sctp_send_shutdown(struct sctp_tcb *stcb, struct sctp_nets *net)
 {
 	/* formulate and queue a SHUTDOWN to the sender */
 	struct mbuf *m_shutdown;
 	struct sctp_shutdown_chunk *shutdown_cp;
 	struct sctp_tmit_chunk *chk;
 
 	TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) {
 		if (chk->rec.chunk_id.id == SCTP_SHUTDOWN) {
 			/* We already have a SHUTDOWN queued. Reuse it. */
 			if (chk->whoTo) {
 				sctp_free_remote_addr(chk->whoTo);
 				chk->whoTo = NULL;
 			}
 			break;
 		}
 	}
 	if (chk == NULL) {
 		m_shutdown = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_chunk), 0, M_NOWAIT, 1, MT_HEADER);
 		if (m_shutdown == NULL) {
 			/* no mbuf's */
 			return;
 		}
 		SCTP_BUF_RESV_UF(m_shutdown, SCTP_MIN_OVERHEAD);
 		sctp_alloc_a_chunk(stcb, chk);
 		if (chk == NULL) {
 			/* no memory */
 			sctp_m_freem(m_shutdown);
 			return;
 		}
 		chk->copy_by_ref = 0;
 		chk->rec.chunk_id.id = SCTP_SHUTDOWN;
 		chk->rec.chunk_id.can_take_data = 1;
 		chk->flags = 0;
 		chk->send_size = sizeof(struct sctp_shutdown_chunk);
 		chk->sent = SCTP_DATAGRAM_UNSENT;
 		chk->snd_count = 0;
 		chk->asoc = &stcb->asoc;
 		chk->data = m_shutdown;
 		chk->whoTo = net;
 		if (chk->whoTo) {
 			atomic_add_int(&chk->whoTo->ref_count, 1);
 		}
 		shutdown_cp = mtod(m_shutdown, struct sctp_shutdown_chunk *);
 		shutdown_cp->ch.chunk_type = SCTP_SHUTDOWN;
 		shutdown_cp->ch.chunk_flags = 0;
 		shutdown_cp->ch.chunk_length = htons(chk->send_size);
 		shutdown_cp->cumulative_tsn_ack = htonl(stcb->asoc.cumulative_tsn);
 		SCTP_BUF_LEN(m_shutdown) = chk->send_size;
 		TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next);
 		chk->asoc->ctrl_queue_cnt++;
 	} else {
 		TAILQ_REMOVE(&stcb->asoc.control_send_queue, chk, sctp_next);
 		chk->whoTo = net;
 		if (chk->whoTo) {
 			atomic_add_int(&chk->whoTo->ref_count, 1);
 		}
 		shutdown_cp = mtod(chk->data, struct sctp_shutdown_chunk *);
 		shutdown_cp->cumulative_tsn_ack = htonl(stcb->asoc.cumulative_tsn);
 		TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next);
 	}
 	return;
 }
 
 void
 sctp_send_asconf(struct sctp_tcb *stcb, struct sctp_nets *net, int addr_locked)
 {
 	/*
 	 * formulate and queue an ASCONF to the peer. ASCONF parameters
 	 * should be queued on the assoc queue.
 	 */
 	struct sctp_tmit_chunk *chk;
 	struct mbuf *m_asconf;
 	int len;
 
 	SCTP_TCB_LOCK_ASSERT(stcb);
 
 	if ((!TAILQ_EMPTY(&stcb->asoc.asconf_send_queue)) &&
 	    (!sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS))) {
 		/* can't send a new one if there is one in flight already */
 		return;
 	}
 
 	/* compose an ASCONF chunk, maximum length is PMTU */
 	m_asconf = sctp_compose_asconf(stcb, &len, addr_locked);
 	if (m_asconf == NULL) {
 		return;
 	}
 
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		/* no memory */
 		sctp_m_freem(m_asconf);
 		return;
 	}
 
 	chk->copy_by_ref = 0;
 	chk->rec.chunk_id.id = SCTP_ASCONF;
 	chk->rec.chunk_id.can_take_data = 0;
 	chk->flags = CHUNK_FLAGS_FRAGMENT_OK;
 	chk->data = m_asconf;
 	chk->send_size = len;
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 	chk->snd_count = 0;
 	chk->asoc = &stcb->asoc;
 	chk->whoTo = net;
 	if (chk->whoTo) {
 		atomic_add_int(&chk->whoTo->ref_count, 1);
 	}
 	TAILQ_INSERT_TAIL(&chk->asoc->asconf_send_queue, chk, sctp_next);
 	chk->asoc->ctrl_queue_cnt++;
 	return;
 }
 
 void
 sctp_send_asconf_ack(struct sctp_tcb *stcb)
 {
 	/*
 	 * formulate and queue a asconf-ack back to sender. the asconf-ack
 	 * must be stored in the tcb.
 	 */
 	struct sctp_tmit_chunk *chk;
 	struct sctp_asconf_ack *ack, *latest_ack;
 	struct mbuf *m_ack;
 	struct sctp_nets *net = NULL;
 
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	/* Get the latest ASCONF-ACK */
 	latest_ack = TAILQ_LAST(&stcb->asoc.asconf_ack_sent, sctp_asconf_ackhead);
 	if (latest_ack == NULL) {
 		return;
 	}
 	if (latest_ack->last_sent_to != NULL &&
 	    latest_ack->last_sent_to == stcb->asoc.last_control_chunk_from) {
 		/* we're doing a retransmission */
 		net = sctp_find_alternate_net(stcb, stcb->asoc.last_control_chunk_from, 0);
 		if (net == NULL) {
 			/* no alternate */
 			if (stcb->asoc.last_control_chunk_from == NULL) {
 				if (stcb->asoc.alternate) {
 					net = stcb->asoc.alternate;
 				} else {
 					net = stcb->asoc.primary_destination;
 				}
 			} else {
 				net = stcb->asoc.last_control_chunk_from;
 			}
 		}
 	} else {
 		/* normal case */
 		if (stcb->asoc.last_control_chunk_from == NULL) {
 			if (stcb->asoc.alternate) {
 				net = stcb->asoc.alternate;
 			} else {
 				net = stcb->asoc.primary_destination;
 			}
 		} else {
 			net = stcb->asoc.last_control_chunk_from;
 		}
 	}
 	latest_ack->last_sent_to = net;
 
 	TAILQ_FOREACH(ack, &stcb->asoc.asconf_ack_sent, next) {
 		if (ack->data == NULL) {
 			continue;
 		}
 
 		/* copy the asconf_ack */
 		m_ack = SCTP_M_COPYM(ack->data, 0, M_COPYALL, M_NOWAIT);
 		if (m_ack == NULL) {
 			/* couldn't copy it */
 			return;
 		}
 #ifdef SCTP_MBUF_LOGGING
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
 			sctp_log_mbc(m_ack, SCTP_MBUF_ICOPY);
 		}
 #endif
 
 		sctp_alloc_a_chunk(stcb, chk);
 		if (chk == NULL) {
 			/* no memory */
 			if (m_ack)
 				sctp_m_freem(m_ack);
 			return;
 		}
 		chk->copy_by_ref = 0;
 		chk->rec.chunk_id.id = SCTP_ASCONF_ACK;
 		chk->rec.chunk_id.can_take_data = 1;
 		chk->flags = CHUNK_FLAGS_FRAGMENT_OK;
 		chk->whoTo = net;
 		if (chk->whoTo) {
 			atomic_add_int(&chk->whoTo->ref_count, 1);
 		}
 		chk->data = m_ack;
 		chk->send_size = ack->len;
 		chk->sent = SCTP_DATAGRAM_UNSENT;
 		chk->snd_count = 0;
 		chk->asoc = &stcb->asoc;
 
 		TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next);
 		chk->asoc->ctrl_queue_cnt++;
 	}
 	return;
 }
 
 
 static int
 sctp_chunk_retransmission(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     struct sctp_association *asoc,
     int *cnt_out, struct timeval *now, int *now_filled, int *fr_done, int so_locked)
 {
 	/*-
 	 * send out one MTU of retransmission. If fast_retransmit is
 	 * happening we ignore the cwnd. Otherwise we obey the cwnd and
 	 * rwnd. For a Cookie or Asconf in the control chunk queue we
 	 * retransmit them by themselves.
 	 *
 	 * For data chunks we will pick out the lowest TSN's in the sent_queue
 	 * marked for resend and bundle them all together (up to a MTU of
 	 * destination). The address to send to should have been
 	 * selected/changed where the retransmission was marked (i.e. in FR
 	 * or t3-timeout routines).
 	 */
 	struct sctp_tmit_chunk *data_list[SCTP_MAX_DATA_BUNDLING];
 	struct sctp_tmit_chunk *chk, *fwd;
 	struct mbuf *m, *endofchain;
 	struct sctp_nets *net = NULL;
 	uint32_t tsns_sent = 0;
 	int no_fragmentflg, bundle_at, cnt_thru;
 	unsigned int mtu;
 	int error, i, one_chunk, fwd_tsn, ctl_cnt, tmr_started;
 	struct sctp_auth_chunk *auth = NULL;
 	uint32_t auth_offset = 0;
 	uint16_t auth_keyid;
 	int override_ok = 1;
 	int data_auth_reqd = 0;
 	uint32_t dmtu = 0;
 
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	tmr_started = ctl_cnt = bundle_at = error = 0;
 	no_fragmentflg = 1;
 	fwd_tsn = 0;
 	*cnt_out = 0;
 	fwd = NULL;
 	endofchain = m = NULL;
 	auth_keyid = stcb->asoc.authinfo.active_keyid;
 #ifdef SCTP_AUDITING_ENABLED
 	sctp_audit_log(0xC3, 1);
 #endif
 	if ((TAILQ_EMPTY(&asoc->sent_queue)) &&
 	    (TAILQ_EMPTY(&asoc->control_send_queue))) {
 		SCTPDBG(SCTP_DEBUG_OUTPUT1, "SCTP hits empty queue with cnt set to %d?\n",
 		    asoc->sent_queue_retran_cnt);
 		asoc->sent_queue_cnt = 0;
 		asoc->sent_queue_cnt_removeable = 0;
 		/* send back 0/0 so we enter normal transmission */
 		*cnt_out = 0;
 		return (0);
 	}
 	TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
 		if ((chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) ||
 		    (chk->rec.chunk_id.id == SCTP_STREAM_RESET) ||
 		    (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN)) {
 			if (chk->sent != SCTP_DATAGRAM_RESEND) {
 				continue;
 			}
 			if (chk->rec.chunk_id.id == SCTP_STREAM_RESET) {
 				if (chk != asoc->str_reset) {
 					/*
 					 * not eligible for retran if its
 					 * not ours
 					 */
 					continue;
 				}
 			}
 			ctl_cnt++;
 			if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) {
 				fwd_tsn = 1;
 			}
 			/*
 			 * Add an AUTH chunk, if chunk requires it save the
 			 * offset into the chain for AUTH
 			 */
 			if ((auth == NULL) &&
 			    (sctp_auth_is_required_chunk(chk->rec.chunk_id.id,
 			    stcb->asoc.peer_auth_chunks))) {
 				m = sctp_add_auth_chunk(m, &endofchain,
 				    &auth, &auth_offset,
 				    stcb,
 				    chk->rec.chunk_id.id);
 				SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
 			}
 			m = sctp_copy_mbufchain(chk->data, m, &endofchain, 0, chk->send_size, chk->copy_by_ref);
 			break;
 		}
 	}
 	one_chunk = 0;
 	cnt_thru = 0;
 	/* do we have control chunks to retransmit? */
 	if (m != NULL) {
 		/* Start a timer no matter if we succeed or fail */
 		if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) {
 			sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, chk->whoTo);
 		} else if (chk->rec.chunk_id.id == SCTP_ASCONF)
 			sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, chk->whoTo);
 		chk->snd_count++;	/* update our count */
 		if ((error = sctp_lowlevel_chunk_output(inp, stcb, chk->whoTo,
 		    (struct sockaddr *)&chk->whoTo->ro._l_addr, m,
 		    auth_offset, auth, stcb->asoc.authinfo.active_keyid,
 		    no_fragmentflg, 0, 0,
 		    inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag),
 		    chk->whoTo->port, NULL,
 		    0, 0,
 		    so_locked))) {
 			SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error);
 			if (error == ENOBUFS) {
 				asoc->ifp_had_enobuf = 1;
 				SCTP_STAT_INCR(sctps_lowlevelerr);
 			}
 			return (error);
 		} else {
 			asoc->ifp_had_enobuf = 0;
 		}
 		endofchain = NULL;
 		auth = NULL;
 		auth_offset = 0;
 		/*
 		 * We don't want to mark the net->sent time here since this
 		 * we use this for HB and retrans cannot measure RTT
 		 */
 		/* (void)SCTP_GETTIME_TIMEVAL(&chk->whoTo->last_sent_time); */
 		*cnt_out += 1;
 		chk->sent = SCTP_DATAGRAM_SENT;
 		sctp_ucount_decr(stcb->asoc.sent_queue_retran_cnt);
 		if (fwd_tsn == 0) {
 			return (0);
 		} else {
 			/* Clean up the fwd-tsn list */
 			sctp_clean_up_ctl(stcb, asoc, so_locked);
 			return (0);
 		}
 	}
 	/*
 	 * Ok, it is just data retransmission we need to do or that and a
 	 * fwd-tsn with it all.
 	 */
 	if (TAILQ_EMPTY(&asoc->sent_queue)) {
 		return (SCTP_RETRAN_DONE);
 	}
 	if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT)) {
 		/* not yet open, resend the cookie and that is it */
 		return (1);
 	}
 #ifdef SCTP_AUDITING_ENABLED
 	sctp_auditing(20, inp, stcb, NULL);
 #endif
 	data_auth_reqd = sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks);
 	TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
 		if (chk->sent != SCTP_DATAGRAM_RESEND) {
 			/* No, not sent to this net or not ready for rtx */
 			continue;
 		}
 		if (chk->data == NULL) {
 			SCTP_PRINTF("TSN:%x chk->snd_count:%d chk->sent:%d can't retran - no data\n",
 			    chk->rec.data.tsn, chk->snd_count, chk->sent);
 			continue;
 		}
 		if ((SCTP_BASE_SYSCTL(sctp_max_retran_chunk)) &&
 		    (chk->snd_count >= SCTP_BASE_SYSCTL(sctp_max_retran_chunk))) {
 			struct mbuf *op_err;
 			char msg[SCTP_DIAG_INFO_LEN];
 
 			SCTP_SNPRINTF(msg, sizeof(msg), "TSN %8.8x retransmitted %d times, giving up",
 			    chk->rec.data.tsn, chk->snd_count);
 			op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 			    msg);
 			atomic_add_int(&stcb->asoc.refcnt, 1);
 			sctp_abort_an_association(stcb->sctp_ep, stcb, op_err,
 			    so_locked);
 			SCTP_TCB_LOCK(stcb);
 			atomic_subtract_int(&stcb->asoc.refcnt, 1);
 			return (SCTP_RETRAN_EXIT);
 		}
 		/* pick up the net */
 		net = chk->whoTo;
 		switch (net->ro._l_addr.sa.sa_family) {
 #ifdef INET
 		case AF_INET:
 			mtu = net->mtu - SCTP_MIN_V4_OVERHEAD;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			mtu = net->mtu - SCTP_MIN_OVERHEAD;
 			break;
 #endif
 		default:
 			/* TSNH */
 			mtu = net->mtu;
 			break;
 		}
 
 		if ((asoc->peers_rwnd < mtu) && (asoc->total_flight > 0)) {
 			/* No room in peers rwnd */
 			uint32_t tsn;
 
 			tsn = asoc->last_acked_seq + 1;
 			if (tsn == chk->rec.data.tsn) {
 				/*
 				 * we make a special exception for this
 				 * case. The peer has no rwnd but is missing
 				 * the lowest chunk.. which is probably what
 				 * is holding up the rwnd.
 				 */
 				goto one_chunk_around;
 			}
 			return (1);
 		}
 one_chunk_around:
 		if (asoc->peers_rwnd < mtu) {
 			one_chunk = 1;
 			if ((asoc->peers_rwnd == 0) &&
 			    (asoc->total_flight == 0)) {
 				chk->window_probe = 1;
 				chk->whoTo->window_probe = 1;
 			}
 		}
 #ifdef SCTP_AUDITING_ENABLED
 		sctp_audit_log(0xC3, 2);
 #endif
 		bundle_at = 0;
 		m = NULL;
 		net->fast_retran_ip = 0;
 		if (chk->rec.data.doing_fast_retransmit == 0) {
 			/*
 			 * if no FR in progress skip destination that have
 			 * flight_size > cwnd.
 			 */
 			if (net->flight_size >= net->cwnd) {
 				continue;
 			}
 		} else {
 			/*
 			 * Mark the destination net to have FR recovery
 			 * limits put on it.
 			 */
 			*fr_done = 1;
 			net->fast_retran_ip = 1;
 		}
 
 		/*
 		 * if no AUTH is yet included and this chunk requires it,
 		 * make sure to account for it.  We don't apply the size
 		 * until the AUTH chunk is actually added below in case
 		 * there is no room for this chunk.
 		 */
 		if (data_auth_reqd && (auth == NULL)) {
 			dmtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id);
 		} else
 			dmtu = 0;
 
 		if ((chk->send_size <= (mtu - dmtu)) ||
 		    (chk->flags & CHUNK_FLAGS_FRAGMENT_OK)) {
 			/* ok we will add this one */
 			if (data_auth_reqd) {
 				if (auth == NULL) {
 					m = sctp_add_auth_chunk(m,
 					    &endofchain,
 					    &auth,
 					    &auth_offset,
 					    stcb,
 					    SCTP_DATA);
 					auth_keyid = chk->auth_keyid;
 					override_ok = 0;
 					SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
 				} else if (override_ok) {
 					auth_keyid = chk->auth_keyid;
 					override_ok = 0;
 				} else if (chk->auth_keyid != auth_keyid) {
 					/* different keyid, so done bundling */
 					break;
 				}
 			}
 			m = sctp_copy_mbufchain(chk->data, m, &endofchain, 0, chk->send_size, chk->copy_by_ref);
 			if (m == NULL) {
 				SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 				return (ENOMEM);
 			}
 			/* Do clear IP_DF ? */
 			if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) {
 				no_fragmentflg = 0;
 			}
 			/* upate our MTU size */
 			if (mtu > (chk->send_size + dmtu))
 				mtu -= (chk->send_size + dmtu);
 			else
 				mtu = 0;
 			data_list[bundle_at++] = chk;
 			if (one_chunk && (asoc->total_flight <= 0)) {
 				SCTP_STAT_INCR(sctps_windowprobed);
 			}
 		}
 		if (one_chunk == 0) {
 			/*
 			 * now are there anymore forward from chk to pick
 			 * up?
 			 */
 			for (fwd = TAILQ_NEXT(chk, sctp_next); fwd != NULL; fwd = TAILQ_NEXT(fwd, sctp_next)) {
 				if (fwd->sent != SCTP_DATAGRAM_RESEND) {
 					/* Nope, not for retran */
 					continue;
 				}
 				if (fwd->whoTo != net) {
 					/* Nope, not the net in question */
 					continue;
 				}
 				if (data_auth_reqd && (auth == NULL)) {
 					dmtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id);
 				} else
 					dmtu = 0;
 				if (fwd->send_size <= (mtu - dmtu)) {
 					if (data_auth_reqd) {
 						if (auth == NULL) {
 							m = sctp_add_auth_chunk(m,
 							    &endofchain,
 							    &auth,
 							    &auth_offset,
 							    stcb,
 							    SCTP_DATA);
 							auth_keyid = fwd->auth_keyid;
 							override_ok = 0;
 							SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
 						} else if (override_ok) {
 							auth_keyid = fwd->auth_keyid;
 							override_ok = 0;
 						} else if (fwd->auth_keyid != auth_keyid) {
 							/*
 							 * different keyid,
 							 * so done bundling
 							 */
 							break;
 						}
 					}
 					m = sctp_copy_mbufchain(fwd->data, m, &endofchain, 0, fwd->send_size, fwd->copy_by_ref);
 					if (m == NULL) {
 						SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 						return (ENOMEM);
 					}
 					/* Do clear IP_DF ? */
 					if (fwd->flags & CHUNK_FLAGS_FRAGMENT_OK) {
 						no_fragmentflg = 0;
 					}
 					/* upate our MTU size */
 					if (mtu > (fwd->send_size + dmtu))
 						mtu -= (fwd->send_size + dmtu);
 					else
 						mtu = 0;
 					data_list[bundle_at++] = fwd;
 					if (bundle_at >= SCTP_MAX_DATA_BUNDLING) {
 						break;
 					}
 				} else {
 					/* can't fit so we are done */
 					break;
 				}
 			}
 		}
 		/* Is there something to send for this destination? */
 		if (m) {
 			/*
 			 * No matter if we fail/or succeed we should start a
 			 * timer. A failure is like a lost IP packet :-)
 			 */
 			if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
 				/*
 				 * no timer running on this destination
 				 * restart it.
 				 */
 				sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net);
 				tmr_started = 1;
 			}
 			/* Now lets send it, if there is anything to send :> */
 			if ((error = sctp_lowlevel_chunk_output(inp, stcb, net,
 			    (struct sockaddr *)&net->ro._l_addr, m,
 			    auth_offset, auth, auth_keyid,
 			    no_fragmentflg, 0, 0,
 			    inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag),
 			    net->port, NULL,
 			    0, 0,
 			    so_locked))) {
 				/* error, we could not output */
 				SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error);
 				if (error == ENOBUFS) {
 					asoc->ifp_had_enobuf = 1;
 					SCTP_STAT_INCR(sctps_lowlevelerr);
 				}
 				return (error);
 			} else {
 				asoc->ifp_had_enobuf = 0;
 			}
 			endofchain = NULL;
 			auth = NULL;
 			auth_offset = 0;
 			/* For HB's */
 			/*
 			 * We don't want to mark the net->sent time here
 			 * since this we use this for HB and retrans cannot
 			 * measure RTT
 			 */
 			/* (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); */
 
 			/* For auto-close */
 			cnt_thru++;
 			if (*now_filled == 0) {
 				(void)SCTP_GETTIME_TIMEVAL(&asoc->time_last_sent);
 				*now = asoc->time_last_sent;
 				*now_filled = 1;
 			} else {
 				asoc->time_last_sent = *now;
 			}
 			*cnt_out += bundle_at;
 #ifdef SCTP_AUDITING_ENABLED
 			sctp_audit_log(0xC4, bundle_at);
 #endif
 			if (bundle_at) {
 				tsns_sent = data_list[0]->rec.data.tsn;
 			}
 			for (i = 0; i < bundle_at; i++) {
 				SCTP_STAT_INCR(sctps_sendretransdata);
 				data_list[i]->sent = SCTP_DATAGRAM_SENT;
 				/*
 				 * When we have a revoked data, and we
 				 * retransmit it, then we clear the revoked
 				 * flag since this flag dictates if we
 				 * subtracted from the fs
 				 */
 				if (data_list[i]->rec.data.chunk_was_revoked) {
 					/* Deflate the cwnd */
 					data_list[i]->whoTo->cwnd -= data_list[i]->book_size;
 					data_list[i]->rec.data.chunk_was_revoked = 0;
 				}
 				data_list[i]->snd_count++;
 				sctp_ucount_decr(asoc->sent_queue_retran_cnt);
 				/* record the time */
 				data_list[i]->sent_rcv_time = asoc->time_last_sent;
 				if (data_list[i]->book_size_scale) {
 					/*
 					 * need to double the book size on
 					 * this one
 					 */
 					data_list[i]->book_size_scale = 0;
 					/*
 					 * Since we double the booksize, we
 					 * must also double the output queue
 					 * size, since this get shrunk when
 					 * we free by this amount.
 					 */
 					atomic_add_int(&((asoc)->total_output_queue_size), data_list[i]->book_size);
 					data_list[i]->book_size *= 2;
 
 
 				} else {
 					if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) {
 						sctp_log_rwnd(SCTP_DECREASE_PEER_RWND,
 						    asoc->peers_rwnd, data_list[i]->send_size, SCTP_BASE_SYSCTL(sctp_peer_chunk_oh));
 					}
 					asoc->peers_rwnd = sctp_sbspace_sub(asoc->peers_rwnd,
 					    (uint32_t)(data_list[i]->send_size +
 					    SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)));
 				}
 				if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
 					sctp_misc_ints(SCTP_FLIGHT_LOG_UP_RSND,
 					    data_list[i]->whoTo->flight_size,
 					    data_list[i]->book_size,
 					    (uint32_t)(uintptr_t)data_list[i]->whoTo,
 					    data_list[i]->rec.data.tsn);
 				}
 				sctp_flight_size_increase(data_list[i]);
 				sctp_total_flight_increase(stcb, data_list[i]);
 				if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) {
 					/* SWS sender side engages */
 					asoc->peers_rwnd = 0;
 				}
 				if ((i == 0) &&
 				    (data_list[i]->rec.data.doing_fast_retransmit)) {
 					SCTP_STAT_INCR(sctps_sendfastretrans);
 					if ((data_list[i] == TAILQ_FIRST(&asoc->sent_queue)) &&
 					    (tmr_started == 0)) {
 						/*-
 						 * ok we just fast-retrans'd
 						 * the lowest TSN, i.e the
 						 * first on the list. In
 						 * this case we want to give
 						 * some more time to get a
 						 * SACK back without a
 						 * t3-expiring.
 						 */
 						sctp_timer_stop(SCTP_TIMER_TYPE_SEND, inp, stcb, net,
 						    SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_2);
 						sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net);
 					}
 				}
 			}
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
 				sctp_log_cwnd(stcb, net, tsns_sent, SCTP_CWND_LOG_FROM_RESEND);
 			}
 #ifdef SCTP_AUDITING_ENABLED
 			sctp_auditing(21, inp, stcb, NULL);
 #endif
 		} else {
 			/* None will fit */
 			return (1);
 		}
 		if (asoc->sent_queue_retran_cnt <= 0) {
 			/* all done we have no more to retran */
 			asoc->sent_queue_retran_cnt = 0;
 			break;
 		}
 		if (one_chunk) {
 			/* No more room in rwnd */
 			return (1);
 		}
 		/* stop the for loop here. we sent out a packet */
 		break;
 	}
 	return (0);
 }
 
 static void
 sctp_timer_validation(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     struct sctp_association *asoc)
 {
 	struct sctp_nets *net;
 
 	/* Validate that a timer is running somewhere */
 	TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
 		if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
 			/* Here is a timer */
 			return;
 		}
 	}
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	/* Gak, we did not have a timer somewhere */
 	SCTPDBG(SCTP_DEBUG_OUTPUT3, "Deadlock avoided starting timer on a dest at retran\n");
 	if (asoc->alternate) {
 		sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, asoc->alternate);
 	} else {
 		sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, asoc->primary_destination);
 	}
 	return;
 }
 
 /*
  * Generic chunk output routine for one association.
  *
  * inp        - endpoint, handed on to the retransmission and medium-level
  *              output routines.
  * stcb       - association control block; SCTP_TCB_LOCK_ASSERT() is applied
  *              to it on every pass.
  * from_where - SCTP_OUTPUT_FROM_* code identifying the caller.  The values
  *              tested here are USR_SEND (the only one for which Nagle may
  *              be applied), COOKIE_ACK, HB_TMR and T3.
  * so_locked  - handed on unchanged to the routines called from here.
  *
  * Nothing is returned; an error from the medium-level output only ends the
  * output loop.
  */
 void
 sctp_chunk_output(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     int from_where,
     int so_locked)
 {
 	/*-
 	 * Ok this is the generic chunk service queue. We must do the
 	 * following:
 	 * - See if there are retransmits pending, if so we must
 	 *   do these first.
 	 * - Service the stream queue that is next, moving any
 	 *   message (note I must get a complete message i.e.
 	 *   FIRST/MIDDLE and LAST to the out queue in one pass) and assigning
 	 *   TSN's
 	 * - Check to see if the cwnd/rwnd allows any output, if so we
 	 *   go ahead and formulate and send the low level chunks. Making sure
 	 *   to combine any control in the control chunk queue also.
 	 */
 	struct sctp_association *asoc;
 	struct sctp_nets *net;
 	int error = 0, num_out, tot_out = 0, ret = 0, reason_code;
 	unsigned int burst_cnt = 0;
 	struct timeval now;
 	int now_filled = 0;
 	int nagle_on;
 	int frag_point = sctp_get_frag_point(stcb, &stcb->asoc);
 	int un_sent = 0;
 	int fr_done;
 	unsigned int tot_frs = 0;
 
 	asoc = &stcb->asoc;
 do_it_again:
 	/* The Nagle algorithm is only applied when handling a send call. */
 	if (from_where == SCTP_OUTPUT_FROM_USR_SEND) {
 		if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NODELAY)) {
 			nagle_on = 0;
 		} else {
 			nagle_on = 1;
 		}
 	} else {
 		nagle_on = 0;
 	}
 	SCTP_TCB_LOCK_ASSERT(stcb);
 
 	/* Amount on the output queue that is not currently in flight. */
 	un_sent = (stcb->asoc.total_output_queue_size - stcb->asoc.total_flight);
 
 	if ((un_sent <= 0) &&
 	    (TAILQ_EMPTY(&asoc->control_send_queue)) &&
 	    (TAILQ_EMPTY(&asoc->asconf_send_queue)) &&
 	    (asoc->sent_queue_retran_cnt == 0) &&
 	    (asoc->trigger_reset == 0)) {
 		/* Nothing to do unless there is something to be sent left */
 		return;
 	}
 	/*
 	 * Do we have something to send, data or control AND a sack timer
 	 * running, if so piggy-back the sack.
 	 */
 	if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) {
 		sctp_send_sack(stcb, so_locked);
 		sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL,
 		    SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_3);
 	}
 	/* Retransmissions come first, for as long as any are marked. */
 	while (asoc->sent_queue_retran_cnt) {
 		/*-
 		 * Ok, it is retransmission time only, we send out only ONE
 		 * packet with a single call off to the retran code.
 		 */
 		if (from_where == SCTP_OUTPUT_FROM_COOKIE_ACK) {
 			/*-
 			 * Special hook for handling cookies discarded
 			 * by peer that carried data. Send cookie-ack only
 			 * and then the next call will get the retran's.
 			 */
 			(void)sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 1,
 			    from_where,
 			    &now, &now_filled, frag_point, so_locked);
 			return;
 		} else if (from_where != SCTP_OUTPUT_FROM_HB_TMR) {
 			/* if its not from a HB then do it */
 			fr_done = 0;
 			ret = sctp_chunk_retransmission(inp, stcb, asoc, &num_out, &now, &now_filled, &fr_done, so_locked);
 			if (fr_done) {
 				/* Count fast retransmits against fr_max_burst. */
 				tot_frs++;
 			}
 		} else {
 			/*
 			 * It is from the HB timer: we don't allow retran
 			 * output (only control). Setting ret to 1 takes the
 			 * "can't send anymore" path below.
 			 */
 			ret = 1;
 		}
 		if (ret > 0) {
 			/* Can't send anymore */
 			/*-
 			 * now lets push out control by calling med-level
 			 * output once. this assures that we WILL send HB's
 			 * if queued too.
 			 */
 			(void)sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 1,
 			    from_where,
 			    &now, &now_filled, frag_point, so_locked);
 #ifdef SCTP_AUDITING_ENABLED
 			sctp_auditing(8, inp, stcb, NULL);
 #endif
 			sctp_timer_validation(inp, stcb, asoc);
 			return;
 		}
 		if (ret < 0) {
 			/*-
 			 * The count was off.. retran is not happening so do
 			 * the normal retransmission.
 			 */
 #ifdef SCTP_AUDITING_ENABLED
 			sctp_auditing(9, inp, stcb, NULL);
 #endif
 			if (ret == SCTP_RETRAN_EXIT) {
 				return;
 			}
 			break;
 		}
 		if (from_where == SCTP_OUTPUT_FROM_T3) {
 			/* Only one transmission allowed out of a timeout */
 #ifdef SCTP_AUDITING_ENABLED
 			sctp_auditing(10, inp, stcb, NULL);
 #endif
 			/* Push out any control */
 			(void)sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 1, from_where,
 			    &now, &now_filled, frag_point, so_locked);
 			return;
 		}
 		if ((asoc->fr_max_burst > 0) && (tot_frs >= asoc->fr_max_burst)) {
 			/* Hit FR burst limit */
 			return;
 		}
 		if ((num_out == 0) && (ret == 0)) {
 			/* No more retrans to send */
 			break;
 		}
 	}
 #ifdef SCTP_AUDITING_ENABLED
 	sctp_auditing(12, inp, stcb, NULL);
 #endif
 	/* Check for bad destinations, if they exist move chunks around. */
 	TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
 		if (!(net->dest_state & SCTP_ADDR_REACHABLE)) {
 			/*-
 			 * if possible move things off of this address we
 			 * still may send below due to the dormant state but
 			 * we try to find an alternate address to send to
 			 * and if we have one we move all queued data on the
 			 * out wheel to this alternate address.
 			 */
 			if (net->ref_count > 1)
 				sctp_move_chunks_from_net(stcb, net);
 		} else {
 			/*-
 			 * if ((asoc->sat_network) || (net->addr_is_local))
 			 * { burst_limit = asoc->max_burst *
 			 * SCTP_SAT_NETWORK_BURST_INCR; }
 			 */
 			if (asoc->max_burst > 0) {
 				if (SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst)) {
 					if ((net->flight_size + (asoc->max_burst * net->mtu)) < net->cwnd) {
 						/*
 						 * JRS - Use the congestion
 						 * control given in the
 						 * congestion control module
 						 */
 						asoc->cc_functions.sctp_cwnd_update_after_output(stcb, net, asoc->max_burst);
 						if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) {
 							sctp_log_maxburst(stcb, net, 0, asoc->max_burst, SCTP_MAX_BURST_APPLIED);
 						}
 						SCTP_STAT_INCR(sctps_maxburstqueued);
 					}
 					net->fast_retran_ip = 0;
 				} else {
 					if (net->flight_size == 0) {
 						/*
 						 * Should be decaying the
 						 * cwnd here
 						 */
 						;
 					}
 				}
 			}
 		}
 
 	}
 	/*
 	 * Normal output: call the medium-level output repeatedly. The loop
 	 * ends on an error, when Nagle holds data back, when nothing is left
 	 * to send, when a pass put out nothing, or when the count-based
 	 * burst limit is reached.
 	 */
 	burst_cnt = 0;
 	do {
 		error = sctp_med_chunk_output(inp, stcb, asoc, &num_out,
 		    &reason_code, 0, from_where,
 		    &now, &now_filled, frag_point, so_locked);
 		if (error) {
 			SCTPDBG(SCTP_DEBUG_OUTPUT1, "Error %d was returned from med-c-op\n", error);
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) {
 				sctp_log_maxburst(stcb, asoc->primary_destination, error, burst_cnt, SCTP_MAX_BURST_ERROR_STOP);
 			}
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
 				sctp_log_cwnd(stcb, NULL, error, SCTP_SEND_NOW_COMPLETES);
 				sctp_log_cwnd(stcb, NULL, 0xdeadbeef, SCTP_SEND_NOW_COMPLETES);
 			}
 			break;
 		}
 		SCTPDBG(SCTP_DEBUG_OUTPUT3, "m-c-o put out %d\n", num_out);
 
 		tot_out += num_out;
 		burst_cnt++;
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
 			sctp_log_cwnd(stcb, NULL, num_out, SCTP_SEND_NOW_COMPLETES);
 			if (num_out == 0) {
 				sctp_log_cwnd(stcb, NULL, reason_code, SCTP_SEND_NOW_COMPLETES);
 			}
 		}
 		if (nagle_on) {
 			/*
 			 * When the Nagle algorithm is used, look at how
 			 * much is unsent, then if its smaller than an MTU
 			 * and we have data in flight we stop, except if we
 			 * are handling a fragmented user message.
 			 */
 			un_sent = stcb->asoc.total_output_queue_size - stcb->asoc.total_flight;
 			if ((un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD)) &&
 			    (stcb->asoc.total_flight > 0)) {
 /*	&&		     sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR))) {*/
 				break;
 			}
 		}
 		if (TAILQ_EMPTY(&asoc->control_send_queue) &&
 		    TAILQ_EMPTY(&asoc->send_queue) &&
 		    sctp_is_there_unsent_data(stcb, so_locked) == 0) {
 			/* Nothing left to send */
 			break;
 		}
 		if ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) <= 0) {
 			/* Nothing left to send */
 			break;
 		}
 	} while (num_out &&
 	    ((asoc->max_burst == 0) ||
 	    SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst) ||
 	    (burst_cnt < asoc->max_burst)));
 
 	/*
 	 * With the count-based (non-cwnd) max burst, record whether the
 	 * limit was hit during this call.
 	 */
 	if (SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst) == 0) {
 		if ((asoc->max_burst > 0) && (burst_cnt >= asoc->max_burst)) {
 			SCTP_STAT_INCR(sctps_maxburstqueued);
 			asoc->burst_limit_applied = 1;
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) {
 				sctp_log_maxburst(stcb, asoc->primary_destination, 0, burst_cnt, SCTP_MAX_BURST_APPLIED);
 			}
 		} else {
 			asoc->burst_limit_applied = 0;
 		}
 	}
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
 		sctp_log_cwnd(stcb, NULL, tot_out, SCTP_SEND_NOW_COMPLETES);
 	}
 	SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, we have put out %d chunks\n",
 	    tot_out);
 
 	/*-
 	 * Now we need to clean up the control chunk chain if a ECNE is on
 	 * it. It must be marked as UNSENT again so next call will continue
 	 * to send it until such time that we get a CWR, to remove it.
 	 */
 	if (stcb->asoc.ecn_echo_cnt_onq)
 		sctp_fix_ecn_echo(asoc);
 
 	/*
 	 * A stream reset is waiting to be triggered. When the helper returns
 	 * 0 the whole output pass is run again.
 	 * NOTE(review): presumably 0 means a reset request was queued and
 	 * now needs sending -- confirm against the helper.
 	 */
 	if (stcb->asoc.trigger_reset) {
 		if (sctp_send_stream_reset_out_if_possible(stcb, so_locked) == 0) {
 			goto do_it_again;
 		}
 	}
 	return;
 }
 
 
 /*
  * Thin front end to sctp_sosend(): validates the endpoint and its socket and
  * then hands the data mbuf, control mbuf, address, flags and thread on with
  * no uio.  Returns EINVAL when the endpoint or its socket is missing,
  * otherwise whatever sctp_sosend() returns.
  */
 int
 sctp_output(
     struct sctp_inpcb *inp,
     struct mbuf *m,
     struct sockaddr *addr,
     struct mbuf *control,
     struct thread *p,
     int flags)
 {
 	struct socket *so;
 
 	/* An endpoint is required. */
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 		return (EINVAL);
 	}
 
 	/* ... and it must still have a socket attached. */
 	so = inp->sctp_socket;
 	if (so == NULL) {
 		SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 		return (EINVAL);
 	}
 	return (sctp_sosend(so, addr, (struct uio *)NULL, m, control, flags, p));
 }
 
 /*
  * Build, or rebuild in place, the forward-TSN control chunk of an
  * association and leave it marked unsent on the control send queue.
  *
  * stcb - association control block; SCTP_TCB_LOCK_ASSERT() is applied to it.
  * asoc - the association whose control_send_queue and sent_queue are used.
  *
  * A chunk already on the control queue with id SCTP_FORWARD_CUM_TSN is
  * reused; otherwise a new chunk and mbuf are allocated and appended. On an
  * allocation failure the function returns without queueing anything. The
  * chunk type written is SCTP_IFORWARD_CUM_TSN when asoc->idata_supported is
  * set and SCTP_FORWARD_CUM_TSN otherwise.
  */
 void
 send_forward_tsn(struct sctp_tcb *stcb,
     struct sctp_association *asoc)
 {
 	struct sctp_tmit_chunk *chk, *at, *tp1, *last;
 	struct sctp_forward_tsn_chunk *fwdtsn;
 	struct sctp_strseq *strseq;
 	struct sctp_strseq_mid *strseq_m;
 	uint32_t advance_peer_ack_point;
 	unsigned int cnt_of_space, i, ovh;
 	unsigned int space_needed;
 	unsigned int cnt_of_skipped = 0;
 
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	/* Reuse a forward-TSN chunk that is already queued, if there is one. */
 	TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
 		if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) {
 			/* mark it to unsent */
 			chk->sent = SCTP_DATAGRAM_UNSENT;
 			chk->snd_count = 0;
 			/* Do we correct its output location? */
 			if (chk->whoTo) {
 				sctp_free_remote_addr(chk->whoTo);
 				chk->whoTo = NULL;
 			}
 			goto sctp_fill_in_rest;
 		}
 	}
 	/* Ok if we reach here we must build one */
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		return;
 	}
 	asoc->fwd_tsn_cnt++;
 	chk->copy_by_ref = 0;
 	/*
 	 * We don't do the old thing here since this is used not for on-wire
 	 * but to tell if we are sending a fwd-tsn by the stack during
 	 * output. And if its a IFORWARD or a FORWARD it is a fwd-tsn.
 	 */
 	chk->rec.chunk_id.id = SCTP_FORWARD_CUM_TSN;
 	chk->rec.chunk_id.can_take_data = 0;
 	chk->flags = 0;
 	chk->asoc = asoc;
 	chk->whoTo = NULL;
 	chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
 	if (chk->data == NULL) {
 		/* No mbuf: give the chunk back and give up. */
 		sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
 		return;
 	}
 	SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 	chk->snd_count = 0;
 	TAILQ_INSERT_TAIL(&asoc->control_send_queue, chk, sctp_next);
 	asoc->ctrl_queue_cnt++;
 sctp_fill_in_rest:
 	/*-
 	 * Here we go through and fill out the part that deals with
 	 * stream/seq of the ones we skip.
 	 */
 	SCTP_BUF_LEN(chk->data) = 0;
 	/*
 	 * Count the leading run of sent_queue entries that are marked
 	 * SCTP_FORWARD_TSN_SKIP or SCTP_DATAGRAM_NR_ACKED. Without I-DATA
 	 * support, unordered entries are not counted.
 	 */
 	TAILQ_FOREACH(at, &asoc->sent_queue, sctp_next) {
 		if ((at->sent != SCTP_FORWARD_TSN_SKIP) &&
 		    (at->sent != SCTP_DATAGRAM_NR_ACKED)) {
 			/* no more to look at */
 			break;
 		}
 		if (!asoc->idata_supported && (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) {
 			/* We don't report these */
 			continue;
 		}
 		cnt_of_skipped++;
 	}
 	/* Chunk header plus one per-stream entry for every counted chunk. */
 	if (asoc->idata_supported) {
 		space_needed = (sizeof(struct sctp_forward_tsn_chunk) +
 		    (cnt_of_skipped * sizeof(struct sctp_strseq_mid)));
 	} else {
 		space_needed = (sizeof(struct sctp_forward_tsn_chunk) +
 		    (cnt_of_skipped * sizeof(struct sctp_strseq)));
 	}
 	cnt_of_space = (unsigned int)M_TRAILINGSPACE(chk->data);
 
 	if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
 		ovh = SCTP_MIN_OVERHEAD;
 	} else {
 		ovh = SCTP_MIN_V4_OVERHEAD;
 	}
 	if (cnt_of_space > (asoc->smallest_mtu - ovh)) {
 		/* trim to a mtu size */
 		cnt_of_space = asoc->smallest_mtu - ovh;
 	}
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) {
 		sctp_misc_ints(SCTP_FWD_TSN_CHECK,
 		    0xff, 0, cnt_of_skipped,
 		    asoc->advanced_peer_ack_point);
 	}
 	advance_peer_ack_point = asoc->advanced_peer_ack_point;
 	if (cnt_of_space < space_needed) {
 		/*-
 		 * ok we must trim down the chunk by lowering the
 		 * advance peer ack point.
 		 */
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) {
 			sctp_misc_ints(SCTP_FWD_TSN_CHECK,
 			    0xff, 0xff, cnt_of_space,
 			    space_needed);
 		}
 		/* Number of per-stream entries that fit after the header. */
 		cnt_of_skipped = cnt_of_space - sizeof(struct sctp_forward_tsn_chunk);
 		if (asoc->idata_supported) {
 			cnt_of_skipped /= sizeof(struct sctp_strseq_mid);
 		} else {
 			cnt_of_skipped /= sizeof(struct sctp_strseq);
 		}
 		/*-
 		 * Go through and find the TSN that will be the one
 		 * we report.
 		 *
 		 * The walk follows up to cnt_of_skipped links from the
 		 * head of the sent_queue and stops early at the tail.
 		 * NOTE(review): this walk counts every queue entry,
 		 * while the fill loop below does not count unordered
 		 * entries when I-DATA is not supported -- confirm the
 		 * two are meant to agree.
 		 */
 		at = TAILQ_FIRST(&asoc->sent_queue);
 		if (at != NULL) {
 			for (i = 0; i < cnt_of_skipped; i++) {
 				tp1 = TAILQ_NEXT(at, sctp_next);
 				if (tp1 == NULL) {
 					break;
 				}
 				at = tp1;
 			}
 		}
 		if (at && SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) {
 			sctp_misc_ints(SCTP_FWD_TSN_CHECK,
 			    0xff, cnt_of_skipped, at->rec.data.tsn,
 			    asoc->advanced_peer_ack_point);
 		}
 		last = at;
 		/*-
 		 * last now points to last one I can report, update
 		 * peer ack point
 		 */
 		if (last) {
 			advance_peer_ack_point = last->rec.data.tsn;
 		}
 		/* Recompute the size for the reduced entry count. */
 		if (asoc->idata_supported) {
 			space_needed = sizeof(struct sctp_forward_tsn_chunk) +
 			    cnt_of_skipped * sizeof(struct sctp_strseq_mid);
 		} else {
 			space_needed = sizeof(struct sctp_forward_tsn_chunk) +
 			    cnt_of_skipped * sizeof(struct sctp_strseq);
 		}
 	}
 	chk->send_size = space_needed;
 	/* Setup the chunk */
 	fwdtsn = mtod(chk->data, struct sctp_forward_tsn_chunk *);
 	fwdtsn->ch.chunk_length = htons(chk->send_size);
 	fwdtsn->ch.chunk_flags = 0;
 	if (asoc->idata_supported) {
 		fwdtsn->ch.chunk_type = SCTP_IFORWARD_CUM_TSN;
 	} else {
 		fwdtsn->ch.chunk_type = SCTP_FORWARD_CUM_TSN;
 	}
 	fwdtsn->new_cumulative_tsn = htonl(advance_peer_ack_point);
 	SCTP_BUF_LEN(chk->data) = chk->send_size;
 	fwdtsn++;
 	/*-
 	 * Move pointer to after the fwdtsn and transfer to the
 	 * strseq pointer.
 	 */
 	if (asoc->idata_supported) {
 		strseq_m = (struct sctp_strseq_mid *)fwdtsn;
 		strseq = NULL;
 	} else {
 		strseq = (struct sctp_strseq *)fwdtsn;
 		strseq_m = NULL;
 	}
 	/*-
 	 * Now populate the strseq list. This is done blindly
 	 * without pulling out duplicate stream info. This is
 	 * inefficient but won't harm the process since the peer will
 	 * look at these in sequence and will thus release anything.
 	 * It could mean we exceed the PMTU and chop off some that
 	 * we could have included.. but this is unlikely (aka 1432/4
 	 * would mean 300+ stream seq's would have to be reported in
 	 * one FWD-TSN). With a bit of work we can later FIX this to
 	 * optimize and pull out duplicates.. but it does add more
 	 * overhead. So for now... not!
 	 */
 	i = 0;
 	TAILQ_FOREACH(at, &asoc->sent_queue, sctp_next) {
 		if (i >= cnt_of_skipped) {
 			/* Every entry that was sized for has been written. */
 			break;
 		}
 		if (!asoc->idata_supported && (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) {
 			/* We don't report these */
 			continue;
 		}
 		if (at->rec.data.tsn == advance_peer_ack_point) {
 			at->rec.data.fwd_tsn_cnt = 0;
 		}
 		if (asoc->idata_supported) {
 			/* I-FORWARD-TSN entry: sid, flags and 32-bit mid. */
 			strseq_m->sid = htons(at->rec.data.sid);
 			if (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED) {
 				strseq_m->flags = htons(PR_SCTP_UNORDERED_FLAG);
 			} else {
 				strseq_m->flags = 0;
 			}
 			strseq_m->mid = htonl(at->rec.data.mid);
 			strseq_m++;
 		} else {
 			/* FORWARD-TSN entry: sid and the mid cut to 16 bits. */
 			strseq->sid = htons(at->rec.data.sid);
 			strseq->ssn = htons((uint16_t)at->rec.data.mid);
 			strseq++;
 		}
 		i++;
 	}
 	return;
 }
 
 void
 sctp_send_sack(struct sctp_tcb *stcb, int so_locked)
 {
 	/*-
 	 * Queue up a SACK or NR-SACK in the control queue.
 	 * We must first check to see if a SACK or NR-SACK is
 	 * somehow on the control queue.
 	 * If so, we will take and and remove the old one.
 	 */
 	struct sctp_association *asoc;
 	struct sctp_tmit_chunk *chk, *a_chk;
 	struct sctp_sack_chunk *sack;
 	struct sctp_nr_sack_chunk *nr_sack;
 	struct sctp_gap_ack_block *gap_descriptor;
 	const struct sack_track *selector;
 	int mergeable = 0;
 	int offset;
 	caddr_t limit;
 	uint32_t *dup;
 	int limit_reached = 0;
 	unsigned int i, siz, j;
 	unsigned int num_gap_blocks = 0, num_nr_gap_blocks = 0, space;
 	int num_dups = 0;
 	int space_req;
 	uint32_t highest_tsn;
 	uint8_t flags;
 	uint8_t type;
 	uint8_t tsn_map;
 
 	if (stcb->asoc.nrsack_supported == 1) {
 		type = SCTP_NR_SELECTIVE_ACK;
 	} else {
 		type = SCTP_SELECTIVE_ACK;
 	}
 	a_chk = NULL;
 	asoc = &stcb->asoc;
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	if (asoc->last_data_chunk_from == NULL) {
 		/* Hmm we never received anything */
 		return;
 	}
 	sctp_slide_mapping_arrays(stcb);
 	sctp_set_rwnd(stcb, asoc);
 	TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
 		if (chk->rec.chunk_id.id == type) {
 			/* Hmm, found a sack already on queue, remove it */
 			TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next);
 			asoc->ctrl_queue_cnt--;
 			a_chk = chk;
 			if (a_chk->data) {
 				sctp_m_freem(a_chk->data);
 				a_chk->data = NULL;
 			}
 			if (a_chk->whoTo) {
 				sctp_free_remote_addr(a_chk->whoTo);
 				a_chk->whoTo = NULL;
 			}
 			break;
 		}
 	}
 	if (a_chk == NULL) {
 		sctp_alloc_a_chunk(stcb, a_chk);
 		if (a_chk == NULL) {
 			/* No memory so we drop the idea, and set a timer */
 			if (stcb->asoc.delayed_ack) {
 				sctp_timer_stop(SCTP_TIMER_TYPE_RECV,
 				    stcb->sctp_ep, stcb, NULL,
 				    SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_4);
 				sctp_timer_start(SCTP_TIMER_TYPE_RECV,
 				    stcb->sctp_ep, stcb, NULL);
 			} else {
 				stcb->asoc.send_sack = 1;
 			}
 			return;
 		}
 		a_chk->copy_by_ref = 0;
 		a_chk->rec.chunk_id.id = type;
 		a_chk->rec.chunk_id.can_take_data = 1;
 	}
 	/* Clear our pkt counts */
 	asoc->data_pkts_seen = 0;
 
 	a_chk->flags = 0;
 	a_chk->asoc = asoc;
 	a_chk->snd_count = 0;
 	a_chk->send_size = 0;	/* fill in later */
 	a_chk->sent = SCTP_DATAGRAM_UNSENT;
 	a_chk->whoTo = NULL;
 
 	if (!(asoc->last_data_chunk_from->dest_state & SCTP_ADDR_REACHABLE)) {
 		/*-
 		 * Ok, the destination for the SACK is unreachable, lets see if
 		 * we can select an alternate to asoc->last_data_chunk_from
 		 */
 		a_chk->whoTo = sctp_find_alternate_net(stcb, asoc->last_data_chunk_from, 0);
 		if (a_chk->whoTo == NULL) {
 			/* Nope, no alternate */
 			a_chk->whoTo = asoc->last_data_chunk_from;
 		}
 	} else {
 		a_chk->whoTo = asoc->last_data_chunk_from;
 	}
 	if (a_chk->whoTo) {
 		atomic_add_int(&a_chk->whoTo->ref_count, 1);
 	}
 	if (SCTP_TSN_GT(asoc->highest_tsn_inside_map, asoc->highest_tsn_inside_nr_map)) {
 		highest_tsn = asoc->highest_tsn_inside_map;
 	} else {
 		highest_tsn = asoc->highest_tsn_inside_nr_map;
 	}
 	if (highest_tsn == asoc->cumulative_tsn) {
 		/* no gaps */
 		if (type == SCTP_SELECTIVE_ACK) {
 			space_req = sizeof(struct sctp_sack_chunk);
 		} else {
 			space_req = sizeof(struct sctp_nr_sack_chunk);
 		}
 	} else {
 		/* gaps get a cluster */
 		space_req = MCLBYTES;
 	}
 	/* Ok now lets formulate a MBUF with our sack */
 	a_chk->data = sctp_get_mbuf_for_msg(space_req, 0, M_NOWAIT, 1, MT_DATA);
 	if ((a_chk->data == NULL) ||
 	    (a_chk->whoTo == NULL)) {
 		/* rats, no mbuf memory */
 		if (a_chk->data) {
 			/* was a problem with the destination */
 			sctp_m_freem(a_chk->data);
 			a_chk->data = NULL;
 		}
 		sctp_free_a_chunk(stcb, a_chk, so_locked);
 		/* sa_ignore NO_NULL_CHK */
 		if (stcb->asoc.delayed_ack) {
 			sctp_timer_stop(SCTP_TIMER_TYPE_RECV,
 			    stcb->sctp_ep, stcb, NULL,
 			    SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_5);
 			sctp_timer_start(SCTP_TIMER_TYPE_RECV,
 			    stcb->sctp_ep, stcb, NULL);
 		} else {
 			stcb->asoc.send_sack = 1;
 		}
 		return;
 	}
 	/* ok, lets go through and fill it in */
 	SCTP_BUF_RESV_UF(a_chk->data, SCTP_MIN_OVERHEAD);
 	space = (unsigned int)M_TRAILINGSPACE(a_chk->data);
 	if (space > (a_chk->whoTo->mtu - SCTP_MIN_OVERHEAD)) {
 		space = (a_chk->whoTo->mtu - SCTP_MIN_OVERHEAD);
 	}
 	limit = mtod(a_chk->data, caddr_t);
 	limit += space;
 
 	flags = 0;
 
 	if ((asoc->sctp_cmt_on_off > 0) &&
 	    SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) {
 		/*-
 		 * CMT DAC algorithm: If 2 (i.e., 0x10) packets have been
 		 * received, then set high bit to 1, else 0. Reset
 		 * pkts_rcvd.
 		 */
 		flags |= (asoc->cmt_dac_pkts_rcvd << 6);
 		asoc->cmt_dac_pkts_rcvd = 0;
 	}
 #ifdef SCTP_ASOCLOG_OF_TSNS
 	stcb->asoc.cumack_logsnt[stcb->asoc.cumack_log_atsnt] = asoc->cumulative_tsn;
 	stcb->asoc.cumack_log_atsnt++;
 	if (stcb->asoc.cumack_log_atsnt >= SCTP_TSN_LOG_SIZE) {
 		stcb->asoc.cumack_log_atsnt = 0;
 	}
 #endif
 	/* reset the readers interpretation */
 	stcb->freed_by_sorcv_sincelast = 0;
 
 	if (type == SCTP_SELECTIVE_ACK) {
 		sack = mtod(a_chk->data, struct sctp_sack_chunk *);
 		nr_sack = NULL;
 		gap_descriptor = (struct sctp_gap_ack_block *)((caddr_t)sack + sizeof(struct sctp_sack_chunk));
 		if (highest_tsn > asoc->mapping_array_base_tsn) {
 			siz = (((highest_tsn - asoc->mapping_array_base_tsn) + 1) + 7) / 8;
 		} else {
 			siz = (((MAX_TSN - asoc->mapping_array_base_tsn) + 1) + highest_tsn + 7) / 8;
 		}
 	} else {
 		sack = NULL;
 		nr_sack = mtod(a_chk->data, struct sctp_nr_sack_chunk *);
 		gap_descriptor = (struct sctp_gap_ack_block *)((caddr_t)nr_sack + sizeof(struct sctp_nr_sack_chunk));
 		if (asoc->highest_tsn_inside_map > asoc->mapping_array_base_tsn) {
 			siz = (((asoc->highest_tsn_inside_map - asoc->mapping_array_base_tsn) + 1) + 7) / 8;
 		} else {
 			siz = (((MAX_TSN - asoc->mapping_array_base_tsn) + 1) + asoc->highest_tsn_inside_map + 7) / 8;
 		}
 	}
 
 	if (SCTP_TSN_GT(asoc->mapping_array_base_tsn, asoc->cumulative_tsn)) {
 		offset = 1;
 	} else {
 		offset = asoc->mapping_array_base_tsn - asoc->cumulative_tsn;
 	}
 	if (((type == SCTP_SELECTIVE_ACK) &&
 	    SCTP_TSN_GT(highest_tsn, asoc->cumulative_tsn)) ||
 	    ((type == SCTP_NR_SELECTIVE_ACK) &&
 	    SCTP_TSN_GT(asoc->highest_tsn_inside_map, asoc->cumulative_tsn))) {
 		/* we have a gap .. maybe */
 		for (i = 0; i < siz; i++) {
 			tsn_map = asoc->mapping_array[i];
 			if (type == SCTP_SELECTIVE_ACK) {
 				tsn_map |= asoc->nr_mapping_array[i];
 			}
 			if (i == 0) {
 				/*
 				 * Clear all bits corresponding to TSNs
 				 * smaller or equal to the cumulative TSN.
 				 */
 				tsn_map &= (~0U << (1 - offset));
 			}
 			selector = &sack_array[tsn_map];
 			if (mergeable && selector->right_edge) {
 				/*
 				 * Backup, left and right edges were ok to
 				 * merge.
 				 */
 				num_gap_blocks--;
 				gap_descriptor--;
 			}
 			if (selector->num_entries == 0)
 				mergeable = 0;
 			else {
 				for (j = 0; j < selector->num_entries; j++) {
 					if (mergeable && selector->right_edge) {
 						/*
 						 * do a merge by NOT setting
 						 * the left side
 						 */
 						mergeable = 0;
 					} else {
 						/*
 						 * no merge, set the left
 						 * side
 						 */
 						mergeable = 0;
 						gap_descriptor->start = htons((selector->gaps[j].start + offset));
 					}
 					gap_descriptor->end = htons((selector->gaps[j].end + offset));
 					num_gap_blocks++;
 					gap_descriptor++;
 					if (((caddr_t)gap_descriptor + sizeof(struct sctp_gap_ack_block)) > limit) {
 						/* no more room */
 						limit_reached = 1;
 						break;
 					}
 				}
 				if (selector->left_edge) {
 					mergeable = 1;
 				}
 			}
 			if (limit_reached) {
 				/* Reached the limit stop */
 				break;
 			}
 			offset += 8;
 		}
 	}
 	if ((type == SCTP_NR_SELECTIVE_ACK) &&
 	    (limit_reached == 0)) {
 
 		mergeable = 0;
 
 		if (asoc->highest_tsn_inside_nr_map > asoc->mapping_array_base_tsn) {
 			siz = (((asoc->highest_tsn_inside_nr_map - asoc->mapping_array_base_tsn) + 1) + 7) / 8;
 		} else {
 			siz = (((MAX_TSN - asoc->mapping_array_base_tsn) + 1) + asoc->highest_tsn_inside_nr_map + 7) / 8;
 		}
 
 		if (SCTP_TSN_GT(asoc->mapping_array_base_tsn, asoc->cumulative_tsn)) {
 			offset = 1;
 		} else {
 			offset = asoc->mapping_array_base_tsn - asoc->cumulative_tsn;
 		}
 		if (SCTP_TSN_GT(asoc->highest_tsn_inside_nr_map, asoc->cumulative_tsn)) {
 			/* we have a gap .. maybe */
 			for (i = 0; i < siz; i++) {
 				tsn_map = asoc->nr_mapping_array[i];
 				if (i == 0) {
 					/*
 					 * Clear all bits corresponding to
 					 * TSNs smaller or equal to the
 					 * cumulative TSN.
 					 */
 					tsn_map &= (~0U << (1 - offset));
 				}
 				selector = &sack_array[tsn_map];
 				if (mergeable && selector->right_edge) {
 					/*
 					 * Backup, left and right edges were
 					 * ok to merge.
 					 */
 					num_nr_gap_blocks--;
 					gap_descriptor--;
 				}
 				if (selector->num_entries == 0)
 					mergeable = 0;
 				else {
 					for (j = 0; j < selector->num_entries; j++) {
 						if (mergeable && selector->right_edge) {
 							/*
 							 * do a merge by NOT
 							 * setting the left
 							 * side
 							 */
 							mergeable = 0;
 						} else {
 							/*
 							 * no merge, set the
 							 * left side
 							 */
 							mergeable = 0;
 							gap_descriptor->start = htons((selector->gaps[j].start + offset));
 						}
 						gap_descriptor->end = htons((selector->gaps[j].end + offset));
 						num_nr_gap_blocks++;
 						gap_descriptor++;
 						if (((caddr_t)gap_descriptor + sizeof(struct sctp_gap_ack_block)) > limit) {
 							/* no more room */
 							limit_reached = 1;
 							break;
 						}
 					}
 					if (selector->left_edge) {
 						mergeable = 1;
 					}
 				}
 				if (limit_reached) {
 					/* Reached the limit stop */
 					break;
 				}
 				offset += 8;
 			}
 		}
 	}
 	/* now we must add any dups we are going to report. */
 	if ((limit_reached == 0) && (asoc->numduptsns)) {
 		dup = (uint32_t *)gap_descriptor;
 		for (i = 0; i < asoc->numduptsns; i++) {
 			*dup = htonl(asoc->dup_tsns[i]);
 			dup++;
 			num_dups++;
 			if (((caddr_t)dup + sizeof(uint32_t)) > limit) {
 				/* no more room */
 				break;
 			}
 		}
 		asoc->numduptsns = 0;
 	}
 	/*
 	 * now that the chunk is prepared queue it to the control chunk
 	 * queue.
 	 */
 	if (type == SCTP_SELECTIVE_ACK) {
 		a_chk->send_size = (uint16_t)(sizeof(struct sctp_sack_chunk) +
 		    (num_gap_blocks + num_nr_gap_blocks) * sizeof(struct sctp_gap_ack_block) +
 		    num_dups * sizeof(int32_t));
 		SCTP_BUF_LEN(a_chk->data) = a_chk->send_size;
 		sack->sack.cum_tsn_ack = htonl(asoc->cumulative_tsn);
 		sack->sack.a_rwnd = htonl(asoc->my_rwnd);
 		sack->sack.num_gap_ack_blks = htons(num_gap_blocks);
 		sack->sack.num_dup_tsns = htons(num_dups);
 		sack->ch.chunk_type = type;
 		sack->ch.chunk_flags = flags;
 		sack->ch.chunk_length = htons(a_chk->send_size);
 	} else {
 		a_chk->send_size = (uint16_t)(sizeof(struct sctp_nr_sack_chunk) +
 		    (num_gap_blocks + num_nr_gap_blocks) * sizeof(struct sctp_gap_ack_block) +
 		    num_dups * sizeof(int32_t));
 		SCTP_BUF_LEN(a_chk->data) = a_chk->send_size;
 		nr_sack->nr_sack.cum_tsn_ack = htonl(asoc->cumulative_tsn);
 		nr_sack->nr_sack.a_rwnd = htonl(asoc->my_rwnd);
 		nr_sack->nr_sack.num_gap_ack_blks = htons(num_gap_blocks);
 		nr_sack->nr_sack.num_nr_gap_ack_blks = htons(num_nr_gap_blocks);
 		nr_sack->nr_sack.num_dup_tsns = htons(num_dups);
 		nr_sack->nr_sack.reserved = 0;
 		nr_sack->ch.chunk_type = type;
 		nr_sack->ch.chunk_flags = flags;
 		nr_sack->ch.chunk_length = htons(a_chk->send_size);
 	}
 	TAILQ_INSERT_TAIL(&asoc->control_send_queue, a_chk, sctp_next);
 	asoc->my_last_reported_rwnd = asoc->my_rwnd;
 	asoc->ctrl_queue_cnt++;
 	asoc->send_sack = 0;
 	SCTP_STAT_INCR(sctps_sendsacks);
 	return;
 }
 
 /*
  * Build an ABORT chunk for the association `stcb' and hand it directly to
  * sctp_lowlevel_chunk_output().
  *
  * stcb      - association to abort; its TCB lock is asserted on entry.
  * operr     - optional mbuf chain holding error causes.  It is linked behind
  *             the ABORT chunk header, or freed on the failure paths below,
  *             so the caller must not touch it afterwards.
  * so_locked - passed through unchanged to sctp_lowlevel_chunk_output().
  *
  * If the peer requires ABORT chunks to be authenticated, an AUTH chunk is
  * placed in front of the ABORT.  Nothing is returned; mbuf allocation or
  * padding failures free the partially built chain and return silently.
  */
 void
 sctp_send_abort_tcb(struct sctp_tcb *stcb, struct mbuf *operr, int so_locked)
 {
 	struct mbuf *m_abort, *m, *m_last;
 	struct mbuf *m_out, *m_end = NULL;
 	struct sctp_abort_chunk *abort;
 	struct sctp_auth_chunk *auth = NULL;
 	struct sctp_nets *net;
 	uint32_t vtag;
 	uint32_t auth_offset = 0;
 	int error;
 	uint16_t cause_len, chunk_len, padding_len;
 
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	/*-
 	 * Add an AUTH chunk, if chunk requires it and save the offset into
 	 * the chain for AUTH
 	 */
 	if (sctp_auth_is_required_chunk(SCTP_ABORT_ASSOCIATION,
 	    stcb->asoc.peer_auth_chunks)) {
 		m_out = sctp_add_auth_chunk(NULL, &m_end, &auth, &auth_offset,
 		    stcb, SCTP_ABORT_ASSOCIATION);
 		SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
 	} else {
 		m_out = NULL;
 	}
 	m_abort = sctp_get_mbuf_for_msg(sizeof(struct sctp_abort_chunk), 0, M_NOWAIT, 1, MT_HEADER);
 	if (m_abort == NULL) {
 		/* No mbuf for the ABORT header: release everything we hold. */
 		if (m_out) {
 			sctp_m_freem(m_out);
 		}
 		if (operr) {
 			sctp_m_freem(operr);
 		}
 		return;
 	}
 	/* link in any error */
 	SCTP_BUF_NEXT(m_abort) = operr;
 	/*
 	 * Sum the length of the cause chain and remember its last mbuf, which
 	 * is where any padding has to be appended.
 	 */
 	cause_len = 0;
 	m_last = NULL;
 	for (m = operr; m; m = SCTP_BUF_NEXT(m)) {
 		cause_len += (uint16_t)SCTP_BUF_LEN(m);
 		if (SCTP_BUF_NEXT(m) == NULL) {
 			m_last = m;
 		}
 	}
 	SCTP_BUF_LEN(m_abort) = sizeof(struct sctp_abort_chunk);
 	chunk_len = (uint16_t)sizeof(struct sctp_abort_chunk) + cause_len;
 	/* Bytes needed to round the chunk up to a 32-bit boundary. */
 	padding_len = SCTP_SIZE32(chunk_len) - chunk_len;
 	if (m_out == NULL) {
 		/* NO Auth chunk prepended, so reserve space in front */
 		SCTP_BUF_RESV_UF(m_abort, SCTP_MIN_OVERHEAD);
 		m_out = m_abort;
 	} else {
 		/* Put AUTH chunk at the front of the chain */
 		SCTP_BUF_NEXT(m_end) = m_abort;
 	}
 	/* Prefer the alternate destination when one is set. */
 	if (stcb->asoc.alternate) {
 		net = stcb->asoc.alternate;
 	} else {
 		net = stcb->asoc.primary_destination;
 	}
 	/* Fill in the ABORT chunk header. */
 	abort = mtod(m_abort, struct sctp_abort_chunk *);
 	abort->ch.chunk_type = SCTP_ABORT_ASSOCIATION;
 	if (stcb->asoc.peer_vtag == 0) {
 		/* This happens iff the assoc is in COOKIE-WAIT state. */
 		vtag = stcb->asoc.my_vtag;
 		abort->ch.chunk_flags = SCTP_HAD_NO_TCB;
 	} else {
 		vtag = stcb->asoc.peer_vtag;
 		abort->ch.chunk_flags = 0;
 	}
 	/* The chunk length on the wire excludes the trailing padding. */
 	abort->ch.chunk_length = htons(chunk_len);
 	/* Add padding, if necessary. */
 	if (padding_len > 0) {
 		if ((m_last == NULL) ||
 		    (sctp_add_pad_tombuf(m_last, padding_len) == NULL)) {
 			/* m_out heads the whole chain, including operr. */
 			sctp_m_freem(m_out);
 			return;
 		}
 	}
 	/*
 	 * NOTE(review): the destination address comes from `net', but the
 	 * port argument is taken from primary_destination even when the
 	 * alternate net was selected above -- confirm this is intended.
 	 */
 	if ((error = sctp_lowlevel_chunk_output(stcb->sctp_ep, stcb, net,
 	    (struct sockaddr *)&net->ro._l_addr,
 	    m_out, auth_offset, auth, stcb->asoc.authinfo.active_keyid, 1, 0, 0,
 	    stcb->sctp_ep->sctp_lport, stcb->rport, htonl(vtag),
 	    stcb->asoc.primary_destination->port, NULL,
 	    0, 0,
 	    so_locked))) {
 		SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error);
 		if (error == ENOBUFS) {
 			/* Remember that the interface ran out of buffers. */
 			stcb->asoc.ifp_had_enobuf = 1;
 			SCTP_STAT_INCR(sctps_lowlevelerr);
 		}
 	} else {
 		stcb->asoc.ifp_had_enobuf = 0;
 	}
 	SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
 }
 
 void
 sctp_send_shutdown_complete(struct sctp_tcb *stcb,
     struct sctp_nets *net,
     int reflect_vtag)
 {
 	/* formulate and SEND a SHUTDOWN-COMPLETE */
 	struct mbuf *m_shutdown_comp;
 	struct sctp_shutdown_complete_chunk *shutdown_complete;
 	uint32_t vtag;
 	int error;
 	uint8_t flags;
 
 	m_shutdown_comp = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_NOWAIT, 1, MT_HEADER);
 	if (m_shutdown_comp == NULL) {
 		/* no mbuf's */
 		return;
 	}
 	if (reflect_vtag) {
 		flags = SCTP_HAD_NO_TCB;
 		vtag = stcb->asoc.my_vtag;
 	} else {
 		flags = 0;
 		vtag = stcb->asoc.peer_vtag;
 	}
 	shutdown_complete = mtod(m_shutdown_comp, struct sctp_shutdown_complete_chunk *);
 	shutdown_complete->ch.chunk_type = SCTP_SHUTDOWN_COMPLETE;
 	shutdown_complete->ch.chunk_flags = flags;
 	shutdown_complete->ch.chunk_length = htons(sizeof(struct sctp_shutdown_complete_chunk));
 	SCTP_BUF_LEN(m_shutdown_comp) = sizeof(struct sctp_shutdown_complete_chunk);
 	if ((error = sctp_lowlevel_chunk_output(stcb->sctp_ep, stcb, net,
 	    (struct sockaddr *)&net->ro._l_addr,
 	    m_shutdown_comp, 0, NULL, 0, 1, 0, 0,
 	    stcb->sctp_ep->sctp_lport, stcb->rport,
 	    htonl(vtag),
 	    net->port, NULL,
 	    0, 0,
 	    SCTP_SO_NOT_LOCKED))) {
 		SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error);
 		if (error == ENOBUFS) {
 			stcb->asoc.ifp_had_enobuf = 1;
 			SCTP_STAT_INCR(sctps_lowlevelerr);
 		}
 	} else {
 		stcb->asoc.ifp_had_enobuf = 0;
 	}
 	SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
 	return;
 }
 
 /*
  * Build and send a stand-alone packet carrying a single chunk of the given
  * `type', without going through an association.
  *
  * src, dst  - addresses used for the reply with their roles swapped: the
  *             reply's IP source is taken from `dst' and its IP destination
  *             from `src'.
  * sh        - SCTP common header whose ports are swapped into the reply and
  *             whose v_tag is reflected when `vtag' is 0.
  * vtag      - verification tag to use; 0 means reflect sh->v_tag and set
  *             the SCTP_HAD_NO_TCB chunk flag.
  * type      - chunk type written into the chunk header.
  * cause     - optional mbuf chain appended after the chunk header.  It is
  *             padded here to a multiple of 4 bytes and is either linked into
  *             the outgoing chain or freed on failure; the caller must not
  *             use it afterwards.
  * mflowtype, mflowid, fibnum - stored in the outgoing mbuf's packet header.
  * vrf_id    - passed to the IP output macros.
  * port      - non-zero requests UDP encapsulation with this destination
  *             port (stored as given, without byte-order conversion).
  *
  * Nothing is returned; allocation failures free whatever was built and
  * return silently.
  */
 static void
 sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst,
     struct sctphdr *sh, uint32_t vtag,
     uint8_t type, struct mbuf *cause,
     uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum,
     uint32_t vrf_id, uint16_t port)
 {
 	struct mbuf *o_pak;
 	struct mbuf *mout;
 	struct sctphdr *shout;
 	struct sctp_chunkhdr *ch;
 #if defined(INET) || defined(INET6)
 	struct udphdr *udp;
 #endif
 	int ret, len, cause_len, padding_len;
 #ifdef INET
 	struct sockaddr_in *src_sin, *dst_sin;
 	struct ip *ip;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *src_sin6, *dst_sin6;
 	struct ip6_hdr *ip6;
 #endif
 
 	/* Compute the length of the cause and add final padding. */
 	cause_len = 0;
 	if (cause != NULL) {
 		struct mbuf *m_at, *m_last = NULL;
 
 		for (m_at = cause; m_at; m_at = SCTP_BUF_NEXT(m_at)) {
 			if (SCTP_BUF_NEXT(m_at) == NULL)
 				m_last = m_at;
 			cause_len += SCTP_BUF_LEN(m_at);
 		}
 		/* Number of bytes needed to reach the next 4-byte boundary. */
 		padding_len = cause_len % 4;
 		if (padding_len != 0) {
 			padding_len = 4 - padding_len;
 		}
 		if (padding_len != 0) {
 			if (sctp_add_pad_tombuf(m_last, padding_len) == NULL) {
 				sctp_m_freem(cause);
 				return;
 			}
 		}
 	} else {
 		padding_len = 0;
 	}
 	/* Get an mbuf for the header. */
 	/* `len' first accumulates the size of all headers in that mbuf. */
 	len = sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
 	switch (dst->sa_family) {
 #ifdef INET
 	case AF_INET:
 		len += sizeof(struct ip);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		len += sizeof(struct ip6_hdr);
 		break;
 #endif
 	default:
 		break;
 	}
 #if defined(INET) || defined(INET6)
 	if (port) {
 		len += sizeof(struct udphdr);
 	}
 #endif
 	mout = sctp_get_mbuf_for_msg(len + max_linkhdr, 1, M_NOWAIT, 1, MT_DATA);
 	if (mout == NULL) {
 		if (cause) {
 			sctp_m_freem(cause);
 		}
 		return;
 	}
 	/* Leave room in front for the link-layer header. */
 	SCTP_BUF_RESV_UF(mout, max_linkhdr);
 	SCTP_BUF_LEN(mout) = len;
 	/* From here on freeing mout also frees the cause chain. */
 	SCTP_BUF_NEXT(mout) = cause;
 	M_SETFIB(mout, fibnum);
 	mout->m_pkthdr.flowid = mflowid;
 	M_HASHTYPE_SET(mout, mflowtype);
 #ifdef INET
 	ip = NULL;
 #endif
 #ifdef INET6
 	ip6 = NULL;
 #endif
 	/*
 	 * Fill in the network-layer header.  `len' is reused from here on as
 	 * a running total of the bytes laid out so far.
 	 */
 	switch (dst->sa_family) {
 #ifdef INET
 	case AF_INET:
 		src_sin = (struct sockaddr_in *)src;
 		dst_sin = (struct sockaddr_in *)dst;
 		ip = mtod(mout, struct ip *);
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = (sizeof(struct ip) >> 2);
 		ip->ip_tos = 0;
 		ip->ip_off = htons(IP_DF);
 		ip_fillid(ip);
 		ip->ip_ttl = MODULE_GLOBAL(ip_defttl);
 		if (port) {
 			ip->ip_p = IPPROTO_UDP;
 		} else {
 			ip->ip_p = IPPROTO_SCTP;
 		}
 		/* The reply travels in the opposite direction. */
 		ip->ip_src.s_addr = dst_sin->sin_addr.s_addr;
 		ip->ip_dst.s_addr = src_sin->sin_addr.s_addr;
 		ip->ip_sum = 0;
 		len = sizeof(struct ip);
 		shout = (struct sctphdr *)((caddr_t)ip + len);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		src_sin6 = (struct sockaddr_in6 *)src;
 		dst_sin6 = (struct sockaddr_in6 *)dst;
 		ip6 = mtod(mout, struct ip6_hdr *);
 		ip6->ip6_flow = htonl(0x60000000);
 		if (V_ip6_auto_flowlabel) {
 			ip6->ip6_flow |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
 		}
 		ip6->ip6_hlim = MODULE_GLOBAL(ip6_defhlim);
 		if (port) {
 			ip6->ip6_nxt = IPPROTO_UDP;
 		} else {
 			ip6->ip6_nxt = IPPROTO_SCTP;
 		}
 		/* The reply travels in the opposite direction. */
 		ip6->ip6_src = dst_sin6->sin6_addr;
 		ip6->ip6_dst = src_sin6->sin6_addr;
 		len = sizeof(struct ip6_hdr);
 		shout = (struct sctphdr *)((caddr_t)ip6 + len);
 		break;
 #endif
 	default:
 		len = 0;
 		shout = mtod(mout, struct sctphdr *);
 		break;
 	}
 #if defined(INET) || defined(INET6)
 	if (port) {
 		/* UDP encapsulation needs a configured local tunneling port. */
 		if (htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) == 0) {
 			sctp_m_freem(mout);
 			return;
 		}
 		udp = (struct udphdr *)shout;
 		udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port));
 		udp->uh_dport = port;
 		udp->uh_sum = 0;
 		udp->uh_ulen = htons((uint16_t)(sizeof(struct udphdr) +
 		    sizeof(struct sctphdr) +
 		    sizeof(struct sctp_chunkhdr) +
 		    cause_len + padding_len));
 		len += sizeof(struct udphdr);
 		/* The SCTP common header follows the UDP header. */
 		shout = (struct sctphdr *)((caddr_t)shout + sizeof(struct udphdr));
 	} else {
 		udp = NULL;
 	}
 #endif
 	/* SCTP common header: ports swapped relative to `sh'. */
 	shout->src_port = sh->dest_port;
 	shout->dest_port = sh->src_port;
 	shout->checksum = 0;
 	if (vtag) {
 		shout->v_tag = htonl(vtag);
 	} else {
 		/* No tag supplied: reflect the one from `sh' as is. */
 		shout->v_tag = sh->v_tag;
 	}
 	len += sizeof(struct sctphdr);
 	ch = (struct sctp_chunkhdr *)((caddr_t)shout + sizeof(struct sctphdr));
 	ch->chunk_type = type;
 	if (vtag) {
 		ch->chunk_flags = 0;
 	} else {
 		ch->chunk_flags = SCTP_HAD_NO_TCB;
 	}
 	/* The chunk length excludes the padding; the packet length does not. */
 	ch->chunk_length = htons((uint16_t)(sizeof(struct sctp_chunkhdr) + cause_len));
 	len += sizeof(struct sctp_chunkhdr);
 	len += cause_len + padding_len;
 
 	if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) {
 		sctp_m_freem(mout);
 		return;
 	}
 	SCTP_ATTACH_CHAIN(o_pak, mout, len);
 	/* Finish lengths and checksums, then transmit. */
 	switch (dst->sa_family) {
 #ifdef INET
 	case AF_INET:
 		if (port) {
 			if (V_udp_cksum) {
 				udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP));
 			} else {
 				udp->uh_sum = 0;
 			}
 		}
 		ip->ip_len = htons(len);
 		if (port) {
 			/* Encapsulated: compute the SCTP checksum in software. */
 			shout->checksum = sctp_calculate_cksum(mout, sizeof(struct ip) + sizeof(struct udphdr));
 			SCTP_STAT_INCR(sctps_sendswcrc);
 			if (V_udp_cksum) {
 				SCTP_ENABLE_UDP_CSUM(o_pak);
 			}
 		} else {
 			/* Request the SCTP checksum via the mbuf csum flags. */
 			mout->m_pkthdr.csum_flags = CSUM_SCTP;
 			mout->m_pkthdr.csum_data = offsetof(struct sctphdr, checksum);
 			SCTP_STAT_INCR(sctps_sendhwcrc);
 		}
 #ifdef SCTP_PACKET_LOGGING
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) {
 			sctp_packet_log(o_pak);
 		}
 #endif
 		SCTP_PROBE5(send, NULL, NULL, ip, NULL, shout);
 		SCTP_IP_OUTPUT(ret, o_pak, NULL, NULL, vrf_id);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		ip6->ip6_plen = htons((uint16_t)(len - sizeof(struct ip6_hdr)));
 		if (port) {
 			/* Encapsulated: compute the SCTP checksum in software. */
 			shout->checksum = sctp_calculate_cksum(mout, sizeof(struct ip6_hdr) + sizeof(struct udphdr));
 			SCTP_STAT_INCR(sctps_sendswcrc);
 			/* A computed UDP checksum of 0 is sent as 0xffff. */
 			if ((udp->uh_sum = in6_cksum(o_pak, IPPROTO_UDP, sizeof(struct ip6_hdr), len - sizeof(struct ip6_hdr))) == 0) {
 				udp->uh_sum = 0xffff;
 			}
 		} else {
 			/* Request the SCTP checksum via the mbuf csum flags. */
 			mout->m_pkthdr.csum_flags = CSUM_SCTP_IPV6;
 			mout->m_pkthdr.csum_data = offsetof(struct sctphdr, checksum);
 			SCTP_STAT_INCR(sctps_sendhwcrc);
 		}
 #ifdef SCTP_PACKET_LOGGING
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) {
 			sctp_packet_log(o_pak);
 		}
 #endif
 		SCTP_PROBE5(send, NULL, NULL, ip6, NULL, shout);
 		SCTP_IP6_OUTPUT(ret, o_pak, NULL, NULL, NULL, vrf_id);
 		break;
 #endif
 	default:
 		SCTPDBG(SCTP_DEBUG_OUTPUT1, "Unknown protocol (TSNH) type %d\n",
 		    dst->sa_family);
 		sctp_m_freem(mout);
 		/*
 		 * NOTE(review): mout is handed to the trace macro after it has
 		 * been freed -- confirm the macro never dereferences it.
 		 */
 		SCTP_LTRACE_ERR_RET_PKT(mout, NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EFAULT);
 		return;
 	}
 	SCTPDBG(SCTP_DEBUG_OUTPUT3, "return from send is %d\n", ret);
 	/* Statistics are bumped whether or not the output call succeeded. */
 	if (port) {
 		UDPSTAT_INC(udps_opackets);
 	}
 	SCTP_STAT_INCR(sctps_sendpackets);
 	SCTP_STAT_INCR_COUNTER64(sctps_outpackets);
 	SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
 	if (ret) {
 		SCTP_STAT_INCR(sctps_senderrors);
 	}
 	return;
 }
 
 /*
  * Send a SHUTDOWN-COMPLETE without an association.  This simply forwards
  * to sctp_send_resp_msg() with no verification tag (0) and no cause chain
  * (NULL); all other arguments are passed through unchanged.
  */
 void
 sctp_send_shutdown_complete2(struct sockaddr *src, struct sockaddr *dst,
     struct sctphdr *sh,
     uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum,
     uint32_t vrf_id, uint16_t port)
 {
 	const uint32_t no_vtag = 0;
 	struct mbuf *const no_cause = NULL;
 
 	sctp_send_resp_msg(src, dst, sh,
 	    no_vtag, SCTP_SHUTDOWN_COMPLETE, no_cause,
 	    mflowtype, mflowid, fibnum, vrf_id, port);
 }
 
 /*
  * Queue a HEARTBEAT request for the destination `net' at the tail of the
  * association's control send queue.
  *
  * The heartbeat info parameter records the current time, the destination's
  * address family, length and address, and two random values that are also
  * stored in `net'.  The random values are only drawn when the address is
  * still unconfirmed; otherwise both are zero.
  *
  * Nothing is queued when net is NULL, when the address family is neither
  * AF_INET nor AF_INET6, or when a chunk or mbuf cannot be allocated.
  * so_locked is handed to sctp_free_a_chunk() on the failure paths.
  */
 void
 sctp_send_hb(struct sctp_tcb *stcb, struct sctp_nets *net, int so_locked)
 {
 	struct sctp_heartbeat_chunk *hb;
 	struct sctp_tmit_chunk *chk;
 	struct timeval now;
 
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	if (net == NULL) {
 		return;
 	}
 	(void)SCTP_GETTIME_TIMEVAL(&now);
 	/* Bail out early for address families we cannot encode. */
 	switch (net->ro._l_addr.sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		break;
 #endif
 	default:
 		return;
 	}
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		SCTPDBG(SCTP_DEBUG_OUTPUT4, "Gak, can't get a chunk for hb\n");
 		return;
 	}
 
 	/* Describe the chunk to the transmit machinery. */
 	chk->asoc = &stcb->asoc;
 	chk->flags = 0;
 	chk->copy_by_ref = 0;
 	chk->rec.chunk_id.can_take_data = 1;
 	chk->rec.chunk_id.id = SCTP_HEARTBEAT_REQUEST;
 	chk->send_size = sizeof(struct sctp_heartbeat_chunk);
 
 	chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_NOWAIT, 1, MT_HEADER);
 	if (chk->data == NULL) {
 		sctp_free_a_chunk(stcb, chk, so_locked);
 		return;
 	}
 	SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
 	SCTP_BUF_LEN(chk->data) = chk->send_size;
 	chk->snd_count = 0;
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 	/* The queued chunk holds a reference on its destination. */
 	chk->whoTo = net;
 	atomic_add_int(&chk->whoTo->ref_count, 1);
 
 	/* Start from an all-zero chunk and fill in the header. */
 	hb = mtod(chk->data, struct sctp_heartbeat_chunk *);
 	memset(hb, 0, sizeof(struct sctp_heartbeat_chunk));
 	hb->ch.chunk_type = SCTP_HEARTBEAT_REQUEST;
 	hb->ch.chunk_flags = 0;
 	hb->ch.chunk_length = htons(chk->send_size);
 
 	/* Heartbeat info parameter: header, timestamp and address details. */
 	hb->heartbeat.hb_info.ph.param_type = htons(SCTP_HEARTBEAT_INFO);
 	hb->heartbeat.hb_info.ph.param_length = htons(sizeof(struct sctp_heartbeat_info_param));
 	hb->heartbeat.hb_info.time_value_1 = now.tv_sec;
 	hb->heartbeat.hb_info.time_value_2 = now.tv_usec;
 	hb->heartbeat.hb_info.addr_family = (uint8_t)net->ro._l_addr.sa.sa_family;
 	hb->heartbeat.hb_info.addr_len = net->ro._l_addr.sa.sa_len;
 	if (!(net->dest_state & SCTP_ADDR_UNCONFIRMED)) {
 		/* Confirmed address: no need to spend entropy. */
 		net->heartbeat_random1 = hb->heartbeat.hb_info.random_value1 = 0;
 		net->heartbeat_random2 = hb->heartbeat.hb_info.random_value2 = 0;
 	} else {
 		/* Unconfirmed address: use fresh random values. */
 		net->heartbeat_random1 = hb->heartbeat.hb_info.random_value1 = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep);
 		net->heartbeat_random2 = hb->heartbeat.hb_info.random_value2 = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep);
 	}
 	/* Copy the raw destination address into the parameter. */
 	switch (net->ro._l_addr.sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		memcpy(hb->heartbeat.hb_info.address,
 		    &net->ro._l_addr.sin.sin_addr,
 		    sizeof(net->ro._l_addr.sin.sin_addr));
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		memcpy(hb->heartbeat.hb_info.address,
 		    &net->ro._l_addr.sin6.sin6_addr,
 		    sizeof(net->ro._l_addr.sin6.sin6_addr));
 		break;
 #endif
 	default:
 		/* Unknown family: undo the allocations and give up. */
 		if (chk->data) {
 			sctp_m_freem(chk->data);
 			chk->data = NULL;
 		}
 		sctp_free_a_chunk(stcb, chk, so_locked);
 		return;
 	}
 	net->hb_responded = 0;
 	TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next);
 	stcb->asoc.ctrl_queue_cnt++;
 	SCTP_STAT_INCR(sctps_sendheartbeat);
 }
 
 /*
  * Report congestion to the peer with an ECN-ECHO chunk for `net'.
  *
  * If an ECN-ECHO for the same destination is already waiting on the control
  * send queue it is updated in place: its TSN is raised to high_tsn when
  * that is newer, and its packet counter is incremented.  Otherwise a new
  * chunk carrying high_tsn and a count of 1 is inserted at the head of the
  * queue.  Does nothing when net is NULL or an allocation fails.
  */
 void
 sctp_send_ecn_echo(struct sctp_tcb *stcb, struct sctp_nets *net,
     uint32_t high_tsn)
 {
 	struct sctp_association *asoc;
 	struct sctp_tmit_chunk *chk;
 	struct sctp_ecne_chunk *ecne;
 	uint32_t queued_tsn, pkts;
 
 	if (net == NULL) {
 		return;
 	}
 	asoc = &stcb->asoc;
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
 		if ((chk->rec.chunk_id.id != SCTP_ECN_ECHO) || (chk->whoTo != net)) {
 			continue;
 		}
 		/* An ECN-ECHO to this destination is already queued. */
 		ecne = mtod(chk->data, struct sctp_ecne_chunk *);
 		queued_tsn = ntohl(ecne->tsn);
 		if (SCTP_TSN_GT(high_tsn, queued_tsn)) {
 			ecne->tsn = htonl(high_tsn);
 			SCTP_STAT_INCR(sctps_queue_upd_ecne);
 		}
 		pkts = ntohl(ecne->num_pkts_since_cwr) + 1;
 		ecne->num_pkts_since_cwr = htonl(pkts);
 		return;
 	}
 	/* Nothing to update, so a fresh chunk has to be built. */
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		return;
 	}
 	SCTP_STAT_INCR(sctps_queue_upd_ecne);
 	chk->asoc = asoc;
 	chk->flags = 0;
 	chk->copy_by_ref = 0;
 	chk->rec.chunk_id.can_take_data = 0;
 	chk->rec.chunk_id.id = SCTP_ECN_ECHO;
 	chk->send_size = sizeof(struct sctp_ecne_chunk);
 	chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_NOWAIT, 1, MT_HEADER);
 	if (chk->data == NULL) {
 		sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
 		return;
 	}
 	SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
 	SCTP_BUF_LEN(chk->data) = chk->send_size;
 	chk->snd_count = 0;
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 	/* The queued chunk holds a reference on its destination. */
 	chk->whoTo = net;
 	atomic_add_int(&chk->whoTo->ref_count, 1);
 
 	asoc->ecn_echo_cnt_onq++;
 	ecne = mtod(chk->data, struct sctp_ecne_chunk *);
 	ecne->ch.chunk_length = htons(sizeof(struct sctp_ecne_chunk));
 	ecne->ch.chunk_flags = 0;
 	ecne->ch.chunk_type = SCTP_ECN_ECHO;
 	ecne->num_pkts_since_cwr = htonl(1);
 	ecne->tsn = htonl(high_tsn);
 	TAILQ_INSERT_HEAD(&asoc->control_send_queue, chk, sctp_next);
 	asoc->ctrl_queue_cnt++;
 }
 
 /*
  * Queue a PACKET-DROPPED chunk that reports the packet held in `m' back to
  * the peer.
  *
  * stcb    - association; nothing is done when it is NULL, when the peer has
  *           not announced PKT-DROP support, or when the socket is gone.
  * net     - destination for the report; may be NULL, in which case the
  *           chunk is queued without a destination.
  * m       - mbuf chain holding the received packet.
  * len     - length of the packet including the IP header.
  * iphlen  - length of the IP header; the report starts after it.
  * bad_crc - non-zero sets the SCTP_BADCRC chunk flag.
  *
  * No report is generated when the packet contains a PKT-DROP, ABORT or
  * INIT-ACK chunk.  The reported packet is truncated if it would not fit
  * into the smaller of the association's smallest MTU and MCLBYTES.
  */
 void
 sctp_send_packet_dropped(struct sctp_tcb *stcb, struct sctp_nets *net,
     struct mbuf *m, int len, int iphlen, int bad_crc)
 {
 	struct sctp_association *asoc;
 	struct sctp_pktdrop_chunk *drp;
 	struct sctp_tmit_chunk *chk;
 	uint8_t *datap;
 	int was_trunc = 0;
 	int fullsz = 0;
 	long spc;
 	int offset;
 	struct sctp_chunkhdr *ch, chunk_buf;
 	unsigned int chk_length;
 
 	if (!stcb) {
 		return;
 	}
 	asoc = &stcb->asoc;
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	if (asoc->pktdrop_supported == 0) {
 		/*-
 		 * peer must declare support before I send one.
 		 */
 		return;
 	}
 	if (stcb->sctp_socket == NULL) {
 		return;
 	}
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		return;
 	}
 	chk->copy_by_ref = 0;
 	chk->rec.chunk_id.id = SCTP_PACKET_DROPPED;
 	chk->rec.chunk_id.can_take_data = 1;
 	chk->flags = 0;
 	/* From here on `len' counts only the bytes after the IP header. */
 	len -= iphlen;
 	chk->send_size = len;
 	/* Validate that we do not have an ABORT in here. */
 	offset = iphlen + sizeof(struct sctphdr);
 	ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset,
 	    sizeof(*ch), (uint8_t *)&chunk_buf);
 	/* Walk the chunk headers until none can be fetched from the chain. */
 	while (ch != NULL) {
 		chk_length = ntohs(ch->chunk_length);
 		if (chk_length < sizeof(*ch)) {
 			/* break to abort land */
 			break;
 		}
 		switch (ch->chunk_type) {
 		case SCTP_PACKET_DROPPED:
 		case SCTP_ABORT_ASSOCIATION:
 		case SCTP_INITIATION_ACK:
 			/**
 			 * We don't respond with an PKT-DROP to an ABORT
 			 * or PKT-DROP. We also do not respond to an
 			 * INIT-ACK, because we can't know if the initiation
 			 * tag is correct or not.
 			 */
 			sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
 			return;
 		default:
 			break;
 		}
 		/* Chunks are padded to a multiple of 4 bytes. */
 		offset += SCTP_SIZE32(chk_length);
 		ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset,
 		    sizeof(*ch), (uint8_t *)&chunk_buf);
 	}
 
 	if ((len + SCTP_MAX_OVERHEAD + sizeof(struct sctp_pktdrop_chunk)) >
 	    min(stcb->asoc.smallest_mtu, MCLBYTES)) {
 		/*
 		 * only send 1 mtu worth, trim off the excess on the end.
 		 */
 		fullsz = len;
 		len = min(stcb->asoc.smallest_mtu, MCLBYTES) - SCTP_MAX_OVERHEAD;
 		was_trunc = 1;
 	}
 	chk->asoc = &stcb->asoc;
 	chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
 	if (chk->data == NULL) {
 		/* Shared failure exit; also reached via goto from below. */
 jump_out:
 		sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
 		return;
 	}
 	SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
 	drp = mtod(chk->data, struct sctp_pktdrop_chunk *);
 	if (drp == NULL) {
 		sctp_m_freem(chk->data);
 		chk->data = NULL;
 		goto jump_out;
 	}
 	/*
 	 * Note that book_size is derived from send_size as set before any
 	 * truncation adjustment made below.
 	 */
 	chk->book_size = SCTP_SIZE32((chk->send_size + sizeof(struct sctp_pktdrop_chunk) +
 	    sizeof(struct sctphdr) + SCTP_MED_OVERHEAD));
 	chk->book_size_scale = 0;
 	if (was_trunc) {
 		drp->ch.chunk_flags = SCTP_PACKET_TRUNCATED;
 		/* Tell the peer how long the packet was before truncation. */
 		drp->trunc_len = htons(fullsz);
 		/*
 		 * Len is already adjusted to size minus overhead above take
 		 * out the pkt_drop chunk itself from it.
 		 */
 		chk->send_size = (uint16_t)(len - sizeof(struct sctp_pktdrop_chunk));
 		len = chk->send_size;
 	} else {
 		/* no truncation needed */
 		drp->ch.chunk_flags = 0;
 		drp->trunc_len = htons(0);
 	}
 	if (bad_crc) {
 		drp->ch.chunk_flags |= SCTP_BADCRC;
 	}
 	/* send_size now covers the chunk header plus the copied payload. */
 	chk->send_size += sizeof(struct sctp_pktdrop_chunk);
 	SCTP_BUF_LEN(chk->data) = chk->send_size;
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 	chk->snd_count = 0;
 	if (net) {
 		/* we should hit here */
 		chk->whoTo = net;
 		atomic_add_int(&chk->whoTo->ref_count, 1);
 	} else {
 		chk->whoTo = NULL;
 	}
 	drp->ch.chunk_type = SCTP_PACKET_DROPPED;
 	drp->ch.chunk_length = htons(chk->send_size);
 	/* bottle_bw reports the receive buffer limit, never negative. */
 	spc = SCTP_SB_LIMIT_RCV(stcb->sctp_socket);
 	if (spc < 0) {
 		spc = 0;
 	}
 	drp->bottle_bw = htonl(spc);
 	if (asoc->my_rwnd) {
 		/* Sum of everything currently held on the receive side. */
 		drp->current_onq = htonl(asoc->size_on_reasm_queue +
 		    asoc->size_on_all_streams +
 		    asoc->my_rwnd_control_len +
 		    stcb->sctp_socket->so_rcv.sb_cc);
 	} else {
 		/*-
 		 * If my rwnd is 0, possibly from mbuf depletion as well as
 		 * space used, tell the peer there is NO space aka onq == bw
 		 */
 		drp->current_onq = htonl(spc);
 	}
 	drp->reserved = 0;
 	datap = drp->data;
 	/* Copy `len' bytes of the packet, starting after its IP header. */
 	m_copydata(m, iphlen, len, (caddr_t)datap);
 	TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next);
 	asoc->ctrl_queue_cnt++;
 }
 
 /*
  * Queue a CWR chunk for `net' carrying high_tsn, with `override' as its
  * chunk flags.
  *
  * When a CWR to the same destination is already on the control send queue
  * it is updated instead: its TSN is raised to high_tsn if that is newer,
  * and the SCTP_CWR_REDUCE_OVERRIDE flag is set if the caller asked for it.
  * Does nothing when net is NULL or an allocation fails.
  */
 void
 sctp_send_cwr(struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t high_tsn, uint8_t override)
 {
 	struct sctp_association *asoc;
 	struct sctp_tmit_chunk *chk;
 	struct sctp_cwr_chunk *cwr;
 	uint32_t queued_tsn;
 
 	SCTP_TCB_LOCK_ASSERT(stcb);
 	if (net == NULL) {
 		return;
 	}
 	asoc = &stcb->asoc;
 	TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
 		if ((chk->rec.chunk_id.id != SCTP_ECN_CWR) || (chk->whoTo != net)) {
 			continue;
 		}
 		/* Reuse the CWR that is already queued to this destination. */
 		cwr = mtod(chk->data, struct sctp_cwr_chunk *);
 		queued_tsn = ntohl(cwr->tsn);
 		if (SCTP_TSN_GT(high_tsn, queued_tsn)) {
 			cwr->tsn = htonl(high_tsn);
 		}
 		if (override & SCTP_CWR_REDUCE_OVERRIDE) {
 			/* The override request must not get lost. */
 			cwr->ch.chunk_flags |= SCTP_CWR_REDUCE_OVERRIDE;
 		}
 		return;
 	}
 	/* No queued CWR for this destination, build a new one. */
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		return;
 	}
 	chk->asoc = asoc;
 	chk->flags = 0;
 	chk->copy_by_ref = 0;
 	chk->rec.chunk_id.can_take_data = 1;
 	chk->rec.chunk_id.id = SCTP_ECN_CWR;
 	chk->send_size = sizeof(struct sctp_cwr_chunk);
 	chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_NOWAIT, 1, MT_HEADER);
 	if (chk->data == NULL) {
 		sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
 		return;
 	}
 	SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
 	SCTP_BUF_LEN(chk->data) = chk->send_size;
 	chk->snd_count = 0;
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 	/* The queued chunk holds a reference on its destination. */
 	chk->whoTo = net;
 	atomic_add_int(&chk->whoTo->ref_count, 1);
 
 	cwr = mtod(chk->data, struct sctp_cwr_chunk *);
 	cwr->ch.chunk_length = htons(sizeof(struct sctp_cwr_chunk));
 	cwr->ch.chunk_flags = override;
 	cwr->ch.chunk_type = SCTP_ECN_CWR;
 	cwr->tsn = htonl(high_tsn);
 	TAILQ_INSERT_TAIL(&asoc->control_send_queue, chk, sctp_next);
 	asoc->ctrl_queue_cnt++;
 }
 
 /*
  * Append an outgoing stream reset request parameter to the chunk in `chk'.
  *
  * A stream is included when it is in state SCTP_STREAM_RESET_PENDING, has
  * no chunks on any queue and an empty out queue.  If every outgoing stream
  * qualifies, the stream list is left empty and all streams are marked in
  * flight; otherwise at most SCTP_MAX_STREAMS_AT_ONCE_RESET streams are
  * listed and marked in flight.
  *
  * seq, resp_seq and last_sent fill the request sequence number, response
  * sequence number and "send reset at TSN" fields.  Returns 0 without
  * touching the chunk when no stream qualifies, 1 after the parameter has
  * been added and the chunk's length fields updated.
  */
 static int
 sctp_add_stream_reset_out(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk,
     uint32_t seq, uint32_t resp_seq, uint32_t last_sent)
 {
 	struct sctp_association *asoc = &stcb->asoc;
 	struct sctp_stream_reset_out_request *req_out;
 	struct sctp_chunkhdr *ch;
 	uint16_t param_len, base_len, sid;
 	int listed;
 	int nstreams = 0;
 
 	/* The new parameter starts at the padded end of the chunk. */
 	ch = mtod(chk->data, struct sctp_chunkhdr *);
 	base_len = SCTP_SIZE32(ntohs(ch->chunk_length));
 	req_out = (struct sctp_stream_reset_out_request *)((caddr_t)ch + base_len);
 
 	/* First pass: count the streams that are ready to be reset. */
 	for (sid = 0; sid < asoc->streamoutcnt; sid++) {
 		if (asoc->strmout[sid].state != SCTP_STREAM_RESET_PENDING) {
 			continue;
 		}
 		if (asoc->strmout[sid].chunks_on_queues != 0) {
 			continue;
 		}
 		if (!TAILQ_EMPTY(&asoc->strmout[sid].outqueue)) {
 			continue;
 		}
 		nstreams++;
 	}
 	if (nstreams == 0) {
 		return (0);
 	}
 	if (nstreams == asoc->streamoutcnt) {
 		/* All streams: send an empty list instead. */
 		nstreams = 0;
 	}
 	if (nstreams > SCTP_MAX_STREAMS_AT_ONCE_RESET) {
 		nstreams = SCTP_MAX_STREAMS_AT_ONCE_RESET;
 	}
 
 	param_len = (uint16_t)(sizeof(struct sctp_stream_reset_out_request) + (sizeof(uint16_t) * nstreams));
 	req_out->ph.param_type = htons(SCTP_STR_RESET_OUT_REQUEST);
 	req_out->ph.param_length = htons(param_len);
 	req_out->request_seq = htonl(seq);
 	req_out->response_seq = htonl(resp_seq);
 	req_out->send_reset_at_tsn = htonl(last_sent);
 
 	/* Second pass: list the streams and mark them as in flight. */
 	if (nstreams == 0) {
 		for (sid = 0; sid < asoc->streamoutcnt; sid++) {
 			asoc->strmout[sid].state = SCTP_STREAM_RESET_IN_FLIGHT;
 		}
 	} else {
 		listed = 0;
 		for (sid = 0; sid < asoc->streamoutcnt; sid++) {
 			if ((asoc->strmout[sid].state != SCTP_STREAM_RESET_PENDING) ||
 			    (asoc->strmout[sid].chunks_on_queues != 0) ||
 			    !TAILQ_EMPTY(&asoc->strmout[sid].outqueue)) {
 				continue;
 			}
 			req_out->list_of_streams[listed] = htons(sid);
 			listed++;
 			asoc->strmout[sid].state = SCTP_STREAM_RESET_IN_FLIGHT;
 			if (listed >= nstreams) {
 				break;
 			}
 		}
 	}
 	if (SCTP_SIZE32(param_len) > param_len) {
 		/*-
 		 * The parameter ends 2 bytes short of a 4-byte boundary;
 		 * zero the slot that will go out as padding.
 		 */
 		req_out->list_of_streams[nstreams] = 0;
 	}
 	/* Account for the new parameter in the chunk's length fields. */
 	ch->chunk_length = htons(param_len + base_len);
 	chk->book_size = param_len + base_len;
 	chk->book_size_scale = 0;
 	chk->send_size = SCTP_SIZE32(chk->book_size);
 	SCTP_BUF_LEN(chk->data) = chk->send_size;
 	return (1);
 }
 
 /*
  * Append an incoming stream reset request parameter to the chunk in `chk'.
  *
  * number_entries stream ids are taken from `list' and written in network
  * byte order; `seq' becomes the request sequence number.  The chunk header
  * length and the chunk's book_size, send_size and mbuf length are updated
  * to cover the new parameter.
  */
 static void
 sctp_add_stream_reset_in(struct sctp_tmit_chunk *chk,
     int number_entries, uint16_t *list,
     uint32_t seq)
 {
 	struct sctp_stream_reset_in_request *req_in;
 	struct sctp_chunkhdr *ch;
 	uint16_t param_len, base_len, idx;
 
 	/* The new parameter starts at the padded end of the chunk. */
 	ch = mtod(chk->data, struct sctp_chunkhdr *);
 	base_len = SCTP_SIZE32(ntohs(ch->chunk_length));
 	req_in = (struct sctp_stream_reset_in_request *)((caddr_t)ch + base_len);
 
 	param_len = (uint16_t)(sizeof(struct sctp_stream_reset_in_request) + (sizeof(uint16_t) * number_entries));
 	req_in->ph.param_type = htons(SCTP_STR_RESET_IN_REQUEST);
 	req_in->ph.param_length = htons(param_len);
 	req_in->request_seq = htonl(seq);
 	for (idx = 0; idx < number_entries; idx++) {
 		req_in->list_of_streams[idx] = htons(list[idx]);
 	}
 	if (SCTP_SIZE32(param_len) > param_len) {
 		/*-
 		 * The parameter ends 2 bytes short of a 4-byte boundary;
 		 * zero the slot that will go out as padding.
 		 */
 		req_in->list_of_streams[number_entries] = 0;
 	}
 	/* Account for the new parameter in the chunk's length fields. */
 	ch->chunk_length = htons(param_len + base_len);
 	chk->book_size = param_len + base_len;
 	chk->book_size_scale = 0;
 	chk->send_size = SCTP_SIZE32(chk->book_size);
 	SCTP_BUF_LEN(chk->data) = chk->send_size;
 }
 
 /*
  * Append a TSN reset request parameter with request sequence number `seq'
  * to the chunk in `chk', and update the chunk header length as well as the
  * chunk's send_size, book_size and mbuf length accordingly.
  */
 static void
 sctp_add_stream_reset_tsn(struct sctp_tmit_chunk *chk,
     uint32_t seq)
 {
 	struct sctp_stream_reset_tsn_request *req_tsn;
 	struct sctp_chunkhdr *ch;
 	uint16_t param_len, base_len;
 
 	/* The new parameter starts at the padded end of the chunk. */
 	ch = mtod(chk->data, struct sctp_chunkhdr *);
 	base_len = SCTP_SIZE32(ntohs(ch->chunk_length));
 	req_tsn = (struct sctp_stream_reset_tsn_request *)((caddr_t)ch + base_len);
 
 	/* This parameter has a fixed size. */
 	param_len = sizeof(struct sctp_stream_reset_tsn_request);
 	req_tsn->request_seq = htonl(seq);
 	req_tsn->ph.param_length = htons(param_len);
 	req_tsn->ph.param_type = htons(SCTP_STR_RESET_TSN_REQUEST);
 
 	/* Account for the new parameter in the chunk's length fields. */
 	ch->chunk_length = htons(param_len + base_len);
 	chk->send_size = param_len + base_len;
 	chk->book_size_scale = 0;
 	chk->book_size = SCTP_SIZE32(chk->send_size);
 	SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size);
 }
 
 /*
  * Append a stream reset response parameter to the STREAM RESET chunk stored
  * in chk->data.
  *
  * chk      - chunk whose mbuf already holds a STREAM RESET chunk header.
  * resp_seq - sequence number of the request being answered.
  * result   - result code to report, in host byte order.
  *
  * The parameter is written at the 32-bit padded end of the chunk. The chunk
  * header length, chk->book_size, chk->send_size and the mbuf length are
  * updated to include it.
  */
 void
 sctp_add_stream_reset_result(struct sctp_tmit_chunk *chk,
     uint32_t resp_seq, uint32_t result)
 {
 	uint16_t len, old_len;
 	struct sctp_stream_reset_response *resp;
 	struct sctp_chunkhdr *ch;
 
 	ch = mtod(chk->data, struct sctp_chunkhdr *);
 	old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length));
 
 	/* get to new offset for the param. */
 	resp = (struct sctp_stream_reset_response *)((caddr_t)ch + len);
 	/* now how long will this param be? */
 	len = sizeof(struct sctp_stream_reset_response);
 	resp->ph.param_type = htons(SCTP_STR_RESET_RESPONSE);
 	resp->ph.param_length = htons(len);
 	resp->response_seq = htonl(resp_seq);
 	/*
 	 * Host to network conversion: use htonl() like every other field
 	 * written here (this used to be spelled ntohl()).
 	 */
 	resp->result = htonl(result);
 
 	/* now fix the chunk length */
 	ch->chunk_length = htons(len + old_len);
 	chk->book_size = len + old_len;
 	chk->book_size_scale = 0;
 	chk->send_size = SCTP_SIZE32(chk->book_size);
 	SCTP_BUF_LEN(chk->data) = chk->send_size;
 	return;
 }
 
 /*
  * Queue a STREAM RESET chunk carrying the response `response` for the
  * deferred reset request described by `ent`.
  *
  * The response is always recorded in asoc->last_reset_action[0]. Nothing is
  * queued when a stream reset is still outstanding or when the chunk or its
  * mbuf cannot be allocated.
  */
 void
 sctp_send_deferred_reset_response(struct sctp_tcb *stcb,
     struct sctp_stream_reset_list *ent,
     int response)
 {
 	struct sctp_association *asoc = &stcb->asoc;
 	struct sctp_tmit_chunk *chk;
 	struct sctp_chunkhdr *ch;
 
 	/*
 	 * Remember the new response first, so that a retransmission from
 	 * the peer gets it even if we fail to send anything below.
 	 */
 	asoc->last_reset_action[0] = response;
 	if (asoc->stream_reset_outstanding) {
 		return;
 	}
 
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 		return;
 	}
 	chk->copy_by_ref = 0;
 	chk->rec.chunk_id.id = SCTP_STREAM_RESET;
 	chk->rec.chunk_id.can_take_data = 0;
 	chk->flags = 0;
 	chk->asoc = asoc;
 	/* So far the chunk consists of the chunk header only. */
 	chk->book_size = sizeof(struct sctp_chunkhdr);
 	chk->send_size = SCTP_SIZE32(chk->book_size);
 	chk->book_size_scale = 0;
 
 	chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
 	if (chk->data == NULL) {
 		sctp_free_a_chunk(stcb, chk, SCTP_SO_LOCKED);
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 		return;
 	}
 	SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
 
 	/* Transmission state and destination (alternate path if set). */
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 	chk->snd_count = 0;
 	chk->whoTo = asoc->alternate ? asoc->alternate : asoc->primary_destination;
 
 	/* Write the chunk header, then let the helper append the response. */
 	ch = mtod(chk->data, struct sctp_chunkhdr *);
 	ch->chunk_type = SCTP_STREAM_RESET;
 	ch->chunk_flags = 0;
 	ch->chunk_length = htons(chk->book_size);
 	atomic_add_int(&chk->whoTo->ref_count, 1);
 	SCTP_BUF_LEN(chk->data) = chk->send_size;
 	sctp_add_stream_reset_result(chk, ent->seq, response);
 
 	/* Hand the chunk to the control send queue. */
 	TAILQ_INSERT_TAIL(&asoc->control_send_queue, chk, sctp_next);
 	asoc->ctrl_queue_cnt++;
 }
 
 /*
  * Append a stream reset response parameter that also reports TSN values to
  * the STREAM RESET chunk stored in chk->data.
  *
  * Besides the response sequence number and the result code, the parameter
  * carries `send_una` as the sender's next TSN and `recv_next` as the
  * receiver's next TSN. All fields are stored in network byte order and the
  * chunk length plus the sizes kept in `chk` are updated.
  */
 void
 sctp_add_stream_reset_result_tsn(struct sctp_tmit_chunk *chk,
     uint32_t resp_seq, uint32_t result,
     uint32_t send_una, uint32_t recv_next)
 {
 	struct sctp_chunkhdr *ch;
 	struct sctp_stream_reset_response_tsn *resp;
 	uint16_t param_off, param_len;
 
 	ch = mtod(chk->data, struct sctp_chunkhdr *);
 	param_off = SCTP_SIZE32(ntohs(ch->chunk_length));
 	param_len = sizeof(struct sctp_stream_reset_response_tsn);
 
 	/* The parameter lives at the padded end of the current chunk. */
 	resp = (struct sctp_stream_reset_response_tsn *)((caddr_t)ch + param_off);
 	resp->ph.param_type = htons(SCTP_STR_RESET_RESPONSE);
 	resp->ph.param_length = htons(param_len);
 	resp->response_seq = htonl(resp_seq);
 	resp->result = htonl(result);
 	resp->senders_next_tsn = htonl(send_una);
 	resp->receivers_next_tsn = htonl(recv_next);
 
 	/* Grow the chunk and its bookkeeping by the parameter just added. */
 	ch->chunk_length = htons(param_len + param_off);
 	chk->book_size = param_len + param_off;
 	chk->send_size = SCTP_SIZE32(chk->book_size);
 	chk->book_size_scale = 0;
 	SCTP_BUF_LEN(chk->data) = chk->send_size;
 }
 
 /*
  * Append an "add outgoing streams" request parameter to the STREAM RESET
  * chunk stored in chk->data.
  *
  * `seq` is the request sequence number and `adding` the number of streams
  * asked for. The chunk header length, the sizes recorded in `chk` and the
  * mbuf length are updated to include the parameter.
  */
 static void
 sctp_add_an_out_stream(struct sctp_tmit_chunk *chk,
     uint32_t seq,
     uint16_t adding)
 {
 	struct sctp_chunkhdr *ch;
 	struct sctp_stream_reset_add_strm *addstr;
 	uint16_t param_off, param_len;
 
 	ch = mtod(chk->data, struct sctp_chunkhdr *);
 	param_off = SCTP_SIZE32(ntohs(ch->chunk_length));
 	param_len = sizeof(struct sctp_stream_reset_add_strm);
 
 	/* The parameter lives at the padded end of the current chunk. */
 	addstr = (struct sctp_stream_reset_add_strm *)((caddr_t)ch + param_off);
 	addstr->ph.param_type = htons(SCTP_STR_RESET_ADD_OUT_STREAMS);
 	addstr->ph.param_length = htons(param_len);
 	addstr->request_seq = htonl(seq);
 	addstr->number_of_streams = htons(adding);
 	addstr->reserved = 0;
 
 	/* Grow the chunk and its bookkeeping by the parameter just added. */
 	ch->chunk_length = htons(param_len + param_off);
 	chk->send_size = param_len + param_off;
 	chk->book_size = SCTP_SIZE32(chk->send_size);
 	chk->book_size_scale = 0;
 	SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size);
 }
 
 /*
  * Append an "add incoming streams" request parameter to the STREAM RESET
  * chunk stored in chk->data.
  *
  * `seq` is the request sequence number and `adding` the number of streams
  * asked for. The chunk header length, the sizes recorded in `chk` and the
  * mbuf length are updated to include the parameter.
  */
 static void
 sctp_add_an_in_stream(struct sctp_tmit_chunk *chk,
     uint32_t seq,
     uint16_t adding)
 {
 	struct sctp_chunkhdr *ch;
 	struct sctp_stream_reset_add_strm *addstr;
 	uint16_t param_off, param_len;
 
 	ch = mtod(chk->data, struct sctp_chunkhdr *);
 	param_off = SCTP_SIZE32(ntohs(ch->chunk_length));
 	param_len = sizeof(struct sctp_stream_reset_add_strm);
 
 	/* The parameter lives at the padded end of the current chunk. */
 	addstr = (struct sctp_stream_reset_add_strm *)((caddr_t)ch + param_off);
 	addstr->ph.param_type = htons(SCTP_STR_RESET_ADD_IN_STREAMS);
 	addstr->ph.param_length = htons(param_len);
 	addstr->request_seq = htonl(seq);
 	addstr->number_of_streams = htons(adding);
 	addstr->reserved = 0;
 
 	/* Grow the chunk and its bookkeeping by the parameter just added. */
 	ch->chunk_length = htons(param_len + param_off);
 	chk->send_size = param_len + param_off;
 	chk->book_size = SCTP_SIZE32(chk->send_size);
 	chk->book_size_scale = 0;
 	SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size);
 }
 
 int
 sctp_send_stream_reset_out_if_possible(struct sctp_tcb *stcb, int so_locked)
 {
 	struct sctp_association *asoc;
 	struct sctp_tmit_chunk *chk;
 	struct sctp_chunkhdr *ch;
 	uint32_t seq;
 
 	asoc = &stcb->asoc;
 	asoc->trigger_reset = 0;
 	if (asoc->stream_reset_outstanding) {
 		return (EALREADY);
 	}
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 		return (ENOMEM);
 	}
 	chk->copy_by_ref = 0;
 	chk->rec.chunk_id.id = SCTP_STREAM_RESET;
 	chk->rec.chunk_id.can_take_data = 0;
 	chk->flags = 0;
 	chk->asoc = &stcb->asoc;
 	chk->book_size = sizeof(struct sctp_chunkhdr);
 	chk->send_size = SCTP_SIZE32(chk->book_size);
 	chk->book_size_scale = 0;
 	chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
 	if (chk->data == NULL) {
 		sctp_free_a_chunk(stcb, chk, so_locked);
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 		return (ENOMEM);
 	}
 	SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
 
 	/* setup chunk parameters */
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 	chk->snd_count = 0;
 	if (stcb->asoc.alternate) {
 		chk->whoTo = stcb->asoc.alternate;
 	} else {
 		chk->whoTo = stcb->asoc.primary_destination;
 	}
 	ch = mtod(chk->data, struct sctp_chunkhdr *);
 	ch->chunk_type = SCTP_STREAM_RESET;
 	ch->chunk_flags = 0;
 	ch->chunk_length = htons(chk->book_size);
 	atomic_add_int(&chk->whoTo->ref_count, 1);
 	SCTP_BUF_LEN(chk->data) = chk->send_size;
 	seq = stcb->asoc.str_reset_seq_out;
 	if (sctp_add_stream_reset_out(stcb, chk, seq, (stcb->asoc.str_reset_seq_in - 1), (stcb->asoc.sending_seq - 1))) {
 		seq++;
 		asoc->stream_reset_outstanding++;
 	} else {
 		m_freem(chk->data);
 		chk->data = NULL;
 		sctp_free_a_chunk(stcb, chk, so_locked);
 		return (ENOENT);
 	}
 	asoc->str_reset = chk;
 	/* insert the chunk for sending */
 	TAILQ_INSERT_TAIL(&asoc->control_send_queue,
 	    chk,
 	    sctp_next);
 	asoc->ctrl_queue_cnt++;
 
 	if (stcb->asoc.send_sack) {
 		sctp_send_sack(stcb, so_locked);
 	}
 	sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo);
 	return (0);
 }
 
 /*
  * Build and queue a STREAM RESET chunk carrying one or more request
  * parameters, then start the stream reset timer.
  *
  * stcb           - association control block the request is sent on.
  * number_entries - number of stream ids in `list`.
  * list           - stream ids placed in the incoming reset request.
  * send_in_req    - non-zero to add an incoming reset request; this also
  *                  makes the function try to add an outgoing reset request
  *                  first.
  * send_tsn_req   - non-zero to add a TSN reset request; must not be
  *                  combined with send_in_req.
  * add_stream     - bit mask: bit 0 (value 1) asks for `adding_o` outgoing
  *                  streams, bit 1 (value 2) asks for `adding_i` incoming
  *                  streams.
  * adding_o       - number of outgoing streams to add.
  * adding_i       - number of incoming streams to add.
  * peer_asked     - stored in asoc->peer_req_out when outgoing streams are
  *                  added.
  *
  * Returns 0 once the chunk has been queued, EBUSY when a reset is already
  * outstanding, EINVAL for an empty or conflicting request, and ENOMEM when
  * the stream list does not fit into one cluster or a chunk/mbuf cannot be
  * allocated.
  *
  * NOTE(review): when no parameter ends up being added (for example only
  * bit 0 of add_stream was set and growing the stream array failed, or
  * adding_o is 0), the chunk is still queued with just its header and 0 is
  * returned -- confirm this is intended.
  */
 int
 sctp_send_str_reset_req(struct sctp_tcb *stcb,
     uint16_t number_entries, uint16_t *list,
     uint8_t send_in_req,
     uint8_t send_tsn_req,
     uint8_t add_stream,
     uint16_t adding_o,
     uint16_t adding_i, uint8_t peer_asked)
 {
 	struct sctp_association *asoc;
 	struct sctp_tmit_chunk *chk;
 	struct sctp_chunkhdr *ch;
 	int can_send_out_req = 0;
 	uint32_t seq;
 
 	asoc = &stcb->asoc;
 	if (asoc->stream_reset_outstanding) {
 		/*-
 		 * Already one pending, must get ACK back to clear the flag.
 		 */
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EBUSY);
 		return (EBUSY);
 	}
 	if ((send_in_req == 0) && (send_tsn_req == 0) &&
 	    (add_stream == 0)) {
 		/* nothing to do */
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 		return (EINVAL);
 	}
 	if (send_tsn_req && send_in_req) {
 		/* error, can't do that */
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 		return (EINVAL);
 	} else if (send_in_req) {
 		/* An incoming request is accompanied by an outgoing one. */
 		can_send_out_req = 1;
 	}
 	/*
 	 * The stream list has to fit into a single cluster together with
 	 * the reserved leading space, the chunk header and the fixed part
 	 * of an outgoing reset request.
 	 */
 	if (number_entries > (MCLBYTES -
 	    SCTP_MIN_OVERHEAD -
 	    sizeof(struct sctp_chunkhdr) -
 	    sizeof(struct sctp_stream_reset_out_request)) /
 	    sizeof(uint16_t)) {
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 		return (ENOMEM);
 	}
 	sctp_alloc_a_chunk(stcb, chk);
 	if (chk == NULL) {
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 		return (ENOMEM);
 	}
 	chk->copy_by_ref = 0;
 	chk->rec.chunk_id.id = SCTP_STREAM_RESET;
 	chk->rec.chunk_id.can_take_data = 0;
 	chk->flags = 0;
 	chk->asoc = &stcb->asoc;
 	/* So far the chunk consists of the chunk header only. */
 	chk->book_size = sizeof(struct sctp_chunkhdr);
 	chk->send_size = SCTP_SIZE32(chk->book_size);
 	chk->book_size_scale = 0;
 	chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
 	if (chk->data == NULL) {
 		sctp_free_a_chunk(stcb, chk, SCTP_SO_LOCKED);
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 		return (ENOMEM);
 	}
 	SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
 
 	/* setup chunk parameters */
 	chk->sent = SCTP_DATAGRAM_UNSENT;
 	chk->snd_count = 0;
 	/* Prefer the alternate destination when one is set. */
 	if (stcb->asoc.alternate) {
 		chk->whoTo = stcb->asoc.alternate;
 	} else {
 		chk->whoTo = stcb->asoc.primary_destination;
 	}
 	atomic_add_int(&chk->whoTo->ref_count, 1);
 	ch = mtod(chk->data, struct sctp_chunkhdr *);
 	ch->chunk_type = SCTP_STREAM_RESET;
 	ch->chunk_flags = 0;
 	ch->chunk_length = htons(chk->book_size);
 	SCTP_BUF_LEN(chk->data) = chk->send_size;
 
 	/*
 	 * Every parameter added below consumes one request sequence number,
 	 * starting at the association's current outgoing value.
 	 */
 	seq = stcb->asoc.str_reset_seq_out;
 	if (can_send_out_req) {
 		int ret;
 
 		ret = sctp_add_stream_reset_out(stcb, chk, seq, (stcb->asoc.str_reset_seq_in - 1), (stcb->asoc.sending_seq - 1));
 		if (ret) {
 			seq++;
 			asoc->stream_reset_outstanding++;
 		}
 	}
 	/*
 	 * Outgoing streams are requested but the stream array has fewer
 	 * than adding_o spare entries: replace it with a larger one.
 	 */
 	if ((add_stream & 1) &&
 	    ((stcb->asoc.strm_realoutsize - stcb->asoc.streamoutcnt) < adding_o)) {
 		/* Need to allocate more */
 		struct sctp_stream_out *oldstream;
 		struct sctp_stream_queue_pending *sp, *nsp;
 		int i;
 #if defined(SCTP_DETAILED_STR_STATS)
 		int j;
 #endif
 
 		oldstream = stcb->asoc.strmout;
 		/* get some more */
 		SCTP_MALLOC(stcb->asoc.strmout, struct sctp_stream_out *,
 		    (stcb->asoc.streamoutcnt + adding_o) * sizeof(struct sctp_stream_out),
 		    SCTP_M_STRMO);
 		if (stcb->asoc.strmout == NULL) {
 			uint8_t x;
 
 			/*
 			 * Keep the old array and give up on adding outgoing
 			 * streams; the other requests are still processed.
 			 */
 			stcb->asoc.strmout = oldstream;
 			/* Turn off the bit */
 			x = add_stream & 0xfe;
 			add_stream = x;
 			goto skip_stuff;
 		}
 		/*
 		 * Ok now we proceed with copying the old out stuff and
 		 * initializing the new stuff.
 		 */
 		SCTP_TCB_SEND_LOCK(stcb);
 		stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, 0, 1);
 		for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
 			TAILQ_INIT(&stcb->asoc.strmout[i].outqueue);
 			stcb->asoc.strmout[i].chunks_on_queues = oldstream[i].chunks_on_queues;
 			stcb->asoc.strmout[i].next_mid_ordered = oldstream[i].next_mid_ordered;
 			stcb->asoc.strmout[i].next_mid_unordered = oldstream[i].next_mid_unordered;
 			stcb->asoc.strmout[i].last_msg_incomplete = oldstream[i].last_msg_incomplete;
 			stcb->asoc.strmout[i].sid = i;
 			stcb->asoc.strmout[i].state = oldstream[i].state;
 			/* FIX ME FIX ME */
 			/*
 			 * This should be a SS_COPY operation FIX ME STREAM
 			 * SCHEDULER EXPERT
 			 */
 			stcb->asoc.ss_functions.sctp_ss_init_stream(stcb, &stcb->asoc.strmout[i], &oldstream[i]);
 			/* now anything on those queues? */
 			TAILQ_FOREACH_SAFE(sp, &oldstream[i].outqueue, next, nsp) {
 				TAILQ_REMOVE(&oldstream[i].outqueue, sp, next);
 				TAILQ_INSERT_TAIL(&stcb->asoc.strmout[i].outqueue, sp, next);
 			}
 
 		}
 		/* now the new streams */
 		stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1);
 		for (i = stcb->asoc.streamoutcnt; i < (stcb->asoc.streamoutcnt + adding_o); i++) {
 			TAILQ_INIT(&stcb->asoc.strmout[i].outqueue);
 			stcb->asoc.strmout[i].chunks_on_queues = 0;
 #if defined(SCTP_DETAILED_STR_STATS)
 			for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) {
 				stcb->asoc.strmout[i].abandoned_sent[j] = 0;
 				stcb->asoc.strmout[i].abandoned_unsent[j] = 0;
 			}
 #else
 			stcb->asoc.strmout[i].abandoned_sent[0] = 0;
 			stcb->asoc.strmout[i].abandoned_unsent[0] = 0;
 #endif
 			stcb->asoc.strmout[i].next_mid_ordered = 0;
 			stcb->asoc.strmout[i].next_mid_unordered = 0;
 			stcb->asoc.strmout[i].sid = i;
 			stcb->asoc.strmout[i].last_msg_incomplete = 0;
 			stcb->asoc.ss_functions.sctp_ss_init_stream(stcb, &stcb->asoc.strmout[i], NULL);
 			/* New streams start out closed. */
 			stcb->asoc.strmout[i].state = SCTP_STREAM_CLOSED;
 		}
 		/* Only the capacity grows here; streamoutcnt is left untouched. */
 		stcb->asoc.strm_realoutsize = stcb->asoc.streamoutcnt + adding_o;
 		SCTP_FREE(oldstream, SCTP_M_STRMO);
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 skip_stuff:
 	/* Request additional outgoing streams. */
 	if ((add_stream & 1) && (adding_o > 0)) {
 		asoc->strm_pending_add_size = adding_o;
 		asoc->peer_req_out = peer_asked;
 		sctp_add_an_out_stream(chk, seq, adding_o);
 		seq++;
 		asoc->stream_reset_outstanding++;
 	}
 	/* Request additional incoming streams. */
 	if ((add_stream & 2) && (adding_i > 0)) {
 		sctp_add_an_in_stream(chk, seq, adding_i);
 		seq++;
 		asoc->stream_reset_outstanding++;
 	}
 	/* Request a reset of the listed incoming streams. */
 	if (send_in_req) {
 		sctp_add_stream_reset_in(chk, number_entries, list, seq);
 		seq++;
 		asoc->stream_reset_outstanding++;
 	}
 	/* Request a TSN reset. */
 	if (send_tsn_req) {
 		sctp_add_stream_reset_tsn(chk, seq);
 		asoc->stream_reset_outstanding++;
 	}
 	/* Remember the chunk as the association's pending reset request. */
 	asoc->str_reset = chk;
 	/* insert the chunk for sending */
 	TAILQ_INSERT_TAIL(&asoc->control_send_queue,
 	    chk,
 	    sctp_next);
 	asoc->ctrl_queue_cnt++;
 	if (stcb->asoc.send_sack) {
 		sctp_send_sack(stcb, SCTP_SO_LOCKED);
 	}
 	sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo);
 	return (0);
 }
 
 /*
  * Answer the packet in `m` with an ABORT chunk built by
  * sctp_send_resp_msg(), unless sctp_is_there_an_abort_here() reports that
  * the packet itself contains an ABORT. In that case nothing is sent and
  * `cause`, if any, is freed here.
  */
 void
 sctp_send_abort(struct mbuf *m, int iphlen, struct sockaddr *src, struct sockaddr *dst,
     struct sctphdr *sh, uint32_t vtag, struct mbuf *cause,
     uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum,
     uint32_t vrf_id, uint16_t port)
 {
 	if (!sctp_is_there_an_abort_here(m, iphlen, &vtag)) {
 		sctp_send_resp_msg(src, dst, sh, vtag, SCTP_ABORT_ASSOCIATION, cause,
 		    mflowtype, mflowid, fibnum,
 		    vrf_id, port);
 		return;
 	}
 	/* Don't respond to an ABORT with an ABORT. */
 	if (cause != NULL) {
 		sctp_m_freem(cause);
 	}
 }
 
 /*
  * Send an OPERATION ERROR chunk with the error cause `cause`; all
  * arguments are passed through unchanged to sctp_send_resp_msg().
  */
 void
 sctp_send_operr_to(struct sockaddr *src, struct sockaddr *dst,
     struct sctphdr *sh, uint32_t vtag, struct mbuf *cause,
     uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum,
     uint32_t vrf_id, uint16_t port)
 {
 	sctp_send_resp_msg(src, dst, sh, vtag,
 	    SCTP_OPERATION_ERROR, cause,
 	    mflowtype, mflowid, fibnum, vrf_id, port);
 }
 
 /*
  * Copy up to `max_send_len` bytes of user data from `uio` into a new mbuf
  * chain.
  *
  * The chain is requested with M_PKTHDR, plus M_EOR when `user_marks_eor`
  * is non-zero. On success the chain is returned, its length is stored in
  * `*sndout` and its last mbuf in `*new_tail`. On failure NULL is returned
  * and `*error` is set to ENOBUFS; `*sndout` and `*new_tail` are left
  * untouched.
  */
 static struct mbuf *
 sctp_copy_resume(struct uio *uio,
     int max_send_len,
     int user_marks_eor,
     int *error,
     uint32_t *sndout,
     struct mbuf **new_tail)
 {
 	struct mbuf *chain;
 
 	chain = m_uiotombuf(uio, M_WAITOK, max_send_len, 0,
 	    (M_PKTHDR | (user_marks_eor ? M_EOR : 0)));
 	if (chain == NULL) {
 		SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOBUFS);
 		*error = ENOBUFS;
 		return (NULL);
 	}
 	*sndout = m_length(chain, NULL);
 	*new_tail = m_last(chain);
 	return (chain);
 }
 
 /*
  * Copy sp->length bytes of user data from `uio` into a new mbuf chain,
  * passing `resv_upfront` to m_uiotombuf() as the space to leave free at
  * the front.
  *
  * On success sp->data holds the chain, sp->tail_mbuf its last mbuf, and 0
  * is returned. ENOBUFS is returned when no chain could be built.
  */
 static int
 sctp_copy_one(struct sctp_stream_queue_pending *sp,
     struct uio *uio,
     int resv_upfront)
 {
 	sp->data = m_uiotombuf(uio, M_WAITOK, sp->length, resv_upfront, 0);
 	if (sp->data != NULL) {
 		sp->tail_mbuf = m_last(sp->data);
 		return (0);
 	}
 	SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOBUFS);
 	return (ENOBUFS);
 }
 
 
 
 /*
  * Allocate a stream queue entry for user data and copy the data in.
  *
  * The entry is filled in from `srcv` (flags, time to live, ppid, context,
  * stream id) and takes min(uio->uio_resid, max_send_len) bytes from `uio`.
  * It is marked complete when that covers everything left in `uio` and
  * either explicit EOR mode is off or the caller flagged SCTP_EOF/SCTP_EOR.
  *
  * Returns the new entry with *error set to 0. On failure NULL is returned
  * and *error is ECONNRESET (association is shutting down), ENOMEM (no
  * entry could be allocated) or the error reported by sctp_copy_one().
  *
  * The original note on this routine still applies: protocol processing is
  * running while the (slow) copy takes place, so be careful with anything
  * that touches the stcb/asoc here.
  */
 static struct sctp_stream_queue_pending *
 sctp_copy_it_in(struct sctp_tcb *stcb,
     struct sctp_association *asoc,
     struct sctp_sndrcvinfo *srcv,
     struct uio *uio,
     struct sctp_nets *net,
     ssize_t max_send_len,
     int user_marks_eor,
     int *error)
 {
 	struct sctp_stream_queue_pending *sp = NULL;
 	int resv_in_first;
 
 	*error = 0;
 	/* No new data is accepted once a shutdown is under way. */
 	if ((SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_SENT) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED) ||
 	    (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) {
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET);
 		*error = ECONNRESET;
 		return (NULL);
 	}
 	sctp_alloc_a_strmoq(stcb, sp);
 	if (sp == NULL) {
 		SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 		*error = ENOMEM;
 		return (NULL);
 	}
 
 	/* Take over the caller's send information. */
 	sp->act_flags = 0;
 	sp->sinfo_flags = srcv->sinfo_flags;
 	sp->timetolive = srcv->sinfo_timetolive;
 	sp->ppid = srcv->sinfo_ppid;
 	sp->context = srcv->sinfo_context;
 	sp->fsn = 0;
 	(void)SCTP_GETTIME_TIMEVAL(&sp->ts);
 	sp->sid = srcv->sinfo_stream;
 
 	/* How much is taken now, and does that finish the message? */
 	sp->length = (uint32_t)min(uio->uio_resid, max_send_len);
 	if ((sp->length == (uint32_t)uio->uio_resid) &&
 	    ((user_marks_eor == 0) ||
 	    (srcv->sinfo_flags & SCTP_EOF) ||
 	    (srcv->sinfo_flags & SCTP_EOR))) {
 		sp->msg_is_complete = 1;
 	} else {
 		sp->msg_is_complete = 0;
 	}
 	sp->sender_all_done = 0;
 	sp->some_taken = 0;
 	sp->put_last_out = 0;
 	resv_in_first = SCTP_DATA_CHUNK_OVERHEAD(stcb);
 	sp->data = sp->tail_mbuf = NULL;
 
 	/* The key id, key reference and data copy only matter with data. */
 	if (sp->length != 0) {
 		if (srcv->sinfo_keynumber_valid) {
 			sp->auth_keyid = srcv->sinfo_keynumber;
 		} else {
 			sp->auth_keyid = stcb->asoc.authinfo.active_keyid;
 		}
 		if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) {
 			sctp_auth_key_acquire(stcb, sp->auth_keyid);
 			sp->holds_key_ref = 1;
 		}
 		*error = sctp_copy_one(sp, uio, resv_in_first);
 	}
 	if (*error) {
 		sctp_free_a_strmoq(stcb, sp, SCTP_SO_LOCKED);
 		return (NULL);
 	}
 
 	/* Pin the destination only when the caller overrides the address. */
 	if (sp->sinfo_flags & SCTP_ADDR_OVER) {
 		sp->net = net;
 		atomic_add_int(&sp->net->ref_count, 1);
 	} else {
 		sp->net = NULL;
 	}
 	sctp_set_prsctp_policy(sp);
 	return (sp);
 }
 
 
 /*
  * Socket layer send entry point: prepare the arguments and hand the send
  * over to sctp_lower_sosend().
  *
  * An SCTP_SNDRCV control message found in `control` is extracted and
  * passed on as the send information; otherwise NULL is passed. When both
  * INET and INET6 are compiled in, a v4-mapped IPv6 destination is
  * converted into a plain IPv4 address first.
  *
  * Returns whatever sctp_lower_sosend() returns.
  */
 int
 sctp_sosend(struct socket *so,
     struct sockaddr *addr,
     struct uio *uio,
     struct mbuf *top,
     struct mbuf *control,
     int flags,
     struct thread *p
 )
 {
 	struct sctp_sndrcvinfo sndrcvninfo;
 	struct sctp_sndrcvinfo *srcv = NULL;
 	struct sockaddr *addr_to_use = addr;
 #if defined(INET) && defined(INET6)
 	struct sockaddr_in sin;
 #endif
 
 	/* process cmsg snd/rcv info (maybe a assoc-id) */
 	if ((control != NULL) &&
 	    sctp_find_cmsg(SCTP_SNDRCV, (void *)&sndrcvninfo, control,
 	    sizeof(sndrcvninfo))) {
 		srcv = &sndrcvninfo;
 	}
 #if defined(INET) && defined(INET6)
 	if ((addr != NULL) && (addr->sa_family == AF_INET6)) {
 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
 
 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 			/* Use the embedded IPv4 address instead. */
 			in6_sin6_2_sin(&sin, sin6);
 			addr_to_use = (struct sockaddr *)&sin;
 		}
 	}
 #endif
 	return (sctp_lower_sosend(so, addr_to_use, uio, top, control, flags,
 	    srcv, p));
 }
 
 
 int
 sctp_lower_sosend(struct socket *so,
     struct sockaddr *addr,
     struct uio *uio,
     struct mbuf *i_pak,
     struct mbuf *control,
     int flags,
     struct sctp_sndrcvinfo *srcv
     ,
     struct thread *p
 )
 {
 	struct epoch_tracker et;
 	ssize_t sndlen = 0, max_len, local_add_more;
 	int error, len;
 	struct mbuf *top = NULL;
 	int queue_only = 0, queue_only_for_init = 0;
 	int free_cnt_applied = 0;
 	int un_sent;
 	int now_filled = 0;
 	unsigned int inqueue_bytes = 0;
 	struct sctp_block_entry be;
 	struct sctp_inpcb *inp;
 	struct sctp_tcb *stcb = NULL;
 	struct timeval now;
 	struct sctp_nets *net;
 	struct sctp_association *asoc;
 	struct sctp_inpcb *t_inp;
 	int user_marks_eor;
 	int create_lock_applied = 0;
 	int nagle_applies = 0;
 	int some_on_control = 0;
 	int got_all_of_the_send = 0;
 	int hold_tcblock = 0;
 	int non_blocking = 0;
 	ssize_t local_soresv = 0;
 	uint16_t port;
 	uint16_t sinfo_flags;
 	sctp_assoc_t sinfo_assoc_id;
 
 	error = 0;
 	net = NULL;
 	stcb = NULL;
 	asoc = NULL;
 
 	t_inp = inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 		error = EINVAL;
 		if (i_pak) {
 			SCTP_RELEASE_PKT(i_pak);
 		}
 		return (error);
 	}
 	if ((uio == NULL) && (i_pak == NULL)) {
 		SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 		return (EINVAL);
 	}
 	user_marks_eor = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR);
 	atomic_add_int(&inp->total_sends, 1);
 	if (uio) {
 		if (uio->uio_resid < 0) {
 			SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 			return (EINVAL);
 		}
 		sndlen = uio->uio_resid;
 	} else {
 		top = SCTP_HEADER_TO_CHAIN(i_pak);
 		sndlen = SCTP_HEADER_LEN(i_pak);
 	}
 	SCTPDBG(SCTP_DEBUG_OUTPUT1, "Send called addr:%p send length %zd\n",
 	    (void *)addr,
 	    sndlen);
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
 	    SCTP_IS_LISTENING(inp)) {
 		/* The listener can NOT send */
 		SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOTCONN);
 		error = ENOTCONN;
 		goto out_unlocked;
 	}
 	/**
 	 * Pre-screen address, if one is given the sin-len
 	 * must be set correctly!
 	 */
 	if (addr) {
 		union sctp_sockstore *raddr = (union sctp_sockstore *)addr;
 
 		switch (raddr->sa.sa_family) {
 #ifdef INET
 		case AF_INET:
 			if (raddr->sin.sin_len != sizeof(struct sockaddr_in)) {
 				SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 				error = EINVAL;
 				goto out_unlocked;
 			}
 			port = raddr->sin.sin_port;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			if (raddr->sin6.sin6_len != sizeof(struct sockaddr_in6)) {
 				SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 				error = EINVAL;
 				goto out_unlocked;
 			}
 			port = raddr->sin6.sin6_port;
 			break;
 #endif
 		default:
 			SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EAFNOSUPPORT);
 			error = EAFNOSUPPORT;
 			goto out_unlocked;
 		}
 	} else
 		port = 0;
 
 	if (srcv) {
 		sinfo_flags = srcv->sinfo_flags;
 		sinfo_assoc_id = srcv->sinfo_assoc_id;
 		if (INVALID_SINFO_FLAG(sinfo_flags) ||
 		    PR_SCTP_INVALID_POLICY(sinfo_flags)) {
 			SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 			error = EINVAL;
 			goto out_unlocked;
 		}
 		if (srcv->sinfo_flags)
 			SCTP_STAT_INCR(sctps_sends_with_flags);
 	} else {
 		sinfo_flags = inp->def_send.sinfo_flags;
 		sinfo_assoc_id = inp->def_send.sinfo_assoc_id;
 	}
 	if (flags & MSG_EOR) {
 		sinfo_flags |= SCTP_EOR;
 	}
 	if (flags & MSG_EOF) {
 		sinfo_flags |= SCTP_EOF;
 	}
 	if (sinfo_flags & SCTP_SENDALL) {
 		/* its a sendall */
 		error = sctp_sendall(inp, uio, top, srcv);
 		top = NULL;
 		goto out_unlocked;
 	}
 	if ((sinfo_flags & SCTP_ADDR_OVER) && (addr == NULL)) {
 		SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 		error = EINVAL;
 		goto out_unlocked;
 	}
 	/* now we must find the assoc */
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
 		SCTP_INP_RLOCK(inp);
 		stcb = LIST_FIRST(&inp->sctp_asoc_list);
 		if (stcb) {
 			SCTP_TCB_LOCK(stcb);
 			hold_tcblock = 1;
 		}
 		SCTP_INP_RUNLOCK(inp);
 	} else if (sinfo_assoc_id) {
 		stcb = sctp_findassociation_ep_asocid(inp, sinfo_assoc_id, 1);
 		if (stcb != NULL) {
 			hold_tcblock = 1;
 		}
 	} else if (addr) {
 		/*-
 		 * Since we did not use findep we must
 		 * increment it, and if we don't find a tcb
 		 * decrement it.
 		 */
 		SCTP_INP_WLOCK(inp);
 		SCTP_INP_INCR_REF(inp);
 		SCTP_INP_WUNLOCK(inp);
 		stcb = sctp_findassociation_ep_addr(&t_inp, addr, &net, NULL, NULL);
 		if (stcb == NULL) {
 			SCTP_INP_WLOCK(inp);
 			SCTP_INP_DECR_REF(inp);
 			SCTP_INP_WUNLOCK(inp);
 		} else {
 			hold_tcblock = 1;
 		}
 	}
 	if ((stcb == NULL) && (addr)) {
 		/* Possible implicit send? */
 		SCTP_ASOC_CREATE_LOCK(inp);
 		create_lock_applied = 1;
 		if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
 		    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) {
 			/* Should I really unlock ? */
 			SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 			error = EINVAL;
 			goto out_unlocked;
 
 		}
 		if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) &&
 		    (addr->sa_family == AF_INET6)) {
 			SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 			error = EINVAL;
 			goto out_unlocked;
 		}
 		SCTP_INP_WLOCK(inp);
 		SCTP_INP_INCR_REF(inp);
 		SCTP_INP_WUNLOCK(inp);
 		/* With the lock applied look again */
 		stcb = sctp_findassociation_ep_addr(&t_inp, addr, &net, NULL, NULL);
 #if defined(INET) || defined(INET6)
 		if ((stcb == NULL) && (control != NULL) && (port > 0)) {
 			stcb = sctp_findassociation_cmsgs(&t_inp, port, control, &net, &error);
 		}
 #endif
 		if (stcb == NULL) {
 			SCTP_INP_WLOCK(inp);
 			SCTP_INP_DECR_REF(inp);
 			SCTP_INP_WUNLOCK(inp);
 		} else {
 			hold_tcblock = 1;
 		}
 		if (error) {
 			goto out_unlocked;
 		}
 		if (t_inp != inp) {
 			SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOTCONN);
 			error = ENOTCONN;
 			goto out_unlocked;
 		}
 	}
 	if (stcb == NULL) {
 		if (addr == NULL) {
 			SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOENT);
 			error = ENOENT;
 			goto out_unlocked;
 		} else {
 			/* We must go ahead and start the INIT process */
 			uint32_t vrf_id;
 
 			if ((sinfo_flags & SCTP_ABORT) ||
 			    ((sinfo_flags & SCTP_EOF) && (sndlen == 0))) {
 				/*-
 				 * User asks to abort a non-existant assoc,
 				 * or EOF a non-existant assoc with no data
 				 */
 				SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOENT);
 				error = ENOENT;
 				goto out_unlocked;
 			}
 			/* get an asoc/stcb struct */
 			vrf_id = inp->def_vrf_id;
 #ifdef INVARIANTS
 			if (create_lock_applied == 0) {
 				panic("Error, should hold create lock and I don't?");
 			}
 #endif
 			stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id,
 			    inp->sctp_ep.pre_open_stream_count,
 			    inp->sctp_ep.port,
 			    p,
 			    SCTP_INITIALIZE_AUTH_PARAMS);
 			if (stcb == NULL) {
 				/* Error is setup for us in the call */
 				goto out_unlocked;
 			}
 			if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) {
 				stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED;
 				/*
 				 * Set the connected flag so we can queue
 				 * data
 				 */
 				soisconnecting(so);
 			}
 			hold_tcblock = 1;
 			if (create_lock_applied) {
 				SCTP_ASOC_CREATE_UNLOCK(inp);
 				create_lock_applied = 0;
 			} else {
 				SCTP_PRINTF("Huh-3? create lock should have been on??\n");
 			}
 			/*
 			 * Turn on queue only flag to prevent data from
 			 * being sent
 			 */
 			queue_only = 1;
 			asoc = &stcb->asoc;
 			SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT);
 			(void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered);
 
 			if (control) {
 				if (sctp_process_cmsgs_for_init(stcb, control, &error)) {
 					sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE,
 					    SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_6);
 					hold_tcblock = 0;
 					stcb = NULL;
 					goto out_unlocked;
 				}
 			}
 			/* out with the INIT */
 			queue_only_for_init = 1;
 			/*-
 			 * we may want to dig in after this call and adjust the MTU
 			 * value. It defaulted to 1500 (constant) but the ro
 			 * structure may now have an update and thus we may need to
 			 * change it BEFORE we append the message.
 			 */
 		}
 	} else
 		asoc = &stcb->asoc;
 	if (srcv == NULL) {
 		srcv = (struct sctp_sndrcvinfo *)&asoc->def_send;
 		sinfo_flags = srcv->sinfo_flags;
 		if (flags & MSG_EOR) {
 			sinfo_flags |= SCTP_EOR;
 		}
 		if (flags & MSG_EOF) {
 			sinfo_flags |= SCTP_EOF;
 		}
 	}
 	if (sinfo_flags & SCTP_ADDR_OVER) {
 		if (addr)
 			net = sctp_findnet(stcb, addr);
 		else
 			net = NULL;
 		if ((net == NULL) ||
 		    ((port != 0) && (port != stcb->rport))) {
 			SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 			error = EINVAL;
 			goto out_unlocked;
 		}
 	} else {
 		if (stcb->asoc.alternate) {
 			net = stcb->asoc.alternate;
 		} else {
 			net = stcb->asoc.primary_destination;
 		}
 	}
 	atomic_add_int(&stcb->total_sends, 1);
 	/* Keep the stcb from being freed under our feet */
 	atomic_add_int(&asoc->refcnt, 1);
 	free_cnt_applied = 1;
 
 	if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NO_FRAGMENT)) {
 		if (sndlen > (ssize_t)asoc->smallest_mtu) {
 			SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE);
 			error = EMSGSIZE;
 			goto out_unlocked;
 		}
 	}
 	if (SCTP_SO_IS_NBIO(so)
 	    || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0
 	    ) {
 		non_blocking = 1;
 	}
 	/* would we block? */
 	if (non_blocking) {
 		ssize_t amount;
 
 		if (hold_tcblock == 0) {
 			SCTP_TCB_LOCK(stcb);
 			hold_tcblock = 1;
 		}
 		inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb));
 		if (user_marks_eor == 0) {
 			amount = sndlen;
 		} else {
 			amount = 1;
 		}
 		if ((SCTP_SB_LIMIT_SND(so) < (amount + inqueue_bytes + stcb->asoc.sb_send_resv)) ||
 		    (stcb->asoc.chunks_on_out_queue >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) {
 			SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EWOULDBLOCK);
 			if (sndlen > (ssize_t)SCTP_SB_LIMIT_SND(so))
 				error = EMSGSIZE;
 			else
 				error = EWOULDBLOCK;
 			goto out_unlocked;
 		}
 		stcb->asoc.sb_send_resv += (uint32_t)sndlen;
 		SCTP_TCB_UNLOCK(stcb);
 		hold_tcblock = 0;
 	} else {
 		atomic_add_int(&stcb->asoc.sb_send_resv, sndlen);
 	}
 	local_soresv = sndlen;
 	if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET);
 		error = ECONNRESET;
 		goto out_unlocked;
 	}
 	if (create_lock_applied) {
 		SCTP_ASOC_CREATE_UNLOCK(inp);
 		create_lock_applied = 0;
 	}
 	/* Is the stream no. valid? */
 	if (srcv->sinfo_stream >= asoc->streamoutcnt) {
 		/* Invalid stream number */
 		SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 		error = EINVAL;
 		goto out_unlocked;
 	}
 	if ((asoc->strmout[srcv->sinfo_stream].state != SCTP_STREAM_OPEN) &&
 	    (asoc->strmout[srcv->sinfo_stream].state != SCTP_STREAM_OPENING)) {
 		/*
 		 * Can't queue any data while stream reset is underway.
 		 */
 		if (asoc->strmout[srcv->sinfo_stream].state > SCTP_STREAM_OPEN) {
 			error = EAGAIN;
 		} else {
 			error = EINVAL;
 		}
 		SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, error);
 		goto out_unlocked;
 	}
 	if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) {
 		queue_only = 1;
 	}
 	/* we are now done with all control */
 	if (control) {
 		sctp_m_freem(control);
 		control = NULL;
 	}
 	if ((SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_SENT) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) ||
 	    (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) {
 		if (sinfo_flags & SCTP_ABORT) {
 			;
 		} else {
 			SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET);
 			error = ECONNRESET;
 			goto out_unlocked;
 		}
 	}
 	/* Ok, we will attempt a msgsnd :> */
 	if (p) {
 		p->td_ru.ru_msgsnd++;
 	}
 	/* Are we aborting? */
 	if (sinfo_flags & SCTP_ABORT) {
 		struct mbuf *mm;
 		ssize_t tot_demand, tot_out = 0, max_out;
 
 		SCTP_STAT_INCR(sctps_sends_with_abort);
 		if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) ||
 		    (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) {
 			/* It has to be up before we abort */
 			/* how big is the user initiated abort? */
 			SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 			error = EINVAL;
 			goto out;
 		}
 		if (hold_tcblock) {
 			SCTP_TCB_UNLOCK(stcb);
 			hold_tcblock = 0;
 		}
 		if (top) {
 			struct mbuf *cntm = NULL;
 
 			mm = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_WAITOK, 1, MT_DATA);
 			if (sndlen != 0) {
 				for (cntm = top; cntm; cntm = SCTP_BUF_NEXT(cntm)) {
 					tot_out += SCTP_BUF_LEN(cntm);
 				}
 			}
 		} else {
 			/* Must fit in a MTU */
 			tot_out = sndlen;
 			tot_demand = (tot_out + sizeof(struct sctp_paramhdr));
 			if (tot_demand > SCTP_DEFAULT_ADD_MORE) {
 				/* To big */
 				SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE);
 				error = EMSGSIZE;
 				goto out;
 			}
 			mm = sctp_get_mbuf_for_msg((unsigned int)tot_demand, 0, M_WAITOK, 1, MT_DATA);
 		}
 		if (mm == NULL) {
 			SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
 			error = ENOMEM;
 			goto out;
 		}
 		max_out = asoc->smallest_mtu - sizeof(struct sctp_paramhdr);
 		max_out -= sizeof(struct sctp_abort_msg);
 		if (tot_out > max_out) {
 			tot_out = max_out;
 		}
 		if (mm) {
 			struct sctp_paramhdr *ph;
 
 			/* now move forward the data pointer */
 			ph = mtod(mm, struct sctp_paramhdr *);
 			ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT);
 			ph->param_length = htons((uint16_t)(sizeof(struct sctp_paramhdr) + tot_out));
 			ph++;
 			SCTP_BUF_LEN(mm) = (int)(tot_out + sizeof(struct sctp_paramhdr));
 			if (top == NULL) {
 				error = uiomove((caddr_t)ph, (int)tot_out, uio);
 				if (error) {
 					/*-
 					 * Here if we can't get his data we
 					 * still abort we just don't get to
 					 * send the users note :-0
 					 */
 					sctp_m_freem(mm);
 					mm = NULL;
 				}
 			} else {
 				if (sndlen != 0) {
 					SCTP_BUF_NEXT(mm) = top;
 				}
 			}
 		}
 		if (hold_tcblock == 0) {
 			SCTP_TCB_LOCK(stcb);
 		}
 		atomic_add_int(&stcb->asoc.refcnt, -1);
 		free_cnt_applied = 0;
 		/* release this lock, otherwise we hang on ourselves */
 		NET_EPOCH_ENTER(et);
 		sctp_abort_an_association(stcb->sctp_ep, stcb, mm, SCTP_SO_LOCKED);
 		NET_EPOCH_EXIT(et);
 		/* now relock the stcb so everything is sane */
 		hold_tcblock = 0;
 		stcb = NULL;
 		/*
 		 * In this case top is already chained to mm avoid double
 		 * free, since we free it below if top != NULL and driver
 		 * would free it after sending the packet out
 		 */
 		if (sndlen != 0) {
 			top = NULL;
 		}
 		goto out_unlocked;
 	}
 	/* Calculate the maximum we can send */
 	inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb));
 	if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes) {
 		max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes;
 	} else {
 		max_len = 0;
 	}
 	if (hold_tcblock) {
 		SCTP_TCB_UNLOCK(stcb);
 		hold_tcblock = 0;
 	}
 	if (asoc->strmout == NULL) {
 		/* huh? software error */
 		SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT);
 		error = EFAULT;
 		goto out_unlocked;
 	}
 
 	/* Unless E_EOR mode is on, we must make a send FIT in one call. */
 	if ((user_marks_eor == 0) &&
 	    (sndlen > (ssize_t)SCTP_SB_LIMIT_SND(stcb->sctp_socket))) {
 		/* It will NEVER fit */
 		SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE);
 		error = EMSGSIZE;
 		goto out_unlocked;
 	}
 	if ((uio == NULL) && user_marks_eor) {
 		/*-
 		 * We do not support eeor mode for
 		 * sending with mbuf chains (like sendfile).
 		 */
 		SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 		error = EINVAL;
 		goto out_unlocked;
 	}
 
 	if (user_marks_eor) {
 		local_add_more = (ssize_t)min(SCTP_SB_LIMIT_SND(so), SCTP_BASE_SYSCTL(sctp_add_more_threshold));
 	} else {
 		/*-
 		 * For non-eeor the whole message must fit in
 		 * the socket send buffer.
 		 */
 		local_add_more = sndlen;
 	}
 	len = 0;
 	if (non_blocking) {
 		goto skip_preblock;
 	}
 	if (((max_len <= local_add_more) &&
 	    ((ssize_t)SCTP_SB_LIMIT_SND(so) >= local_add_more)) ||
 	    (max_len == 0) ||
 	    ((stcb->asoc.chunks_on_out_queue + stcb->asoc.stream_queue_cnt) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) {
 		/* No room right now ! */
 		SOCKBUF_LOCK(&so->so_snd);
 		inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb));
 		while ((SCTP_SB_LIMIT_SND(so) < (inqueue_bytes + local_add_more)) ||
 		    ((stcb->asoc.stream_queue_cnt + stcb->asoc.chunks_on_out_queue) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) {
 			SCTPDBG(SCTP_DEBUG_OUTPUT1, "pre_block limit:%u <(inq:%d + %zd) || (%d+%d > %d)\n",
 			    (unsigned int)SCTP_SB_LIMIT_SND(so),
 			    inqueue_bytes,
 			    local_add_more,
 			    stcb->asoc.stream_queue_cnt,
 			    stcb->asoc.chunks_on_out_queue,
 			    SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue));
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) {
 				sctp_log_block(SCTP_BLOCK_LOG_INTO_BLKA, asoc, sndlen);
 			}
 			be.error = 0;
 			stcb->block_entry = &be;
 			error = sbwait(&so->so_snd);
 			stcb->block_entry = NULL;
 			if (error || so->so_error || be.error) {
 				if (error == 0) {
 					if (so->so_error)
 						error = so->so_error;
 					if (be.error) {
 						error = be.error;
 					}
 				}
 				SOCKBUF_UNLOCK(&so->so_snd);
 				goto out_unlocked;
 			}
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) {
 				sctp_log_block(SCTP_BLOCK_LOG_OUTOF_BLK,
 				    asoc, stcb->asoc.total_output_queue_size);
 			}
 			if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				goto out_unlocked;
 			}
 			inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb));
 		}
 		if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes) {
 			max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes;
 		} else {
 			max_len = 0;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
 	}
 
 skip_preblock:
 	if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 		goto out_unlocked;
 	}
 	/*
 	 * sndlen covers for mbuf case uio_resid covers for the non-mbuf
 	 * case NOTE: uio will be null when top/mbuf is passed
 	 */
 	if (sndlen == 0) {
 		if (sinfo_flags & SCTP_EOF) {
 			got_all_of_the_send = 1;
 			goto dataless_eof;
 		} else {
 			SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 			error = EINVAL;
 			goto out;
 		}
 	}
 	if (top == NULL) {
 		struct sctp_stream_queue_pending *sp;
 		struct sctp_stream_out *strm;
 		uint32_t sndout;
 
 		SCTP_TCB_SEND_LOCK(stcb);
 		if ((asoc->stream_locked) &&
 		    (asoc->stream_locked_on != srcv->sinfo_stream)) {
 			SCTP_TCB_SEND_UNLOCK(stcb);
 			SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
 			error = EINVAL;
 			goto out;
 		}
 		SCTP_TCB_SEND_UNLOCK(stcb);
 
 		strm = &stcb->asoc.strmout[srcv->sinfo_stream];
 		if (strm->last_msg_incomplete == 0) {
 	do_a_copy_in:
 			sp = sctp_copy_it_in(stcb, asoc, srcv, uio, net, max_len, user_marks_eor, &error);
 			if (error) {
 				goto out;
 			}
 			SCTP_TCB_SEND_LOCK(stcb);
 			if (sp->msg_is_complete) {
 				strm->last_msg_incomplete = 0;
 				asoc->stream_locked = 0;
 			} else {
 				/*
 				 * Just got locked to this guy in case of an
 				 * interrupt.
 				 */
 				strm->last_msg_incomplete = 1;
 				if (stcb->asoc.idata_supported == 0) {
 					asoc->stream_locked = 1;
 					asoc->stream_locked_on = srcv->sinfo_stream;
 				}
 				sp->sender_all_done = 0;
 			}
 			sctp_snd_sb_alloc(stcb, sp->length);
 			atomic_add_int(&asoc->stream_queue_cnt, 1);
 			if (sinfo_flags & SCTP_UNORDERED) {
 				SCTP_STAT_INCR(sctps_sends_with_unord);
 			}
+			sp->processing = 1;
 			TAILQ_INSERT_TAIL(&strm->outqueue, sp, next);
 			stcb->asoc.ss_functions.sctp_ss_add_to_stream(stcb, asoc, strm, sp, 1);
 			SCTP_TCB_SEND_UNLOCK(stcb);
 		} else {
 			SCTP_TCB_SEND_LOCK(stcb);
 			sp = TAILQ_LAST(&strm->outqueue, sctp_streamhead);
+			if (sp->processing) {
+				SCTP_TCB_SEND_UNLOCK(stcb);
+				SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+				error = EINVAL;
+				goto out;
+			} else {
+				sp->processing = 1;
+			}
 			SCTP_TCB_SEND_UNLOCK(stcb);
 			if (sp == NULL) {
 				/* ???? Huh ??? last msg is gone */
 #ifdef INVARIANTS
 				panic("Warning: Last msg marked incomplete, yet nothing left?");
 #else
 				SCTP_PRINTF("Warning: Last msg marked incomplete, yet nothing left?\n");
 				strm->last_msg_incomplete = 0;
 #endif
 				goto do_a_copy_in;
 
 			}
 		}
 		while (uio->uio_resid > 0) {
 			/* How much room do we have? */
 			struct mbuf *new_tail, *mm;
 
 			inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb));
 			if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes)
 				max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes;
 			else
 				max_len = 0;
 
 			if ((max_len > (ssize_t)SCTP_BASE_SYSCTL(sctp_add_more_threshold)) ||
 			    (max_len && (SCTP_SB_LIMIT_SND(so) < SCTP_BASE_SYSCTL(sctp_add_more_threshold))) ||
 			    (uio->uio_resid && (uio->uio_resid <= max_len))) {
 				sndout = 0;
 				new_tail = NULL;
 				if (hold_tcblock) {
 					SCTP_TCB_UNLOCK(stcb);
 					hold_tcblock = 0;
 				}
 				mm = sctp_copy_resume(uio, (int)max_len, user_marks_eor, &error, &sndout, &new_tail);
 				if ((mm == NULL) || error) {
 					if (mm) {
 						sctp_m_freem(mm);
 					}
 					goto out;
 				}
 				/* Update the mbuf and count */
 				SCTP_TCB_SEND_LOCK(stcb);
-				if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+				if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) ||
+				    (stcb->asoc.state & SCTP_STATE_WAS_ABORTED)) {
 					/*
 					 * we need to get out. Peer probably
 					 * aborted.
 					 */
 					sctp_m_freem(mm);
-					if (stcb->asoc.state & SCTP_PCB_FLAGS_WAS_ABORTED) {
+					if (stcb->asoc.state & SCTP_STATE_WAS_ABORTED) {
 						SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET);
 						error = ECONNRESET;
 					}
 					SCTP_TCB_SEND_UNLOCK(stcb);
 					goto out;
 				}
 				if (sp->tail_mbuf) {
 					/* tack it to the end */
 					SCTP_BUF_NEXT(sp->tail_mbuf) = mm;
 					sp->tail_mbuf = new_tail;
 				} else {
 					/* A stolen mbuf */
 					sp->data = mm;
 					sp->tail_mbuf = new_tail;
 				}
 				sctp_snd_sb_alloc(stcb, sndout);
 				atomic_add_int(&sp->length, sndout);
 				len += sndout;
 				if (sinfo_flags & SCTP_SACK_IMMEDIATELY) {
 					sp->sinfo_flags |= SCTP_SACK_IMMEDIATELY;
 				}
 
 				/* Did we reach EOR? */
 				if ((uio->uio_resid == 0) &&
 				    ((user_marks_eor == 0) ||
 				    (sinfo_flags & SCTP_EOF) ||
 				    (user_marks_eor && (sinfo_flags & SCTP_EOR)))) {
 					sp->msg_is_complete = 1;
 				} else {
 					sp->msg_is_complete = 0;
 				}
 				SCTP_TCB_SEND_UNLOCK(stcb);
 			}
 			if (uio->uio_resid == 0) {
 				/* got it all? */
 				continue;
 			}
 			/* PR-SCTP? */
 			if ((asoc->prsctp_supported) && (asoc->sent_queue_cnt_removeable > 0)) {
 				/*
 				 * This is ugly but we must assure locking
 				 * order
 				 */
 				if (hold_tcblock == 0) {
 					SCTP_TCB_LOCK(stcb);
 					hold_tcblock = 1;
 				}
 				sctp_prune_prsctp(stcb, asoc, srcv, (int)sndlen);
 				inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb));
 				if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes)
 					max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes;
 				else
 					max_len = 0;
 				if (max_len > 0) {
 					continue;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 				hold_tcblock = 0;
 			}
 			/* wait for space now */
 			if (non_blocking) {
 				/* Non-blocking io in place out */
 				goto skip_out_eof;
 			}
 			/* What about the INIT, send it maybe */
 			if (queue_only_for_init) {
 				if (hold_tcblock == 0) {
 					SCTP_TCB_LOCK(stcb);
 					hold_tcblock = 1;
 				}
 				if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) {
 					/* a collision took us forward? */
 					queue_only = 0;
 				} else {
 					NET_EPOCH_ENTER(et);
 					sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED);
 					NET_EPOCH_EXIT(et);
 					SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT);
 					queue_only = 1;
 				}
 			}
 			if ((net->flight_size > net->cwnd) &&
 			    (asoc->sctp_cmt_on_off == 0)) {
 				SCTP_STAT_INCR(sctps_send_cwnd_avoid);
 				queue_only = 1;
 			} else if (asoc->ifp_had_enobuf) {
 				SCTP_STAT_INCR(sctps_ifnomemqueued);
 				if (net->flight_size > (2 * net->mtu)) {
 					queue_only = 1;
 				}
 				asoc->ifp_had_enobuf = 0;
 			}
 			un_sent = stcb->asoc.total_output_queue_size - stcb->asoc.total_flight;
 			if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) &&
 			    (stcb->asoc.total_flight > 0) &&
 			    (stcb->asoc.stream_queue_cnt < SCTP_MAX_DATA_BUNDLING) &&
 			    (un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD))) {
 
 				/*-
 				 * Ok, Nagle is set on and we have data outstanding.
 				 * Don't send anything and let SACKs drive out the
 				 * data unless we have a "full" segment to send.
 				 */
 				if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) {
 					sctp_log_nagle_event(stcb, SCTP_NAGLE_APPLIED);
 				}
 				SCTP_STAT_INCR(sctps_naglequeued);
 				nagle_applies = 1;
 			} else {
 				if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) {
 					if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY))
 						sctp_log_nagle_event(stcb, SCTP_NAGLE_SKIPPED);
 				}
 				SCTP_STAT_INCR(sctps_naglesent);
 				nagle_applies = 0;
 			}
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) {
 
 				sctp_misc_ints(SCTP_CWNDLOG_PRESEND, queue_only_for_init, queue_only,
 				    nagle_applies, un_sent);
 				sctp_misc_ints(SCTP_CWNDLOG_PRESEND, stcb->asoc.total_output_queue_size,
 				    stcb->asoc.total_flight,
 				    stcb->asoc.chunks_on_out_queue, stcb->asoc.total_flight_count);
 			}
 			if (queue_only_for_init)
 				queue_only_for_init = 0;
 			if ((queue_only == 0) && (nagle_applies == 0)) {
 				/*-
 				 * need to start chunk output
 				 * before blocking.. note that if
 				 * a lock is already applied, then
 				 * the input via the net is happening
 				 * and I don't need to start output :-D
 				 */
 				NET_EPOCH_ENTER(et);
 				if (hold_tcblock == 0) {
 					if (SCTP_TCB_TRYLOCK(stcb)) {
 						hold_tcblock = 1;
 						sctp_chunk_output(inp,
 						    stcb,
 						    SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED);
 					}
 				} else {
 					sctp_chunk_output(inp,
 					    stcb,
 					    SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED);
 				}
 				NET_EPOCH_EXIT(et);
 			}
 			if (hold_tcblock == 1) {
 				SCTP_TCB_UNLOCK(stcb);
 				hold_tcblock = 0;
 			}
 			SOCKBUF_LOCK(&so->so_snd);
 			/*-
 			 * This is a bit strange, but I think it will
 			 * work. The total_output_queue_size is locked and
 			 * protected by the TCB_LOCK, which we just released.
 			 * There is a race that can occur between releasing it
 			 * above, and me getting the socket lock, where sacks
 			 * come in but we have not put the SB_WAIT on the
 			 * so_snd buffer to get the wakeup. After the LOCK
 			 * is applied the sack_processing will also need to
 			 * LOCK the so->so_snd to do the actual sowwakeup(). So
 			 * once we have the socket buffer lock if we recheck the
 			 * size we KNOW we will get to sleep safely with the
 			 * wakeup flag in place.
 			 */
 			inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb));
 			if (SCTP_SB_LIMIT_SND(so) <= (inqueue_bytes +
 			    min(SCTP_BASE_SYSCTL(sctp_add_more_threshold), SCTP_SB_LIMIT_SND(so)))) {
 				if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) {
 					sctp_log_block(SCTP_BLOCK_LOG_INTO_BLK,
 					    asoc, uio->uio_resid);
 				}
 				be.error = 0;
 				stcb->block_entry = &be;
 				error = sbwait(&so->so_snd);
 				stcb->block_entry = NULL;
 
 				if (error || so->so_error || be.error) {
 					if (error == 0) {
 						if (so->so_error)
 							error = so->so_error;
 						if (be.error) {
 							error = be.error;
 						}
 					}
 					SOCKBUF_UNLOCK(&so->so_snd);
 					goto out_unlocked;
 				}
 
 				if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) {
 					sctp_log_block(SCTP_BLOCK_LOG_OUTOF_BLK,
 					    asoc, stcb->asoc.total_output_queue_size);
 				}
 			}
 			SOCKBUF_UNLOCK(&so->so_snd);
 			if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 				goto out_unlocked;
 			}
 		}
 		SCTP_TCB_SEND_LOCK(stcb);
-		if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+		if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) ||
+		    (stcb->asoc.state & SCTP_STATE_WAS_ABORTED)) {
 			SCTP_TCB_SEND_UNLOCK(stcb);
 			goto out_unlocked;
 		}
 		if (sp) {
 			if (sp->msg_is_complete == 0) {
 				strm->last_msg_incomplete = 1;
 				if (stcb->asoc.idata_supported == 0) {
 					asoc->stream_locked = 1;
 					asoc->stream_locked_on = srcv->sinfo_stream;
 				}
 			} else {
 				sp->sender_all_done = 1;
 				strm->last_msg_incomplete = 0;
 				asoc->stream_locked = 0;
 			}
+			sp->processing = 0;
 		} else {
 			SCTP_PRINTF("Huh no sp TSNH?\n");
 			strm->last_msg_incomplete = 0;
 			asoc->stream_locked = 0;
 		}
 		SCTP_TCB_SEND_UNLOCK(stcb);
 		if (uio->uio_resid == 0) {
 			got_all_of_the_send = 1;
 		}
 	} else {
 		/* We send in a 0, since we do NOT have any locks */
 		error = sctp_msg_append(stcb, net, top, srcv, 0);
 		top = NULL;
 		if (sinfo_flags & SCTP_EOF) {
 			got_all_of_the_send = 1;
 		}
 	}
 	if (error) {
 		goto out;
 	}
 dataless_eof:
 	/* EOF thing ? */
 	if ((sinfo_flags & SCTP_EOF) &&
 	    (got_all_of_the_send == 1)) {
 		SCTP_STAT_INCR(sctps_sends_with_eof);
 		error = 0;
 		if (hold_tcblock == 0) {
 			SCTP_TCB_LOCK(stcb);
 			hold_tcblock = 1;
 		}
 		if (TAILQ_EMPTY(&asoc->send_queue) &&
 		    TAILQ_EMPTY(&asoc->sent_queue) &&
 		    sctp_is_there_unsent_data(stcb, SCTP_SO_LOCKED) == 0) {
 			if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
 				goto abort_anyway;
 			}
 			/* there is nothing queued to send, so I'm done... */
 			if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) &&
 			    (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) &&
 			    (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) {
 				struct sctp_nets *netp;
 
 				/* only send SHUTDOWN the first time through */
 				if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) {
 					SCTP_STAT_DECR_GAUGE32(sctps_currestab);
 				}
 				SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_SENT);
 				sctp_stop_timers_for_shutdown(stcb);
 				if (stcb->asoc.alternate) {
 					netp = stcb->asoc.alternate;
 				} else {
 					netp = stcb->asoc.primary_destination;
 				}
 				sctp_send_shutdown(stcb, netp);
 				sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb,
 				    netp);
 				sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb,
 				    NULL);
 			}
 		} else {
 			/*-
 			 * we still got (or just got) data to send, so set
 			 * SHUTDOWN_PENDING
 			 */
 			/*-
 			 * XXX sockets draft says that SCTP_EOF should be
 			 * sent with no data.  currently, we will allow user
 			 * data to be sent first and move to
 			 * SHUTDOWN-PENDING
 			 */
 			if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) &&
 			    (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) &&
 			    (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) {
 				if (hold_tcblock == 0) {
 					SCTP_TCB_LOCK(stcb);
 					hold_tcblock = 1;
 				}
 				if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
 					SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_PARTIAL_MSG_LEFT);
 				}
 				SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_SHUTDOWN_PENDING);
 				if (TAILQ_EMPTY(&asoc->send_queue) &&
 				    TAILQ_EMPTY(&asoc->sent_queue) &&
 				    (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) {
 					struct mbuf *op_err;
 					char msg[SCTP_DIAG_INFO_LEN];
 
 			abort_anyway:
 					if (free_cnt_applied) {
 						atomic_add_int(&stcb->asoc.refcnt, -1);
 						free_cnt_applied = 0;
 					}
 					SCTP_SNPRINTF(msg, sizeof(msg),
 					    "%s:%d at %s", __FILE__, __LINE__, __func__);
 					op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 					    msg);
 					NET_EPOCH_ENTER(et);
 					sctp_abort_an_association(stcb->sctp_ep, stcb,
 					    op_err, SCTP_SO_LOCKED);
 					NET_EPOCH_EXIT(et);
 					/*
 					 * now relock the stcb so everything
 					 * is sane
 					 */
 					hold_tcblock = 0;
 					stcb = NULL;
 					goto out;
 				}
 				sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb,
 				    NULL);
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_NODELAY);
 			}
 		}
 	}
 skip_out_eof:
 	if (!TAILQ_EMPTY(&stcb->asoc.control_send_queue)) {
 		some_on_control = 1;
 	}
 	if (queue_only_for_init) {
 		if (hold_tcblock == 0) {
 			SCTP_TCB_LOCK(stcb);
 			hold_tcblock = 1;
 		}
 		if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) {
 			/* a collision took us forward? */
 			queue_only = 0;
 		} else {
 			NET_EPOCH_ENTER(et);
 			sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED);
 			NET_EPOCH_EXIT(et);
 			SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT);
 			queue_only = 1;
 		}
 	}
 	if ((net->flight_size > net->cwnd) &&
 	    (stcb->asoc.sctp_cmt_on_off == 0)) {
 		SCTP_STAT_INCR(sctps_send_cwnd_avoid);
 		queue_only = 1;
 	} else if (asoc->ifp_had_enobuf) {
 		SCTP_STAT_INCR(sctps_ifnomemqueued);
 		if (net->flight_size > (2 * net->mtu)) {
 			queue_only = 1;
 		}
 		asoc->ifp_had_enobuf = 0;
 	}
 	un_sent = stcb->asoc.total_output_queue_size - stcb->asoc.total_flight;
 	if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) &&
 	    (stcb->asoc.total_flight > 0) &&
 	    (stcb->asoc.stream_queue_cnt < SCTP_MAX_DATA_BUNDLING) &&
 	    (un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD))) {
 		/*-
 		 * Ok, Nagle is set on and we have data outstanding.
 		 * Don't send anything and let SACKs drive out the
 		 * data unless wen have a "full" segment to send.
 		 */
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) {
 			sctp_log_nagle_event(stcb, SCTP_NAGLE_APPLIED);
 		}
 		SCTP_STAT_INCR(sctps_naglequeued);
 		nagle_applies = 1;
 	} else {
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) {
 			if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY))
 				sctp_log_nagle_event(stcb, SCTP_NAGLE_SKIPPED);
 		}
 		SCTP_STAT_INCR(sctps_naglesent);
 		nagle_applies = 0;
 	}
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) {
 		sctp_misc_ints(SCTP_CWNDLOG_PRESEND, queue_only_for_init, queue_only,
 		    nagle_applies, un_sent);
 		sctp_misc_ints(SCTP_CWNDLOG_PRESEND, stcb->asoc.total_output_queue_size,
 		    stcb->asoc.total_flight,
 		    stcb->asoc.chunks_on_out_queue, stcb->asoc.total_flight_count);
 	}
 	NET_EPOCH_ENTER(et);
 	if ((queue_only == 0) && (nagle_applies == 0) && (stcb->asoc.peers_rwnd && un_sent)) {
 		/* we can attempt to send too. */
 		if (hold_tcblock == 0) {
 			/*
 			 * If there is activity recv'ing sacks no need to
 			 * send
 			 */
 			if (SCTP_TCB_TRYLOCK(stcb)) {
 				sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED);
 				hold_tcblock = 1;
 			}
 		} else {
 			sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED);
 		}
 	} else if ((queue_only == 0) &&
 		    (stcb->asoc.peers_rwnd == 0) &&
 	    (stcb->asoc.total_flight == 0)) {
 		/* We get to have a probe outstanding */
 		if (hold_tcblock == 0) {
 			hold_tcblock = 1;
 			SCTP_TCB_LOCK(stcb);
 		}
 		sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED);
 	} else if (some_on_control) {
 		int num_out, reason, frag_point;
 
 		/* Here we do control only */
 		if (hold_tcblock == 0) {
 			hold_tcblock = 1;
 			SCTP_TCB_LOCK(stcb);
 		}
 		frag_point = sctp_get_frag_point(stcb, &stcb->asoc);
 		(void)sctp_med_chunk_output(inp, stcb, &stcb->asoc, &num_out,
 		    &reason, 1, 1, &now, &now_filled, frag_point, SCTP_SO_LOCKED);
 	}
 	NET_EPOCH_EXIT(et);
 	SCTPDBG(SCTP_DEBUG_OUTPUT1, "USR Send complete qo:%d prw:%d unsent:%d tf:%d cooq:%d toqs:%d err:%d\n",
 	    queue_only, stcb->asoc.peers_rwnd, un_sent,
 	    stcb->asoc.total_flight, stcb->asoc.chunks_on_out_queue,
 	    stcb->asoc.total_output_queue_size, error);
 
 out:
 out_unlocked:
 
 	if (local_soresv && stcb) {
 		atomic_subtract_int(&stcb->asoc.sb_send_resv, sndlen);
 	}
 	if (create_lock_applied) {
 		SCTP_ASOC_CREATE_UNLOCK(inp);
 	}
 	if ((stcb) && hold_tcblock) {
 		SCTP_TCB_UNLOCK(stcb);
 	}
 	if (stcb && free_cnt_applied) {
 		atomic_add_int(&stcb->asoc.refcnt, -1);
 	}
 #ifdef INVARIANTS
 	if (stcb) {
 		if (mtx_owned(&stcb->tcb_mtx)) {
 			panic("Leaving with tcb mtx owned?");
 		}
 		if (mtx_owned(&stcb->tcb_send_mtx)) {
 			panic("Leaving with tcb send mtx owned?");
 		}
 	}
 #endif
 	if (top) {
 		sctp_m_freem(top);
 	}
 	if (control) {
 		sctp_m_freem(control);
 	}
 	return (error);
 }
 
 
 /*
  * Build an AUTH chunk for the outgoing chain "m" when the chunk type
  * "chunk" has to be authenticated towards the peer.
  *
  * The chain is returned unchanged when any of m_end, auth_ret, offset or
  * stcb is NULL, when the association has auth_supported == 0, when
  * sctp_auth_is_required_chunk() says the chunk type needs no AUTH, or when
  * no mbuf can be obtained for the AUTH chunk.
  *
  * Otherwise *offset receives the summed length of the mbufs currently in
  * "m", *auth_ret points at the freshly initialised AUTH chunk header, and
  * the value returned by sctp_copy_mbufchain() is handed back to the caller.
  */
 struct mbuf *
 sctp_add_auth_chunk(struct mbuf *m, struct mbuf **m_end,
     struct sctp_auth_chunk **auth_ret, uint32_t *offset,
     struct sctp_tcb *stcb, uint8_t chunk)
 {
 	struct sctp_auth_chunk *auth_chunk;
 	struct mbuf *auth_mbuf, *cur;
 	uint32_t chain_len;
 	int auth_len;
 
 	/* all output pointers and the association are mandatory */
 	if (m_end == NULL || auth_ret == NULL || offset == NULL ||
 	    stcb == NULL) {
 		return (m);
 	}
 	/* AUTH must be in use and must cover the requested chunk type */
 	if (stcb->asoc.auth_supported == 0 ||
 	    !sctp_auth_is_required_chunk(chunk, stcb->asoc.peer_auth_chunks)) {
 		return (m);
 	}
 	auth_mbuf = sctp_get_mbuf_for_msg(sizeof(*auth_chunk), 0, M_NOWAIT, 1,
 	    MT_HEADER);
 	if (auth_mbuf == NULL) {
 		/* out of mbufs: hand the chain back untouched */
 		return (m);
 	}
 	/* an AUTH mbuf that starts the chain needs headroom in front of it */
 	if (m == NULL) {
 		SCTP_BUF_RESV_UF(auth_mbuf, SCTP_MIN_OVERHEAD);
 	}
 
 	/*
 	 * Fill in the fixed part of the AUTH chunk; the key id and the HMAC
 	 * digest are computed and filled in when the packet is sent.
 	 */
 	auth_len = sizeof(*auth_chunk) +
 	    sctp_get_hmac_digest_len(stcb->asoc.peer_hmac_id);
 	auth_chunk = mtod(auth_mbuf, struct sctp_auth_chunk *);
 	memset(auth_chunk, 0, sizeof(*auth_chunk));
 	auth_chunk->ch.chunk_type = SCTP_AUTHENTICATION;
 	auth_chunk->ch.chunk_flags = 0;
 	auth_chunk->ch.chunk_length = htons(auth_len);
 	auth_chunk->hmac_id = htons(stcb->asoc.peer_hmac_id);
 
 	/* remember how many bytes already sit in front of the AUTH chunk */
 	chain_len = 0;
 	cur = m;
 	while (cur != NULL) {
 		chain_len += SCTP_BUF_LEN(cur);
 		cur = SCTP_BUF_NEXT(cur);
 	}
 	*offset = chain_len;
 
 	/* set the mbuf length, join the chains and publish the chunk pointer */
 	SCTP_BUF_LEN(auth_mbuf) = auth_len;
 	m = sctp_copy_mbufchain(auth_mbuf, m, m_end, 1, auth_len, 0);
 	/* auth_ret was verified to be non-NULL on entry */
 	*auth_ret = auth_chunk;
 
 	return (m);
 }
 
 #ifdef INET6
 /*
  * Check whether the gateway of the route's next hop is one of the routers
  * attached to the ND prefix that covers the IPv6 source address "src6".
  *
  * Returns 0 when ro or ro->ro_nh is NULL, when src6 is not AF_INET6, when
  * no non-detached prefix in the global prefix list matches src6, or when
  * none of that prefix's routers compares equal (sctp_cmpaddr()) to
  * ro->ro_nh->gw_sa.  Returns 1 on the first router that matches.
  * The prefix and router lists are walked under ND6_RLOCK().
  */
 int
 sctp_v6src_match_nexthop(struct sockaddr_in6 *src6, sctp_route_t *ro)
 {
 	struct nd_prefix *prefix;
 	struct nd_pfxrouter *rtr;
 	struct sockaddr_in6 gateway;
 	int installed = 0;
 
 	if (ro == NULL || ro->ro_nh == NULL || src6->sin6_family != AF_INET6) {
 		return (0);
 	}
 
 	ND6_RLOCK();
 	/* locate the first attached prefix that contains the source address */
 	LIST_FOREACH(prefix, &MODULE_GLOBAL(nd_prefix), ndpr_entry) {
 		if (!(prefix->ndpr_stateflags & NDPRF_DETACHED) &&
 		    IN6_ARE_MASKED_ADDR_EQUAL(&prefix->ndpr_prefix.sin6_addr,
 		    &src6->sin6_addr, &prefix->ndpr_mask)) {
 			break;
 		}
 	}
 	if (prefix == NULL) {
 		/* the source address is not covered by any listed prefix */
 		ND6_RUNLOCK();
 		SCTPDBG(SCTP_DEBUG_OUTPUT2, "No prefix entry for ");
 		SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)src6);
 		return (0);
 	}
 
 	SCTPDBG(SCTP_DEBUG_OUTPUT2, "v6src_match_nexthop(), Prefix entry is ");
 	SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)src6);
 
 	/* compare every router of the prefix with the next-hop gateway */
 	LIST_FOREACH(rtr, &prefix->ndpr_advrtrs, pfr_entry) {
 		/* wrap the router address in a sockaddr for the comparison */
 		memset(&gateway, 0, sizeof(gateway));
 		gateway.sin6_len = sizeof(gateway);
 		gateway.sin6_family = AF_INET6;
 		memcpy(&gateway.sin6_addr, &rtr->router->rtaddr,
 		    sizeof(gateway.sin6_addr));
 		SCTPDBG(SCTP_DEBUG_OUTPUT2, "prefix router is ");
 		SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)&gateway);
 		SCTPDBG(SCTP_DEBUG_OUTPUT2, "installed router is ");
 		SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &ro->ro_nh->gw_sa);
 		if (sctp_cmpaddr((struct sockaddr *)&gateway, &ro->ro_nh->gw_sa)) {
 			installed = 1;
 			break;
 		}
 	}
 	ND6_RUNLOCK();
 
 	if (installed) {
 		SCTPDBG(SCTP_DEBUG_OUTPUT2, "pfxrouter is installed\n");
 	} else {
 		SCTPDBG(SCTP_DEBUG_OUTPUT2, "pfxrouter is not installed\n");
 	}
 	return (installed);
 }
 #endif
 
 /*
  * Check whether the IPv4 source address "sifa" and the gateway of the
  * route's next hop are equal once both are masked with the netmask of the
  * interface address behind sifa.
  *
  * Returns 1 when the two masked addresses are equal.  Returns 0 when ro or
  * ro->ro_nh is NULL, when sifa is not an AF_INET address, when the masked
  * addresses differ, or when the kernel is built without INET.
  */
 int
 sctp_v4src_match_nexthop(struct sctp_ifa *sifa, sctp_route_t *ro)
 {
 #ifdef INET
 	struct sockaddr_in *src_sin, *gw_sin, *netmask;
 	struct ifaddr *ifaddr;
 	struct in_addr src_net, gw_net;
 
 	if (ro == NULL || ro->ro_nh == NULL) {
 		return (0);
 	}
 	if (sifa->address.sa.sa_family != AF_INET) {
 		return (0);
 	}
 
 	/* the netmask comes from the interface address that owns sifa */
 	ifaddr = (struct ifaddr *)sifa->ifa;
 	netmask = (struct sockaddr_in *)(ifaddr->ifa_netmask);
 
 	/* network part of the source address */
 	src_sin = &sifa->address.sin;
 	src_net.s_addr = (src_sin->sin_addr.s_addr & netmask->sin_addr.s_addr);
 	SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: src address is ");
 	SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &sifa->address.sa);
 	SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", src_net.s_addr);
 
 	/* network part of the next-hop gateway, under the same netmask */
 	gw_sin = &ro->ro_nh->gw4_sa;
 	gw_net.s_addr = (gw_sin->sin_addr.s_addr & netmask->sin_addr.s_addr);
 	SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: nexthop is ");
 	SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &ro->ro_nh->gw_sa);
 	SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", gw_net.s_addr);
 
 	return ((src_net.s_addr == gw_net.s_addr) ? 1 : 0);
 #else
 	return (0);
 #endif
 }
Index: projects/clang1100-import/sys/netinet/sctp_pcb.c
===================================================================
--- projects/clang1100-import/sys/netinet/sctp_pcb.c	(revision 364278)
+++ projects/clang1100-import/sys/netinet/sctp_pcb.c	(revision 364279)
@@ -1,7138 +1,7143 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
  * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * a) Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
  *
  * b) Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the distribution.
  *
  * c) Neither the name of Cisco Systems, Inc. nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <netinet/sctp_os.h>
 #include <sys/proc.h>
 #include <netinet/sctp_var.h>
 #include <netinet/sctp_sysctl.h>
 #include <netinet/sctp_pcb.h>
 #include <netinet/sctputil.h>
 #include <netinet/sctp.h>
 #include <netinet/sctp_header.h>
 #include <netinet/sctp_asconf.h>
 #include <netinet/sctp_output.h>
 #include <netinet/sctp_timer.h>
 #include <netinet/sctp_bsd_addr.h>
 #if defined(INET) || defined(INET6)
 #include <netinet/udp.h>
 #endif
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #endif
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/unistd.h>
 
 /*
  * "Scopeless" replacement for IN6_ARE_ADDR_EQUAL.
  * FIX: we don't handle multiple link local scopes.
  */
 #ifdef INET6
 /*
  * Compare two IPv6 socket addresses after embedding the scope into private
  * copies of each one; the caller's structures are only read.
  *
  * Returns 0 when embedding the scope fails for either address, otherwise
  * the result of IN6_ARE_ADDR_EQUAL on the two scope-embedded addresses.
  */
 int
 SCTP6_ARE_ADDR_EQUAL(struct sockaddr_in6 *a, struct sockaddr_in6 *b)
 {
 	struct sockaddr_in6 lhs, rhs;
 
 	memcpy(&lhs, a, sizeof(lhs));
 	memcpy(&rhs, b, sizeof(rhs));
 	/* Short-circuit: the second embed is skipped when the first fails. */
 	if (sa6_embedscope(&lhs, MODULE_GLOBAL(ip6_use_defzone)) != 0 ||
 	    sa6_embedscope(&rhs, MODULE_GLOBAL(ip6_use_defzone)) != 0) {
 		return (0);
 	}
 	return (IN6_ARE_ADDR_EQUAL(&lhs.sin6_addr, &rhs.sin6_addr));
 }
 #endif
 
 /*
  * Copy the global SCTP object counters into the caller supplied *spcb.
  */
 void
 sctp_fill_pcbinfo(struct sctp_pcbinfo *spcb)
 {
 	/*
 	 * Holding the INP info read lock is not strictly needed for this
 	 * snapshot; it is taken anyway because it is harmless.
 	 */
 	SCTP_INP_INFO_RLOCK();
 
 	/* endpoints, associations and their addresses */
 	spcb->ep_count = SCTP_BASE_INFO(ipi_count_ep);
 	spcb->asoc_count = SCTP_BASE_INFO(ipi_count_asoc);
 	spcb->laddr_count = SCTP_BASE_INFO(ipi_count_laddr);
 	spcb->raddr_count = SCTP_BASE_INFO(ipi_count_raddr);
 
 	/* chunks and queue entries */
 	spcb->chk_count = SCTP_BASE_INFO(ipi_count_chunk);
 	spcb->free_chunks = SCTP_BASE_INFO(ipi_free_chunks);
 	spcb->readq_count = SCTP_BASE_INFO(ipi_count_readq);
 	spcb->stream_oque = SCTP_BASE_INFO(ipi_count_strmoq);
 
 	SCTP_INP_INFO_RUNLOCK();
 }
 
 /*-
  * Addresses are added to VRF's (Virtual Router's). For BSD we
  * have only the default VRF 0. We maintain a hash list of
  * VRF's. Each VRF has its own list of sctp_ifn's. Each of
  * these has a list of addresses. When we add a new address
  * to a VRF we lookup the ifn/ifn_index, if the ifn does
  * not exist we create it and add it to the list of IFN's
  * within the VRF. Once we have the sctp_ifn, we add the
  * address to the list. So we look something like:
  *
  * hash-vrf-table
  *   vrf-> ifn-> ifn -> ifn
  *   vrf    |
  *    ...   +--ifa-> ifa -> ifa
  *   vrf
  *
  * We keep these separate lists since the SCTP subsystem will
  * point to these from its source address selection nets structure.
  * When an address is deleted it does not happen right away on
  * the SCTP side, it gets scheduled. What we do when a
  * delete happens is immediately remove the address from
  * the master list and decrement the refcount. As our
  * addip iterator works through and frees the src address
  * selection pointing to the sctp_ifa, eventually the refcount
  * will reach 0 and we will delete it. Note that it is assumed
  * that any locking on system level ifn/ifa is done at the
  * caller of these functions and these routines will only
  * lock the SCTP structures as they add or delete things.
  *
  * Other notes on VRF concepts.
  *  - An endpoint can be in multiple VRF's
  *  - An association lives within a VRF and only one VRF.
  *  - For any incoming packet we can deduce the VRF by
  *    looking at the inbound mbuf/pak (for BSD it's VRF=0 :D)
  *  - Any downward send call or connect call must supply the
  *    VRF via ancillary data or via some sort of set default
  *    VRF socket option call (again for BSD no brainer since
  *    the VRF is always 0).
  *  - An endpoint may add multiple VRF's to it.
  *  - Listening sockets can accept associations in any
  *    of the VRF's they are in but the assoc will end up
  *    in only one VRF (gotten from the packet or connect/send).
  *
  */
 
 struct sctp_vrf *
 sctp_allocate_vrf(int vrf_id)
 {
 	struct sctp_vrf *vrf = NULL;
 	struct sctp_vrflist *bucket;
 
 	/* First allocate the VRF structure */
 	vrf = sctp_find_vrf(vrf_id);
 	if (vrf) {
 		/* Already allocated */
 		return (vrf);
 	}
 	SCTP_MALLOC(vrf, struct sctp_vrf *, sizeof(struct sctp_vrf),
 	    SCTP_M_VRF);
 	if (vrf == NULL) {
 		/* No memory */
 #ifdef INVARIANTS
 		panic("No memory for VRF:%d", vrf_id);
 #endif
 		return (NULL);
 	}
 	/* setup the VRF */
 	memset(vrf, 0, sizeof(struct sctp_vrf));
 	vrf->vrf_id = vrf_id;
 	LIST_INIT(&vrf->ifnlist);
 	vrf->total_ifa_count = 0;
 	vrf->refcount = 0;
 	/* now also setup table ids */
 	SCTP_INIT_VRF_TABLEID(vrf);
 	/* Init the HASH of addresses */
 	vrf->vrf_addr_hash = SCTP_HASH_INIT(SCTP_VRF_ADDR_HASH_SIZE,
 	    &vrf->vrf_addr_hashmark);
 	if (vrf->vrf_addr_hash == NULL) {
 		/* No memory */
 #ifdef INVARIANTS
 		panic("No memory for VRF:%d", vrf_id);
 #endif
 		SCTP_FREE(vrf, SCTP_M_VRF);
 		return (NULL);
 	}
 
 	/* Add it to the hash table */
 	bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(vrf_id & SCTP_BASE_INFO(hashvrfmark))];
 	LIST_INSERT_HEAD(bucket, vrf, next_vrf);
 	atomic_add_int(&SCTP_BASE_INFO(ipi_count_vrfs), 1);
 	return (vrf);
 }
 
 
 /*
  * Look up an sctp_ifn in the global ifn hash.
  *
  * The bucket is selected by ifn_index. An entry matches when its index
  * equals ifn_index, or when ifn is non-NULL and equals the entry's ifn_p.
  * Returns the first matching entry or NULL. The address lock must be held
  * by the caller (asserted).
  */
 struct sctp_ifn *
 sctp_find_ifn(void *ifn, uint32_t ifn_index)
 {
 	struct sctp_ifnlist *bucket;
 	struct sctp_ifn *cur;
 
 	/* Walking the hash without the address lock would be unsafe. */
 	SCTP_IPI_ADDR_LOCK_ASSERT();
 	bucket = &SCTP_BASE_INFO(vrf_ifn_hash)[(ifn_index & SCTP_BASE_INFO(vrf_ifn_hashmark))];
 	LIST_FOREACH(cur, bucket, next_bucket) {
 		if ((cur->ifn_index == ifn_index) ||
 		    ((ifn != NULL) && (cur->ifn_p == ifn))) {
 			return (cur);
 		}
 	}
 	return (NULL);
 }
 
 
 /*
  * Look up a VRF by id in the global VRF hash table.
  * Returns the matching VRF, or NULL when none has that id.
  */
 struct sctp_vrf *
 sctp_find_vrf(uint32_t vrf_id)
 {
 	struct sctp_vrflist *head;
 	struct sctp_vrf *cur;
 
 	head = &SCTP_BASE_INFO(sctp_vrfhash)[(vrf_id & SCTP_BASE_INFO(hashvrfmark))];
 	LIST_FOREACH(cur, head, next_vrf) {
 		if (cur->vrf_id == vrf_id) {
 			break;
 		}
 	}
 	/* cur is NULL here when the walk ran off the end of the bucket. */
 	return (cur);
 }
 
 
 /*
  * Drop one reference on a VRF. When SCTP_DECREMENT_AND_CHECK_REFCOUNT
  * reports that this was the last one, release the address hash, unlink
  * the VRF from the VRF hash table, free it and decrement the VRF counter.
  */
 void
 sctp_free_vrf(struct sctp_vrf *vrf)
 {
 	int was_last;
 
 	was_last = SCTP_DECREMENT_AND_CHECK_REFCOUNT(&vrf->refcount);
 	if (!was_last) {
 		return;
 	}
 	if (vrf->vrf_addr_hash != NULL) {
 		SCTP_HASH_FREE(vrf->vrf_addr_hash, vrf->vrf_addr_hashmark);
 		vrf->vrf_addr_hash = NULL;
 	}
 	/* The count dropped to zero: unlink and release. */
 	LIST_REMOVE(vrf, next_vrf);
 	SCTP_FREE(vrf, SCTP_M_VRF);
 	atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_vrfs), 1);
 }
 
 
 void
 sctp_free_ifn(struct sctp_ifn *sctp_ifnp)
 {
 	if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifnp->refcount)) {
 		/* We zero'd the count */
 		if (sctp_ifnp->vrf) {
 			sctp_free_vrf(sctp_ifnp->vrf);
 		}
 		SCTP_FREE(sctp_ifnp, SCTP_M_IFN);
 		atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ifns), 1);
 	}
 }
 
 
 /*
  * Record a new MTU on the sctp_ifn with the given interface index.
  * Nothing happens when no such ifn is known. The lookup goes through
  * sctp_find_ifn(), which asserts that the address lock is held.
  */
 void
 sctp_update_ifn_mtu(uint32_t ifn_index, uint32_t mtu)
 {
 	struct sctp_ifn *target;
 
 	target = sctp_find_ifn((void *)NULL, ifn_index);
 	if (target == NULL) {
 		return;
 	}
 	target->ifn_mtu = mtu;
 }
 
 
 void
 sctp_free_ifa(struct sctp_ifa *sctp_ifap)
 {
 	if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifap->refcount)) {
 		/* We zero'd the count */
 		if (sctp_ifap->ifn_p) {
 			sctp_free_ifn(sctp_ifap->ifn_p);
 		}
 		SCTP_FREE(sctp_ifap, SCTP_M_IFA);
 		atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ifas), 1);
 	}
 }
 
 
 /*
  * Unlink an sctp_ifn from the global ifn hash bucket and from the ifn list
  * it is on, then drop one reference on it (which may free it).
  *
  * sctp_ifnp:      the ifn to remove.
  * hold_addr_lock: zero when the caller does NOT hold the address write
  *                 lock, in which case it is taken and released here;
  *                 non-zero when the caller already holds it (asserted).
  *
  * Nothing is unlinked and no reference is dropped when the ifn cannot be
  * found in the hash.
  */
 static void
 sctp_delete_ifn(struct sctp_ifn *sctp_ifnp, int hold_addr_lock)
 {
 	struct sctp_ifn *found;
 
 	/*
 	 * The lock has to be held before the lookup: sctp_find_ifn()
 	 * asserts it and walks the shared hash bucket.
 	 */
 	if (hold_addr_lock == 0) {
 		SCTP_IPI_ADDR_WLOCK();
 	} else {
 		SCTP_IPI_ADDR_WLOCK_ASSERT();
 	}
 	found = sctp_find_ifn(sctp_ifnp->ifn_p, sctp_ifnp->ifn_index);
 	if (found != NULL) {
 		LIST_REMOVE(sctp_ifnp, next_bucket);
 		LIST_REMOVE(sctp_ifnp, next_ifn);
 	}
 	if (hold_addr_lock == 0) {
 		SCTP_IPI_ADDR_WUNLOCK();
 	}
 	if (found == NULL) {
 		/* Not in the list.. sorry */
 		return;
 	}
 	/* Take away the reference, and possibly free it */
 	sctp_free_ifn(sctp_ifnp);
 }
 
 
 void
 sctp_mark_ifa_addr_down(uint32_t vrf_id, struct sockaddr *addr,
     const char *if_name, uint32_t ifn_index)
 {
 	struct sctp_vrf *vrf;
 	struct sctp_ifa *sctp_ifap;
 
 	SCTP_IPI_ADDR_RLOCK();
 	vrf = sctp_find_vrf(vrf_id);
 	if (vrf == NULL) {
 		SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id);
 		goto out;
 
 	}
 	sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED);
 	if (sctp_ifap == NULL) {
 		SCTPDBG(SCTP_DEBUG_PCB4, "Can't find sctp_ifap for address\n");
 		goto out;
 	}
 	if (sctp_ifap->ifn_p == NULL) {
 		SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unusable\n");
 		goto out;
 	}
 	if (if_name) {
 		if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, SCTP_IFNAMSIZ) != 0) {
 			SCTPDBG(SCTP_DEBUG_PCB4, "IFN %s of IFA not the same as %s\n",
 			    sctp_ifap->ifn_p->ifn_name, if_name);
 			goto out;
 		}
 	} else {
 		if (sctp_ifap->ifn_p->ifn_index != ifn_index) {
 			SCTPDBG(SCTP_DEBUG_PCB4, "IFA owned by ifn_index:%d down command for ifn_index:%d - ignored\n",
 			    sctp_ifap->ifn_p->ifn_index, ifn_index);
 			goto out;
 		}
 	}
 
 	sctp_ifap->localifa_flags &= (~SCTP_ADDR_VALID);
 	sctp_ifap->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE;
 out:
 	SCTP_IPI_ADDR_RUNLOCK();
 }
 
 
 /*
  * Mark a local address as up again: clear SCTP_ADDR_IFA_UNUSEABLE and set
  * SCTP_ADDR_VALID on the matching sctp_ifa.
  *
  * vrf_id:    VRF in which the address is looked up.
  * addr:      the local address to mark.
  * if_name:   when non-NULL, the owning ifn's name must match it.
  * ifn_index: used for the ownership check only when if_name is NULL.
  *
  * The request is ignored (with a debug message) when the VRF or address
  * is unknown, when the address has no ifn, or when the ownership check
  * fails. The whole operation runs under the address read lock.
  */
 void
 sctp_mark_ifa_addr_up(uint32_t vrf_id, struct sockaddr *addr,
     const char *if_name, uint32_t ifn_index)
 {
 	struct sctp_vrf *vrf;
 	struct sctp_ifa *sctp_ifap;
 
 	SCTP_IPI_ADDR_RLOCK();
 	vrf = sctp_find_vrf(vrf_id);
 	if (vrf == NULL) {
 		SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id);
 		goto out;
 	}
 	sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED);
 	if (sctp_ifap == NULL) {
 		SCTPDBG(SCTP_DEBUG_PCB4, "Can't find sctp_ifap for address\n");
 		goto out;
 	}
 	if (sctp_ifap->ifn_p == NULL) {
 		/* This is the "up" path, so say so in the message. */
 		SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark usable\n");
 		goto out;
 	}
 	if (if_name) {
 		if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, SCTP_IFNAMSIZ) != 0) {
 			SCTPDBG(SCTP_DEBUG_PCB4, "IFN %s of IFA not the same as %s\n",
 			    sctp_ifap->ifn_p->ifn_name, if_name);
 			goto out;
 		}
 	} else {
 		if (sctp_ifap->ifn_p->ifn_index != ifn_index) {
 			SCTPDBG(SCTP_DEBUG_PCB4, "IFA owned by ifn_index:%d up command for ifn_index:%d - ignored\n",
 			    sctp_ifap->ifn_p->ifn_index, ifn_index);
 			goto out;
 		}
 	}
 
 	sctp_ifap->localifa_flags &= (~SCTP_ADDR_IFA_UNUSEABLE);
 	sctp_ifap->localifa_flags |= SCTP_ADDR_VALID;
 out:
 	SCTP_IPI_ADDR_RUNLOCK();
 }
 
 
 /*-
  * Attach an ifa to an ifn.
  * The ifa is linked onto the ifn's address list, the ifn gains a reference
  * and its per-family address counters are updated. When this is the first
  * address on the ifn, the ifn's registered address family is set to the
  * family of this address.
  * NOTE: ADDR write lock MUST be held.
  */
 static void
 sctp_add_ifa_to_ifn(struct sctp_ifn *sctp_ifnp, struct sctp_ifa *sctp_ifap)
 {
 	int family;
 
 	family = sctp_ifap->address.sa.sa_family;
 
 	/* link the address and take the ifn reference it now holds */
 	LIST_INSERT_HEAD(&sctp_ifnp->ifalist, sctp_ifap, next_ifa);
 	sctp_ifap->ifn_p = sctp_ifnp;
 	atomic_add_int(&sctp_ifnp->refcount, 1);
 
 	/* bookkeeping: total and per-family counts */
 	sctp_ifnp->ifa_count++;
 	switch (family) {
 #ifdef INET
 	case AF_INET:
 		sctp_ifnp->num_v4++;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		sctp_ifnp->num_v6++;
 		break;
 #endif
 	default:
 		break;
 	}
 
 	/* first address on this interface: register its family */
 	if (sctp_ifnp->ifa_count == 1) {
 		sctp_ifnp->registered_af = family;
 	}
 }
 
 
 /*-
  * Remove an ifa from its ifn.
  * If no more addresses exist, remove the ifn too. Otherwise, re-register
  * the interface based on the remaining address families left.
  * NOTE: ADDR write lock MUST be held.
  */
 static void
 sctp_remove_ifa_from_ifn(struct sctp_ifa *sctp_ifap)
 {
 	LIST_REMOVE(sctp_ifap, next_ifa);
 	if (sctp_ifap->ifn_p) {
 		/* update address counts */
 		sctp_ifap->ifn_p->ifa_count--;
 		switch (sctp_ifap->address.sa.sa_family) {
 #ifdef INET
 		case AF_INET:
 			sctp_ifap->ifn_p->num_v4--;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			sctp_ifap->ifn_p->num_v6--;
 			break;
 #endif
 		default:
 			break;
 		}
 
 		if (LIST_EMPTY(&sctp_ifap->ifn_p->ifalist)) {
 			/* remove the ifn, possibly freeing it */
 			sctp_delete_ifn(sctp_ifap->ifn_p, SCTP_ADDR_LOCKED);
 		} else {
 			/* re-register address family type, if needed */
 			if ((sctp_ifap->ifn_p->num_v6 == 0) &&
 			    (sctp_ifap->ifn_p->registered_af == AF_INET6)) {
 				sctp_ifap->ifn_p->registered_af = AF_INET;
 			} else if ((sctp_ifap->ifn_p->num_v4 == 0) &&
 			    (sctp_ifap->ifn_p->registered_af == AF_INET)) {
 				sctp_ifap->ifn_p->registered_af = AF_INET6;
 			}
 			/* free the ifn refcount */
 			sctp_free_ifn(sctp_ifap->ifn_p);
 		}
 		sctp_ifap->ifn_p = NULL;
 	}
 }
 
 
 /*
  * Register a local address with a VRF, creating the VRF and the sctp_ifn
  * for its interface on demand.
  *
  * vrf_id:      id of the VRF to use when the interface is not yet known.
  * ifn:         opaque interface handle; stored in the new ifn's ifn_p and
  *              used (together with ifn_index) for the ifn lookup.
  * ifn_index:   interface index; selects the ifn hash bucket.
  * ifn_type:    stored in the new ifn's ifn_type.
  * if_name:     interface name; "unknown" is recorded when NULL.
  * ifa:         opaque address handle; stored in the new sctp_ifa's ifa.
  * addr:        the address itself; addr->sa_len bytes are copied.
  * ifa_flags:   stored in the new sctp_ifa's flags.
  * dynamic_add: non-zero queues an SCTP_ADD_IP_ADDRESS work item and starts
  *              the address work-queue timer, leaving SCTP_ADDR_DEFER_USE
  *              set; zero clears SCTP_ADDR_DEFER_USE right away.
  *
  * Returns the sctp_ifa for the address (an already existing one when the
  * address was known), or NULL when an allocation fails.
  *
  * Both an ifn and an ifa are allocated up front, before the address write
  * lock is taken; whichever turns out not to be needed is freed again.
  */
 struct sctp_ifa *
 sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index,
     uint32_t ifn_type, const char *if_name, void *ifa,
     struct sockaddr *addr, uint32_t ifa_flags,
     int dynamic_add)
 {
 	struct sctp_vrf *vrf;
 	struct sctp_ifn *sctp_ifnp, *new_sctp_ifnp;
 	struct sctp_ifa *sctp_ifap, *new_sctp_ifap;
 	struct sctp_ifalist *hash_addr_head;
 	struct sctp_ifnlist *hash_ifn_head;
 	uint32_t hash_of_addr;
 	/*
 	 * 0: ifn already existed; 1: ifn was created here; later replaced
 	 * by the address family to register on the new ifn.
 	 */
 	int new_ifn_af = 0;
 
 #ifdef SCTP_DEBUG
 	SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: adding address: ", vrf_id);
 	SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr);
 #endif
 	/* Pre-allocate both objects while no lock is held. */
 	SCTP_MALLOC(new_sctp_ifnp, struct sctp_ifn *,
 	    sizeof(struct sctp_ifn), SCTP_M_IFN);
 	if (new_sctp_ifnp == NULL) {
 #ifdef INVARIANTS
 		panic("No memory for IFN");
 #endif
 		return (NULL);
 	}
 	SCTP_MALLOC(new_sctp_ifap, struct sctp_ifa *, sizeof(struct sctp_ifa), SCTP_M_IFA);
 	if (new_sctp_ifap == NULL) {
 #ifdef INVARIANTS
 		panic("No memory for IFA");
 #endif
 		SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN);
 		return (NULL);
 	}
 
 	SCTP_IPI_ADDR_WLOCK();
 	/* A known interface dictates the VRF; otherwise find or create it. */
 	sctp_ifnp = sctp_find_ifn(ifn, ifn_index);
 	if (sctp_ifnp) {
 		vrf = sctp_ifnp->vrf;
 	} else {
 		vrf = sctp_find_vrf(vrf_id);
 		if (vrf == NULL) {
 			vrf = sctp_allocate_vrf(vrf_id);
 			if (vrf == NULL) {
 				SCTP_IPI_ADDR_WUNLOCK();
 				SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN);
 				SCTP_FREE(new_sctp_ifap, SCTP_M_IFA);
 				return (NULL);
 			}
 		}
 	}
 	if (sctp_ifnp == NULL) {
 		/*
 		 * build one and add it, can't hold lock until after malloc
 		 * done though.
 		 */
 		sctp_ifnp = new_sctp_ifnp;
 		/* Ownership moved: keeps the exit paths from freeing it. */
 		new_sctp_ifnp = NULL;
 		memset(sctp_ifnp, 0, sizeof(struct sctp_ifn));
 		sctp_ifnp->ifn_index = ifn_index;
 		sctp_ifnp->ifn_p = ifn;
 		sctp_ifnp->ifn_type = ifn_type;
 		sctp_ifnp->refcount = 0;
 		sctp_ifnp->vrf = vrf;
 		atomic_add_int(&vrf->refcount, 1);
 		sctp_ifnp->ifn_mtu = SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index, addr->sa_family);
 		if (if_name != NULL) {
 			SCTP_SNPRINTF(sctp_ifnp->ifn_name, SCTP_IFNAMSIZ, "%s", if_name);
 		} else {
 			SCTP_SNPRINTF(sctp_ifnp->ifn_name, SCTP_IFNAMSIZ, "%s", "unknown");
 		}
 		hash_ifn_head = &SCTP_BASE_INFO(vrf_ifn_hash)[(ifn_index & SCTP_BASE_INFO(vrf_ifn_hashmark))];
 		LIST_INIT(&sctp_ifnp->ifalist);
 		LIST_INSERT_HEAD(hash_ifn_head, sctp_ifnp, next_bucket);
 		LIST_INSERT_HEAD(&vrf->ifnlist, sctp_ifnp, next_ifn);
 		atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifns), 1);
 		new_ifn_af = 1;
 	}
 	/*
 	 * Is the address already known in this VRF? Three outcomes follow:
 	 * it sits on an ifn with the same index, it sits on a different ifn,
 	 * or it has no ifn at all.
 	 */
 	sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED);
 	if (sctp_ifap) {
 		/* Hmm, it already exists? */
 		if ((sctp_ifap->ifn_p) &&
 		    (sctp_ifap->ifn_p->ifn_index == ifn_index)) {
 			SCTPDBG(SCTP_DEBUG_PCB4, "Using existing ifn %s (0x%x) for ifa %p\n",
 			    sctp_ifap->ifn_p->ifn_name, ifn_index,
 			    (void *)sctp_ifap);
 			if (new_ifn_af) {
 				/* Remove the created one that we don't want */
 				sctp_delete_ifn(sctp_ifnp, SCTP_ADDR_LOCKED);
 			}
 			if (sctp_ifap->localifa_flags & SCTP_BEING_DELETED) {
 				/* easy to solve, just switch back to active */
 				SCTPDBG(SCTP_DEBUG_PCB4, "Clearing deleted ifa flag\n");
 				sctp_ifap->localifa_flags = SCTP_ADDR_VALID;
 				/*
 				 * NOTE(review): sctp_ifnp may have just been
 				 * handed to sctp_delete_ifn() above when
 				 * new_ifn_af was set - confirm this pointer
 				 * is still valid on that path.
 				 */
 				sctp_ifap->ifn_p = sctp_ifnp;
 				atomic_add_int(&sctp_ifap->ifn_p->refcount, 1);
 			}
 			/*
 			 * Common exit for every "address already existed"
 			 * case; the else branch below jumps here too.
 			 */
 	exit_stage_left:
 			SCTP_IPI_ADDR_WUNLOCK();
 			if (new_sctp_ifnp != NULL) {
 				SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN);
 			}
 			SCTP_FREE(new_sctp_ifap, SCTP_M_IFA);
 			return (sctp_ifap);
 		} else {
 			if (sctp_ifap->ifn_p) {
 				/*
 				 * The last IFN gets the address, remove the
 				 * old one
 				 */
 				SCTPDBG(SCTP_DEBUG_PCB4, "Moving ifa %p from %s (0x%x) to %s (0x%x)\n",
 				    (void *)sctp_ifap, sctp_ifap->ifn_p->ifn_name,
 				    sctp_ifap->ifn_p->ifn_index, if_name,
 				    ifn_index);
 				/* remove the address from the old ifn */
 				sctp_remove_ifa_from_ifn(sctp_ifap);
 				/* move the address over to the new ifn */
 				sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap);
 				goto exit_stage_left;
 			} else {
 				/* repair ifnp which was NULL ? */
 				sctp_ifap->localifa_flags = SCTP_ADDR_VALID;
 				SCTPDBG(SCTP_DEBUG_PCB4, "Repairing ifn %p for ifa %p\n",
 				    (void *)sctp_ifnp, (void *)sctp_ifap);
 				sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap);
 			}
 			goto exit_stage_left;
 		}
 	}
 	/* Unknown address: turn the pre-allocated ifa into the real one. */
 	sctp_ifap = new_sctp_ifap;
 	memset(sctp_ifap, 0, sizeof(struct sctp_ifa));
 	sctp_ifap->ifn_p = sctp_ifnp;
 	atomic_add_int(&sctp_ifnp->refcount, 1);
 	sctp_ifap->vrf_id = vrf_id;
 	sctp_ifap->ifa = ifa;
 	/* assumes addr->sa_len fits the address field - TODO confirm */
 	memcpy(&sctp_ifap->address, addr, addr->sa_len);
 	sctp_ifap->localifa_flags = SCTP_ADDR_VALID | SCTP_ADDR_DEFER_USE;
 	sctp_ifap->flags = ifa_flags;
 	/* Set scope */
 	switch (sctp_ifap->address.sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		{
 			struct sockaddr_in *sin;
 
 			sin = &sctp_ifap->address.sin;
 			if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) ||
 			    (IN4_ISLOOPBACK_ADDRESS(&sin->sin_addr))) {
 				sctp_ifap->src_is_loop = 1;
 			}
 			if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) {
 				sctp_ifap->src_is_priv = 1;
 			}
 			sctp_ifnp->num_v4++;
 			if (new_ifn_af)
 				new_ifn_af = AF_INET;
 			break;
 		}
 #endif
 #ifdef INET6
 	case AF_INET6:
 		{
 			/* ok to use deprecated addresses? */
 			struct sockaddr_in6 *sin6;
 
 			sin6 = &sctp_ifap->address.sin6;
 			if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) ||
 			    (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr))) {
 				sctp_ifap->src_is_loop = 1;
 			}
 			if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
 				sctp_ifap->src_is_priv = 1;
 			}
 			sctp_ifnp->num_v6++;
 			if (new_ifn_af)
 				new_ifn_af = AF_INET6;
 			break;
 		}
 #endif
 	default:
 		/* Unhandled family: nothing gets registered on the ifn. */
 		new_ifn_af = 0;
 		break;
 	}
 	hash_of_addr = sctp_get_ifa_hash_val(&sctp_ifap->address.sa);
 
 	/* Neither private nor loopback means global scope. */
 	if ((sctp_ifap->src_is_priv == 0) &&
 	    (sctp_ifap->src_is_loop == 0)) {
 		sctp_ifap->src_is_glob = 1;
 	}
 	/* Link into the VRF's address hash and the ifn's address list. */
 	hash_addr_head = &vrf->vrf_addr_hash[(hash_of_addr & vrf->vrf_addr_hashmark)];
 	LIST_INSERT_HEAD(hash_addr_head, sctp_ifap, next_bucket);
 	sctp_ifap->refcount = 1;
 	LIST_INSERT_HEAD(&sctp_ifnp->ifalist, sctp_ifap, next_ifa);
 	sctp_ifnp->ifa_count++;
 	vrf->total_ifa_count++;
 	atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifas), 1);
 	if (new_ifn_af) {
 		sctp_ifnp->registered_af = new_ifn_af;
 	}
 	SCTP_IPI_ADDR_WUNLOCK();
 	if (new_sctp_ifnp != NULL) {
 		SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN);
 	}
 
 	if (dynamic_add) {
 		/*
 		 * Bump up the refcount so that when the timer completes it
 		 * will drop back down.
 		 */
 		struct sctp_laddr *wi;
 
 		atomic_add_int(&sctp_ifap->refcount, 1);
 		wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr);
 		if (wi == NULL) {
 			/*
 			 * Gak, what can we do? We have lost an address
 			 * change can you say HOSED?
 			 */
 			SCTPDBG(SCTP_DEBUG_PCB4, "Lost an address change?\n");
 			/* Opps, must decrement the count */
 			sctp_del_addr_from_vrf(vrf_id, addr, ifn_index,
 			    if_name);
 			return (NULL);
 		}
 		SCTP_INCR_LADDR_COUNT();
 		memset(wi, 0, sizeof(*wi));
 		(void)SCTP_GETTIME_TIMEVAL(&wi->start_time);
 		wi->ifa = sctp_ifap;
 		wi->action = SCTP_ADD_IP_ADDRESS;
 
 		/* Queue the work item and kick the address work-queue timer. */
 		SCTP_WQ_ADDR_LOCK();
 		LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr);
 		sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ,
 		    (struct sctp_inpcb *)NULL,
 		    (struct sctp_tcb *)NULL,
 		    (struct sctp_nets *)NULL);
 		SCTP_WQ_ADDR_UNLOCK();
 	} else {
 		/* it's ready for use */
 		sctp_ifap->localifa_flags &= ~SCTP_ADDR_DEFER_USE;
 	}
 	return (sctp_ifap);
 }
 
 /*
  * Remove a local address from a VRF and schedule the deletion work.
  *
  * vrf_id:    VRF to look the address up in.
  * addr:      the address to delete.
  * ifn_index: interface index, used to validate the delete when the name
  *            is absent or does not match.
  * if_name:   interface name (may be NULL); checked first when given.
  *
  * Under the address write lock the sctp_ifa is looked up and, when it has
  * an ifn, the request is validated against that ifn; a mismatch is ignored.
  * A valid delete unlinks the ifa from the VRF address hash and from its
  * ifn. After the lock is dropped an SCTP_DEL_IP_ADDRESS work item is queued
  * and the address work-queue timer started; if the work item cannot be
  * allocated, sctp_free_ifa() is called on the ifa instead.
  */
 void
 sctp_del_addr_from_vrf(uint32_t vrf_id, struct sockaddr *addr,
     uint32_t ifn_index, const char *if_name)
 {
 	struct sctp_vrf *vrf;
 	struct sctp_ifa *sctp_ifap = NULL;
 
 	SCTP_IPI_ADDR_WLOCK();
 	vrf = sctp_find_vrf(vrf_id);
 	if (vrf == NULL) {
 		SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id);
 		goto out_now;
 	}
 
 #ifdef SCTP_DEBUG
 	SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: deleting address:", vrf_id);
 	SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr);
 #endif
 	sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED);
 	if (sctp_ifap) {
 		/* Validate the delete */
 		if (sctp_ifap->ifn_p) {
 			int valid = 0;
 
 			/*-
 			 * The name has priority over the ifn_index
 			 * if its given.
 			 */
 			if (if_name) {
 				if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, SCTP_IFNAMSIZ) == 0) {
 					/* They match its a correct delete */
 					valid = 1;
 				}
 			}
 			if (!valid) {
 				/* last ditch check ifn_index */
 				if (ifn_index == sctp_ifap->ifn_p->ifn_index) {
 					valid = 1;
 				}
 			}
 			if (!valid) {
 				SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d ifname:%s does not match addresses\n",
 				    ifn_index, ((if_name == NULL) ? "NULL" : if_name));
 				SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d ifname:%s - ignoring delete\n",
 				    sctp_ifap->ifn_p->ifn_index, sctp_ifap->ifn_p->ifn_name);
 				/* Returns directly: no work item is queued. */
 				SCTP_IPI_ADDR_WUNLOCK();
 				return;
 			}
 		}
 		SCTPDBG(SCTP_DEBUG_PCB4, "Deleting ifa %p\n", (void *)sctp_ifap);
 		/*
 		 * This AND keeps only the SCTP_ADDR_VALID bit (if it was
 		 * set) and clears every other flag.
 		 * NOTE(review): confirm that this, rather than clearing
 		 * SCTP_ADDR_VALID, is the intent.
 		 */
 		sctp_ifap->localifa_flags &= SCTP_ADDR_VALID;
 		/*
 		 * We don't set the flag. This means that the structure will
 		 * hang around in EP's that have bound specific to it until
 		 * they close. This gives us TCP like behavior if someone
 		 * removes an address (or for that matter adds it right
 		 * back).
 		 */
 		/* sctp_ifap->localifa_flags |= SCTP_BEING_DELETED; */
 		vrf->total_ifa_count--;
 		LIST_REMOVE(sctp_ifap, next_bucket);
 		sctp_remove_ifa_from_ifn(sctp_ifap);
 	}
 	/* This else branch only exists in SCTP_DEBUG builds. */
 #ifdef SCTP_DEBUG
 	else {
 		SCTPDBG(SCTP_DEBUG_PCB4, "Del Addr-ifn:%d Could not find address:",
 		    ifn_index);
 		SCTPDBG_ADDR(SCTP_DEBUG_PCB1, addr);
 	}
 #endif
 
 out_now:
 	SCTP_IPI_ADDR_WUNLOCK();
 	/* sctp_ifap is still NULL when the VRF or the address was unknown. */
 	if (sctp_ifap) {
 		struct sctp_laddr *wi;
 
 		wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr);
 		if (wi == NULL) {
 			/*
 			 * Gak, what can we do? We have lost an address
 			 * change can you say HOSED?
 			 */
 			SCTPDBG(SCTP_DEBUG_PCB4, "Lost an address change?\n");
 
 			/* Oops, must decrement the count */
 			sctp_free_ifa(sctp_ifap);
 			return;
 		}
 		SCTP_INCR_LADDR_COUNT();
 		memset(wi, 0, sizeof(*wi));
 		(void)SCTP_GETTIME_TIMEVAL(&wi->start_time);
 		wi->ifa = sctp_ifap;
 		wi->action = SCTP_DEL_IP_ADDRESS;
 		SCTP_WQ_ADDR_LOCK();
 		/*
 		 * Should this really be a tailq? As it is we will process
 		 * the newest first :-0
 		 */
 		LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr);
 		sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ,
 		    (struct sctp_inpcb *)NULL,
 		    (struct sctp_tcb *)NULL,
 		    (struct sctp_nets *)NULL);
 		SCTP_WQ_ADDR_UNLOCK();
 	}
 	return;
 }
 
 
 /*
  * Decide whether the address `to` is a local address the association
  * `stcb` may use.
  *
  * For an endpoint bound to all addresses (SCTP_PCB_FLAGS_BOUNDALL) every
  * address of every interface in the association's VRF is considered,
  * subject to the association's scope settings and the jail (prison) check
  * of the endpoint's credentials. Otherwise only the endpoint's explicitly
  * bound address list is searched. In both cases addresses that are
  * restricted for the association are skipped unless they are pending.
  *
  * Returns 1 on a match, 0 otherwise. The address read lock is taken here
  * and released on every return path.
  */
 static int
 sctp_does_stcb_own_this_addr(struct sctp_tcb *stcb, struct sockaddr *to)
 {
 	int loopback_scope;
 #if defined(INET)
 	int ipv4_local_scope, ipv4_addr_legal;
 #endif
 #if defined(INET6)
 	int local_scope, site_scope, ipv6_addr_legal;
 #endif
 	struct sctp_vrf *vrf;
 	struct sctp_ifn *sctp_ifn;
 	struct sctp_ifa *sctp_ifa;
 
 	/* Cache the association's scope settings in locals. */
 	loopback_scope = stcb->asoc.scope.loopback_scope;
 #if defined(INET)
 	ipv4_local_scope = stcb->asoc.scope.ipv4_local_scope;
 	ipv4_addr_legal = stcb->asoc.scope.ipv4_addr_legal;
 #endif
 #if defined(INET6)
 	local_scope = stcb->asoc.scope.local_scope;
 	site_scope = stcb->asoc.scope.site_scope;
 	ipv6_addr_legal = stcb->asoc.scope.ipv6_addr_legal;
 #endif
 
 	SCTP_IPI_ADDR_RLOCK();
 	vrf = sctp_find_vrf(stcb->asoc.vrf_id);
 	if (vrf == NULL) {
 		/* no vrf, no addresses */
 		SCTP_IPI_ADDR_RUNLOCK();
 		return (0);
 	}
 
 	if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
 		/* Bound-all: walk every interface and address of the VRF. */
 		LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
 			if ((loopback_scope == 0) &&
 			    SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) {
 				continue;
 			}
 			LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
 				if (sctp_is_addr_restricted(stcb, sctp_ifa) &&
 				    (!sctp_is_addr_pending(stcb, sctp_ifa))) {
 					/*
 					 * We allow pending addresses, where
 					 * we have sent an asconf-add to be
 					 * considered valid.
 					 */
 					continue;
 				}
 				if (sctp_ifa->address.sa.sa_family != to->sa_family) {
 					continue;
 				}
 				/*
 				 * The `continue` statements inside this
 				 * switch advance the inner LIST_FOREACH.
 				 */
 				switch (sctp_ifa->address.sa.sa_family) {
 #ifdef INET
 				case AF_INET:
 					if (ipv4_addr_legal) {
 						struct sockaddr_in *sin,
 						           *rsin;
 
 						sin = &sctp_ifa->address.sin;
 						rsin = (struct sockaddr_in *)to;
 						if ((ipv4_local_scope == 0) &&
 						    IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) {
 							continue;
 						}
 						if (prison_check_ip4(stcb->sctp_ep->ip_inp.inp.inp_cred,
 						    &sin->sin_addr) != 0) {
 							continue;
 						}
 						if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) {
 							SCTP_IPI_ADDR_RUNLOCK();
 							return (1);
 						}
 					}
 					break;
 #endif
 #ifdef INET6
 				case AF_INET6:
 					if (ipv6_addr_legal) {
 						struct sockaddr_in6 *sin6,
 						            *rsin6;
 
 						sin6 = &sctp_ifa->address.sin6;
 						rsin6 = (struct sockaddr_in6 *)to;
 						if (prison_check_ip6(stcb->sctp_ep->ip_inp.inp.inp_cred,
 						    &sin6->sin6_addr) != 0) {
 							continue;
 						}
 						if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
 							if (local_scope == 0)
 								continue;
 							/*
 							 * NOTE(review): sin6 points
 							 * at the address stored in
 							 * the sctp_ifa, so whatever
 							 * sa6_recoverscope() writes
 							 * lands in shared data while
 							 * only the read lock is held
 							 * - confirm.
 							 */
 							if (sin6->sin6_scope_id == 0) {
 								if (sa6_recoverscope(sin6) != 0)
 									continue;
 							}
 						}
 						if ((site_scope == 0) &&
 						    (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) {
 							continue;
 						}
 						if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) {
 							SCTP_IPI_ADDR_RUNLOCK();
 							return (1);
 						}
 					}
 					break;
 #endif
 				default:
 					/* TSNH */
 					break;
 				}
 			}
 		}
 	} else {
 		/*
 		 * Bound to specific addresses: only the endpoint's own list
 		 * is searched. No scope or prison checks are applied on this
 		 * path.
 		 */
 		struct sctp_laddr *laddr;
 
 		LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) {
 			if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) {
 				SCTPDBG(SCTP_DEBUG_PCB1, "ifa being deleted\n");
 				continue;
 			}
 			if (sctp_is_addr_restricted(stcb, laddr->ifa) &&
 			    (!sctp_is_addr_pending(stcb, laddr->ifa))) {
 				/*
 				 * We allow pending addresses, where we have
 				 * sent an asconf-add to be considered
 				 * valid.
 				 */
 				continue;
 			}
 			if (laddr->ifa->address.sa.sa_family != to->sa_family) {
 				continue;
 			}
 			switch (to->sa_family) {
 #ifdef INET
 			case AF_INET:
 				{
 					struct sockaddr_in *sin, *rsin;
 
 					sin = &laddr->ifa->address.sin;
 					rsin = (struct sockaddr_in *)to;
 					if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) {
 						SCTP_IPI_ADDR_RUNLOCK();
 						return (1);
 					}
 					break;
 				}
 #endif
 #ifdef INET6
 			case AF_INET6:
 				{
 					struct sockaddr_in6 *sin6, *rsin6;
 
 					sin6 = &laddr->ifa->address.sin6;
 					rsin6 = (struct sockaddr_in6 *)to;
 					if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) {
 						SCTP_IPI_ADDR_RUNLOCK();
 						return (1);
 					}
 					break;
 				}
 
 #endif
 			default:
 				/* TSNH */
 				break;
 			}
 
 		}
 	}
 	/* Nothing matched. */
 	SCTP_IPI_ADDR_RUNLOCK();
 	return (0);
 }
 
 
 static struct sctp_tcb *
 sctp_tcb_special_locate(struct sctp_inpcb **inp_p, struct sockaddr *from,
     struct sockaddr *to, struct sctp_nets **netp, uint32_t vrf_id)
 {
 	/**** ASSUMES THE CALLER holds the INP_INFO_RLOCK */
 	/*
 	 * If we support the TCP model, then we must now dig through to see
 	 * if we can find our endpoint in the list of tcp ep's.
 	 */
 	uint16_t lport, rport;
 	struct sctppcbhead *ephead;
 	struct sctp_inpcb *inp;
 	struct sctp_laddr *laddr;
 	struct sctp_tcb *stcb;
 	struct sctp_nets *net;
 
 	if ((to == NULL) || (from == NULL)) {
 		return (NULL);
 	}
 
 	switch (to->sa_family) {
 #ifdef INET
 	case AF_INET:
 		if (from->sa_family == AF_INET) {
 			lport = ((struct sockaddr_in *)to)->sin_port;
 			rport = ((struct sockaddr_in *)from)->sin_port;
 		} else {
 			return (NULL);
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if (from->sa_family == AF_INET6) {
 			lport = ((struct sockaddr_in6 *)to)->sin6_port;
 			rport = ((struct sockaddr_in6 *)from)->sin6_port;
 		} else {
 			return (NULL);
 		}
 		break;
 #endif
 	default:
 		return (NULL);
 	}
 	ephead = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR((lport | rport), SCTP_BASE_INFO(hashtcpmark))];
 	/*
 	 * Ok now for each of the guys in this bucket we must look and see:
 	 * - Does the remote port match. - Does there single association's
 	 * addresses match this address (to). If so we update p_ep to point
 	 * to this ep and return the tcb from it.
 	 */
 	LIST_FOREACH(inp, ephead, sctp_hash) {
 		SCTP_INP_RLOCK(inp);
 		if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
 			SCTP_INP_RUNLOCK(inp);
 			continue;
 		}
 		if (lport != inp->sctp_lport) {
 			SCTP_INP_RUNLOCK(inp);
 			continue;
 		}
 		switch (to->sa_family) {
 #ifdef INET
 		case AF_INET:
 			{
 				struct sockaddr_in *sin;
 
 				sin = (struct sockaddr_in *)to;
 				if (prison_check_ip4(inp->ip_inp.inp.inp_cred,
 				    &sin->sin_addr) != 0) {
 					SCTP_INP_RUNLOCK(inp);
 					continue;
 				}
 				break;
 			}
 #endif
 #ifdef INET6
 		case AF_INET6:
 			{
 				struct sockaddr_in6 *sin6;
 
 				sin6 = (struct sockaddr_in6 *)to;
 				if (prison_check_ip6(inp->ip_inp.inp.inp_cred,
 				    &sin6->sin6_addr) != 0) {
 					SCTP_INP_RUNLOCK(inp);
 					continue;
 				}
 				break;
 			}
 #endif
 		default:
 			SCTP_INP_RUNLOCK(inp);
 			continue;
 		}
 		if (inp->def_vrf_id != vrf_id) {
 			SCTP_INP_RUNLOCK(inp);
 			continue;
 		}
 		/* check to see if the ep has one of the addresses */
 		if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) {
 			/* We are NOT bound all, so look further */
 			int match = 0;
 
 			LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
 
 				if (laddr->ifa == NULL) {
 					SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __func__);
 					continue;
 				}
 				if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) {
 					SCTPDBG(SCTP_DEBUG_PCB1, "ifa being deleted\n");
 					continue;
 				}
 				if (laddr->ifa->address.sa.sa_family ==
 				    to->sa_family) {
 					/* see if it matches */
 #ifdef INET
 					if (from->sa_family == AF_INET) {
 						struct sockaddr_in *intf_addr,
 						           *sin;
 
 						intf_addr = &laddr->ifa->address.sin;
 						sin = (struct sockaddr_in *)to;
 						if (sin->sin_addr.s_addr ==
 						    intf_addr->sin_addr.s_addr) {
 							match = 1;
 							break;
 						}
 					}
 #endif
 #ifdef INET6
 					if (from->sa_family == AF_INET6) {
 						struct sockaddr_in6 *intf_addr6;
 						struct sockaddr_in6 *sin6;
 
 						sin6 = (struct sockaddr_in6 *)
 						    to;
 						intf_addr6 = &laddr->ifa->address.sin6;
 
 						if (SCTP6_ARE_ADDR_EQUAL(sin6,
 						    intf_addr6)) {
 							match = 1;
 							break;
 						}
 					}
 #endif
 				}
 			}
 			if (match == 0) {
 				/* This endpoint does not have this address */
 				SCTP_INP_RUNLOCK(inp);
 				continue;
 			}
 		}
 		/*
 		 * Ok if we hit here the ep has the address, does it hold
 		 * the tcb?
 		 */
 		/* XXX: Why don't we TAILQ_FOREACH through sctp_asoc_list? */
 		stcb = LIST_FIRST(&inp->sctp_asoc_list);
 		if (stcb == NULL) {
 			SCTP_INP_RUNLOCK(inp);
 			continue;
 		}
 		SCTP_TCB_LOCK(stcb);
 		if (!sctp_does_stcb_own_this_addr(stcb, to)) {
 			SCTP_TCB_UNLOCK(stcb);
 			SCTP_INP_RUNLOCK(inp);
 			continue;
 		}
 		if (stcb->rport != rport) {
 			/* remote port does not match. */
 			SCTP_TCB_UNLOCK(stcb);
 			SCTP_INP_RUNLOCK(inp);
 			continue;
 		}
 		if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 			SCTP_TCB_UNLOCK(stcb);
 			SCTP_INP_RUNLOCK(inp);
 			continue;
 		}
 		if (!sctp_does_stcb_own_this_addr(stcb, to)) {
 			SCTP_TCB_UNLOCK(stcb);
 			SCTP_INP_RUNLOCK(inp);
 			continue;
 		}
 		/* Does this TCB have a matching address? */
 		TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 
 			if (net->ro._l_addr.sa.sa_family != from->sa_family) {
 				/* not the same family, can't be a match */
 				continue;
 			}
 			switch (from->sa_family) {
 #ifdef INET
 			case AF_INET:
 				{
 					struct sockaddr_in *sin, *rsin;
 
 					sin = (struct sockaddr_in *)&net->ro._l_addr;
 					rsin = (struct sockaddr_in *)from;
 					if (sin->sin_addr.s_addr ==
 					    rsin->sin_addr.s_addr) {
 						/* found it */
 						if (netp != NULL) {
 							*netp = net;
 						}
 						/*
 						 * Update the endpoint
 						 * pointer
 						 */
 						*inp_p = inp;
 						SCTP_INP_RUNLOCK(inp);
 						return (stcb);
 					}
 					break;
 				}
 #endif
 #ifdef INET6
 			case AF_INET6:
 				{
 					struct sockaddr_in6 *sin6, *rsin6;
 
 					sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
 					rsin6 = (struct sockaddr_in6 *)from;
 					if (SCTP6_ARE_ADDR_EQUAL(sin6,
 					    rsin6)) {
 						/* found it */
 						if (netp != NULL) {
 							*netp = net;
 						}
 						/*
 						 * Update the endpoint
 						 * pointer
 						 */
 						*inp_p = inp;
 						SCTP_INP_RUNLOCK(inp);
 						return (stcb);
 					}
 					break;
 				}
 #endif
 			default:
 				/* TSNH */
 				break;
 			}
 		}
 		SCTP_TCB_UNLOCK(stcb);
 		SCTP_INP_RUNLOCK(inp);
 	}
 	return (NULL);
 }
 
 
 /*
  * Find the association on endpoint *inp_p whose peer is reachable at the
  * address/port given by remote and, when local is non-NULL, which owns the
  * local address.
  *
  * rules for use
  *
  * 1) If I return a NULL you must decrement any INP ref cnt.
  * 2) If I find an stcb, both will be locked (locked_tcb and stcb) but the
  *    INP ref cnt decrement will be done (if locked_tcb == NULL).
  * 3) Decrement happens on return ONLY if locked_tcb == NULL.
  *
  * On a match the matching net is stored in *netp when netp is non-NULL.
  * An unsupported address family in remote returns NULL before any lock or
  * reference count is touched.
  */
 
 struct sctp_tcb *
 sctp_findassociation_ep_addr(struct sctp_inpcb **inp_p, struct sockaddr *remote,
     struct sctp_nets **netp, struct sockaddr *local, struct sctp_tcb *locked_tcb)
 {
 	struct sctpasochead *head;
 	struct sctp_inpcb *inp;
 	struct sctp_tcb *stcb = NULL;
 	struct sctp_nets *net;
 	uint16_t rport;
 
 	inp = *inp_p;
 	/* Pull the peer port out of remote; other families cannot match. */
 	switch (remote->sa_family) {
 #ifdef INET
 	case AF_INET:
 		rport = (((struct sockaddr_in *)remote)->sin_port);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		rport = (((struct sockaddr_in6 *)remote)->sin6_port);
 		break;
 #endif
 	default:
 		return (NULL);
 	}
 	if (locked_tcb) {
 		/*
 		 * UN-lock so we can do proper locking here this occurs when
 		 * called from load_addresses_from_init.
 		 */
 		/* The refcnt hold taken here is released again on every exit path below. */
 		atomic_add_int(&locked_tcb->asoc.refcnt, 1);
 		SCTP_TCB_UNLOCK(locked_tcb);
 	}
 	SCTP_INP_INFO_RLOCK();
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
 		/*-
 		 * Now either this guy is our listener or it's the
 		 * connector. If it is the one that issued the connect, then
 		 * it's only chance is to be the first TCB in the list. If
 		 * it is the acceptor, then do the special_lookup to hash
 		 * and find the real inp.
 		 */
 		if ((inp->sctp_socket) && SCTP_IS_LISTENING(inp)) {
 			/* to is peer addr, from is my addr */
 			stcb = sctp_tcb_special_locate(inp_p, remote, local,
 			    netp, inp->def_vrf_id);
 			if ((stcb != NULL) && (locked_tcb == NULL)) {
 				/* we have a locked tcb, lower refcount */
 				SCTP_INP_DECR_REF(inp);
 			}
 			/*
 			 * NOTE(review): when locked_tcb == stcb the refcnt
 			 * hold taken on entry is not subtracted on this
 			 * path -- confirm this is intended.
 			 */
 			if ((locked_tcb != NULL) && (locked_tcb != stcb)) {
 				SCTP_INP_RLOCK(locked_tcb->sctp_ep);
 				SCTP_TCB_LOCK(locked_tcb);
 				atomic_subtract_int(&locked_tcb->asoc.refcnt, 1);
 				SCTP_INP_RUNLOCK(locked_tcb->sctp_ep);
 			}
 			SCTP_INP_INFO_RUNLOCK();
 			return (stcb);
 		} else {
 			/* Not listening: only the first association on the list is examined. */
 			SCTP_INP_WLOCK(inp);
 			if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
 				goto null_return;
 			}
 			stcb = LIST_FIRST(&inp->sctp_asoc_list);
 			if (stcb == NULL) {
 				goto null_return;
 			}
 			SCTP_TCB_LOCK(stcb);
 
 			if (stcb->rport != rport) {
 				/* remote port does not match. */
 				SCTP_TCB_UNLOCK(stcb);
 				goto null_return;
 			}
 			if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 				SCTP_TCB_UNLOCK(stcb);
 				goto null_return;
 			}
 			if (local && !sctp_does_stcb_own_this_addr(stcb, local)) {
 				SCTP_TCB_UNLOCK(stcb);
 				goto null_return;
 			}
 			/* now look at the list of remote addresses */
 			TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 #ifdef INVARIANTS
 				if (net == (TAILQ_NEXT(net, sctp_next))) {
 					panic("Corrupt net list");
 				}
 #endif
 				if (net->ro._l_addr.sa.sa_family !=
 				    remote->sa_family) {
 					/* not the same family */
 					continue;
 				}
 				switch (remote->sa_family) {
 #ifdef INET
 				case AF_INET:
 					{
 						struct sockaddr_in *sin,
 						           *rsin;
 
 						sin = (struct sockaddr_in *)
 						    &net->ro._l_addr;
 						rsin = (struct sockaddr_in *)remote;
 						if (sin->sin_addr.s_addr ==
 						    rsin->sin_addr.s_addr) {
 							/* found it */
 							if (netp != NULL) {
 								*netp = net;
 							}
 							/*
 							 * Drop the INP ref only when no
 							 * locked_tcb was passed; otherwise
 							 * retake locked_tcb's lock (unless it
 							 * is the match itself) and release the
 							 * refcnt hold taken on entry.
 							 */
 							if (locked_tcb == NULL) {
 								SCTP_INP_DECR_REF(inp);
 							} else if (locked_tcb != stcb) {
 								SCTP_TCB_LOCK(locked_tcb);
 							}
 							if (locked_tcb) {
 								atomic_subtract_int(&locked_tcb->asoc.refcnt, 1);
 							}
 
 							SCTP_INP_WUNLOCK(inp);
 							SCTP_INP_INFO_RUNLOCK();
 							/* stcb is returned with its TCB lock still held */
 							return (stcb);
 						}
 						break;
 					}
 #endif
 #ifdef INET6
 				case AF_INET6:
 					{
 						struct sockaddr_in6 *sin6,
 						            *rsin6;
 
 						sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
 						rsin6 = (struct sockaddr_in6 *)remote;
 						if (SCTP6_ARE_ADDR_EQUAL(sin6,
 						    rsin6)) {
 							/* found it */
 							if (netp != NULL) {
 								*netp = net;
 							}
 							/* same unwind as the IPv4 match above */
 							if (locked_tcb == NULL) {
 								SCTP_INP_DECR_REF(inp);
 							} else if (locked_tcb != stcb) {
 								SCTP_TCB_LOCK(locked_tcb);
 							}
 							if (locked_tcb) {
 								atomic_subtract_int(&locked_tcb->asoc.refcnt, 1);
 							}
 							SCTP_INP_WUNLOCK(inp);
 							SCTP_INP_INFO_RUNLOCK();
 							return (stcb);
 						}
 						break;
 					}
 #endif
 				default:
 					/* TSNH */
 					break;
 				}
 			}
 			/* no remote address matched; fall out to null_return */
 			SCTP_TCB_UNLOCK(stcb);
 		}
 	} else {
 		/* General case: walk the endpoint's hash bucket selected by rport. */
 		SCTP_INP_WLOCK(inp);
 		if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
 			goto null_return;
 		}
 		head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(rport,
 		    inp->sctp_hashmark)];
 		LIST_FOREACH(stcb, head, sctp_tcbhash) {
 			if (stcb->rport != rport) {
 				/* remote port does not match */
 				continue;
 			}
 			SCTP_TCB_LOCK(stcb);
 			if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 				SCTP_TCB_UNLOCK(stcb);
 				continue;
 			}
 			if (local && !sctp_does_stcb_own_this_addr(stcb, local)) {
 				SCTP_TCB_UNLOCK(stcb);
 				continue;
 			}
 			/* now look at the list of remote addresses */
 			TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 #ifdef INVARIANTS
 				if (net == (TAILQ_NEXT(net, sctp_next))) {
 					panic("Corrupt net list");
 				}
 #endif
 				if (net->ro._l_addr.sa.sa_family !=
 				    remote->sa_family) {
 					/* not the same family */
 					continue;
 				}
 				switch (remote->sa_family) {
 #ifdef INET
 				case AF_INET:
 					{
 						struct sockaddr_in *sin,
 						           *rsin;
 
 						sin = (struct sockaddr_in *)
 						    &net->ro._l_addr;
 						rsin = (struct sockaddr_in *)remote;
 						if (sin->sin_addr.s_addr ==
 						    rsin->sin_addr.s_addr) {
 							/* found it */
 							if (netp != NULL) {
 								*netp = net;
 							}
 							/* same unwind as in the TCP-style branch */
 							if (locked_tcb == NULL) {
 								SCTP_INP_DECR_REF(inp);
 							} else if (locked_tcb != stcb) {
 								SCTP_TCB_LOCK(locked_tcb);
 							}
 							if (locked_tcb) {
 								atomic_subtract_int(&locked_tcb->asoc.refcnt, 1);
 							}
 							SCTP_INP_WUNLOCK(inp);
 							SCTP_INP_INFO_RUNLOCK();
 							return (stcb);
 						}
 						break;
 					}
 #endif
 #ifdef INET6
 				case AF_INET6:
 					{
 						struct sockaddr_in6 *sin6,
 						            *rsin6;
 
 						sin6 = (struct sockaddr_in6 *)
 						    &net->ro._l_addr;
 						rsin6 = (struct sockaddr_in6 *)remote;
 						if (SCTP6_ARE_ADDR_EQUAL(sin6,
 						    rsin6)) {
 							/* found it */
 							if (netp != NULL) {
 								*netp = net;
 							}
 							if (locked_tcb == NULL) {
 								SCTP_INP_DECR_REF(inp);
 							} else if (locked_tcb != stcb) {
 								SCTP_TCB_LOCK(locked_tcb);
 							}
 							if (locked_tcb) {
 								atomic_subtract_int(&locked_tcb->asoc.refcnt, 1);
 							}
 							SCTP_INP_WUNLOCK(inp);
 							SCTP_INP_INFO_RUNLOCK();
 							return (stcb);
 						}
 						break;
 					}
 #endif
 				default:
 					/* TSNH */
 					break;
 				}
 			}
 			/* this candidate did not match; release it and try the next */
 			SCTP_TCB_UNLOCK(stcb);
 		}
 	}
 null_return:
 	/* clean up for returning null */
 	/* Reached with the INP write lock and the INP_INFO read lock held. */
 	if (locked_tcb) {
 		SCTP_TCB_LOCK(locked_tcb);
 		atomic_subtract_int(&locked_tcb->asoc.refcnt, 1);
 	}
 	SCTP_INP_WUNLOCK(inp);
 	SCTP_INP_INFO_RUNLOCK();
 	/* not found */
 	return (NULL);
 }
 
 
 /*
  * Look up the association with identifier asoc_id (the id given out in the
  * COMM_UP notification) on endpoint inp.  This variant takes no endpoint
  * lock itself.  Entries that belong to another endpoint or that are about
  * to be freed are passed over.  When want_lock is non-zero the returned TCB
  * is locked.  Returns NULL when nothing matches.
  */
 struct sctp_tcb *
 sctp_findasoc_ep_asocid_locked(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock)
 {
 	struct sctpasochead *bucket;
 	struct sctp_tcb *cand;
 	uint32_t wanted;
 
 	if (inp == NULL) {
 		SCTP_PRINTF("TSNH ep_associd\n");
 		return (NULL);
 	}
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) != 0) {
 		SCTP_PRINTF("TSNH ep_associd0\n");
 		return (NULL);
 	}
 	/* Use the association id to select the hash bucket to search. */
 	wanted = (uint32_t)asoc_id;
 	bucket = &inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(wanted, inp->hashasocidmark)];
 	if (bucket == NULL) {
 		/* invalid id TSNH */
 		SCTP_PRINTF("TSNH ep_associd1\n");
 		return (NULL);
 	}
 	LIST_FOREACH(cand, bucket, sctp_tcbasocidhash) {
 		if (cand->asoc.assoc_id != wanted) {
 			continue;
 		}
 		if (cand->sctp_ep != inp) {
 			/*
 			 * Another endpoint has the same id active (id
 			 * collision ??).
 			 */
 			SCTP_PRINTF("TSNH ep_associd2\n");
 			continue;
 		}
 		if ((cand->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) != 0) {
 			continue;
 		}
 		if (want_lock != 0) {
 			SCTP_TCB_LOCK(cand);
 		}
 		return (cand);
 	}
 	return (NULL);
 }
 
 
 /*
  * Same lookup as sctp_findasoc_ep_asocid_locked(), but performed with the
  * endpoint's read lock held for the duration of the search.
  */
 struct sctp_tcb *
 sctp_findassociation_ep_asocid(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock)
 {
 	struct sctp_tcb *found;
 
 	SCTP_INP_RLOCK(inp);
 	found = sctp_findasoc_ep_asocid_locked(inp, asoc_id, want_lock);
 	SCTP_INP_RUNLOCK(inp);
 
 	return (found);
 }
 
 
 /*
  * Scan one endpoint hash bucket (head) for an endpoint on local port lport
  * in VRF vrf_id that can accept the address nam.  Endpoints bound to all
  * addresses are tried first; failing that, endpoints with an explicit
  * address list are searched for nam itself.  Every candidate is inspected
  * under its INP read lock, which is released again before returning.
  *
  * Endpoint probe expects that the INP_INFO is locked.
  */
 static struct sctp_inpcb *
 sctp_endpoint_probe(struct sockaddr *nam, struct sctppcbhead *head,
     uint16_t lport, uint32_t vrf_id)
 {
 	struct sctp_inpcb *ep;
 	struct sctp_laddr *bound;
 #ifdef INET
 	struct sockaddr_in *sin = NULL;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sin6 = NULL;
 	struct sockaddr_in6 *intf_addr6;
 #endif
 	int vrf_ok;
 
 	switch (nam->sa_family) {
 #ifdef INET
 	case AF_INET:
 		sin = (struct sockaddr_in *)nam;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		sin6 = (struct sockaddr_in6 *)nam;
 		break;
 #endif
 	default:
 		/* unsupported family */
 		return (NULL);
 	}
 
 	if (head == NULL) {
 		return (NULL);
 	}
 
 	/* Pass 1: endpoints bound to every address on this port. */
 	LIST_FOREACH(ep, head, sctp_hash) {
 		SCTP_INP_RLOCK(ep);
 		if (((ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) != 0) ||
 		    ((ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) ||
 		    (ep->sctp_lport != lport)) {
 			SCTP_INP_RUNLOCK(ep);
 			continue;
 		}
 		/* got it */
 		switch (nam->sa_family) {
 #ifdef INET
 		case AF_INET:
 			if ((ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
 			    SCTP_IPV6_V6ONLY(ep)) {
 				/* IPv4 on a IPv6 socket with ONLY IPv6 set */
 				SCTP_INP_RUNLOCK(ep);
 				continue;
 			}
 			if (prison_check_ip4(ep->ip_inp.inp.inp_cred,
 			    &sin->sin_addr) != 0) {
 				SCTP_INP_RUNLOCK(ep);
 				continue;
 			}
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			/* A V6 address and the endpoint is NOT bound V6 */
 			if ((ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) {
 				SCTP_INP_RUNLOCK(ep);
 				continue;
 			}
 			if (prison_check_ip6(ep->ip_inp.inp.inp_cred,
 			    &sin6->sin6_addr) != 0) {
 				SCTP_INP_RUNLOCK(ep);
 				continue;
 			}
 			break;
 #endif
 		default:
 			break;
 		}
 		/* does a VRF id match?  Sample it before dropping the lock. */
 		vrf_ok = (ep->def_vrf_id == vrf_id);
 		SCTP_INP_RUNLOCK(ep);
 		if (vrf_ok) {
 			return (ep);
 		}
 	}
 
 	/* Can't hunt for one that has no address specified */
 	switch (nam->sa_family) {
 #ifdef INET
 	case AF_INET:
 		if (sin->sin_addr.s_addr == INADDR_ANY) {
 			return (NULL);
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 			return (NULL);
 		}
 		break;
 #endif
 	default:
 		break;
 	}
 
 	/*
 	 * Pass 2: nothing bound to all addresses qualified, so look for an
 	 * endpoint that is explicitly bound to this address.
 	 */
 	LIST_FOREACH(ep, head, sctp_hash) {
 		SCTP_INP_RLOCK(ep);
 		if (((ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) != 0) ||
 		    ((ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) != 0) ||
 		    (ep->sctp_lport != lport) ||
 		    (ep->def_vrf_id != vrf_id)) {
 			SCTP_INP_RUNLOCK(ep);
 			continue;
 		}
 		/* A likely candidate: look at all of its addresses. */
 		LIST_FOREACH(bound, &ep->sctp_addr_list, sctp_nxt_addr) {
 			if (bound->ifa == NULL) {
 				SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n",
 				    __func__);
 				continue;
 			}
 			SCTPDBG(SCTP_DEBUG_PCB1, "Ok laddr->ifa:%p is possible, ",
 			    (void *)bound->ifa);
 			if (bound->ifa->localifa_flags & SCTP_BEING_DELETED) {
 				SCTPDBG(SCTP_DEBUG_PCB1, "Huh IFA being deleted\n");
 				continue;
 			}
 			if (bound->ifa->address.sa.sa_family != nam->sa_family) {
 				continue;
 			}
 			/* possible, see if it matches */
 			switch (nam->sa_family) {
 #ifdef INET
 			case AF_INET:
 				if (sin->sin_addr.s_addr ==
 				    bound->ifa->address.sin.sin_addr.s_addr) {
 					SCTP_INP_RUNLOCK(ep);
 					return (ep);
 				}
 				break;
 #endif
 #ifdef INET6
 			case AF_INET6:
 				intf_addr6 = &bound->ifa->address.sin6;
 				if (SCTP6_ARE_ADDR_EQUAL(sin6,
 				    intf_addr6)) {
 					SCTP_INP_RUNLOCK(ep);
 					return (ep);
 				}
 				break;
 #endif
 			}
 		}
 		SCTP_INP_RUNLOCK(ep);
 	}
 	return (NULL);
 }
 
 
 /*
  * Check whether local port lport is already taken in VRF vrf_id by an
  * endpoint whose address-family binding conflicts with that of inp.
  * Returns the first conflicting endpoint found in the port's hash bucket,
  * or NULL when there is none.  No locks are taken here.
  */
 static struct sctp_inpcb *
 sctp_isport_inuse(struct sctp_inpcb *inp, uint16_t lport, uint32_t vrf_id)
 {
 	struct sctppcbhead *bucket;
 	struct sctp_inpcb *other;
 
 	bucket = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport,
 	    SCTP_BASE_INFO(hashmark))];
 	LIST_FOREACH(other, bucket, sctp_hash) {
 		/* Only endpoints on this port and in this VRF matter. */
 		if ((other->sctp_lport != lport) ||
 		    (other->def_vrf_id != vrf_id)) {
 			continue;
 		}
 		/* This one is in use; check the v6/v4 binding issue. */
 		if (other->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
 			if (!SCTP_IPV6_V6ONLY(other)) {
 				/* bound v4 and v6, conflict always */
 				return (other);
 			}
 			if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
 				/* collision in V6 space */
 				return (other);
 			}
 			/* inp is BOUND_V4 no conflict */
 			continue;
 		}
 		/* other is bound only V4 */
 		if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
 		    SCTP_IPV6_V6ONLY(inp)) {
 			/* no conflict */
 			continue;
 		}
 		return (other);
 	}
 	return (NULL);
 }
 
 
 /*
  * For 1-2-1 sockets with port reuse: move inp out of the TCP pool hash and
  * back into the main endpoint hash, after pushing every non-listening
  * endpoint that shares its port from the main hash into the TCP pool hash.
  *
  * Returns -1 when the port-reuse feature is off for inp, otherwise 0 (also
  * when inp is not in the TCP pool, in which case nothing is changed).
  *
  * NOTE(review): the INP read lock on inp is dropped and re-taken here, so
  * the caller presumably holds it on entry and still holds it on return --
  * confirm against the callers.
  */
 int
 sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp)
 {
 	struct sctppcbhead *ep_bucket, *tcp_bucket;
 	struct sctp_inpcb *cur, *nxt;
 
 	if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) {
 		/* only works with port reuse on */
 		return (-1);
 	}
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) == 0) {
 		return (0);
 	}
 	SCTP_INP_RUNLOCK(inp);
 	SCTP_INP_INFO_WLOCK();
 	ep_bucket = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport,
 	    SCTP_BASE_INFO(hashmark))];
 	/* Kick out all non-listeners to the TCP hash */
 	LIST_FOREACH_SAFE(cur, ep_bucket, sctp_hash, nxt) {
 		if ((cur->sctp_lport != inp->sctp_lport) ||
 		    (cur->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
 		    (cur->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
 		    SCTP_IS_LISTENING(cur)) {
 			continue;
 		}
 		SCTP_INP_WLOCK(cur);
 		LIST_REMOVE(cur, sctp_hash);
 		tcp_bucket = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR(cur->sctp_lport, SCTP_BASE_INFO(hashtcpmark))];
 		cur->sctp_flags |= SCTP_PCB_FLAGS_IN_TCPPOOL;
 		LIST_INSERT_HEAD(tcp_bucket, cur, sctp_hash);
 		SCTP_INP_WUNLOCK(cur);
 	}
 	/* Pull inp from where it was and put it at the head of the main hash. */
 	SCTP_INP_WLOCK(inp);
 	LIST_REMOVE(inp, sctp_hash);
 	inp->sctp_flags &= ~SCTP_PCB_FLAGS_IN_TCPPOOL;
 	ep_bucket = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport, SCTP_BASE_INFO(hashmark))];
 	LIST_INSERT_HEAD(ep_bucket, inp, sctp_hash);
 	SCTP_INP_WUNLOCK(inp);
 	SCTP_INP_RLOCK(inp);
 	SCTP_INP_INFO_WUNLOCK();
 	return (0);
 }
 
 
 /*
  * Find the endpoint bound to the port (and address) in nam within VRF
  * vrf_id.  The main endpoint hash is probed first.  When find_tcp_pool is
  * set and that finds nothing, every bucket of the TCP-pool hash is probed
  * as well.  A non-zero have_lock means the caller already holds the
  * INP_INFO lock; otherwise the read lock is taken and released here.
  *
  * A returned endpoint has had its reference count incremented.  Returns
  * NULL for an unsupported address family or when no endpoint is found.
  */
 struct sctp_inpcb *
 sctp_pcb_findep(struct sockaddr *nam, int find_tcp_pool, int have_lock,
     uint32_t vrf_id)
 {
 	struct sctp_inpcb *ep;
 	struct sctppcbhead *bucket;
 	unsigned int slot;
 	int lport;
 
 	/*
 	 * Read the port through the proper per-family type rather than a
 	 * blind cast; this also rejects unsupported families.
 	 */
 	switch (nam->sa_family) {
 #ifdef INET
 	case AF_INET:
 		lport = ((struct sockaddr_in *)nam)->sin_port;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		lport = ((struct sockaddr_in6 *)nam)->sin6_port;
 		break;
 #endif
 	default:
 		return (NULL);
 	}
 	if (have_lock == 0) {
 		SCTP_INP_INFO_RLOCK();
 	}
 	/* Find the head of the ALLADDR chain */
 	bucket = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport,
 	    SCTP_BASE_INFO(hashmark))];
 	ep = sctp_endpoint_probe(nam, bucket, lport, vrf_id);
 
 	/*
 	 * With the TCP model the main listening endpoint may be gone while
 	 * a connected socket for it still exists.  In that case the first
 	 * endpoint found in the TCP pool is returned.  It may NOT be the
 	 * correct one, so the caller should be wary of the returned INP.
 	 * Currently the only caller that sets find_tcp_pool is bindx, which
 	 * only verifies that a user CAN bind the address: either he has
 	 * bound it already, or someone else has, or it is open to bind, so
 	 * this is good enough.
 	 */
 	if ((ep == NULL) && find_tcp_pool) {
 		for (slot = 0; slot < SCTP_BASE_INFO(hashtcpmark) + 1; slot++) {
 			bucket = &SCTP_BASE_INFO(sctp_tcpephash)[slot];
 			ep = sctp_endpoint_probe(nam, bucket, lport, vrf_id);
 			if (ep != NULL) {
 				break;
 			}
 		}
 	}
 	if (ep != NULL) {
 		SCTP_INP_INCR_REF(ep);
 	}
 	if (have_lock == 0) {
 		SCTP_INP_INFO_RUNLOCK();
 	}
 	return (ep);
 }
 
 
 /*
  * Find the association for the address pair (from, to) in VRF vrf_id.  The
  * addresses may be IPv4 or IPv6.  When find_tcp_pool is set the TCP-pool
  * hash is consulted first via sctp_tcb_special_locate().  Otherwise, or if
  * that finds nothing, the endpoint bound to `to` is looked up and then
  * searched for an association with peer `from`.
  *
  * If inp_p is non-NULL it receives the endpoint that was found (NULL when
  * no endpoint is bound to `to`).  We may need to change the *to to some
  * other struct like a mbuf...
  */
 struct sctp_tcb *
 sctp_findassociation_addr_sa(struct sockaddr *from, struct sockaddr *to,
     struct sctp_inpcb **inp_p, struct sctp_nets **netp, int find_tcp_pool,
     uint32_t vrf_id)
 {
 	struct sctp_inpcb *inp = NULL;
 	struct sctp_inpcb **ep_slot;
 	struct sctp_tcb *stcb;
 
 	/* Lookups write the endpoint to the caller's slot, or to our local. */
 	ep_slot = (inp_p != NULL) ? inp_p : &inp;
 
 	SCTP_INP_INFO_RLOCK();
 	if (find_tcp_pool) {
 		stcb = sctp_tcb_special_locate(ep_slot, from, to, netp,
 		    vrf_id);
 		if (stcb != NULL) {
 			SCTP_INP_INFO_RUNLOCK();
 			return (stcb);
 		}
 	}
 	inp = sctp_pcb_findep(to, 0, 1, vrf_id);
 	if (inp_p != NULL) {
 		*inp_p = inp;
 	}
 	SCTP_INP_INFO_RUNLOCK();
 	if (inp == NULL) {
 		return (NULL);
 	}
 	/*
 	 * We have an endpoint, so look for the association on it (if any).
 	 * The source address (from) goes in the remote slot of the find
 	 * call, since this chain is used from the inbound packet side.
 	 */
 	stcb = sctp_findassociation_ep_addr(ep_slot, from, netp, to, NULL);
 	return (stcb);
 }
 
 
 /*
  * This routine will grub through the mbuf that is a INIT or INIT-ACK and
  * find all addresses that the sender has specified in any address list. Each
  * address will be used to lookup the TCB and see if one exits.
  */
 static struct sctp_tcb *
 sctp_findassociation_special_addr(struct mbuf *m, int offset,
     struct sctphdr *sh, struct sctp_inpcb **inp_p, struct sctp_nets **netp,
     struct sockaddr *dst)
 {
 	struct sctp_paramhdr *phdr, param_buf;
 #if defined(INET) || defined(INET6)
 	struct sctp_tcb *stcb;
 	uint16_t ptype;
 #endif
 	uint16_t plen;
 #ifdef INET
 	struct sockaddr_in sin4;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 sin6;
 #endif
 
 #ifdef INET
 	memset(&sin4, 0, sizeof(sin4));
 	sin4.sin_len = sizeof(sin4);
 	sin4.sin_family = AF_INET;
 	sin4.sin_port = sh->src_port;
 #endif
 #ifdef INET6
 	memset(&sin6, 0, sizeof(sin6));
 	sin6.sin6_len = sizeof(sin6);
 	sin6.sin6_family = AF_INET6;
 	sin6.sin6_port = sh->src_port;
 #endif
 
 	offset += sizeof(struct sctp_init_chunk);
 
 	phdr = sctp_get_next_param(m, offset, &param_buf, sizeof(param_buf));
 	while (phdr != NULL) {
 		/* now we must see if we want the parameter */
 #if defined(INET) || defined(INET6)
 		ptype = ntohs(phdr->param_type);
 #endif
 		plen = ntohs(phdr->param_length);
 		if (plen == 0) {
 			break;
 		}
 #ifdef INET
 		if (ptype == SCTP_IPV4_ADDRESS &&
 		    plen == sizeof(struct sctp_ipv4addr_param)) {
 			/* Get the rest of the address */
 			struct sctp_ipv4addr_param ip4_param, *p4;
 
 			phdr = sctp_get_next_param(m, offset,
 			    (struct sctp_paramhdr *)&ip4_param, sizeof(ip4_param));
 			if (phdr == NULL) {
 				return (NULL);
 			}
 			p4 = (struct sctp_ipv4addr_param *)phdr;
 			memcpy(&sin4.sin_addr, &p4->addr, sizeof(p4->addr));
 			/* look it up */
 			stcb = sctp_findassociation_ep_addr(inp_p,
 			    (struct sockaddr *)&sin4, netp, dst, NULL);
 			if (stcb != NULL) {
 				return (stcb);
 			}
 		}
 #endif
 #ifdef INET6
 		if (ptype == SCTP_IPV6_ADDRESS &&
 		    plen == sizeof(struct sctp_ipv6addr_param)) {
 			/* Get the rest of the address */
 			struct sctp_ipv6addr_param ip6_param, *p6;
 
 			phdr = sctp_get_next_param(m, offset,
 			    (struct sctp_paramhdr *)&ip6_param, sizeof(ip6_param));
 			if (phdr == NULL) {
 				return (NULL);
 			}
 			p6 = (struct sctp_ipv6addr_param *)phdr;
 			memcpy(&sin6.sin6_addr, &p6->addr, sizeof(p6->addr));
 			/* look it up */
 			stcb = sctp_findassociation_ep_addr(inp_p,
 			    (struct sockaddr *)&sin6, netp, dst, NULL);
 			if (stcb != NULL) {
 				return (stcb);
 			}
 		}
 #endif
 		offset += SCTP_SIZE32(plen);
 		phdr = sctp_get_next_param(m, offset, &param_buf,
 		    sizeof(param_buf));
 	}
 	return (NULL);
 }
 
 /*
  * Look up an association by our own verification tag (vtag).  If we find it
  * we then verify the source address is in the assoc; when that works it
  * saves a bit of work on receipt of a packet.
  *
  * A candidate must live in VRF vrf_id, match rport and lport, not be about
  * to be freed, and own the destination address `to`.  If remote_tag is
  * non-zero and equals the peer's tag, or if skip_src_check is set, the match
  * is accepted without checking the source address; *netp is then whatever
  * sctp_findnet() returns for `from`, or NULL when from is NULL.  Otherwise
  * `from` must be one of the association's nets.
  *
  * On success the TCB is returned locked, *netp is set and, when inp_p is
  * non-NULL, *inp_p is set to the owning endpoint.  Returns NULL when no
  * association qualifies.  The INP_INFO read lock is taken and released
  * here.
  */
 static struct sctp_tcb *
 sctp_findassoc_by_vtag(struct sockaddr *from, struct sockaddr *to, uint32_t vtag,
     struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint16_t rport,
     uint16_t lport, int skip_src_check, uint32_t vrf_id, uint32_t remote_tag)
 {
 	struct sctpasochead *head;
 	struct sctp_nets *net;
 	struct sctp_tcb *stcb;
 
 	SCTP_INP_INFO_RLOCK();
 	head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(vtag,
 	    SCTP_BASE_INFO(hashasocmark))];
 	LIST_FOREACH(stcb, head, sctp_asocs) {
 		SCTP_INP_RLOCK(stcb->sctp_ep);
 		if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
 			SCTP_INP_RUNLOCK(stcb->sctp_ep);
 			continue;
 		}
 		if (stcb->sctp_ep->def_vrf_id != vrf_id) {
 			SCTP_INP_RUNLOCK(stcb->sctp_ep);
 			continue;
 		}
 		/* Take the TCB lock before letting go of the endpoint lock. */
 		SCTP_TCB_LOCK(stcb);
 		SCTP_INP_RUNLOCK(stcb->sctp_ep);
 		if (stcb->asoc.my_vtag == vtag) {
 			/* candidate */
 			if (stcb->rport != rport) {
 				SCTP_TCB_UNLOCK(stcb);
 				continue;
 			}
 			if (stcb->sctp_ep->sctp_lport != lport) {
 				SCTP_TCB_UNLOCK(stcb);
 				continue;
 			}
 			if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 				SCTP_TCB_UNLOCK(stcb);
 				continue;
 			}
 			/* RRS:Need toaddr check here */
 			if (sctp_does_stcb_own_this_addr(stcb, to) == 0) {
 				/* Endpoint does not own this address */
 				SCTP_TCB_UNLOCK(stcb);
 				continue;
 			}
 			/*
 			 * If both tags match we consider it conclusive and
 			 * check NO source/destination addresses; the same
 			 * shortcut applies when the caller asked to skip
 			 * the source check.
 			 */
 			if (((remote_tag != 0) &&
 			    (stcb->asoc.peer_vtag == remote_tag)) ||
 			    skip_src_check) {
 				if (from) {
 					*netp = sctp_findnet(stcb, from);
 				} else {
 					*netp = NULL;	/* unknown */
 				}
 				if (inp_p)
 					*inp_p = stcb->sctp_ep;
 				SCTP_INP_INFO_RUNLOCK();
 				return (stcb);
 			}
 			net = sctp_findnet(stcb, from);
 			if (net) {
 				/* yep its him. */
 				*netp = net;
 				SCTP_STAT_INCR(sctps_vtagexpress);
 				/*
 				 * inp_p is optional, as on the conclusive
 				 * path above; do not write through NULL.
 				 */
 				if (inp_p)
 					*inp_p = stcb->sctp_ep;
 				SCTP_INP_INFO_RUNLOCK();
 				return (stcb);
 			} else {
 				/*
 				 * not him, this should only happen in rare
 				 * cases so I peg it.
 				 */
 				SCTP_STAT_INCR(sctps_vtagbogus);
 			}
 		}
 		SCTP_TCB_UNLOCK(stcb);
 	}
 	SCTP_INP_INFO_RUNLOCK();
 	return (NULL);
 }
 
 
 /*
  * Find the association an inbound IPv4 or IPv6 packet belongs to.  A
  * non-zero verification tag in sh is tried first; failing that, the lookup
  * goes by the src/dst addresses.  If an endpoint but no association is
  * found and the chunk ch is an INIT or INIT-ACK, the addresses listed in
  * the chunk (mbuf m at offset) are tried as well.
  *
  * When inp_p is non-NULL it receives the endpoint that was found.
  */
 struct sctp_tcb *
 sctp_findassociation_addr(struct mbuf *m, int offset,
     struct sockaddr *src, struct sockaddr *dst,
     struct sctphdr *sh, struct sctp_chunkhdr *ch,
     struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id)
 {
 	struct sctp_tcb *stcb;
 	struct sctp_inpcb *inp;
 
 	/* we only go down the vtag path if vtag is non-zero */
 	if (sh->v_tag != 0) {
 		stcb = sctp_findassoc_by_vtag(src, dst, ntohl(sh->v_tag),
 		    inp_p, netp, sh->src_port, sh->dest_port, 0, vrf_id, 0);
 		if (stcb != NULL) {
 			return (stcb);
 		}
 	}
 
 	if (inp_p != NULL) {
 		stcb = sctp_findassociation_addr_sa(src, dst, inp_p, netp,
 		    1, vrf_id);
 		inp = *inp_p;
 	} else {
 		stcb = sctp_findassociation_addr_sa(src, dst, &inp, netp,
 		    1, vrf_id);
 	}
 	SCTPDBG(SCTP_DEBUG_PCB1, "stcb:%p inp:%p\n", (void *)stcb, (void *)inp);
 
 	/* Found a EP but not this address: INIT/INIT-ACK get a second look. */
 	if ((stcb == NULL) && (inp != NULL) &&
 	    ((ch->chunk_type == SCTP_INITIATION) ||
 	    (ch->chunk_type == SCTP_INITIATION_ACK))) {
 		/*-
 		 * special hook, we do NOT return linp or an
 		 * association that is linked to an existing
 		 * association that is under the TCP pool (i.e. no
 		 * listener exists). The endpoint finding routine
 		 * will always find a listener before examining the
 		 * TCP pool.
 		 */
 		if (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) {
 			if (inp_p != NULL) {
 				*inp_p = NULL;
 			}
 			return (NULL);
 		}
 		stcb = sctp_findassociation_special_addr(m,
 		    offset, sh, &inp, netp, dst);
 		if (inp_p != NULL) {
 			*inp_p = inp;
 		}
 	}
 	SCTPDBG(SCTP_DEBUG_PCB1, "stcb is %p\n", (void *)stcb);
 	return (stcb);
 }
 
 /*
  * Look up an association using the lookup address carried as the first
  * parameter of an ASCONF chunk (mbuf m at offset).  If the lookup address
  * is 0.0.0.0 or ::0, the verification tag from sh is used for the lookup
  * instead.  Returns NULL when the parameter cannot be read, has the wrong
  * length, or is not an IPv4/IPv6 address parameter.
  */
 struct sctp_tcb *
 sctp_findassociation_ep_asconf(struct mbuf *m, int offset,
     struct sockaddr *dst, struct sctphdr *sh,
     struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id)
 {
 	struct sctp_tcb *stcb;
 	union sctp_sockstore remote_store;
 	struct sctp_paramhdr hdr_buf, *hdr;
 	int kind;
 	uint16_t len;
 	int addr_is_zero = 0;
 #ifdef INET
 	struct sockaddr_in *sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
 
 	memset(&remote_store, 0, sizeof(remote_store));
 	hdr = sctp_get_next_param(m, offset + sizeof(struct sctp_asconf_chunk),
 	    &hdr_buf, sizeof(struct sctp_paramhdr));
 	if (hdr == NULL) {
 		SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf lookup addr\n",
 		    __func__);
 		return (NULL);
 	}
 	kind = (int)((uint32_t)ntohs(hdr->param_type));
 	len = ntohs(hdr->param_length);
 	/* get the correlation address */
 	switch (kind) {
 #ifdef INET6
 	case SCTP_IPV6_ADDRESS:
 		{
 			/* ipv6 address param */
 			struct sctp_ipv6addr_param *v6, v6_buf;
 
 			if (len != sizeof(struct sctp_ipv6addr_param)) {
 				return (NULL);
 			}
 			v6 = (struct sctp_ipv6addr_param *)sctp_get_next_param(m,
 			    offset + sizeof(struct sctp_asconf_chunk),
 			    &v6_buf.ph, sizeof(v6_buf));
 			if (v6 == NULL) {
 				SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v6 lookup addr\n",
 				    __func__);
 				return (NULL);
 			}
 			sin6 = &remote_store.sin6;
 			sin6->sin6_family = AF_INET6;
 			sin6->sin6_len = sizeof(*sin6);
 			sin6->sin6_port = sh->src_port;
 			memcpy(&sin6->sin6_addr, &v6->addr, sizeof(struct in6_addr));
 			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 				addr_is_zero = 1;
 			}
 			break;
 		}
 #endif
 #ifdef INET
 	case SCTP_IPV4_ADDRESS:
 		{
 			/* ipv4 address param */
 			struct sctp_ipv4addr_param *v4, v4_buf;
 
 			if (len != sizeof(struct sctp_ipv4addr_param)) {
 				return (NULL);
 			}
 			v4 = (struct sctp_ipv4addr_param *)sctp_get_next_param(m,
 			    offset + sizeof(struct sctp_asconf_chunk),
 			    &v4_buf.ph, sizeof(v4_buf));
 			if (v4 == NULL) {
 				SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v4 lookup addr\n",
 				    __func__);
 				return (NULL);
 			}
 			sin = &remote_store.sin;
 			sin->sin_family = AF_INET;
 			sin->sin_len = sizeof(*sin);
 			sin->sin_port = sh->src_port;
 			memcpy(&sin->sin_addr, &v4->addr, sizeof(struct in_addr));
 			if (sin->sin_addr.s_addr == INADDR_ANY) {
 				addr_is_zero = 1;
 			}
 			break;
 		}
 #endif
 	default:
 		/* invalid address param type */
 		return (NULL);
 	}
 
 	if (!addr_is_zero) {
 		/* A real lookup address: search the endpoint for that peer. */
 		stcb = sctp_findassociation_ep_addr(inp_p,
 		    &remote_store.sa, netp,
 		    dst, NULL);
 		return (stcb);
 	}
 	/* Wildcard lookup address: fall back to the verification tag. */
 	stcb = sctp_findassoc_by_vtag(NULL, dst, ntohl(sh->v_tag), inp_p,
 	    netp, sh->src_port, sh->dest_port, 1, vrf_id, 0);
 	if (stcb != NULL) {
 		SCTP_INP_DECR_REF(*inp_p);
 	}
 	return (stcb);
 }
 
 
 /*
  * Allocate and initialize the SCTP endpoint (sctp_inpcb) for a new socket.
  *
  * The endpoint is taken from the endpoint zone, zeroed, attached to 'so'
  * through so->so_pcb, flagged SCTP_PCB_FLAGS_UNBOUND, linked onto the
  * global endpoint list and populated with defaults read from the SCTP
  * sysctls.  No port or address is bound here; that is left to a later
  * bind.
  *
  * so     - socket the endpoint belongs to; must be SOCK_SEQPACKET or
  *          SOCK_STREAM.
  * vrf_id - stored as the endpoint's default VRF id.
  *
  * Returns 0 on success, ENOBUFS if the endpoint or one of its hash tables
  * cannot be allocated, or EOPNOTSUPP for any other socket type.  Every
  * failure path releases what was allocated up to that point; the paths
  * taken after so->so_pcb has been set also reset it to NULL.
  */
 int
 sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id)
 {
 	/*
 	 * we get called when a new endpoint starts up. We need to allocate
 	 * the sctp_inpcb structure from the zone and init it. Mark it as
 	 * unbound and find a port that we can use as an ephemeral with
 	 * INADDR_ANY. If the user binds later no problem we can then add in
 	 * the specific addresses. And setup the default parameters for the
 	 * EP.
 	 */
 	int i, error;
 	struct sctp_inpcb *inp;
 	struct sctp_pcb *m;
 	struct timeval time;
 	sctp_sharedkey_t *null_key;
 
 	error = 0;
 
 	SCTP_INP_INFO_WLOCK();
 	inp = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_ep), struct sctp_inpcb);
 	if (inp == NULL) {
 		SCTP_PRINTF("Out of SCTP-INPCB structures - no resources\n");
 		SCTP_INP_INFO_WUNLOCK();
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS);
 		return (ENOBUFS);
 	}
 	/* zap it */
 	memset(inp, 0, sizeof(*inp));
 
 	/* setup socket pointers */
 	inp->sctp_socket = so;
 	inp->ip_inp.inp.inp_socket = so;
 	/* hold a reference on the socket's credentials; dropped on failure */
 	inp->ip_inp.inp.inp_cred = crhold(so->so_cred);
 #ifdef INET6
 	if (INP_SOCKAF(so) == AF_INET6) {
 		if (MODULE_GLOBAL(ip6_auto_flowlabel)) {
 			inp->ip_inp.inp.inp_flags |= IN6P_AUTOFLOWLABEL;
 		}
 		if (MODULE_GLOBAL(ip6_v6only)) {
 			inp->ip_inp.inp.inp_flags |= IN6P_IPV6_V6ONLY;
 		}
 	}
 #endif
 	inp->sctp_associd_counter = 1;
 	inp->partial_delivery_point = SCTP_SB_LIMIT_RCV(so) >> SCTP_PARTIAL_DELIVERY_SHIFT;
 	inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT;
 	inp->max_cwnd = 0;
 	/* per-endpoint feature switches start out at the sysctl defaults */
 	inp->sctp_cmt_on_off = SCTP_BASE_SYSCTL(sctp_cmt_on_off);
 	inp->ecn_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_ecn_enable);
 	inp->prsctp_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_pr_enable);
 	inp->auth_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_auth_enable);
 	inp->asconf_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_asconf_enable);
 	inp->reconfig_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_reconfig_enable);
 	inp->nrsack_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_nrsack_enable);
 	inp->pktdrop_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_pktdrop_enable);
 	inp->idata_supported = 0;
 
 	inp->fibnum = so->so_fibnum;
 	/* init the small hash table we use to track asocid <-> tcb */
 	inp->sctp_asocidhash = SCTP_HASH_INIT(SCTP_STACK_VTAG_HASH_SIZE, &inp->hashasocidmark);
 	if (inp->sctp_asocidhash == NULL) {
 		crfree(inp->ip_inp.inp.inp_cred);
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp);
 		SCTP_INP_INFO_WUNLOCK();
 		return (ENOBUFS);
 	}
 	/*
 	 * From here on every failure path must free sctp_asocidhash and
 	 * undo this count in addition to releasing the endpoint itself.
 	 */
 	SCTP_INCR_EP_COUNT();
 	inp->ip_inp.inp.inp_ip_ttl = MODULE_GLOBAL(ip_defttl);
 	SCTP_INP_INFO_WUNLOCK();
 
 	so->so_pcb = (caddr_t)inp;
 
 	if (SCTP_SO_TYPE(so) == SOCK_SEQPACKET) {
 		/* UDP style socket */
 		inp->sctp_flags = (SCTP_PCB_FLAGS_UDPTYPE |
 		    SCTP_PCB_FLAGS_UNBOUND);
 		/* Be sure it is NON-BLOCKING IO for UDP */
 		/* SCTP_SET_SO_NBIO(so); */
 	} else if (SCTP_SO_TYPE(so) == SOCK_STREAM) {
 		/* TCP style socket */
 		inp->sctp_flags = (SCTP_PCB_FLAGS_TCPTYPE |
 		    SCTP_PCB_FLAGS_UNBOUND);
 		/* Be sure we have blocking IO by default */
 		SOCK_LOCK(so);
 		SCTP_CLEAR_SO_NBIO(so);
 		SOCK_UNLOCK(so);
 	} else {
 		/*
 		 * unsupported socket type (RAW, etc)- in case we missed it
 		 * in protosw
 		 */
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EOPNOTSUPP);
 		so->so_pcb = NULL;
 		crfree(inp->ip_inp.inp.inp_cred);
 		/* release the asocid hash and the endpoint count taken above */
 		SCTP_HASH_FREE(inp->sctp_asocidhash, inp->hashasocidmark);
 		SCTP_DECR_EP_COUNT();
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp);
 		return (EOPNOTSUPP);
 	}
 	if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_1) {
 		sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE);
 		sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS);
 	} else if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_2) {
 		sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE);
 		sctp_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS);
 	} else if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_0) {
 		sctp_feature_off(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE);
 		sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS);
 	}
 	inp->sctp_tcbhash = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_pcbtblsize),
 	    &inp->sctp_hashmark);
 	if (inp->sctp_tcbhash == NULL) {
 		SCTP_PRINTF("Out of SCTP-INPCB->hashinit - no resources\n");
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS);
 		so->so_pcb = NULL;
 		crfree(inp->ip_inp.inp.inp_cred);
 		/* release the asocid hash and the endpoint count taken above */
 		SCTP_HASH_FREE(inp->sctp_asocidhash, inp->hashasocidmark);
 		SCTP_DECR_EP_COUNT();
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp);
 		return (ENOBUFS);
 	}
 	inp->def_vrf_id = vrf_id;
 
 	SCTP_INP_INFO_WLOCK();
 	SCTP_INP_LOCK_INIT(inp);
 	INP_LOCK_INIT(&inp->ip_inp.inp, "inp", "sctpinp");
 	SCTP_INP_READ_INIT(inp);
 	SCTP_ASOC_CREATE_LOCK_INIT(inp);
 	/* lock the new ep */
 	SCTP_INP_WLOCK(inp);
 
 	/* add it to the info area */
 	LIST_INSERT_HEAD(&SCTP_BASE_INFO(listhead), inp, sctp_list);
 	SCTP_INP_INFO_WUNLOCK();
 
 	TAILQ_INIT(&inp->read_queue);
 	LIST_INIT(&inp->sctp_addr_list);
 
 	LIST_INIT(&inp->sctp_asoc_list);
 
 #ifdef SCTP_TRACK_FREED_ASOCS
 	/* TEMP CODE */
 	LIST_INIT(&inp->sctp_asoc_free_list);
 #endif
 	/* Init the timer structure for signature change */
 	SCTP_OS_TIMER_INIT(&inp->sctp_ep.signature_change.timer);
 	inp->sctp_ep.signature_change.type = SCTP_TIMER_TYPE_NEWCOOKIE;
 
 	/* now init the actual endpoint default data */
 	m = &inp->sctp_ep;
 
 	/* setup the base timeout information */
 	m->sctp_timeoutticks[SCTP_TIMER_SEND] = sctp_secs_to_ticks(SCTP_SEND_SEC);	/* needed ? */
 	m->sctp_timeoutticks[SCTP_TIMER_INIT] = sctp_secs_to_ticks(SCTP_INIT_SEC);	/* needed ? */
 	m->sctp_timeoutticks[SCTP_TIMER_RECV] = sctp_msecs_to_ticks(SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default));
 	m->sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = sctp_msecs_to_ticks(SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default));
 	m->sctp_timeoutticks[SCTP_TIMER_PMTU] = sctp_secs_to_ticks(SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default));
 	m->sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN] = sctp_secs_to_ticks(SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default));
 	m->sctp_timeoutticks[SCTP_TIMER_SIGNATURE] = sctp_secs_to_ticks(SCTP_BASE_SYSCTL(sctp_secret_lifetime_default));
 	/* all max/min max are in ms */
 	m->sctp_maxrto = SCTP_BASE_SYSCTL(sctp_rto_max_default);
 	m->sctp_minrto = SCTP_BASE_SYSCTL(sctp_rto_min_default);
 	m->initial_rto = SCTP_BASE_SYSCTL(sctp_rto_initial_default);
 	m->initial_init_rto_max = SCTP_BASE_SYSCTL(sctp_init_rto_max_default);
 	m->sctp_sack_freq = SCTP_BASE_SYSCTL(sctp_sack_freq_default);
 	m->max_init_times = SCTP_BASE_SYSCTL(sctp_init_rtx_max_default);
 	m->max_send_times = SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default);
 	m->def_net_failure = SCTP_BASE_SYSCTL(sctp_path_rtx_max_default);
 	m->def_net_pf_threshold = SCTP_BASE_SYSCTL(sctp_path_pf_threshold);
 	m->sctp_sws_sender = SCTP_SWS_SENDER_DEF;
 	m->sctp_sws_receiver = SCTP_SWS_RECEIVER_DEF;
 	m->max_burst = SCTP_BASE_SYSCTL(sctp_max_burst_default);
 	m->fr_max_burst = SCTP_BASE_SYSCTL(sctp_fr_max_burst_default);
 
 	m->sctp_default_cc_module = SCTP_BASE_SYSCTL(sctp_default_cc_module);
 	m->sctp_default_ss_module = SCTP_BASE_SYSCTL(sctp_default_ss_module);
 	m->max_open_streams_intome = SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default);
 	/* number of streams to pre-open on a association */
 	m->pre_open_stream_count = SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default);
 
 	m->default_mtu = 0;
 	/* Add adaptation cookie */
 	m->adaptation_layer_indicator = 0;
 	m->adaptation_layer_indicator_provided = 0;
 
 	/* seed random number generator */
 	m->random_counter = 1;
 	m->store_at = SCTP_SIGNATURE_SIZE;
 	SCTP_READ_RANDOM(m->random_numbers, sizeof(m->random_numbers));
 	sctp_fill_random_store(m);
 
 	/* Minimum cookie size */
 	m->size_of_a_cookie = (sizeof(struct sctp_init_msg) * 2) +
 	    sizeof(struct sctp_state_cookie);
 	m->size_of_a_cookie += SCTP_SIGNATURE_SIZE;
 
 	/* Setup the initial secret */
 	(void)SCTP_GETTIME_TIMEVAL(&time);
 	m->time_of_secret_change = time.tv_sec;
 
 	/* only the first secret slot is filled here */
 	for (i = 0; i < SCTP_NUMBER_OF_SECRETS; i++) {
 		m->secret_key[0][i] = sctp_select_initial_TSN(m);
 	}
 	sctp_timer_start(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL);
 
 	/* How long is a cookie good for ? */
 	m->def_cookie_life = sctp_msecs_to_ticks(SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default));
 	/*
 	 * Initialize authentication parameters
 	 */
 	m->local_hmacs = sctp_default_supported_hmaclist();
 	m->local_auth_chunks = sctp_alloc_chunklist();
 	if (inp->asconf_supported) {
 		sctp_auth_add_chunk(SCTP_ASCONF, m->local_auth_chunks);
 		sctp_auth_add_chunk(SCTP_ASCONF_ACK, m->local_auth_chunks);
 	}
 	m->default_dscp = 0;
 #ifdef INET6
 	m->default_flowlabel = 0;
 #endif
 	m->port = 0;		/* encapsulation disabled by default */
 	LIST_INIT(&m->shared_keys);
 	/* add default NULL key as key id 0 */
 	null_key = sctp_alloc_sharedkey();
 	sctp_insert_sharedkey(&m->shared_keys, null_key);
 	SCTP_INP_WUNLOCK(inp);
 #ifdef SCTP_LOG_CLOSING
 	sctp_log_closing(inp, NULL, 12);
 #endif
 	return (error);
 }
 
 
 /*
  * Move the association 'stcb' from endpoint 'old_inp' to endpoint
  * 'new_inp'.
  *
  * The cookie secret state and local port are copied from old_inp to
  * new_inp, stcb is unlinked from old_inp's lists and linked onto
  * new_inp's, new_inp is inserted into the global TCP-model endpoint hash
  * and loses its UNBOUND flag, and every association/net timer that still
  * points at old_inp is retargeted (moving the endpoint reference with it).
  * For a subset-bound endpoint the local address list is duplicated onto
  * new_inp.
  *
  * Locking: the TCB lock is dropped and re-taken on entry so that the
  * info lock and both endpoint write locks can be acquired first; a
  * temporary refcount keeps stcb alive across that window.  The info lock
  * and both endpoint locks are released before returning; this function
  * does not release the TCB lock it re-acquired.
  */
 void
 sctp_move_pcb_and_assoc(struct sctp_inpcb *old_inp, struct sctp_inpcb *new_inp,
     struct sctp_tcb *stcb)
 {
 	struct sctp_nets *net;
 	uint16_t lport, rport;
 	struct sctppcbhead *head;
 	struct sctp_laddr *laddr, *oladdr;
 
 	/* pin stcb while its lock is dropped to take the other locks first */
 	atomic_add_int(&stcb->asoc.refcnt, 1);
 	SCTP_TCB_UNLOCK(stcb);
 	SCTP_INP_INFO_WLOCK();
 	SCTP_INP_WLOCK(old_inp);
 	SCTP_INP_WLOCK(new_inp);
 	SCTP_TCB_LOCK(stcb);
 	atomic_subtract_int(&stcb->asoc.refcnt, 1);
 
 	/* carry the cookie secret state over to the new endpoint */
 	new_inp->sctp_ep.time_of_secret_change =
 	    old_inp->sctp_ep.time_of_secret_change;
 	memcpy(new_inp->sctp_ep.secret_key, old_inp->sctp_ep.secret_key,
 	    sizeof(old_inp->sctp_ep.secret_key));
 	new_inp->sctp_ep.current_secret_number =
 	    old_inp->sctp_ep.current_secret_number;
 	new_inp->sctp_ep.last_secret_number =
 	    old_inp->sctp_ep.last_secret_number;
 	new_inp->sctp_ep.size_of_a_cookie = old_inp->sctp_ep.size_of_a_cookie;
 
 	/* make it so new data pours into the new socket */
 	stcb->sctp_socket = new_inp->sctp_socket;
 	stcb->sctp_ep = new_inp;
 
 	/* Copy the port across */
 	lport = new_inp->sctp_lport = old_inp->sctp_lport;
 	rport = stcb->rport;
 	/* Pull the tcb from the old association */
 	LIST_REMOVE(stcb, sctp_tcbhash);
 	LIST_REMOVE(stcb, sctp_tcblist);
 	if (stcb->asoc.in_asocid_hash) {
 		LIST_REMOVE(stcb, sctp_tcbasocidhash);
 	}
 	/* Now insert the new_inp into the TCP connected hash */
 	head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR((lport | rport), SCTP_BASE_INFO(hashtcpmark))];
 
 	LIST_INSERT_HEAD(head, new_inp, sctp_hash);
 	/* Its safe to access */
 	new_inp->sctp_flags &= ~SCTP_PCB_FLAGS_UNBOUND;
 
 	/* Now move the tcb into the endpoint list */
 	LIST_INSERT_HEAD(&new_inp->sctp_asoc_list, stcb, sctp_tcblist);
 	/*
 	 * Question, do we even need to worry about the ep-hash since we
 	 * only have one connection? Probably not :> so lets get rid of it
 	 * and not suck up any kernel memory in that.
 	 */
 	if (stcb->asoc.in_asocid_hash) {
 		struct sctpasochead *lhd;
 
 		lhd = &new_inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(stcb->asoc.assoc_id,
 		    new_inp->hashasocidmark)];
 		LIST_INSERT_HEAD(lhd, stcb, sctp_tcbasocidhash);
 	}
 	/* Ok. Let's restart timer. */
 	TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 		sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, new_inp,
 		    stcb, net);
 	}
 
 	SCTP_INP_INFO_WUNLOCK();
 	/* stcb was not inserted into new_inp's tcb hash, so release the table */
 	if (new_inp->sctp_tcbhash != NULL) {
 		SCTP_HASH_FREE(new_inp->sctp_tcbhash, new_inp->sctp_hashmark);
 		new_inp->sctp_tcbhash = NULL;
 	}
 	if ((new_inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) {
 		/* Subset bound, so copy in the laddr list from the old_inp */
 		LIST_FOREACH(oladdr, &old_inp->sctp_addr_list, sctp_nxt_addr) {
 			laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr);
 			if (laddr == NULL) {
 				/*
 				 * Gak, what can we do? This assoc is really
 				 * HOSED. We probably should send an abort
 				 * here.
 				 */
 				SCTPDBG(SCTP_DEBUG_PCB1, "Association hosed in TCP model, out of laddr memory\n");
 				/* this address is simply missing from the copy */
 				continue;
 			}
 			SCTP_INCR_LADDR_COUNT();
 			memset(laddr, 0, sizeof(*laddr));
 			(void)SCTP_GETTIME_TIMEVAL(&laddr->start_time);
 			laddr->ifa = oladdr->ifa;
 			/* the copied entry holds its own reference on the ifa */
 			atomic_add_int(&laddr->ifa->refcount, 1);
 			LIST_INSERT_HEAD(&new_inp->sctp_addr_list, laddr,
 			    sctp_nxt_addr);
 			new_inp->laddr_count++;
 			/* keep last_used_address pointing into the new list */
 			if (oladdr == stcb->asoc.last_used_address) {
 				stcb->asoc.last_used_address = laddr;
 			}
 		}
 	}
 	/*
 	 * Now any running timers need to be adjusted.  Each timer that
 	 * still names old_inp hands its endpoint reference over to new_inp.
 	 */
 	if (stcb->asoc.dack_timer.ep == old_inp) {
 		SCTP_INP_DECR_REF(old_inp);
 		stcb->asoc.dack_timer.ep = new_inp;
 		SCTP_INP_INCR_REF(new_inp);
 	}
 	if (stcb->asoc.asconf_timer.ep == old_inp) {
 		SCTP_INP_DECR_REF(old_inp);
 		stcb->asoc.asconf_timer.ep = new_inp;
 		SCTP_INP_INCR_REF(new_inp);
 	}
 	if (stcb->asoc.strreset_timer.ep == old_inp) {
 		SCTP_INP_DECR_REF(old_inp);
 		stcb->asoc.strreset_timer.ep = new_inp;
 		SCTP_INP_INCR_REF(new_inp);
 	}
 	if (stcb->asoc.shut_guard_timer.ep == old_inp) {
 		SCTP_INP_DECR_REF(old_inp);
 		stcb->asoc.shut_guard_timer.ep = new_inp;
 		SCTP_INP_INCR_REF(new_inp);
 	}
 	if (stcb->asoc.autoclose_timer.ep == old_inp) {
 		SCTP_INP_DECR_REF(old_inp);
 		stcb->asoc.autoclose_timer.ep = new_inp;
 		SCTP_INP_INCR_REF(new_inp);
 	}
 	if (stcb->asoc.delete_prim_timer.ep == old_inp) {
 		SCTP_INP_DECR_REF(old_inp);
 		stcb->asoc.delete_prim_timer.ep = new_inp;
 		SCTP_INP_INCR_REF(new_inp);
 	}
 	/* now what about the nets? */
 	TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 		if (net->pmtu_timer.ep == old_inp) {
 			SCTP_INP_DECR_REF(old_inp);
 			net->pmtu_timer.ep = new_inp;
 			SCTP_INP_INCR_REF(new_inp);
 		}
 		if (net->hb_timer.ep == old_inp) {
 			SCTP_INP_DECR_REF(old_inp);
 			net->hb_timer.ep = new_inp;
 			SCTP_INP_INCR_REF(new_inp);
 		}
 		if (net->rxt_timer.ep == old_inp) {
 			SCTP_INP_DECR_REF(old_inp);
 			net->rxt_timer.ep = new_inp;
 			SCTP_INP_INCR_REF(new_inp);
 		}
 	}
 	SCTP_INP_WUNLOCK(new_inp);
 	SCTP_INP_WUNLOCK(old_inp);
 }
 
 /*
  * Allocate a local-address entry that refers to 'ifa', tag it with the
  * action code 'act' and push it onto the front of 'list'.  The entry
  * takes its own reference on 'ifa'.
  *
  * Returns 0 on success, or EINVAL when no entry could be allocated.
  */
 static int
 sctp_insert_laddr(struct sctpladdr *list, struct sctp_ifa *ifa, uint32_t act)
 {
 	struct sctp_laddr *entry;
 
 	entry = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr);
 	if (entry != NULL) {
 		SCTP_INCR_LADDR_COUNT();
 
 		/* start from a clean slate, then stamp the creation time */
 		memset(entry, 0, sizeof(*entry));
 		(void)SCTP_GETTIME_TIMEVAL(&entry->start_time);
 
 		entry->ifa = ifa;
 		entry->action = act;
 		atomic_add_int(&ifa->refcount, 1);
 
 		/* newest entry goes to the head of the list */
 		LIST_INSERT_HEAD(list, entry, sctp_nxt_addr);
 		return (0);
 	}
 
 	/* the zone had nothing to give us */
 	SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
 	return (EINVAL);
 }
 
 /*
  * Unlink 'laddr' from whatever local address list it is on, hand its
  * interface address to sctp_free_ifa() and return the entry to the
  * laddr zone.
  */
 static void
 sctp_remove_laddr(struct sctp_laddr *laddr)
 {
 	struct sctp_ifa *bound_ifa;
 
 	/* take it off the list before touching anything else */
 	LIST_REMOVE(laddr, sctp_nxt_addr);
 
 	bound_ifa = laddr->ifa;
 	sctp_free_ifa(bound_ifa);
 
 	SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), laddr);
 	SCTP_DECR_LADDR_COUNT();
 }
 
 
 
 /*
  * Bind the endpoint behind 'so' to a local port and, optionally, to one
  * specific local address.
  *
  * so        - socket whose so_pcb is the (still unbound) sctp_inpcb.
  * addr      - AF_INET or AF_INET6 address/port to bind.  NULL, or an
  *             address with port 0, makes the function pick an automatic
  *             port; NULL or the unspecified address binds to all
  *             addresses.
  * sctp_ifap - used to bypass normal local address validation checks:
  *             when non-NULL it is taken as the interface address for a
  *             specific bind instead of looking 'addr' up.
  * p         - calling thread; its credentials feed the prison_local_ip*()
  *             and reserved-port privilege checks.
  *
  * Returns 0 on success, otherwise EINVAL (endpoint already bound, wrong
  * sa_len, IPv4 address on a V6ONLY endpoint, scope embedding failure,
  * socket already gone, unusable IPv6 interface address, or no memory for
  * the address entry), EAFNOSUPPORT, EADDRINUSE, EADDRNOTAVAIL, or the
  * error reported by prison_local_ip4()/prison_local_ip6()/priv_check().
  *
  * The info write lock and the endpoint write lock are taken and released
  * inside this function; an extra endpoint reference is held while the
  * endpoint lock is temporarily dropped for the port lookup.
  */
 int
 sctp_inpcb_bind(struct socket *so, struct sockaddr *addr,
     struct sctp_ifa *sctp_ifap, struct thread *p)
 {
 	/* bind a ep to a socket address */
 	struct sctppcbhead *head;
 	struct sctp_inpcb *inp, *inp_tmp;
 	struct inpcb *ip_inp;
 	int port_reuse_active = 0;
 	int bindall;
 	uint16_t lport;
 	int error;
 	uint32_t vrf_id;
 
 	/* defaults: automatic port, bound to all addresses */
 	lport = 0;
 	bindall = 1;
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	ip_inp = (struct inpcb *)so->so_pcb;
 #ifdef SCTP_DEBUG
 	if (addr) {
 		SCTPDBG(SCTP_DEBUG_PCB1, "Bind called port: %d\n",
 		    ntohs(((struct sockaddr_in *)addr)->sin_port));
 		SCTPDBG(SCTP_DEBUG_PCB1, "Addr: ");
 		SCTPDBG_ADDR(SCTP_DEBUG_PCB1, addr);
 	}
 #endif
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == 0) {
 		/* already did a bind, subsequent binds NOT allowed ! */
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
 		return (EINVAL);
 	}
 #ifdef INVARIANTS
 	if (p == NULL)
 		panic("null proc/thread");
 #endif
 	/* validate the address and extract the port / bindall decision */
 	if (addr != NULL) {
 		switch (addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			{
 				struct sockaddr_in *sin;
 
 				/* IPV6_V6ONLY socket? */
 				if (SCTP_IPV6_V6ONLY(inp)) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
 					return (EINVAL);
 				}
 				if (addr->sa_len != sizeof(*sin)) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
 					return (EINVAL);
 				}
 
 				sin = (struct sockaddr_in *)addr;
 				lport = sin->sin_port;
 				/*
 				 * For LOOPBACK the prison_local_ip4() call
 				 * will transmute the ip address to the
 				 * proper value.
 				 */
 				if (p && (error = prison_local_ip4(p->td_ucred, &sin->sin_addr)) != 0) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error);
 					return (error);
 				}
 				if (sin->sin_addr.s_addr != INADDR_ANY) {
 					bindall = 0;
 				}
 				break;
 			}
 #endif
 #ifdef INET6
 		case AF_INET6:
 			{
 				/*
 				 * Only for pure IPv6 Address. (No IPv4
 				 * Mapped!)
 				 */
 				struct sockaddr_in6 *sin6;
 
 				sin6 = (struct sockaddr_in6 *)addr;
 
 				if (addr->sa_len != sizeof(*sin6)) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
 					return (EINVAL);
 				}
 				lport = sin6->sin6_port;
 				/*
 				 * For LOOPBACK the prison_local_ip6() call
 				 * will transmute the ipv6 address to the
 				 * proper value.
 				 */
 				if (p && (error = prison_local_ip6(p->td_ucred, &sin6->sin6_addr,
 				    (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error);
 					return (error);
 				}
 				if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 					bindall = 0;
 					/* KAME hack: embed scopeid */
 					if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
 						return (EINVAL);
 					}
 				}
 				/* this must be cleared for ifa_ifwithaddr() */
 				sin6->sin6_scope_id = 0;
 				break;
 			}
 #endif
 		default:
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EAFNOSUPPORT);
 			return (EAFNOSUPPORT);
 		}
 	}
 	SCTP_INP_INFO_WLOCK();
 	SCTP_INP_WLOCK(inp);
 	/* Setup a vrf_id to be the default for the non-bind-all case. */
 	vrf_id = inp->def_vrf_id;
 
 	/* increase our count due to the unlock we do */
 	SCTP_INP_INCR_REF(inp);
 	if (lport) {
 		/*
 		 * Did the caller specify a port? if so we must see if an ep
 		 * already has this one bound.
 		 */
 		/* got to be root to get at low ports */
 		if (ntohs(lport) < IPPORT_RESERVED) {
 			if ((p != NULL) && ((error =
 			    priv_check(p, PRIV_NETINET_RESERVEDPORT)
 			    ) != 0)) {
 				SCTP_INP_DECR_REF(inp);
 				SCTP_INP_WUNLOCK(inp);
 				SCTP_INP_INFO_WUNLOCK();
 				return (error);
 			}
 		}
 		/* drop the endpoint lock for the lookup; the ref keeps inp alive */
 		SCTP_INP_WUNLOCK(inp);
 		if (bindall) {
 			vrf_id = inp->def_vrf_id;
 			inp_tmp = sctp_pcb_findep(addr, 0, 1, vrf_id);
 			if (inp_tmp != NULL) {
 				/*
 				 * lock guy returned and lower count note
 				 * that we are not bound so inp_tmp should
 				 * NEVER be inp. And it is this inp
 				 * (inp_tmp) that gets the reference bump,
 				 * so we must lower it.
 				 */
 				SCTP_INP_DECR_REF(inp_tmp);
 				/* sharing is only allowed if both sides enabled port reuse */
 				if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) &&
 				    (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) {
 					/*
 					 * Ok, must be one-2-one and
 					 * allowing port re-use
 					 */
 					port_reuse_active = 1;
 					goto continue_anyway;
 				}
 				SCTP_INP_DECR_REF(inp);
 				SCTP_INP_INFO_WUNLOCK();
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE);
 				return (EADDRINUSE);
 			}
 		} else {
 			inp_tmp = sctp_pcb_findep(addr, 0, 1, vrf_id);
 			if (inp_tmp != NULL) {
 				/*
 				 * lock guy returned and lower count note
 				 * that we are not bound so inp_tmp should
 				 * NEVER be inp. And it is this inp
 				 * (inp_tmp) that gets the reference bump,
 				 * so we must lower it.
 				 */
 				SCTP_INP_DECR_REF(inp_tmp);
 				/* sharing is only allowed if both sides enabled port reuse */
 				if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) &&
 				    (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) {
 					/*
 					 * Ok, must be one-2-one and
 					 * allowing port re-use
 					 */
 					port_reuse_active = 1;
 					goto continue_anyway;
 				}
 				SCTP_INP_DECR_REF(inp);
 				SCTP_INP_INFO_WUNLOCK();
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE);
 				return (EADDRINUSE);
 			}
 		}
 continue_anyway:
 		/* re-take the endpoint lock dropped before the lookup */
 		SCTP_INP_WLOCK(inp);
 		if (bindall) {
 			/* verify that lport is not in use by a singleton */
 			if ((port_reuse_active == 0) &&
 			    (inp_tmp = sctp_isport_inuse(inp, lport, vrf_id))) {
 				/* Sorry someone already has this one bound */
 				if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) &&
 				    (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) {
 					port_reuse_active = 1;
 				} else {
 					SCTP_INP_DECR_REF(inp);
 					SCTP_INP_WUNLOCK(inp);
 					SCTP_INP_INFO_WUNLOCK();
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE);
 					return (EADDRINUSE);
 				}
 			}
 		}
 	} else {
 		/* no port given: pick one from the configured automatic range */
 		uint16_t first, last, candidate;
 		uint16_t count;
 		int done;
 
 		if (ip_inp->inp_flags & INP_HIGHPORT) {
 			first = MODULE_GLOBAL(ipport_hifirstauto);
 			last = MODULE_GLOBAL(ipport_hilastauto);
 		} else if (ip_inp->inp_flags & INP_LOWPORT) {
 			if (p && (error =
 			    priv_check(p, PRIV_NETINET_RESERVEDPORT)
 			    )) {
 				SCTP_INP_DECR_REF(inp);
 				SCTP_INP_WUNLOCK(inp);
 				SCTP_INP_INFO_WUNLOCK();
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error);
 				return (error);
 			}
 			first = MODULE_GLOBAL(ipport_lowfirstauto);
 			last = MODULE_GLOBAL(ipport_lowlastauto);
 		} else {
 			first = MODULE_GLOBAL(ipport_firstauto);
 			last = MODULE_GLOBAL(ipport_lastauto);
 		}
 		/* normalize so that first <= last */
 		if (first > last) {
 			uint16_t temp;
 
 			temp = first;
 			first = last;
 			last = temp;
 		}
 		count = last - first + 1;	/* number of candidates */
 		/* start at a random offset into the range */
 		candidate = first + sctp_select_initial_TSN(&inp->sctp_ep) % (count);
 
 		/* walk the range (wrapping at 'last') until a free port turns up */
 		done = 0;
 		while (!done) {
 			if (sctp_isport_inuse(inp, htons(candidate), inp->def_vrf_id) == NULL) {
 				done = 1;
 			}
 			if (!done) {
 				/* every candidate has been tried */
 				if (--count == 0) {
 					SCTP_INP_DECR_REF(inp);
 					SCTP_INP_WUNLOCK(inp);
 					SCTP_INP_INFO_WUNLOCK();
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE);
 					return (EADDRINUSE);
 				}
 				if (candidate == last)
 					candidate = first;
 				else
 					candidate = candidate + 1;
 			}
 		}
 		lport = htons(candidate);
 	}
 	/* port chosen; the temporary reference is no longer needed */
 	SCTP_INP_DECR_REF(inp);
 	if (inp->sctp_flags & (SCTP_PCB_FLAGS_SOCKET_GONE |
 	    SCTP_PCB_FLAGS_SOCKET_ALLGONE)) {
 		/*
 		 * this really should not happen. The guy did a non-blocking
 		 * bind and then did a close at the same time.
 		 */
 		SCTP_INP_WUNLOCK(inp);
 		SCTP_INP_INFO_WUNLOCK();
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
 		return (EINVAL);
 	}
 	/* ok we look clear to give out this port, so lets setup the binding */
 	if (bindall) {
 		/* binding to all addresses, so just set in the proper flags */
 		inp->sctp_flags |= SCTP_PCB_FLAGS_BOUNDALL;
 		/* set the automatic addr changes from kernel flag */
 		if (SCTP_BASE_SYSCTL(sctp_auto_asconf) == 0) {
 			sctp_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF);
 			sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTO_ASCONF);
 		} else {
 			sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF);
 			sctp_feature_on(inp, SCTP_PCB_FLAGS_AUTO_ASCONF);
 		}
 		if (SCTP_BASE_SYSCTL(sctp_multiple_asconfs) == 0) {
 			sctp_feature_off(inp, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS);
 		} else {
 			sctp_feature_on(inp, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS);
 		}
 		/*
 		 * set the automatic mobility_base from kernel flag (by
 		 * micchie)
 		 */
 		if (SCTP_BASE_SYSCTL(sctp_mobility_base) == 0) {
 			sctp_mobility_feature_off(inp, SCTP_MOBILITY_BASE);
 			sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED);
 		} else {
 			sctp_mobility_feature_on(inp, SCTP_MOBILITY_BASE);
 			sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED);
 		}
 		/*
 		 * set the automatic mobility_fasthandoff from kernel flag
 		 * (by micchie)
 		 */
 		if (SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff) == 0) {
 			sctp_mobility_feature_off(inp, SCTP_MOBILITY_FASTHANDOFF);
 			sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED);
 		} else {
 			sctp_mobility_feature_on(inp, SCTP_MOBILITY_FASTHANDOFF);
 			sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED);
 		}
 	} else {
 		/*
 		 * bind specific, make sure flags is off and add a new
 		 * address structure to the sctp_addr_list inside the ep
 		 * structure.
 		 *
 		 * We will need to allocate one and insert it at the head.
 		 * The socketopt call can just insert new addresses in there
 		 * as well. It will also have to do the embed scope kame
 		 * hack too (before adding).
 		 */
 		struct sctp_ifa *ifa;
 		union sctp_sockstore store;
 
 		/* private copy of addr with the port zeroed, for the ifa lookup */
 		memset(&store, 0, sizeof(store));
 		switch (addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			memcpy(&store.sin, addr, sizeof(struct sockaddr_in));
 			store.sin.sin_port = 0;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			memcpy(&store.sin6, addr, sizeof(struct sockaddr_in6));
 			store.sin6.sin6_port = 0;
 			break;
 #endif
 		default:
 			break;
 		}
 		/*
 		 * first find the interface with the bound address need to
 		 * zero out the port to find the address! yuck! can't do
 		 * this earlier since need port for sctp_pcb_findep()
 		 */
 		if (sctp_ifap != NULL) {
 			ifa = sctp_ifap;
 		} else {
 			/*
 			 * Note for BSD we hit here always other O/S's will
 			 * pass things in via the sctp_ifap argument.
 			 */
 			ifa = sctp_find_ifa_by_addr(&store.sa,
 			    vrf_id, SCTP_ADDR_NOT_LOCKED);
 		}
 		if (ifa == NULL) {
 			/* Can't find an interface with that address */
 			SCTP_INP_WUNLOCK(inp);
 			SCTP_INP_INFO_WUNLOCK();
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRNOTAVAIL);
 			return (EADDRNOTAVAIL);
 		}
 #ifdef INET6
 		if (addr->sa_family == AF_INET6) {
 			/* GAK, more FIXME IFA lock? */
 			if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
 				/* Can't bind a non-existent addr. */
 				SCTP_INP_WUNLOCK(inp);
 				SCTP_INP_INFO_WUNLOCK();
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
 				return (EINVAL);
 			}
 		}
 #endif
 		/* we're not bound all */
 		inp->sctp_flags &= ~SCTP_PCB_FLAGS_BOUNDALL;
 		/* allow bindx() to send ASCONF's for binding changes */
 		sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF);
 		/* clear automatic addr changes from kernel flag */
 		sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTO_ASCONF);
 
 		/* add this address to the endpoint list */
 		error = sctp_insert_laddr(&inp->sctp_addr_list, ifa, 0);
 		if (error != 0) {
 			SCTP_INP_WUNLOCK(inp);
 			SCTP_INP_INFO_WUNLOCK();
 			return (error);
 		}
 		inp->laddr_count++;
 	}
 	/* find the bucket */
 	if (port_reuse_active) {
 		/* Put it into tcp 1-2-1 hash */
 		head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashtcpmark))];
 		inp->sctp_flags |= SCTP_PCB_FLAGS_IN_TCPPOOL;
 	} else {
 		head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashmark))];
 	}
 	/* put it in the bucket */
 	LIST_INSERT_HEAD(head, inp, sctp_hash);
 	SCTPDBG(SCTP_DEBUG_PCB1, "Main hash to bind at head:%p, bound port:%d - in tcp_pool=%d\n",
 	    (void *)head, ntohs(lport), port_reuse_active);
 	/* set in the port */
 	inp->sctp_lport = lport;
 
 	/* turn off just the unbound flag */
 	inp->sctp_flags &= ~SCTP_PCB_FLAGS_UNBOUND;
 	SCTP_INP_WUNLOCK(inp);
 	SCTP_INP_INFO_WUNLOCK();
 	return (0);
 }
 
 
 static void
 sctp_iterator_inp_being_freed(struct sctp_inpcb *inp)
 {
 	struct sctp_iterator *it, *nit;
 
 	/*
 	 * We enter with the only the ITERATOR_LOCK in place and a write
 	 * lock on the inp_info stuff.
 	 */
 	it = sctp_it_ctl.cur_it;
 	if (it && (it->vn != curvnet)) {
 		/* Its not looking at our VNET */
 		return;
 	}
 	if (it && (it->inp == inp)) {
 		/*
 		 * This is tricky and we hold the iterator lock, but when it
 		 * returns and gets the lock (when we release it) the
 		 * iterator will try to operate on inp. We need to stop that
 		 * from happening. But of course the iterator has a
 		 * reference on the stcb and inp. We can mark it and it will
 		 * stop.
 		 *
 		 * If its a single iterator situation, we set the end
 		 * iterator flag. Otherwise we set the iterator to go to the
 		 * next inp.
 		 *
 		 */
 		if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) {
 			sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT;
 		} else {
 			sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_INP;
 		}
 	}
 	/*
 	 * Now go through and remove any single reference to our inp that
 	 * may be still pending on the list
 	 */
 	SCTP_IPI_ITERATOR_WQ_LOCK();
 	TAILQ_FOREACH_SAFE(it, &sctp_it_ctl.iteratorhead, sctp_nxt_itr, nit) {
 		if (it->vn != curvnet) {
 			continue;
 		}
 		if (it->inp == inp) {
 			/* This one points to me is it inp specific? */
 			if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) {
 				/* Remove and free this one */
 				TAILQ_REMOVE(&sctp_it_ctl.iteratorhead,
 				    it, sctp_nxt_itr);
 				if (it->function_atend != NULL) {
 					(*it->function_atend) (it->pointer, it->val);
 				}
 				SCTP_FREE(it, SCTP_M_ITER);
 			} else {
 				it->inp = LIST_NEXT(it->inp, sctp_list);
 				if (it->inp) {
 					SCTP_INP_INCR_REF(it->inp);
 				}
 			}
 			/*
 			 * When its put in the refcnt is incremented so decr
 			 * it
 			 */
 			SCTP_INP_DECR_REF(inp);
 		}
 	}
 	SCTP_IPI_ITERATOR_WQ_UNLOCK();
 }
 
 /*
  * Release an SCTP endpoint (inp): detach it from its socket, unbind its
  * port and, if nothing references it any more, free all of its memory.
  *
  * inp       - the endpoint to release.
  * immediate - when equal to SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE every
  *             association is first given the chance to shut down (or is
  *             aborted if unread/incomplete data remains); for any other
  *             value the associations are aborted and freed right away.
  * from      - identifies the caller. SCTP_CALLED_AFTER_CMPSET_OFCLOSE
  *             additionally clears the CLOSE_IP/WAKEINPUT/WAKEOUTPUT flags
  *             and sets DONT_WAKE. The value is also printed in the
  *             "conflict in free" diagnostic.
  *
  * The function returns early, leaving the endpoint allocated, when
  *  - the endpoint is already marked SCTP_PCB_FLAGS_SOCKET_ALLGONE,
  *  - associations are still shutting down or being freed, or
  *  - the endpoint is still referenced or one of its locks is contended
  *    (an INPKILL timer is started in that case).
  */
 void
 sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from)
 {
 	/*
 	 * Here we free a endpoint. We must find it (if it is in the Hash
 	 * table) and remove it from there. Then we must also find it in the
 	 * overall list and remove it from there. After all removals are
 	 * complete then any timer has to be stopped. Then start the actual
 	 * freeing. a) Any local lists. b) Any associations. c) The hash of
 	 * all associations. d) finally the ep itself.
 	 */
 	struct sctp_tcb *asoc, *nasoc;
 	struct sctp_laddr *laddr, *nladdr;
 	struct inpcb *ip_pcb;
 	struct socket *so;
 	int being_refed = 0;
 	struct sctp_queued_to_read *sq, *nsq;
 	int cnt;
 	sctp_sharedkey_t *shared_key, *nshared_key;
 
 
 #ifdef SCTP_LOG_CLOSING
 	sctp_log_closing(inp, NULL, 0);
 #endif
 	SCTP_ITERATOR_LOCK();
 	/* mark any iterators on the list or being processed */
 	sctp_iterator_inp_being_freed(inp);
 	SCTP_ITERATOR_UNLOCK();
 	/* Remember the socket; inp->sctp_socket is cleared further down. */
 	so = inp->sctp_socket;
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
 		/* been here before.. eeks.. get out of here */
 		SCTP_PRINTF("This conflict in free SHOULD not be happening! from %d, imm %d\n", from, immediate);
 #ifdef SCTP_LOG_CLOSING
 		sctp_log_closing(inp, NULL, 1);
 #endif
 		return;
 	}
 	/*
 	 * Lock acquisition order used here: association-create lock, global
 	 * INP info lock, then the endpoint lock. Every return path below
 	 * releases all three.
 	 */
 	SCTP_ASOC_CREATE_LOCK(inp);
 	SCTP_INP_INFO_WLOCK();
 
 	SCTP_INP_WLOCK(inp);
 	if (from == SCTP_CALLED_AFTER_CMPSET_OFCLOSE) {
 		inp->sctp_flags &= ~SCTP_PCB_FLAGS_CLOSE_IP;
 		/* socket is gone, so no more wakeups allowed */
 		inp->sctp_flags |= SCTP_PCB_FLAGS_DONT_WAKE;
 		inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEINPUT;
 		inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEOUTPUT;
 
 	}
 	/* First time through we have the socket lock, after that no more. */
 	sctp_timer_stop(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL,
 	    SCTP_FROM_SCTP_PCB + SCTP_LOC_1);
 
 	/* Drop any mbuf chains still hanging off the endpoint. */
 	if (inp->control) {
 		sctp_m_freem(inp->control);
 		inp->control = NULL;
 	}
 	if (inp->pkt) {
 		sctp_m_freem(inp->pkt);
 		inp->pkt = NULL;
 	}
 	ip_pcb = &inp->ip_inp.inp;	/* we could just cast the main pointer
 					 * here but I will be nice :> (i.e.
 					 * ip_pcb = ep;) */
 	if (immediate == SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE) {
 		/* Number of associations that keep the endpoint alive. */
 		int cnt_in_sd;
 
 		cnt_in_sd = 0;
 		LIST_FOREACH_SAFE(asoc, &inp->sctp_asoc_list, sctp_tcblist, nasoc) {
 			SCTP_TCB_LOCK(asoc);
 			if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 				/* Skip guys being freed */
 				cnt_in_sd++;
 				if (asoc->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE) {
 					/*
 					 * Special case - we did not start a
 					 * kill timer on the asoc due to it
 					 * was not closed. So go ahead and
 					 * start it now.
 					 */
 					SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_IN_ACCEPT_QUEUE);
 					sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, asoc, NULL);
 				}
 				SCTP_TCB_UNLOCK(asoc);
 				continue;
 			}
 			if (((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) ||
 			    (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) &&
 			    (asoc->asoc.total_output_queue_size == 0)) {
 				/*
 				 * If we have data in queue, we don't want
 				 * to just free since the app may have done,
 				 * send()/close or connect/send/close. And
 				 * it wants the data to get across first.
 				 */
 				/* Just abandon things in the front states */
 				/*
 				 * A zero return from sctp_free_assoc() is
 				 * counted as "association still around".
 				 * No SCTP_TCB_UNLOCK() is issued on this
 				 * path; presumably sctp_free_assoc() deals
 				 * with the TCB lock itself - confirm there.
 				 */
 				if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_NOFORCE,
 				    SCTP_FROM_SCTP_PCB + SCTP_LOC_2) == 0) {
 					cnt_in_sd++;
 				}
 				continue;
 			}
 			/* Disconnect the socket please */
 			asoc->sctp_socket = NULL;
 			SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_CLOSED_SOCKET);
 			if ((asoc->asoc.size_on_reasm_queue > 0) ||
 			    (asoc->asoc.control_pdapi) ||
 			    (asoc->asoc.size_on_all_streams > 0) ||
 			    (so && (so->so_rcv.sb_cc > 0))) {
 				/* Left with Data unread */
 				struct mbuf *op_err;
 
 				/* Unread data: abort instead of shutting down. */
 				op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, "");
 				asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_3;
 				sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED);
 				SCTP_STAT_INCR_COUNTER32(sctps_aborted);
 				if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) ||
 				    (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
 					SCTP_STAT_DECR_GAUGE32(sctps_currestab);
 				}
 				if (sctp_free_assoc(inp, asoc,
 				    SCTP_PCBFREE_NOFORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_4) == 0) {
 					cnt_in_sd++;
 				}
 				continue;
 			} else if (TAILQ_EMPTY(&asoc->asoc.send_queue) &&
 				    TAILQ_EMPTY(&asoc->asoc.sent_queue) &&
 			    (asoc->asoc.stream_queue_cnt == 0)) {
 				/*
 				 * Nothing left to send. A partially
 				 * delivered user message still forces an
 				 * abort (shared code at abort_anyway).
 				 */
 				if ((*asoc->asoc.ss_functions.sctp_ss_is_user_msgs_incomplete) (asoc, &asoc->asoc)) {
 					goto abort_anyway;
 				}
 				if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) &&
 				    (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) {
 					struct sctp_nets *netp;
 
 					/*
 					 * there is nothing queued to send,
 					 * so I send shutdown
 					 */
 					if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) ||
 					    (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
 						SCTP_STAT_DECR_GAUGE32(sctps_currestab);
 					}
 					SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT);
 					sctp_stop_timers_for_shutdown(asoc);
 					/* Prefer the alternate path when one is set. */
 					if (asoc->asoc.alternate) {
 						netp = asoc->asoc.alternate;
 					} else {
 						netp = asoc->asoc.primary_destination;
 					}
 					sctp_send_shutdown(asoc, netp);
 					sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, asoc->sctp_ep, asoc,
 					    netp);
 					sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, asoc->sctp_ep, asoc, NULL);
 					sctp_chunk_output(inp, asoc, SCTP_OUTPUT_FROM_SHUT_TMR, SCTP_SO_LOCKED);
 				}
 			} else {
 				/* mark into shutdown pending */
 				SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
 				sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, asoc->sctp_ep, asoc, NULL);
 				if ((*asoc->asoc.ss_functions.sctp_ss_is_user_msgs_incomplete) (asoc, &asoc->asoc)) {
 					SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_PARTIAL_MSG_LEFT);
 				}
 				if (TAILQ_EMPTY(&asoc->asoc.send_queue) &&
 				    TAILQ_EMPTY(&asoc->asoc.sent_queue) &&
 				    (asoc->asoc.state & SCTP_STATE_PARTIAL_MSG_LEFT)) {
 					struct mbuf *op_err;
 
 					/*
 					 * Also reached by the goto from the
 					 * "nothing queued" branch above.
 					 */
 			abort_anyway:
 					op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, "");
 					asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_5;
 					sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED);
 					SCTP_STAT_INCR_COUNTER32(sctps_aborted);
 					if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) ||
 					    (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
 						SCTP_STAT_DECR_GAUGE32(sctps_currestab);
 					}
 					if (sctp_free_assoc(inp, asoc,
 					    SCTP_PCBFREE_NOFORCE,
 					    SCTP_FROM_SCTP_PCB + SCTP_LOC_6) == 0) {
 						cnt_in_sd++;
 					}
 					continue;
 				} else {
 					sctp_chunk_output(inp, asoc, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED);
 				}
 			}
 			/* Association is now shutting down; it stays around. */
 			cnt_in_sd++;
 			SCTP_TCB_UNLOCK(asoc);
 		}
 		/* now is there some left in our SHUTDOWN state? */
 		if (cnt_in_sd) {
 #ifdef SCTP_LOG_CLOSING
 			sctp_log_closing(inp, NULL, 2);
 #endif
 			/* Detach from the socket but keep the endpoint. */
 			inp->sctp_socket = NULL;
 			SCTP_INP_WUNLOCK(inp);
 			SCTP_ASOC_CREATE_UNLOCK(inp);
 			SCTP_INP_INFO_WUNLOCK();
 			return;
 		}
 	}
 	inp->sctp_socket = NULL;
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) !=
 	    SCTP_PCB_FLAGS_UNBOUND) {
 		/*
 		 * ok, this guy has been bound. Its port is somewhere in
 		 * the SCTP_BASE_INFO(hash table). Remove it!
 		 */
 		LIST_REMOVE(inp, sctp_hash);
 		inp->sctp_flags |= SCTP_PCB_FLAGS_UNBOUND;
 	}
 
 	/*
 	 * If there is a timer running to kill us, forget it, since it may
 	 * have a contest on the INP lock.. which would cause us to die ...
 	 */
 	/* cnt counts associations that could not be freed right now. */
 	cnt = 0;
 	LIST_FOREACH_SAFE(asoc, &inp->sctp_asoc_list, sctp_tcblist, nasoc) {
 		SCTP_TCB_LOCK(asoc);
 		if (immediate != SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE) {
 			/* Disconnect the socket please */
 			asoc->sctp_socket = NULL;
 			SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_CLOSED_SOCKET);
 		}
 		if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 			if (asoc->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE) {
 				SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_IN_ACCEPT_QUEUE);
 				sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, asoc, NULL);
 			}
 			cnt++;
 			SCTP_TCB_UNLOCK(asoc);
 			continue;
 		}
 		/* Free associations that are NOT killing us */
 		if ((SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT) &&
 		    ((asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0)) {
 			struct mbuf *op_err;
 
 			op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, "");
 			asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_7;
 			sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED);
 			SCTP_STAT_INCR_COUNTER32(sctps_aborted);
 		} else if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 			/*
 			 * NOTE(review): this branch appears unreachable,
 			 * since ABOUT_TO_BE_FREED associations were already
 			 * skipped a few lines above.
 			 */
 			cnt++;
 			SCTP_TCB_UNLOCK(asoc);
 			continue;
 		}
 		if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) ||
 		    (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
 			SCTP_STAT_DECR_GAUGE32(sctps_currestab);
 		}
 		if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_FORCE,
 		    SCTP_FROM_SCTP_PCB + SCTP_LOC_8) == 0) {
 			cnt++;
 		}
 	}
 	if (cnt) {
 		/* Ok we have someone out there that will kill us */
 #ifdef SCTP_LOG_CLOSING
 		sctp_log_closing(inp, NULL, 3);
 #endif
 		SCTP_INP_WUNLOCK(inp);
 		SCTP_ASOC_CREATE_UNLOCK(inp);
 		SCTP_INP_INFO_WUNLOCK();
 		return;
 	}
 	/* A contended lock means someone else is still using the endpoint. */
 	if (SCTP_INP_LOCK_CONTENDED(inp))
 		being_refed++;
 	if (SCTP_INP_READ_CONTENDED(inp))
 		being_refed++;
 	if (SCTP_ASOC_CREATE_LOCK_CONTENDED(inp))
 		being_refed++;
 	/* NOTE: 0 refcount also means no timers are referencing us. */
 	if ((inp->refcount) ||
 	    (being_refed) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_CLOSE_IP)) {
 #ifdef SCTP_LOG_CLOSING
 		sctp_log_closing(inp, NULL, 4);
 #endif
 		/* Still in use: defer the final free to the INPKILL timer. */
 		sctp_timer_start(SCTP_TIMER_TYPE_INPKILL, inp, NULL, NULL);
 		SCTP_INP_WUNLOCK(inp);
 		SCTP_ASOC_CREATE_UNLOCK(inp);
 		SCTP_INP_INFO_WUNLOCK();
 		return;
 	}
 	/* Point of no return: mark the endpoint as completely gone. */
 	inp->sctp_ep.signature_change.type = 0;
 	inp->sctp_flags |= SCTP_PCB_FLAGS_SOCKET_ALLGONE;
 	/*
 	 * Remove it from the list .. last thing we need a lock for.
 	 */
 	LIST_REMOVE(inp, sctp_list);
 	SCTP_INP_WUNLOCK(inp);
 	SCTP_ASOC_CREATE_UNLOCK(inp);
 	SCTP_INP_INFO_WUNLOCK();
 
 #ifdef SCTP_LOG_CLOSING
 	sctp_log_closing(inp, NULL, 5);
 #endif
 
 
 	/* Free the association-id hash table. */
 	if ((inp->sctp_asocidhash) != NULL) {
 		SCTP_HASH_FREE(inp->sctp_asocidhash, inp->hashasocidmark);
 		inp->sctp_asocidhash = NULL;
 	}
 	/* sa_ignore FREED_MEMORY */
 	/* Drain everything still sitting on the endpoint's read queue. */
 	TAILQ_FOREACH_SAFE(sq, &inp->read_queue, next, nsq) {
 		/* Its only abandoned if it had data left */
 		if (sq->length)
 			SCTP_STAT_INCR(sctps_left_abandon);
 
 		TAILQ_REMOVE(&inp->read_queue, sq, next);
 		sctp_free_remote_addr(sq->whoFrom);
 		/* Keep the socket's receive byte count in step. */
 		if (so)
 			so->so_rcv.sb_cc -= sq->length;
 		if (sq->data) {
 			sctp_m_freem(sq->data);
 			sq->data = NULL;
 		}
 		/*
 		 * no need to free the net count, since at this point all
 		 * assoc's are gone.
 		 */
 		sctp_free_a_readq(NULL, sq);
 	}
 	/* Now the sctp_pcb things */
 	/*
 	 * Free the option mbuf hanging off the embedded inpcb, if there is
 	 * one.
 	 */
 	if (ip_pcb->inp_options) {
 		(void)sctp_m_free(ip_pcb->inp_options);
 		ip_pcb->inp_options = 0;
 	}
 
 
 #ifdef INET6
 	if (ip_pcb->inp_vflag & INP_IPV6) {
 		ip6_freepcbopts(ip_pcb->in6p_outputopts);
 	}
 #endif				/* INET6 */
 	ip_pcb->inp_vflag = 0;
 	/* free up authentication fields */
 	if (inp->sctp_ep.local_auth_chunks != NULL)
 		sctp_free_chunklist(inp->sctp_ep.local_auth_chunks);
 	if (inp->sctp_ep.local_hmacs != NULL)
 		sctp_free_hmaclist(inp->sctp_ep.local_hmacs);
 
 	/* Release every shared key configured on the endpoint. */
 	LIST_FOREACH_SAFE(shared_key, &inp->sctp_ep.shared_keys, next, nshared_key) {
 		LIST_REMOVE(shared_key, next);
 		sctp_free_sharedkey(shared_key);
 		/* sa_ignore FREED_MEMORY */
 	}
 
 	/*
 	 * if we have an address list the following will free the list of
 	 * ifaddr's that are set into this ep. The _SAFE iteration is needed
 	 * because each entry is removed while walking the list.
 	 */
 	LIST_FOREACH_SAFE(laddr, &inp->sctp_addr_list, sctp_nxt_addr, nladdr) {
 		sctp_remove_laddr(laddr);
 	}
 
 #ifdef SCTP_TRACK_FREED_ASOCS
 	/* TEMP CODE */
 	LIST_FOREACH_SAFE(asoc, &inp->sctp_asoc_free_list, sctp_tcblist, nasoc) {
 		LIST_REMOVE(asoc, sctp_tcblist);
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), asoc);
 		SCTP_DECR_ASOC_COUNT();
 	}
 	/* *** END TEMP CODE *** */
 #endif
 	/* Now lets see about freeing the EP hash table. */
 	if (inp->sctp_tcbhash != NULL) {
 		SCTP_HASH_FREE(inp->sctp_tcbhash, inp->sctp_hashmark);
 		inp->sctp_tcbhash = NULL;
 	}
 	/* Now we must put the ep memory back into the zone pool */
 	crfree(inp->ip_inp.inp.inp_cred);
 	INP_LOCK_DESTROY(&inp->ip_inp.inp);
 	SCTP_INP_LOCK_DESTROY(inp);
 	SCTP_INP_READ_DESTROY(inp);
 	SCTP_ASOC_CREATE_LOCK_DESTROY(inp);
 	SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp);
 	SCTP_DECR_EP_COUNT();
 }
 
 
 struct sctp_nets *
 sctp_findnet(struct sctp_tcb *stcb, struct sockaddr *addr)
 {
 	struct sctp_nets *net;
 
 	/* locate the address */
 	TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 		if (sctp_cmpaddr(addr, (struct sockaddr *)&net->ro._l_addr))
 			return (net);
 	}
 	return (NULL);
 }
 
 
 /*
  * Report whether addr is known as an interface address in the given VRF.
  * Returns 1 when sctp_find_ifa_by_addr() finds an entry, 0 otherwise.
  */
 int
 sctp_is_address_on_local_host(struct sockaddr *addr, uint32_t vrf_id)
 {
 	struct sctp_ifa *match;
 
 	match = sctp_find_ifa_by_addr(addr, vrf_id, SCTP_ADDR_NOT_LOCKED);
 	return ((match != NULL) ? 1 : 0);
 }
 
 /*
  * Adds a remote endpoint address, done with the INIT/INIT-ACK as well as
  * when an ASCONF arrives that adds it. It will also initialize all the cwnd
  * stats of stuff.
  *
  * stcb      - association the address is added to.
  * newaddr   - the peer address (AF_INET or AF_INET6, depending on the
  *             INET/INET6 build options). Its length field is set here,
  *             and for IPv4 sin_zero is cleared, so the caller's sockaddr
  *             is modified.
  * netp      - optional; when non-NULL it receives the newly created entry.
  * port      - stored in net->port; when non-zero the path MTU is reduced
  *             by sizeof(struct udphdr).
  * set_scope - non-zero: derive the association's scope flags from this
  *             address; zero: validate the address against the existing
  *             scope instead.
  * from      - origin of the call. SCTP_ADDR_IS_CONFIRMED creates the
  *             address without the UNCONFIRMED flag, SCTP_ALLOC_ASOC seeds
  *             asoc.smallest_mtu from the new path.
  *
  * Returns 0 on success (also when the address is already present) and -1
  * for an unspecified address, an unsupported address family or when no
  * sctp_nets entry could be allocated.
  */
 int
 sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr,
     struct sctp_nets **netp, uint16_t port, int set_scope, int from)
 {
 	/*
 	 * The following is redundant to the same lines in the
 	 * sctp_aloc_assoc() but is needed since others call the add address
 	 * function
 	 */
 	struct sctp_nets *net, *netfirst;
 	int addr_inscope;
 
 	SCTPDBG(SCTP_DEBUG_PCB1, "Adding an address (from:%d) to the peer: ",
 	    from);
 	SCTPDBG_ADDR(SCTP_DEBUG_PCB1, newaddr);
 
 	netfirst = sctp_findnet(stcb, newaddr);
 	if (netfirst) {
 		/*
 		 * Lie and return ok, we don't want to make the association
 		 * go away for this behavior. It will happen in the TCP
 		 * model in a connected socket. It does not reach the hash
 		 * table until after the association is built so it can't be
 		 * found. Mark as reachable, since the initial creation will
 		 * have been cleared and the NOT_IN_ASSOC flag will have
 		 * been added... and we don't want to end up removing it
 		 * back out.
 		 */
 		if (netfirst->dest_state & SCTP_ADDR_UNCONFIRMED) {
 			netfirst->dest_state = (SCTP_ADDR_REACHABLE |
 			    SCTP_ADDR_UNCONFIRMED);
 		} else {
 			netfirst->dest_state = SCTP_ADDR_REACHABLE;
 		}
 
 		return (0);
 	}
 	/* Validate the address and work out whether it is in scope. */
 	addr_inscope = 1;
 	switch (newaddr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		{
 			struct sockaddr_in *sin;
 
 			sin = (struct sockaddr_in *)newaddr;
 			if (sin->sin_addr.s_addr == 0) {
 				/* Invalid address */
 				return (-1);
 			}
 			/* zero out the zero area */
 			memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
 
 			/* assure len is set */
 			sin->sin_len = sizeof(struct sockaddr_in);
 			if (set_scope) {
 				if (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) {
 					stcb->asoc.scope.ipv4_local_scope = 1;
 				}
 			} else {
 				/* Validate the address is in scope */
 				if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) &&
 				    (stcb->asoc.scope.ipv4_local_scope == 0)) {
 					addr_inscope = 0;
 				}
 			}
 			break;
 		}
 #endif
 #ifdef INET6
 	case AF_INET6:
 		{
 			struct sockaddr_in6 *sin6;
 
 			sin6 = (struct sockaddr_in6 *)newaddr;
 			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 				/* Invalid address */
 				return (-1);
 			}
 			/* assure len is set */
 			sin6->sin6_len = sizeof(struct sockaddr_in6);
 			if (set_scope) {
 				if (sctp_is_address_on_local_host(newaddr, stcb->asoc.vrf_id)) {
 					stcb->asoc.scope.loopback_scope = 1;
 					stcb->asoc.scope.local_scope = 0;
 					stcb->asoc.scope.ipv4_local_scope = 1;
 					stcb->asoc.scope.site_scope = 1;
 				} else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
 					/*
 					 * If the new destination is a
 					 * LINK_LOCAL we must have common
 					 * site scope. Don't set the local
 					 * scope since we may not share all
 					 * links, only loopback can do this.
 					 * Links on the local network would
 					 * also be on our private network
 					 * for v4 too.
 					 */
 					stcb->asoc.scope.ipv4_local_scope = 1;
 					stcb->asoc.scope.site_scope = 1;
 				} else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) {
 					/*
 					 * If the new destination is
 					 * SITE_LOCAL then we must have site
 					 * scope in common.
 					 */
 					stcb->asoc.scope.site_scope = 1;
 				}
 			} else {
 				/* Validate the address is in scope */
 				if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr) &&
 				    (stcb->asoc.scope.loopback_scope == 0)) {
 					addr_inscope = 0;
 				} else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) &&
 				    (stcb->asoc.scope.local_scope == 0)) {
 					addr_inscope = 0;
 				} else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) &&
 				    (stcb->asoc.scope.site_scope == 0)) {
 					addr_inscope = 0;
 				}
 			}
 			break;
 		}
 #endif
 	default:
 		/* not supported family type */
 		return (-1);
 	}
 	/* Allocate and zero the new remote-address entry. */
 	net = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_net), struct sctp_nets);
 	if (net == NULL) {
 		return (-1);
 	}
 	SCTP_INCR_RADDR_COUNT();
 	memset(net, 0, sizeof(struct sctp_nets));
 	(void)SCTP_GETTIME_TIMEVAL(&net->start_time);
 	/* sa_len was set to the family's sockaddr size above. */
 	memcpy(&net->ro._l_addr, newaddr, newaddr->sa_len);
 	/* The stored address always carries the association's remote port. */
 	switch (newaddr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		((struct sockaddr_in *)&net->ro._l_addr)->sin_port = stcb->rport;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		((struct sockaddr_in6 *)&net->ro._l_addr)->sin6_port = stcb->rport;
 		break;
 #endif
 	default:
 		break;
 	}
 	net->addr_is_local = sctp_is_address_on_local_host(newaddr, stcb->asoc.vrf_id);
 	/*
 	 * A peer address that lives on this host widens the scope to
 	 * loopback and is always treated as in scope.
 	 */
 	if (net->addr_is_local && ((set_scope || (from == SCTP_ADDR_IS_CONFIRMED)))) {
 		stcb->asoc.scope.loopback_scope = 1;
 		stcb->asoc.scope.ipv4_local_scope = 1;
 		stcb->asoc.scope.local_scope = 0;
 		stcb->asoc.scope.site_scope = 1;
 		addr_inscope = 1;
 	}
 	net->failure_threshold = stcb->asoc.def_net_failure;
 	net->pf_threshold = stcb->asoc.def_net_pf_threshold;
 	if (addr_inscope == 0) {
 		net->dest_state = (SCTP_ADDR_REACHABLE |
 		    SCTP_ADDR_OUT_OF_SCOPE);
 	} else {
 		if (from == SCTP_ADDR_IS_CONFIRMED)
 			/* SCTP_ADDR_IS_CONFIRMED is passed by connect_x */
 			net->dest_state = SCTP_ADDR_REACHABLE;
 		else
 			net->dest_state = SCTP_ADDR_REACHABLE |
 			    SCTP_ADDR_UNCONFIRMED;
 	}
 	/*
 	 * We set RTO to 0, the timer code knows that this means its an
 	 * initial value
 	 */
 	net->rto_needed = 1;
 	net->RTO = 0;
 	net->RTO_measured = 0;
 	stcb->asoc.numnets++;
 	net->ref_count = 1;
 	net->cwr_window_tsn = net->last_cwr_tsn = stcb->asoc.sending_seq - 1;
 	net->port = port;
 	net->dscp = stcb->asoc.default_dscp;
 #ifdef INET6
 	net->flowlabel = stcb->asoc.default_flowlabel;
 #endif
 	/* Inherit the heartbeat and PMTU-discovery settings. */
 	if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) {
 		net->dest_state |= SCTP_ADDR_NOHB;
 	} else {
 		net->dest_state &= ~SCTP_ADDR_NOHB;
 	}
 	if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD)) {
 		net->dest_state |= SCTP_ADDR_NO_PMTUD;
 	} else {
 		net->dest_state &= ~SCTP_ADDR_NO_PMTUD;
 	}
 	net->heart_beat_delay = stcb->asoc.heart_beat_delay;
 	/* Init the timer structure */
 	SCTP_OS_TIMER_INIT(&net->rxt_timer.timer);
 	SCTP_OS_TIMER_INIT(&net->pmtu_timer.timer);
 	SCTP_OS_TIMER_INIT(&net->hb_timer.timer);
 
 	/* Now generate a route for this guy */
 #ifdef INET6
 	/* KAME hack: embed scopeid */
 	if (newaddr->sa_family == AF_INET6) {
 		struct sockaddr_in6 *sin6;
 
 		sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
 		(void)sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone));
 		sin6->sin6_scope_id = 0;
 	}
 #endif
 	SCTP_RTALLOC((sctp_route_t *)&net->ro,
 	    stcb->asoc.vrf_id,
 	    stcb->sctp_ep->fibnum);
 
 	net->src_addr_selected = 0;
 	if (SCTP_ROUTE_HAS_VALID_IFN(&net->ro)) {
 		/* Get source address */
 		net->ro._s_addr = sctp_source_address_selection(stcb->sctp_ep,
 		    stcb,
 		    (sctp_route_t *)&net->ro,
 		    net,
 		    0,
 		    stcb->asoc.vrf_id);
 		if (stcb->asoc.default_mtu > 0) {
 			/*
 			 * A configured default MTU wins; the per-family
 			 * (and UDP encapsulation) overhead is added to it.
 			 */
 			net->mtu = stcb->asoc.default_mtu;
 			switch (net->ro._l_addr.sa.sa_family) {
 #ifdef INET
 			case AF_INET:
 				net->mtu += SCTP_MIN_V4_OVERHEAD;
 				break;
 #endif
 #ifdef INET6
 			case AF_INET6:
 				net->mtu += SCTP_MIN_OVERHEAD;
 				break;
 #endif
 			default:
 				break;
 			}
 #if defined(INET) || defined(INET6)
 			if (net->port) {
 				net->mtu += (uint32_t)sizeof(struct udphdr);
 			}
 #endif
 		} else if (net->ro._s_addr != NULL) {
 			uint32_t imtu, rmtu, hcmtu;
 
 			net->src_addr_selected = 1;
 			/* Now get the interface MTU */
 			if (net->ro._s_addr->ifn_p != NULL) {
 				imtu = SCTP_GATHER_MTU_FROM_INTFC(net->ro._s_addr->ifn_p);
 			} else {
 				imtu = 0;
 			}
 			/* Combine interface, route and host-cache MTUs. */
 			rmtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, net->ro.ro_nh);
 			hcmtu = sctp_hc_get_mtu(&net->ro._l_addr, stcb->sctp_ep->fibnum);
 			net->mtu = sctp_min_mtu(hcmtu, rmtu, imtu);
 		}
 	}
 	/* No MTU from the route: fall back to configured or fixed defaults. */
 	if (net->mtu == 0) {
 		if (stcb->asoc.default_mtu > 0) {
 			net->mtu = stcb->asoc.default_mtu;
 			switch (net->ro._l_addr.sa.sa_family) {
 #ifdef INET
 			case AF_INET:
 				net->mtu += SCTP_MIN_V4_OVERHEAD;
 				break;
 #endif
 #ifdef INET6
 			case AF_INET6:
 				net->mtu += SCTP_MIN_OVERHEAD;
 				break;
 #endif
 			default:
 				break;
 			}
 #if defined(INET) || defined(INET6)
 			if (net->port) {
 				net->mtu += (uint32_t)sizeof(struct udphdr);
 			}
 #endif
 		} else {
 			switch (newaddr->sa_family) {
 #ifdef INET
 			case AF_INET:
 				net->mtu = SCTP_DEFAULT_MTU;
 				break;
 #endif
 #ifdef INET6
 			case AF_INET6:
 				net->mtu = 1280;
 				break;
 #endif
 			default:
 				break;
 			}
 		}
 	}
 #if defined(INET) || defined(INET6)
 	/* With a non-zero port the UDP header is taken out of the MTU. */
 	if (net->port) {
 		net->mtu -= (uint32_t)sizeof(struct udphdr);
 	}
 #endif
 	if (from == SCTP_ALLOC_ASOC) {
 		stcb->asoc.smallest_mtu = net->mtu;
 	}
 	if (stcb->asoc.smallest_mtu > net->mtu) {
 		sctp_pathmtu_adjustment(stcb, net->mtu);
 	}
 #ifdef INET6
 	/* Undo the scope embedding done before the route lookup. */
 	if (newaddr->sa_family == AF_INET6) {
 		struct sockaddr_in6 *sin6;
 
 		sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
 		(void)sa6_recoverscope(sin6);
 	}
 #endif
 
 	/* JRS - Use the congestion control given in the CC module */
 	if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL)
 		(*stcb->asoc.cc_functions.sctp_set_initial_cc_param) (stcb, net);
 
 	/*
 	 * CMT: CUC algo - set find_pseudo_cumack to TRUE (1) at beginning
 	 * of assoc (2005/06/27, iyengar@cis.udel.edu)
 	 */
 	net->find_pseudo_cumack = 1;
 	net->find_rtx_pseudo_cumack = 1;
 	/* Choose an initial flowid. */
 	net->flowid = stcb->asoc.my_vtag ^
 	    ntohs(stcb->rport) ^
 	    ntohs(stcb->sctp_ep->sctp_lport);
 	net->flowtype = M_HASHTYPE_OPAQUE_HASH;
 	if (netp) {
 		*netp = net;
 	}
 	/*
 	 * Insert the new entry into the association's address list; the
 	 * position depends on whether it and the current head have a route
 	 * and on which interface those routes use.
 	 */
 	netfirst = TAILQ_FIRST(&stcb->asoc.nets);
 	if (net->ro.ro_nh == NULL) {
 		/* Since we have no route put it at the back */
 		TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next);
 	} else if (netfirst == NULL) {
 		/* We are the first one in the pool. */
 		TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next);
 	} else if (netfirst->ro.ro_nh == NULL) {
 		/*
 		 * First one has NO route. Place this one ahead of the first
 		 * one.
 		 */
 		TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next);
 	} else if (net->ro.ro_nh->nh_ifp != netfirst->ro.ro_nh->nh_ifp) {
 		/*
 		 * This one has a different interface than the one at the
 		 * top of the list. Place it ahead.
 		 */
 		TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next);
 	} else {
 		/*
 		 * Ok we have the same interface as the first one. Move
 		 * forward until we find either a) one with a NULL route...
 		 * insert ahead of that b) one with a different ifp.. insert
 		 * after that. c) end of the list.. insert at the tail.
 		 */
 		struct sctp_nets *netlook;
 
 		do {
 			netlook = TAILQ_NEXT(netfirst, sctp_next);
 			if (netlook == NULL) {
 				/* End of the list */
 				TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next);
 				break;
 			} else if (netlook->ro.ro_nh == NULL) {
 				/* next one has NO route */
 				/*
 				 * NOTE(review): the description above says
 				 * "insert ahead of that" (netlook), but the
 				 * entry is inserted before netfirst, i.e.
 				 * one position earlier - confirm which is
 				 * intended.
 				 */
 				TAILQ_INSERT_BEFORE(netfirst, net, sctp_next);
 				break;
 			} else if (netlook->ro.ro_nh->nh_ifp != net->ro.ro_nh->nh_ifp) {
 				TAILQ_INSERT_AFTER(&stcb->asoc.nets, netlook,
 				    net, sctp_next);
 				break;
 			}
 			/* Shift forward */
 			netfirst = netlook;
 		} while (netlook != NULL);
 	}
 
 	/* got to have a primary set */
 	if (stcb->asoc.primary_destination == 0) {
 		stcb->asoc.primary_destination = net;
 	} else if ((stcb->asoc.primary_destination->ro.ro_nh == NULL) &&
 		    (net->ro.ro_nh) &&
 	    ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0)) {
 		/* No route to current primary adopt new primary */
 		stcb->asoc.primary_destination = net;
 	}
 	/* Validate primary is first */
 	net = TAILQ_FIRST(&stcb->asoc.nets);
 	if ((net != stcb->asoc.primary_destination) &&
 	    (stcb->asoc.primary_destination)) {
 		/*
 		 * first one on the list is NOT the primary sctp_cmpaddr()
 		 * is much more efficient if the primary is the first on the
 		 * list, make it so.
 		 */
 		TAILQ_REMOVE(&stcb->asoc.nets,
 		    stcb->asoc.primary_destination, sctp_next);
 		TAILQ_INSERT_HEAD(&stcb->asoc.nets,
 		    stcb->asoc.primary_destination, sctp_next);
 	}
 	return (0);
 }
 
 
 static uint32_t
 sctp_aloc_a_assoc_id(struct sctp_inpcb *inp, struct sctp_tcb *stcb)
 {
 	uint32_t id;
 	struct sctpasochead *head;
 	struct sctp_tcb *lstcb;
 
 try_again:
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
 		/* TSNH */
 		return (0);
 	}
 	/*
 	 * We don't allow assoc id to be one of SCTP_FUTURE_ASSOC,
 	 * SCTP_CURRENT_ASSOC and SCTP_ALL_ASSOC.
 	 */
 	if (inp->sctp_associd_counter <= SCTP_ALL_ASSOC) {
 		inp->sctp_associd_counter = SCTP_ALL_ASSOC + 1;
 	}
 	id = inp->sctp_associd_counter;
 	inp->sctp_associd_counter++;
 	lstcb = sctp_findasoc_ep_asocid_locked(inp, (sctp_assoc_t)id, 0);
 	if (lstcb) {
 		goto try_again;
 	}
 	head = &inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(id, inp->hashasocidmark)];
 	LIST_INSERT_HEAD(head, stcb, sctp_tcbasocidhash);
 	stcb->asoc.in_asocid_hash = 1;
 	return (id);
 }
 
 /*
  * allocate an association and add it to the endpoint. The caller must be
  * careful to add all additional addresses once they are know right away or
  * else the assoc will be may experience a blackout scenario.
  */
 struct sctp_tcb *
 sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr,
     int *error, uint32_t override_tag, uint32_t vrf_id,
     uint16_t o_streams, uint16_t port,
     struct thread *p,
     int initialize_auth_params)
 {
 	/* note the p argument is only valid in unbound sockets */
 
 	struct sctp_tcb *stcb;
 	struct sctp_association *asoc;
 	struct sctpasochead *head;
 	uint16_t rport;
 	int err;
 
 	/*
 	 * Assumption made here: Caller has done a
 	 * sctp_findassociation_ep_addr(ep, addr's); to make sure the
 	 * address does not exist already.
 	 */
 	if (SCTP_BASE_INFO(ipi_count_asoc) >= SCTP_MAX_NUM_OF_ASOC) {
 		/* Hit max assoc, sorry no more */
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS);
 		*error = ENOBUFS;
 		return (NULL);
 	}
 	if (firstaddr == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
 		*error = EINVAL;
 		return (NULL);
 	}
 	SCTP_INP_RLOCK(inp);
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) &&
 	    ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED))) {
 		/*
 		 * If its in the TCP pool, its NOT allowed to create an
 		 * association. The parent listener needs to call
 		 * sctp_aloc_assoc.. or the one-2-many socket. If a peeled
 		 * off, or connected one does this.. its an error.
 		 */
 		SCTP_INP_RUNLOCK(inp);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
 		*error = EINVAL;
 		return (NULL);
 	}
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE)) {
 		if ((inp->sctp_flags & SCTP_PCB_FLAGS_WAS_CONNECTED) ||
 		    (inp->sctp_flags & SCTP_PCB_FLAGS_WAS_ABORTED)) {
 			SCTP_INP_RUNLOCK(inp);
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
 			*error = EINVAL;
 			return (NULL);
 		}
 	}
 	SCTPDBG(SCTP_DEBUG_PCB3, "Allocate an association for peer:");
 #ifdef SCTP_DEBUG
 	if (firstaddr) {
 		SCTPDBG_ADDR(SCTP_DEBUG_PCB3, firstaddr);
 		switch (firstaddr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			SCTPDBG(SCTP_DEBUG_PCB3, "Port:%d\n",
 			    ntohs(((struct sockaddr_in *)firstaddr)->sin_port));
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			SCTPDBG(SCTP_DEBUG_PCB3, "Port:%d\n",
 			    ntohs(((struct sockaddr_in6 *)firstaddr)->sin6_port));
 			break;
 #endif
 		default:
 			break;
 		}
 	} else {
 		SCTPDBG(SCTP_DEBUG_PCB3, "None\n");
 	}
 #endif				/* SCTP_DEBUG */
 	switch (firstaddr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		{
 			struct sockaddr_in *sin;
 
 			sin = (struct sockaddr_in *)firstaddr;
 			if ((ntohs(sin->sin_port) == 0) ||
 			    (sin->sin_addr.s_addr == INADDR_ANY) ||
 			    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
 			    IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
 				/* Invalid address */
 				SCTP_INP_RUNLOCK(inp);
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
 				*error = EINVAL;
 				return (NULL);
 			}
 			rport = sin->sin_port;
 			break;
 		}
 #endif
 #ifdef INET6
 	case AF_INET6:
 		{
 			struct sockaddr_in6 *sin6;
 
 			sin6 = (struct sockaddr_in6 *)firstaddr;
 			if ((ntohs(sin6->sin6_port) == 0) ||
 			    IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
 			    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
 				/* Invalid address */
 				SCTP_INP_RUNLOCK(inp);
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
 				*error = EINVAL;
 				return (NULL);
 			}
 			rport = sin6->sin6_port;
 			break;
 		}
 #endif
 	default:
 		/* not supported family type */
 		SCTP_INP_RUNLOCK(inp);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
 		*error = EINVAL;
 		return (NULL);
 	}
 	SCTP_INP_RUNLOCK(inp);
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) {
 		/*
 		 * If you have not performed a bind, then we need to do the
 		 * ephemeral bind for you.
 		 */
 		if ((err = sctp_inpcb_bind(inp->sctp_socket, NULL, NULL, p))) {
 			/* bind error, probably perm */
 			*error = err;
 			return (NULL);
 		}
 	}
 	stcb = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_asoc), struct sctp_tcb);
 	if (stcb == NULL) {
 		/* out of memory? */
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOMEM);
 		*error = ENOMEM;
 		return (NULL);
 	}
 	SCTP_INCR_ASOC_COUNT();
 
 	memset(stcb, 0, sizeof(*stcb));
 	asoc = &stcb->asoc;
 
 	SCTP_TCB_LOCK_INIT(stcb);
 	SCTP_TCB_SEND_LOCK_INIT(stcb);
 	stcb->rport = rport;
 	/* setup back pointer's */
 	stcb->sctp_ep = inp;
 	stcb->sctp_socket = inp->sctp_socket;
 	if ((err = sctp_init_asoc(inp, stcb, override_tag, vrf_id, o_streams))) {
 		/* failed */
 		SCTP_TCB_LOCK_DESTROY(stcb);
 		SCTP_TCB_SEND_LOCK_DESTROY(stcb);
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb);
 		SCTP_DECR_ASOC_COUNT();
 		*error = err;
 		return (NULL);
 	}
 	/* and the port */
 	SCTP_INP_INFO_WLOCK();
 	SCTP_INP_WLOCK(inp);
 	if (inp->sctp_flags & (SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_SOCKET_ALLGONE)) {
 		/* inpcb freed while alloc going on */
 		SCTP_TCB_LOCK_DESTROY(stcb);
 		SCTP_TCB_SEND_LOCK_DESTROY(stcb);
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb);
 		SCTP_INP_WUNLOCK(inp);
 		SCTP_INP_INFO_WUNLOCK();
 		SCTP_DECR_ASOC_COUNT();
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
 		*error = EINVAL;
 		return (NULL);
 	}
 	SCTP_TCB_LOCK(stcb);
 
 	asoc->assoc_id = sctp_aloc_a_assoc_id(inp, stcb);
 	/* now that my_vtag is set, add it to the hash */
 	head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))];
 	/* put it in the bucket in the vtag hash of assoc's for the system */
 	LIST_INSERT_HEAD(head, stcb, sctp_asocs);
 	SCTP_INP_INFO_WUNLOCK();
 
 	if ((err = sctp_add_remote_addr(stcb, firstaddr, NULL, port, SCTP_DO_SETSCOPE, SCTP_ALLOC_ASOC))) {
 		/* failure.. memory error? */
 		if (asoc->strmout) {
 			SCTP_FREE(asoc->strmout, SCTP_M_STRMO);
 			asoc->strmout = NULL;
 		}
 		if (asoc->mapping_array) {
 			SCTP_FREE(asoc->mapping_array, SCTP_M_MAP);
 			asoc->mapping_array = NULL;
 		}
 		if (asoc->nr_mapping_array) {
 			SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP);
 			asoc->nr_mapping_array = NULL;
 		}
 		SCTP_DECR_ASOC_COUNT();
 		SCTP_TCB_UNLOCK(stcb);
 		SCTP_TCB_LOCK_DESTROY(stcb);
 		SCTP_TCB_SEND_LOCK_DESTROY(stcb);
 		LIST_REMOVE(stcb, sctp_tcbasocidhash);
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb);
 		SCTP_INP_WUNLOCK(inp);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS);
 		*error = ENOBUFS;
 		return (NULL);
 	}
 	/* Init all the timers */
 	SCTP_OS_TIMER_INIT(&asoc->dack_timer.timer);
 	SCTP_OS_TIMER_INIT(&asoc->strreset_timer.timer);
 	SCTP_OS_TIMER_INIT(&asoc->asconf_timer.timer);
 	SCTP_OS_TIMER_INIT(&asoc->shut_guard_timer.timer);
 	SCTP_OS_TIMER_INIT(&asoc->autoclose_timer.timer);
 	SCTP_OS_TIMER_INIT(&asoc->delete_prim_timer.timer);
 
 	LIST_INSERT_HEAD(&inp->sctp_asoc_list, stcb, sctp_tcblist);
 	/* now file the port under the hash as well */
 	if (inp->sctp_tcbhash != NULL) {
 		head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(stcb->rport,
 		    inp->sctp_hashmark)];
 		LIST_INSERT_HEAD(head, stcb, sctp_tcbhash);
 	}
 	if (initialize_auth_params == SCTP_INITIALIZE_AUTH_PARAMS) {
 		sctp_initialize_auth_params(inp, stcb);
 	}
 	SCTP_INP_WUNLOCK(inp);
 	SCTPDBG(SCTP_DEBUG_PCB1, "Association %p now allocated\n", (void *)stcb);
 	return (stcb);
 }
 
 
 /*
  * Unlink a remote transport address (net) from an association and drop the
  * association's reference on it.  Any association field that still points
  * at the net (primary destination, last data/control chunk source,
  * alternate) is redirected or cleared first, and the per-net path-MTU and
  * heartbeat timers are stopped.
  */
 void
 sctp_remove_net(struct sctp_tcb *stcb, struct sctp_nets *net)
 {
 	struct sctp_association *asoc = &stcb->asoc;
 	struct sctp_inpcb *ep = stcb->sctp_ep;
 	struct sctp_nets *first;
 
 	asoc->numnets--;
 	TAILQ_REMOVE(&asoc->nets, net, sctp_next);
 
 	if (asoc->primary_destination == net) {
 		/* The primary is going away; start the search at the new head. */
 		first = TAILQ_FIRST(&asoc->nets);
 		/*
 		 * Mobility adaptation: ideally, a deleted primary becomes a
 		 * fast retransmission trigger for the subsequent SET PRIMARY
 		 * (by micchie), so remember it unless one is already stored.
 		 */
 		if (sctp_is_mobility_feature_on(ep, SCTP_MOBILITY_BASE) ||
 		    sctp_is_mobility_feature_on(ep, SCTP_MOBILITY_FASTHANDOFF)) {
 			SCTPDBG(SCTP_DEBUG_ASCONF1, "remove_net: primary dst is deleting\n");
 			if (asoc->deleted_primary == NULL) {
 				/* Keep the net alive while it is parked here. */
 				asoc->deleted_primary = net;
 				atomic_add_int(&net->ref_count, 1);
 				memset(&net->lastsa, 0, sizeof(net->lastsa));
 				memset(&net->lastsv, 0, sizeof(net->lastsv));
 				sctp_mobility_feature_on(ep,
 				    SCTP_MOBILITY_PRIM_DELETED);
 				sctp_timer_start(SCTP_TIMER_TYPE_PRIM_DELETED,
 				    ep, stcb, NULL);
 			} else {
 				SCTPDBG(SCTP_DEBUG_ASCONF1, "remove_net: deleted primary may be already stored\n");
 			}
 		}
 		/* Try to find a confirmed primary */
 		asoc->primary_destination = sctp_find_alternate_net(stcb, first, 0);
 	}
 	if (asoc->last_data_chunk_from == net) {
 		/* Fall back to whatever is now first in the list. */
 		asoc->last_data_chunk_from = TAILQ_FIRST(&asoc->nets);
 	}
 	if (asoc->last_control_chunk_from == net) {
 		asoc->last_control_chunk_from = NULL;
 	}
 	if (asoc->alternate == net) {
 		/* The alternate pointer carries its own reference. */
 		sctp_free_remote_addr(asoc->alternate);
 		asoc->alternate = NULL;
 	}
 	sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, ep, stcb, net,
 	    SCTP_FROM_SCTP_PCB + SCTP_LOC_9);
 	sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, ep, stcb, net,
 	    SCTP_FROM_SCTP_PCB + SCTP_LOC_10);
 	net->dest_state |= SCTP_ADDR_BEING_DELETED;
 	sctp_free_remote_addr(net);
 }
 
 /*
  * Remove the remote address remaddr from the association.
  *
  * Returns 0 when the matching net was removed, -1 when the address was
  * found but is the only remote address left (an association must keep at
  * least one, so removal is refused), and -2 when no net matches remaddr.
  */
 int
 sctp_del_remote_addr(struct sctp_tcb *stcb, struct sockaddr *remaddr)
 {
 	struct sctp_association *asoc = &stcb->asoc;
 	struct sctp_nets *cur, *following;
 
 	TAILQ_FOREACH_SAFE(cur, &asoc->nets, sctp_next, following) {
 		/* Cheap family check first, full address compare second. */
 		if ((cur->ro._l_addr.sa.sa_family == remaddr->sa_family) &&
 		    sctp_cmpaddr((struct sockaddr *)&cur->ro._l_addr, remaddr)) {
 			if (asoc->numnets < 2) {
 				/* Refuse to delete the last remote address. */
 				return (-1);
 			}
 			sctp_remove_net(stcb, cur);
 			return (0);
 		}
 	}
 	/* No net carries this address. */
 	return (-2);
 }
 
 /*
  * Clear the first entry in the vtag time-wait table that matches the
  * (tag, lport, rport) triple.  Does nothing when no entry matches.
  */
 void
 sctp_delete_from_timewait(uint32_t tag, uint16_t lport, uint16_t rport)
 {
 	struct sctpvtaghead *bucket;
 	struct sctp_tagblock *blk;
 	int slot;
 
 	bucket = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
 	LIST_FOREACH(blk, bucket, sctp_nxt_tagblock) {
 		for (slot = 0; slot < SCTP_NUMBER_IN_VTAG_BLOCK; slot++) {
 			if ((blk->vtag_block[slot].v_tag != tag) ||
 			    (blk->vtag_block[slot].lport != lport) ||
 			    (blk->vtag_block[slot].rport != rport)) {
 				continue;
 			}
 			/* Found it: zero the slot and stop searching. */
 			blk->vtag_block[slot].tv_sec_at_expire = 0;
 			blk->vtag_block[slot].v_tag = 0;
 			blk->vtag_block[slot].lport = 0;
 			blk->vtag_block[slot].rport = 0;
 			return;
 		}
 	}
 }
 
 /*
  * Look up the (tag, lport, rport) triple in the vtag time-wait table.
  * Returns 1 when an entry matches, 0 otherwise.  The INP-INFO write lock
  * is held for the duration of the search.
  */
 int
 sctp_is_in_timewait(uint32_t tag, uint16_t lport, uint16_t rport)
 {
 	struct sctpvtaghead *bucket;
 	struct sctp_tagblock *blk;
 	int hit = 0;
 	int slot;
 
 	SCTP_INP_INFO_WLOCK();
 	bucket = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
 	LIST_FOREACH(blk, bucket, sctp_nxt_tagblock) {
 		for (slot = 0; slot < SCTP_NUMBER_IN_VTAG_BLOCK; slot++) {
 			if ((blk->vtag_block[slot].v_tag == tag) &&
 			    (blk->vtag_block[slot].lport == lport) &&
 			    (blk->vtag_block[slot].rport == rport)) {
 				hit = 1;
 				goto done;
 			}
 		}
 	}
 done:
 	SCTP_INP_INFO_WUNLOCK();
 	return (hit);
 }
 
 
 void
 sctp_add_vtag_to_timewait(uint32_t tag, uint32_t time, uint16_t lport, uint16_t rport)
 {
 	struct sctpvtaghead *chain;
 	struct sctp_tagblock *twait_block;
 	struct timeval now;
 	int set, i;
 
 	if (time == 0) {
 		/* Its disabled */
 		return;
 	}
 	(void)SCTP_GETTIME_TIMEVAL(&now);
 	chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
 	set = 0;
 	LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
 		/* Block(s) present, lets find space, and expire on the fly */
 		for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
 			if ((twait_block->vtag_block[i].v_tag == 0) &&
 			    !set) {
 				twait_block->vtag_block[i].tv_sec_at_expire =
 				    now.tv_sec + time;
 				twait_block->vtag_block[i].v_tag = tag;
 				twait_block->vtag_block[i].lport = lport;
 				twait_block->vtag_block[i].rport = rport;
 				set = 1;
 			} else if ((twait_block->vtag_block[i].v_tag) &&
 			    ((long)twait_block->vtag_block[i].tv_sec_at_expire < now.tv_sec)) {
 				/* Audit expires this guy */
 				twait_block->vtag_block[i].tv_sec_at_expire = 0;
 				twait_block->vtag_block[i].v_tag = 0;
 				twait_block->vtag_block[i].lport = 0;
 				twait_block->vtag_block[i].rport = 0;
 				if (set == 0) {
 					/* Reuse it for my new tag */
 					twait_block->vtag_block[i].tv_sec_at_expire = now.tv_sec + time;
 					twait_block->vtag_block[i].v_tag = tag;
 					twait_block->vtag_block[i].lport = lport;
 					twait_block->vtag_block[i].rport = rport;
 					set = 1;
 				}
 			}
 		}
 		if (set) {
 			/*
 			 * We only do up to the block where we can place our
 			 * tag for audits
 			 */
 			break;
 		}
 	}
 	/* Need to add a new block to chain */
 	if (!set) {
 		SCTP_MALLOC(twait_block, struct sctp_tagblock *,
 		    sizeof(struct sctp_tagblock), SCTP_M_TIMW);
 		if (twait_block == NULL) {
 			return;
 		}
 		memset(twait_block, 0, sizeof(struct sctp_tagblock));
 		LIST_INSERT_HEAD(chain, twait_block, sctp_nxt_tagblock);
 		twait_block->vtag_block[0].tv_sec_at_expire = now.tv_sec + time;
 		twait_block->vtag_block[0].v_tag = tag;
 		twait_block->vtag_block[0].lport = lport;
 		twait_block->vtag_block[0].rport = rport;
 	}
 }
 
 /*
  * Empty one inbound stream queue (rh).  Every queued control entry is
  * unlinked and its reassembly chunks are freed.  Entries that are not also
  * on the read queue (on_read_q == 0) additionally have their source-address
  * reference and data released and are themselves freed; entries still on
  * the read queue are only unlinked from the stream.
  */
 void
 sctp_clean_up_stream(struct sctp_tcb *stcb, struct sctp_readhead *rh)
 {
 	struct sctp_queued_to_read *ctl, *ctl_next;
 	struct sctp_tmit_chunk *frag, *frag_next;
 
 	TAILQ_FOREACH_SAFE(ctl, rh, next_instrm, ctl_next) {
 		TAILQ_REMOVE(rh, ctl, next_instrm);
 		ctl->on_strm_q = 0;
 		if (ctl->on_read_q == 0) {
 			sctp_free_remote_addr(ctl->whoFrom);
 			if (ctl->data != NULL) {
 				sctp_m_freem(ctl->data);
 				ctl->data = NULL;
 			}
 		}
 		/* Release anything still waiting for reassembly. */
 		TAILQ_FOREACH_SAFE(frag, &ctl->reasm, sctp_next, frag_next) {
 			TAILQ_REMOVE(&ctl->reasm, frag, sctp_next);
 			if (frag->data != NULL) {
 				sctp_m_freem(frag->data);
 				frag->data = NULL;
 			}
 			if (frag->holds_key_ref) {
 				sctp_auth_key_release(stcb, frag->auth_keyid, SCTP_SO_LOCKED);
 			}
 			sctp_free_remote_addr(frag->whoTo);
 			SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), frag);
 			SCTP_DECR_CHK_COUNT();
 			/* sa_ignore FREED_MEMORY */
 		}
 		/* The entry itself is only freed when the reader has no claim. */
 		if (ctl->on_read_q == 0) {
 			sctp_free_a_readq(stcb, ctl);
 		}
 	}
 }
 
 /*-
  * Tear down an association and release everything it owns.
  *
  * The caller must pass in a locked TCB.  Arguments:
  *   inp            - endpoint the association belongs to.
  *   stcb           - the association to free.
  *   from_inpcbfree - compared against SCTP_NORMAL_PROC; only in that case
  *                    does this function take and release the INP-INFO and
  *                    INP locks itself and, once the socket is gone, call
  *                    sctp_inpcb_free() at the end.
  *   from_location  - caller tag, stored in stcb->freed_from_where the first
  *                    time through.
  *
  * Returns 0 if the association was NOT destroyed (a reference is still held
  * or it sits in the accept queue); the TCB has been unlocked and, where
  * needed, the ASOCKILL timer has been started so the free is retried.
  * Returns non-zero if the association was destroyed or was already
  * destroyed (asoc.state == 0).
  *
  * NOTE(review): the early "state == 0" return below does not touch the TCB
  * lock -- confirm callers do not rely on it being released on that path.
  */
 int
 sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfree, int from_location)
 {
 	int i;
 	struct sctp_association *asoc;
 	struct sctp_nets *net, *nnet;
 	struct sctp_laddr *laddr, *naddr;
 	struct sctp_tmit_chunk *chk, *nchk;
 	struct sctp_asconf_addr *aparam, *naparam;
 	struct sctp_asconf_ack *aack, *naack;
 	struct sctp_stream_reset_list *strrst, *nstrrst;
 	struct sctp_queued_to_read *sq, *nsq;
 	struct sctp_stream_queue_pending *sp, *nsp;
 	sctp_sharedkey_t *shared_key, *nshared_key;
 	struct socket *so;
 
 	/* first, lets purge the entry from the hash table. */
 
 #ifdef SCTP_LOG_CLOSING
 	sctp_log_closing(inp, stcb, 6);
 #endif
 	if (stcb->asoc.state == 0) {
 #ifdef SCTP_LOG_CLOSING
 		sctp_log_closing(inp, NULL, 7);
 #endif
 		/* there is no asoc, really TSNH :-0 */
 		return (1);
 	}
 	/*
 	 * Hold the send lock from here on; every later return path releases
 	 * it just before releasing the TCB lock.
 	 */
+	SCTP_TCB_SEND_LOCK(stcb);
 	if (stcb->asoc.alternate) {
 		sctp_free_remote_addr(stcb->asoc.alternate);
 		stcb->asoc.alternate = NULL;
 	}
 	/* TEMP CODE */
 	if (stcb->freed_from_where == 0) {
 		/* Only record the first place free happened from */
 		stcb->freed_from_where = from_location;
 	}
 	/* TEMP CODE */
 
 	asoc = &stcb->asoc;
 	/* so stays NULL when the socket is (being) torn down. */
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE))
 		/* nothing around */
 		so = NULL;
 	else
 		so = inp->sctp_socket;
 
 	/*
 	 * We used timer based freeing if a reader or writer is in the way.
 	 * So we first check if we are actually being called from a timer,
 	 * if so we abort early if a reader or writer is still in the way.
 	 */
 	if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) &&
 	    (from_inpcbfree == SCTP_NORMAL_PROC)) {
 		/*
 		 * is it the timer driving us? if so are the reader/writers
 		 * gone?
 		 */
 		if (stcb->asoc.refcnt) {
 			/* nope, reader or writer in the way */
 			sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL);
 			/* no asoc destroyed */
+			SCTP_TCB_SEND_UNLOCK(stcb);
 			SCTP_TCB_UNLOCK(stcb);
 #ifdef SCTP_LOG_CLOSING
 			sctp_log_closing(inp, stcb, 8);
 #endif
 			return (0);
 		}
 	}
 	/* Now clean up any other timers */
 	sctp_stop_association_timers(stcb, false);
 	/* Now the read queue needs to be cleaned up (only once) */
 	if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0) {
 		SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_ABOUT_TO_BE_FREED);
 		SCTP_INP_READ_LOCK(inp);
 		TAILQ_FOREACH(sq, &inp->read_queue, next) {
 			if (sq->stcb == stcb) {
 				/* Readers must not follow sq->stcb any more. */
 				sq->do_not_ref_stcb = 1;
 				sq->sinfo_cumtsn = stcb->asoc.cumulative_tsn;
 				/*
 				 * If there is no end, there never will be
 				 * now.
 				 */
 				if (sq->end_added == 0) {
 					/* Held for PD-API clear that. */
 					sq->pdapi_aborted = 1;
 					sq->held_length = 0;
 					if (sctp_stcb_is_feature_on(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT) && (so != NULL)) {
 						/*
 						 * Need to add a PD-API
 						 * aborted indication.
 						 * Setting the control_pdapi
 						 * assures that it will be
 						 * added right after this
 						 * msg.
 						 */
 						uint32_t strseq;
 
 						stcb->asoc.control_pdapi = sq;
 						/* stream id in the high 16 bits, low 16 bits of the mid below */
 						strseq = (sq->sinfo_stream << 16) | (sq->mid & 0x0000ffff);
 						sctp_ulp_notify(SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION,
 						    stcb,
 						    SCTP_PARTIAL_DELIVERY_ABORTED,
 						    (void *)&strseq,
 						    SCTP_SO_LOCKED);
 						stcb->asoc.control_pdapi = NULL;
 					}
 				}
 				/* Add an end to wake them */
 				sq->end_added = 1;
 			}
 		}
 		SCTP_INP_READ_UNLOCK(inp);
 		if (stcb->block_entry) {
 			SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_PCB, ECONNRESET);
 			stcb->block_entry->error = ECONNRESET;
 			stcb->block_entry = NULL;
 		}
 	}
 	if ((stcb->asoc.refcnt) || (stcb->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE)) {
 		/*
 		 * Someone holds a reference OR the socket is unaccepted
 		 * yet.
 		 */
 		if ((stcb->asoc.refcnt) ||
 		    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
 		    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) {
 			SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE);
 			sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL);
 		}
+		SCTP_TCB_SEND_UNLOCK(stcb);
 		SCTP_TCB_UNLOCK(stcb);
 		if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
 		    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE))
 			/* nothing around */
 			so = NULL;
 		if (so) {
 			/* Wake any reader/writers */
 			sctp_sorwakeup(inp, so);
 			sctp_sowwakeup(inp, so);
 		}
 
 #ifdef SCTP_LOG_CLOSING
 		sctp_log_closing(inp, stcb, 9);
 #endif
 		/* no asoc destroyed */
 		return (0);
 	}
 #ifdef SCTP_LOG_CLOSING
 	sctp_log_closing(inp, stcb, 10);
 #endif
 	/*
 	 * When I reach here, no others want to kill the assoc yet.. and I
 	 * own the lock. Now its possible an abort comes in when I do the
 	 * lock exchange below to grab all the locks to do the final take
 	 * out. to prevent this we increment the count, which will start a
 	 * timer and blow out above thus assuring us that we hold exclusive
 	 * killing of the asoc. Note that after getting back the TCB lock we
 	 * will go ahead and increment the counter back up and stop any
 	 * timer a passing stranger may have started :-S
 	 */
 	if (from_inpcbfree == SCTP_NORMAL_PROC) {
 		atomic_add_int(&stcb->asoc.refcnt, 1);
 
 		/*
 		 * Lock exchange: drop send + TCB, then re-acquire in the
 		 * order INP-INFO, INP, TCB, send.
 		 */
+		SCTP_TCB_SEND_UNLOCK(stcb);
 		SCTP_TCB_UNLOCK(stcb);
 		SCTP_INP_INFO_WLOCK();
 		SCTP_INP_WLOCK(inp);
 		SCTP_TCB_LOCK(stcb);
+		SCTP_TCB_SEND_LOCK(stcb);
 	}
 	/* Double check the GONE flag */
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE))
 		/* nothing around */
 		so = NULL;
 
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
 		/*
 		 * For TCP type we need special handling when we are
 		 * connected. We also include the peel'ed off ones to.
 		 */
 		if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) {
 			inp->sctp_flags &= ~SCTP_PCB_FLAGS_CONNECTED;
 			inp->sctp_flags |= SCTP_PCB_FLAGS_WAS_CONNECTED;
 			if (so) {
 				SOCKBUF_LOCK(&so->so_rcv);
 				so->so_state &= ~(SS_ISCONNECTING |
 				    SS_ISDISCONNECTING |
 				    SS_ISCONFIRMING |
 				    SS_ISCONNECTED);
 				so->so_state |= SS_ISDISCONNECTED;
 				socantrcvmore_locked(so);
 				socantsendmore(so);
 				sctp_sowwakeup(inp, so);
 				sctp_sorwakeup(inp, so);
 				SCTP_SOWAKEUP(so);
 			}
 		}
 	}
 
 	/*
 	 * Make it invalid too, that way if its about to run it will abort
 	 * and return.
 	 */
 	/* drop the extra reference taken before the lock exchange */
 	if (from_inpcbfree == SCTP_NORMAL_PROC) {
 		atomic_add_int(&stcb->asoc.refcnt, -1);
 	}
 	/* Someone else took a reference meanwhile: defer to the kill timer. */
 	if (stcb->asoc.refcnt) {
 		SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE);
 		sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL);
 		if (from_inpcbfree == SCTP_NORMAL_PROC) {
 			SCTP_INP_INFO_WUNLOCK();
 			SCTP_INP_WUNLOCK(inp);
 		}
+		SCTP_TCB_SEND_UNLOCK(stcb);
 		SCTP_TCB_UNLOCK(stcb);
 		return (0);
 	}
 	/*
 	 * Point of no return.  A state of 0 is what the check at the top of
 	 * this function treats as "already destroyed".
 	 */
 	asoc->state = 0;
 	if (inp->sctp_tcbhash) {
 		LIST_REMOVE(stcb, sctp_tcbhash);
 	}
 	if (stcb->asoc.in_asocid_hash) {
 		LIST_REMOVE(stcb, sctp_tcbasocidhash);
 	}
 	/* Now lets remove it from the list of ALL associations in the EP */
 	LIST_REMOVE(stcb, sctp_tcblist);
 	if (from_inpcbfree == SCTP_NORMAL_PROC) {
 		/*
 		 * Take a reference on the inp before giving up its write
 		 * lock; it is dropped by SCTP_INP_DECR_REF() at the end.
 		 */
 		SCTP_INP_INCR_REF(inp);
 		SCTP_INP_WUNLOCK(inp);
 	}
 	/* pull from vtag hash */
 	LIST_REMOVE(stcb, sctp_asocs);
 	sctp_add_vtag_to_timewait(asoc->my_vtag, SCTP_BASE_SYSCTL(sctp_vtag_time_wait),
 	    inp->sctp_lport, stcb->rport);
 
 	/*
 	 * Now restop the timers to be sure this is paranoia at its finest!
 	 */
 	sctp_stop_association_timers(stcb, true);
 
 	/*
 	 * The chunk lists and such SHOULD be empty but we check them just
 	 * in case.
 	 */
 	/* anything on the wheel needs to be removed */
-	SCTP_TCB_SEND_LOCK(stcb);
 	for (i = 0; i < asoc->streamoutcnt; i++) {
 		struct sctp_stream_out *outs;
 
 		outs = &asoc->strmout[i];
 		/* now clean up any chunks here */
 		TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) {
 			atomic_subtract_int(&asoc->stream_queue_cnt, 1);
 			TAILQ_REMOVE(&outs->outqueue, sp, next);
 			stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, 1);
 			sctp_free_spbufspace(stcb, asoc, sp);
 			if (sp->data) {
 				if (so) {
 					/* Still an open socket - report */
 					sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb,
 					    0, (void *)sp, SCTP_SO_LOCKED);
 				}
 				/* re-checked after the notify call above */
 				if (sp->data) {
 					sctp_m_freem(sp->data);
 					sp->data = NULL;
 					sp->tail_mbuf = NULL;
 					sp->length = 0;
 				}
 			}
 			if (sp->net) {
 				sctp_free_remote_addr(sp->net);
 				sp->net = NULL;
 			}
 			sctp_free_a_strmoq(stcb, sp, SCTP_SO_LOCKED);
 		}
 	}
-	SCTP_TCB_SEND_UNLOCK(stcb);
 	/* sa_ignore FREED_MEMORY */
 	TAILQ_FOREACH_SAFE(strrst, &asoc->resetHead, next_resp, nstrrst) {
 		TAILQ_REMOVE(&asoc->resetHead, strrst, next_resp);
 		SCTP_FREE(strrst, SCTP_M_STRESET);
 	}
 	TAILQ_FOREACH_SAFE(sq, &asoc->pending_reply_queue, next, nsq) {
 		TAILQ_REMOVE(&asoc->pending_reply_queue, sq, next);
 		if (sq->data) {
 			sctp_m_freem(sq->data);
 			sq->data = NULL;
 		}
 		sctp_free_remote_addr(sq->whoFrom);
 		sq->whoFrom = NULL;
 		sq->stcb = NULL;
 		/* Free the ctl entry */
 		sctp_free_a_readq(stcb, sq);
 		/* sa_ignore FREED_MEMORY */
 	}
 	TAILQ_FOREACH_SAFE(chk, &asoc->free_chunks, sctp_next, nchk) {
 		TAILQ_REMOVE(&asoc->free_chunks, chk, sctp_next);
 		if (chk->data) {
 			sctp_m_freem(chk->data);
 			chk->data = NULL;
 		}
 		if (chk->holds_key_ref)
 			sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED);
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
 		SCTP_DECR_CHK_COUNT();
 		atomic_subtract_int(&SCTP_BASE_INFO(ipi_free_chunks), 1);
 		asoc->free_chunk_cnt--;
 		/* sa_ignore FREED_MEMORY */
 	}
 	/* pending send queue SHOULD be empty */
 	TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) {
 		if (asoc->strmout[chk->rec.data.sid].chunks_on_queues > 0) {
 			asoc->strmout[chk->rec.data.sid].chunks_on_queues--;
 #ifdef INVARIANTS
 		} else {
 			panic("No chunks on the queues for sid %u.", chk->rec.data.sid);
 #endif
 		}
 		TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next);
 		if (chk->data) {
 			if (so) {
 				/* Still a socket? */
 				sctp_ulp_notify(SCTP_NOTIFY_UNSENT_DG_FAIL, stcb,
 				    0, chk, SCTP_SO_LOCKED);
 			}
 			if (chk->data) {
 				sctp_m_freem(chk->data);
 				chk->data = NULL;
 			}
 		}
 		if (chk->holds_key_ref)
 			sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED);
 		if (chk->whoTo) {
 			sctp_free_remote_addr(chk->whoTo);
 			chk->whoTo = NULL;
 		}
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
 		SCTP_DECR_CHK_COUNT();
 		/* sa_ignore FREED_MEMORY */
 	}
 	/* sent queue SHOULD be empty */
 	TAILQ_FOREACH_SAFE(chk, &asoc->sent_queue, sctp_next, nchk) {
 		/* NR-acked chunks are skipped for the per-stream count. */
 		if (chk->sent != SCTP_DATAGRAM_NR_ACKED) {
 			if (asoc->strmout[chk->rec.data.sid].chunks_on_queues > 0) {
 				asoc->strmout[chk->rec.data.sid].chunks_on_queues--;
 #ifdef INVARIANTS
 			} else {
 				panic("No chunks on the queues for sid %u.", chk->rec.data.sid);
 #endif
 			}
 		}
 		TAILQ_REMOVE(&asoc->sent_queue, chk, sctp_next);
 		if (chk->data) {
 			if (so) {
 				/* Still a socket? */
 				sctp_ulp_notify(SCTP_NOTIFY_SENT_DG_FAIL, stcb,
 				    0, chk, SCTP_SO_LOCKED);
 			}
 			if (chk->data) {
 				sctp_m_freem(chk->data);
 				chk->data = NULL;
 			}
 		}
 		if (chk->holds_key_ref)
 			sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED);
 		sctp_free_remote_addr(chk->whoTo);
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
 		SCTP_DECR_CHK_COUNT();
 		/* sa_ignore FREED_MEMORY */
 	}
 #ifdef INVARIANTS
 	/* Both data queues are drained; no stream may still count chunks. */
 	for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
 		if (stcb->asoc.strmout[i].chunks_on_queues > 0) {
 			panic("%u chunks left for stream %u.", stcb->asoc.strmout[i].chunks_on_queues, i);
 		}
 	}
 #endif
 	/* control queue MAY not be empty */
 	TAILQ_FOREACH_SAFE(chk, &asoc->control_send_queue, sctp_next, nchk) {
 		TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next);
 		if (chk->data) {
 			sctp_m_freem(chk->data);
 			chk->data = NULL;
 		}
 		if (chk->holds_key_ref)
 			sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED);
 		sctp_free_remote_addr(chk->whoTo);
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
 		SCTP_DECR_CHK_COUNT();
 		/* sa_ignore FREED_MEMORY */
 	}
 	/* ASCONF queue MAY not be empty */
 	TAILQ_FOREACH_SAFE(chk, &asoc->asconf_send_queue, sctp_next, nchk) {
 		TAILQ_REMOVE(&asoc->asconf_send_queue, chk, sctp_next);
 		if (chk->data) {
 			sctp_m_freem(chk->data);
 			chk->data = NULL;
 		}
 		if (chk->holds_key_ref)
 			sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED);
 		sctp_free_remote_addr(chk->whoTo);
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
 		SCTP_DECR_CHK_COUNT();
 		/* sa_ignore FREED_MEMORY */
 	}
 	if (asoc->mapping_array) {
 		SCTP_FREE(asoc->mapping_array, SCTP_M_MAP);
 		asoc->mapping_array = NULL;
 	}
 	if (asoc->nr_mapping_array) {
 		SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP);
 		asoc->nr_mapping_array = NULL;
 	}
 	/* the stream outs */
 	if (asoc->strmout) {
 		SCTP_FREE(asoc->strmout, SCTP_M_STRMO);
 		asoc->strmout = NULL;
 	}
 	asoc->strm_realoutsize = asoc->streamoutcnt = 0;
 	/* the stream ins: both the ordered and the unordered queue */
 	if (asoc->strmin) {
 		for (i = 0; i < asoc->streamincnt; i++) {
 			sctp_clean_up_stream(stcb, &asoc->strmin[i].inqueue);
 			sctp_clean_up_stream(stcb, &asoc->strmin[i].uno_inqueue);
 		}
 		SCTP_FREE(asoc->strmin, SCTP_M_STRMI);
 		asoc->strmin = NULL;
 	}
 	asoc->streamincnt = 0;
 	/* drop the association's reference on every remote address */
 	TAILQ_FOREACH_SAFE(net, &asoc->nets, sctp_next, nnet) {
 #ifdef INVARIANTS
 		if (SCTP_BASE_INFO(ipi_count_raddr) == 0) {
 			panic("no net's left alloc'ed, or list points to itself");
 		}
 #endif
 		TAILQ_REMOVE(&asoc->nets, net, sctp_next);
 		sctp_free_remote_addr(net);
 	}
 	LIST_FOREACH_SAFE(laddr, &asoc->sctp_restricted_addrs, sctp_nxt_addr, naddr) {
 		/* sa_ignore FREED_MEMORY */
 		sctp_remove_laddr(laddr);
 	}
 
 	/* pending asconf (address) parameters */
 	TAILQ_FOREACH_SAFE(aparam, &asoc->asconf_queue, next, naparam) {
 		/* sa_ignore FREED_MEMORY */
 		TAILQ_REMOVE(&asoc->asconf_queue, aparam, next);
 		SCTP_FREE(aparam, SCTP_M_ASC_ADDR);
 	}
 	TAILQ_FOREACH_SAFE(aack, &asoc->asconf_ack_sent, next, naack) {
 		/* sa_ignore FREED_MEMORY */
 		TAILQ_REMOVE(&asoc->asconf_ack_sent, aack, next);
 		if (aack->data != NULL) {
 			sctp_m_freem(aack->data);
 		}
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asconf_ack), aack);
 	}
 	/* clean up auth stuff */
 	if (asoc->local_hmacs)
 		sctp_free_hmaclist(asoc->local_hmacs);
 	if (asoc->peer_hmacs)
 		sctp_free_hmaclist(asoc->peer_hmacs);
 
 	if (asoc->local_auth_chunks)
 		sctp_free_chunklist(asoc->local_auth_chunks);
 	if (asoc->peer_auth_chunks)
 		sctp_free_chunklist(asoc->peer_auth_chunks);
 
 	sctp_free_authinfo(&asoc->authinfo);
 
 	LIST_FOREACH_SAFE(shared_key, &asoc->shared_keys, next, nshared_key) {
 		LIST_REMOVE(shared_key, next);
 		sctp_free_sharedkey(shared_key);
 		/* sa_ignore FREED_MEMORY */
 	}
 
 	/* Insert new items here :> */
 
 	/* Get rid of LOCK */
+	SCTP_TCB_SEND_UNLOCK(stcb);
 	SCTP_TCB_UNLOCK(stcb);
 	SCTP_TCB_LOCK_DESTROY(stcb);
 	SCTP_TCB_SEND_LOCK_DESTROY(stcb);
 	if (from_inpcbfree == SCTP_NORMAL_PROC) {
 		SCTP_INP_INFO_WUNLOCK();
 		SCTP_INP_RLOCK(inp);
 	}
 #ifdef SCTP_TRACK_FREED_ASOCS
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
 		/* now clean up the tasoc itself */
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb);
 		SCTP_DECR_ASOC_COUNT();
 	} else {
 		/* keep the dead TCB on the endpoint's free list for tracking */
 		LIST_INSERT_HEAD(&inp->sctp_asoc_free_list, stcb, sctp_tcblist);
 	}
 #else
 	SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb);
 	SCTP_DECR_ASOC_COUNT();
 #endif
 	if (from_inpcbfree == SCTP_NORMAL_PROC) {
 		if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
 			/*
 			 * If its NOT the inp_free calling us AND sctp_close
 			 * has been called, we call back...
 			 */
 			SCTP_INP_RUNLOCK(inp);
 			/*
 			 * This will start the kill timer (if we are the
 			 * last one) since we hold an increment yet. But
 			 * this is the only safe way to do this since
 			 * otherwise if the socket closes at the same time
 			 * we are here we might collide in the cleanup.
 			 */
 			sctp_inpcb_free(inp,
 			    SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE,
 			    SCTP_CALLED_DIRECTLY_NOCMPSET);
 			SCTP_INP_DECR_REF(inp);
 		} else {
 			/* The socket is still open. */
 			SCTP_INP_DECR_REF(inp);
 			SCTP_INP_RUNLOCK(inp);
 		}
 	}
 	/* destroyed the asoc */
 #ifdef SCTP_LOG_CLOSING
 	sctp_log_closing(inp, NULL, 11);
 #endif
 	return (1);
 }
 
 
 
 /*
  * Decide whether destaddr is "reachable" given the address families bound
  * to the association's endpoint (e.g. only v4 or only v6 bound).
  *
  * Returns non-zero when reachable: always 1 for a bound-all endpoint,
  * otherwise the endpoint's INP_IPV6 / INP_IPV4 vflag bit for the
  * destination's family.  Returns 0 for any other address family.
  *
  * No locks are taken here; only sctp_flags and inp_vflag are read.
  * Callers are expected to already hold the TCB lock (not verified here).
  *
  * FIX: if we allow assoc-level bindx(), then this needs to be fixed to use
  * assoc level v4/v6 flags, as the assoc *may* not have the same address
  * types bound as its endpoint
  */
 int
 sctp_destination_is_reachable(struct sctp_tcb *stcb, struct sockaddr *destaddr)
 {
 	struct sctp_inpcb *ep = stcb->sctp_ep;
 
 	if (ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
 		/*
 		 * Bound-all: the destination is not restricted.
 		 * RRS: open question from the lock work -- should a
 		 * bound-all endpoint still obey the V4/V6 flags?
 		 */
 		return (1);
 	}
 	/* NOTE: all "scope" checks are done when local addresses are added */
 	switch (destaddr->sa_family) {
 #ifdef INET6
 	case AF_INET6:
 		return (ep->ip_inp.inp.inp_vflag & INP_IPV6);
 #endif
 #ifdef INET
 	case AF_INET:
 		return (ep->ip_inp.inp.inp_vflag & INP_IPV4);
 #endif
 	default:
 		/* invalid family, so it's unreachable */
 		return (0);
 	}
 }
 
 /*
  * Recompute the endpoint's inp_vflag from its bound address list: the
  * flag is cleared and then INP_IPV6 / INP_IPV4 is set for every listed
  * address of that family.  Entries without an ifa and addresses marked
  * SCTP_BEING_DELETED do not contribute.
  */
 static void
 sctp_update_ep_vflag(struct sctp_inpcb *inp)
 {
 	struct sctp_laddr *entry;
 
 	/* Start from a clean slate. */
 	inp->ip_inp.inp.inp_vflag = 0;
 	LIST_FOREACH(entry, &inp->sctp_addr_list, sctp_nxt_addr) {
 		if (entry->ifa == NULL) {
 			SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n",
 			    __func__);
 			continue;
 		}
 		if ((entry->ifa->localifa_flags & SCTP_BEING_DELETED) != 0) {
 			continue;
 		}
 #ifdef INET6
 		if (entry->ifa->address.sa.sa_family == AF_INET6) {
 			inp->ip_inp.inp.inp_vflag |= INP_IPV6;
 		}
 #endif
 #ifdef INET
 		if (entry->ifa->address.sa.sa_family == AF_INET) {
 			inp->ip_inp.inp.inp_vflag |= INP_IPV4;
 		}
 #endif
 	}
 }
 
 /*
  * Add the address to the endpoint local address list There is nothing to be
  * done if we are bound to all addresses
  */
 void
 sctp_add_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa, uint32_t action)
 {
 	struct sctp_laddr *laddr;
 	struct sctp_tcb *stcb;
 	int fnd, error = 0;
 
 	fnd = 0;
 
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
 		/* You are already bound to all. You have it already */
 		return;
 	}
 #ifdef INET6
 	if (ifa->address.sa.sa_family == AF_INET6) {
 		if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
 			/* Can't bind a non-useable addr. */
 			return;
 		}
 	}
 #endif
 	/* first, is it already present? */
 	LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
 		if (laddr->ifa == ifa) {
 			fnd = 1;
 			break;
 		}
 	}
 
 	if (fnd == 0) {
 		/* Not in the ep list */
 		error = sctp_insert_laddr(&inp->sctp_addr_list, ifa, action);
 		if (error != 0)
 			return;
 		inp->laddr_count++;
 		/* update inp_vflag flags */
 		switch (ifa->address.sa.sa_family) {
 #ifdef INET6
 		case AF_INET6:
 			inp->ip_inp.inp.inp_vflag |= INP_IPV6;
 			break;
 #endif
 #ifdef INET
 		case AF_INET:
 			inp->ip_inp.inp.inp_vflag |= INP_IPV4;
 			break;
 #endif
 		default:
 			break;
 		}
 		LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 			sctp_add_local_addr_restricted(stcb, ifa);
 		}
 	}
 	return;
 }
 
 
 /*
  * select a new (hopefully reachable) destination net (should only be used
  * when we deleted an ep addr that is the only usable source address to reach
  * the destination net)
  *
  * The first confirmed net whose address family is reachable from the
  * endpoint becomes the primary destination.  If no net qualifies, the
  * primary destination is left unchanged.
  */
 static void
 sctp_select_primary_destination(struct sctp_tcb *stcb)
 {
 	struct sctp_nets *net;
 
 	TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 		/* for now, we'll just pick the first reachable one we find */
 		if (net->dest_state & SCTP_ADDR_UNCONFIRMED)
 			continue;
 		if (sctp_destination_is_reachable(stcb,
 		    (struct sockaddr *)&net->ro._l_addr)) {
 			/*
 			 * Found a reachable destination.  Stop here so the
 			 * first match wins; without the return the loop kept
 			 * going and the last reachable net was selected.
 			 */
 			stcb->asoc.primary_destination = net;
 			return;
 		}
 	}
 	/* I can't get there from here! ...we're gonna die shortly... */
 }
 
 
 /*
  * Remove ifa from the endpoint's local address list.  Nothing happens when
  * the endpoint is bound to all addresses, when ifa is not on the list, or
  * when it is the endpoint's only remaining address.  Before the list entry
  * is removed, every reference to it held by the endpoint or by one of its
  * associations (next/last used address, cached source address and route
  * of each net) is dropped.
  */
 void
 sctp_del_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa)
 {
 	struct sctp_laddr *laddr;
 	struct sctp_tcb *stcb;
 	struct sctp_nets *net;
 	int present = 0;
 
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
 		/* Bound to all: there is no per-endpoint list to edit. */
 		return;
 	}
 	LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
 		if (laddr->ifa == ifa) {
 			present = 1;
 			break;
 		}
 	}
 	if (!present) {
 		return;
 	}
 	if (inp->laddr_count < 2) {
 		/* can't delete unless there are at LEAST 2 addresses */
 		return;
 	}
 
 	/* clean up "next_addr_touse" */
 	if (inp->next_addr_touse == laddr) {
 		inp->next_addr_touse = NULL;
 	}
 	/* Walk the associations and purge every use of this address. */
 	LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 		SCTP_TCB_LOCK(stcb);
 		/* clean up "last_used_address" */
 		if (stcb->asoc.last_used_address == laddr) {
 			stcb->asoc.last_used_address = NULL;
 		}
 		TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 			if (net->ro._s_addr != laddr->ifa) {
 				continue;
 			}
 			/* This net selected it as source: forget that. */
 			RO_NHFREE(&net->ro);
 			sctp_free_ifa(net->ro._s_addr);
 			net->ro._s_addr = NULL;
 			net->src_addr_selected = 0;
 		}
 		SCTP_TCB_UNLOCK(stcb);
 	}
 	/* remove it from the ep list */
 	sctp_remove_laddr(laddr);
 	inp->laddr_count--;
 	/* update inp_vflag flags */
 	sctp_update_ep_vflag(inp);
 }
 
 /*
  * Put ifa on the association's restricted local address list.
  *
  * Addresses on this list are "pending" (e.g. waiting for an ASCONF-ACK
  * response) and cannot be used as a valid source address. The call is a
  * no-op when ifa is already listed or, with INET6, when it is an IPv6
  * address flagged SCTP_ADDR_IFA_UNUSEABLE.
  *
  * Assumes the TCB is locked, and possibly the INP; that may need to be
  * confirmed/fixed if it turns out not to be the case.
  */
 void
 sctp_add_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa)
 {
 	struct sctpladdr *restricted;
 	struct sctp_laddr *entry;
 
 	restricted = &stcb->asoc.sctp_restricted_addrs;
 
 #ifdef INET6
 	if ((ifa->address.sa.sa_family == AF_INET6) &&
 	    ((ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) != 0)) {
 		/* Can't bind a non-existent addr. */
 		return;
 	}
 #endif
 	/* Leave the list untouched if the address is already on it. */
 	LIST_FOREACH(entry, restricted, sctp_nxt_addr) {
 		if (entry->ifa == ifa) {
 			return;
 		}
 	}
 
 	(void)sctp_insert_laddr(restricted, ifa, 0);
 }
 
 /*
  * Take ifa off the association's restricted local address list.
  *
  * Called by the asconf work, where it is assumed that both the TCB and the
  * INP are locked (the ASCONF code often holds the INP lock while it works so
  * that nobody looks things up while they are changing).
  *
  * Nothing is removed when the endpoint is subset bound, has ASCONF turned
  * off and holds fewer than two addresses, or when ifa is not on the list.
  */
 void
 sctp_del_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa)
 {
 	struct sctp_inpcb *ep;
 	struct sctp_laddr *entry;
 
 	ep = stcb->sctp_ep;
 	/* Subset bound without ASCONF: the last address must stay. */
 	if (((ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) &&
 	    sctp_is_feature_off(ep, SCTP_PCB_FLAGS_DO_ASCONF) &&
 	    (ep->laddr_count < 2)) {
 		return;
 	}
 	/* Drop the first entry that refers to ifa; empty entries never match. */
 	LIST_FOREACH(entry, &stcb->asoc.sctp_restricted_addrs, sctp_nxt_addr) {
 		if ((entry->ifa != NULL) && (entry->ifa == ifa)) {
 			sctp_remove_laddr(entry);
 			break;
 		}
 	}
 }
 
 /* sysctl */
 /*
  * Item count handed to the "sctp_asoc" zone in sctp_pcb_init(); it is also
  * the base that the other zone sizes there are multiplied from.
  */
 static int sctp_max_number_of_assoc = SCTP_MAX_NUM_OF_ASOC;
 /*
  * Multiplier applied to sctp_max_number_of_assoc when sctp_pcb_init() sizes
  * the "sctp_laddr" and "sctp_raddr" zones.
  */
 static int sctp_scale_up_for_address = SCTP_SCALE_FOR_ADDR;
 
 #if defined(SCTP_MCORE_INPUT) && defined(SMP)
 /*
  * Worker control blocks, indexed by CPU id (mp_maxid + 1 entries). Stays
  * NULL until sctp_startup_mcore_threads() allocates it.
  */
 struct sctp_mcore_ctrl *sctp_mcore_workers = NULL;
 /*
  * Table of mp_ncpus entries filled, in CPU_FOREACH() order, with the CPU
  * ids that loop yields. Stays NULL until sctp_startup_mcore_threads()
  * allocates it.
  */
 int *sctp_cpuarry = NULL;
 
 /*
  * Queue an inbound packet to the worker belonging to cpu_to_use.
  *
  * m and off are stored in the queue entry and later passed on to
  * sctp_input_with_port() by the worker. When the worker table does not
  * exist or no queue entry can be allocated, the packet is processed right
  * here by calling sctp_input_with_port() directly.
  */
 void
 sctp_queue_to_mcore(struct mbuf *m, int off, int cpu_to_use)
 {
 	struct sctp_mcore_queue *entry = NULL;
 	struct sctp_mcore_ctrl *worker;
 	int wake;
 
 	/* Only try to allocate when setup produced a worker table. */
 	if (sctp_mcore_workers != NULL) {
 		SCTP_MALLOC(entry, struct sctp_mcore_queue *,
 		    (sizeof(struct sctp_mcore_queue)),
 		    SCTP_M_MCORE);
 	}
 	if (entry == NULL) {
 		/* No workers or no memory: handle the packet inline. */
 		sctp_input_with_port(m, off, 0);
 		return;
 	}
 	entry->vn = curvnet;
 	entry->m = m;
 	entry->off = off;
 	entry->v6 = 0;
 
 	worker = &sctp_mcore_workers[cpu_to_use];
 	SCTP_MCORE_QLOCK(worker);
 	TAILQ_INSERT_TAIL(&worker->que, entry, next);
 	/* Sample the state under the queue lock, wake after dropping it. */
 	wake = (worker->running == 0);
 	SCTP_MCORE_QUNLOCK(worker);
 
 	if (wake) {
 		wakeup(&worker->running);
 	}
 }
 
 /*
  * Body of a per-CPU input worker; arg is the worker's struct
  * sctp_mcore_ctrl.
  *
  * The thread sleeps on &wkq->running until it is first woken, binds itself
  * to wkq->cpuid, and then loops forever: it dequeues entries from wkq->que,
  * feeds each packet to sctp_input_with_port() inside the vnet recorded in
  * the entry, and goes back to sleep once the queue is empty. There is no
  * exit path out of the loop.
  */
 static void
 sctp_mcore_thread(void *arg)
 {
 
 	struct sctp_mcore_ctrl *wkq;
 	struct sctp_mcore_queue *qent;
 
 	wkq = (struct sctp_mcore_ctrl *)arg;
 	/* Note: these declarations follow a statement. */
 	struct mbuf *m;
 	int off, v6;
 
 	/* Wait for first tickle */
 	SCTP_MCORE_LOCK(wkq);
 	wkq->running = 0;
 	msleep(&wkq->running,
 	    &wkq->core_mtx,
 	    0, "wait for pkt", 0);
 	SCTP_MCORE_UNLOCK(wkq);
 
 	/* Bind to our cpu */
 	thread_lock(curthread);
 	sched_bind(curthread, wkq->cpuid);
 	thread_unlock(curthread);
 
 	/* Now lets start working */
 	/*
 	 * NOTE(review): presumably SCTP_MCORE_LOCK() takes wkq->core_mtx, the
 	 * mutex handed to msleep() below; it is taken once here and never
 	 * explicitly released inside the loop -- confirm against the macro.
 	 */
 	SCTP_MCORE_LOCK(wkq);
 	/* Now grab lock and go */
 	for (;;) {
 		SCTP_MCORE_QLOCK(wkq);
 		/* Every jump to skip_sleep arrives with the queue lock held. */
 skip_sleep:
 		wkq->running = 1;
 		qent = TAILQ_FIRST(&wkq->que);
 		if (qent) {
 			TAILQ_REMOVE(&wkq->que, qent, next);
 			/* Process the packet without holding the queue lock. */
 			SCTP_MCORE_QUNLOCK(wkq);
 			CURVNET_SET(qent->vn);
 			/* Copy the fields out so the entry can be freed first. */
 			m = qent->m;
 			off = qent->off;
 			v6 = qent->v6;
 			SCTP_FREE(qent, SCTP_M_MCORE);
 			if (v6 == 0) {
 				sctp_input_with_port(m, off, 0);
 			} else {
 				SCTP_PRINTF("V6 not yet supported\n");
 				sctp_m_freem(m);
 			}
 			CURVNET_RESTORE();
 			SCTP_MCORE_QLOCK(wkq);
 		}
 		/* Mark idle, but keep draining if more entries are queued. */
 		wkq->running = 0;
 		if (!TAILQ_EMPTY(&wkq->que)) {
 			goto skip_sleep;
 		}
 		SCTP_MCORE_QUNLOCK(wkq);
 		/* Queue is empty: sleep until sctp_queue_to_mcore() wakes us. */
 		msleep(&wkq->running,
 		    &wkq->core_mtx,
 		    0, "wait for pkt", 0);
 	}
 }
 
 static void
 sctp_startup_mcore_threads(void)
 {
 	int i, cpu;
 
 	if (mp_ncpus == 1)
 		return;
 
 	if (sctp_mcore_workers != NULL) {
 		/*
 		 * Already been here in some previous vnet?
 		 */
 		return;
 	}
 	SCTP_MALLOC(sctp_mcore_workers, struct sctp_mcore_ctrl *,
 	    ((mp_maxid + 1) * sizeof(struct sctp_mcore_ctrl)),
 	    SCTP_M_MCORE);
 	if (sctp_mcore_workers == NULL) {
 		/* TSNH I hope */
 		return;
 	}
 	memset(sctp_mcore_workers, 0, ((mp_maxid + 1) *
 	    sizeof(struct sctp_mcore_ctrl)));
 	/* Init the structures */
 	for (i = 0; i <= mp_maxid; i++) {
 		TAILQ_INIT(&sctp_mcore_workers[i].que);
 		SCTP_MCORE_LOCK_INIT(&sctp_mcore_workers[i]);
 		SCTP_MCORE_QLOCK_INIT(&sctp_mcore_workers[i]);
 		sctp_mcore_workers[i].cpuid = i;
 	}
 	if (sctp_cpuarry == NULL) {
 		SCTP_MALLOC(sctp_cpuarry, int *,
 		    (mp_ncpus * sizeof(int)),
 		    SCTP_M_MCORE);
 		i = 0;
 		CPU_FOREACH(cpu) {
 			sctp_cpuarry[i] = cpu;
 			i++;
 		}
 	}
 	/* Now start them all */
 	CPU_FOREACH(cpu) {
 		(void)kproc_create(sctp_mcore_thread,
 		    (void *)&sctp_mcore_workers[cpu],
 		    &sctp_mcore_workers[cpu].thread_proc,
 		    RFPROC,
 		    SCTP_KTHREAD_PAGES,
 		    SCTP_MCORE_NAME);
 	}
 }
 #endif
 
 /*
  * One-time set-up of the global SCTP PCB state: statistics, the endpoint
  * list, the hash tables, the allocation zones, the locks, the counters, the
  * address work queue and its timer, the TIMEWAIT vtag lists, the iterator,
  * the optional mcore workers and the default VRF.
  *
  * A second call is ignored: sctp_pcb_initialized is tested and set first.
  */
 void
 sctp_pcb_init(void)
 {
 	/*
 	 * SCTP initialization for the PCB structures should be called by
 	 * the sctp_init() function.
 	 */
 	int i;
 	struct timeval tv;
 
 	if (SCTP_BASE_VAR(sctp_pcb_initialized) != 0) {
 		/* error I was called twice */
 		return;
 	}
 	SCTP_BASE_VAR(sctp_pcb_initialized) = 1;
 
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	memset(&SCTP_BASE_SYSCTL(sctp_log), 0, sizeof(struct sctp_log));
 #endif
 #if defined(SMP) && defined(SCTP_USE_PERCPU_STAT)
 	/*
 	 * One struct sctpstat per possible CPU id.
 	 * NOTE(review): the result is not checked before the memset() below;
 	 * if SCTP_MALLOC() can return NULL that is a NULL dereference.
 	 */
 	SCTP_MALLOC(SCTP_BASE_STATS, struct sctpstat *,
 	    ((mp_maxid + 1) * sizeof(struct sctpstat)),
 	    SCTP_M_MCORE);
 #endif
 	(void)SCTP_GETTIME_TIMEVAL(&tv);
 	/* Clear the statistics and stamp the discontinuity time with "now". */
 #if defined(SMP) && defined(SCTP_USE_PERCPU_STAT)
 	memset(SCTP_BASE_STATS, 0, sizeof(struct sctpstat) * (mp_maxid + 1));
 	SCTP_BASE_STATS[PCPU_GET(cpuid)].sctps_discontinuitytime.tv_sec = (uint32_t)tv.tv_sec;
 	SCTP_BASE_STATS[PCPU_GET(cpuid)].sctps_discontinuitytime.tv_usec = (uint32_t)tv.tv_usec;
 #else
 	memset(&SCTP_BASE_STATS, 0, sizeof(struct sctpstat));
 	SCTP_BASE_STAT(sctps_discontinuitytime).tv_sec = (uint32_t)tv.tv_sec;
 	SCTP_BASE_STAT(sctps_discontinuitytime).tv_usec = (uint32_t)tv.tv_usec;
 #endif
 	/* init the empty list of (All) Endpoints */
 	LIST_INIT(&SCTP_BASE_INFO(listhead));
 
 
 	/* init the hash table of endpoints */
 	TUNABLE_INT_FETCH("net.inet.sctp.tcbhashsize", &SCTP_BASE_SYSCTL(sctp_hashtblsize));
 	TUNABLE_INT_FETCH("net.inet.sctp.pcbhashsize", &SCTP_BASE_SYSCTL(sctp_pcbtblsize));
 	TUNABLE_INT_FETCH("net.inet.sctp.chunkscale", &SCTP_BASE_SYSCTL(sctp_chunkscale));
 	/* The association hash is requested at 31 times sctp_hashtblsize. */
 	SCTP_BASE_INFO(sctp_asochash) = SCTP_HASH_INIT((SCTP_BASE_SYSCTL(sctp_hashtblsize) * 31),
 	    &SCTP_BASE_INFO(hashasocmark));
 	SCTP_BASE_INFO(sctp_ephash) = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_hashtblsize),
 	    &SCTP_BASE_INFO(hashmark));
 	SCTP_BASE_INFO(sctp_tcpephash) = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_hashtblsize),
 	    &SCTP_BASE_INFO(hashtcpmark));
 	SCTP_BASE_INFO(hashtblsize) = SCTP_BASE_SYSCTL(sctp_hashtblsize);
 
 
 	SCTP_BASE_INFO(sctp_vrfhash) = SCTP_HASH_INIT(SCTP_SIZE_OF_VRF_HASH,
 	    &SCTP_BASE_INFO(hashvrfmark));
 
 	SCTP_BASE_INFO(vrf_ifn_hash) = SCTP_HASH_INIT(SCTP_VRF_IFN_HASH_SIZE,
 	    &SCTP_BASE_INFO(vrf_ifn_hashmark));
 	/* init the zones */
 	/*
 	 * FIX ME: Should check for NULL returns, but if it does fail we are
 	 * doomed to panic anyways... add later maybe.
 	 */
 	SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_ep), "sctp_ep",
 	    sizeof(struct sctp_inpcb), maxsockets);
 
 	SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asoc), "sctp_asoc",
 	    sizeof(struct sctp_tcb), sctp_max_number_of_assoc);
 
 	/* The two address zones scale with sctp_scale_up_for_address. */
 	SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_laddr), "sctp_laddr",
 	    sizeof(struct sctp_laddr),
 	    (sctp_max_number_of_assoc * sctp_scale_up_for_address));
 
 	SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_net), "sctp_raddr",
 	    sizeof(struct sctp_nets),
 	    (sctp_max_number_of_assoc * sctp_scale_up_for_address));
 
 	/* The remaining zones scale with the sctp_chunkscale setting. */
 	SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_chunk), "sctp_chunk",
 	    sizeof(struct sctp_tmit_chunk),
 	    (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale)));
 
 	SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_readq), "sctp_readq",
 	    sizeof(struct sctp_queued_to_read),
 	    (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale)));
 
 	SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_strmoq), "sctp_stream_msg_out",
 	    sizeof(struct sctp_stream_queue_pending),
 	    (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale)));
 
 	SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asconf), "sctp_asconf",
 	    sizeof(struct sctp_asconf),
 	    (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale)));
 
 	SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asconf_ack), "sctp_asconf_ack",
 	    sizeof(struct sctp_asconf_ack),
 	    (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale)));
 
 
 	/* Master Lock INIT for info structure */
 	SCTP_INP_INFO_LOCK_INIT();
 	SCTP_STATLOG_INIT_LOCK();
 
 	SCTP_IPI_COUNT_INIT();
 	SCTP_IPI_ADDR_INIT();
 #ifdef SCTP_PACKET_LOGGING
 	SCTP_IP_PKTLOG_INIT();
 #endif
 	/* Empty address work queue plus its lock. */
 	LIST_INIT(&SCTP_BASE_INFO(addr_wq));
 
 	SCTP_WQ_ADDR_INIT();
 	/* not sure if we need all the counts */
 	SCTP_BASE_INFO(ipi_count_ep) = 0;
 	/* assoc/tcb zone info */
 	SCTP_BASE_INFO(ipi_count_asoc) = 0;
 	/* local addrlist zone info */
 	SCTP_BASE_INFO(ipi_count_laddr) = 0;
 	/* remote addrlist zone info */
 	SCTP_BASE_INFO(ipi_count_raddr) = 0;
 	/* chunk info */
 	SCTP_BASE_INFO(ipi_count_chunk) = 0;
 
 	/* socket queue zone info */
 	SCTP_BASE_INFO(ipi_count_readq) = 0;
 
 	/* stream out queue cont */
 	SCTP_BASE_INFO(ipi_count_strmoq) = 0;
 
 	SCTP_BASE_INFO(ipi_free_strmoq) = 0;
 	SCTP_BASE_INFO(ipi_free_chunks) = 0;
 
 	SCTP_OS_TIMER_INIT(&SCTP_BASE_INFO(addr_wq_timer.timer));
 
 	/* Init the TIMEWAIT list */
 	for (i = 0; i < SCTP_STACK_VTAG_HASH_SIZE; i++) {
 		LIST_INIT(&SCTP_BASE_INFO(vtag_timewait)[i]);
 	}
 	sctp_startup_iterator();
 
 #if defined(SCTP_MCORE_INPUT) && defined(SMP)
 	sctp_startup_mcore_threads();
 #endif
 
 	/*
 	 * INIT the default VRF which for BSD is the only one, other O/S's
 	 * may have more. But initially they must start with one and then
 	 * add the VRF's as addresses are added.
 	 */
 	sctp_init_vrf_list(SCTP_DEFAULT_VRF);
 }
 
 /*
  * Tear down the global SCTP PCB state built by sctp_pcb_init(): pending
  * iterators of the current vnet, the address work queue, the default VRF
  * with its ifn/ifa lists, the TIMEWAIT vtag blocks, the locks, the hash
  * tables and the zones.
  *
  * A call made while sctp_pcb_initialized is 0 only logs a message and
  * returns.
  *
  * Assumes that the SCTP_BASE_INFO() lock is NOT held.
  */
 void
 sctp_pcb_finish(void)
 {
 	struct sctp_vrflist *vrf_bucket;
 	struct sctp_vrf *vrf, *nvrf;
 	struct sctp_ifn *ifn, *nifn;
 	struct sctp_ifa *ifa, *nifa;
 	struct sctpvtaghead *chain;
 	struct sctp_tagblock *twait_block, *prev_twait_block;
 	struct sctp_laddr *wi, *nwi;
 	int i;
 	struct sctp_iterator *it, *nit;
 
 	if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) {
 		SCTP_PRINTF("%s: race condition on teardown.\n", __func__);
 		return;
 	}
 	SCTP_BASE_VAR(sctp_pcb_initialized) = 0;
 	/*
 	 * In FreeBSD the iterator thread never exits but we do clean up.
 	 * The only way FreeBSD reaches here is if we have VRF's but we
 	 * still add the ifdef to make it compile on old versions.
 	 */
 retry:
 	SCTP_IPI_ITERATOR_WQ_LOCK();
 	/*
 	 * sctp_iterator_worker() might be working on an it entry without
 	 * holding the lock.  We won't find it on the list either and
 	 * continue and free/destroy it.  While holding the lock, spin, to
 	 * avoid the race condition as sctp_iterator_worker() will have to
 	 * wait to re-acquire the lock.
 	 */
 	if (sctp_it_ctl.iterator_running != 0 || sctp_it_ctl.cur_it != NULL) {
 		/* Drop the lock, log, pause briefly and try again. */
 		SCTP_IPI_ITERATOR_WQ_UNLOCK();
 		SCTP_PRINTF("%s: Iterator running while we held the lock. Retry. "
 		    "cur_it=%p\n", __func__, sctp_it_ctl.cur_it);
 		DELAY(10);
 		goto retry;
 	}
 	/* Finish and free every queued iterator owned by this vnet. */
 	TAILQ_FOREACH_SAFE(it, &sctp_it_ctl.iteratorhead, sctp_nxt_itr, nit) {
 		if (it->vn != curvnet) {
 			continue;
 		}
 		TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr);
 		if (it->function_atend != NULL) {
 			(*it->function_atend) (it->pointer, it->val);
 		}
 		SCTP_FREE(it, SCTP_M_ITER);
 	}
 	SCTP_IPI_ITERATOR_WQ_UNLOCK();
 	/* Flag a current iterator of this vnet, if any, to stop. */
 	SCTP_ITERATOR_LOCK();
 	if ((sctp_it_ctl.cur_it) &&
 	    (sctp_it_ctl.cur_it->vn == curvnet)) {
 		sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT;
 	}
 	SCTP_ITERATOR_UNLOCK();
 	SCTP_OS_TIMER_STOP_DRAIN(&SCTP_BASE_INFO(addr_wq_timer.timer));
 	/* Empty the address work queue. */
 	SCTP_WQ_ADDR_LOCK();
 	LIST_FOREACH_SAFE(wi, &SCTP_BASE_INFO(addr_wq), sctp_nxt_addr, nwi) {
 		LIST_REMOVE(wi, sctp_nxt_addr);
 		SCTP_DECR_LADDR_COUNT();
 		/* The ifa is freed here only for pending delete requests. */
 		if (wi->action == SCTP_DEL_IP_ADDRESS) {
 			SCTP_FREE(wi->ifa, SCTP_M_IFA);
 		}
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), wi);
 	}
 	SCTP_WQ_ADDR_UNLOCK();
 
 	/*
 	 * free the vrf/ifn/ifa lists and hashes (be sure address monitor is
 	 * destroyed first).
 	 */
 	SCTP_IPI_ADDR_WLOCK();
 	/* Only the hash bucket of SCTP_DEFAULT_VRFID is walked. */
 	vrf_bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(SCTP_DEFAULT_VRFID & SCTP_BASE_INFO(hashvrfmark))];
 	LIST_FOREACH_SAFE(vrf, vrf_bucket, next_vrf, nvrf) {
 		LIST_FOREACH_SAFE(ifn, &vrf->ifnlist, next_ifn, nifn) {
 			LIST_FOREACH_SAFE(ifa, &ifn->ifalist, next_ifa, nifa) {
 				/* free the ifa */
 				LIST_REMOVE(ifa, next_bucket);
 				LIST_REMOVE(ifa, next_ifa);
 				SCTP_FREE(ifa, SCTP_M_IFA);
 			}
 			/* free the ifn */
 			LIST_REMOVE(ifn, next_bucket);
 			LIST_REMOVE(ifn, next_ifn);
 			SCTP_FREE(ifn, SCTP_M_IFN);
 		}
 		SCTP_HASH_FREE(vrf->vrf_addr_hash, vrf->vrf_addr_hashmark);
 		/* free the vrf */
 		LIST_REMOVE(vrf, next_vrf);
 		SCTP_FREE(vrf, SCTP_M_VRF);
 	}
 	SCTP_IPI_ADDR_WUNLOCK();
 	/* free the vrf hashes */
 	SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_vrfhash), SCTP_BASE_INFO(hashvrfmark));
 	SCTP_HASH_FREE(SCTP_BASE_INFO(vrf_ifn_hash), SCTP_BASE_INFO(vrf_ifn_hashmark));
 
 	/*
 	 * free the TIMEWAIT list elements malloc'd in the function
 	 * sctp_add_vtag_to_timewait()...
 	 */
 	for (i = 0; i < SCTP_STACK_VTAG_HASH_SIZE; i++) {
 		chain = &SCTP_BASE_INFO(vtag_timewait)[i];
 		if (!LIST_EMPTY(chain)) {
 			/*
 			 * Free one step behind the cursor: a block is released
 			 * only after the walk has moved on to its successor,
 			 * and the last block is released after the loop.
 			 */
 			prev_twait_block = NULL;
 			LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
 				if (prev_twait_block) {
 					SCTP_FREE(prev_twait_block, SCTP_M_TIMW);
 				}
 				prev_twait_block = twait_block;
 			}
 			SCTP_FREE(prev_twait_block, SCTP_M_TIMW);
 		}
 	}
 
 	/* free the locks and mutexes */
 #ifdef SCTP_PACKET_LOGGING
 	SCTP_IP_PKTLOG_DESTROY();
 #endif
 	SCTP_IPI_ADDR_DESTROY();
 	SCTP_STATLOG_DESTROY();
 	SCTP_INP_INFO_LOCK_DESTROY();
 
 	SCTP_WQ_ADDR_DESTROY();
 
 	/* Get rid of other stuff too. */
 	if (SCTP_BASE_INFO(sctp_asochash) != NULL)
 		SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_asochash), SCTP_BASE_INFO(hashasocmark));
 	if (SCTP_BASE_INFO(sctp_ephash) != NULL)
 		SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_ephash), SCTP_BASE_INFO(hashmark));
 	if (SCTP_BASE_INFO(sctp_tcpephash) != NULL)
 		SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_tcpephash), SCTP_BASE_INFO(hashtcpmark));
 
 	/* Destroy every zone created in sctp_pcb_init(). */
 	SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_ep));
 	SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asoc));
 	SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_laddr));
 	SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_net));
 	SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_chunk));
 	SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_readq));
 	SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_strmoq));
 	SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf));
 	SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf_ack));
 #if defined(SMP) && defined(SCTP_USE_PERCPU_STAT)
 	/* Release the per-CPU statistics array. */
 	SCTP_FREE(SCTP_BASE_STATS, SCTP_M_MCORE);
 #endif
 }
 
 
 int
 sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
     int offset, int limit,
     struct sockaddr *src, struct sockaddr *dst,
     struct sockaddr *altsa, uint16_t port)
 {
 	/*
 	 * grub through the INIT pulling addresses and loading them to the
 	 * nets structure in the asoc. The from address in the mbuf should
 	 * also be loaded (if it is not already). This routine can be called
 	 * with either INIT or INIT-ACK's as long as the m points to the IP
 	 * packet and the offset points to the beginning of the parameters.
 	 */
 	struct sctp_inpcb *inp;
 	struct sctp_nets *net, *nnet, *net_tmp;
 	struct sctp_paramhdr *phdr, param_buf;
 	struct sctp_tcb *stcb_tmp;
 	uint16_t ptype, plen;
 	struct sockaddr *sa;
 	uint8_t random_store[SCTP_PARAM_BUFFER_SIZE];
 	struct sctp_auth_random *p_random = NULL;
 	uint16_t random_len = 0;
 	uint8_t hmacs_store[SCTP_PARAM_BUFFER_SIZE];
 	struct sctp_auth_hmac_algo *hmacs = NULL;
 	uint16_t hmacs_len = 0;
 	uint8_t saw_asconf = 0;
 	uint8_t saw_asconf_ack = 0;
 	uint8_t chunks_store[SCTP_PARAM_BUFFER_SIZE];
 	struct sctp_auth_chunk_list *chunks = NULL;
 	uint16_t num_chunks = 0;
 	sctp_key_t *new_key;
 	uint32_t keylen;
 	int got_random = 0, got_hmacs = 0, got_chklist = 0;
 	uint8_t peer_supports_ecn;
 	uint8_t peer_supports_prsctp;
 	uint8_t peer_supports_auth;
 	uint8_t peer_supports_asconf;
 	uint8_t peer_supports_asconf_ack;
 	uint8_t peer_supports_reconfig;
 	uint8_t peer_supports_nrsack;
 	uint8_t peer_supports_pktdrop;
 	uint8_t peer_supports_idata;
 #ifdef INET
 	struct sockaddr_in sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 sin6;
 #endif
 
 	/* First get the destination address setup too. */
 #ifdef INET
 	memset(&sin, 0, sizeof(sin));
 	sin.sin_family = AF_INET;
 	sin.sin_len = sizeof(sin);
 	sin.sin_port = stcb->rport;
 #endif
 #ifdef INET6
 	memset(&sin6, 0, sizeof(sin6));
 	sin6.sin6_family = AF_INET6;
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	sin6.sin6_port = stcb->rport;
 #endif
 	if (altsa) {
 		sa = altsa;
 	} else {
 		sa = src;
 	}
 	peer_supports_idata = 0;
 	peer_supports_ecn = 0;
 	peer_supports_prsctp = 0;
 	peer_supports_auth = 0;
 	peer_supports_asconf = 0;
 	peer_supports_reconfig = 0;
 	peer_supports_nrsack = 0;
 	peer_supports_pktdrop = 0;
 	TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 		/* mark all addresses that we have currently on the list */
 		net->dest_state |= SCTP_ADDR_NOT_IN_ASSOC;
 	}
 	/* does the source address already exist? if so skip it */
 	inp = stcb->sctp_ep;
 	atomic_add_int(&stcb->asoc.refcnt, 1);
 	stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net_tmp, dst, stcb);
 	atomic_add_int(&stcb->asoc.refcnt, -1);
 
 	if ((stcb_tmp == NULL && inp == stcb->sctp_ep) || inp == NULL) {
 		/* we must add the source address */
 		/* no scope set here since we have a tcb already. */
 		switch (sa->sa_family) {
 #ifdef INET
 		case AF_INET:
 			if (stcb->asoc.scope.ipv4_addr_legal) {
 				if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_2)) {
 					return (-1);
 				}
 			}
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			if (stcb->asoc.scope.ipv6_addr_legal) {
 				if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_3)) {
 					return (-2);
 				}
 			}
 			break;
 #endif
 		default:
 			break;
 		}
 	} else {
 		if (net_tmp != NULL && stcb_tmp == stcb) {
 			net_tmp->dest_state &= ~SCTP_ADDR_NOT_IN_ASSOC;
 		} else if (stcb_tmp != stcb) {
 			/* It belongs to another association? */
 			if (stcb_tmp)
 				SCTP_TCB_UNLOCK(stcb_tmp);
 			return (-3);
 		}
 	}
 	if (stcb->asoc.state == 0) {
 		/* the assoc was freed? */
 		return (-4);
 	}
 	/* now we must go through each of the params. */
 	phdr = sctp_get_next_param(m, offset, &param_buf, sizeof(param_buf));
 	while (phdr) {
 		ptype = ntohs(phdr->param_type);
 		plen = ntohs(phdr->param_length);
 		/*
 		 * SCTP_PRINTF("ptype => %0x, plen => %d\n",
 		 * (uint32_t)ptype, (int)plen);
 		 */
 		if (offset + plen > limit) {
 			break;
 		}
 		if (plen < sizeof(struct sctp_paramhdr)) {
 			break;
 		}
 #ifdef INET
 		if (ptype == SCTP_IPV4_ADDRESS) {
 			if (stcb->asoc.scope.ipv4_addr_legal) {
 				struct sctp_ipv4addr_param *p4, p4_buf;
 
 				/* ok get the v4 address and check/add */
 				phdr = sctp_get_next_param(m, offset,
 				    (struct sctp_paramhdr *)&p4_buf,
 				    sizeof(p4_buf));
 				if (plen != sizeof(struct sctp_ipv4addr_param) ||
 				    phdr == NULL) {
 					return (-5);
 				}
 				p4 = (struct sctp_ipv4addr_param *)phdr;
 				sin.sin_addr.s_addr = p4->addr;
 				if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
 					/* Skip multi-cast addresses */
 					goto next_param;
 				}
 				if ((sin.sin_addr.s_addr == INADDR_BROADCAST) ||
 				    (sin.sin_addr.s_addr == INADDR_ANY)) {
 					goto next_param;
 				}
 				sa = (struct sockaddr *)&sin;
 				inp = stcb->sctp_ep;
 				atomic_add_int(&stcb->asoc.refcnt, 1);
 				stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net,
 				    dst, stcb);
 				atomic_add_int(&stcb->asoc.refcnt, -1);
 
 				if ((stcb_tmp == NULL && inp == stcb->sctp_ep) ||
 				    inp == NULL) {
 					/* we must add the source address */
 					/*
 					 * no scope set since we have a tcb
 					 * already
 					 */
 
 					/*
 					 * we must validate the state again
 					 * here
 					 */
 			add_it_now:
 					if (stcb->asoc.state == 0) {
 						/* the assoc was freed? */
 						return (-7);
 					}
 					if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_4)) {
 						return (-8);
 					}
 				} else if (stcb_tmp == stcb) {
 					if (stcb->asoc.state == 0) {
 						/* the assoc was freed? */
 						return (-10);
 					}
 					if (net != NULL) {
 						/* clear flag */
 						net->dest_state &=
 						    ~SCTP_ADDR_NOT_IN_ASSOC;
 					}
 				} else {
 					/*
 					 * strange, address is in another
 					 * assoc? straighten out locks.
 					 */
 					if (stcb_tmp) {
 						if (SCTP_GET_STATE(stcb_tmp) == SCTP_STATE_COOKIE_WAIT) {
 							struct mbuf *op_err;
 							char msg[SCTP_DIAG_INFO_LEN];
 
 							/*
 							 * in setup state we
 							 * abort this guy
 							 */
 							SCTP_SNPRINTF(msg, sizeof(msg),
 							    "%s:%d at %s", __FILE__, __LINE__, __func__);
 							op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 							    msg);
 							sctp_abort_an_association(stcb_tmp->sctp_ep,
 							    stcb_tmp, op_err,
 							    SCTP_SO_NOT_LOCKED);
 							goto add_it_now;
 						}
 						SCTP_TCB_UNLOCK(stcb_tmp);
 					}
 
 					if (stcb->asoc.state == 0) {
 						/* the assoc was freed? */
 						return (-12);
 					}
 					return (-13);
 				}
 			}
 		} else
 #endif
 #ifdef INET6
 		if (ptype == SCTP_IPV6_ADDRESS) {
 			if (stcb->asoc.scope.ipv6_addr_legal) {
 				/* ok get the v6 address and check/add */
 				struct sctp_ipv6addr_param *p6, p6_buf;
 
 				phdr = sctp_get_next_param(m, offset,
 				    (struct sctp_paramhdr *)&p6_buf,
 				    sizeof(p6_buf));
 				if (plen != sizeof(struct sctp_ipv6addr_param) ||
 				    phdr == NULL) {
 					return (-14);
 				}
 				p6 = (struct sctp_ipv6addr_param *)phdr;
 				memcpy((caddr_t)&sin6.sin6_addr, p6->addr,
 				    sizeof(p6->addr));
 				if (IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr)) {
 					/* Skip multi-cast addresses */
 					goto next_param;
 				}
 				if (IN6_IS_ADDR_LINKLOCAL(&sin6.sin6_addr)) {
 					/*
 					 * Link local make no sense without
 					 * scope
 					 */
 					goto next_param;
 				}
 				sa = (struct sockaddr *)&sin6;
 				inp = stcb->sctp_ep;
 				atomic_add_int(&stcb->asoc.refcnt, 1);
 				stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net,
 				    dst, stcb);
 				atomic_add_int(&stcb->asoc.refcnt, -1);
 				if (stcb_tmp == NULL &&
 				    (inp == stcb->sctp_ep || inp == NULL)) {
 					/*
 					 * we must validate the state again
 					 * here
 					 */
 			add_it_now6:
 					if (stcb->asoc.state == 0) {
 						/* the assoc was freed? */
 						return (-16);
 					}
 					/*
 					 * we must add the address, no scope
 					 * set
 					 */
 					if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_5)) {
 						return (-17);
 					}
 				} else if (stcb_tmp == stcb) {
 					/*
 					 * we must validate the state again
 					 * here
 					 */
 					if (stcb->asoc.state == 0) {
 						/* the assoc was freed? */
 						return (-19);
 					}
 					if (net != NULL) {
 						/* clear flag */
 						net->dest_state &=
 						    ~SCTP_ADDR_NOT_IN_ASSOC;
 					}
 				} else {
 					/*
 					 * strange, address is in another
 					 * assoc? straighten out locks.
 					 */
 					if (stcb_tmp) {
 						if (SCTP_GET_STATE(stcb_tmp) == SCTP_STATE_COOKIE_WAIT) {
 							struct mbuf *op_err;
 							char msg[SCTP_DIAG_INFO_LEN];
 
 							/*
 							 * in setup state we
 							 * abort this guy
 							 */
 							SCTP_SNPRINTF(msg, sizeof(msg),
 							    "%s:%d at %s", __FILE__, __LINE__, __func__);
 							op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 							    msg);
 							sctp_abort_an_association(stcb_tmp->sctp_ep,
 							    stcb_tmp, op_err,
 							    SCTP_SO_NOT_LOCKED);
 							goto add_it_now6;
 						}
 						SCTP_TCB_UNLOCK(stcb_tmp);
 					}
 					if (stcb->asoc.state == 0) {
 						/* the assoc was freed? */
 						return (-21);
 					}
 					return (-22);
 				}
 			}
 		} else
 #endif
 		if (ptype == SCTP_ECN_CAPABLE) {
 			peer_supports_ecn = 1;
 		} else if (ptype == SCTP_ULP_ADAPTATION) {
 			if (stcb->asoc.state != SCTP_STATE_OPEN) {
 				struct sctp_adaptation_layer_indication ai,
 				                                *aip;
 
 				phdr = sctp_get_next_param(m, offset,
 				    (struct sctp_paramhdr *)&ai, sizeof(ai));
 				aip = (struct sctp_adaptation_layer_indication *)phdr;
 				if (aip) {
 					stcb->asoc.peers_adaptation = ntohl(aip->indication);
 					stcb->asoc.adaptation_needed = 1;
 				}
 			}
 		} else if (ptype == SCTP_SET_PRIM_ADDR) {
 			struct sctp_asconf_addr_param lstore, *fee;
 			int lptype;
 			struct sockaddr *lsa = NULL;
 #ifdef INET
 			struct sctp_asconf_addrv4_param *fii;
 #endif
 
 			if (stcb->asoc.asconf_supported == 0) {
 				return (-100);
 			}
 			if (plen > sizeof(lstore)) {
 				return (-23);
 			}
 			if (plen < sizeof(struct sctp_asconf_addrv4_param)) {
 				return (-101);
 			}
 			phdr = sctp_get_next_param(m, offset,
 			    (struct sctp_paramhdr *)&lstore,
 			    plen);
 			if (phdr == NULL) {
 				return (-24);
 			}
 			fee = (struct sctp_asconf_addr_param *)phdr;
 			lptype = ntohs(fee->addrp.ph.param_type);
 			switch (lptype) {
 #ifdef INET
 			case SCTP_IPV4_ADDRESS:
 				if (plen !=
 				    sizeof(struct sctp_asconf_addrv4_param)) {
 					SCTP_PRINTF("Sizeof setprim in init/init ack not %d but %d - ignored\n",
 					    (int)sizeof(struct sctp_asconf_addrv4_param),
 					    plen);
 				} else {
 					fii = (struct sctp_asconf_addrv4_param *)fee;
 					sin.sin_addr.s_addr = fii->addrp.addr;
 					lsa = (struct sockaddr *)&sin;
 				}
 				break;
 #endif
 #ifdef INET6
 			case SCTP_IPV6_ADDRESS:
 				if (plen !=
 				    sizeof(struct sctp_asconf_addr_param)) {
 					SCTP_PRINTF("Sizeof setprim (v6) in init/init ack not %d but %d - ignored\n",
 					    (int)sizeof(struct sctp_asconf_addr_param),
 					    plen);
 				} else {
 					memcpy(sin6.sin6_addr.s6_addr,
 					    fee->addrp.addr,
 					    sizeof(fee->addrp.addr));
 					lsa = (struct sockaddr *)&sin6;
 				}
 				break;
 #endif
 			default:
 				break;
 			}
 			if (lsa) {
 				(void)sctp_set_primary_addr(stcb, sa, NULL);
 			}
 		} else if (ptype == SCTP_HAS_NAT_SUPPORT) {
 			stcb->asoc.peer_supports_nat = 1;
 		} else if (ptype == SCTP_PRSCTP_SUPPORTED) {
 			/* Peer supports pr-sctp */
 			peer_supports_prsctp = 1;
 		} else if (ptype == SCTP_SUPPORTED_CHUNK_EXT) {
 			/* A supported extension chunk */
 			struct sctp_supported_chunk_types_param *pr_supported;
 			uint8_t local_store[SCTP_PARAM_BUFFER_SIZE];
 			int num_ent, i;
 
 			if (plen > sizeof(local_store)) {
 				return (-35);
 			}
 			phdr = sctp_get_next_param(m, offset,
 			    (struct sctp_paramhdr *)&local_store, plen);
 			if (phdr == NULL) {
 				return (-25);
 			}
 			pr_supported = (struct sctp_supported_chunk_types_param *)phdr;
 			num_ent = plen - sizeof(struct sctp_paramhdr);
 			for (i = 0; i < num_ent; i++) {
 				switch (pr_supported->chunk_types[i]) {
 				case SCTP_ASCONF:
 					peer_supports_asconf = 1;
 					break;
 				case SCTP_ASCONF_ACK:
 					peer_supports_asconf_ack = 1;
 					break;
 				case SCTP_FORWARD_CUM_TSN:
 					peer_supports_prsctp = 1;
 					break;
 				case SCTP_PACKET_DROPPED:
 					peer_supports_pktdrop = 1;
 					break;
 				case SCTP_NR_SELECTIVE_ACK:
 					peer_supports_nrsack = 1;
 					break;
 				case SCTP_STREAM_RESET:
 					peer_supports_reconfig = 1;
 					break;
 				case SCTP_AUTHENTICATION:
 					peer_supports_auth = 1;
 					break;
 				case SCTP_IDATA:
 					peer_supports_idata = 1;
 					break;
 				default:
 					/* one I have not learned yet */
 					break;
 
 				}
 			}
 		} else if (ptype == SCTP_RANDOM) {
 			if (plen > sizeof(random_store))
 				break;
 			if (got_random) {
 				/* already processed a RANDOM */
 				goto next_param;
 			}
 			phdr = sctp_get_next_param(m, offset,
 			    (struct sctp_paramhdr *)random_store,
 			    plen);
 			if (phdr == NULL)
 				return (-26);
 			p_random = (struct sctp_auth_random *)phdr;
 			random_len = plen - sizeof(*p_random);
 			/* enforce the random length */
 			if (random_len != SCTP_AUTH_RANDOM_SIZE_REQUIRED) {
 				SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: invalid RANDOM len\n");
 				return (-27);
 			}
 			got_random = 1;
 		} else if (ptype == SCTP_HMAC_LIST) {
 			uint16_t num_hmacs;
 			uint16_t i;
 
 			if (plen > sizeof(hmacs_store))
 				break;
 			if (got_hmacs) {
 				/* already processed a HMAC list */
 				goto next_param;
 			}
 			phdr = sctp_get_next_param(m, offset,
 			    (struct sctp_paramhdr *)hmacs_store,
 			    plen);
 			if (phdr == NULL)
 				return (-28);
 			hmacs = (struct sctp_auth_hmac_algo *)phdr;
 			hmacs_len = plen - sizeof(*hmacs);
 			num_hmacs = hmacs_len / sizeof(hmacs->hmac_ids[0]);
 			/* validate the hmac list */
 			if (sctp_verify_hmac_param(hmacs, num_hmacs)) {
 				return (-29);
 			}
 			if (stcb->asoc.peer_hmacs != NULL)
 				sctp_free_hmaclist(stcb->asoc.peer_hmacs);
 			stcb->asoc.peer_hmacs = sctp_alloc_hmaclist(num_hmacs);
 			if (stcb->asoc.peer_hmacs != NULL) {
 				for (i = 0; i < num_hmacs; i++) {
 					(void)sctp_auth_add_hmacid(stcb->asoc.peer_hmacs,
 					    ntohs(hmacs->hmac_ids[i]));
 				}
 			}
 			got_hmacs = 1;
 		} else if (ptype == SCTP_CHUNK_LIST) {
 			int i;
 
 			if (plen > sizeof(chunks_store))
 				break;
 			if (got_chklist) {
 				/* already processed a Chunks list */
 				goto next_param;
 			}
 			phdr = sctp_get_next_param(m, offset,
 			    (struct sctp_paramhdr *)chunks_store,
 			    plen);
 			if (phdr == NULL)
 				return (-30);
 			chunks = (struct sctp_auth_chunk_list *)phdr;
 			num_chunks = plen - sizeof(*chunks);
 			if (stcb->asoc.peer_auth_chunks != NULL)
 				sctp_clear_chunklist(stcb->asoc.peer_auth_chunks);
 			else
 				stcb->asoc.peer_auth_chunks = sctp_alloc_chunklist();
 			for (i = 0; i < num_chunks; i++) {
 				(void)sctp_auth_add_chunk(chunks->chunk_types[i],
 				    stcb->asoc.peer_auth_chunks);
 				/* record asconf/asconf-ack if listed */
 				if (chunks->chunk_types[i] == SCTP_ASCONF)
 					saw_asconf = 1;
 				if (chunks->chunk_types[i] == SCTP_ASCONF_ACK)
 					saw_asconf_ack = 1;
 
 			}
 			got_chklist = 1;
 		} else if ((ptype == SCTP_HEARTBEAT_INFO) ||
 			    (ptype == SCTP_STATE_COOKIE) ||
 			    (ptype == SCTP_UNRECOG_PARAM) ||
 			    (ptype == SCTP_COOKIE_PRESERVE) ||
 			    (ptype == SCTP_SUPPORTED_ADDRTYPE) ||
 			    (ptype == SCTP_ADD_IP_ADDRESS) ||
 			    (ptype == SCTP_DEL_IP_ADDRESS) ||
 			    (ptype == SCTP_ERROR_CAUSE_IND) ||
 		    (ptype == SCTP_SUCCESS_REPORT)) {
 			 /* don't care */ ;
 		} else {
 			if ((ptype & 0x8000) == 0x0000) {
 				/*
 				 * must stop processing the rest of the
 				 * param's. Any report bits were handled
 				 * with the call to
 				 * sctp_arethere_unrecognized_parameters()
 				 * when the INIT or INIT-ACK was first seen.
 				 */
 				break;
 			}
 		}
 
 next_param:
 		offset += SCTP_SIZE32(plen);
 		if (offset >= limit) {
 			break;
 		}
 		phdr = sctp_get_next_param(m, offset, &param_buf,
 		    sizeof(param_buf));
 	}
 	/* Now check to see if we need to purge any addresses */
 	TAILQ_FOREACH_SAFE(net, &stcb->asoc.nets, sctp_next, nnet) {
 		if ((net->dest_state & SCTP_ADDR_NOT_IN_ASSOC) ==
 		    SCTP_ADDR_NOT_IN_ASSOC) {
 			/* This address has been removed from the asoc */
 			/* remove and free it */
 			stcb->asoc.numnets--;
 			TAILQ_REMOVE(&stcb->asoc.nets, net, sctp_next);
 			sctp_free_remote_addr(net);
 			if (net == stcb->asoc.primary_destination) {
 				stcb->asoc.primary_destination = NULL;
 				sctp_select_primary_destination(stcb);
 			}
 		}
 	}
 	if ((stcb->asoc.ecn_supported == 1) &&
 	    (peer_supports_ecn == 0)) {
 		stcb->asoc.ecn_supported = 0;
 	}
 	if ((stcb->asoc.prsctp_supported == 1) &&
 	    (peer_supports_prsctp == 0)) {
 		stcb->asoc.prsctp_supported = 0;
 	}
 	if ((stcb->asoc.auth_supported == 1) &&
 	    ((peer_supports_auth == 0) ||
 	    (got_random == 0) || (got_hmacs == 0))) {
 		stcb->asoc.auth_supported = 0;
 	}
 	if ((stcb->asoc.asconf_supported == 1) &&
 	    ((peer_supports_asconf == 0) || (peer_supports_asconf_ack == 0) ||
 	    (stcb->asoc.auth_supported == 0) ||
 	    (saw_asconf == 0) || (saw_asconf_ack == 0))) {
 		stcb->asoc.asconf_supported = 0;
 	}
 	if ((stcb->asoc.reconfig_supported == 1) &&
 	    (peer_supports_reconfig == 0)) {
 		stcb->asoc.reconfig_supported = 0;
 	}
 	if ((stcb->asoc.idata_supported == 1) &&
 	    (peer_supports_idata == 0)) {
 		stcb->asoc.idata_supported = 0;
 	}
 	if ((stcb->asoc.nrsack_supported == 1) &&
 	    (peer_supports_nrsack == 0)) {
 		stcb->asoc.nrsack_supported = 0;
 	}
 	if ((stcb->asoc.pktdrop_supported == 1) &&
 	    (peer_supports_pktdrop == 0)) {
 		stcb->asoc.pktdrop_supported = 0;
 	}
 	/* validate authentication required parameters */
 	if ((peer_supports_auth == 0) && (got_chklist == 1)) {
 		/* peer does not support auth but sent a chunks list? */
 		return (-31);
 	}
 	if ((peer_supports_asconf == 1) && (peer_supports_auth == 0)) {
 		/* peer supports asconf but not auth? */
 		return (-32);
 	} else if ((peer_supports_asconf == 1) &&
 		    (peer_supports_auth == 1) &&
 	    ((saw_asconf == 0) || (saw_asconf_ack == 0))) {
 		return (-33);
 	}
 	/* concatenate the full random key */
 	keylen = sizeof(*p_random) + random_len + sizeof(*hmacs) + hmacs_len;
 	if (chunks != NULL) {
 		keylen += sizeof(*chunks) + num_chunks;
 	}
 	new_key = sctp_alloc_key(keylen);
 	if (new_key != NULL) {
 		/* copy in the RANDOM */
 		if (p_random != NULL) {
 			keylen = sizeof(*p_random) + random_len;
 			memcpy(new_key->key, p_random, keylen);
 		} else {
 			keylen = 0;
 		}
 		/* append in the AUTH chunks */
 		if (chunks != NULL) {
 			memcpy(new_key->key + keylen, chunks,
 			    sizeof(*chunks) + num_chunks);
 			keylen += sizeof(*chunks) + num_chunks;
 		}
 		/* append in the HMACs */
 		if (hmacs != NULL) {
 			memcpy(new_key->key + keylen, hmacs,
 			    sizeof(*hmacs) + hmacs_len);
 		}
 	} else {
 		/* failed to get memory for the key */
 		return (-34);
 	}
 	if (stcb->asoc.authinfo.peer_random != NULL)
 		sctp_free_key(stcb->asoc.authinfo.peer_random);
 	stcb->asoc.authinfo.peer_random = new_key;
 	sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.assoc_keyid);
 	sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.recv_keyid);
 
 	return (0);
 }
 
 /*
  * Make a destination the primary path of an association.
  *
  * The destination is 'net' when it is non-NULL; otherwise it is looked up
  * from 'sa' with sctp_findnet().
  *
  * Returns -1 when no destination could be identified and 0 otherwise.  A
  * destination that is still flagged SCTP_ADDR_UNCONFIRMED is not switched
  * to; it is only marked SCTP_ADDR_REQ_PRIMARY and 0 is returned.
  */
 int
 sctp_set_primary_addr(struct sctp_tcb *stcb, struct sockaddr *sa,
     struct sctp_nets *net)
 {
 	struct sctp_nets *head_net;
 
 	/* no net handed in: try to resolve it from the socket address */
 	if ((net == NULL) && (sa != NULL)) {
 		net = sctp_findnet(stcb, sa);
 	}
 	if (net == NULL) {
 		/* the requested primary address is not part of the assoc */
 		return (-1);
 	}
 	if (net->dest_state & SCTP_ADDR_UNCONFIRMED) {
 		/* it must be confirmed first, so just queue the request */
 		net->dest_state |= SCTP_ADDR_REQ_PRIMARY;
 		return (0);
 	}
 	/* switch the primary over */
 	stcb->asoc.primary_destination = net;
 	if (((net->dest_state & SCTP_ADDR_PF) == 0) &&
 	    (stcb->asoc.alternate != NULL)) {
 		/* drop the alternate unless the new primary is flagged PF */
 		sctp_free_remote_addr(stcb->asoc.alternate);
 		stcb->asoc.alternate = NULL;
 	}
 	head_net = TAILQ_FIRST(&stcb->asoc.nets);
 	if (head_net != stcb->asoc.primary_destination) {
 		/*
 		 * Keep the primary at the head of the nets list; per the
 		 * original note, sctp_cmpaddr() is much more efficient
 		 * when the primary is the first entry.
 		 */
 		TAILQ_REMOVE(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next);
 		TAILQ_INSERT_HEAD(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next);
 	}
 	return (0);
 }
 
 /*
  * Decide whether verification tag 'tag' may be (re)used for the given
  * local/remote port pair.
  *
  * Returns 1 when the tag is usable and 0 when it is either in use by an
  * existing association or still listed in the time-wait table.  As a side
  * effect, time-wait entries whose expiry second lies before now->tv_sec
  * are cleared while the table is scanned.
  *
  * NOTE(review): the clearing of expired entries happens while only
  * SCTP_INP_INFO_RLOCK() is held -- confirm that writing under this lock
  * is acceptable.
  */
 int
 sctp_is_vtag_good(uint32_t tag, uint16_t lport, uint16_t rport, struct timeval *now)
 {
 	struct sctpasochead *bucket;
 	struct sctpvtaghead *tw_chain;
 	struct sctp_tagblock *blk;
 	struct sctp_tcb *stcb;
 	int usable = 1;
 	int slot;
 
 	SCTP_INP_INFO_RLOCK();
 	bucket = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(tag,
 	    SCTP_BASE_INFO(hashasocmark))];
 	LIST_FOREACH(stcb, bucket, sctp_asocs) {
 		/*
 		 * No per-TCB or per-INP lock is taken here.  The original
 		 * author's stated (and self-doubted) assumption is that
 		 * holding the info read lock keeps both from being freed.
 		 */
 		if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
 			continue;
 		}
 		if ((stcb->asoc.my_vtag == tag) &&
 		    (stcb->rport == rport) &&
 		    (stcb->sctp_ep->sctp_lport == lport)) {
 			/* tag/port set is in use by a live association */
 			usable = 0;
 			goto out;
 		}
 	}
 	/* not in use; now consult the time-wait table */
 	tw_chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
 	LIST_FOREACH(blk, tw_chain, sctp_nxt_tagblock) {
 		for (slot = 0; slot < SCTP_NUMBER_IN_VTAG_BLOCK; slot++) {
 			if (blk->vtag_block[slot].v_tag == 0) {
 				/* empty slot */
 				continue;
 			}
 			if ((long)blk->vtag_block[slot].tv_sec_at_expire <
 			    now->tv_sec) {
 				/* entry has expired: wipe the slot */
 				blk->vtag_block[slot].tv_sec_at_expire = 0;
 				blk->vtag_block[slot].v_tag = 0;
 				blk->vtag_block[slot].lport = 0;
 				blk->vtag_block[slot].rport = 0;
 				continue;
 			}
 			if ((blk->vtag_block[slot].v_tag == tag) &&
 			    (blk->vtag_block[slot].lport == lport) &&
 			    (blk->vtag_block[slot].rport == rport)) {
 				/* still in time-wait, must not be reused */
 				usable = 0;
 				goto out;
 			}
 		}
 	}
 out:
 	SCTP_INP_INFO_RUNLOCK();
 	return (usable);
 }
 
 /*
  * Renege on received-but-undelivered data of one association.
  *
  * For every inbound stream, both the ordered (inqueue) and the unordered
  * (uno_inqueue) queues are walked.  Each queued read-control entry whose
  * sinfo_tsn is greater than cumulative_tsn + 1 is unlinked and freed
  * together with all chunks on its reassembly list; the size/count
  * accounting in the association is reduced and the freed TSNs are cleared
  * from the mapping array.  If anything was freed, highest_tsn_inside_map
  * is recomputed, the RECV timer is stopped and a SACK is sent.
  *
  * The caller in this file (sctp_drain()) invokes this with the TCB locked.
  */
 static void
 sctp_drain_mbufs(struct sctp_tcb *stcb)
 {
 	/*
 	 * We must hunt this association for MBUF's past the cumack (i.e.
 	 * out of order data that we can renege on).
 	 */
 	struct sctp_association *asoc;
 	struct sctp_tmit_chunk *chk, *nchk;
 	uint32_t cumulative_tsn_p1;
 	struct sctp_queued_to_read *control, *ncontrol;
 	int cnt, strmat;
 	uint32_t gap, i;
 	int fnd = 0;
 
 	/* We look for anything larger than the cum-ack + 1 */
 
 	asoc = &stcb->asoc;
 	if (asoc->cumulative_tsn == asoc->highest_tsn_inside_map) {
 		/* none we can reneg on. */
 		return;
 	}
 	SCTP_STAT_INCR(sctps_protocol_drains_done);
 	cumulative_tsn_p1 = asoc->cumulative_tsn + 1;
 	/* cnt counts every control entry and every reassembly chunk freed */
 	cnt = 0;
 	/* Walk every inbound stream: first its ordered, then its unordered queue */
 	for (strmat = 0; strmat < asoc->streamincnt; strmat++) {
 		TAILQ_FOREACH_SAFE(control, &asoc->strmin[strmat].inqueue, next_instrm, ncontrol) {
 #ifdef INVARIANTS
 			if (control->on_strm_q != SCTP_ON_ORDERED) {
 				panic("Huh control: %p on_q: %d -- not ordered?",
 				    control, control->on_strm_q);
 			}
 #endif
 			if (SCTP_TSN_GT(control->sinfo_tsn, cumulative_tsn_p1)) {
 				/* Yep it is above cum-ack */
 				cnt++;
 				SCTP_CALC_TSN_TO_GAP(gap, control->sinfo_tsn, asoc->mapping_array_base_tsn);
 				KASSERT(control->length > 0, ("control has zero length"));
 				/* never let the byte accounting underflow */
 				if (asoc->size_on_all_streams >= control->length) {
 					asoc->size_on_all_streams -= control->length;
 				} else {
 #ifdef INVARIANTS
 					panic("size_on_all_streams = %u smaller than control length %u", asoc->size_on_all_streams, control->length);
 #else
 					asoc->size_on_all_streams = 0;
 #endif
 				}
 				sctp_ucount_decr(asoc->cnt_on_all_streams);
 				SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap);
 				/* unlink from the endpoint read queue and the stream queue */
 				if (control->on_read_q) {
 					TAILQ_REMOVE(&stcb->sctp_ep->read_queue, control, next);
 					control->on_read_q = 0;
 				}
 				TAILQ_REMOVE(&asoc->strmin[strmat].inqueue, control, next_instrm);
 				control->on_strm_q = 0;
 				if (control->data) {
 					sctp_m_freem(control->data);
 					control->data = NULL;
 				}
 				sctp_free_remote_addr(control->whoFrom);
 				/* Free every chunk still on this entry's reassembly list */
 				TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, nchk) {
 					cnt++;
 					SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.tsn, asoc->mapping_array_base_tsn);
 					KASSERT(chk->send_size > 0, ("chunk has zero length"));
 					if (asoc->size_on_reasm_queue >= chk->send_size) {
 						asoc->size_on_reasm_queue -= chk->send_size;
 					} else {
 #ifdef INVARIANTS
 						panic("size_on_reasm_queue = %u smaller than chunk length %u", asoc->size_on_reasm_queue, chk->send_size);
 #else
 						asoc->size_on_reasm_queue = 0;
 #endif
 					}
 					sctp_ucount_decr(asoc->cnt_on_reasm_queue);
 					SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap);
 					TAILQ_REMOVE(&control->reasm, chk, sctp_next);
 					if (chk->data) {
 						sctp_m_freem(chk->data);
 						chk->data = NULL;
 					}
 					sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
 				}
 				sctp_free_a_readq(stcb, control);
 			}
 		}
 		/* Same treatment for the unordered queue of this stream */
 		TAILQ_FOREACH_SAFE(control, &asoc->strmin[strmat].uno_inqueue, next_instrm, ncontrol) {
 #ifdef INVARIANTS
 			if (control->on_strm_q != SCTP_ON_UNORDERED) {
 				panic("Huh control: %p on_q: %d -- not unordered?",
 				    control, control->on_strm_q);
 			}
 #endif
 			if (SCTP_TSN_GT(control->sinfo_tsn, cumulative_tsn_p1)) {
 				/* Yep it is above cum-ack */
 				cnt++;
 				SCTP_CALC_TSN_TO_GAP(gap, control->sinfo_tsn, asoc->mapping_array_base_tsn);
 				KASSERT(control->length > 0, ("control has zero length"));
 				/* never let the byte accounting underflow */
 				if (asoc->size_on_all_streams >= control->length) {
 					asoc->size_on_all_streams -= control->length;
 				} else {
 #ifdef INVARIANTS
 					panic("size_on_all_streams = %u smaller than control length %u", asoc->size_on_all_streams, control->length);
 #else
 					asoc->size_on_all_streams = 0;
 #endif
 				}
 				sctp_ucount_decr(asoc->cnt_on_all_streams);
 				SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap);
 				/* unlink from the endpoint read queue and the stream queue */
 				if (control->on_read_q) {
 					TAILQ_REMOVE(&stcb->sctp_ep->read_queue, control, next);
 					control->on_read_q = 0;
 				}
 				TAILQ_REMOVE(&asoc->strmin[strmat].uno_inqueue, control, next_instrm);
 				control->on_strm_q = 0;
 				if (control->data) {
 					sctp_m_freem(control->data);
 					control->data = NULL;
 				}
 				sctp_free_remote_addr(control->whoFrom);
 				/* Free every chunk still on this entry's reassembly list */
 				TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, nchk) {
 					cnt++;
 					SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.tsn, asoc->mapping_array_base_tsn);
 					KASSERT(chk->send_size > 0, ("chunk has zero length"));
 					if (asoc->size_on_reasm_queue >= chk->send_size) {
 						asoc->size_on_reasm_queue -= chk->send_size;
 					} else {
 #ifdef INVARIANTS
 						panic("size_on_reasm_queue = %u smaller than chunk length %u", asoc->size_on_reasm_queue, chk->send_size);
 #else
 						asoc->size_on_reasm_queue = 0;
 #endif
 					}
 					sctp_ucount_decr(asoc->cnt_on_reasm_queue);
 					SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap);
 					TAILQ_REMOVE(&control->reasm, chk, sctp_next);
 					if (chk->data) {
 						sctp_m_freem(chk->data);
 						chk->data = NULL;
 					}
 					sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
 				}
 				sctp_free_a_readq(stcb, control);
 			}
 		}
 	}
 	if (cnt) {
 		/*
 		 * We must back down to see what the new highest is: scan
 		 * from the old highest TSN towards the mapping array base
 		 * for the first TSN still marked present.
 		 */
 		for (i = asoc->highest_tsn_inside_map; SCTP_TSN_GE(i, asoc->mapping_array_base_tsn); i--) {
 			SCTP_CALC_TSN_TO_GAP(gap, i, asoc->mapping_array_base_tsn);
 			if (SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) {
 				asoc->highest_tsn_inside_map = i;
 				fnd = 1;
 				break;
 			}
 		}
 		if (!fnd) {
 			/* nothing left in the map: park it just below the base */
 			asoc->highest_tsn_inside_map = asoc->mapping_array_base_tsn - 1;
 		}
 
 		/*
 		 * Question, should we go through the delivery queue? The
 		 * only reason things are on here is the app not reading OR
 		 * a p-d-api up. An attacker COULD send enough in to
 		 * initiate the PD-API and then send a bunch of stuff to
 		 * other streams... these would wind up on the delivery
 		 * queue.. and then we would not get to them. But in order
 		 * to do this I then have to back-track and un-deliver
 		 * sequence numbers in streams.. el-yucko. I think for now
 		 * we will NOT look at the delivery queue and leave it to be
 		 * something to consider later. An alternative would be to
 		 * abort the P-D-API with a notification and then deliver
 		 * the data.... Or another method might be to keep track of
 		 * how many times the situation occurs and if we see a
 		 * possible attack underway just abort the association.
 		 */
 #ifdef SCTP_DEBUG
 		SCTPDBG(SCTP_DEBUG_PCB1, "Freed %d chunks from reneg harvest\n", cnt);
 #endif
 		/*
 		 * Record how many entries were revoked, stop the RECV
 		 * timer and send a SACK right away, then kick chunk output.
 		 */
 		asoc->last_revoke_count = cnt;
 		sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL,
 		    SCTP_FROM_SCTP_PCB + SCTP_LOC_11);
 		/* sa_ignore NO_NULL_CHK */
 		sctp_send_sack(stcb, SCTP_SO_NOT_LOCKED);
 		sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_DRAIN, SCTP_SO_NOT_LOCKED);
 	}
 	/*
 	 * NOTE(review): an earlier comment here stated that the highest_tsn
 	 * marker is NOT adjusted after un-setting TSN's in the mapping
 	 * array, and warned that this could cause extra mapping-array work
 	 * or a SACK for every datagram.  The code above does recompute
 	 * asoc->highest_tsn_inside_map whenever something was freed, so
 	 * that remark appears to be stale -- confirm and drop if so.
 	 */
 }
 
 /*
  * Protocol drain entry point.
  *
  * Walks every vnet and, for each one where the sctp_do_drain sysctl is
  * non-zero, visits every association of every endpoint and lets
  * sctp_drain_mbufs() renege on queued out-of-order data.  The drain-call
  * statistic is bumped for every vnet visited, whether or not draining is
  * enabled there.
  *
  * Every CURVNET_SET() is paired with CURVNET_RESTORE() and the vnet list
  * lock is always released, including when draining is disabled for a vnet.
  */
 void
 sctp_drain(void)
 {
 	/*
 	 * We must walk the PCB lists for ALL associations here. The system
 	 * is LOW on MBUF's and needs help. This is where reneging will
 	 * occur. We really hope this does NOT happen!
 	 */
 	VNET_ITERATOR_DECL(vnet_iter);
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		struct sctp_inpcb *inp;
 		struct sctp_tcb *stcb;
 
 		SCTP_STAT_INCR(sctps_protocol_drain_calls);
 		/*
 		 * Skip the walk when draining is disabled, but still fall
 		 * through to CURVNET_RESTORE() below instead of leaving the
 		 * loop body (or the function) early.
 		 */
 		if (SCTP_BASE_SYSCTL(sctp_do_drain) != 0) {
 			SCTP_INP_INFO_RLOCK();
 			LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) {
 				/* For each endpoint */
 				SCTP_INP_RLOCK(inp);
 				LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 					/* For each association */
 					SCTP_TCB_LOCK(stcb);
 					sctp_drain_mbufs(stcb);
 					SCTP_TCB_UNLOCK(stcb);
 				}
 				SCTP_INP_RUNLOCK(inp);
 			}
 			SCTP_INP_INFO_RUNLOCK();
 		}
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Queue a new iterator.
  *
  * An iterator walks endpoints and their associations, filtered by the
  * pcb_state / pcb_features / asoc_state arguments:
  *   af   (mandatory) is run for every matching association,
  *   inpf (optional)  is run for each new endpoint as it is reached,
  *   inpe (optional)  is run when an endpoint's associations are done,
  *   ef   (optional)  is run when the iterator completes.
  * When s_inp is given only that endpoint is iterated; otherwise iteration
  * starts at the head of the global endpoint list.
  *
  * Returns 0 when the iterator has been queued and -1 when af is NULL, the
  * PCB layer is not initialized, or the iterator cannot be allocated.
  *
  * NOTE(review): the rollback path below frees the iterator without
  * dropping the reference taken on it->inp -- confirm whether that is
  * intentional.
  */
 int
 sctp_initiate_iterator(inp_func inpf,
     asoc_func af,
     inp_func inpe,
     uint32_t pcb_state,
     uint32_t pcb_features,
     uint32_t asoc_state,
     void *argp,
     uint32_t argi,
     end_func ef,
     struct sctp_inpcb *s_inp,
     uint8_t chunk_output_off)
 {
 	struct sctp_iterator *it = NULL;
 
 	if (af == NULL) {
 		return (-1);
 	}
 	if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) {
 		SCTP_PRINTF("%s: abort on initialize being %d\n", __func__,
 		    SCTP_BASE_VAR(sctp_pcb_initialized));
 		return (-1);
 	}
 	SCTP_MALLOC(it, struct sctp_iterator *, sizeof(struct sctp_iterator),
 	    SCTP_M_ITER);
 	if (it == NULL) {
 		SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOMEM);
 		return (-1);
 	}
 	memset(it, 0, sizeof(*it));
 	/* callbacks */
 	it->function_inp = inpf;
 	it->function_assoc = af;
 	it->function_inp_end = inpe;
 	it->function_atend = ef;
 	/* without a per-endpoint callback the endpoint counts as done */
 	it->done_current_ep = (inpf != NULL) ? 0 : 1;
 	/* callback arguments */
 	it->pointer = argp;
 	it->val = argi;
 	/* filters */
 	it->pcb_flags = pcb_state;
 	it->pcb_features = pcb_features;
 	it->asoc_state = asoc_state;
 	it->no_chunk_output = chunk_output_off;
 	it->vn = curvnet;
 	if (s_inp != NULL) {
 		/* Assume lock is held here */
 		it->iterator_flags = SCTP_ITERATOR_DO_SINGLE_INP;
 		it->inp = s_inp;
 		SCTP_INP_INCR_REF(it->inp);
 	} else {
 		it->iterator_flags = SCTP_ITERATOR_DO_ALL_INP;
 		SCTP_INP_INFO_RLOCK();
 		it->inp = LIST_FIRST(&SCTP_BASE_INFO(listhead));
 		if (it->inp != NULL) {
 			SCTP_INP_INCR_REF(it->inp);
 		}
 		SCTP_INP_INFO_RUNLOCK();
 	}
 	SCTP_IPI_ITERATOR_WQ_LOCK();
 	if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) {
 		/* the stack went away while we were setting up: roll back */
 		SCTP_IPI_ITERATOR_WQ_UNLOCK();
 		SCTP_PRINTF("%s: rollback on initialize being %d it=%p\n", __func__,
 		    SCTP_BASE_VAR(sctp_pcb_initialized), it);
 		SCTP_FREE(it, SCTP_M_ITER);
 		return (-1);
 	}
 	TAILQ_INSERT_TAIL(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr);
 	if (sctp_it_ctl.iterator_running == 0) {
 		sctp_wakeup_iterator();
 	}
 	SCTP_IPI_ITERATOR_WQ_UNLOCK();
 	/* sa_ignore MEMLEAK {memory is put on the tailq for the iterator} */
 	return (0);
 }
Index: projects/clang1100-import/sys/netinet/sctp_structs.h
===================================================================
--- projects/clang1100-import/sys/netinet/sctp_structs.h	(revision 364278)
+++ projects/clang1100-import/sys/netinet/sctp_structs.h	(revision 364279)
@@ -1,1247 +1,1248 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
  * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * a) Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
  *
  * b) Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the distribution.
  *
  * c) Neither the name of Cisco Systems, Inc. nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifndef _NETINET_SCTP_STRUCTS_H_
 #define _NETINET_SCTP_STRUCTS_H_
 
 #include <netinet/sctp_os.h>
 #include <netinet/sctp_header.h>
 #include <netinet/sctp_auth.h>
 
 /*
  * Wrapper around an OS timer: the timer itself, its type, and untyped
  * references to the entities the timer was started for.
  */
 struct sctp_timer {
 	sctp_os_timer_t timer;
 
 	int type;
 	/*
 	 * Depending on the timer type these will be setup and cast with the
 	 * appropriate entity.
 	 */
 	void *ep;
 	void *tcb;
 	void *net;
 	void *vnet;
 
 	/* for sanity checking */
 	void *self;
 	uint32_t ticks;
 	uint32_t stopped_from;
 };
 
 
 /*
  * NOTE(review): purpose not evident from this file; presumably a small
  * debugging record (endpoint, source line, tick count, up/down marker)
  * -- TODO confirm against its users.
  */
 struct sctp_foo_stuff {
 	struct sctp_inpcb *inp;
 	uint32_t lineno;
 	uint32_t ticks;
 	int updown;
 };
 
 
 /*
  * This is the information we track on each interface that we know about from
  * the distant end.
  */
 TAILQ_HEAD(sctpnetlisthead, sctp_nets);
 
 /*
  * One entry on a stream-reset list (see the sctp_resethead list head).
  * The stream numbers are stored in the trailing flexible array member;
  * number_entries presumably holds its element count -- TODO confirm.
  */
 struct sctp_stream_reset_list {
 	TAILQ_ENTRY(sctp_stream_reset_list) next_resp;
 	uint32_t seq;
 	uint32_t tsn;
 	uint32_t number_entries;
 	uint16_t list_of_streams[];
 };
 
 TAILQ_HEAD(sctp_resethead, sctp_stream_reset_list);
 
 /*
  * Users of the iterator start one with a call to
  * sctp_initiate_iterator(inp_func, assoc_func, inp_func, pcb_flags,
  *     pcb_features, asoc_state, void-ptr-arg, uint32-arg, end_func, inp,
  *     chunk_output_off);
  *
  * Use the following three defines if you don't care what pcb flags or
  * features are on the EP and/or you don't care what state the association
  * is in.
  *
  * Note that if you specify an INP as the next-to-last argument then ONLY
  * the associations of that single INP will be executed upon. Note that the
  * pcb flags STILL apply, so if the inp you specify has different pcb_flags
  * than what you put in pcb_flags nothing will happen. Use
  * SCTP_PCB_ANY_FLAGS to assure the inp you specify gets treated.
  */
 #define SCTP_PCB_ANY_FLAGS	0x00000000
 #define SCTP_PCB_ANY_FEATURES	0x00000000
 #define SCTP_ASOC_ANY_STATE	0x00000000
 
 /*
  * Iterator callback types.  'ptr' and 'val' are the opaque pointer and
  * 32-bit value stored in the iterator (its 'pointer' and 'val' members).
  *   asoc_func - per-association callback
  *   inp_func  - per-endpoint callback (used for both the start-of-endpoint
  *               and the end-of-endpoint hook); returns an int
  *   end_func  - called when the iterator completes
  */
 typedef void (*asoc_func) (struct sctp_inpcb *, struct sctp_tcb *, void *ptr,
     uint32_t val);
 typedef int (*inp_func) (struct sctp_inpcb *, void *ptr, uint32_t val);
 typedef void (*end_func) (void *ptr, uint32_t val);
 
 #if defined(SCTP_MCORE_INPUT) && defined(SMP)
 /*
  * One queued inbound packet for the multi-core input path: the mbuf, the
  * vnet it belongs to, an offset into it and an IPv6 marker.
  */
 struct sctp_mcore_queue {
 	TAILQ_ENTRY(sctp_mcore_queue) next;
 	struct vnet *vn;
 	struct mbuf *m;
 	int off;
 	int v6;
 };
 
 TAILQ_HEAD(sctp_mcore_qhead, sctp_mcore_queue);
 
 /*
  * Per-core control block: the worker thread, its packet queue and the
  * mutexes that go with it.  Presumably one instance exists per CPU
  * (cpuid) -- TODO confirm.
  */
 struct sctp_mcore_ctrl {
 	SCTP_PROCESS_STRUCT thread_proc;
 	struct sctp_mcore_qhead que;
 	struct mtx core_mtx;
 	struct mtx que_mtx;
 	int running;
 	int cpuid;
 };
 #endif
 
 /*
  * State of one queued iterator, as set up by sctp_initiate_iterator():
  * the callbacks to run, their arguments, the filters that select
  * endpoints/associations, and the current position of the walk.
  */
 struct sctp_iterator {
 	TAILQ_ENTRY(sctp_iterator) sctp_nxt_itr;
 	struct vnet *vn;	/* vnet the iterator was created in */
 	struct sctp_timer tmr;
 	struct sctp_inpcb *inp;	/* current endpoint */
 	struct sctp_tcb *stcb;	/* current* assoc */
 	struct sctp_inpcb *next_inp;	/* special hook to skip to */
 	asoc_func function_assoc;	/* per assoc function */
 	inp_func function_inp;	/* per endpoint function */
 	inp_func function_inp_end;	/* end INP function */
 	end_func function_atend;	/* iterator completion function */
 	void *pointer;		/* pointer for apply func to use */
 	uint32_t val;		/* value for apply func to use */
 	uint32_t pcb_flags;	/* endpoint flags being checked */
 	uint32_t pcb_features;	/* endpoint features being checked */
 	uint32_t asoc_state;	/* assoc state being checked */
 	uint32_t iterator_flags;	/* SCTP_ITERATOR_DO_* values */
 	uint8_t no_chunk_output;
 	uint8_t done_current_ep;	/* initialized to 1 when no per
 					 * endpoint function was given */
 };
 
 /* iterator_flags values */
 #define SCTP_ITERATOR_DO_ALL_INP	0x00000001
 #define SCTP_ITERATOR_DO_SINGLE_INP	0x00000002
 
 
 TAILQ_HEAD(sctpiterators, sctp_iterator);
 
 /*
  * Work record carrying an endpoint, an mbuf chain with its send
  * information and length, and success/failure counters.
  * NOTE(review): presumably the argument block for a send-to-all
  * iterator -- confirm against its users.
  */
 struct sctp_copy_all {
 	struct sctp_inpcb *inp;	/* ep */
 	struct mbuf *m;
 	struct sctp_sndrcvinfo sndrcv;
 	ssize_t sndlen;
 	int cnt_sent;
 	int cnt_failed;
 };
 
 /*
  * Work list plus counter for an ASCONF iterator run.
  * NOTE(review): cnt presumably counts the entries on list_of_work --
  * confirm.
  */
 struct sctp_asconf_iterator {
 	struct sctpladdr list_of_work;
 	int cnt;
 };
 
 /*
  * Global control block for the iterator machinery: the queue of pending
  * iterators, the one currently being run, the worker thread and the
  * mutexes protecting them.
  */
 struct iterator_control {
 	struct mtx ipi_iterator_wq_mtx;
 	struct mtx it_mtx;
 	SCTP_PROCESS_STRUCT thread_proc;
 	struct sctpiterators iteratorhead;	/* queued iterators */
 	struct sctp_iterator *cur_it;
 	uint32_t iterator_running;	/* checked before waking the iterator */
 	uint32_t iterator_flags;
 };
 /* flag values that ask for the current iterator / endpoint to be stopped */
 #define SCTP_ITERATOR_STOP_CUR_IT	0x00000004
 #define SCTP_ITERATOR_STOP_CUR_INP	0x00000008
 
 /*
  * Routing information kept per remote destination: next-hop and
  * link-layer entry references, prepend data, and the remote/selected
  * source addresses.
  * NOTE(review): the leading ro_* members presumably mirror the layout of
  * the generic struct route -- confirm before reordering anything.
  */
 struct sctp_net_route {
 	struct nhop_object *ro_nh;
 	struct llentry *ro_lle;
 	char *ro_prepend;
 	uint16_t ro_plen;
 	uint16_t ro_flags;
 	uint16_t ro_mtu;
 	uint16_t spare;
 	union sctp_sockstore _l_addr;	/* remote peer addr */
 	struct sctp_ifa *_s_addr;	/* our selected src addr */
 };
 
 /*
  * Per-destination state for the HTCP congestion control algorithm (used
  * as the htcp_ca member of the cc_mod union in struct sctp_nets).
  */
 struct htcp {
 	uint16_t alpha;		/* Fixed point arith, << 7 */
 	uint8_t beta;		/* Fixed point arith, << 7 */
 	uint8_t modeswitch;	/* Delay modeswitch until we had at least one
 				 * congestion event */
 	uint32_t last_cong;	/* Time since last congestion event end */
 	uint32_t undo_last_cong;
 	uint16_t bytes_acked;
 	uint32_t bytecount;
 	uint32_t minRTT;
 	uint32_t maxRTT;
 
 	/* saved copies, presumably for undoing a reduction -- TODO confirm */
 	uint32_t undo_maxRTT;
 	uint32_t undo_old_maxB;
 
 	/* Bandwidth estimation */
 	uint32_t minB;
 	uint32_t maxB;
 	uint32_t old_maxB;
 	uint32_t Bi;
 	uint32_t lasttime;
 };
 
 /*
  * Per-destination state for the rtcc congestion control module (used as
  * the rtcc member of the cc_mod union in struct sctp_nets).
  */
 struct rtcc_cc {
 	struct timeval tls;	/* The time we started the sending  */
 	uint64_t lbw;		/* Our last estimated bw */
 	uint64_t lbw_rtt;	/* RTT at bw estimate */
 	uint64_t bw_bytes;	/* The total bytes since this sending began */
 	uint64_t bw_tot_time;	/* The total time since sending began */
 	uint64_t new_tot_time;	/* temp holding the new value */
 	uint64_t bw_bytes_at_last_rttc;	/* What bw_bytes was at last rtt calc */
 	uint32_t cwnd_at_bw_set;	/* Cwnd at last bw saved - lbw */
 	uint32_t vol_reduce;	/* cnt of voluntary reductions */
 	uint16_t steady_step;	/* The number required to be in steady state */
 	uint16_t step_cnt;	/* The current number */
 	uint8_t ret_from_eq;	/* When all things are equal what do I return
 				 * 0/1 - 1 no cc advance */
 	uint8_t use_dccc_ecn;	/* Flag to enable DCCC ECN */
 	uint8_t tls_needs_set;	/* Flag to indicate we need to set tls 0 or 1
 				 * means set at send 2 not */
 	uint8_t last_step_state;	/* Last state if steady state stepdown
 					 * is on */
 	uint8_t rtt_set_this_sack;	/* Flag saying this sack had RTT calc
 					 * on it */
 	uint8_t last_inst_ind;	/* Last saved inst indication */
 };
 
 
 /*
  * Everything tracked for one remote destination address of an
  * association: its timers, route, RTT/RTO estimation, congestion control
  * state, CMT bookkeeping and status flags.  Instances are linked on the
  * association's nets list through sctp_next and are reference counted
  * (ref_count).
  */
 struct sctp_nets {
 	TAILQ_ENTRY(sctp_nets) sctp_next;	/* next link */
 
 	/*
 	 * Things on the top half may be able to be split into a common
 	 * structure shared by all.
 	 */
 	struct sctp_timer pmtu_timer;
 	struct sctp_timer hb_timer;
 
 	/*
 	 * The following two in combination equate to a route entry for v6
 	 * or v4.
 	 */
 	struct sctp_net_route ro;
 
 	/* mtu discovered so far */
 	uint32_t mtu;
 	uint32_t ssthresh;	/* not sure about this one for split */
 	uint32_t last_cwr_tsn;
 	uint32_t cwr_window_tsn;
 	uint32_t ecn_ce_pkt_cnt;
 	uint32_t lost_cnt;
 	/* smoothed average things for RTT and RTO itself */
 	int lastsa;
 	int lastsv;
 	uint64_t rtt;		/* last measured rtt value in us */
 	uint32_t RTO;
 
 	/* This is used for SHUTDOWN/SHUTDOWN-ACK/SEND or INIT timers */
 	struct sctp_timer rxt_timer;
 
 	/* last time in seconds I sent to it */
 	struct timeval last_sent_time;
 	/* private state of whichever congestion control module is in use */
 	union cc_control_data {
 		struct htcp htcp_ca;	/* JRS - struct used in HTCP algorithm */
 		struct rtcc_cc rtcc;	/* rtcc module cc stuff  */
 	}               cc_mod;
 	int ref_count;
 
 	/* Congestion stats per destination */
 	/*
 	 * flight size variables and such, sorry Vern, I could not avoid
 	 * this if I wanted performance :>
 	 */
 	uint32_t flight_size;
 	uint32_t cwnd;		/* actual cwnd */
 	uint32_t prev_cwnd;	/* cwnd before any processing */
 	uint32_t ecn_prev_cwnd;	/* ECN prev cwnd at first ecn_echo seen in new
 				 * window */
 	uint32_t partial_bytes_acked;	/* in CA tracks when to incr a MTU */
 	/* tracking variables to avoid the alloc/free in sack processing */
 	unsigned int net_ack;
 	unsigned int net_ack2;
 
 	/*
 	 * JRS - 5/8/07 - Variable to track last time a destination was
 	 * active for CMT PF
 	 */
 	uint32_t last_active;
 
 	/*
 	 * CMT variables (iyengar@cis.udel.edu)
 	 */
 	uint32_t this_sack_highest_newack;	/* tracks highest TSN newly
 						 * acked for a given dest in
 						 * the current SACK. Used in
 						 * SFR and HTNA algos */
 	uint32_t pseudo_cumack;	/* CMT CUC algorithm. Maintains next expected
 				 * pseudo-cumack for this destination */
 	uint32_t rtx_pseudo_cumack;	/* CMT CUC algorithm. Maintains next
 					 * expected pseudo-cumack for this
 					 * destination */
 
 	/* CMT fast recovery variables */
 	uint32_t fast_recovery_tsn;
 	uint32_t heartbeat_random1;
 	uint32_t heartbeat_random2;
 #ifdef INET6
 	uint32_t flowlabel;
 #endif
 	uint8_t dscp;
 
 	struct timeval start_time;	/* time when this net was created */
 	uint32_t marked_retrans;	/* number of DATA chunks marked for
 					 * timer based retransmissions */
 	uint32_t marked_fastretrans;
 	uint32_t heart_beat_delay;	/* Heart Beat delay in ms */
 
 	/* if this guy is ok or not ... status */
 	uint16_t dest_state;
 	/* number of timeouts to consider the destination unreachable */
 	uint16_t failure_threshold;
 	/* number of timeouts to consider the destination potentially failed */
 	uint16_t pf_threshold;
 	/* error stats on the destination */
 	uint16_t error_count;
 	/* UDP port number in case of UDP tunneling */
 	uint16_t port;
 
 	uint8_t fast_retran_loss_recovery;
 	uint8_t will_exit_fast_recovery;
 	/* Flags that probably can be combined into dest_state */
 	uint8_t fast_retran_ip;	/* fast retransmit in progress */
 	uint8_t hb_responded;
 	uint8_t saw_newack;	/* CMT's SFR algorithm flag */
 	uint8_t src_addr_selected;	/* if we split we move */
 	uint8_t indx_of_eligible_next_to_use;
 	uint8_t addr_is_local;	/* it's a local address (if known) could move
 				 * in split */
 
 	/*
 	 * CMT variables (iyengar@cis.udel.edu)
 	 */
 	uint8_t find_pseudo_cumack;	/* CMT CUC algorithm. Flag used to
 					 * find a new pseudocumack. This flag
 					 * is set after a new pseudo-cumack
 					 * has been received and indicates
 					 * that the sender should find the
 					 * next pseudo-cumack expected for
 					 * this destination */
 	uint8_t find_rtx_pseudo_cumack;	/* CMT CUCv2 algorithm. Flag used to
 					 * find a new rtx-pseudocumack. This
 					 * flag is set after a new
 					 * rtx-pseudo-cumack has been received
 					 * and indicates that the sender
 					 * should find the next
 					 * rtx-pseudo-cumack expected for this
 					 * destination */
 	uint8_t new_pseudo_cumack;	/* CMT CUC algorithm. Flag used to
 					 * indicate if a new pseudo-cumack or
 					 * rtx-pseudo-cumack has been received */
 	uint8_t window_probe;	/* Doing a window probe? */
 	uint8_t RTO_measured;	/* Have we done the first measure */
 	uint8_t last_hs_used;	/* index into the last HS table entry we used */
 	uint8_t lan_type;
 	uint8_t rto_needed;
 	uint32_t flowid;
 	uint8_t flowtype;
 };
 
 
 /*
  * Per-chunk record for a DATA chunk: its identifiers (TSN, message id,
  * stream id, PPID, fragment sequence number) plus retransmission and
  * PR-SCTP bookkeeping.  Used as the 'data' arm of the rec union in
  * struct sctp_tmit_chunk.
  */
 struct sctp_data_chunkrec {
 	uint32_t tsn;		/* the TSN of this transmit */
 	uint32_t mid;		/* the message identifier of this transmit */
 	uint16_t sid;		/* the stream number of this guy */
 	uint32_t ppid;
 	uint32_t context;	/* from send */
 	uint32_t cwnd_at_send;
 	/*
 	 * part of the Highest sacked algorithm to be able to stroke counts
 	 * on ones that are FR'd.
 	 */
 	uint32_t fast_retran_tsn;	/* sending_seq at the time of FR */
 	struct timeval timetodrop;	/* time we drop it from queue */
 	uint32_t fsn;		/* Fragment Sequence Number */
 	uint8_t doing_fast_retransmit;
 	uint8_t rcv_flags;	/* flags pulled from data chunk on inbound for
 				 * outbound holds sending flags for PR-SCTP. */
 	uint8_t state_flags;
 	uint8_t chunk_was_revoked;
 	uint8_t fwd_tsn_cnt;
 };
 
 TAILQ_HEAD(sctpchunk_listhead, sctp_tmit_chunk);
 
 /* The lower byte is used to enumerate PR_SCTP policies */
 #define CHUNK_FLAGS_PR_SCTP_TTL	        SCTP_PR_SCTP_TTL
 #define CHUNK_FLAGS_PR_SCTP_BUF	        SCTP_PR_SCTP_BUF
 #define CHUNK_FLAGS_PR_SCTP_RTX         SCTP_PR_SCTP_RTX
 
 /* The upper byte is used as a bit mask */
 #define CHUNK_FLAGS_FRAGMENT_OK	        0x0100
 
 struct chk_id {		/* the "chunk_id" member of sctp_tmit_chunk's rec union */
 	uint8_t id;
 	uint8_t can_take_data;
 };
 
 
 struct sctp_tmit_chunk {	/* element type of the sctpchunk_listhead queues */
 	union {			/* holds either a data record or a chunk id */
 		struct sctp_data_chunkrec data;
 		struct chk_id chunk_id;
 	}     rec;
 	struct sctp_association *asoc;	/* bp to asoc this belongs to */
 	struct timeval sent_rcv_time;	/* filled in if RTT being calculated */
 	struct mbuf *data;	/* pointer to mbuf chain of data */
 	struct mbuf *last_mbuf;	/* pointer to last mbuf in chain */
 	struct sctp_nets *whoTo;
 	          TAILQ_ENTRY(sctp_tmit_chunk) sctp_next;	/* next link */
 	int32_t sent;		/* the send status */
 	uint16_t snd_count;	/* number of times I sent */
 	uint16_t flags;		/* flags, such as CHUNK_FLAGS_FRAGMENT_OK */
 	uint16_t send_size;
 	uint16_t book_size;
 	uint16_t mbcnt;
 	uint16_t auth_keyid;
 	uint8_t holds_key_ref;	/* flag if auth keyid refcount is held */
 	uint8_t pad_inplace;
 	uint8_t do_rtt;
 	uint8_t book_size_scale;
 	uint8_t no_fr_allowed;
 	uint8_t copy_by_ref;
 	uint8_t window_probe;
 };
 
 struct sctp_queued_to_read {	/* sinfo structure plus more */
 	uint16_t sinfo_stream;	/* off the wire */
 	uint16_t sinfo_flags;	/* SCTP_UNORDERED from wire use SCTP_EOF for
 				 * EOR */
 	uint32_t sinfo_ppid;	/* off the wire */
 	uint32_t sinfo_context;	/* pick this up from assoc def context? */
 	uint32_t sinfo_timetolive;	/* not used by kernel */
 	uint32_t sinfo_tsn;	/* Use this in reassembly as first TSN */
 	uint32_t sinfo_cumtsn;	/* Use this in reassembly as last TSN */
 	sctp_assoc_t sinfo_assoc_id;	/* our assoc id */
 	/* Non sinfo stuff */
 	uint32_t mid;		/* Fragment Index; NOTE(review): mid is "message identifier" in sctp_data_chunkrec - confirm */
 	uint32_t length;	/* length of data */
 	uint32_t held_length;	/* length held in sb */
 	uint32_t top_fsn;	/* Highest FSN in queue */
 	uint32_t fsn_included;	/* Highest FSN in *data portion */
 	struct sctp_nets *whoFrom;	/* where it came from */
 	struct mbuf *data;	/* front of the mbuf chain of data with
 				 * PKT_HDR */
 	struct mbuf *tail_mbuf;	/* used for multi-part data */
 	struct mbuf *aux_data;	/* used to hold/cache control if o/s does not
 				 * take it from us */
 	struct sctp_tcb *stcb;	/* assoc, used for window update */
 	         TAILQ_ENTRY(sctp_queued_to_read) next;
 	         TAILQ_ENTRY(sctp_queued_to_read) next_instrm;
 	struct sctpchunk_listhead reasm;	/* list of sctp_tmit_chunk */
 	uint16_t port_from;
 	uint16_t spec_flags;	/* Flags to hold the notification field */
 	uint8_t do_not_ref_stcb;
 	uint8_t end_added;
 	uint8_t pdapi_aborted;
 	uint8_t pdapi_started;
 	uint8_t some_taken;
 	uint8_t last_frag_seen;
 	uint8_t first_frag_seen;
 	uint8_t on_read_q;
 	uint8_t on_strm_q;
 };
 
 #define SCTP_ON_ORDERED 1
 #define SCTP_ON_UNORDERED 2
 
 /* This data structure will be on the outbound
  * stream queues. Data will be pulled off from
  * the front of the mbuf data and chunk-ified
  * by the output routines. We will custom
  * fit every chunk we pull to the send/sent
  * queue to make up the next full packet
  * if we can. An entry cannot be removed
  * from the stream_out queue until
  * the msg_is_complete flag is set. This
  * means at times data/tail_mbuf MIGHT
  * be NULL. If that occurs it happens
  * for one of two reasons. Either the user
  * is blocked on a send() call and has not
  * awoken to copy more data down... OR
  * the user is in the explicit MSG_EOR mode
  * and wrote some data, but has not completed
  * sending.
  */
 struct sctp_stream_queue_pending {
 	struct mbuf *data;
 	struct mbuf *tail_mbuf;
 	struct timeval ts;
 	struct sctp_nets *net;
 	          TAILQ_ENTRY(sctp_stream_queue_pending) next;
 	          TAILQ_ENTRY(sctp_stream_queue_pending) ss_next;
 	uint32_t fsn;
 	uint32_t length;
 	uint32_t timetolive;
 	uint32_t ppid;
 	uint32_t context;
 	uint16_t sinfo_flags;
 	uint16_t sid;
 	uint16_t act_flags;
 	uint16_t auth_keyid;
 	uint8_t holds_key_ref;
 	uint8_t msg_is_complete;	/* entry may not be removed until set (see above) */
 	uint8_t some_taken;
 	uint8_t sender_all_done;
 	uint8_t put_last_out;
 	uint8_t discard_rest;
+	uint8_t processing;
 };
 
 /*
  * This struct contains info that is used to track inbound stream data and
  * help with ordering.
  */
 TAILQ_HEAD(sctpwheelunrel_listhead, sctp_stream_in);
 struct sctp_stream_in {
 	struct sctp_readhead inqueue;
 	struct sctp_readhead uno_inqueue;
 	uint32_t last_mid_delivered;	/* used for re-order */
 	uint16_t sid;		/* presumably this stream's number, as sid in sctp_data_chunkrec - confirm */
 	uint8_t delivery_started;
 	uint8_t pd_api_started;
 };
 
 TAILQ_HEAD(sctpwheel_listhead, sctp_stream_out);
 TAILQ_HEAD(sctplist_listhead, sctp_stream_queue_pending);
 
 
 /* Round-robin schedulers */
 struct ss_rr {			/* the rr member of union scheduling_parameters */
 	/* next link in wheel */
 	TAILQ_ENTRY(sctp_stream_out) next_spoke;
 };
 
 /* Priority scheduler */
 struct ss_prio {		/* the prio member of union scheduling_parameters */
 	/* next link in wheel */
 	TAILQ_ENTRY(sctp_stream_out) next_spoke;
 	/* priority id */
 	uint16_t priority;
 };
 
 /* Fair Bandwidth scheduler */
 struct ss_fb {			/* the fb member of union scheduling_parameters */
 	/* next link in wheel */
 	TAILQ_ENTRY(sctp_stream_out) next_spoke;
 	/* stores message size; NOTE(review): field is named rounds - confirm wording */
 	int32_t rounds;
 };
 
 /*
  * This struct holds all data necessary for
  * different stream schedulers.
  */
 struct scheduling_data {
 	struct sctp_stream_out *locked_on_sending;
 	/* circular looking for output selection */
 	struct sctp_stream_out *last_out_stream;
 	union {
 		struct sctpwheel_listhead wheel;	/* TAILQ of sctp_stream_out */
 		struct sctplist_listhead list;	/* TAILQ of sctp_stream_queue_pending */
 	}     out;
 };
 
 /*
  * This union holds all parameters per stream
  * necessary for different stream schedulers.
  */
 union scheduling_parameters {
 	struct ss_rr rr;	/* round-robin schedulers */
 	struct ss_prio prio;	/* priority scheduler */
 	struct ss_fb fb;	/* fair bandwidth scheduler */
 };
 
 /* States for outgoing streams */
 #define SCTP_STREAM_CLOSED           0x00
 #define SCTP_STREAM_OPENING          0x01
 #define SCTP_STREAM_OPEN             0x02
 #define SCTP_STREAM_RESET_PENDING    0x03
 #define SCTP_STREAM_RESET_IN_FLIGHT  0x04
 
 #define SCTP_MAX_STREAMS_AT_ONCE_RESET 200
 
 /* This struct is used to track the traffic on outbound streams */
 struct sctp_stream_out {
 	struct sctp_streamhead outqueue;
 	union scheduling_parameters ss_params;	/* per-stream scheduler parameters */
 	uint32_t chunks_on_queues;	/* send queue and sent queue */
 #if defined(SCTP_DETAILED_STR_STATS)
 	uint32_t abandoned_unsent[SCTP_PR_SCTP_MAX + 1];
 	uint32_t abandoned_sent[SCTP_PR_SCTP_MAX + 1];
 #else
 	/* Only the aggregation */
 	uint32_t abandoned_unsent[1];
 	uint32_t abandoned_sent[1];
 #endif
 	/*
 	 * For associations using DATA chunks, the lower 16 bits of
 	 * next_mid_ordered are used as the next SSN.
 	 */
 	uint32_t next_mid_ordered;
 	uint32_t next_mid_unordered;
 	uint16_t sid;
 	uint8_t last_msg_incomplete;
 	uint8_t state;		/* presumably one of the SCTP_STREAM_* states - confirm */
 };
 
 /* Used to keep track of the addresses yet to try to add/delete */
 TAILQ_HEAD(sctp_asconf_addrhead, sctp_asconf_addr);
 struct sctp_asconf_addr {	/* element of sctp_association's asconf_queue */
 	TAILQ_ENTRY(sctp_asconf_addr) next;
 	struct sctp_asconf_addr_param ap;
 	struct sctp_ifa *ifa;	/* save the ifa for add/del ip */
 	uint8_t sent;		/* has this been sent yet? */
 	uint8_t special_del;	/* not to be used in lookup */
 };
 
 struct sctp_scoping {		/* embedded in sctp_association as scope */
 	uint8_t ipv4_addr_legal;
 	uint8_t ipv6_addr_legal;
 	uint8_t loopback_scope;
 	uint8_t ipv4_local_scope;
 	uint8_t local_scope;
 	uint8_t site_scope;
 };
 
 #define SCTP_TSN_LOG_SIZE 40
 
 struct sctp_tsn_log {		/* entry type of sctp_association's in_tsnlog/out_tsnlog */
 	void *stcb;
 	uint32_t tsn;
 	uint32_t seq;
 	uint16_t strm;
 	uint16_t sz;
 	uint16_t flgs;
 	uint16_t in_pos;
 	uint16_t in_out;
 	uint16_t resv;
 };
 
 #define SCTP_FS_SPEC_LOG_SIZE 200
 struct sctp_fs_spec_log {	/* entry type of sctp_association's fslog (SCTP_FS_SPEC_LOG) */
 	uint32_t sent;
 	uint32_t total_flight;
 	uint32_t tsn;
 	uint16_t book;
 	uint8_t incr;
 	uint8_t decr;
 };
 
 /* This struct is here to cut out the compatibility
  * pad that bulks up both the inp and stcb. The non
  * pad portion MUST stay in complete sync with
  * sctp_sndrcvinfo... i.e. if sinfo_xxxx is added
  * this must be done here too.
  */
 struct sctp_nonpad_sndrcvinfo {
 	uint16_t sinfo_stream;
 	uint16_t sinfo_ssn;
 	uint16_t sinfo_flags;
 	uint32_t sinfo_ppid;
 	uint32_t sinfo_context;
 	uint32_t sinfo_timetolive;
 	uint32_t sinfo_tsn;
 	uint32_t sinfo_cumtsn;
 	sctp_assoc_t sinfo_assoc_id;
 	uint16_t sinfo_keynumber;
 	uint16_t sinfo_keynumber_valid;
 };
 
 /*
  * JRS - Structure to hold function pointers to the functions responsible
  * for congestion control.
  */
 
 struct sctp_cc_functions {	/* embedded in sctp_association as cc_functions */
 	void (*sctp_set_initial_cc_param) (struct sctp_tcb *stcb, struct sctp_nets *net);
 	void (*sctp_cwnd_update_after_sack) (struct sctp_tcb *stcb,
 	    struct sctp_association *asoc,
 	    int accum_moved, int reneged_all, int will_exit);
 	void (*sctp_cwnd_update_exit_pf) (struct sctp_tcb *stcb, struct sctp_nets *net);
 	void (*sctp_cwnd_update_after_fr) (struct sctp_tcb *stcb,
 	    struct sctp_association *asoc);
 	void (*sctp_cwnd_update_after_timeout) (struct sctp_tcb *stcb,
 	    struct sctp_nets *net);
 	void (*sctp_cwnd_update_after_ecn_echo) (struct sctp_tcb *stcb,
 	    struct sctp_nets *net, int in_window, int num_pkt_lost);
 	void (*sctp_cwnd_update_after_packet_dropped) (struct sctp_tcb *stcb,
 	    struct sctp_nets *net, struct sctp_pktdrop_chunk *cp,
 	    uint32_t *bottle_bw, uint32_t *on_queue);
 	void (*sctp_cwnd_update_after_output) (struct sctp_tcb *stcb,
 	    struct sctp_nets *net, int burst_limit);
 	void (*sctp_cwnd_update_packet_transmitted) (struct sctp_tcb *stcb,
 	    struct sctp_nets *net);
 	void (*sctp_cwnd_update_tsn_acknowledged) (struct sctp_nets *net,
 	    struct sctp_tmit_chunk *);
 	void (*sctp_cwnd_new_transmission_begins) (struct sctp_tcb *stcb,
 	    struct sctp_nets *net);
 	void (*sctp_cwnd_prepare_net_for_sack) (struct sctp_tcb *stcb,
 	    struct sctp_nets *net);
 	int (*sctp_cwnd_socket_option) (struct sctp_tcb *stcb, int set, struct sctp_cc_option *);
 	void (*sctp_rtt_calculated) (struct sctp_tcb *, struct sctp_nets *, struct timeval *);
 };
 
 /*
  * RS - Structure to hold function pointers to the functions responsible
  * for stream scheduling.
  */
 struct sctp_ss_functions {	/* embedded in sctp_association as ss_functions */
 	void (*sctp_ss_init) (struct sctp_tcb *stcb, struct sctp_association *asoc,
 	    int holds_lock);
 	void (*sctp_ss_clear) (struct sctp_tcb *stcb, struct sctp_association *asoc,
 	    int clear_values, int holds_lock);
 	void (*sctp_ss_init_stream) (struct sctp_tcb *stcb, struct sctp_stream_out *strq, struct sctp_stream_out *with_strq);
 	void (*sctp_ss_add_to_stream) (struct sctp_tcb *stcb, struct sctp_association *asoc,
 	    struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp, int holds_lock);
 	int (*sctp_ss_is_empty) (struct sctp_tcb *stcb, struct sctp_association *asoc);
 	void (*sctp_ss_remove_from_stream) (struct sctp_tcb *stcb, struct sctp_association *asoc,
 	    struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp, int holds_lock);
 struct sctp_stream_out *(*sctp_ss_select_stream) (struct sctp_tcb *stcb,
 	    struct sctp_nets *net, struct sctp_association *asoc);
 	void (*sctp_ss_scheduled) (struct sctp_tcb *stcb, struct sctp_nets *net,
 	    struct sctp_association *asoc, struct sctp_stream_out *strq, int moved_how_much);
 	void (*sctp_ss_packet_done) (struct sctp_tcb *stcb, struct sctp_nets *net,
 	    struct sctp_association *asoc);
 	int (*sctp_ss_get_value) (struct sctp_tcb *stcb, struct sctp_association *asoc,
 	    struct sctp_stream_out *strq, uint16_t *value);
 	int (*sctp_ss_set_value) (struct sctp_tcb *stcb, struct sctp_association *asoc,
 	    struct sctp_stream_out *strq, uint16_t value);
 	int (*sctp_ss_is_user_msgs_incomplete) (struct sctp_tcb *stcb, struct sctp_association *asoc);
 };
 
 /* Used to save ASCONF chunks for retransmission */
 TAILQ_HEAD(sctp_asconf_head, sctp_asconf);
 struct sctp_asconf {		/* element type of sctp_asconf_head */
 	TAILQ_ENTRY(sctp_asconf) next;
 	uint32_t serial_number;
 	uint16_t snd_count;
 	struct mbuf *data;
 	uint16_t len;
 };
 
 /* Used to save ASCONF-ACK chunks for retransmission */
 TAILQ_HEAD(sctp_asconf_ackhead, sctp_asconf_ack);
 struct sctp_asconf_ack {	/* element of sctp_association's asconf_ack_sent */
 	TAILQ_ENTRY(sctp_asconf_ack) next;
 	uint32_t serial_number;
 	struct sctp_nets *last_sent_to;
 	struct mbuf *data;
 	uint16_t len;
 };
 
 /*
  * Here we have information about each individual association that we track.
  * We probably in production would be more dynamic. But for ease of
  * implementation we will have a fixed array that we hunt for in a linear
  * fashion.
  */
 struct sctp_association {
 	/* association state */
 	int state;
 
 	/* queue of pending addrs to add/delete */
 	struct sctp_asconf_addrhead asconf_queue;
 
 	struct timeval time_entered;	/* time we entered state */
 	struct timeval time_last_rcvd;
 	struct timeval time_last_sent;
 	struct timeval time_last_sat_advance;
 	struct sctp_nonpad_sndrcvinfo def_send;
 
 	/* timers and such */
 	struct sctp_timer dack_timer;	/* Delayed ack timer */
 	struct sctp_timer asconf_timer;	/* asconf */
 	struct sctp_timer strreset_timer;	/* stream reset */
 	struct sctp_timer shut_guard_timer;	/* shutdown guard */
 	struct sctp_timer autoclose_timer;	/* automatic close timer */
 	struct sctp_timer delete_prim_timer;	/* deleting primary dst */
 
 	/* list of restricted local addresses */
 	struct sctpladdr sctp_restricted_addrs;
 
 	/* last local address pending deletion (waiting for an address add) */
 	struct sctp_ifa *asconf_addr_del_pending;
 	/* Deleted primary destination (used to stop timer) */
 	struct sctp_nets *deleted_primary;
 
 	struct sctpnetlisthead nets;	/* remote address list */
 
 	/* Free chunk list */
 	struct sctpchunk_listhead free_chunks;
 
 	/* Control chunk queue */
 	struct sctpchunk_listhead control_send_queue;
 
 	/* ASCONF chunk queue */
 	struct sctpchunk_listhead asconf_send_queue;
 
 	/*
 	 * Once a TSN hits the wire it is moved to the sent_queue. We
 	 * maintain two counts here (don't know if any but retran_cnt is
 	 * needed). The idea is that the sent_queue_retran_cnt reflects how
 	 * many chunks have been marked for retransmission by either T3-rxt
 	 * or FR.
 	 */
 	struct sctpchunk_listhead sent_queue;
 	struct sctpchunk_listhead send_queue;
 
 	/* Scheduling queues */
 	struct scheduling_data ss_data;
 
 	/* If an iterator is looking at me, this is it */
 	struct sctp_iterator *stcb_starting_point_for_iterator;
 
 	/* ASCONF save the last ASCONF-ACK so we can resend it if necessary */
 	struct sctp_asconf_ackhead asconf_ack_sent;
 
 	/*
 	 * pointer to last stream reset queued to control queue by us with
 	 * requests.
 	 */
 	struct sctp_tmit_chunk *str_reset;
 	/*
 	 * if Source Address Selection happening, this will rotate through
 	 * the link list.
 	 */
 	struct sctp_laddr *last_used_address;
 
 	/* stream arrays */
 	struct sctp_stream_in *strmin;
 	struct sctp_stream_out *strmout;
 	uint8_t *mapping_array;
 	/* primary destination to use */
 	struct sctp_nets *primary_destination;
 	struct sctp_nets *alternate;	/* If primary is down or PF */
 	/* For CMT */
 	struct sctp_nets *last_net_cmt_send_started;
 	/* last place I got a data chunk from */
 	struct sctp_nets *last_data_chunk_from;
 	/* last place I got a control from */
 	struct sctp_nets *last_control_chunk_from;
 
 
 	/*
 	 * wait to the point the cum-ack passes req->send_reset_at_tsn for
 	 * any req on the list.
 	 */
 	struct sctp_resethead resetHead;
 
 	/* queue of chunks waiting to be sent into the local stack */
 	struct sctp_readhead pending_reply_queue;
 
 	/* JRS - the congestion control functions are in this struct */
 	struct sctp_cc_functions cc_functions;
 	/*
 	 * JRS - value to store the currently loaded congestion control
 	 * module
 	 */
 	uint32_t congestion_control_module;
 	/* RS - the stream scheduling functions are in this struct */
 	struct sctp_ss_functions ss_functions;
 	/* RS - value to store the currently loaded stream scheduling module */
 	uint32_t stream_scheduling_module;
 
 	uint32_t vrf_id;
 	uint32_t cookie_preserve_req;
 	/* ASCONF next seq I am sending out, inits at init-tsn */
 	uint32_t asconf_seq_out;
 	uint32_t asconf_seq_out_acked;
 	/* ASCONF last received ASCONF from peer, starts at peer's TSN-1 */
 	uint32_t asconf_seq_in;
 
 	/* next seq I am sending in str reset messages */
 	uint32_t str_reset_seq_out;
 	/* next seq I am expecting in str reset messages */
 	uint32_t str_reset_seq_in;
 
 	/* various verification tag information */
 	uint32_t my_vtag;	/* The tag to be used. if assoc is re-initiated
 				 * by remote end, and I have unlocked this
 				 * will be regenerated to a new random value. */
 	uint32_t peer_vtag;	/* The peers last tag */
 
 	uint32_t my_vtag_nonce;
 	uint32_t peer_vtag_nonce;
 
 	uint32_t assoc_id;
 
 	/* This is the SCTP fragmentation threshold */
 	uint32_t smallest_mtu;
 
 	/*
 	 * Special hook for Fast retransmit, allows us to track the highest
 	 * TSN that is NEW in this SACK if gap ack blocks are present.
 	 */
 	uint32_t this_sack_highest_gap;
 
 	/*
 	 * The highest consecutive TSN that has been acked by peer on my
 	 * sends
 	 */
 	uint32_t last_acked_seq;
 
 	/* The next TSN that I will use in sending. */
 	uint32_t sending_seq;
 
 	/* Original seq number I used ??questionable to keep?? */
 	uint32_t init_seq_number;
 
 
 	/* The Advanced Peer Ack Point, as required by the PR-SCTP */
 	/* (A1 in Section 4.2) */
 	uint32_t advanced_peer_ack_point;
 
 	/*
 	 * The highest consecutive TSN at the bottom of the mapping array
 	 * (for his sends).
 	 */
 	uint32_t cumulative_tsn;
 	/*
 	 * Used to track the mapping array and its offset bits. This MAY be
 	 * lower than cumulative_tsn.
 	 */
 	uint32_t mapping_array_base_tsn;
 	/*
 	 * used to track highest TSN we have received and is listed in the
 	 * mapping array.
 	 */
 	uint32_t highest_tsn_inside_map;
 
 	/* EY - new NR variables used for nr_sack based on mapping_array */
 	uint8_t *nr_mapping_array;
 	uint32_t highest_tsn_inside_nr_map;
 
 	uint32_t fast_recovery_tsn;
 	uint32_t sat_t3_recovery_tsn;
 	uint32_t tsn_last_delivered;
 	/*
 	 * For the pd-api we should re-write this a bit more efficient. We
 	 * could have multiple sctp_queued_to_read's that we are building at
 	 * once. Now we only do this when we get ready to deliver to the
 	 * socket buffer. Note that we depend on the fact that the struct is
 	 * "stuck" on the read queue until we finish all the pd-api.
 	 */
 	struct sctp_queued_to_read *control_pdapi;
 
 	uint32_t tsn_of_pdapi_last_delivered;
 	uint32_t pdapi_ppid;
 	uint32_t context;
 	uint32_t last_reset_action[SCTP_MAX_RESET_PARAMS];
 	uint32_t last_sending_seq[SCTP_MAX_RESET_PARAMS];
 	uint32_t last_base_tsnsent[SCTP_MAX_RESET_PARAMS];
 #ifdef SCTP_ASOCLOG_OF_TSNS
 	/*
 	 * special log - This adds considerable size to the asoc, but
 	 * provides a log that you can use to detect problems via kgdb.
 	 */
 	struct sctp_tsn_log in_tsnlog[SCTP_TSN_LOG_SIZE];
 	struct sctp_tsn_log out_tsnlog[SCTP_TSN_LOG_SIZE];
 	uint32_t cumack_log[SCTP_TSN_LOG_SIZE];
 	uint32_t cumack_logsnt[SCTP_TSN_LOG_SIZE];
 	uint16_t tsn_in_at;
 	uint16_t tsn_out_at;
 	uint16_t tsn_in_wrapped;
 	uint16_t tsn_out_wrapped;
 	uint16_t cumack_log_at;
 	uint16_t cumack_log_atsnt;
 #endif				/* SCTP_ASOCLOG_OF_TSNS */
 #ifdef SCTP_FS_SPEC_LOG
 	struct sctp_fs_spec_log fslog[SCTP_FS_SPEC_LOG_SIZE];
 	uint16_t fs_index;
 #endif
 
 	/*
 	 * window state information and smallest MTU that I use to bound
 	 * segmentation
 	 */
 	uint32_t peers_rwnd;
 	uint32_t my_rwnd;
 	uint32_t my_last_reported_rwnd;
 	uint32_t sctp_frag_point;
 
 	uint32_t total_output_queue_size;
 
 	uint32_t sb_cc;		/* shadow of sb_cc */
 	uint32_t sb_send_resv;	/* amount reserved on a send */
 	uint32_t my_rwnd_control_len;	/* shadow of sb_mbcnt used for rwnd
 					 * control */
 #ifdef INET6
 	uint32_t default_flowlabel;
 #endif
 	uint32_t pr_sctp_cnt;
 	int ctrl_queue_cnt;	/* could be removed  REM - NO IT CAN'T!! RRS */
 	/*
 	 * All outbound datagrams queue into this list from the individual
 	 * stream queue. Here they get assigned a TSN and then await
 	 * sending. The stream seq comes when it is first put in the
 	 * individual str queue
 	 */
 	unsigned int stream_queue_cnt;
 	unsigned int send_queue_cnt;
 	unsigned int sent_queue_cnt;
 	unsigned int sent_queue_cnt_removeable;
 	/*
 	 * Number on sent queue that are marked for retran until this value
 	 * is 0 we only send one packet of retran'ed data.
 	 */
 	unsigned int sent_queue_retran_cnt;
 
 	unsigned int size_on_reasm_queue;
 	unsigned int cnt_on_reasm_queue;
 	unsigned int fwd_tsn_cnt;
 	/* amount of data (bytes) currently in flight (on all destinations) */
 	unsigned int total_flight;
 	/* Total book size in flight */
 	unsigned int total_flight_count;	/* count of chunks used with
 						 * book total */
 	/* count of destination nets and list of destination nets */
 	unsigned int numnets;
 
 	/* Total error count on this association */
 	unsigned int overall_error_count;
 
 	unsigned int cnt_msg_on_sb;
 
 	/* All stream count of chunks for delivery */
 	unsigned int size_on_all_streams;
 	unsigned int cnt_on_all_streams;
 
 	/* Heart Beat delay in ms */
 	uint32_t heart_beat_delay;
 
 	/* autoclose */
 	uint32_t sctp_autoclose_ticks;
 
 	/* how many preopen streams we have */
 	unsigned int pre_open_streams;
 
 	/* How many streams I support coming into me */
 	unsigned int max_inbound_streams;
 
 	/* the cookie life I award for any cookie, in seconds */
 	uint32_t cookie_life;
 	/* time to delay acks for */
 	unsigned int delayed_ack;
 	unsigned int old_delayed_ack;
 	unsigned int sack_freq;
 	unsigned int data_pkts_seen;
 
 	unsigned int numduptsns;
 	int dup_tsns[SCTP_MAX_DUP_TSNS];
 	uint32_t initial_init_rto_max;	/* initial RTO for INIT's */
 	uint32_t initial_rto;	/* initial send RTO */
 	uint32_t minrto;	/* per assoc RTO-MIN */
 	uint32_t maxrto;	/* per assoc RTO-MAX */
 
 	/* authentication fields */
 	sctp_auth_chklist_t *local_auth_chunks;
 	sctp_auth_chklist_t *peer_auth_chunks;
 	sctp_hmaclist_t *local_hmacs;	/* local HMACs supported */
 	sctp_hmaclist_t *peer_hmacs;	/* peer HMACs supported */
 	struct sctp_keyhead shared_keys;	/* assoc's shared keys */
 	sctp_authinfo_t authinfo;	/* randoms, cached keys */
 	/*
 	 * refcnt to block freeing when a sender or receiver is off copying
 	 * user data in.
 	 */
 	uint32_t refcnt;
 	uint32_t chunks_on_out_queue;	/* total chunks floating around,
 					 * locked by send socket buffer */
 	uint32_t peers_adaptation;
 	uint32_t default_mtu;
 	uint16_t peer_hmac_id;	/* peer HMAC id to send */
 
 	/*
 	 * Being that we have no bag to collect stale cookies, and that we
 	 * really would not want to anyway.. we will count them in this
 	 * counter. We of course feed them to the pigeons right away (I have
 	 * always thought of pigeons as flying rats).
 	 */
 	uint16_t stale_cookie_count;
 
 	/*
 	 * For the partial delivery API, if up, invoked this is what last
 	 * TSN I delivered
 	 */
 	uint16_t str_of_pdapi;
 	uint16_t ssn_of_pdapi;
 
 	/* counts of actual built streams. Allocation may be more however */
 	/* could re-arrange to optimize space here. */
 	uint16_t streamincnt;
 	uint16_t streamoutcnt;
 	uint16_t strm_realoutsize;
 	uint16_t strm_pending_add_size;
 	/* my maximum number of retrans of INIT and SEND */
 	/* copied from SCTP but should be individually setable */
 	uint16_t max_init_times;
 	uint16_t max_send_times;
 
 	uint16_t def_net_failure;
 
 	uint16_t def_net_pf_threshold;
 
 	/*
 	 * lock flag: 0 is ok to send, 1+ (duals as a retran count) is awaiting
 	 * ACK. NOTE(review): seems unrelated to mapping_array_size - confirm.
 	 */
 	uint16_t mapping_array_size;
 
 	uint16_t last_strm_seq_delivered;
 	uint16_t last_strm_no_delivered;
 
 	uint16_t last_revoke_count;
 	int16_t num_send_timers_up;
 
 	uint16_t stream_locked_on;
 	uint16_t ecn_echo_cnt_onq;
 
 	uint16_t free_chunk_cnt;
 	uint8_t stream_locked;
 	uint8_t authenticated;	/* packet authenticated ok */
 	/*
 	 * This flag indicates that a SACK needs to be sent. Initially this
 	 * is 1 to send the first SACK immediately.
 	 */
 	uint8_t send_sack;
 
 	/* max burst of new packets into the network */
 	uint32_t max_burst;
 	/* max burst of fast retransmit packets */
 	uint32_t fr_max_burst;
 
 	uint8_t sat_network;	/* RTT is in range of sat net or greater */
 	uint8_t sat_network_lockout;	/* lockout code */
 	uint8_t burst_limit_applied;	/* Burst limit in effect at last send? */
 	/* flag goes on when we are doing a partial delivery api; NOTE(review): likely meant for fragmented_delivery_inprogress - confirm */
 	uint8_t hb_random_values[4];
 	uint8_t fragmented_delivery_inprogress;
 	uint8_t fragment_flags;
 	uint8_t last_flags_delivered;
 	uint8_t hb_ect_randombit;
 	uint8_t hb_random_idx;
 	uint8_t default_dscp;
 	uint8_t asconf_del_pending;	/* asconf delete last addr pending */
 	uint8_t trigger_reset;
 	/*
 	 * This value, plus all other ack'd but above cum-ack is added
 	 * together to cross check against the bit that we have yet to
 	 * define (probably in the SACK). When the cum-ack is updated, this
 	 * sum is updated as well.
 	 */
 
 	/* Flags whether an extension is supported or not */
 	uint8_t ecn_supported;
 	uint8_t prsctp_supported;
 	uint8_t auth_supported;
 	uint8_t asconf_supported;
 	uint8_t reconfig_supported;
 	uint8_t nrsack_supported;
 	uint8_t pktdrop_supported;
 	uint8_t idata_supported;
 
 	/* Did the peer make the stream config (add out) request */
 	uint8_t peer_req_out;
 
 	uint8_t local_strreset_support;
 	uint8_t peer_supports_nat;
 
 	struct sctp_scoping scope;
 	/* flags to handle send alternate net tracking */
 	uint8_t used_alt_asconfack;
 	uint8_t fast_retran_loss_recovery;
 	uint8_t sat_t3_loss_recovery;
 	uint8_t dropped_special_cnt;
 	uint8_t seen_a_sack_this_pkt;
 	uint8_t stream_reset_outstanding;
 	uint8_t stream_reset_out_is_outstanding;
 	uint8_t delayed_connection;
 	uint8_t ifp_had_enobuf;
 	uint8_t saw_sack_with_frags;
 	uint8_t saw_sack_with_nr_frags;
 	uint8_t in_asocid_hash;
 	uint8_t assoc_up_sent;
 	uint8_t adaptation_needed;
 	uint8_t adaptation_sent;
 	/* CMT variables */
 	uint8_t cmt_dac_pkts_rcvd;
 	uint8_t sctp_cmt_on_off;
 	uint8_t iam_blocking;
 	uint8_t cookie_how[8];
 	/* JRS 5/21/07 - CMT PF variable */
 	uint8_t sctp_cmt_pf;
 	uint8_t use_precise_time;
 	uint64_t sctp_features;
 	uint32_t max_cwnd;
 	uint16_t port;		/* remote UDP encapsulation port */
 	/*
 	 * The mapping array is used to track out of order sequences above
 	 * last_acked_seq. 0 indicates packet missing 1 indicates packet
 	 * rec'd. We slide it up every time we raise last_acked_seq and 0
 	 * trailing locations out.  If I get a TSN above the array
 	 * mappingArraySz, I discard the datagram and let retransmit happen.
 	 */
 	uint32_t marked_retrans;
 	uint32_t timoinit;
 	uint32_t timodata;
 	uint32_t timosack;
 	uint32_t timoshutdown;
 	uint32_t timoheartbeat;
 	uint32_t timocookie;
 	uint32_t timoshutdownack;
 	struct timeval start_time;
 	struct timeval discontinuity_time;
 	uint64_t abandoned_unsent[SCTP_PR_SCTP_MAX + 1];
 	uint64_t abandoned_sent[SCTP_PR_SCTP_MAX + 1];
 };
 
 #endif
Index: projects/clang1100-import/sys/netinet/sctputil.c
===================================================================
--- projects/clang1100-import/sys/netinet/sctputil.c	(revision 364278)
+++ projects/clang1100-import/sys/netinet/sctputil.c	(revision 364279)
@@ -1,7675 +1,7668 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
  * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * a) Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
  *
  * b) Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the distribution.
  *
  * c) Neither the name of Cisco Systems, Inc. nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <netinet/sctp_os.h>
 #include <netinet/sctp_pcb.h>
 #include <netinet/sctputil.h>
 #include <netinet/sctp_var.h>
 #include <netinet/sctp_sysctl.h>
 #ifdef INET6
 #include <netinet6/sctp6_var.h>
 #endif
 #include <netinet/sctp_header.h>
 #include <netinet/sctp_output.h>
 #include <netinet/sctp_uio.h>
 #include <netinet/sctp_timer.h>
 #include <netinet/sctp_indata.h>
 #include <netinet/sctp_auth.h>
 #include <netinet/sctp_asconf.h>
 #include <netinet/sctp_bsd_addr.h>
 #include <netinet/sctp_kdtrace.h>
 #if defined(INET6) || defined(INET)
 #include <netinet/tcp_var.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <sys/proc.h>
 #ifdef INET6
 #include <netinet/icmp6.h>
 #endif
 
 #ifndef KTR_SCTP
 #define KTR_SCTP KTR_SUBSYS
 #endif
 
 extern const struct sctp_cc_functions sctp_cc_functions[];
 extern const struct sctp_ss_functions sctp_ss_functions[];
 
 /*
  * Trace a socket-buffer accounting event.
  *
  * Captures the stcb pointer, sb->sb_cc, the association's sb_cc (0 when
  * stcb is NULL) and incr, then emits the record through SCTP_CTR6 as an
  * SCTP_LOG_EVENT_SB entry tagged with 'from'.  The function body is empty
  * unless SCTP_LOCAL_TRACE_BUF is defined.
  */
 void
 sctp_sblog(struct sockbuf *sb, struct sctp_tcb *stcb, int from, int incr)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log rec;
 
 	rec.x.sb.stcb = stcb;
 	rec.x.sb.so_sbcc = sb->sb_cc;
 	rec.x.sb.stcb_sbcc = (stcb != NULL) ? stcb->asoc.sb_cc : 0;
 	rec.x.sb.incr = incr;
 	/* The record is emitted through its generic x.misc view. */
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_SB, from,
 	    rec.x.misc.log1, rec.x.misc.log2,
 	    rec.x.misc.log3, rec.x.misc.log4);
 #endif
 }
 
 /*
  * Trace a close-path event for an endpoint.
  *
  * Records the inp pointer, inp->sctp_flags, the caller supplied location
  * code and, when stcb is non-NULL, the stcb pointer and the low 16 bits of
  * its association state (both 0 otherwise).  Emitted as
  * SCTP_LOG_EVENT_CLOSE; the "from" slot of the trace line is always 0.
  * inp is dereferenced unconditionally.  The function body is empty unless
  * SCTP_LOCAL_TRACE_BUF is defined.
  */
 void
 sctp_log_closing(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int16_t loc)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log rec;
 
 	rec.x.close.inp = (void *)inp;
 	rec.x.close.sctp_flags = inp->sctp_flags;
 	rec.x.close.loc = loc;
 	/* Start from the "no association" values, override when stcb is set. */
 	rec.x.close.stcb = 0;
 	rec.x.close.state = 0;
 	if (stcb != NULL) {
 		rec.x.close.stcb = (void *)stcb;
 		rec.x.close.state = (uint16_t)stcb->asoc.state;
 	}
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_CLOSE, 0,
 	    rec.x.misc.log1, rec.x.misc.log2,
 	    rec.x.misc.log3, rec.x.misc.log4);
 #endif
 }
 
 /*
  * Trace a round-trip-time sample for a destination.
  *
  * Zeroes the record, stores the net pointer and net->rtt / 1000, and emits
  * it as an SCTP_LOG_EVENT_RTT entry tagged with 'from'.  net is
  * dereferenced unconditionally.  The function body is empty unless
  * SCTP_LOCAL_TRACE_BUF is defined.
  */
 void
 rto_logging(struct sctp_nets *net, int from)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log sctp_clog;
 
 	memset(&sctp_clog, 0, sizeof(sctp_clog));
 	sctp_clog.x.rto.net = (void *)net;
 	/* NOTE(review): presumably converts usec to msec -- confirm units of net->rtt. */
 	sctp_clog.x.rto.rtt = net->rtt / 1000;
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_RTT,
 	    from,
 	    sctp_clog.x.misc.log1,
 	    sctp_clog.x.misc.log2,
 	    sctp_clog.x.misc.log3,
 	    sctp_clog.x.misc.log4);
 #endif
 }
 
 /*
  * Trace a stream event from bare values rather than from a queued control
  * structure.
  *
  * Stores stcb, tsn, sseq and stream in the "n_" fields of the stream log
  * record, sets both "e_" fields to 0, and emits the record as an
  * SCTP_LOG_EVENT_STRM entry tagged with 'from'.  The function body is
  * empty unless SCTP_LOCAL_TRACE_BUF is defined.
  */
 void
 sctp_log_strm_del_alt(struct sctp_tcb *stcb, uint32_t tsn, uint16_t sseq, uint16_t stream, int from)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log sctp_clog;
 
 	sctp_clog.x.strlog.stcb = stcb;
 	sctp_clog.x.strlog.n_tsn = tsn;
 	sctp_clog.x.strlog.n_sseq = sseq;
 	sctp_clog.x.strlog.e_tsn = 0;
 	sctp_clog.x.strlog.e_sseq = 0;
 	sctp_clog.x.strlog.strm = stream;
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_STRM,
 	    from,
 	    sctp_clog.x.misc.log1,
 	    sctp_clog.x.misc.log2,
 	    sctp_clog.x.misc.log3,
 	    sctp_clog.x.misc.log4);
 #endif
 }
 
 /*
  * Trace a Nagle decision for an association.
  *
  * Records the stcb pointer together with the association's total_flight,
  * total_output_queue_size, chunks_on_out_queue and total_flight_count, and
  * emits them as an SCTP_LOG_EVENT_NAGLE entry.  'action' is placed in the
  * slot the other loggers in this file use for 'from'.  stcb is
  * dereferenced unconditionally.  The function body is empty unless
  * SCTP_LOCAL_TRACE_BUF is defined.
  */
 void
 sctp_log_nagle_event(struct sctp_tcb *stcb, int action)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log sctp_clog;
 
 	sctp_clog.x.nagle.stcb = (void *)stcb;
 	sctp_clog.x.nagle.total_flight = stcb->asoc.total_flight;
 	sctp_clog.x.nagle.total_in_queue = stcb->asoc.total_output_queue_size;
 	sctp_clog.x.nagle.count_in_queue = stcb->asoc.chunks_on_out_queue;
 	sctp_clog.x.nagle.count_in_flight = stcb->asoc.total_flight_count;
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_NAGLE,
 	    action,
 	    sctp_clog.x.misc.log1,
 	    sctp_clog.x.misc.log2,
 	    sctp_clog.x.misc.log3,
 	    sctp_clog.x.misc.log4);
 #endif
 }
 
 /*
  * Trace SACK processing.
  *
  * Records the old and new cumulative ack values, a TSN, and the gap and
  * duplicate counts, and emits them as an SCTP_LOG_EVENT_SACK entry tagged
  * with 'from'.  The function body is empty unless SCTP_LOCAL_TRACE_BUF is
  * defined.
  */
 void
 sctp_log_sack(uint32_t old_cumack, uint32_t cumack, uint32_t tsn, uint16_t gaps, uint16_t dups, int from)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log sctp_clog;
 
 	sctp_clog.x.sack.cumack = cumack;
 	sctp_clog.x.sack.oldcumack = old_cumack;
 	sctp_clog.x.sack.tsn = tsn;
 	sctp_clog.x.sack.numGaps = gaps;
 	sctp_clog.x.sack.numDups = dups;
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_SACK,
 	    from,
 	    sctp_clog.x.misc.log1,
 	    sctp_clog.x.misc.log2,
 	    sctp_clog.x.misc.log3,
 	    sctp_clog.x.misc.log4);
 #endif
 }
 
 /*
  * Trace mapping-array state.
  *
  * Zeroes the record, stores the three caller supplied values as base, cum
  * and high, and emits them as an SCTP_LOG_EVENT_MAP entry tagged with
  * 'from'.  The function body is empty unless SCTP_LOCAL_TRACE_BUF is
  * defined.
  */
 void
 sctp_log_map(uint32_t map, uint32_t cum, uint32_t high, int from)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log sctp_clog;
 
 	memset(&sctp_clog, 0, sizeof(sctp_clog));
 	sctp_clog.x.map.base = map;
 	sctp_clog.x.map.cum = cum;
 	sctp_clog.x.map.high = high;
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_MAP,
 	    from,
 	    sctp_clog.x.misc.log1,
 	    sctp_clog.x.misc.log2,
 	    sctp_clog.x.misc.log3,
 	    sctp_clog.x.misc.log4);
 #endif
 }
 
 /*
  * Trace a fast-retransmit related event.
  *
  * Zeroes the record, stores the largest TSN, the largest new TSN and the
  * TSN in question, and emits them as an SCTP_LOG_EVENT_FR entry tagged
  * with 'from'.  The function body is empty unless SCTP_LOCAL_TRACE_BUF is
  * defined.
  */
 void
 sctp_log_fr(uint32_t biggest_tsn, uint32_t biggest_new_tsn, uint32_t tsn, int from)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log sctp_clog;
 
 	memset(&sctp_clog, 0, sizeof(sctp_clog));
 	sctp_clog.x.fr.largest_tsn = biggest_tsn;
 	sctp_clog.x.fr.largest_new_tsn = biggest_new_tsn;
 	sctp_clog.x.fr.tsn = tsn;
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_FR,
 	    from,
 	    sctp_clog.x.misc.log1,
 	    sctp_clog.x.misc.log2,
 	    sctp_clog.x.misc.log3,
 	    sctp_clog.x.misc.log4);
 #endif
 }
 
 #ifdef SCTP_MBUF_LOGGING
 /*
  * Trace a single mbuf.
  *
  * Records the mbuf pointer, the low 8 bits of its flags, the low 16 bits
  * of its length and its data pointer.  For an extended mbuf the external
  * buffer base and the low 8 bits of its reference count are recorded as
  * well; otherwise both are 0.  Emitted as an SCTP_LOG_EVENT_MBUF entry
  * tagged with 'from'.  The function body is empty unless
  * SCTP_LOCAL_TRACE_BUF is defined.
  */
 void
 sctp_log_mb(struct mbuf *m, int from)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log sctp_clog;
 
 	sctp_clog.x.mb.mp = m;
 	sctp_clog.x.mb.mbuf_flags = (uint8_t)(SCTP_BUF_GET_FLAGS(m));
 	sctp_clog.x.mb.size = (uint16_t)(SCTP_BUF_LEN(m));
 	sctp_clog.x.mb.data = SCTP_BUF_AT(m, 0);
 	if (SCTP_BUF_IS_EXTENDED(m)) {
 		sctp_clog.x.mb.ext = SCTP_BUF_EXTEND_BASE(m);
 		sctp_clog.x.mb.refcnt = (uint8_t)(SCTP_BUF_EXTEND_REFCNT(m));
 	} else {
 		sctp_clog.x.mb.ext = 0;
 		sctp_clog.x.mb.refcnt = 0;
 	}
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_MBUF,
 	    from,
 	    sctp_clog.x.misc.log1,
 	    sctp_clog.x.misc.log2,
 	    sctp_clog.x.misc.log3,
 	    sctp_clog.x.misc.log4);
 #endif
 }
 
 /*
  * Trace every mbuf of a chain: calls sctp_log_mb() on m and on each mbuf
  * reached through SCTP_BUF_NEXT() until the chain ends.  A NULL m logs
  * nothing.
  */
 void
 sctp_log_mbc(struct mbuf *m, int from)
 {
 	struct mbuf *cur = m;
 
 	while (cur != NULL) {
 		sctp_log_mb(cur, from);
 		cur = SCTP_BUF_NEXT(cur);
 	}
 }
 #endif
 
 /*
  * Trace a stream-queue event for a queued-to-read entry.
  *
  * 'control' supplies the stcb, TSN, stream and (truncated to 16 bits)
  * message id of the entry being logged.  'poschk', when non-NULL, supplies
  * the TSN and message id recorded in the "e_" fields; they are 0
  * otherwise.  A NULL control only prints a diagnostic and returns.
  * Emitted as an SCTP_LOG_EVENT_STRM entry tagged with 'from'.  The
  * function body is empty unless SCTP_LOCAL_TRACE_BUF is defined.
  */
 void
 sctp_log_strm_del(struct sctp_queued_to_read *control, struct sctp_queued_to_read *poschk, int from)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log rec;
 
 	if (control == NULL) {
 		SCTP_PRINTF("Gak log of NULL?\n");
 		return;
 	}
 	rec.x.strlog.stcb = control->stcb;
 	rec.x.strlog.strm = control->sinfo_stream;
 	rec.x.strlog.n_tsn = control->sinfo_tsn;
 	rec.x.strlog.n_sseq = (uint16_t)control->mid;
 	rec.x.strlog.e_tsn = (poschk != NULL) ? poschk->sinfo_tsn : 0;
 	rec.x.strlog.e_sseq = (poschk != NULL) ? (uint16_t)poschk->mid : 0;
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_STRM, from,
 	    rec.x.misc.log1, rec.x.misc.log2,
 	    rec.x.misc.log3, rec.x.misc.log4);
 #endif
 }
 
 /*
  * Trace a congestion-window event.
  *
  * Records the net pointer, the association's send and stream queue counts
  * (each clamped to 255), the cwnd augment value and, when net is non-NULL,
  * net's cwnd, flight size and pseudo-cumack state.  When 'from' is
  * SCTP_CWNDLOG_PRESEND the meets_pseudo_cumack field is overwritten with
  * the association's peers_rwnd.  Emitted as an SCTP_LOG_EVENT_CWND entry
  * tagged with 'from'.  stcb is dereferenced unconditionally.  The function
  * body is empty unless SCTP_LOCAL_TRACE_BUF is defined.
  */
 void
 sctp_log_cwnd(struct sctp_tcb *stcb, struct sctp_nets *net, int augment, uint8_t from)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log sctp_clog;
 
 	/*
 	 * Zero the record first: with net == NULL the per-net fields are
 	 * never assigned, and the record is read back through x.misc below,
 	 * so without this stale stack contents would reach the trace buffer.
 	 */
 	memset(&sctp_clog, 0, sizeof(sctp_clog));
 	sctp_clog.x.cwnd.net = net;
 	if (stcb->asoc.send_queue_cnt > 255)
 		sctp_clog.x.cwnd.cnt_in_send = 255;
 	else
 		sctp_clog.x.cwnd.cnt_in_send = stcb->asoc.send_queue_cnt;
 	if (stcb->asoc.stream_queue_cnt > 255)
 		sctp_clog.x.cwnd.cnt_in_str = 255;
 	else
 		sctp_clog.x.cwnd.cnt_in_str = stcb->asoc.stream_queue_cnt;
 
 	if (net) {
 		sctp_clog.x.cwnd.cwnd_new_value = net->cwnd;
 		sctp_clog.x.cwnd.inflight = net->flight_size;
 		sctp_clog.x.cwnd.pseudo_cumack = net->pseudo_cumack;
 		sctp_clog.x.cwnd.meets_pseudo_cumack = net->new_pseudo_cumack;
 		sctp_clog.x.cwnd.need_new_pseudo_cumack = net->find_pseudo_cumack;
 	}
 	if (SCTP_CWNDLOG_PRESEND == from) {
 		/* Pre-send records reuse this field to carry the peer's rwnd. */
 		sctp_clog.x.cwnd.meets_pseudo_cumack = stcb->asoc.peers_rwnd;
 	}
 	sctp_clog.x.cwnd.cwnd_augment = augment;
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_CWND,
 	    from,
 	    sctp_clog.x.misc.log1,
 	    sctp_clog.x.misc.log2,
 	    sctp_clog.x.misc.log3,
 	    sctp_clog.x.misc.log4);
 #endif
 }
 
 /*
  * Trace which SCTP related locks the calling thread currently owns.
  *
  * For each lock the record holds the result of mtx_owned()/rw_wowned(),
  * or SCTP_LOCK_UNKNOWN when the object owning the lock is not available
  * (stcb NULL, inp NULL, or inp without a socket).  The inp pointer and
  * its socket pointer (NULL without inp) are recorded too.  Emitted as an
  * SCTP_LOG_LOCK_EVENT entry tagged with 'from'.  The function body is
  * empty unless SCTP_LOCAL_TRACE_BUF is defined.
  */
 void
 sctp_log_lock(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint8_t from)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log rec;
 
 	memset(&rec, 0, sizeof(rec));
 	rec.x.lock.inp = (void *)inp;
 	rec.x.lock.info_lock = rw_wowned(&SCTP_BASE_INFO(ipi_ep_mtx));
 	rec.x.lock.tcb_lock = (stcb != NULL) ?
 	    mtx_owned(&stcb->tcb_mtx) : SCTP_LOCK_UNKNOWN;
 
 	/* Endpoint level state. */
 	if (inp != NULL) {
 		rec.x.lock.sock = (void *)inp->sctp_socket;
 		rec.x.lock.inp_lock = mtx_owned(&inp->inp_mtx);
 		rec.x.lock.create_lock = mtx_owned(&inp->inp_create_mtx);
 	} else {
 		rec.x.lock.sock = (void *)NULL;
 		rec.x.lock.inp_lock = SCTP_LOCK_UNKNOWN;
 		rec.x.lock.create_lock = SCTP_LOCK_UNKNOWN;
 	}
 
 	/* Socket level state; sock_lock is taken from the receive buffer mutex. */
 	if ((inp != NULL) && (inp->sctp_socket)) {
 		rec.x.lock.sock_lock = mtx_owned(&(inp->sctp_socket->so_rcv.sb_mtx));
 		rec.x.lock.sockrcvbuf_lock = mtx_owned(&(inp->sctp_socket->so_rcv.sb_mtx));
 		rec.x.lock.socksndbuf_lock = mtx_owned(&(inp->sctp_socket->so_snd.sb_mtx));
 	} else {
 		rec.x.lock.sock_lock = SCTP_LOCK_UNKNOWN;
 		rec.x.lock.sockrcvbuf_lock = SCTP_LOCK_UNKNOWN;
 		rec.x.lock.socksndbuf_lock = SCTP_LOCK_UNKNOWN;
 	}
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_LOCK_EVENT, from,
 	    rec.x.misc.log1, rec.x.misc.log2,
 	    rec.x.misc.log3, rec.x.misc.log4);
 #endif
 }
 
 /*
  * Trace a max-burst event.
  *
  * Reuses the cwnd record layout: 'error' goes into cwnd_new_value, 'burst'
  * into cwnd_augment, net->flight_size into inflight, and the association's
  * send and stream queue counts (each clamped to 255) into the count
  * fields.  Emitted as an SCTP_LOG_EVENT_MAXBURST entry tagged with 'from'.
  * Both stcb and net are dereferenced unconditionally.  The function body
  * is empty unless SCTP_LOCAL_TRACE_BUF is defined.
  */
 void
 sctp_log_maxburst(struct sctp_tcb *stcb, struct sctp_nets *net, int error, int burst, uint8_t from)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log rec;
 
 	memset(&rec, 0, sizeof(rec));
 	rec.x.cwnd.net = net;
 	rec.x.cwnd.inflight = net->flight_size;
 	rec.x.cwnd.cwnd_new_value = error;
 	rec.x.cwnd.cwnd_augment = burst;
 	rec.x.cwnd.cnt_in_send = (stcb->asoc.send_queue_cnt > 255) ?
 	    255 : stcb->asoc.send_queue_cnt;
 	rec.x.cwnd.cnt_in_str = (stcb->asoc.stream_queue_cnt > 255) ?
 	    255 : stcb->asoc.stream_queue_cnt;
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_MAXBURST, from,
 	    rec.x.misc.log1, rec.x.misc.log2,
 	    rec.x.misc.log3, rec.x.misc.log4);
 #endif
 }
 
 /*
  * Trace a receive-window computation.
  *
  * Records the peer's rwnd, a send size and an overhead value; new_rwnd is
  * always logged as 0 by this variant.  Emitted as an SCTP_LOG_EVENT_RWND
  * entry tagged with 'from'.  The function body is empty unless
  * SCTP_LOCAL_TRACE_BUF is defined.
  */
 void
 sctp_log_rwnd(uint8_t from, uint32_t peers_rwnd, uint32_t snd_size, uint32_t overhead)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log sctp_clog;
 
 	sctp_clog.x.rwnd.rwnd = peers_rwnd;
 	sctp_clog.x.rwnd.send_size = snd_size;
 	sctp_clog.x.rwnd.overhead = overhead;
 	sctp_clog.x.rwnd.new_rwnd = 0;
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_RWND,
 	    from,
 	    sctp_clog.x.misc.log1,
 	    sctp_clog.x.misc.log2,
 	    sctp_clog.x.misc.log3,
 	    sctp_clog.x.misc.log4);
 #endif
 }
 
 /*
  * Trace a receive-window update.
  *
  * Same record as sctp_log_rwnd(), except that the send_size field carries
  * 'flight_size' and new_rwnd carries 'a_rwndval'.  Emitted as an
  * SCTP_LOG_EVENT_RWND entry tagged with 'from'.  The function body is
  * empty unless SCTP_LOCAL_TRACE_BUF is defined.
  */
 void
 sctp_log_rwnd_set(uint8_t from, uint32_t peers_rwnd, uint32_t flight_size, uint32_t overhead, uint32_t a_rwndval)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log sctp_clog;
 
 	sctp_clog.x.rwnd.rwnd = peers_rwnd;
 	sctp_clog.x.rwnd.send_size = flight_size;
 	sctp_clog.x.rwnd.overhead = overhead;
 	sctp_clog.x.rwnd.new_rwnd = a_rwndval;
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_RWND,
 	    from,
 	    sctp_clog.x.misc.log1,
 	    sctp_clog.x.misc.log2,
 	    sctp_clog.x.misc.log3,
 	    sctp_clog.x.misc.log4);
 #endif
 }
 
 #ifdef SCTP_MBCNT_LOGGING
 /*
  * Trace an mbuf-count accounting change.
  *
  * Records the total output queue size, the size change ('book'), the
  * total queued mbuf count and the mbuf count change, and emits them as an
  * SCTP_LOG_EVENT_MBCNT entry tagged with 'from'.  Only compiled when
  * SCTP_MBCNT_LOGGING is defined; the function body is empty unless
  * SCTP_LOCAL_TRACE_BUF is defined as well.
  */
 static void
 sctp_log_mbcnt(uint8_t from, uint32_t total_oq, uint32_t book, uint32_t total_mbcnt_q, uint32_t mbcnt)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log sctp_clog;
 
 	sctp_clog.x.mbcnt.total_queue_size = total_oq;
 	sctp_clog.x.mbcnt.size_change = book;
 	sctp_clog.x.mbcnt.total_queue_mb_size = total_mbcnt_q;
 	sctp_clog.x.mbcnt.mbcnt_change = mbcnt;
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_MBCNT,
 	    from,
 	    sctp_clog.x.misc.log1,
 	    sctp_clog.x.misc.log2,
 	    sctp_clog.x.misc.log3,
 	    sctp_clog.x.misc.log4);
 #endif
 }
 #endif
 
 /*
  * Trace four arbitrary 32-bit values.
  *
  * Emits a, b, c and d directly (no intermediate record) as an
  * SCTP_LOG_MISC_EVENT entry tagged with 'from'.  The function body is
  * empty unless SCTP_LOCAL_TRACE_BUF is defined.
  */
 void
 sctp_misc_ints(uint8_t from, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_MISC_EVENT,
 	    from,
 	    a, b, c, d);
 #endif
 }
 
 /*
  * Trace a socket wakeup.
  *
  * Records the stcb pointer, the wake count, the association's flight
  * count and send/sent queue counts, the stream queue count and the
  * chunks-on-output-queue count (each saturated at 0xff), a three bit
  * summary of the endpoint's wake flags, and the low byte of the send
  * buffer flags (0xff when the association has no socket).  Emitted as an
  * SCTP_LOG_EVENT_WAKE entry tagged with 'from'.  stcb is dereferenced
  * unconditionally.  The function body is empty unless
  * SCTP_LOCAL_TRACE_BUF is defined.
  */
 void
 sctp_wakeup_log(struct sctp_tcb *stcb, uint32_t wake_cnt, int from)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log rec;
 
 	rec.x.wake.stcb = (void *)stcb;
 	rec.x.wake.wake_cnt = wake_cnt;
 	rec.x.wake.flight = stcb->asoc.total_flight_count;
 	rec.x.wake.send_q = stcb->asoc.send_queue_cnt;
 	rec.x.wake.sent_q = stcb->asoc.sent_queue_cnt;
 
 	/* Byte sized counters saturate instead of wrapping. */
 	rec.x.wake.stream_qcnt = (stcb->asoc.stream_queue_cnt < 0xff) ?
 	    (uint8_t)stcb->asoc.stream_queue_cnt : 0xff;
 	rec.x.wake.chunks_on_oque = (stcb->asoc.chunks_on_out_queue < 0xff) ?
 	    (uint8_t)stcb->asoc.chunks_on_out_queue : 0xff;
 
 	/* Fold the deferred-wakeup endpoint flags into bits 0..2. */
 	rec.x.wake.sctpflags = 0;
 	if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) {
 		rec.x.wake.sctpflags |= 1;
 	}
 	if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_WAKEOUTPUT) {
 		rec.x.wake.sctpflags |= 2;
 	}
 	if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_WAKEINPUT) {
 		rec.x.wake.sctpflags |= 4;
 	}
 
 	/* Send buffer flags, when there is a socket to take them from. */
 	if (stcb->sctp_socket) {
 		rec.x.wake.sbflags =
 		    (uint8_t)((stcb->sctp_socket->so_snd.sb_flags & 0x00ff));
 	} else {
 		rec.x.wake.sbflags = 0xff;
 	}
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_WAKE, from,
 	    rec.x.misc.log1, rec.x.misc.log2,
 	    rec.x.misc.log3, rec.x.misc.log4);
 #endif
 }
 
 /*
  * Trace a blocking-send event.
  *
  * Records the association's total output queue size, the sum of its send
  * and sent queue counts, the peer's rwnd, the stream queue count, the
  * chunks-on-output-queue count, total_flight divided by 1024, and the
  * requested send length.  Several of the values are narrowed by explicit
  * casts (uint16_t counters, uint32_t length), so large values are
  * truncated in the trace.  Emitted as an SCTP_LOG_EVENT_BLOCK entry tagged
  * with 'from'.  The function body is empty unless SCTP_LOCAL_TRACE_BUF is
  * defined.
  */
 void
 sctp_log_block(uint8_t from, struct sctp_association *asoc, ssize_t sendlen)
 {
 #if defined(SCTP_LOCAL_TRACE_BUF)
 	struct sctp_cwnd_log sctp_clog;
 
 	sctp_clog.x.blk.onsb = asoc->total_output_queue_size;
 	sctp_clog.x.blk.send_sent_qcnt = (uint16_t)(asoc->send_queue_cnt + asoc->sent_queue_cnt);
 	sctp_clog.x.blk.peer_rwnd = asoc->peers_rwnd;
 	sctp_clog.x.blk.stream_qcnt = (uint16_t)asoc->stream_queue_cnt;
 	sctp_clog.x.blk.chunks_on_oque = (uint16_t)asoc->chunks_on_out_queue;
 	sctp_clog.x.blk.flight_size = (uint16_t)(asoc->total_flight / 1024);
 	sctp_clog.x.blk.sndlen = (uint32_t)sendlen;
 	SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
 	    SCTP_LOG_EVENT_BLOCK,
 	    from,
 	    sctp_clog.x.misc.log1,
 	    sctp_clog.x.misc.log2,
 	    sctp_clog.x.misc.log3,
 	    sctp_clog.x.misc.log4);
 #endif
 }
 
 /*
  * Placeholder for exporting the statistics log to a caller supplied
  * buffer.  Both arguments are ignored; nothing is copied and 0 is always
  * returned.
  */
 int
 sctp_fill_stat_log(void *optval SCTP_UNUSED, size_t *optsize SCTP_UNUSED)
 {
 	/* May need to fix this if ktrdump does not work */
 	return (0);
 }
 
 #ifdef SCTP_AUDITING_ENABLED
 /*
  * Audit trail storage.  sctp_audit_data holds SCTP_AUDIT_SIZE two-byte
  * entries and sctp_audit_indx is the slot the next entry is written to.
  * The writers in this file reset the index to 0 once it reaches
  * SCTP_AUDIT_SIZE, so the array is used as a ring.
  */
 uint8_t sctp_audit_data[SCTP_AUDIT_SIZE][2];
 static int sctp_audit_indx = 0;
 
 /*
  * Print one audit entry and return the updated per-line counter.
  *
  * The entries 0xe0/0x01, 0xf0/any and 0xc0/0x01 start a new output line
  * (the counter restarts at 0 and a newline is printed before the entry).
  * A newline is also printed after every 14th entry on a line.
  */
 static int
 sctp_print_audit_entry(int idx, int cnt)
 {
 	uint8_t ev, fd;
 
 	ev = sctp_audit_data[idx][0];
 	fd = sctp_audit_data[idx][1];
 	if (((ev == 0xe0) && (fd == 0x01)) ||
 	    (ev == 0xf0) ||
 	    ((ev == 0xc0) && (fd == 0x01))) {
 		cnt = 0;
 		SCTP_PRINTF("\n");
 	}
 	SCTP_PRINTF("%2.2x%2.2x ", (uint32_t)ev, (uint32_t)fd);
 	cnt++;
 	if ((cnt % 14) == 0)
 		SCTP_PRINTF("\n");
 	return (cnt);
 }
 
 /*
  * Dump the whole audit ring, starting at sctp_audit_indx (the slot that
  * will be overwritten next) and wrapping around to the slot just before
  * it, followed by a final newline.
  */
 static
 void
 sctp_print_audit_report(void)
 {
 	int i;
 	int cnt;
 
 	cnt = 0;
 	/* From the write position to the end of the ring ... */
 	for (i = sctp_audit_indx; i < SCTP_AUDIT_SIZE; i++) {
 		cnt = sctp_print_audit_entry(i, cnt);
 	}
 	/* ... then from the start of the ring up to the write position. */
 	for (i = 0; i < sctp_audit_indx; i++) {
 		cnt = sctp_print_audit_entry(i, cnt);
 	}
 	SCTP_PRINTF("\n");
 }
 
 /*
  * Audit, and where needed repair, the flight accounting of an association.
  *
  * An 0xAA/'from' marker is always appended to the audit ring.  With a NULL
  * inp or stcb an 0xAF/0x01 or 0xAF/0x02 entry is appended and the function
  * returns.  Otherwise the sent queue is walked and the association's
  * sent_queue_retran_cnt, total_flight and total_flight_count, as well as
  * the sum of the per-net flight sizes, are compared against the values
  * derived from the queue.  Every mismatch is recorded in the ring as an
  * 0xAF entry and printed.  The three association counters are overwritten
  * with the recomputed values; when the per-net sum disagrees, each net's
  * flight_size is recomputed from the sent queue and corrected.  If any
  * mismatch was found the whole audit ring is printed.
  *
  * The 'net' argument is not used.
  */
 void
 sctp_auditing(int from, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
     struct sctp_nets *net)
 {
 	int resend_cnt, tot_out, rep, tot_book_cnt;
 	struct sctp_nets *lnet;
 	struct sctp_tmit_chunk *chk;
 
 	/* Entry marker: who asked for the audit. */
 	sctp_audit_data[sctp_audit_indx][0] = 0xAA;
 	sctp_audit_data[sctp_audit_indx][1] = 0x000000ff & from;
 	sctp_audit_indx++;
 	if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
 		sctp_audit_indx = 0;
 	}
 	if (inp == NULL) {
 		sctp_audit_data[sctp_audit_indx][0] = 0xAF;
 		sctp_audit_data[sctp_audit_indx][1] = 0x01;
 		sctp_audit_indx++;
 		if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
 			sctp_audit_indx = 0;
 		}
 		return;
 	}
 	if (stcb == NULL) {
 		sctp_audit_data[sctp_audit_indx][0] = 0xAF;
 		sctp_audit_data[sctp_audit_indx][1] = 0x02;
 		sctp_audit_indx++;
 		if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
 			sctp_audit_indx = 0;
 		}
 		return;
 	}
 	/* Record the low byte of the current retransmission count. */
 	sctp_audit_data[sctp_audit_indx][0] = 0xA1;
 	sctp_audit_data[sctp_audit_indx][1] =
 	    (0x000000ff & stcb->asoc.sent_queue_retran_cnt);
 	sctp_audit_indx++;
 	if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
 		sctp_audit_indx = 0;
 	}
 	rep = 0;
 	tot_book_cnt = 0;
 	resend_cnt = tot_out = 0;
 	/*
 	 * Recompute the counters from the sent queue: chunks marked for
 	 * resend are counted, chunks below that state contribute their
 	 * book_size to the flight and are counted as in flight.
 	 */
 	TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) {
 		if (chk->sent == SCTP_DATAGRAM_RESEND) {
 			resend_cnt++;
 		} else if (chk->sent < SCTP_DATAGRAM_RESEND) {
 			tot_out += chk->book_size;
 			tot_book_cnt++;
 		}
 	}
 	if (resend_cnt != stcb->asoc.sent_queue_retran_cnt) {
 		sctp_audit_data[sctp_audit_indx][0] = 0xAF;
 		sctp_audit_data[sctp_audit_indx][1] = 0xA1;
 		sctp_audit_indx++;
 		if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
 			sctp_audit_indx = 0;
 		}
 		SCTP_PRINTF("resend_cnt:%d asoc-tot:%d\n",
 		    resend_cnt, stcb->asoc.sent_queue_retran_cnt);
 		rep = 1;
 		stcb->asoc.sent_queue_retran_cnt = resend_cnt;
 		/* Log the corrected value as well. */
 		sctp_audit_data[sctp_audit_indx][0] = 0xA2;
 		sctp_audit_data[sctp_audit_indx][1] =
 		    (0x000000ff & stcb->asoc.sent_queue_retran_cnt);
 		sctp_audit_indx++;
 		if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
 			sctp_audit_indx = 0;
 		}
 	}
 	if (tot_out != stcb->asoc.total_flight) {
 		sctp_audit_data[sctp_audit_indx][0] = 0xAF;
 		sctp_audit_data[sctp_audit_indx][1] = 0xA2;
 		sctp_audit_indx++;
 		if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
 			sctp_audit_indx = 0;
 		}
 		rep = 1;
 		SCTP_PRINTF("tot_flt:%d asoc_tot:%d\n", tot_out,
 		    (int)stcb->asoc.total_flight);
 		stcb->asoc.total_flight = tot_out;
 	}
 	if (tot_book_cnt != stcb->asoc.total_flight_count) {
 		sctp_audit_data[sctp_audit_indx][0] = 0xAF;
 		sctp_audit_data[sctp_audit_indx][1] = 0xA5;
 		sctp_audit_indx++;
 		if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
 			sctp_audit_indx = 0;
 		}
 		rep = 1;
 		SCTP_PRINTF("tot_flt_book:%d\n", tot_book_cnt);
 
 		stcb->asoc.total_flight_count = tot_book_cnt;
 	}
 	/* Cross-check: the per-net flight sizes must add up to the total. */
 	tot_out = 0;
 	TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) {
 		tot_out += lnet->flight_size;
 	}
 	if (tot_out != stcb->asoc.total_flight) {
 		sctp_audit_data[sctp_audit_indx][0] = 0xAF;
 		sctp_audit_data[sctp_audit_indx][1] = 0xA3;
 		sctp_audit_indx++;
 		if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
 			sctp_audit_indx = 0;
 		}
 		rep = 1;
 		SCTP_PRINTF("real flight:%d net total was %d\n",
 		    stcb->asoc.total_flight, tot_out);
 		/* now corrective action */
 		TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) {
 
 			tot_out = 0;
 			TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) {
 				if ((chk->whoTo == lnet) &&
 				    (chk->sent < SCTP_DATAGRAM_RESEND)) {
 					tot_out += chk->book_size;
 				}
 			}
 			if (lnet->flight_size != tot_out) {
 				SCTP_PRINTF("net:%p flight was %d corrected to %d\n",
 				    (void *)lnet, lnet->flight_size,
 				    tot_out);
 				lnet->flight_size = tot_out;
 			}
 		}
 	}
 	if (rep) {
 		sctp_print_audit_report();
 	}
 }
 
 /*
  * Append one (ev, fd) entry to the audit ring and advance the write
  * index, wrapping to 0 when it reaches SCTP_AUDIT_SIZE.
  */
 void
 sctp_audit_log(uint8_t ev, uint8_t fd)
 {
 
 	sctp_audit_data[sctp_audit_indx][0] = ev;
 	sctp_audit_data[sctp_audit_indx][1] = fd;
 	sctp_audit_indx++;
 	if (sctp_audit_indx >= SCTP_AUDIT_SIZE) {
 		sctp_audit_indx = 0;
 	}
 }
 
 #endif
 
 /*
  * The conversion from time to ticks and vice versa is done by rounding
  * upwards. This way we can test in the code the time to be positive and
  * know that this corresponds to a positive number of ticks.
  */
 
 /*
  * Convert milliseconds to ticks, rounding up and saturating at UINT32_MAX.
  * With hz == 1000 the value is returned unchanged.
  */
 uint32_t
 sctp_msecs_to_ticks(uint32_t msecs)
 {
 	uint64_t scaled;
 
 	if (hz == 1000) {
 		return (msecs);
 	}
 	/* 64-bit intermediate so that msecs * hz cannot overflow. */
 	scaled = (((uint64_t)msecs * hz) + 999) / 1000;
 	return ((scaled > UINT32_MAX) ? UINT32_MAX : (uint32_t)scaled);
 }
 
 /*
  * Convert ticks to milliseconds, rounding up and saturating at UINT32_MAX.
  * With hz == 1000 the value is returned unchanged.
  */
 uint32_t
 sctp_ticks_to_msecs(uint32_t ticks)
 {
 	uint64_t scaled;
 
 	if (hz == 1000) {
 		return (ticks);
 	}
 	/* 64-bit intermediate so that ticks * 1000 cannot overflow. */
 	scaled = (((uint64_t)ticks * 1000) + (hz - 1)) / hz;
 	return ((scaled > UINT32_MAX) ? UINT32_MAX : (uint32_t)scaled);
 }
 
 /*
  * Convert seconds to ticks, saturating at UINT32_MAX.
  */
 uint32_t
 sctp_secs_to_ticks(uint32_t secs)
 {
 	uint64_t scaled;
 
 	/* 64-bit intermediate so that secs * hz cannot overflow. */
 	scaled = (uint64_t)secs * hz;
 	return ((scaled > UINT32_MAX) ? UINT32_MAX : (uint32_t)scaled);
 }
 
 /*
  * Convert ticks to seconds, rounding up and saturating at UINT32_MAX.
  */
 uint32_t
 sctp_ticks_to_secs(uint32_t ticks)
 {
 	uint64_t scaled;
 
 	/* Adding hz - 1 before dividing rounds the result up. */
 	scaled = ((uint64_t)ticks + (hz - 1)) / hz;
 	return ((scaled > UINT32_MAX) ? UINT32_MAX : (uint32_t)scaled);
 }
 
 /*
  * sctp_stop_timers_for_shutdown() should be called
  * when entering the SHUTDOWN_SENT or SHUTDOWN_ACK_SENT
  * state to make sure that all timers are stopped.
  *
  * It stops the association level RECV, STRRESET, ASCONF and AUTOCLOSE
  * timers and, for every destination of the association, the PATHMTURAISE
  * and HEARTBEAT timers.  Each sctp_timer_stop() call passes its own
  * SCTP_FROM_SCTPUTIL + SCTP_LOC_nn value as the last argument.
  */
 void
 sctp_stop_timers_for_shutdown(struct sctp_tcb *stcb)
 {
 	struct sctp_inpcb *inp;
 	struct sctp_nets *net;
 
 	inp = stcb->sctp_ep;
 
 	/* Association level timers (no destination). */
 	sctp_timer_stop(SCTP_TIMER_TYPE_RECV, inp, stcb, NULL,
 	    SCTP_FROM_SCTPUTIL + SCTP_LOC_12);
 	sctp_timer_stop(SCTP_TIMER_TYPE_STRRESET, inp, stcb, NULL,
 	    SCTP_FROM_SCTPUTIL + SCTP_LOC_13);
 	sctp_timer_stop(SCTP_TIMER_TYPE_ASCONF, inp, stcb, NULL,
 	    SCTP_FROM_SCTPUTIL + SCTP_LOC_14);
 	sctp_timer_stop(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL,
 	    SCTP_FROM_SCTPUTIL + SCTP_LOC_15);
 	/* Per-destination timers. */
 	TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 		sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net,
 		    SCTP_FROM_SCTPUTIL + SCTP_LOC_16);
 		sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net,
 		    SCTP_FROM_SCTPUTIL + SCTP_LOC_17);
 	}
 }
 
 /*
  * Stop the timers of an association.
  *
  * Association level: RECV, STRRESET, ASCONF, AUTOCLOSE, SHUTDOWNGUARD and
  * PRIM_DELETED are always stopped; ASOCKILL is stopped only when
  * stop_assoc_kill_timer is true.  For every destination of the
  * association: SEND, INIT, SHUTDOWN, COOKIE, SHUTDOWNACK, PATHMTURAISE and
  * HEARTBEAT are stopped.  Each sctp_timer_stop() call passes its own
  * SCTP_FROM_SCTPUTIL + SCTP_LOC_nn value as the last argument.
  */
 void
 sctp_stop_association_timers(struct sctp_tcb *stcb, bool stop_assoc_kill_timer)
 {
 	struct sctp_inpcb *inp;
 	struct sctp_nets *net;
 
 	inp = stcb->sctp_ep;
 	sctp_timer_stop(SCTP_TIMER_TYPE_RECV, inp, stcb, NULL,
 	    SCTP_FROM_SCTPUTIL + SCTP_LOC_18);
 	sctp_timer_stop(SCTP_TIMER_TYPE_STRRESET, inp, stcb, NULL,
 	    SCTP_FROM_SCTPUTIL + SCTP_LOC_19);
 	/* The kill timer is left running unless the caller asks otherwise. */
 	if (stop_assoc_kill_timer) {
 		sctp_timer_stop(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL,
 		    SCTP_FROM_SCTPUTIL + SCTP_LOC_20);
 	}
 	sctp_timer_stop(SCTP_TIMER_TYPE_ASCONF, inp, stcb, NULL,
 	    SCTP_FROM_SCTPUTIL + SCTP_LOC_21);
 	sctp_timer_stop(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL,
 	    SCTP_FROM_SCTPUTIL + SCTP_LOC_22);
 	sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNGUARD, inp, stcb, NULL,
 	    SCTP_FROM_SCTPUTIL + SCTP_LOC_23);
 	/* Mobility adaptation */
 	sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, inp, stcb, NULL,
 	    SCTP_FROM_SCTPUTIL + SCTP_LOC_24);
 	/* Per-destination timers. */
 	TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 		sctp_timer_stop(SCTP_TIMER_TYPE_SEND, inp, stcb, net,
 		    SCTP_FROM_SCTPUTIL + SCTP_LOC_25);
 		sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net,
 		    SCTP_FROM_SCTPUTIL + SCTP_LOC_26);
 		sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, inp, stcb, net,
 		    SCTP_FROM_SCTPUTIL + SCTP_LOC_27);
 		sctp_timer_stop(SCTP_TIMER_TYPE_COOKIE, inp, stcb, net,
 		    SCTP_FROM_SCTPUTIL + SCTP_LOC_28);
 		sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNACK, inp, stcb, net,
 		    SCTP_FROM_SCTPUTIL + SCTP_LOC_29);
 		sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net,
 		    SCTP_FROM_SCTPUTIL + SCTP_LOC_30);
 		sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net,
 		    SCTP_FROM_SCTPUTIL + SCTP_LOC_31);
 	}
 }
 
 /*
  * A list of sizes based on typical MTUs, used only if the next hop size
  * is not returned.  These values MUST be multiples of 4 and MUST be in
  * ascending order: sctp_get_prev_mtu() and sctp_get_next_mtu() scan the
  * table front to back and assert the multiple-of-4 property of the entry
  * they return.
  */
 static uint32_t sctp_mtu_sizes[] = {
 	68,
 	296,
 	508,
 	512,
 	544,
 	576,
 	1004,
 	1492,
 	1500,
 	1536,
 	2000,
 	2048,
 	4352,
 	4464,
 	8168,
 	17912,
 	32000,
 	65532
 };
 
 /*
  * Step down to the previous entry of sctp_mtu_sizes.
  *
  * val is first rounded down to a multiple of 4.  If the rounded value does
  * not exceed the smallest table entry it is returned as is.  Otherwise the
  * result is the table entry just before the first entry that is greater
  * than or equal to the rounded value, or the largest entry when no such
  * entry exists.  The result is always a multiple of 4.
  */
 uint32_t
 sctp_get_prev_mtu(uint32_t val)
 {
 	const uint32_t nentries = sizeof(sctp_mtu_sizes) / sizeof(uint32_t);
 	uint32_t idx;
 
 	val &= 0xfffffffc;
 	if (val <= sctp_mtu_sizes[0]) {
 		return (val);
 	}
 	/* Find the first entry that is not below val; the answer precedes it. */
 	idx = 1;
 	while ((idx < nentries) && (val > sctp_mtu_sizes[idx])) {
 		idx++;
 	}
 	KASSERT((sctp_mtu_sizes[idx - 1] & 0x00000003) == 0,
 	    ("sctp_mtu_sizes[%u] not a multiple of 4", idx - 1));
 	return (sctp_mtu_sizes[idx - 1]);
 }
 
 /*
  * Step up to the next entry of sctp_mtu_sizes.
  *
  * val is first rounded down to a multiple of 4.  The result is the first
  * table entry strictly greater than the rounded value; if there is none,
  * the rounded value itself is returned.  The result is always a multiple
  * of 4.
  */
 uint32_t
 sctp_get_next_mtu(uint32_t val)
 {
 	const uint32_t nentries = sizeof(sctp_mtu_sizes) / sizeof(uint32_t);
 	uint32_t idx;
 
 	val &= 0xfffffffc;
 	for (idx = 0; idx < nentries; idx++) {
 		if (sctp_mtu_sizes[idx] <= val) {
 			continue;
 		}
 		/* First entry that is just bigger than val. */
 		KASSERT((sctp_mtu_sizes[idx] & 0x00000003) == 0,
 		    ("sctp_mtu_sizes[%u] not a multiple of 4", idx));
 		return (sctp_mtu_sizes[idx]);
 	}
 	return (val);
 }
 
 /*
  * Refill the random store of an endpoint.
  *
  * Resets store_at to 0, runs sctp_hmac() over random_numbers and
  * random_counter with the output written to random_store, and then
  * increments random_counter.  The return value of sctp_hmac() is ignored.
  */
 void
 sctp_fill_random_store(struct sctp_pcb *m)
 {
 	/*
 	 * Here we use the MD5/SHA-1 to hash with our good random numbers and
 	 * our counter. The result becomes our good random numbers and we
 	 * then set up to give these out. Note that we do no locking to
 	 * protect this. This is OK, since if competing callers get here we
 	 * will get more gobbledygook in the random store, which is what we
 	 * want. There is a danger that two callers will use the same random
 	 * numbers, but that's OK too since that is random as well :->
 	 */
 	m->store_at = 0;
 	(void)sctp_hmac(SCTP_HMAC, (uint8_t *)m->random_numbers,
 	    sizeof(m->random_numbers), (uint8_t *)&m->random_counter,
 	    sizeof(m->random_counter), (uint8_t *)m->random_store);
 	m->random_counter++;
 }
 
 /*
  * Hand out the next 32-bit value from the endpoint's random store.
  *
  * When initial_sequence_debug is non-zero the function instead returns
  * that value and post-increments it, giving a predictable sequence.
  * Otherwise a 4-byte slot of random_store is claimed by advancing store_at
  * with a compare-and-set (retried until it succeeds), the store is
  * refilled when the index wraps to 0, and the claimed bytes are returned.
  */
 uint32_t
 sctp_select_initial_TSN(struct sctp_pcb *inp)
 {
 	/*
 	 * A true implementation should use random selection process to get
 	 * the initial stream sequence number, using RFC1750 as a good
 	 * guideline
 	 */
 	uint32_t x;
 	int store_at, new_store;
 
 	if (inp->initial_sequence_debug != 0) {
 		uint32_t ret;
 
 		ret = inp->initial_sequence_debug;
 		inp->initial_sequence_debug++;
 		return (ret);
 	}
 retry:
 	store_at = inp->store_at;
 	new_store = store_at + sizeof(uint32_t);
 	/* Wrap before a 4-byte read could run past the end of the store. */
 	if (new_store >= (SCTP_SIGNATURE_SIZE - 3)) {
 		new_store = 0;
 	}
 	if (!atomic_cmpset_int(&inp->store_at, store_at, new_store)) {
 		/* Somebody else claimed this slot first; pick again. */
 		goto retry;
 	}
 	if (new_store == 0) {
 		/* Refill the random store */
 		sctp_fill_random_store(inp);
 	}
 	/*
 	 * Copy the bytes out instead of dereferencing a casted pointer:
 	 * the store is a byte array, so reading it through a uint32_t
 	 * pointer would depend on its alignment and on the compiler's
 	 * aliasing rules.
 	 */
 	memcpy(&x, &inp->random_store[store_at], sizeof(x));
 	return (x);
 }
 
 /*
  * Pick a non-zero 32-bit tag for the given local/remote port pair.
  *
  * Candidates come from sctp_select_initial_TSN().  Zero is never
  * returned.  When 'check' is non-zero the current time is fetched once
  * and each candidate must also be accepted by sctp_is_vtag_good();
  * otherwise the first non-zero candidate is returned.
  */
 uint32_t
 sctp_select_a_tag(struct sctp_inpcb *inp, uint16_t lport, uint16_t rport, int check)
 {
 	struct timeval now;
 	uint32_t tag;
 
 	/* 'now' is only read when check is set, so only fetch it then. */
 	if (check) {
 		(void)SCTP_GETTIME_TIMEVAL(&now);
 	}
 	do {
 		tag = sctp_select_initial_TSN(&inp->sctp_ep);
 		/* we never use 0 */
 	} while ((tag == 0) ||
 	    (check && !sctp_is_vtag_good(tag, lport, rport, &now)));
 	return (tag);
 }
 
 /*
  * Translate a kernel association state word into the state value
  * reported to user space.
  *
  * The WAS_ABORTED flag takes precedence and maps to SCTP_CLOSED, followed
  * by the SHUTDOWN_PENDING flag.  Otherwise the state selected by
  * SCTP_STATE_MASK is mapped one to one, with EMPTY, INUSE and any
  * unrecognized value reported as SCTP_CLOSED.
  */
 int32_t
 sctp_map_assoc_state(int kernel_state)
 {
 	if (kernel_state & SCTP_STATE_WAS_ABORTED) {
 		return (SCTP_CLOSED);
 	}
 	if (kernel_state & SCTP_STATE_SHUTDOWN_PENDING) {
 		return (SCTP_SHUTDOWN_PENDING);
 	}
 	switch (kernel_state & SCTP_STATE_MASK) {
 	case SCTP_STATE_COOKIE_WAIT:
 		return (SCTP_COOKIE_WAIT);
 	case SCTP_STATE_COOKIE_ECHOED:
 		return (SCTP_COOKIE_ECHOED);
 	case SCTP_STATE_OPEN:
 		return (SCTP_ESTABLISHED);
 	case SCTP_STATE_SHUTDOWN_SENT:
 		return (SCTP_SHUTDOWN_SENT);
 	case SCTP_STATE_SHUTDOWN_RECEIVED:
 		return (SCTP_SHUTDOWN_RECEIVED);
 	case SCTP_STATE_SHUTDOWN_ACK_SENT:
 		return (SCTP_SHUTDOWN_ACK_SENT);
 	case SCTP_STATE_EMPTY:
 	case SCTP_STATE_INUSE:
 	default:
 		return (SCTP_CLOSED);
 	}
 }
 
 /*
  * Initialize the association embedded in stcb (stcb->asoc) from the
  * defaults stored in the endpoint inp.
  *
  * override_tag: if non-zero it becomes the local verification tag;
  *     otherwise a tag is picked with sctp_select_a_tag().
  * vrf_id:       stored in asoc->vrf_id.
  * o_strms:      number of outgoing streams to allocate and initialize.
  *
  * Returns 0 on success.  Returns ENOMEM if the outgoing stream array or
  * one of the two mapping arrays cannot be allocated; whatever this
  * function had allocated up to that point is freed before returning.
  */
 int
 sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
     uint32_t override_tag, uint32_t vrf_id, uint16_t o_strms)
 {
 	struct sctp_association *asoc;
 
 	/*
 	 * Anything set to zero is taken care of by the allocation routine's
 	 * bzero
 	 */
 
 	/*
 	 * Up front select what scoping to apply on addresses I tell my peer
 	 * Not sure what to do with these right now, we will need to come up
 	 * with a way to set them. We may need to pass them through from the
 	 * caller in the sctp_aloc_assoc() function.
 	 */
 	int i;
 #if defined(SCTP_DETAILED_STR_STATS)
 	int j;
 #endif
 
 	asoc = &stcb->asoc;
 	/* init all variables to a known value. */
 	SCTP_SET_STATE(stcb, SCTP_STATE_INUSE);
 	/* Inherit the per-endpoint defaults and feature switches. */
 	asoc->max_burst = inp->sctp_ep.max_burst;
 	asoc->fr_max_burst = inp->sctp_ep.fr_max_burst;
 	asoc->heart_beat_delay = sctp_ticks_to_msecs(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]);
 	asoc->cookie_life = inp->sctp_ep.def_cookie_life;
 	asoc->sctp_cmt_on_off = inp->sctp_cmt_on_off;
 	asoc->ecn_supported = inp->ecn_supported;
 	asoc->prsctp_supported = inp->prsctp_supported;
 	asoc->auth_supported = inp->auth_supported;
 	asoc->asconf_supported = inp->asconf_supported;
 	asoc->reconfig_supported = inp->reconfig_supported;
 	asoc->nrsack_supported = inp->nrsack_supported;
 	asoc->pktdrop_supported = inp->pktdrop_supported;
 	asoc->idata_supported = inp->idata_supported;
 	asoc->sctp_cmt_pf = (uint8_t)0;
 	asoc->sctp_frag_point = inp->sctp_frag_point;
 	asoc->sctp_features = inp->sctp_features;
 	asoc->default_dscp = inp->sctp_ep.default_dscp;
 	asoc->max_cwnd = inp->max_cwnd;
 #ifdef INET6
 	/*
 	 * Use the endpoint's flow label if one is configured; otherwise
 	 * generate one when IN6P_AUTOFLOWLABEL is set, keeping the low 20
 	 * bits of a random value and setting the top bit.
 	 */
 	if (inp->sctp_ep.default_flowlabel) {
 		asoc->default_flowlabel = inp->sctp_ep.default_flowlabel;
 	} else {
 		if (inp->ip_inp.inp.inp_flags & IN6P_AUTOFLOWLABEL) {
 			asoc->default_flowlabel = sctp_select_initial_TSN(&inp->sctp_ep);
 			asoc->default_flowlabel &= 0x000fffff;
 			asoc->default_flowlabel |= 0x80000000;
 		} else {
 			asoc->default_flowlabel = 0;
 		}
 	}
 #endif
 	asoc->sb_send_resv = 0;
 	if (override_tag) {
 		asoc->my_vtag = override_tag;
 	} else {
 		asoc->my_vtag = sctp_select_a_tag(inp, stcb->sctp_ep->sctp_lport, stcb->rport, 1);
 	}
 	/* Get the nonce tags */
 	asoc->my_vtag_nonce = sctp_select_a_tag(inp, stcb->sctp_ep->sctp_lport, stcb->rport, 0);
 	asoc->peer_vtag_nonce = sctp_select_a_tag(inp, stcb->sctp_ep->sctp_lport, stcb->rport, 0);
 	asoc->vrf_id = vrf_id;
 
 #ifdef SCTP_ASOCLOG_OF_TSNS
 	asoc->tsn_in_at = 0;
 	asoc->tsn_out_at = 0;
 	asoc->tsn_in_wrapped = 0;
 	asoc->tsn_out_wrapped = 0;
 	asoc->cumack_log_at = 0;
 	asoc->cumack_log_atsnt = 0;
 #endif
 #ifdef SCTP_FS_SPEC_LOG
 	asoc->fs_index = 0;
 #endif
 	asoc->refcnt = 0;
 	asoc->assoc_up_sent = 0;
 	/* All outgoing sequence spaces start from the same random value. */
 	asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number = asoc->sending_seq =
 	    sctp_select_initial_TSN(&inp->sctp_ep);
 	asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1;
 	/* we are optimistic here */
 	asoc->peer_supports_nat = 0;
 	asoc->sent_queue_retran_cnt = 0;
 
 	/* for CMT */
 	asoc->last_net_cmt_send_started = NULL;
 
 	/* This will need to be adjusted */
 	asoc->last_acked_seq = asoc->init_seq_number - 1;
 	asoc->advanced_peer_ack_point = asoc->last_acked_seq;
 	asoc->asconf_seq_in = asoc->last_acked_seq;
 
 	/* here we are different, we hold the next one we expect */
 	asoc->str_reset_seq_in = asoc->last_acked_seq + 1;
 
 	asoc->initial_init_rto_max = inp->sctp_ep.initial_init_rto_max;
 	asoc->initial_rto = inp->sctp_ep.initial_rto;
 
 	asoc->default_mtu = inp->sctp_ep.default_mtu;
 	asoc->max_init_times = inp->sctp_ep.max_init_times;
 	asoc->max_send_times = inp->sctp_ep.max_send_times;
 	asoc->def_net_failure = inp->sctp_ep.def_net_failure;
 	asoc->def_net_pf_threshold = inp->sctp_ep.def_net_pf_threshold;
 	asoc->free_chunk_cnt = 0;
 
 	asoc->iam_blocking = 0;
 	asoc->context = inp->sctp_context;
 	asoc->local_strreset_support = inp->local_strreset_support;
 	asoc->def_send = inp->def_send;
 	asoc->delayed_ack = sctp_ticks_to_msecs(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV]);
 	asoc->sack_freq = inp->sctp_ep.sctp_sack_freq;
 	asoc->pr_sctp_cnt = 0;
 	asoc->total_output_queue_size = 0;
 
 	/*
 	 * Address family scoping: a V6-bound endpoint allows IPv6 and, unless
 	 * it is V6ONLY, IPv4 as well; any other endpoint allows IPv4 only.
 	 */
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
 		asoc->scope.ipv6_addr_legal = 1;
 		if (SCTP_IPV6_V6ONLY(inp) == 0) {
 			asoc->scope.ipv4_addr_legal = 1;
 		} else {
 			asoc->scope.ipv4_addr_legal = 0;
 		}
 	} else {
 		asoc->scope.ipv6_addr_legal = 0;
 		asoc->scope.ipv4_addr_legal = 1;
 	}
 
 	/* The local window is never set below SCTP_MINIMAL_RWND. */
 	asoc->my_rwnd = max(SCTP_SB_LIMIT_RCV(inp->sctp_socket), SCTP_MINIMAL_RWND);
 	asoc->peers_rwnd = SCTP_SB_LIMIT_RCV(inp->sctp_socket);
 
 	asoc->smallest_mtu = inp->sctp_frag_point;
 	asoc->minrto = inp->sctp_ep.sctp_minrto;
 	asoc->maxrto = inp->sctp_ep.sctp_maxrto;
 
 	asoc->stream_locked_on = 0;
 	asoc->ecn_echo_cnt_onq = 0;
 	asoc->stream_locked = 0;
 
 	asoc->send_sack = 1;
 
 	LIST_INIT(&asoc->sctp_restricted_addrs);
 
 	TAILQ_INIT(&asoc->nets);
 	TAILQ_INIT(&asoc->pending_reply_queue);
 	TAILQ_INIT(&asoc->asconf_ack_sent);
 	/* Setup to fill the hb random cache at first HB */
 	asoc->hb_random_idx = 4;
 
 	asoc->sctp_autoclose_ticks = inp->sctp_ep.auto_close_time;
 
 	/* Select the endpoint's default congestion control module. */
 	stcb->asoc.congestion_control_module = inp->sctp_ep.sctp_default_cc_module;
 	stcb->asoc.cc_functions = sctp_cc_functions[inp->sctp_ep.sctp_default_cc_module];
 
 	/* Select the endpoint's default stream scheduling module. */
 	stcb->asoc.stream_scheduling_module = inp->sctp_ep.sctp_default_ss_module;
 	stcb->asoc.ss_functions = sctp_ss_functions[inp->sctp_ep.sctp_default_ss_module];
 
 	/*
 	 * Now the stream parameters, here we allocate space for all streams
 	 * that we request by default.
 	 */
 	asoc->strm_realoutsize = asoc->streamoutcnt = asoc->pre_open_streams =
 	    o_strms;
 	SCTP_MALLOC(asoc->strmout, struct sctp_stream_out *,
 	    asoc->streamoutcnt * sizeof(struct sctp_stream_out),
 	    SCTP_M_STRMO);
 	if (asoc->strmout == NULL) {
 		/* big trouble no memory */
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOMEM);
 		return (ENOMEM);
 	}
 	for (i = 0; i < asoc->streamoutcnt; i++) {
 		/*
 		 * inbound side must be set to 0xffff, also NOTE when we get
 		 * the INIT-ACK back (for INIT sender) we MUST reduce the
 		 * count (streamoutcnt) but first check if we sent to any of
 		 * the upper streams that were dropped (if some were). Those
 		 * that were dropped must be notified to the upper layer as
 		 * failed to send.
 		 */
 		asoc->strmout[i].next_mid_ordered = 0;
 		asoc->strmout[i].next_mid_unordered = 0;
 		TAILQ_INIT(&asoc->strmout[i].outqueue);
 		asoc->strmout[i].chunks_on_queues = 0;
 #if defined(SCTP_DETAILED_STR_STATS)
 		for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) {
 			asoc->strmout[i].abandoned_sent[j] = 0;
 			asoc->strmout[i].abandoned_unsent[j] = 0;
 		}
 #else
 		asoc->strmout[i].abandoned_sent[0] = 0;
 		asoc->strmout[i].abandoned_unsent[0] = 0;
 #endif
 		asoc->strmout[i].sid = i;
 		asoc->strmout[i].last_msg_incomplete = 0;
 		asoc->strmout[i].state = SCTP_STREAM_OPENING;
 		asoc->ss_functions.sctp_ss_init_stream(stcb, &asoc->strmout[i], NULL);
 	}
 	asoc->ss_functions.sctp_ss_init(stcb, asoc, 0);
 
 	/* Now the mapping array */
 	asoc->mapping_array_size = SCTP_INITIAL_MAPPING_ARRAY;
 	SCTP_MALLOC(asoc->mapping_array, uint8_t *, asoc->mapping_array_size,
 	    SCTP_M_MAP);
 	if (asoc->mapping_array == NULL) {
 		/* Undo the stream array allocation made above. */
 		SCTP_FREE(asoc->strmout, SCTP_M_STRMO);
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOMEM);
 		return (ENOMEM);
 	}
 	memset(asoc->mapping_array, 0, asoc->mapping_array_size);
 	/* The second mapping array has the same size as the first one. */
 	SCTP_MALLOC(asoc->nr_mapping_array, uint8_t *, asoc->mapping_array_size,
 	    SCTP_M_MAP);
 	if (asoc->nr_mapping_array == NULL) {
 		/* Undo both earlier allocations. */
 		SCTP_FREE(asoc->strmout, SCTP_M_STRMO);
 		SCTP_FREE(asoc->mapping_array, SCTP_M_MAP);
 		SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOMEM);
 		return (ENOMEM);
 	}
 	memset(asoc->nr_mapping_array, 0, asoc->mapping_array_size);
 
 	/* Now the init of the other outqueues */
 	TAILQ_INIT(&asoc->free_chunks);
 	TAILQ_INIT(&asoc->control_send_queue);
 	TAILQ_INIT(&asoc->asconf_send_queue);
 	TAILQ_INIT(&asoc->send_queue);
 	TAILQ_INIT(&asoc->sent_queue);
 	TAILQ_INIT(&asoc->resetHead);
 	asoc->max_inbound_streams = inp->sctp_ep.max_open_streams_intome;
 	TAILQ_INIT(&asoc->asconf_queue);
 	/* authentication fields */
 	asoc->authinfo.random = NULL;
 	asoc->authinfo.active_keyid = 0;
 	asoc->authinfo.assoc_key = NULL;
 	asoc->authinfo.assoc_keyid = 0;
 	asoc->authinfo.recv_key = NULL;
 	asoc->authinfo.recv_keyid = 0;
 	LIST_INIT(&asoc->shared_keys);
 	asoc->marked_retrans = 0;
 	asoc->port = inp->sctp_ep.port;
 	/* Per-association timeout counters start at zero. */
 	asoc->timoinit = 0;
 	asoc->timodata = 0;
 	asoc->timosack = 0;
 	asoc->timoshutdown = 0;
 	asoc->timoheartbeat = 0;
 	asoc->timocookie = 0;
 	asoc->timoshutdownack = 0;
 	(void)SCTP_GETTIME_TIMEVAL(&asoc->start_time);
 	asoc->discontinuity_time = asoc->start_time;
 	for (i = 0; i < SCTP_PR_SCTP_MAX + 1; i++) {
 		asoc->abandoned_unsent[i] = 0;
 		asoc->abandoned_sent[i] = 0;
 	}
 	/*
 	 * sa_ignore MEMLEAK {memory is put in the assoc mapping array and
 	 * freed later when the association is freed.
 	 */
 	return (0);
 }
 
 /*
  * Debug helper: print the mapping array bookkeeping of asoc followed by a
  * hex dump of both mapping arrays, 16 bytes per output line.  Trailing
  * zero bytes of each array are not dumped, but the first byte is always
  * shown when the array is not empty.
  */
 void
 sctp_print_mapping_array(struct sctp_association *asoc)
 {
 	unsigned int idx, used;
 	char sep;
 
 	SCTP_PRINTF("Mapping array size: %d, baseTSN: %8.8x, cumAck: %8.8x, highestTSN: (%8.8x, %8.8x).\n",
 	    asoc->mapping_array_size,
 	    asoc->mapping_array_base_tsn,
 	    asoc->cumulative_tsn,
 	    asoc->highest_tsn_inside_map,
 	    asoc->highest_tsn_inside_nr_map);
 
 	/* Trim trailing zero bytes, but never below one entry. */
 	used = asoc->mapping_array_size;
 	while ((used > 1) && (asoc->mapping_array[used - 1] == 0)) {
 		used--;
 	}
 	SCTP_PRINTF("Renegable mapping array (last %d entries are zero):\n", asoc->mapping_array_size - used);
 	idx = 0;
 	while (idx < used) {
 		/* Break the line after every 16th byte. */
 		sep = ((idx + 1) % 16) ? ' ' : '\n';
 		SCTP_PRINTF("%2.2x%c", asoc->mapping_array[idx], sep);
 		idx++;
 	}
 	if ((used % 16) != 0) {
 		SCTP_PRINTF("\n");
 	}
 
 	/* Same again for the second array. */
 	used = asoc->mapping_array_size;
 	while ((used > 1) && (asoc->nr_mapping_array[used - 1] == 0)) {
 		used--;
 	}
 	SCTP_PRINTF("Non renegable mapping array (last %d entries are zero):\n", asoc->mapping_array_size - used);
 	idx = 0;
 	while (idx < used) {
 		sep = ((idx + 1) % 16) ? ' ' : '\n';
 		SCTP_PRINTF("%2.2x%c", asoc->nr_mapping_array[idx], sep);
 		idx++;
 	}
 	if ((used % 16) != 0) {
 		SCTP_PRINTF("\n");
 	}
 }
 
 /*
  * Grow both mapping arrays of asoc.
  *
  * The new size is the current size plus (needed + 7) / 8 bytes plus
  * SCTP_MAPPING_ARRAY_INCR.  The existing contents are copied to the front
  * of the new, zero-filled arrays and the old arrays are freed.
  *
  * Returns 0 on success.  Returns -1 if either allocation fails, in which
  * case the association is left unchanged.
  */
 int
 sctp_expand_mapping_array(struct sctp_association *asoc, uint32_t needed)
 {
 	uint8_t *grown_map, *grown_nr_map;
 	uint32_t grown_size;
 
 	grown_size = asoc->mapping_array_size + ((needed + 7) / 8 + SCTP_MAPPING_ARRAY_INCR);
 	SCTP_MALLOC(grown_map, uint8_t *, grown_size, SCTP_M_MAP);
 	SCTP_MALLOC(grown_nr_map, uint8_t *, grown_size, SCTP_M_MAP);
 
 	if ((grown_map != NULL) && (grown_nr_map != NULL)) {
 		/* First array: zero, copy the old contents, swap in. */
 		memset(grown_map, 0, grown_size);
 		memcpy(grown_map, asoc->mapping_array, asoc->mapping_array_size);
 		SCTP_FREE(asoc->mapping_array, SCTP_M_MAP);
 		asoc->mapping_array = grown_map;
 
 		/* Second array: same treatment. */
 		memset(grown_nr_map, 0, grown_size);
 		memcpy(grown_nr_map, asoc->nr_mapping_array, asoc->mapping_array_size);
 		SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP);
 		asoc->nr_mapping_array = grown_nr_map;
 
 		/* Only now may the recorded size change. */
 		asoc->mapping_array_size = grown_size;
 		return (0);
 	}
 
 	/* can't get more, forget it */
 	SCTP_PRINTF("No memory for expansion of SCTP mapping array %d\n", grown_size);
 	if (grown_map != NULL) {
 		SCTP_FREE(grown_map, SCTP_M_MAP);
 	}
 	if (grown_nr_map != NULL) {
 		SCTP_FREE(grown_nr_map, SCTP_M_MAP);
 	}
 	return (-1);
 }
 
 
 /*
  * Run one iterator to completion.
  *
  * Starting at it->inp, walk the endpoint list (or only that endpoint when
  * SCTP_ITERATOR_DO_SINGLE_INP is set) and, for every endpoint whose flags
  * and features match the iterator's selection:
  *   - call it->function_inp once (a non-zero result skips the endpoint's
  *     associations),
  *   - call it->function_assoc on each association whose state matches
  *     it->asoc_state, followed by sctp_chunk_output() unless
  *     it->no_chunk_output is set,
  *   - call it->function_inp_end when the endpoint is finished.
  *
  * When the walk ends, it->function_atend is called (if set) and the
  * iterator itself is freed, so the caller must not touch it afterwards.
  *
  * The whole walk runs inside a net epoch section with the INP-INFO read
  * lock and the iterator lock held; both locks are dropped and retaken
  * after every SCTP_ITERATOR_MAX_AT_ONCE processed associations.
  */
 static void
 sctp_iterator_work(struct sctp_iterator *it)
 {
 	struct epoch_tracker et;
 	struct sctp_inpcb *tinp;
 	int iteration_count = 0;
 	int inp_skip = 0;
 	int first_in = 1;
 
 	NET_EPOCH_ENTER(et);
 	SCTP_INP_INFO_RLOCK();
 	SCTP_ITERATOR_LOCK();
 	sctp_it_ctl.cur_it = it;
 	if (it->inp) {
 		SCTP_INP_RLOCK(it->inp);
 		/*
 		 * NOTE(review): presumably drops the reference taken when
 		 * the iterator was queued -- confirm against the initiator.
 		 */
 		SCTP_INP_DECR_REF(it->inp);
 	}
 	if (it->inp == NULL) {
 		/* iterator is complete */
 done_with_iterator:
 		/* Entered with the iterator and INP-INFO locks held. */
 		sctp_it_ctl.cur_it = NULL;
 		SCTP_ITERATOR_UNLOCK();
 		SCTP_INP_INFO_RUNLOCK();
 		if (it->function_atend != NULL) {
 			(*it->function_atend) (it->pointer, it->val);
 		}
 		SCTP_FREE(it, SCTP_M_ITER);
 		NET_EPOCH_EXIT(et);
 		return;
 	}
 select_a_new_ep:
 	/* The first endpoint was already read-locked above. */
 	if (first_in) {
 		first_in = 0;
 	} else {
 		SCTP_INP_RLOCK(it->inp);
 	}
 	while (((it->pcb_flags) &&
 	    ((it->inp->sctp_flags & it->pcb_flags) != it->pcb_flags)) ||
 	    ((it->pcb_features) &&
 	    ((it->inp->sctp_features & it->pcb_features) != it->pcb_features))) {
 		/* endpoint flags or features don't match, so keep looking */
 		if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) {
 			SCTP_INP_RUNLOCK(it->inp);
 			goto done_with_iterator;
 		}
 		/* Fetch the successor before unlocking the current one. */
 		tinp = it->inp;
 		it->inp = LIST_NEXT(it->inp, sctp_list);
 		it->stcb = NULL;
 		SCTP_INP_RUNLOCK(tinp);
 		if (it->inp == NULL) {
 			goto done_with_iterator;
 		}
 		SCTP_INP_RLOCK(it->inp);
 	}
 	/* now go through each assoc which is in the desired state */
 	if (it->done_current_ep == 0) {
 		if (it->function_inp != NULL)
 			inp_skip = (*it->function_inp) (it->inp, it->pointer, it->val);
 		it->done_current_ep = 1;
 	}
 	if (it->stcb == NULL) {
 		/* run the per instance function */
 		it->stcb = LIST_FIRST(&it->inp->sctp_asoc_list);
 	}
 	if ((inp_skip) || it->stcb == NULL) {
 		if (it->function_inp_end != NULL) {
 			inp_skip = (*it->function_inp_end) (it->inp,
 			    it->pointer,
 			    it->val);
 		}
 		SCTP_INP_RUNLOCK(it->inp);
 		goto no_stcb;
 	}
 	while (it->stcb) {
 		SCTP_TCB_LOCK(it->stcb);
 		if (it->asoc_state && ((it->stcb->asoc.state & it->asoc_state) != it->asoc_state)) {
 			/* not in the right state... keep looking */
 			SCTP_TCB_UNLOCK(it->stcb);
 			goto next_assoc;
 		}
 		/* see if we have limited out the iterator loop */
 		iteration_count++;
 		if (iteration_count > SCTP_ITERATOR_MAX_AT_ONCE) {
 			/* Pause to let others grab the lock */
 			/*
 			 * References on the association and the endpoint
 			 * are held across the window in which no lock is
 			 * held.
 			 */
 			atomic_add_int(&it->stcb->asoc.refcnt, 1);
 			SCTP_TCB_UNLOCK(it->stcb);
 			SCTP_INP_INCR_REF(it->inp);
 			SCTP_INP_RUNLOCK(it->inp);
 			SCTP_ITERATOR_UNLOCK();
 			SCTP_INP_INFO_RUNLOCK();
 			SCTP_INP_INFO_RLOCK();
 			SCTP_ITERATOR_LOCK();
 			if (sctp_it_ctl.iterator_flags) {
 				/* We won't be staying here */
 				SCTP_INP_DECR_REF(it->inp);
 				atomic_add_int(&it->stcb->asoc.refcnt, -1);
 				if (sctp_it_ctl.iterator_flags &
 				    SCTP_ITERATOR_STOP_CUR_IT) {
 					sctp_it_ctl.iterator_flags &= ~SCTP_ITERATOR_STOP_CUR_IT;
 					goto done_with_iterator;
 				}
 				if (sctp_it_ctl.iterator_flags &
 				    SCTP_ITERATOR_STOP_CUR_INP) {
 					sctp_it_ctl.iterator_flags &= ~SCTP_ITERATOR_STOP_CUR_INP;
 					goto no_stcb;
 				}
 				/* If we reach here huh? */
 				SCTP_PRINTF("Unknown it ctl flag %x\n",
 				    sctp_it_ctl.iterator_flags);
 				sctp_it_ctl.iterator_flags = 0;
 			}
 			/* Retake the per-object locks and drop the references. */
 			SCTP_INP_RLOCK(it->inp);
 			SCTP_INP_DECR_REF(it->inp);
 			SCTP_TCB_LOCK(it->stcb);
 			atomic_add_int(&it->stcb->asoc.refcnt, -1);
 			iteration_count = 0;
 		}
 		KASSERT(it->inp == it->stcb->sctp_ep,
 		    ("%s: stcb %p does not belong to inp %p, but inp %p",
 		    __func__, it->stcb, it->inp, it->stcb->sctp_ep));
 
 		/* run function on this one */
 		(*it->function_assoc) (it->inp, it->stcb, it->pointer, it->val);
 
 		/*
 		 * we lie here, it really needs to have its own type but
 		 * first I must verify that this won't effect things :-0
 		 */
 		if (it->no_chunk_output == 0)
 			sctp_chunk_output(it->inp, it->stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED);
 
 		SCTP_TCB_UNLOCK(it->stcb);
 next_assoc:
 		it->stcb = LIST_NEXT(it->stcb, sctp_tcblist);
 		if (it->stcb == NULL) {
 			/* Run last function */
 			if (it->function_inp_end != NULL) {
 				inp_skip = (*it->function_inp_end) (it->inp,
 				    it->pointer,
 				    it->val);
 			}
 		}
 	}
 	SCTP_INP_RUNLOCK(it->inp);
 no_stcb:
 	/* done with all assocs on this endpoint, move on to next endpoint */
 	/* Every path reaching this label has released the endpoint lock. */
 	it->done_current_ep = 0;
 	if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) {
 		it->inp = NULL;
 	} else {
 		it->inp = LIST_NEXT(it->inp, sctp_list);
 	}
 	it->stcb = NULL;
 	if (it->inp == NULL) {
 		goto done_with_iterator;
 	}
 	goto select_a_new_ep;
 }
 
 /*
  * Drain the queue of pending iterators, running each one in turn.
  *
  * This function is called with the WQ lock in place.  The lock is dropped
  * while an iterator runs and is held again when the function returns.
  * sctp_it_ctl.iterator_running is set for the duration of the drain.
  */
 void
 sctp_iterator_worker(void)
 {
 	struct sctp_iterator *cur;
 
 	sctp_it_ctl.iterator_running = 1;
 	for (;;) {
 		cur = TAILQ_FIRST(&sctp_it_ctl.iteratorhead);
 		if (cur == NULL) {
 			break;
 		}
 		/* Unlink it first, then work on it without the WQ lock. */
 		TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, cur, sctp_nxt_itr);
 		SCTP_IPI_ITERATOR_WQ_UNLOCK();
 		CURVNET_SET(cur->vn);
 		sctp_iterator_work(cur);
 		CURVNET_RESTORE();
 		SCTP_IPI_ITERATOR_WQ_LOCK();
 		/* sa_ignore FREED_MEMORY */
 	}
 	sctp_it_ctl.iterator_running = 0;
 }
 
 
 /*
  * Deal with the ADDR work queue filled from the rtsock calls.
  *
  * Every entry queued on SCTP_BASE_INFO(addr_wq) is moved onto a freshly
  * allocated sctp_asconf_iterator, which is then handed to an iterator
  * over all endpoints with SCTP_PCB_FLAGS_BOUNDALL set.
  *
  * Failure handling:
  *   - If the sctp_asconf_iterator cannot be allocated, the ADDR_WQ timer
  *     is restarted so the work is retried later.
  *   - If the iterator cannot be started, the work is either released via
  *     sctp_asconf_iterator_end() (when the stack is no longer
  *     initialized) or moved back onto the address work queue.
  *
  * The caller visible in this file, sctp_timeout_handler(), invokes this
  * with the address work queue lock held.
  */
 static void
 sctp_handle_addr_wq(void)
 {
 	struct sctp_laddr *wi, *nwi;
 	struct sctp_asconf_iterator *asc;
 
 	SCTP_MALLOC(asc, struct sctp_asconf_iterator *,
 	    sizeof(struct sctp_asconf_iterator), SCTP_M_ASC_IT);
 	if (asc == NULL) {
 		/* Try later, no memory */
 		sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ,
 		    (struct sctp_inpcb *)NULL,
 		    (struct sctp_tcb *)NULL,
 		    (struct sctp_nets *)NULL);
 		return;
 	}
 	LIST_INIT(&asc->list_of_work);
 	asc->cnt = 0;
 
 	/* Take over everything that is currently queued. */
 	LIST_FOREACH_SAFE(wi, &SCTP_BASE_INFO(addr_wq), sctp_nxt_addr, nwi) {
 		LIST_REMOVE(wi, sctp_nxt_addr);
 		LIST_INSERT_HEAD(&asc->list_of_work, wi, sctp_nxt_addr);
 		asc->cnt++;
 	}
 
 	if (asc->cnt == 0) {
 		/* Nothing was queued. */
 		SCTP_FREE(asc, SCTP_M_ASC_IT);
 	} else {
 		int ret;
 
 		ret = sctp_initiate_iterator(sctp_asconf_iterator_ep,
 		    sctp_asconf_iterator_stcb,
 		    NULL,	/* No ep end for boundall */
 		    SCTP_PCB_FLAGS_BOUNDALL,
 		    SCTP_PCB_ANY_FEATURES,
 		    SCTP_ASOC_ANY_STATE,
 		    (void *)asc, 0,
 		    sctp_asconf_iterator_end, NULL, 0);
 		if (ret) {
 			SCTP_PRINTF("Failed to initiate iterator for handle_addr_wq\n");
 			/*
 			 * Freeing if we are stopping or put back on the
 			 * addr_wq.
 			 */
 			if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) {
 				sctp_asconf_iterator_end(asc, 0);
 			} else {
 				/*
 				 * Both lists link their elements through
 				 * sctp_nxt_addr, so each entry has to be
 				 * unlinked before it is re-queued and the
 				 * successor has to be fetched up front.
 				 * Otherwise LIST_INSERT_HEAD() rewrites the
 				 * linkage the traversal depends on and the
 				 * remaining entries are lost once asc is
 				 * freed.
 				 */
 				LIST_FOREACH_SAFE(wi, &asc->list_of_work, sctp_nxt_addr, nwi) {
 					LIST_REMOVE(wi, sctp_nxt_addr);
 					LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr);
 				}
 				SCTP_FREE(asc, SCTP_M_ASC_IT);
 			}
 		}
 	}
 }
 
 /*-
  * The following table shows which pointers for the inp, stcb, or net are
  * stored for each timer after it was started.
  *
  *|Name                         |Timer                        |inp |stcb|net |
  *|-----------------------------|-----------------------------|----|----|----|
  *|SCTP_TIMER_TYPE_SEND         |net->rxt_timer               |Yes |Yes |Yes |
  *|SCTP_TIMER_TYPE_INIT         |net->rxt_timer               |Yes |Yes |Yes |
  *|SCTP_TIMER_TYPE_RECV         |stcb->asoc.dack_timer        |Yes |Yes |No  |
  *|SCTP_TIMER_TYPE_SHUTDOWN     |net->rxt_timer               |Yes |Yes |Yes |
  *|SCTP_TIMER_TYPE_HEARTBEAT    |net->hb_timer                |Yes |Yes |Yes |
  *|SCTP_TIMER_TYPE_COOKIE       |net->rxt_timer               |Yes |Yes |Yes |
  *|SCTP_TIMER_TYPE_NEWCOOKIE    |inp->sctp_ep.signature_change|Yes |No  |No  |
  *|SCTP_TIMER_TYPE_PATHMTURAISE |net->pmtu_timer              |Yes |Yes |Yes |
  *|SCTP_TIMER_TYPE_SHUTDOWNACK  |net->rxt_timer               |Yes |Yes |Yes |
  *|SCTP_TIMER_TYPE_ASCONF       |stcb->asoc.asconf_timer      |Yes |Yes |Yes |
  *|SCTP_TIMER_TYPE_SHUTDOWNGUARD|stcb->asoc.shut_guard_timer  |Yes |Yes |No  |
  *|SCTP_TIMER_TYPE_AUTOCLOSE    |stcb->asoc.autoclose_timer   |Yes |Yes |No  |
  *|SCTP_TIMER_TYPE_STRRESET     |stcb->asoc.strreset_timer    |Yes |Yes |No  |
  *|SCTP_TIMER_TYPE_INPKILL      |inp->sctp_ep.signature_change|Yes |No  |No  |
  *|SCTP_TIMER_TYPE_ASOCKILL     |stcb->asoc.strreset_timer    |Yes |Yes |No  |
  *|SCTP_TIMER_TYPE_ADDR_WQ      |SCTP_BASE_INFO(addr_wq_timer)|No  |No  |No  |
  *|SCTP_TIMER_TYPE_PRIM_DELETED |stcb->asoc.delete_prim_timer |Yes |Yes |No  |
  */
 
 void
 sctp_timeout_handler(void *t)
 {
 	struct epoch_tracker et;
 	struct timeval tv;
 	struct sctp_inpcb *inp;
 	struct sctp_tcb *stcb;
 	struct sctp_nets *net;
 	struct sctp_timer *tmr;
 	struct mbuf *op_err;
 	int type;
 	int i, secret;
 	bool did_output, released_asoc_reference;
 
 	/*
 	 * If inp, stcb or net are not NULL, then references to these were
 	 * added when the timer was started, and must be released before
 	 * this function returns.
 	 */
 	tmr = (struct sctp_timer *)t;
 	inp = (struct sctp_inpcb *)tmr->ep;
 	stcb = (struct sctp_tcb *)tmr->tcb;
 	net = (struct sctp_nets *)tmr->net;
 	CURVNET_SET((struct vnet *)tmr->vnet);
 	NET_EPOCH_ENTER(et);
 	did_output = 1;
 	released_asoc_reference = false;
 
 #ifdef SCTP_AUDITING_ENABLED
 	sctp_audit_log(0xF0, (uint8_t)tmr->type);
 	sctp_auditing(3, inp, stcb, net);
 #endif
 
 	/* sanity checks... */
 	KASSERT(tmr->self == NULL || tmr->self == tmr,
 	    ("sctp_timeout_handler: tmr->self corrupted"));
 	KASSERT(SCTP_IS_TIMER_TYPE_VALID(tmr->type),
 	    ("sctp_timeout_handler: invalid timer type %d", tmr->type));
 	type = tmr->type;
 	KASSERT(stcb == NULL || stcb->sctp_ep == inp,
 	    ("sctp_timeout_handler of type %d: inp = %p, stcb->sctp_ep %p",
 	    type, stcb, stcb->sctp_ep));
 	tmr->stopped_from = 0xa001;
 	if ((stcb != NULL) && (stcb->asoc.state == SCTP_STATE_EMPTY)) {
 		SCTPDBG(SCTP_DEBUG_TIMER2,
 		    "Timer type %d handler exiting due to CLOSED association.\n",
 		    type);
 		goto out_decr;
 	}
 	tmr->stopped_from = 0xa002;
 	SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d goes off.\n", type);
 	if (!SCTP_OS_TIMER_ACTIVE(&tmr->timer)) {
 		SCTPDBG(SCTP_DEBUG_TIMER2,
 		    "Timer type %d handler exiting due to not being active.\n",
 		    type);
 		goto out_decr;
 	}
 
 	tmr->stopped_from = 0xa003;
 	if (stcb) {
 		SCTP_TCB_LOCK(stcb);
 		/*
 		 * Release reference so that association can be freed if
 		 * necessary below. This is safe now that we have acquired
 		 * the lock.
 		 */
 		atomic_add_int(&stcb->asoc.refcnt, -1);
 		released_asoc_reference = true;
 		if ((type != SCTP_TIMER_TYPE_ASOCKILL) &&
 		    ((stcb->asoc.state == SCTP_STATE_EMPTY) ||
 		    (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED))) {
 			SCTPDBG(SCTP_DEBUG_TIMER2,
 			    "Timer type %d handler exiting due to CLOSED association.\n",
 			    type);
 			goto out;
 		}
 	} else if (inp != NULL) {
 		SCTP_INP_WLOCK(inp);
 	} else {
 		SCTP_WQ_ADDR_LOCK();
 	}
 
 	/* Record in stopped_from which timeout occurred. */
 	tmr->stopped_from = type;
 	/* mark as being serviced now */
 	if (SCTP_OS_TIMER_PENDING(&tmr->timer)) {
 		/*
 		 * Callout has been rescheduled.
 		 */
 		goto out;
 	}
 	if (!SCTP_OS_TIMER_ACTIVE(&tmr->timer)) {
 		/*
 		 * Not active, so no action.
 		 */
 		goto out;
 	}
 	SCTP_OS_TIMER_DEACTIVATE(&tmr->timer);
 
 	/* call the handler for the appropriate timer type */
 	switch (type) {
 	case SCTP_TIMER_TYPE_SEND:
 		KASSERT(inp != NULL && stcb != NULL && net != NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		SCTP_STAT_INCR(sctps_timodata);
 		stcb->asoc.timodata++;
 		stcb->asoc.num_send_timers_up--;
 		if (stcb->asoc.num_send_timers_up < 0) {
 			stcb->asoc.num_send_timers_up = 0;
 		}
 		SCTP_TCB_LOCK_ASSERT(stcb);
 		if (sctp_t3rxt_timer(inp, stcb, net)) {
 			/* no need to unlock on tcb its gone */
 
 			goto out_decr;
 		}
 		SCTP_TCB_LOCK_ASSERT(stcb);
 #ifdef SCTP_AUDITING_ENABLED
 		sctp_auditing(4, inp, stcb, net);
 #endif
 		sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED);
 		did_output = true;
 		if ((stcb->asoc.num_send_timers_up == 0) &&
 		    (stcb->asoc.sent_queue_cnt > 0)) {
 			struct sctp_tmit_chunk *chk;
 
 			/*
 			 * Safeguard. If there on some on the sent queue
 			 * somewhere but no timers running something is
 			 * wrong... so we start a timer on the first chunk
 			 * on the send queue on whatever net it is sent to.
 			 */
 			TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) {
 				if (chk->whoTo != NULL) {
 					break;
 				}
 			}
 			if (chk != NULL) {
 				sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, chk->whoTo);
 			}
 		}
 		break;
 	case SCTP_TIMER_TYPE_INIT:
 		KASSERT(inp != NULL && stcb != NULL && net != NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		SCTP_STAT_INCR(sctps_timoinit);
 		stcb->asoc.timoinit++;
 		if (sctp_t1init_timer(inp, stcb, net)) {
 			/* no need to unlock on tcb its gone */
 			goto out_decr;
 		}
 		did_output = false;
 		break;
 	case SCTP_TIMER_TYPE_RECV:
 		KASSERT(inp != NULL && stcb != NULL && net == NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		SCTP_STAT_INCR(sctps_timosack);
 		stcb->asoc.timosack++;
 		sctp_send_sack(stcb, SCTP_SO_NOT_LOCKED);
 #ifdef SCTP_AUDITING_ENABLED
 		sctp_auditing(4, inp, stcb, NULL);
 #endif
 		sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SACK_TMR, SCTP_SO_NOT_LOCKED);
 		did_output = true;
 		break;
 	case SCTP_TIMER_TYPE_SHUTDOWN:
 		KASSERT(inp != NULL && stcb != NULL && net != NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		SCTP_STAT_INCR(sctps_timoshutdown);
 		stcb->asoc.timoshutdown++;
 		if (sctp_shutdown_timer(inp, stcb, net)) {
 			/* no need to unlock on tcb its gone */
 			goto out_decr;
 		}
 #ifdef SCTP_AUDITING_ENABLED
 		sctp_auditing(4, inp, stcb, net);
 #endif
 		sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SHUT_TMR, SCTP_SO_NOT_LOCKED);
 		did_output = true;
 		break;
 	case SCTP_TIMER_TYPE_HEARTBEAT:
 		KASSERT(inp != NULL && stcb != NULL && net != NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		SCTP_STAT_INCR(sctps_timoheartbeat);
 		stcb->asoc.timoheartbeat++;
 		if (sctp_heartbeat_timer(inp, stcb, net)) {
 			/* no need to unlock on tcb its gone */
 			goto out_decr;
 		}
 #ifdef SCTP_AUDITING_ENABLED
 		sctp_auditing(4, inp, stcb, net);
 #endif
 		if (!(net->dest_state & SCTP_ADDR_NOHB)) {
 			sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net);
 			sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_HB_TMR, SCTP_SO_NOT_LOCKED);
 			did_output = true;
 		} else {
 			did_output = false;
 		}
 		break;
 	case SCTP_TIMER_TYPE_COOKIE:
 		KASSERT(inp != NULL && stcb != NULL && net != NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		SCTP_STAT_INCR(sctps_timocookie);
 		stcb->asoc.timocookie++;
 		if (sctp_cookie_timer(inp, stcb, net)) {
 			/* no need to unlock on tcb its gone */
 			goto out_decr;
 		}
 #ifdef SCTP_AUDITING_ENABLED
 		sctp_auditing(4, inp, stcb, net);
 #endif
 		/*
 		 * We consider T3 and Cookie timer pretty much the same with
 		 * respect to where from in chunk_output.
 		 */
 		sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED);
 		did_output = true;
 		break;
 	case SCTP_TIMER_TYPE_NEWCOOKIE:
 		KASSERT(inp != NULL && stcb == NULL && net == NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		SCTP_STAT_INCR(sctps_timosecret);
 		(void)SCTP_GETTIME_TIMEVAL(&tv);
 		inp->sctp_ep.time_of_secret_change = tv.tv_sec;
 		inp->sctp_ep.last_secret_number =
 		    inp->sctp_ep.current_secret_number;
 		inp->sctp_ep.current_secret_number++;
 		if (inp->sctp_ep.current_secret_number >=
 		    SCTP_HOW_MANY_SECRETS) {
 			inp->sctp_ep.current_secret_number = 0;
 		}
 		secret = (int)inp->sctp_ep.current_secret_number;
 		for (i = 0; i < SCTP_NUMBER_OF_SECRETS; i++) {
 			inp->sctp_ep.secret_key[secret][i] =
 			    sctp_select_initial_TSN(&inp->sctp_ep);
 		}
 		sctp_timer_start(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL);
 		did_output = false;
 		break;
 	case SCTP_TIMER_TYPE_PATHMTURAISE:
 		KASSERT(inp != NULL && stcb != NULL && net != NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		SCTP_STAT_INCR(sctps_timopathmtu);
 		sctp_pathmtu_timer(inp, stcb, net);
 		did_output = false;
 		break;
 	case SCTP_TIMER_TYPE_SHUTDOWNACK:
 		KASSERT(inp != NULL && stcb != NULL && net != NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		if (sctp_shutdownack_timer(inp, stcb, net)) {
 			/* no need to unlock on tcb its gone */
 			goto out_decr;
 		}
 		SCTP_STAT_INCR(sctps_timoshutdownack);
 		stcb->asoc.timoshutdownack++;
 #ifdef SCTP_AUDITING_ENABLED
 		sctp_auditing(4, inp, stcb, net);
 #endif
 		sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SHUT_ACK_TMR, SCTP_SO_NOT_LOCKED);
 		did_output = true;
 		break;
 	case SCTP_TIMER_TYPE_ASCONF:
 		KASSERT(inp != NULL && stcb != NULL && net != NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		SCTP_STAT_INCR(sctps_timoasconf);
 		if (sctp_asconf_timer(inp, stcb, net)) {
 			/* no need to unlock on tcb its gone */
 			goto out_decr;
 		}
 #ifdef SCTP_AUDITING_ENABLED
 		sctp_auditing(4, inp, stcb, net);
 #endif
 		sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_ASCONF_TMR, SCTP_SO_NOT_LOCKED);
 		did_output = true;
 		break;
 	case SCTP_TIMER_TYPE_SHUTDOWNGUARD:
 		KASSERT(inp != NULL && stcb != NULL && net == NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		SCTP_STAT_INCR(sctps_timoshutdownguard);
 		op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
 		    "Shutdown guard timer expired");
 		sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED);
 		did_output = true;
 		/* no need to unlock on tcb its gone */
 		goto out_decr;
 	case SCTP_TIMER_TYPE_AUTOCLOSE:
 		KASSERT(inp != NULL && stcb != NULL && net == NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		SCTP_STAT_INCR(sctps_timoautoclose);
 		sctp_autoclose_timer(inp, stcb);
 		sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_AUTOCLOSE_TMR, SCTP_SO_NOT_LOCKED);
 		did_output = true;
 		break;
 	case SCTP_TIMER_TYPE_STRRESET:
 		KASSERT(inp != NULL && stcb != NULL && net == NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		SCTP_STAT_INCR(sctps_timostrmrst);
 		if (sctp_strreset_timer(inp, stcb)) {
 			/* no need to unlock on tcb its gone */
 			goto out_decr;
 		}
 		sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_TMR, SCTP_SO_NOT_LOCKED);
 		did_output = true;
 		break;
 	case SCTP_TIMER_TYPE_INPKILL:
 		KASSERT(inp != NULL && stcb == NULL && net == NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		SCTP_STAT_INCR(sctps_timoinpkill);
 		/*
 		 * special case, take away our increment since WE are the
 		 * killer
 		 */
 		sctp_timer_stop(SCTP_TIMER_TYPE_INPKILL, inp, NULL, NULL,
 		    SCTP_FROM_SCTPUTIL + SCTP_LOC_3);
 		SCTP_INP_DECR_REF(inp);
 		SCTP_INP_WUNLOCK(inp);
 		sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
 		    SCTP_CALLED_FROM_INPKILL_TIMER);
 		inp = NULL;
 		goto out_no_decr;
 	case SCTP_TIMER_TYPE_ASOCKILL:
 		KASSERT(inp != NULL && stcb != NULL && net == NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		SCTP_STAT_INCR(sctps_timoassockill);
 		/* Can we free it yet? */
 		SCTP_INP_DECR_REF(inp);
 		sctp_timer_stop(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL,
 		    SCTP_FROM_SCTPUTIL + SCTP_LOC_1);
 		(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 		    SCTP_FROM_SCTPUTIL + SCTP_LOC_2);
 		/*
 		 * free asoc, always unlocks (or destroy's) so prevent
 		 * duplicate unlock or unlock of a free mtx :-0
 		 */
 		stcb = NULL;
 		goto out_no_decr;
 	case SCTP_TIMER_TYPE_ADDR_WQ:
 		KASSERT(inp == NULL && stcb == NULL && net == NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		sctp_handle_addr_wq();
 		did_output = true;
 		break;
 	case SCTP_TIMER_TYPE_PRIM_DELETED:
 		KASSERT(inp != NULL && stcb != NULL && net == NULL,
 		    ("timeout of type %d: inp = %p, stcb = %p, net = %p",
 		    type, inp, stcb, net));
 		SCTP_STAT_INCR(sctps_timodelprim);
 		sctp_delete_prim_timer(inp, stcb);
 		did_output = false;
 		break;
 	default:
 #ifdef INVARIANTS
 		panic("Unknown timer type %d", type);
 #else
 		did_output = false;
 		goto out;
 #endif
 	}
 #ifdef SCTP_AUDITING_ENABLED
 	sctp_audit_log(0xF1, (uint8_t)type);
 	if (inp != NULL)
 		sctp_auditing(5, inp, stcb, net);
 #endif
 	if (did_output && (stcb != NULL)) {
 		/*
 		 * Now we need to clean up the control chunk chain if an
 		 * ECNE is on it. It must be marked as UNSENT again so next
 		 * call will continue to send it until such time that we get
 		 * a CWR, to remove it. It is, however, less likely that we
 		 * will find a ecn echo on the chain though.
 		 */
 		sctp_fix_ecn_echo(&stcb->asoc);
 	}
 out:
 	if (stcb != NULL) {
 		SCTP_TCB_UNLOCK(stcb);
 	} else if (inp != NULL) {
 		SCTP_INP_WUNLOCK(inp);
 	} else {
 		SCTP_WQ_ADDR_UNLOCK();
 	}
 
 out_decr:
 	/* These reference counts were incremented in sctp_timer_start(). */
 	if (inp != NULL) {
 		SCTP_INP_DECR_REF(inp);
 	}
 	if ((stcb != NULL) && !released_asoc_reference) {
 		atomic_add_int(&stcb->asoc.refcnt, -1);
 	}
 	if (net != NULL) {
 		sctp_free_remote_addr(net);
 	}
 out_no_decr:
 	SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d handler finished.\n", type);
 	CURVNET_RESTORE();
 	NET_EPOCH_EXIT(et);
 }
 
 /*-
  * The following table shows which parameters must be provided
  * when calling sctp_timer_start(). For parameters not being
  * provided, NULL must be used.
  *
  * |Name                         |inp |stcb|net |
  * |-----------------------------|----|----|----|
  * |SCTP_TIMER_TYPE_SEND         |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_INIT         |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_RECV         |Yes |Yes |No  |
  * |SCTP_TIMER_TYPE_SHUTDOWN     |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_HEARTBEAT    |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_COOKIE       |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_NEWCOOKIE    |Yes |No  |No  |
  * |SCTP_TIMER_TYPE_PATHMTURAISE |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_SHUTDOWNACK  |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_ASCONF       |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_SHUTDOWNGUARD|Yes |Yes |No  |
  * |SCTP_TIMER_TYPE_AUTOCLOSE    |Yes |Yes |No  |
  * |SCTP_TIMER_TYPE_STRRESET     |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_INPKILL      |Yes |No  |No  |
  * |SCTP_TIMER_TYPE_ASOCKILL     |Yes |Yes |No  |
  * |SCTP_TIMER_TYPE_ADDR_WQ      |No  |No  |No  |
  * |SCTP_TIMER_TYPE_PRIM_DELETED |Yes |Yes |No  |
  *
  */
 
 /*
  * Start the timer of type t_type for the given endpoint (inp), association
  * (stcb) and destination (net).  Which of the three must be non-NULL for a
  * given type is listed in the table above; a mismatch panics under
  * INVARIANTS and is silently ignored otherwise.
  *
  * The timer is not started when:
  *  - the association is marked SCTP_STATE_ABOUT_TO_BE_FREED (unless the
  *    timer is SCTP_TIMER_TYPE_ASOCKILL),
  *  - the destination is marked SCTP_ADDR_BEING_DELETED,
  *  - a type-specific condition suppresses it (HEARTBEAT, PATHMTURAISE), or
  *  - the selected timer structure is already pending.
  *
  * When a callout is newly scheduled, a reference is taken on each of
  * inp, stcb and net that is recorded in the timer.
  */
 void
 sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
     struct sctp_nets *net)
 {
 	struct sctp_timer *tmr;
 	uint32_t to_ticks;
 	uint32_t rndval, jitter;
 
 	/* Report the inp that was passed in, not the stcb, on mismatch. */
 	KASSERT(stcb == NULL || stcb->sctp_ep == inp,
 	    ("sctp_timer_start of type %d: inp = %p, stcb->sctp_ep %p",
 	    t_type, inp, stcb->sctp_ep));
 	tmr = NULL;
 	to_ticks = 0;
 	/* The most specific object provided determines the lock expected. */
 	if (stcb != NULL) {
 		SCTP_TCB_LOCK_ASSERT(stcb);
 	} else if (inp != NULL) {
 		SCTP_INP_WLOCK_ASSERT(inp);
 	} else {
 		SCTP_WQ_ADDR_LOCK_ASSERT();
 	}
 	if (stcb != NULL) {
 		/*
 		 * Don't restart timer on association that's about to be
 		 * killed.
 		 */
 		if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) &&
 		    (t_type != SCTP_TIMER_TYPE_ASOCKILL)) {
 			SCTPDBG(SCTP_DEBUG_TIMER2,
 			    "Timer type %d not started: inp=%p, stcb=%p, net=%p (stcb deleted).\n",
 			    t_type, inp, stcb, net);
 			return;
 		}
 		/* Don't restart timer on net that's been removed. */
 		if (net != NULL && (net->dest_state & SCTP_ADDR_BEING_DELETED)) {
 			SCTPDBG(SCTP_DEBUG_TIMER2,
 			    "Timer type %d not started: inp=%p, stcb=%p, net=%p (net deleted).\n",
 			    t_type, inp, stcb, net);
 			return;
 		}
 	}
 	/*
 	 * Per timer type: validate the parameter combination, select the
 	 * timer structure and compute the timeout in ticks.
 	 */
 	switch (t_type) {
 	case SCTP_TIMER_TYPE_SEND:
 		/* Here we use the RTO timer. */
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &net->rxt_timer;
 		if (net->RTO == 0) {
 			to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto);
 		} else {
 			to_ticks = sctp_msecs_to_ticks(net->RTO);
 		}
 		break;
 	case SCTP_TIMER_TYPE_INIT:
 		/*
 		 * Here we use the INIT timer default usually about 1
 		 * second.
 		 */
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &net->rxt_timer;
 		if (net->RTO == 0) {
 			to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto);
 		} else {
 			to_ticks = sctp_msecs_to_ticks(net->RTO);
 		}
 		break;
 	case SCTP_TIMER_TYPE_RECV:
 		/*
 		 * Here we use the Delayed-Ack timer value from the inp,
 		 * usually about 200ms.
 		 */
 		if ((inp == NULL) || (stcb == NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &stcb->asoc.dack_timer;
 		to_ticks = sctp_msecs_to_ticks(stcb->asoc.delayed_ack);
 		break;
 	case SCTP_TIMER_TYPE_SHUTDOWN:
 		/* Here we use the RTO of the destination. */
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &net->rxt_timer;
 		if (net->RTO == 0) {
 			to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto);
 		} else {
 			to_ticks = sctp_msecs_to_ticks(net->RTO);
 		}
 		break;
 	case SCTP_TIMER_TYPE_HEARTBEAT:
 		/*
 		 * The net is used here so that we can add in the RTO. Even
 		 * though we use a different timer. We also add the HB timer
 		 * PLUS a random jitter.
 		 */
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		if ((net->dest_state & SCTP_ADDR_NOHB) &&
 		    !(net->dest_state & SCTP_ADDR_UNCONFIRMED)) {
 			SCTPDBG(SCTP_DEBUG_TIMER2,
 			    "Timer type %d not started: inp=%p, stcb=%p, net=%p.\n",
 			    t_type, inp, stcb, net);
 			return;
 		}
 		tmr = &net->hb_timer;
 		/* Until the final conversion below, to_ticks holds msecs. */
 		if (net->RTO == 0) {
 			to_ticks = stcb->asoc.initial_rto;
 		} else {
 			to_ticks = net->RTO;
 		}
 		rndval = sctp_select_initial_TSN(&inp->sctp_ep);
 		jitter = rndval % to_ticks;
 		/* Spread the value over [RTO/2, 3*RTO/2). */
 		if (jitter >= (to_ticks >> 1)) {
 			to_ticks = to_ticks + (jitter - (to_ticks >> 1));
 		} else {
 			to_ticks = to_ticks - jitter;
 		}
 		if (!(net->dest_state & SCTP_ADDR_UNCONFIRMED) &&
 		    !(net->dest_state & SCTP_ADDR_PF)) {
 			to_ticks += net->heart_beat_delay;
 		}
 		/*
 		 * Now we must convert the to_ticks that are now in ms to
 		 * ticks.
 		 */
 		to_ticks = sctp_msecs_to_ticks(to_ticks);
 		break;
 	case SCTP_TIMER_TYPE_COOKIE:
 		/*
 		 * Here we can use the RTO timer from the network since one
 		 * RTT was complete. If a retransmission happened then we
 		 * will be using the RTO initial value.
 		 */
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &net->rxt_timer;
 		if (net->RTO == 0) {
 			to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto);
 		} else {
 			to_ticks = sctp_msecs_to_ticks(net->RTO);
 		}
 		break;
 	case SCTP_TIMER_TYPE_NEWCOOKIE:
 		/*
 		 * Nothing needed but the endpoint here, usually about 60
 		 * minutes.
 		 */
 		if ((inp == NULL) || (stcb != NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &inp->sctp_ep.signature_change;
 		to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_SIGNATURE];
 		break;
 	case SCTP_TIMER_TYPE_PATHMTURAISE:
 		/*
 		 * Here we use the value found in the EP for PMTUD,
 		 * usually about 10 minutes.
 		 */
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		if (net->dest_state & SCTP_ADDR_NO_PMTUD) {
 			SCTPDBG(SCTP_DEBUG_TIMER2,
 			    "Timer type %d not started: inp=%p, stcb=%p, net=%p.\n",
 			    t_type, inp, stcb, net);
 			return;
 		}
 		tmr = &net->pmtu_timer;
 		to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_PMTU];
 		break;
 	case SCTP_TIMER_TYPE_SHUTDOWNACK:
 		/* Here we use the RTO of the destination. */
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &net->rxt_timer;
 		if (net->RTO == 0) {
 			to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto);
 		} else {
 			to_ticks = sctp_msecs_to_ticks(net->RTO);
 		}
 		break;
 	case SCTP_TIMER_TYPE_ASCONF:
 		/*
 		 * Here the timer comes from the stcb but its value is from
 		 * the net's RTO.
 		 */
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &stcb->asoc.asconf_timer;
 		if (net->RTO == 0) {
 			to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto);
 		} else {
 			to_ticks = sctp_msecs_to_ticks(net->RTO);
 		}
 		break;
 	case SCTP_TIMER_TYPE_SHUTDOWNGUARD:
 		/*
 		 * Here we use the endpoints shutdown guard timer usually
 		 * about 3 minutes.
 		 */
 		if ((inp == NULL) || (stcb == NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &stcb->asoc.shut_guard_timer;
 		if (inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN] == 0) {
 			/* No configured value: use 5 * maxrto, avoiding overflow. */
 			if (stcb->asoc.maxrto < UINT32_MAX / 5) {
 				to_ticks = sctp_msecs_to_ticks(5 * stcb->asoc.maxrto);
 			} else {
 				to_ticks = sctp_msecs_to_ticks(UINT32_MAX);
 			}
 		} else {
 			to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN];
 		}
 		break;
 	case SCTP_TIMER_TYPE_AUTOCLOSE:
 		if ((inp == NULL) || (stcb == NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &stcb->asoc.autoclose_timer;
 		to_ticks = stcb->asoc.sctp_autoclose_ticks;
 		break;
 	case SCTP_TIMER_TYPE_STRRESET:
 		/*
 		 * Here the timer comes from the stcb but its value is from
 		 * the net's RTO.
 		 */
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &stcb->asoc.strreset_timer;
 		if (net->RTO == 0) {
 			to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto);
 		} else {
 			to_ticks = sctp_msecs_to_ticks(net->RTO);
 		}
 		break;
 	case SCTP_TIMER_TYPE_INPKILL:
 		/*
 		 * The inp is setup to die. We re-use the signature_change
 		 * timer since that has stopped and we are in the GONE
 		 * state.
 		 */
 		if ((inp == NULL) || (stcb != NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &inp->sctp_ep.signature_change;
 		to_ticks = sctp_msecs_to_ticks(SCTP_INP_KILL_TIMEOUT);
 		break;
 	case SCTP_TIMER_TYPE_ASOCKILL:
 		if ((inp == NULL) || (stcb == NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		/* The association kill timer shares the strreset timer. */
 		tmr = &stcb->asoc.strreset_timer;
 		to_ticks = sctp_msecs_to_ticks(SCTP_ASOC_KILL_TIMEOUT);
 		break;
 	case SCTP_TIMER_TYPE_ADDR_WQ:
 		if ((inp != NULL) || (stcb != NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		/* Only 1 tick away :-) */
 		tmr = &SCTP_BASE_INFO(addr_wq_timer);
 		to_ticks = SCTP_ADDRESS_TICK_DELAY;
 		break;
 	case SCTP_TIMER_TYPE_PRIM_DELETED:
 		if ((inp == NULL) || (stcb == NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &stcb->asoc.delete_prim_timer;
 		to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto);
 		break;
 	default:
 #ifdef INVARIANTS
 		panic("Unknown timer type %d", t_type);
 #else
 		return;
 #endif
 	}
 	KASSERT(tmr != NULL, ("tmr is NULL for timer type %d", t_type));
 	KASSERT(to_ticks > 0, ("to_ticks == 0 for timer type %d", t_type));
 	if (SCTP_OS_TIMER_PENDING(&tmr->timer)) {
 		/*
 		 * We do NOT allow you to have it already running. If it is,
 		 * we leave the current one up unchanged.
 		 */
 		SCTPDBG(SCTP_DEBUG_TIMER2,
 		    "Timer type %d already running: inp=%p, stcb=%p, net=%p.\n",
 		    t_type, inp, stcb, net);
 		return;
 	}
 	/* At this point we can proceed. */
 	if (t_type == SCTP_TIMER_TYPE_SEND) {
 		stcb->asoc.num_send_timers_up++;
 	}
 	/* Record the context the timeout handler will run with. */
 	tmr->stopped_from = 0;
 	tmr->type = t_type;
 	tmr->ep = (void *)inp;
 	tmr->tcb = (void *)stcb;
 	/* The stream reset timer does not record (or reference) the net. */
 	if (t_type == SCTP_TIMER_TYPE_STRRESET) {
 		tmr->net = NULL;
 	} else {
 		tmr->net = (void *)net;
 	}
 	tmr->self = (void *)tmr;
 	tmr->vnet = (void *)curvnet;
 	tmr->ticks = sctp_get_tick_count();
 	if (SCTP_OS_TIMER_START(&tmr->timer, to_ticks, sctp_timeout_handler, tmr) == 0) {
 		SCTPDBG(SCTP_DEBUG_TIMER2,
 		    "Timer type %d started: ticks=%u, inp=%p, stcb=%p, net=%p.\n",
 		    t_type, to_ticks, inp, stcb, net);
 		/*
 		 * If this is a newly scheduled callout, as opposed to a
 		 * rescheduled one, increment relevant reference counts.
 		 */
 		if (tmr->ep != NULL) {
 			SCTP_INP_INCR_REF(inp);
 		}
 		if (tmr->tcb != NULL) {
 			atomic_add_int(&stcb->asoc.refcnt, 1);
 		}
 		if (tmr->net != NULL) {
 			atomic_add_int(&net->ref_count, 1);
 		}
 	} else {
 		/*
 		 * This should not happen, since we checked for pending
 		 * above.
 		 */
 		SCTPDBG(SCTP_DEBUG_TIMER2,
 		    "Timer type %d restarted: ticks=%u, inp=%p, stcb=%p, net=%p.\n",
 		    t_type, to_ticks, inp, stcb, net);
 	}
 	return;
 }
 
 /*-
  * The following table shows which parameters must be provided
  * when calling sctp_timer_stop(). For parameters not being
  * provided, NULL must be used.
  *
  * |Name                         |inp |stcb|net |
  * |-----------------------------|----|----|----|
  * |SCTP_TIMER_TYPE_SEND         |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_INIT         |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_RECV         |Yes |Yes |No  |
  * |SCTP_TIMER_TYPE_SHUTDOWN     |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_HEARTBEAT    |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_COOKIE       |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_NEWCOOKIE    |Yes |No  |No  |
  * |SCTP_TIMER_TYPE_PATHMTURAISE |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_SHUTDOWNACK  |Yes |Yes |Yes |
  * |SCTP_TIMER_TYPE_ASCONF       |Yes |Yes |No  |
  * |SCTP_TIMER_TYPE_SHUTDOWNGUARD|Yes |Yes |No  |
  * |SCTP_TIMER_TYPE_AUTOCLOSE    |Yes |Yes |No  |
  * |SCTP_TIMER_TYPE_STRRESET     |Yes |Yes |No  |
  * |SCTP_TIMER_TYPE_INPKILL      |Yes |No  |No  |
  * |SCTP_TIMER_TYPE_ASOCKILL     |Yes |Yes |No  |
  * |SCTP_TIMER_TYPE_ADDR_WQ      |No  |No  |No  |
  * |SCTP_TIMER_TYPE_PRIM_DELETED |Yes |Yes |No  |
  *
  */
 
 /*
  * Stop the timer of type t_type for the given endpoint (inp), association
  * (stcb) and destination (net).  Which of the three must be non-NULL for a
  * given type is listed in the table above; a mismatch panics under
  * INVARIANTS and is silently ignored otherwise.
  *
  * Several timer types share one timer structure.  If the structure is
  * currently in use for a different type, nothing is stopped.  The value
  * of from is stored in the timer as stopped_from.  If the callout was
  * actually stopped, the references recorded in the timer are released.
  */
 void
 sctp_timer_stop(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
     struct sctp_nets *net, uint32_t from)
 {
 	struct sctp_timer *tmr;
 
 	/* Report the inp that was passed in, not the stcb, on mismatch. */
 	KASSERT(stcb == NULL || stcb->sctp_ep == inp,
 	    ("sctp_timer_stop of type %d: inp = %p, stcb->sctp_ep %p",
 	    t_type, inp, stcb->sctp_ep));
 	/* The most specific object provided determines the lock expected. */
 	if (stcb != NULL) {
 		SCTP_TCB_LOCK_ASSERT(stcb);
 	} else if (inp != NULL) {
 		SCTP_INP_WLOCK_ASSERT(inp);
 	} else {
 		SCTP_WQ_ADDR_LOCK_ASSERT();
 	}
 	tmr = NULL;
 	/* Per timer type: validate the parameters and select the timer. */
 	switch (t_type) {
 	case SCTP_TIMER_TYPE_SEND:
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &net->rxt_timer;
 		break;
 	case SCTP_TIMER_TYPE_INIT:
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &net->rxt_timer;
 		break;
 	case SCTP_TIMER_TYPE_RECV:
 		if ((inp == NULL) || (stcb == NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &stcb->asoc.dack_timer;
 		break;
 	case SCTP_TIMER_TYPE_SHUTDOWN:
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &net->rxt_timer;
 		break;
 	case SCTP_TIMER_TYPE_HEARTBEAT:
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &net->hb_timer;
 		break;
 	case SCTP_TIMER_TYPE_COOKIE:
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &net->rxt_timer;
 		break;
 	case SCTP_TIMER_TYPE_NEWCOOKIE:
 		if ((inp == NULL) || (stcb != NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &inp->sctp_ep.signature_change;
 		break;
 	case SCTP_TIMER_TYPE_PATHMTURAISE:
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &net->pmtu_timer;
 		break;
 	case SCTP_TIMER_TYPE_SHUTDOWNACK:
 		if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &net->rxt_timer;
 		break;
 	case SCTP_TIMER_TYPE_ASCONF:
 		if ((inp == NULL) || (stcb == NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &stcb->asoc.asconf_timer;
 		break;
 	case SCTP_TIMER_TYPE_SHUTDOWNGUARD:
 		if ((inp == NULL) || (stcb == NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &stcb->asoc.shut_guard_timer;
 		break;
 	case SCTP_TIMER_TYPE_AUTOCLOSE:
 		if ((inp == NULL) || (stcb == NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &stcb->asoc.autoclose_timer;
 		break;
 	case SCTP_TIMER_TYPE_STRRESET:
 		if ((inp == NULL) || (stcb == NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &stcb->asoc.strreset_timer;
 		break;
 	case SCTP_TIMER_TYPE_INPKILL:
 		/*
 		 * The inp is setup to die. We re-use the signature_change
 		 * timer since that has stopped and we are in the GONE
 		 * state.
 		 */
 		if ((inp == NULL) || (stcb != NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &inp->sctp_ep.signature_change;
 		break;
 	case SCTP_TIMER_TYPE_ASOCKILL:
 		if ((inp == NULL) || (stcb == NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		/* The association kill timer shares the strreset timer. */
 		tmr = &stcb->asoc.strreset_timer;
 		break;
 	case SCTP_TIMER_TYPE_ADDR_WQ:
 		if ((inp != NULL) || (stcb != NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &SCTP_BASE_INFO(addr_wq_timer);
 		break;
 	case SCTP_TIMER_TYPE_PRIM_DELETED:
 		if ((inp == NULL) || (stcb == NULL) || (net != NULL)) {
 #ifdef INVARIANTS
 			panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p",
 			    t_type, inp, stcb, net);
 #else
 			return;
 #endif
 		}
 		tmr = &stcb->asoc.delete_prim_timer;
 		break;
 	default:
 #ifdef INVARIANTS
 		panic("Unknown timer type %d", t_type);
 #else
 		return;
 #endif
 	}
 	KASSERT(tmr != NULL, ("tmr is NULL for timer type %d", t_type));
 	if ((tmr->type != SCTP_TIMER_TYPE_NONE) &&
 	    (tmr->type != t_type)) {
 		/*
 		 * Ok we have a timer that is under joint use. Cookie timer
 		 * per chance with the SEND timer. We therefore are NOT
 		 * running the timer that the caller wants stopped.  So just
 		 * return.
 		 */
 		SCTPDBG(SCTP_DEBUG_TIMER2,
 		    "Shared timer type %d not running: inp=%p, stcb=%p, net=%p.\n",
 		    t_type, inp, stcb, net);
 		return;
 	}
 	if ((t_type == SCTP_TIMER_TYPE_SEND) && (stcb != NULL)) {
 		/* Keep the count of running send timers from going negative. */
 		stcb->asoc.num_send_timers_up--;
 		if (stcb->asoc.num_send_timers_up < 0) {
 			stcb->asoc.num_send_timers_up = 0;
 		}
 	}
 	tmr->self = NULL;
 	tmr->stopped_from = from;
 	if (SCTP_OS_TIMER_STOP(&tmr->timer) == 1) {
 		/* The recorded context must match what the caller passed. */
 		KASSERT(tmr->ep == inp,
 		    ("sctp_timer_stop of type %d: inp = %p, tmr->inp = %p",
 		    t_type, inp, tmr->ep));
 		KASSERT(tmr->tcb == stcb,
 		    ("sctp_timer_stop of type %d: stcb = %p, tmr->stcb = %p",
 		    t_type, stcb, tmr->tcb));
 		KASSERT(((t_type == SCTP_TIMER_TYPE_ASCONF) && (tmr->net != NULL)) ||
 		    ((t_type != SCTP_TIMER_TYPE_ASCONF) && (tmr->net == net)),
 		    ("sctp_timer_stop of type %d: net = %p, tmr->net = %p",
 		    t_type, net, tmr->net));
 		SCTPDBG(SCTP_DEBUG_TIMER2,
 		    "Timer type %d stopped: inp=%p, stcb=%p, net=%p.\n",
 		    t_type, inp, stcb, net);
 		/*
 		 * If the timer was actually stopped, decrement reference
 		 * counts that were incremented in sctp_timer_start().
 		 */
 		if (tmr->ep != NULL) {
 			SCTP_INP_DECR_REF(inp);
 			tmr->ep = NULL;
 		}
 		if (tmr->tcb != NULL) {
 			atomic_add_int(&stcb->asoc.refcnt, -1);
 			tmr->tcb = NULL;
 		}
 		if (tmr->net != NULL) {
 			/*
 			 * Can't use net, since it doesn't work for
 			 * SCTP_TIMER_TYPE_ASCONF.
 			 */
 			sctp_free_remote_addr((struct sctp_nets *)tmr->net);
 			tmr->net = NULL;
 		}
 	} else {
 		SCTPDBG(SCTP_DEBUG_TIMER2,
 		    "Timer type %d not stopped: inp=%p, stcb=%p, net=%p.\n",
 		    t_type, inp, stcb, net);
 	}
 	return;
 }
 
 /*
  * Return the sum of the data lengths of all mbufs in the chain starting
  * at m.  A NULL chain yields 0.
  */
 uint32_t
 sctp_calculate_len(struct mbuf *m)
 {
 	struct mbuf *cur;
 	uint32_t total;
 
 	total = 0;
 	for (cur = m; cur; cur = SCTP_BUF_NEXT(cur)) {
 		total += SCTP_BUF_LEN(cur);
 	}
 	return (total);
 }
 
 /*
  * Reset the P-MTU size on this association: record mtu as the smallest
  * MTU and mark every chunk on the send and sent queues whose size exceeds
  * the effective MTU (mtu minus the minimum overhead for the endpoint's
  * address family) as CHUNK_FLAGS_FRAGMENT_OK, which allows the DF flag to
  * be cleared.
  */
 void
 sctp_mtu_size_reset(struct sctp_inpcb *inp,
     struct sctp_association *asoc, uint32_t mtu)
 {
 	struct sctp_tmit_chunk *chk;
 	unsigned int eff_mtu, ovh;
 
 	asoc->smallest_mtu = mtu;
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
 		ovh = SCTP_MIN_OVERHEAD;
 	} else {
 		ovh = SCTP_MIN_V4_OVERHEAD;
 	}
 	/*
 	 * Clamp instead of letting the unsigned subtraction wrap around:
 	 * with an mtu not larger than the overhead, a wrapped eff_mtu would
 	 * be huge and no chunk would ever be marked.
 	 */
 	if (mtu > ovh) {
 		eff_mtu = mtu - ovh;
 	} else {
 		eff_mtu = 0;
 	}
 	TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) {
 		if (chk->send_size > eff_mtu) {
 			chk->flags |= CHUNK_FLAGS_FRAGMENT_OK;
 		}
 	}
 	TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
 		if (chk->send_size > eff_mtu) {
 			chk->flags |= CHUNK_FLAGS_FRAGMENT_OK;
 		}
 	}
 }
 
 
 /*
  * Given an association and starting time of the current RTT period, update
  * RTO in number of msecs. net should point to the current network.
  * Return 1, if an RTO update was performed, return 0 if no update was
  * performed due to invalid starting point.
  */
 
 int
 sctp_calculate_rto(struct sctp_tcb *stcb,
     struct sctp_association *asoc,
     struct sctp_nets *net,
     struct timeval *old,
     int rtt_from_sack)
 {
 	struct timeval now;
 	uint64_t rtt_us;	/* RTT in us */
 	int32_t rtt;		/* RTT in ms */
 	uint32_t new_rto;
 	int first_measure = 0;
 
 	/************************/
 	/* 1. calculate new RTT */
 	/************************/
 	/* get the current time */
 	if (stcb->asoc.use_precise_time) {
 		(void)SCTP_GETPTIME_TIMEVAL(&now);
 	} else {
 		(void)SCTP_GETTIME_TIMEVAL(&now);
 	}
 	if ((old->tv_sec > now.tv_sec) ||
 	    ((old->tv_sec == now.tv_sec) && (old->tv_usec > now.tv_usec))) {
 		/* The starting point is in the future. */
 		return (0);
 	}
 	timevalsub(&now, old);
 	rtt_us = (uint64_t)1000000 * (uint64_t)now.tv_sec + (uint64_t)now.tv_usec;
 	if (rtt_us > SCTP_RTO_UPPER_BOUND * 1000) {
 		/* The RTT is larger than a sane value. */
 		return (0);
 	}
 	/* store the current RTT in us */
 	net->rtt = rtt_us;
 	/* compute rtt in ms */
 	rtt = (int32_t)(net->rtt / 1000);
 	if ((asoc->cc_functions.sctp_rtt_calculated) && (rtt_from_sack == SCTP_RTT_FROM_DATA)) {
 		/*
 		 * Tell the CC module that a new update has just occurred
 		 * from a sack
 		 */
 		(*asoc->cc_functions.sctp_rtt_calculated) (stcb, net, &now);
 	}
 	/*
 	 * Do we need to determine the lan? We do this only on sacks i.e.
 	 * RTT being determined from data not non-data (HB/INIT->INITACK).
 	 */
 	if ((rtt_from_sack == SCTP_RTT_FROM_DATA) &&
 	    (net->lan_type == SCTP_LAN_UNKNOWN)) {
 		if (net->rtt > SCTP_LOCAL_LAN_RTT) {
 			net->lan_type = SCTP_LAN_INTERNET;
 		} else {
 			net->lan_type = SCTP_LAN_LOCAL;
 		}
 	}
 
 	/***************************/
 	/* 2. update RTTVAR & SRTT */
 	/***************************/
 	/*-
 	 * Compute the scaled average lastsa and the
 	 * scaled variance lastsv as described in van Jacobson
 	 * Paper "Congestion Avoidance and Control", Annex A.
 	 *
 	 * (net->lastsa >> SCTP_RTT_SHIFT) is the srtt
 	 * (net->lastsv >> SCTP_RTT_VAR_SHIFT) is the rttvar
 	 */
 	if (net->RTO_measured) {
 		rtt -= (net->lastsa >> SCTP_RTT_SHIFT);
 		net->lastsa += rtt;
 		if (rtt < 0) {
 			rtt = -rtt;
 		}
 		rtt -= (net->lastsv >> SCTP_RTT_VAR_SHIFT);
 		net->lastsv += rtt;
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RTTVAR_LOGGING_ENABLE) {
 			rto_logging(net, SCTP_LOG_RTTVAR);
 		}
 	} else {
 		/* First RTO measurment */
 		net->RTO_measured = 1;
 		first_measure = 1;
 		net->lastsa = rtt << SCTP_RTT_SHIFT;
 		net->lastsv = (rtt / 2) << SCTP_RTT_VAR_SHIFT;
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RTTVAR_LOGGING_ENABLE) {
 			rto_logging(net, SCTP_LOG_INITIAL_RTT);
 		}
 	}
 	if (net->lastsv == 0) {
 		net->lastsv = SCTP_CLOCK_GRANULARITY;
 	}
 	new_rto = (net->lastsa >> SCTP_RTT_SHIFT) + net->lastsv;
 	if ((new_rto > SCTP_SAT_NETWORK_MIN) &&
 	    (stcb->asoc.sat_network_lockout == 0)) {
 		stcb->asoc.sat_network = 1;
 	} else if ((!first_measure) && stcb->asoc.sat_network) {
 		stcb->asoc.sat_network = 0;
 		stcb->asoc.sat_network_lockout = 1;
 	}
 	/* bound it, per C6/C7 in Section 5.3.1 */
 	if (new_rto < stcb->asoc.minrto) {
 		new_rto = stcb->asoc.minrto;
 	}
 	if (new_rto > stcb->asoc.maxrto) {
 		new_rto = stcb->asoc.maxrto;
 	}
 	net->RTO = new_rto;
 	return (1);
 }
 
 /*
  * Return a pointer to 'len' contiguous bytes of the mbuf chain m, starting
  * at offset 'off'.  If the requested range lies within a single mbuf, the
  * returned pointer refers directly into that mbuf.  Otherwise the bytes
  * are copied to 'in_ptr', which is then returned; the caller must ensure
  * that this buffer holds at least 'len' bytes.  Returns NULL if 'off' is
  * negative, 'len' is not positive, or the chain does not contain 'len'
  * bytes at that offset.
  */
 caddr_t
 sctp_m_getptr(struct mbuf *m, int off, int len, uint8_t *in_ptr)
 {
 	uint8_t *dst;
 	uint32_t chunk;
 
 	if ((off < 0) || (len <= 0)) {
 		return (NULL);
 	}
 
 	/* Skip the mbufs that lie completely in front of the offset. */
 	for (; (m != NULL) && (off > 0); m = SCTP_BUF_NEXT(m)) {
 		if (off < SCTP_BUF_LEN(m)) {
 			break;
 		}
 		off -= SCTP_BUF_LEN(m);
 	}
 	if (m == NULL) {
 		return (NULL);
 	}
 
 	/* Contiguous within this mbuf: hand out a pointer into it. */
 	if ((SCTP_BUF_LEN(m) - off) >= len) {
 		return (mtod(m, caddr_t)+off);
 	}
 
 	/* The range spans several mbufs: gather it in the caller's buffer. */
 	dst = in_ptr;
 	while ((m != NULL) && (len > 0)) {
 		chunk = min(SCTP_BUF_LEN(m) - off, len);
 		memcpy(dst, mtod(m, caddr_t)+off, chunk);
 		dst += chunk;
 		len -= chunk;
 		/* Only the first mbuf is read from an offset. */
 		off = 0;
 		m = SCTP_BUF_NEXT(m);
 	}
 	if ((m == NULL) && (len > 0)) {
 		/* The chain ended before 'len' bytes were gathered. */
 		return (NULL);
 	}
 	return ((caddr_t)in_ptr);
 }
 
 
 
 struct sctp_paramhdr *
 sctp_get_next_param(struct mbuf *m,
     int offset,
     struct sctp_paramhdr *pull,
     int pull_limit)
 {
 	/* This just provides a typed signature to Peter's Pull routine */
 	return ((struct sctp_paramhdr *)sctp_m_getptr(m, offset, pull_limit,
 	    (uint8_t *)pull));
 }
 
 
 /*
  * Append padlen zero bytes after the data of the mbuf m.  If m has not
  * enough trailing space, a new mbuf is allocated, linked in as the
  * successor of m, and the padding is placed there.
  *
  * Returns the mbuf that received the padding, or NULL if padlen is
  * outside of 0..3 or the allocation failed.
  */
 struct mbuf *
 sctp_add_pad_tombuf(struct mbuf *m, int padlen)
 {
 	struct mbuf *m_last;
 	caddr_t dp;
 
 	/*
 	 * A negative padlen must be rejected as well: it would shrink the
 	 * mbuf length and be converted to a huge size for memset().
 	 */
 	if ((padlen < 0) || (padlen > 3)) {
 		return (NULL);
 	}
 	if (padlen <= M_TRAILINGSPACE(m)) {
 		/*
 		 * The easy way. We hope the majority of the time we hit
 		 * here :)
 		 */
 		m_last = m;
 	} else {
 		/* Hard way we must grow the mbuf chain */
 		m_last = sctp_get_mbuf_for_msg(padlen, 0, M_NOWAIT, 1, MT_DATA);
 		if (m_last == NULL) {
 			return (NULL);
 		}
 		SCTP_BUF_LEN(m_last) = 0;
 		SCTP_BUF_NEXT(m_last) = NULL;
 		SCTP_BUF_NEXT(m) = m_last;
 	}
 	/* Zero the padding right behind the current data. */
 	dp = mtod(m_last, caddr_t)+SCTP_BUF_LEN(m_last);
 	SCTP_BUF_LEN(m_last) += padlen;
 	memset(dp, 0, padlen);
 	return (m_last);
 }
 
 /*
  * Pad the last mbuf of a chain with padval bytes.  If the caller already
  * knows the last mbuf, it is passed in last_mbuf and used directly;
  * otherwise the chain starting at m is walked to find it.  Returns the
  * result of sctp_add_pad_tombuf(), or NULL if neither last_mbuf nor m is
  * provided.
  */
 struct mbuf *
 sctp_pad_lastmbuf(struct mbuf *m, int padval, struct mbuf *last_mbuf)
 {
 	struct mbuf *tail;
 
 	if (last_mbuf != NULL) {
 		return (sctp_add_pad_tombuf(last_mbuf, padval));
 	}
 	if (m == NULL) {
 		return (NULL);
 	}
 	/* Advance to the mbuf that has no successor. */
 	tail = m;
 	while (SCTP_BUF_NEXT(tail) != NULL) {
 		tail = SCTP_BUF_NEXT(tail);
 	}
 	return (sctp_add_pad_tombuf(tail, padval));
 }
 
 /*
  * Report an association state change to the application.
  *
  * When the SCTP_PCB_FLAGS_RECVASSOCEVNT feature is on, an SCTP_ASSOC_CHANGE
  * notification is built and appended to the socket's read queue.  For
  * SCTP_COMM_UP and SCTP_RESTART the notification carries the list of
  * supported features in sac_info; for SCTP_COMM_LOST and
  * SCTP_CANT_STR_ASSOC it carries a copy of the ABORT chunk, if one was given.
  *
  * Independently of that feature, endpoints flagged SCTP_PCB_FLAGS_TCPTYPE or
  * SCTP_PCB_FLAGS_IN_TCPPOOL get so_error set and are marked as unable to
  * receive more data when the state is SCTP_COMM_LOST or SCTP_CANT_STR_ASSOC.
  * Readers and writers sleeping on the socket are always woken.
  *
  * state:     value stored in sac_state.
  * stcb:      association; a NULL pointer makes the call a no-op.
  * error:     value stored in sac_error.
  * abort:     optional ABORT chunk to copy into the notification.
  * from_peer: non-zero selects ECONNREFUSED/ECONNRESET as the socket error,
  *            zero selects ETIMEDOUT/ECONNABORTED.
  * so_locked: passed through to sctp_add_to_readq().
  */
 static void
 sctp_notify_assoc_change(uint16_t state, struct sctp_tcb *stcb,
     uint16_t error, struct sctp_abort_chunk *abort, uint8_t from_peer, int so_locked)
 {
 	struct mbuf *m_notify;
 	struct sctp_assoc_change *sac;
 	struct sctp_queued_to_read *control;
 	unsigned int notif_len;
 	uint16_t abort_len;
 	unsigned int i;
 
 	if (stcb == NULL) {
 		return;
 	}
 	if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT)) {
 		notif_len = (unsigned int)sizeof(struct sctp_assoc_change);
 		if (abort != NULL) {
 			abort_len = ntohs(abort->ch.chunk_length);
 			/*
 			 * Only SCTP_CHUNK_BUFFER_SIZE are guaranteed to be
 			 * contiguous.
 			 */
 			if (abort_len > SCTP_CHUNK_BUFFER_SIZE) {
 				abort_len = SCTP_CHUNK_BUFFER_SIZE;
 			}
 		} else {
 			abort_len = 0;
 		}
 		/* Reserve room for the optional trailing sac_info bytes. */
 		if ((state == SCTP_COMM_UP) || (state == SCTP_RESTART)) {
 			notif_len += SCTP_ASSOC_SUPPORTS_MAX;
 		} else if ((state == SCTP_COMM_LOST) || (state == SCTP_CANT_STR_ASSOC)) {
 			notif_len += abort_len;
 		}
 		m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_NOWAIT, 1, MT_DATA);
 		if (m_notify == NULL) {
 			/* Retry with smaller value. */
 			notif_len = (unsigned int)sizeof(struct sctp_assoc_change);
 			m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_NOWAIT, 1, MT_DATA);
 			if (m_notify == NULL) {
 				/* No notification, but still do the socket handling. */
 				goto set_error;
 			}
 		}
 		SCTP_BUF_NEXT(m_notify) = NULL;
 		sac = mtod(m_notify, struct sctp_assoc_change *);
 		/* notif_len reflects whichever allocation attempt succeeded. */
 		memset(sac, 0, notif_len);
 		sac->sac_type = SCTP_ASSOC_CHANGE;
 		sac->sac_flags = 0;
 		sac->sac_length = sizeof(struct sctp_assoc_change);
 		sac->sac_state = state;
 		sac->sac_error = error;
 		/* XXX verify these stream counts */
 		sac->sac_outbound_streams = stcb->asoc.streamoutcnt;
 		sac->sac_inbound_streams = stcb->asoc.streamincnt;
 		sac->sac_assoc_id = sctp_get_associd(stcb);
 		/*
 		 * The trailing info is only filled in when the larger
 		 * allocation succeeded; after the fallback notif_len equals
 		 * the bare structure size and this branch is skipped.
 		 */
 		if (notif_len > sizeof(struct sctp_assoc_change)) {
 			if ((state == SCTP_COMM_UP) || (state == SCTP_RESTART)) {
 				i = 0;
 				if (stcb->asoc.prsctp_supported == 1) {
 					sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_PR;
 				}
 				if (stcb->asoc.auth_supported == 1) {
 					sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_AUTH;
 				}
 				if (stcb->asoc.asconf_supported == 1) {
 					sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_ASCONF;
 				}
 				if (stcb->asoc.idata_supported == 1) {
 					sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_INTERLEAVING;
 				}
 				sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_MULTIBUF;
 				if (stcb->asoc.reconfig_supported == 1) {
 					sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_RE_CONFIG;
 				}
 				/* Report only the entries actually written. */
 				sac->sac_length += i;
 			} else if ((state == SCTP_COMM_LOST) || (state == SCTP_CANT_STR_ASSOC)) {
 				memcpy(sac->sac_info, abort, abort_len);
 				sac->sac_length += abort_len;
 			}
 		}
 		SCTP_BUF_LEN(m_notify) = sac->sac_length;
 		control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
 		    0, 0, stcb->asoc.context, 0, 0, 0,
 		    m_notify);
 		if (control != NULL) {
 			control->length = SCTP_BUF_LEN(m_notify);
 			control->spec_flags = M_NOTIFICATION;
 			/* not that we need this */
 			control->tail_mbuf = m_notify;
 			sctp_add_to_readq(stcb->sctp_ep, stcb,
 			    control,
 			    &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD,
 			    so_locked);
 		} else {
 			sctp_m_freem(m_notify);
 		}
 	}
 	/*
 	 * For 1-to-1 style sockets, we send up an error when an ABORT
 	 * comes in.
 	 */
 set_error:
 	if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 	    (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) &&
 	    ((state == SCTP_COMM_LOST) || (state == SCTP_CANT_STR_ASSOC))) {
 		SOCK_LOCK(stcb->sctp_socket);
 		if (from_peer) {
 			if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) {
 				SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ECONNREFUSED);
 				stcb->sctp_socket->so_error = ECONNREFUSED;
 			} else {
 				SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ECONNRESET);
 				stcb->sctp_socket->so_error = ECONNRESET;
 			}
 		} else {
 			if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) ||
 			    (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) {
 				SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ETIMEDOUT);
 				stcb->sctp_socket->so_error = ETIMEDOUT;
 			} else {
 				SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ECONNABORTED);
 				stcb->sctp_socket->so_error = ECONNABORTED;
 			}
 		}
 		SOCK_UNLOCK(stcb->sctp_socket);
 	}
 	/* Wake ANY sleepers */
 	if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 	    (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) &&
 	    ((state == SCTP_COMM_LOST) || (state == SCTP_CANT_STR_ASSOC))) {
 		/* Done outside the SOCK_LOCK section above. */
 		socantrcvmore(stcb->sctp_socket);
 	}
 	sorwakeup(stcb->sctp_socket);
 	sowwakeup(stcb->sctp_socket);
 }
 
 /*
  * Queue an SCTP_PEER_ADDR_CHANGE notification for the peer address sa.
  *
  * The call does nothing when stcb is NULL or the
  * SCTP_PCB_FLAGS_RECVPADDREVNT feature is off.  If the mbuf or the read
  * queue entry cannot be allocated the notification is dropped.
  *
  * state and error are stored in spc_state and spc_error; so_locked is
  * passed through to sctp_add_to_readq().
  */
 static void
 sctp_notify_peer_addr_change(struct sctp_tcb *stcb, uint32_t state,
     struct sockaddr *sa, uint32_t error, int so_locked)
 {
 	struct sctp_queued_to_read *rq;
 	struct sctp_paddr_change *ev;
 	struct mbuf *m;
 
 	if (stcb == NULL) {
 		return;
 	}
 	if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVPADDREVNT)) {
 		/* The event has not been enabled. */
 		return;
 	}
 	m = sctp_get_mbuf_for_msg(sizeof(struct sctp_paddr_change), 0, M_NOWAIT, 1, MT_DATA);
 	if (m == NULL) {
 		return;
 	}
 	SCTP_BUF_LEN(m) = 0;
 	ev = mtod(m, struct sctp_paddr_change *);
 	memset(ev, 0, sizeof(struct sctp_paddr_change));
 	ev->spc_type = SCTP_PEER_ADDR_CHANGE;
 	ev->spc_flags = 0;
 	ev->spc_length = sizeof(struct sctp_paddr_change);
 	ev->spc_state = state;
 	ev->spc_error = error;
 	ev->spc_assoc_id = sctp_get_associd(stcb);
 
 	/*
 	 * Copy the address.  IPv4 addresses are converted to v4-mapped IPv6
 	 * when the endpoint has SCTP_PCB_FLAGS_NEEDS_MAPPED_V4 set.
 	 */
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 #ifdef INET6
 		if (!sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) {
 			memcpy(&ev->spc_aaddr, sa, sizeof(struct sockaddr_in));
 		} else {
 			in6_sin_2_v4mapsin6((struct sockaddr_in *)sa,
 			    (struct sockaddr_in6 *)&ev->spc_aaddr);
 		}
 #else
 		memcpy(&ev->spc_aaddr, sa, sizeof(struct sockaddr_in));
 #endif
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		{
 			struct sockaddr_in6 *dst6;
 
 			dst6 = (struct sockaddr_in6 *)&ev->spc_aaddr;
 			memcpy(&ev->spc_aaddr, sa, sizeof(struct sockaddr_in6));
 			if (!IN6_IS_SCOPE_LINKLOCAL(&dst6->sin6_addr)) {
 				break;
 			}
 			if (dst6->sin6_scope_id != 0) {
 				/* clear embedded scope_id for user */
 				in6_clearscope(&dst6->sin6_addr);
 			} else {
 				/* recover scope_id for user */
 				(void)sa6_recoverscope(dst6);
 			}
 			break;
 		}
 #endif
 	default:
 		/* TSNH */
 		break;
 	}
 
 	SCTP_BUF_NEXT(m) = NULL;
 	SCTP_BUF_LEN(m) = sizeof(struct sctp_paddr_change);
 
 	/* Wrap the mbuf in a read queue entry and hand it to the socket. */
 	rq = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
 	    0, 0, stcb->asoc.context, 0, 0, 0, m);
 	if (rq == NULL) {
 		/* Out of memory: drop the notification. */
 		sctp_m_freem(m);
 		return;
 	}
 	rq->length = SCTP_BUF_LEN(m);
 	rq->spec_flags = M_NOTIFICATION;
 	/* not that we need this */
 	rq->tail_mbuf = m;
 	sctp_add_to_readq(stcb->sctp_ep, stcb, rq,
 	    &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked);
 }
 
 
 /*
  * Report a failed DATA transmission for a transmit chunk.
  *
  * Builds either an SCTP_SEND_FAILED_EVENT (when
  * SCTP_PCB_FLAGS_RECVNSENDFAILEVNT is on) or an SCTP_SEND_FAILED
  * notification, chains the chunk's data mbufs behind the notification
  * header and appends the result to the socket's read queue.  Nothing is done
  * when stcb is NULL or when both send-failed features are off.
  *
  * sent:      non-zero sets SCTP_DATA_SENT in the flags field, zero sets
  *            SCTP_DATA_UNSENT.
  * error:     value stored in the notification's error field.
  * chk:       the failed chunk; once the header mbuf has been allocated its
  *            data pointer is taken over and set to NULL.
  * so_locked: passed through to sctp_add_to_readq().
  */
 static void
 sctp_notify_send_failed(struct sctp_tcb *stcb, uint8_t sent, uint32_t error,
     struct sctp_tmit_chunk *chk, int so_locked)
 {
 	struct mbuf *m_notify;
 	struct sctp_send_failed *ssf;
 	struct sctp_send_failed_event *ssfe;
 	struct sctp_queued_to_read *control;
 	struct sctp_chunkhdr *chkhdr;
 	int notifhdr_len, chk_len, chkhdr_len, padding_len, payload_len;
 
 	if ((stcb == NULL) ||
 	    (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT) &&
 	    sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT))) {
 		/* event not enabled */
 		return;
 	}
 
 	/* The newer event format is used whenever it is enabled. */
 	if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) {
 		notifhdr_len = sizeof(struct sctp_send_failed_event);
 	} else {
 		notifhdr_len = sizeof(struct sctp_send_failed);
 	}
 	m_notify = sctp_get_mbuf_for_msg(notifhdr_len, 0, M_NOWAIT, 1, MT_DATA);
 	if (m_notify == NULL)
 		/* no space left */
 		return;
 	SCTP_BUF_LEN(m_notify) = notifhdr_len;
 	/* The chunk header size depends on whether I-DATA is in use. */
 	if (stcb->asoc.idata_supported) {
 		chkhdr_len = sizeof(struct sctp_idata_chunk);
 	} else {
 		chkhdr_len = sizeof(struct sctp_data_chunk);
 	}
 	/* Use some defaults in case we can't access the chunk header */
 	if (chk->send_size >= chkhdr_len) {
 		payload_len = chk->send_size - chkhdr_len;
 	} else {
 		payload_len = 0;
 	}
 	padding_len = 0;
 	if (chk->data != NULL) {
 		chkhdr = mtod(chk->data, struct sctp_chunkhdr *);
 		if (chkhdr != NULL) {
 			chk_len = ntohs(chkhdr->chunk_length);
 			/*
 			 * Trust the chunk length only if it covers the header
 			 * and differs from send_size by less than 4 bytes;
 			 * that difference is then treated as padding.
 			 */
 			if ((chk_len >= chkhdr_len) &&
 			    (chk->send_size >= chk_len) &&
 			    (chk->send_size - chk_len < 4)) {
 				padding_len = chk->send_size - chk_len;
 				payload_len = chk->send_size - chkhdr_len - padding_len;
 			}
 		}
 	}
 	if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) {
 		ssfe = mtod(m_notify, struct sctp_send_failed_event *);
 		memset(ssfe, 0, notifhdr_len);
 		ssfe->ssfe_type = SCTP_SEND_FAILED_EVENT;
 		if (sent) {
 			ssfe->ssfe_flags = SCTP_DATA_SENT;
 		} else {
 			ssfe->ssfe_flags = SCTP_DATA_UNSENT;
 		}
 		ssfe->ssfe_length = (uint32_t)(notifhdr_len + payload_len);
 		ssfe->ssfe_error = error;
 		/* not exactly what the user sent in, but should be close :) */
 		ssfe->ssfe_info.snd_sid = chk->rec.data.sid;
 		ssfe->ssfe_info.snd_flags = chk->rec.data.rcv_flags;
 		ssfe->ssfe_info.snd_ppid = chk->rec.data.ppid;
 		ssfe->ssfe_info.snd_context = chk->rec.data.context;
 		ssfe->ssfe_info.snd_assoc_id = sctp_get_associd(stcb);
 		ssfe->ssfe_assoc_id = sctp_get_associd(stcb);
 	} else {
 		ssf = mtod(m_notify, struct sctp_send_failed *);
 		memset(ssf, 0, notifhdr_len);
 		ssf->ssf_type = SCTP_SEND_FAILED;
 		if (sent) {
 			ssf->ssf_flags = SCTP_DATA_SENT;
 		} else {
 			ssf->ssf_flags = SCTP_DATA_UNSENT;
 		}
 		ssf->ssf_length = (uint32_t)(notifhdr_len + payload_len);
 		ssf->ssf_error = error;
 		/* not exactly what the user sent in, but should be close :) */
 		ssf->ssf_info.sinfo_stream = chk->rec.data.sid;
 		/* The message id is truncated to 16 bits for sinfo_ssn. */
 		ssf->ssf_info.sinfo_ssn = (uint16_t)chk->rec.data.mid;
 		ssf->ssf_info.sinfo_flags = chk->rec.data.rcv_flags;
 		ssf->ssf_info.sinfo_ppid = chk->rec.data.ppid;
 		ssf->ssf_info.sinfo_context = chk->rec.data.context;
 		ssf->ssf_info.sinfo_assoc_id = sctp_get_associd(stcb);
 		ssf->ssf_assoc_id = sctp_get_associd(stcb);
 	}
 	if (chk->data != NULL) {
 		/* Trim off the sctp chunk header (it should be there) */
 		if (chk->send_size == chkhdr_len + payload_len + padding_len) {
 			/* Drop the header from the front, the padding from the end. */
 			m_adj(chk->data, chkhdr_len);
 			m_adj(chk->data, -padding_len);
 			sctp_mbuf_crush(chk->data);
 			chk->send_size -= (chkhdr_len + padding_len);
 		}
 	}
 	SCTP_BUF_NEXT(m_notify) = chk->data;
 	/* Steal off the mbuf */
 	chk->data = NULL;
 	/*
 	 * For this case, we check the actual socket buffer, since the assoc
 	 * is going away we don't want to overfill the socket buffer for a
 	 * non-reader
 	 */
 	if (sctp_sbspace_failedmsgs(&stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m_notify)) {
 		/* The payload is already chained behind m_notify here. */
 		sctp_m_freem(m_notify);
 		return;
 	}
 	/* append to socket */
 	control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
 	    0, 0, stcb->asoc.context, 0, 0, 0,
 	    m_notify);
 	if (control == NULL) {
 		/* no memory */
 		sctp_m_freem(m_notify);
 		return;
 	}
 	control->length = SCTP_BUF_LEN(m_notify);
 	control->spec_flags = M_NOTIFICATION;
 	/* not that we need this */
 	control->tail_mbuf = m_notify;
 	sctp_add_to_readq(stcb->sctp_ep, stcb,
 	    control,
 	    &stcb->sctp_socket->so_rcv, 1,
 	    SCTP_READ_LOCK_NOT_HELD,
 	    so_locked);
 }
 
 
 /*
  * Report a failed send for a message that is still pending on a stream
  * queue.
  *
  * Builds an SCTP_SEND_FAILED_EVENT when SCTP_PCB_FLAGS_RECVNSENDFAILEVNT is
  * on, otherwise an SCTP_SEND_FAILED notification, always flagged
  * SCTP_DATA_UNSENT.  The pending data (sp->data) is chained behind the
  * notification header and sp->data is set to NULL.  Nothing is done when
  * stcb is NULL or both send-failed features are off.
  *
  * error is stored in the notification; so_locked is passed through to
  * sctp_add_to_readq().
  */
 static void
 sctp_notify_send_failed2(struct sctp_tcb *stcb, uint32_t error,
     struct sctp_stream_queue_pending *sp, int so_locked)
 {
 	struct sctp_queued_to_read *rq;
 	struct mbuf *m;
 	int hdr_len;
 	int new_format;
 
 	if (stcb == NULL) {
 		return;
 	}
 	if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT) &&
 	    sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) {
 		/* Neither flavour of the event is enabled. */
 		return;
 	}
 
 	/* The newer event format is used whenever it is enabled. */
 	new_format = sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT) ? 1 : 0;
 	if (new_format) {
 		hdr_len = sizeof(struct sctp_send_failed_event);
 	} else {
 		hdr_len = sizeof(struct sctp_send_failed);
 	}
 	m = sctp_get_mbuf_for_msg(hdr_len, 0, M_NOWAIT, 1, MT_DATA);
 	if (m == NULL) {
 		/* Could not get an mbuf for the header. */
 		return;
 	}
 	SCTP_BUF_LEN(m) = hdr_len;
 
 	if (new_format) {
 		struct sctp_send_failed_event *ev;
 
 		ev = mtod(m, struct sctp_send_failed_event *);
 		memset(ev, 0, hdr_len);
 		ev->ssfe_type = SCTP_SEND_FAILED_EVENT;
 		ev->ssfe_flags = SCTP_DATA_UNSENT;
 		ev->ssfe_length = (uint32_t)(hdr_len + sp->length);
 		ev->ssfe_error = error;
 		/* not exactly what the user sent in, but should be close :) */
 		ev->ssfe_info.snd_sid = sp->sid;
 		ev->ssfe_info.snd_flags = sp->some_taken ?
 		    SCTP_DATA_LAST_FRAG : SCTP_DATA_NOT_FRAG;
 		ev->ssfe_info.snd_ppid = sp->ppid;
 		ev->ssfe_info.snd_context = sp->context;
 		ev->ssfe_info.snd_assoc_id = sctp_get_associd(stcb);
 		ev->ssfe_assoc_id = sctp_get_associd(stcb);
 	} else {
 		struct sctp_send_failed *ev;
 
 		ev = mtod(m, struct sctp_send_failed *);
 		memset(ev, 0, hdr_len);
 		ev->ssf_type = SCTP_SEND_FAILED;
 		ev->ssf_flags = SCTP_DATA_UNSENT;
 		ev->ssf_length = (uint32_t)(hdr_len + sp->length);
 		ev->ssf_error = error;
 		/* not exactly what the user sent in, but should be close :) */
 		ev->ssf_info.sinfo_stream = sp->sid;
 		ev->ssf_info.sinfo_ssn = 0;
 		ev->ssf_info.sinfo_flags = sp->some_taken ?
 		    SCTP_DATA_LAST_FRAG : SCTP_DATA_NOT_FRAG;
 		ev->ssf_info.sinfo_ppid = sp->ppid;
 		ev->ssf_info.sinfo_context = sp->context;
 		ev->ssf_info.sinfo_assoc_id = sctp_get_associd(stcb);
 		ev->ssf_assoc_id = sctp_get_associd(stcb);
 	}
 
 	/* Take over the pending data and chain it behind the header. */
 	SCTP_BUF_NEXT(m) = sp->data;
 	sp->data = NULL;
 
 	/*
 	 * For this case, we check the actual socket buffer, since the assoc
 	 * is going away we don't want to overfill the socket buffer for a
 	 * non-reader
 	 */
 	if (sctp_sbspace_failedmsgs(&stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m)) {
 		sctp_m_freem(m);
 		return;
 	}
 
 	rq = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
 	    0, 0, stcb->asoc.context, 0, 0, 0, m);
 	if (rq == NULL) {
 		/* Out of memory: drop the notification. */
 		sctp_m_freem(m);
 		return;
 	}
 	rq->length = SCTP_BUF_LEN(m);
 	rq->spec_flags = M_NOTIFICATION;
 	/* not that we need this */
 	rq->tail_mbuf = m;
 	sctp_add_to_readq(stcb->sctp_ep, stcb, rq,
 	    &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked);
 }
 
 
 
 /*
  * Queue an SCTP_ADAPTATION_INDICATION notification carrying the peer's
  * adaptation indication (stcb->asoc.peers_adaptation).
  *
  * The call does nothing when stcb is NULL or the
  * SCTP_PCB_FLAGS_ADAPTATIONEVNT feature is off.  If the mbuf or the read
  * queue entry cannot be allocated the notification is dropped.
  */
 static void
 sctp_notify_adaptation_layer(struct sctp_tcb *stcb)
 {
 	struct mbuf *m_notify;
 	struct sctp_adaptation_event *sai;
 	struct sctp_queued_to_read *control;
 
 	if ((stcb == NULL) ||
 	    sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_ADAPTATIONEVNT)) {
 		/* event not enabled */
 		return;
 	}
 
 	/*
 	 * Size the buffer by the structure that is written below, so the
 	 * allocation, the memset and the reported length always agree.
 	 */
 	m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_adaptation_event), 0, M_NOWAIT, 1, MT_DATA);
 	if (m_notify == NULL) {
 		/* no space left */
 		return;
 	}
 	SCTP_BUF_LEN(m_notify) = 0;
 	sai = mtod(m_notify, struct sctp_adaptation_event *);
 	memset(sai, 0, sizeof(struct sctp_adaptation_event));
 	sai->sai_type = SCTP_ADAPTATION_INDICATION;
 	sai->sai_flags = 0;
 	sai->sai_length = sizeof(struct sctp_adaptation_event);
 	sai->sai_adaptation_ind = stcb->asoc.peers_adaptation;
 	sai->sai_assoc_id = sctp_get_associd(stcb);
 
 	SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_adaptation_event);
 	SCTP_BUF_NEXT(m_notify) = NULL;
 
 	/* append to socket */
 	control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
 	    0, 0, stcb->asoc.context, 0, 0, 0,
 	    m_notify);
 	if (control == NULL) {
 		/* no memory */
 		sctp_m_freem(m_notify);
 		return;
 	}
 	control->length = SCTP_BUF_LEN(m_notify);
 	control->spec_flags = M_NOTIFICATION;
 	/* not that we need this */
 	control->tail_mbuf = m_notify;
 	sctp_add_to_readq(stcb->sctp_ep, stcb,
 	    control,
 	    &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
 }
 
 /*
  * Queue an SCTP_PARTIAL_DELIVERY_EVENT notification.
  *
  * This always must be called with the read-queue LOCKED in the INP.  Unlike
  * the other notification helpers in this file, the entry is inserted into
  * the read queue directly instead of going through sctp_add_to_readq().
  *
  * error: value stored in pdapi_indication.
  * val:   upper 16 bits are stored in pdapi_stream, lower 16 bits in
  *        pdapi_seq.
  * so_locked: not used by this function.
  *
  * Nothing is done when stcb is NULL, when SCTP_PCB_FLAGS_PDAPIEVNT is off
  * or when the endpoint has SCTP_PCB_FLAGS_SOCKET_CANT_READ set.
  */
 static void
 sctp_notify_partial_delivery_indication(struct sctp_tcb *stcb, uint32_t error,
     uint32_t val, int so_locked)
 {
 	struct mbuf *m_notify;
 	struct sctp_pdapi_event *pdapi;
 	struct sctp_queued_to_read *control;
 	struct sockbuf *sb;
 
 	if ((stcb == NULL) ||
 	    sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_PDAPIEVNT)) {
 		/* event not enabled */
 		return;
 	}
 	if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ) {
 		return;
 	}
 
 	m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_pdapi_event), 0, M_NOWAIT, 1, MT_DATA);
 	if (m_notify == NULL)
 		/* no space left */
 		return;
 	SCTP_BUF_LEN(m_notify) = 0;
 	pdapi = mtod(m_notify, struct sctp_pdapi_event *);
 	memset(pdapi, 0, sizeof(struct sctp_pdapi_event));
 	pdapi->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT;
 	pdapi->pdapi_flags = 0;
 	pdapi->pdapi_length = sizeof(struct sctp_pdapi_event);
 	pdapi->pdapi_indication = error;
 	/* val packs the stream in the high half and the sequence in the low half. */
 	pdapi->pdapi_stream = (val >> 16);
 	pdapi->pdapi_seq = (val & 0x0000ffff);
 	pdapi->pdapi_assoc_id = sctp_get_associd(stcb);
 
 	SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_pdapi_event);
 	SCTP_BUF_NEXT(m_notify) = NULL;
 	control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
 	    0, 0, stcb->asoc.context, 0, 0, 0,
 	    m_notify);
 	if (control == NULL) {
 		/* no memory */
 		sctp_m_freem(m_notify);
 		return;
 	}
 	control->length = SCTP_BUF_LEN(m_notify);
 	control->spec_flags = M_NOTIFICATION;
 	/* not that we need this */
 	control->tail_mbuf = m_notify;
 	/* Account for the mbuf in the receive socket buffer by hand. */
 	sb = &stcb->sctp_socket->so_rcv;
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
 		sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(m_notify));
 	}
 	sctp_sballoc(stcb, sb, m_notify);
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
 		sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0);
 	}
 	control->end_added = 1;
 	/* Place the event right after the entry being partially delivered. */
 	if (stcb->asoc.control_pdapi)
 		TAILQ_INSERT_AFTER(&stcb->sctp_ep->read_queue, stcb->asoc.control_pdapi, control, next);
 	else {
 		/* we really should not see this case */
 		TAILQ_INSERT_TAIL(&stcb->sctp_ep->read_queue, control, next);
 	}
 	/*
 	 * NOTE(review): both pointers have already been dereferenced above,
 	 * so this test cannot protect against NULL at this point.
 	 */
 	if (stcb->sctp_ep && stcb->sctp_socket) {
 		/* This should always be the case */
 		sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket);
 	}
 }
 
 static void
 sctp_notify_shutdown_event(struct sctp_tcb *stcb)
 {
 	struct mbuf *m_notify;
 	struct sctp_shutdown_event *sse;
 	struct sctp_queued_to_read *control;
 
 	/*
 	 * For TCP model AND UDP connected sockets we will send an error up
 	 * when an SHUTDOWN completes
 	 */
 	if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 	    (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
 		/* mark socket closed for read/write and wakeup! */
 		socantsendmore(stcb->sctp_socket);
 	}
 	if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT)) {
 		/* event not enabled */
 		return;
 	}
 
 	m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_event), 0, M_NOWAIT, 1, MT_DATA);
 	if (m_notify == NULL)
 		/* no space left */
 		return;
 	sse = mtod(m_notify, struct sctp_shutdown_event *);
 	memset(sse, 0, sizeof(struct sctp_shutdown_event));
 	sse->sse_type = SCTP_SHUTDOWN_EVENT;
 	sse->sse_flags = 0;
 	sse->sse_length = sizeof(struct sctp_shutdown_event);
 	sse->sse_assoc_id = sctp_get_associd(stcb);
 
 	SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_shutdown_event);
 	SCTP_BUF_NEXT(m_notify) = NULL;
 
 	/* append to socket */
 	control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
 	    0, 0, stcb->asoc.context, 0, 0, 0,
 	    m_notify);
 	if (control == NULL) {
 		/* no memory */
 		sctp_m_freem(m_notify);
 		return;
 	}
 	control->length = SCTP_BUF_LEN(m_notify);
 	control->spec_flags = M_NOTIFICATION;
 	/* not that we need this */
 	control->tail_mbuf = m_notify;
 	sctp_add_to_readq(stcb->sctp_ep, stcb,
 	    control,
 	    &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
 }
 
 static void
 sctp_notify_sender_dry_event(struct sctp_tcb *stcb,
     int so_locked)
 {
 	struct mbuf *m_notify;
 	struct sctp_sender_dry_event *event;
 	struct sctp_queued_to_read *control;
 
 	if ((stcb == NULL) ||
 	    sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_DRYEVNT)) {
 		/* event not enabled */
 		return;
 	}
 
 	m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_sender_dry_event), 0, M_NOWAIT, 1, MT_DATA);
 	if (m_notify == NULL) {
 		/* no space left */
 		return;
 	}
 	SCTP_BUF_LEN(m_notify) = 0;
 	event = mtod(m_notify, struct sctp_sender_dry_event *);
 	memset(event, 0, sizeof(struct sctp_sender_dry_event));
 	event->sender_dry_type = SCTP_SENDER_DRY_EVENT;
 	event->sender_dry_flags = 0;
 	event->sender_dry_length = sizeof(struct sctp_sender_dry_event);
 	event->sender_dry_assoc_id = sctp_get_associd(stcb);
 
 	SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_sender_dry_event);
 	SCTP_BUF_NEXT(m_notify) = NULL;
 
 	/* append to socket */
 	control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
 	    0, 0, stcb->asoc.context, 0, 0, 0,
 	    m_notify);
 	if (control == NULL) {
 		/* no memory */
 		sctp_m_freem(m_notify);
 		return;
 	}
 	control->length = SCTP_BUF_LEN(m_notify);
 	control->spec_flags = M_NOTIFICATION;
 	/* not that we need this */
 	control->tail_mbuf = m_notify;
 	sctp_add_to_readq(stcb->sctp_ep, stcb, control,
 	    &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked);
 }
 
 
 /*
  * Queue an SCTP_STREAM_CHANGE_EVENT notification reporting the stream
  * counts numberin and numberout, with flag stored in strchange_flags.
  *
  * The call does nothing when stcb is NULL or the
  * SCTP_PCB_FLAGS_STREAM_CHANGEEVNT feature is off.  stcb->asoc.peer_req_out
  * is always cleared; when it was set and flag is non-zero, the notification
  * is suppressed.  The notification is also dropped when the receive buffer
  * has no room for it or an allocation fails.
  */
 void
 sctp_notify_stream_reset_add(struct sctp_tcb *stcb, uint16_t numberin, uint16_t numberout, int flag)
 {
 	struct sctp_stream_change_event *ev;
 	struct sctp_queued_to_read *rq;
 	struct mbuf *m;
 	int peer_requested;
 
 	if (stcb == NULL) {
 		return;
 	}
 	if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_STREAM_CHANGEEVNT)) {
 		/* The event has not been enabled. */
 		return;
 	}
 	peer_requested = ((stcb->asoc.peer_req_out) && flag);
 	stcb->asoc.peer_req_out = 0;
 	if (peer_requested) {
 		/* Peer made the request, don't tell the local user */
 		return;
 	}
 	m = sctp_get_mbuf_for_msg(sizeof(struct sctp_stream_change_event), 0, M_NOWAIT, 1, MT_DATA);
 	if (m == NULL) {
 		/* Could not get an mbuf. */
 		return;
 	}
 	SCTP_BUF_LEN(m) = 0;
 	ev = mtod(m, struct sctp_stream_change_event *);
 	memset(ev, 0, sizeof(struct sctp_stream_change_event));
 	ev->strchange_type = SCTP_STREAM_CHANGE_EVENT;
 	ev->strchange_flags = flag;
 	ev->strchange_length = sizeof(struct sctp_stream_change_event);
 	ev->strchange_assoc_id = sctp_get_associd(stcb);
 	ev->strchange_instrms = numberin;
 	ev->strchange_outstrms = numberout;
 	SCTP_BUF_NEXT(m) = NULL;
 	SCTP_BUF_LEN(m) = sizeof(struct sctp_stream_change_event);
 	if (sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m)) {
 		/* The receive buffer has no room for the event. */
 		sctp_m_freem(m);
 		return;
 	}
 
 	/* Wrap the mbuf in a read queue entry and hand it to the socket. */
 	rq = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
 	    0, 0, stcb->asoc.context, 0, 0, 0, m);
 	if (rq == NULL) {
 		/* Out of memory: drop the notification. */
 		sctp_m_freem(m);
 		return;
 	}
 	rq->length = SCTP_BUF_LEN(m);
 	rq->spec_flags = M_NOTIFICATION;
 	/* not that we need this */
 	rq->tail_mbuf = m;
 	sctp_add_to_readq(stcb->sctp_ep, stcb, rq,
 	    &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD,
 	    SCTP_SO_NOT_LOCKED);
 }
 
 /*
  * Queue an SCTP_ASSOC_RESET_EVENT notification.
  *
  * sending_tsn is stored in assocreset_local_tsn, recv_tsn in
  * assocreset_remote_tsn and flag in assocreset_flags.
  *
  * The call does nothing when stcb is NULL or the
  * SCTP_PCB_FLAGS_ASSOC_RESETEVNT feature is off.  The notification is
  * dropped when the receive buffer has no room for it or an allocation fails.
  */
 void
 sctp_notify_stream_reset_tsn(struct sctp_tcb *stcb, uint32_t sending_tsn, uint32_t recv_tsn, int flag)
 {
 	struct sctp_assoc_reset_event *ev;
 	struct sctp_queued_to_read *rq;
 	struct mbuf *m;
 
 	if (stcb == NULL) {
 		return;
 	}
 	if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_ASSOC_RESETEVNT)) {
 		/* The event has not been enabled. */
 		return;
 	}
 	m = sctp_get_mbuf_for_msg(sizeof(struct sctp_assoc_reset_event), 0, M_NOWAIT, 1, MT_DATA);
 	if (m == NULL) {
 		/* Could not get an mbuf. */
 		return;
 	}
 	SCTP_BUF_LEN(m) = 0;
 	ev = mtod(m, struct sctp_assoc_reset_event *);
 	memset(ev, 0, sizeof(struct sctp_assoc_reset_event));
 	ev->assocreset_type = SCTP_ASSOC_RESET_EVENT;
 	ev->assocreset_flags = flag;
 	ev->assocreset_length = sizeof(struct sctp_assoc_reset_event);
 	ev->assocreset_assoc_id = sctp_get_associd(stcb);
 	ev->assocreset_local_tsn = sending_tsn;
 	ev->assocreset_remote_tsn = recv_tsn;
 	SCTP_BUF_NEXT(m) = NULL;
 	SCTP_BUF_LEN(m) = sizeof(struct sctp_assoc_reset_event);
 	if (sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m)) {
 		/* The receive buffer has no room for the event. */
 		sctp_m_freem(m);
 		return;
 	}
 
 	/* Wrap the mbuf in a read queue entry and hand it to the socket. */
 	rq = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
 	    0, 0, stcb->asoc.context, 0, 0, 0, m);
 	if (rq == NULL) {
 		/* Out of memory: drop the notification. */
 		sctp_m_freem(m);
 		return;
 	}
 	rq->length = SCTP_BUF_LEN(m);
 	rq->spec_flags = M_NOTIFICATION;
 	/* not that we need this */
 	rq->tail_mbuf = m;
 	sctp_add_to_readq(stcb->sctp_ep, stcb, rq,
 	    &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD,
 	    SCTP_SO_NOT_LOCKED);
 }
 
 
 
 /*
  * Queue an SCTP_STREAM_RESET_EVENT notification listing the affected
  * streams.
  *
  * list holds number_entries stream identifiers; each one is passed through
  * ntohs() before being stored in strreset_stream_list.  flag is stored in
  * strreset_flags.
  *
  * The call does nothing when stcb is NULL or the
  * SCTP_PCB_FLAGS_STREAM_RESETEVNT feature is off.  The notification is
  * dropped when it does not fit into the allocated mbuf, when the receive
  * buffer has no room for it, or when an allocation fails.
  */
 static void
 sctp_notify_stream_reset(struct sctp_tcb *stcb,
     int number_entries, uint16_t *list, int flag)
 {
 	struct sctp_stream_reset_event *ev;
 	struct sctp_queued_to_read *rq;
 	struct mbuf *m;
 	int ev_len;
 	int idx;
 
 	if (stcb == NULL) {
 		return;
 	}
 	if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_STREAM_RESETEVNT)) {
 		/* The event has not been enabled. */
 		return;
 	}
 
 	/* Fixed part plus one 16-bit slot per listed stream. */
 	ev_len = sizeof(struct sctp_stream_reset_event) + (number_entries * sizeof(uint16_t));
 
 	m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
 	if (m == NULL) {
 		/* Could not get an mbuf. */
 		return;
 	}
 	SCTP_BUF_LEN(m) = 0;
 	if (ev_len > M_TRAILINGSPACE(m)) {
 		/* never enough room */
 		sctp_m_freem(m);
 		return;
 	}
 	ev = mtod(m, struct sctp_stream_reset_event *);
 	memset(ev, 0, ev_len);
 	ev->strreset_type = SCTP_STREAM_RESET_EVENT;
 	ev->strreset_flags = flag;
 	ev->strreset_length = ev_len;
 	ev->strreset_assoc_id = sctp_get_associd(stcb);
 	for (idx = 0; idx < number_entries; idx++) {
 		ev->strreset_stream_list[idx] = ntohs(list[idx]);
 	}
 	SCTP_BUF_NEXT(m) = NULL;
 	SCTP_BUF_LEN(m) = ev_len;
 	if (sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m)) {
 		/* The receive buffer has no room for the event. */
 		sctp_m_freem(m);
 		return;
 	}
 
 	/* Wrap the mbuf in a read queue entry and hand it to the socket. */
 	rq = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
 	    0, 0, stcb->asoc.context, 0, 0, 0, m);
 	if (rq == NULL) {
 		/* Out of memory: drop the notification. */
 		sctp_m_freem(m);
 		return;
 	}
 	rq->length = SCTP_BUF_LEN(m);
 	rq->spec_flags = M_NOTIFICATION;
 	/* not that we need this */
 	rq->tail_mbuf = m;
 	sctp_add_to_readq(stcb->sctp_ep, stcb, rq,
 	    &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD,
 	    SCTP_SO_NOT_LOCKED);
 }
 
 
 /*
  * Queue an SCTP_REMOTE_ERROR notification.
  *
  * error is stored in sre_error.  When chunk is not NULL, up to
  * SCTP_CHUNK_BUFFER_SIZE bytes of it are copied into sre_data.  If the mbuf
  * for the full notification cannot be allocated, a second attempt is made
  * for the bare structure and the chunk copy is left out.
  *
  * The call does nothing when stcb is NULL or the SCTP_PCB_FLAGS_RECVPEERERR
  * feature is off.
  */
 static void
 sctp_notify_remote_error(struct sctp_tcb *stcb, uint16_t error, struct sctp_error_chunk *chunk)
 {
 	struct sctp_queued_to_read *rq;
 	struct sctp_remote_error *ev;
 	struct mbuf *m;
 	unsigned int ev_len;
 	uint16_t copy_len;
 
 	if (stcb == NULL) {
 		return;
 	}
 	if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVPEERERR)) {
 		return;
 	}
 
 	copy_len = 0;
 	if (chunk != NULL) {
 		copy_len = ntohs(chunk->ch.chunk_length);
 		/*
 		 * Only SCTP_CHUNK_BUFFER_SIZE are guaranteed to be
 		 * contiguous.
 		 */
 		if (copy_len > SCTP_CHUNK_BUFFER_SIZE) {
 			copy_len = SCTP_CHUNK_BUFFER_SIZE;
 		}
 	}
 
 	ev_len = (unsigned int)(sizeof(struct sctp_remote_error) + copy_len);
 	m = sctp_get_mbuf_for_msg(ev_len, 0, M_NOWAIT, 1, MT_DATA);
 	if (m == NULL) {
 		/* Retry with smaller value. */
 		ev_len = (unsigned int)sizeof(struct sctp_remote_error);
 		m = sctp_get_mbuf_for_msg(ev_len, 0, M_NOWAIT, 1, MT_DATA);
 	}
 	if (m == NULL) {
 		return;
 	}
 	SCTP_BUF_NEXT(m) = NULL;
 
 	ev = mtod(m, struct sctp_remote_error *);
 	memset(ev, 0, ev_len);
 	ev->sre_type = SCTP_REMOTE_ERROR;
 	ev->sre_flags = 0;
 	ev->sre_length = sizeof(struct sctp_remote_error);
 	ev->sre_error = error;
 	ev->sre_assoc_id = sctp_get_associd(stcb);
 	/* Only copy the chunk when the larger allocation succeeded. */
 	if (ev_len > sizeof(struct sctp_remote_error)) {
 		memcpy(ev->sre_data, chunk, copy_len);
 		ev->sre_length += copy_len;
 	}
 	SCTP_BUF_LEN(m) = ev->sre_length;
 
 	rq = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination,
 	    0, 0, stcb->asoc.context, 0, 0, 0, m);
 	if (rq == NULL) {
 		/* Out of memory: drop the notification. */
 		sctp_m_freem(m);
 		return;
 	}
 	rq->length = SCTP_BUF_LEN(m);
 	rq->spec_flags = M_NOTIFICATION;
 	/* not that we need this */
 	rq->tail_mbuf = m;
 	sctp_add_to_readq(stcb->sctp_ep, stcb, rq,
 	    &stcb->sctp_socket->so_rcv, 1,
 	    SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
 }
 
 
 /*
  * Dispatch an upper-layer notification to the matching helper.
  *
  * notification: one of the SCTP_NOTIFY_* codes handled below.
  * stcb:         association the notification is about.
  * error:        passed to the helper; for the stream reset codes it is
  *               passed as the number of list entries.
  * data:         code-specific payload (a struct sctp_nets for the interface
  *               codes, a chunk for the abort and remote error codes, a
  *               stream list for the stream reset codes, and so on).
  * so_locked:    passed through to the helpers that take it.
  *
  * Nothing is reported when stcb is NULL, when the socket is gone or closed,
  * or when the receive side of the socket can no longer receive.
  */
 void
 sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb,
     uint32_t error, void *data, int so_locked)
 {
 	int cookie_phase;
 
 	if (stcb == NULL) {
 		return;
 	}
 	if ((stcb->sctp_ep->sctp_flags &
 	    (SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_SOCKET_ALLGONE)) ||
 	    (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) {
 		/* If the socket is gone we are out of here */
 		return;
 	}
 	if (stcb->sctp_socket->so_rcv.sb_state & SBS_CANTRCVMORE) {
 		return;
 	}
 	/* True while the association is still in COOKIE-WAIT/COOKIE-ECHOED. */
 	cookie_phase = ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED));
 
 	switch (notification) {
 	case SCTP_NOTIFY_ASSOC_UP:
 		if (stcb->asoc.assoc_up_sent == 0) {
 			sctp_notify_assoc_change(SCTP_COMM_UP, stcb, error, NULL, 0, so_locked);
 			stcb->asoc.assoc_up_sent = 1;
 		}
 		if (stcb->asoc.adaptation_needed && (stcb->asoc.adaptation_sent == 0)) {
 			sctp_notify_adaptation_layer(stcb);
 		}
 		if (stcb->asoc.auth_supported == 0) {
 			sctp_ulp_notify(SCTP_NOTIFY_NO_PEER_AUTH, stcb, 0,
 			    NULL, so_locked);
 		}
 		break;
 	case SCTP_NOTIFY_ASSOC_DOWN:
 		sctp_notify_assoc_change(SCTP_SHUTDOWN_COMP, stcb, error, NULL, 0, so_locked);
 		break;
 	case SCTP_NOTIFY_INTERFACE_DOWN:
 	case SCTP_NOTIFY_INTERFACE_UP:
 	case SCTP_NOTIFY_INTERFACE_CONFIRMED:
 		{
 			struct sctp_nets *net;
 			uint32_t addr_state;
 
 			if (cookie_phase) {
 				/* Don't report these in front states */
 				break;
 			}
 			if (notification == SCTP_NOTIFY_INTERFACE_DOWN) {
 				addr_state = SCTP_ADDR_UNREACHABLE;
 			} else if (notification == SCTP_NOTIFY_INTERFACE_UP) {
 				addr_state = SCTP_ADDR_AVAILABLE;
 			} else {
 				addr_state = SCTP_ADDR_CONFIRMED;
 			}
 			net = (struct sctp_nets *)data;
 			sctp_notify_peer_addr_change(stcb, addr_state,
 			    (struct sockaddr *)&net->ro._l_addr, error, so_locked);
 			break;
 		}
 	case SCTP_NOTIFY_SPECIAL_SP_FAIL:
 		sctp_notify_send_failed2(stcb, error,
 		    (struct sctp_stream_queue_pending *)data, so_locked);
 		break;
 	case SCTP_NOTIFY_SENT_DG_FAIL:
 		sctp_notify_send_failed(stcb, 1, error,
 		    (struct sctp_tmit_chunk *)data, so_locked);
 		break;
 	case SCTP_NOTIFY_UNSENT_DG_FAIL:
 		sctp_notify_send_failed(stcb, 0, error,
 		    (struct sctp_tmit_chunk *)data, so_locked);
 		break;
 	case SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION:
 		{
 			uint32_t packed;
 
 			packed = *((uint32_t *)data);
 			sctp_notify_partial_delivery_indication(stcb, error, packed, so_locked);
 			break;
 		}
 	case SCTP_NOTIFY_ASSOC_LOC_ABORTED:
 		/* An abort before the association is up means it never started. */
 		if (cookie_phase) {
 			sctp_notify_assoc_change(SCTP_CANT_STR_ASSOC, stcb, error, data, 0, so_locked);
 		} else {
 			sctp_notify_assoc_change(SCTP_COMM_LOST, stcb, error, data, 0, so_locked);
 		}
 		break;
 	case SCTP_NOTIFY_ASSOC_REM_ABORTED:
 		if (cookie_phase) {
 			sctp_notify_assoc_change(SCTP_CANT_STR_ASSOC, stcb, error, data, 1, so_locked);
 		} else {
 			sctp_notify_assoc_change(SCTP_COMM_LOST, stcb, error, data, 1, so_locked);
 		}
 		break;
 	case SCTP_NOTIFY_ASSOC_RESTART:
 		sctp_notify_assoc_change(SCTP_RESTART, stcb, error, NULL, 0, so_locked);
 		if (stcb->asoc.auth_supported == 0) {
 			sctp_ulp_notify(SCTP_NOTIFY_NO_PEER_AUTH, stcb, 0,
 			    NULL, so_locked);
 		}
 		break;
 	case SCTP_NOTIFY_STR_RESET_SEND:
 		sctp_notify_stream_reset(stcb, error, ((uint16_t *)data),
 		    SCTP_STREAM_RESET_OUTGOING_SSN);
 		break;
 	case SCTP_NOTIFY_STR_RESET_RECV:
 		sctp_notify_stream_reset(stcb, error, ((uint16_t *)data),
 		    SCTP_STREAM_RESET_INCOMING);
 		break;
 	case SCTP_NOTIFY_STR_RESET_FAILED_OUT:
 		sctp_notify_stream_reset(stcb, error, ((uint16_t *)data),
 		    (SCTP_STREAM_RESET_OUTGOING_SSN | SCTP_STREAM_RESET_FAILED));
 		break;
 	case SCTP_NOTIFY_STR_RESET_DENIED_OUT:
 		sctp_notify_stream_reset(stcb, error, ((uint16_t *)data),
 		    (SCTP_STREAM_RESET_OUTGOING_SSN | SCTP_STREAM_RESET_DENIED));
 		break;
 	case SCTP_NOTIFY_STR_RESET_FAILED_IN:
 		sctp_notify_stream_reset(stcb, error, ((uint16_t *)data),
 		    (SCTP_STREAM_RESET_INCOMING | SCTP_STREAM_RESET_FAILED));
 		break;
 	case SCTP_NOTIFY_STR_RESET_DENIED_IN:
 		sctp_notify_stream_reset(stcb, error, ((uint16_t *)data),
 		    (SCTP_STREAM_RESET_INCOMING | SCTP_STREAM_RESET_DENIED));
 		break;
 	case SCTP_NOTIFY_ASCONF_ADD_IP:
 		sctp_notify_peer_addr_change(stcb, SCTP_ADDR_ADDED, data,
 		    error, so_locked);
 		break;
 	case SCTP_NOTIFY_ASCONF_DELETE_IP:
 		sctp_notify_peer_addr_change(stcb, SCTP_ADDR_REMOVED, data,
 		    error, so_locked);
 		break;
 	case SCTP_NOTIFY_ASCONF_SET_PRIMARY:
 		sctp_notify_peer_addr_change(stcb, SCTP_ADDR_MADE_PRIM, data,
 		    error, so_locked);
 		break;
 	case SCTP_NOTIFY_PEER_SHUTDOWN:
 		sctp_notify_shutdown_event(stcb);
 		break;
 	case SCTP_NOTIFY_AUTH_NEW_KEY:
 		/* For the AUTH codes, data carries a key number, not a pointer. */
 		sctp_notify_authentication(stcb, SCTP_AUTH_NEW_KEY, error,
 		    (uint16_t)(uintptr_t)data, so_locked);
 		break;
 	case SCTP_NOTIFY_AUTH_FREE_KEY:
 		sctp_notify_authentication(stcb, SCTP_AUTH_FREE_KEY, error,
 		    (uint16_t)(uintptr_t)data, so_locked);
 		break;
 	case SCTP_NOTIFY_NO_PEER_AUTH:
 		sctp_notify_authentication(stcb, SCTP_AUTH_NO_AUTH, error,
 		    (uint16_t)(uintptr_t)data, so_locked);
 		break;
 	case SCTP_NOTIFY_SENDER_DRY:
 		sctp_notify_sender_dry_event(stcb, so_locked);
 		break;
 	case SCTP_NOTIFY_REMOTE_ERROR:
 		sctp_notify_remote_error(stcb, error, data);
 		break;
 	default:
 		SCTPDBG(SCTP_DEBUG_UTIL1, "%s: unknown notification %xh (%u)\n",
 		    __func__, notification, notification);
 		break;
 	}
 }
 
 void
-sctp_report_all_outbound(struct sctp_tcb *stcb, uint16_t error, int holds_lock, int so_locked)
+sctp_report_all_outbound(struct sctp_tcb *stcb, uint16_t error, int so_locked)
 {
 	struct sctp_association *asoc;
 	struct sctp_stream_out *outs;
 	struct sctp_tmit_chunk *chk, *nchk;
 	struct sctp_stream_queue_pending *sp, *nsp;
 	int i;
 
 	if (stcb == NULL) {
 		return;
 	}
 	asoc = &stcb->asoc;
 	if (asoc->state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 		/* already being freed */
 		return;
 	}
 	if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
 	    (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
 	    (asoc->state & SCTP_STATE_CLOSED_SOCKET)) {
 		return;
 	}
 	/* now through all the gunk freeing chunks */
-	if (holds_lock == 0) {
-		SCTP_TCB_SEND_LOCK(stcb);
-	}
 	/* sent queue SHOULD be empty */
 	TAILQ_FOREACH_SAFE(chk, &asoc->sent_queue, sctp_next, nchk) {
 		TAILQ_REMOVE(&asoc->sent_queue, chk, sctp_next);
 		asoc->sent_queue_cnt--;
 		if (chk->sent != SCTP_DATAGRAM_NR_ACKED) {
 			if (asoc->strmout[chk->rec.data.sid].chunks_on_queues > 0) {
 				asoc->strmout[chk->rec.data.sid].chunks_on_queues--;
 #ifdef INVARIANTS
 			} else {
 				panic("No chunks on the queues for sid %u.", chk->rec.data.sid);
 #endif
 			}
 		}
 		if (chk->data != NULL) {
 			sctp_free_bufspace(stcb, asoc, chk, 1);
 			sctp_ulp_notify(SCTP_NOTIFY_SENT_DG_FAIL, stcb,
 			    error, chk, so_locked);
 			if (chk->data) {
 				sctp_m_freem(chk->data);
 				chk->data = NULL;
 			}
 		}
 		sctp_free_a_chunk(stcb, chk, so_locked);
 		/* sa_ignore FREED_MEMORY */
 	}
 	/* pending send queue SHOULD be empty */
 	TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) {
 		TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next);
 		asoc->send_queue_cnt--;
 		if (asoc->strmout[chk->rec.data.sid].chunks_on_queues > 0) {
 			asoc->strmout[chk->rec.data.sid].chunks_on_queues--;
 #ifdef INVARIANTS
 		} else {
 			panic("No chunks on the queues for sid %u.", chk->rec.data.sid);
 #endif
 		}
 		if (chk->data != NULL) {
 			sctp_free_bufspace(stcb, asoc, chk, 1);
 			sctp_ulp_notify(SCTP_NOTIFY_UNSENT_DG_FAIL, stcb,
 			    error, chk, so_locked);
 			if (chk->data) {
 				sctp_m_freem(chk->data);
 				chk->data = NULL;
 			}
 		}
 		sctp_free_a_chunk(stcb, chk, so_locked);
 		/* sa_ignore FREED_MEMORY */
 	}
 	for (i = 0; i < asoc->streamoutcnt; i++) {
 		/* For each stream */
 		outs = &asoc->strmout[i];
 		/* clean up any sends there */
 		TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) {
 			atomic_subtract_int(&asoc->stream_queue_cnt, 1);
 			TAILQ_REMOVE(&outs->outqueue, sp, next);
 			stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, 1);
 			sctp_free_spbufspace(stcb, asoc, sp);
 			if (sp->data) {
 				sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb,
 				    error, (void *)sp, so_locked);
 				if (sp->data) {
 					sctp_m_freem(sp->data);
 					sp->data = NULL;
 					sp->tail_mbuf = NULL;
 					sp->length = 0;
 				}
 			}
 			if (sp->net) {
 				sctp_free_remote_addr(sp->net);
 				sp->net = NULL;
 			}
 			/* Free the chunk */
 			sctp_free_a_strmoq(stcb, sp, so_locked);
 			/* sa_ignore FREED_MEMORY */
 		}
 	}
-
-	if (holds_lock == 0) {
-		SCTP_TCB_SEND_UNLOCK(stcb);
-	}
 }
 
 void
 sctp_abort_notification(struct sctp_tcb *stcb, uint8_t from_peer, uint16_t error,
     struct sctp_abort_chunk *abort, int so_locked)
 {
 	if (stcb == NULL) {
 		return;
 	}
 	if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 	    ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
 	    (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_CONNECTED))) {
 		stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_WAS_ABORTED;
 	}
 	if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
 	    (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
 	    (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) {
 		return;
 	}
+	SCTP_TCB_SEND_LOCK(stcb);
+	SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_WAS_ABORTED);
 	/* Tell them we lost the asoc */
-	sctp_report_all_outbound(stcb, error, 0, so_locked);
+	sctp_report_all_outbound(stcb, error, so_locked);
+	SCTP_TCB_SEND_UNLOCK(stcb);
 	if (from_peer) {
 		sctp_ulp_notify(SCTP_NOTIFY_ASSOC_REM_ABORTED, stcb, error, abort, so_locked);
 	} else {
 		sctp_ulp_notify(SCTP_NOTIFY_ASSOC_LOC_ABORTED, stcb, error, abort, so_locked);
 	}
 }
 
 void
 sctp_abort_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
     struct mbuf *m, int iphlen,
     struct sockaddr *src, struct sockaddr *dst,
     struct sctphdr *sh, struct mbuf *op_err,
     uint8_t mflowtype, uint32_t mflowid,
     uint32_t vrf_id, uint16_t port)
 {
 	uint32_t vtag;
 
 	vtag = 0;
 	if (stcb != NULL) {
 		vtag = stcb->asoc.peer_vtag;
 		vrf_id = stcb->asoc.vrf_id;
 	}
 	sctp_send_abort(m, iphlen, src, dst, sh, vtag, op_err,
 	    mflowtype, mflowid, inp->fibnum,
 	    vrf_id, port);
 	if (stcb != NULL) {
 		/* We have a TCB to abort, send notification too */
 		sctp_abort_notification(stcb, 0, 0, NULL, SCTP_SO_NOT_LOCKED);
-		SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_WAS_ABORTED);
 		/* Ok, now lets free it */
 		SCTP_STAT_INCR_COUNTER32(sctps_aborted);
 		if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) ||
 		    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
 			SCTP_STAT_DECR_GAUGE32(sctps_currestab);
 		}
 		(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 		    SCTP_FROM_SCTPUTIL + SCTP_LOC_4);
 	}
 }
 #ifdef SCTP_ASOCLOG_OF_TSNS
 /*
  * Print the association's inbound and outbound TSN logs together with the
  * endpoint's last abort code.  The body is only compiled when NOSIY_PRINTS
  * is defined; otherwise the function does nothing.
  */
 void
 sctp_print_out_track_log(struct sctp_tcb *stcb)
 {
 #ifdef NOSIY_PRINTS
 	struct sctp_association *asoc;
 	int idx;
 
 	asoc = &stcb->asoc;
 	SCTP_PRINTF("Last ep reason:%x\n", stcb->sctp_ep->last_abort_code);
 
 	/* Inbound log: the wrapped (older) part first, then [0, tsn_in_at). */
 	SCTP_PRINTF("IN bound TSN log-aaa\n");
 	if ((asoc->tsn_in_at == 0) && (asoc->tsn_in_wrapped == 0)) {
 		SCTP_PRINTF("None rcvd\n");
 	} else {
 		if (asoc->tsn_in_wrapped) {
 			for (idx = asoc->tsn_in_at; idx < SCTP_TSN_LOG_SIZE; idx++) {
 				SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n",
 				    asoc->in_tsnlog[idx].tsn,
 				    asoc->in_tsnlog[idx].strm,
 				    asoc->in_tsnlog[idx].seq,
 				    asoc->in_tsnlog[idx].flgs,
 				    asoc->in_tsnlog[idx].sz);
 			}
 		}
 		if (asoc->tsn_in_at) {
 			for (idx = 0; idx < asoc->tsn_in_at; idx++) {
 				SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n",
 				    asoc->in_tsnlog[idx].tsn,
 				    asoc->in_tsnlog[idx].strm,
 				    asoc->in_tsnlog[idx].seq,
 				    asoc->in_tsnlog[idx].flgs,
 				    asoc->in_tsnlog[idx].sz);
 			}
 		}
 	}
 
 	/* Outbound log, printed in the same order as the inbound one. */
 	SCTP_PRINTF("OUT bound TSN log-aaa\n");
 	if ((asoc->tsn_out_at == 0) && (asoc->tsn_out_wrapped == 0)) {
 		SCTP_PRINTF("None sent\n");
 	}
 	if (asoc->tsn_out_wrapped) {
 		for (idx = asoc->tsn_out_at; idx < SCTP_TSN_LOG_SIZE; idx++) {
 			SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n",
 			    asoc->out_tsnlog[idx].tsn,
 			    asoc->out_tsnlog[idx].strm,
 			    asoc->out_tsnlog[idx].seq,
 			    asoc->out_tsnlog[idx].flgs,
 			    asoc->out_tsnlog[idx].sz);
 		}
 	}
 	if (asoc->tsn_out_at) {
 		for (idx = 0; idx < asoc->tsn_out_at; idx++) {
 			SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n",
 			    asoc->out_tsnlog[idx].tsn,
 			    asoc->out_tsnlog[idx].strm,
 			    asoc->out_tsnlog[idx].seq,
 			    asoc->out_tsnlog[idx].flgs,
 			    asoc->out_tsnlog[idx].sz);
 		}
 	}
 #endif
 }
 #endif
 
 void
 sctp_abort_an_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
     struct mbuf *op_err,
     int so_locked)
 {
 
 	if (stcb == NULL) {
 		/* Got to have a TCB */
 		if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
 			if (LIST_EMPTY(&inp->sctp_asoc_list)) {
 				sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
 				    SCTP_CALLED_DIRECTLY_NOCMPSET);
 			}
 		}
 		return;
-	} else {
-		SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_WAS_ABORTED);
 	}
 	/* notify the peer */
 	sctp_send_abort_tcb(stcb, op_err, so_locked);
 	SCTP_STAT_INCR_COUNTER32(sctps_aborted);
 	if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) ||
 	    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
 		SCTP_STAT_DECR_GAUGE32(sctps_currestab);
 	}
 	/* notify the ulp */
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) {
 		sctp_abort_notification(stcb, 0, 0, NULL, so_locked);
 	}
 	/* now free the asoc */
 #ifdef SCTP_ASOCLOG_OF_TSNS
 	sctp_print_out_track_log(stcb);
 #endif
 	(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 	    SCTP_FROM_SCTPUTIL + SCTP_LOC_5);
 }
 
 void
 sctp_handle_ootb(struct mbuf *m, int iphlen, int offset,
     struct sockaddr *src, struct sockaddr *dst,
     struct sctphdr *sh, struct sctp_inpcb *inp,
     struct mbuf *cause,
     uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum,
     uint32_t vrf_id, uint16_t port)
 {
 	struct sctp_chunkhdr *ch, chunk_buf;
 	unsigned int chk_length;
 	int contains_init_chunk;
 
 	SCTP_STAT_INCR_COUNTER32(sctps_outoftheblue);
 	/* Generate a TO address for future reference */
 	if (inp && (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) {
 		if (LIST_EMPTY(&inp->sctp_asoc_list)) {
 			sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
 			    SCTP_CALLED_DIRECTLY_NOCMPSET);
 		}
 	}
 	contains_init_chunk = 0;
 	ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset,
 	    sizeof(*ch), (uint8_t *)&chunk_buf);
 	while (ch != NULL) {
 		chk_length = ntohs(ch->chunk_length);
 		if (chk_length < sizeof(*ch)) {
 			/* break to abort land */
 			break;
 		}
 		switch (ch->chunk_type) {
 		case SCTP_INIT:
 			contains_init_chunk = 1;
 			break;
 		case SCTP_PACKET_DROPPED:
 			/* we don't respond to pkt-dropped */
 			return;
 		case SCTP_ABORT_ASSOCIATION:
 			/* we don't respond with an ABORT to an ABORT */
 			return;
 		case SCTP_SHUTDOWN_COMPLETE:
 			/*
 			 * we ignore it since we are not waiting for it and
 			 * peer is gone
 			 */
 			return;
 		case SCTP_SHUTDOWN_ACK:
 			sctp_send_shutdown_complete2(src, dst, sh,
 			    mflowtype, mflowid, fibnum,
 			    vrf_id, port);
 			return;
 		default:
 			break;
 		}
 		offset += SCTP_SIZE32(chk_length);
 		ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset,
 		    sizeof(*ch), (uint8_t *)&chunk_buf);
 	}
 	if ((SCTP_BASE_SYSCTL(sctp_blackhole) == 0) ||
 	    ((SCTP_BASE_SYSCTL(sctp_blackhole) == 1) &&
 	    (contains_init_chunk == 0))) {
 		sctp_send_abort(m, iphlen, src, dst, sh, 0, cause,
 		    mflowtype, mflowid, fibnum,
 		    vrf_id, port);
 	}
 }
 
 /*
  * check the inbound datagram to make sure there is not an abort inside it,
  * if there is return 1, else return 0.
  */
 int
 sctp_is_there_an_abort_here(struct mbuf *m, int iphlen, uint32_t *vtagfill)
 {
 	struct sctp_chunkhdr *ch;
 	struct sctp_init_chunk *init_chk, chunk_buf;
 	int offset;
 	unsigned int chk_length;
 
 	offset = iphlen + sizeof(struct sctphdr);
 	ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, sizeof(*ch),
 	    (uint8_t *)&chunk_buf);
 	while (ch != NULL) {
 		chk_length = ntohs(ch->chunk_length);
 		if (chk_length < sizeof(*ch)) {
 			/* packet is probably corrupt */
 			break;
 		}
 		/* we seem to be ok, is it an abort? */
 		if (ch->chunk_type == SCTP_ABORT_ASSOCIATION) {
 			/* yep, tell them */
 			return (1);
 		}
 		if (ch->chunk_type == SCTP_INITIATION) {
 			/* need to update the Vtag */
 			init_chk = (struct sctp_init_chunk *)sctp_m_getptr(m,
 			    offset, sizeof(*init_chk), (uint8_t *)&chunk_buf);
 			if (init_chk != NULL) {
 				*vtagfill = ntohl(init_chk->init.initiate_tag);
 			}
 		}
 		/* Nope, move to the next chunk */
 		offset += SCTP_SIZE32(chk_length);
 		ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset,
 		    sizeof(*ch), (uint8_t *)&chunk_buf);
 	}
 	return (0);
 }
 
 /*
  * Currently (2/02), ifa_addr embeds the scope id and does not have
  * sin6_scope_id set (i.e. it is 0), so this function exists to compare
  * link-local scopes.  Returns 1 when both addresses end up with the same
  * scope id and 0 otherwise, including when a missing scope id cannot be
  * recovered.
  */
 #ifdef INET6
 uint32_t
 sctp_is_same_scope(struct sockaddr_in6 *addr1, struct sockaddr_in6 *addr2)
 {
 	struct sockaddr_in6 lhs, rhs;
 
 	/* work on copies so the callers' addresses stay untouched */
 	lhs = *addr1;
 	rhs = *addr2;
 
 	if ((lhs.sin6_scope_id == 0) && sa6_recoverscope(&lhs)) {
 		/* can't get scope, so can't match */
 		return (0);
 	}
 	if ((rhs.sin6_scope_id == 0) && sa6_recoverscope(&rhs)) {
 		/* can't get scope, so can't match */
 		return (0);
 	}
 	if (lhs.sin6_scope_id == rhs.sin6_scope_id) {
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Returns a sockaddr_in6 with the embedded scope recovered and removed.
  *
  * Only link-local AF_INET6 addresses are touched:
  *  - with sin6_scope_id == 0, the address is copied into 'store' and the
  *    scope is recovered there; 'store' is returned when that succeeds,
  *  - with a non-zero sin6_scope_id, the embedded scope is cleared in place
  *    in 'addr' itself.
  * In every other case 'addr' is returned unchanged.
  */
 struct sockaddr_in6 *
 sctp_recover_scope(struct sockaddr_in6 *addr, struct sockaddr_in6 *store)
 {
 	if (addr->sin6_family != AF_INET6) {
 		return (addr);
 	}
 	if (!IN6_IS_SCOPE_LINKLOCAL(&addr->sin6_addr)) {
 		return (addr);
 	}
 	if (addr->sin6_scope_id != 0) {
 		/* scope id already present: strip the embedded scope junk */
 		in6_clearscope(&addr->sin6_addr);
 		return (addr);
 	}
 	*store = *addr;
 	if (sa6_recoverscope(store) == 0) {
 		/* use the recovered scope */
 		return (store);
 	}
 	/* recovery failed, return the original "to" addr */
 	return (addr);
 }
 #endif
 
 /*
  * Are the two addresses the same?  This is currently a "scopeless" check.
  * Returns 1 if they are the same and 0 if not.  A NULL argument, differing
  * address families, or a family that is not compiled in all yield 0.
  */
 int
 sctp_cmpaddr(struct sockaddr *sa1, struct sockaddr *sa2)
 {
 	/* both must be valid and of the same family */
 	if ((sa1 == NULL) || (sa2 == NULL) ||
 	    (sa1->sa_family != sa2->sa_family)) {
 		return (0);
 	}
 #ifdef INET6
 	if (sa1->sa_family == AF_INET6) {
 		/* IPv6 addresses */
 		struct sockaddr_in6 *lhs6, *rhs6;
 
 		lhs6 = (struct sockaddr_in6 *)sa1;
 		rhs6 = (struct sockaddr_in6 *)sa2;
 		return (SCTP6_ARE_ADDR_EQUAL(lhs6, rhs6));
 	}
 #endif
 #ifdef INET
 	if (sa1->sa_family == AF_INET) {
 		/* IPv4 addresses */
 		struct sockaddr_in *lhs4, *rhs4;
 
 		lhs4 = (struct sockaddr_in *)sa1;
 		rhs4 = (struct sockaddr_in *)sa2;
 		return (lhs4->sin_addr.s_addr == rhs4->sin_addr.s_addr);
 	}
 #endif
 	/* we don't do these... */
 	return (0);
 }
 
 /*
  * Print a one-line description of the socket address via SCTP_PRINTF:
  * address, port and scope id for IPv6; dotted quad and port for IPv4.
  * A family that is not handled prints "?".
  */
 void
 sctp_print_address(struct sockaddr *sa)
 {
 #ifdef INET6
 	if (sa->sa_family == AF_INET6) {
 		char ip6buf[INET6_ADDRSTRLEN];
 		struct sockaddr_in6 *sin6;
 
 		sin6 = (struct sockaddr_in6 *)sa;
 		SCTP_PRINTF("IPv6 address: %s:port:%d scope:%u\n",
 		    ip6_sprintf(ip6buf, &sin6->sin6_addr),
 		    ntohs(sin6->sin6_port),
 		    sin6->sin6_scope_id);
 		return;
 	}
 #endif
 #ifdef INET
 	if (sa->sa_family == AF_INET) {
 		struct sockaddr_in *sin;
 		unsigned char *octet;
 
 		sin = (struct sockaddr_in *)sa;
 		/* view the address as raw bytes to print it octet by octet */
 		octet = (unsigned char *)&sin->sin_addr;
 		SCTP_PRINTF("IPv4 address: %u.%u.%u.%u:%d\n",
 		    octet[0], octet[1], octet[2], octet[3], ntohs(sin->sin_port));
 		return;
 	}
 #endif
 	SCTP_PRINTF("?\n");
 }
 
 void
 sctp_pull_off_control_to_new_inp(struct sctp_inpcb *old_inp,
     struct sctp_inpcb *new_inp,
     struct sctp_tcb *stcb,
     int waitflags)
 {
 	/*
 	 * go through our old INP and pull off any control structures that
 	 * belong to stcb and move then to the new inp.
 	 */
 	struct socket *old_so, *new_so;
 	struct sctp_queued_to_read *control, *nctl;
 	struct sctp_readhead tmp_queue;
 	struct mbuf *m;
 	int error = 0;
 
 	old_so = old_inp->sctp_socket;
 	new_so = new_inp->sctp_socket;
 	TAILQ_INIT(&tmp_queue);
 	error = sblock(&old_so->so_rcv, waitflags);
 	if (error) {
 		/*
 		 * Gak, can't get sblock, we have a problem. data will be
 		 * left stranded.. and we don't dare look at it since the
 		 * other thread may be reading something. Oh well, its a
 		 * screwed up app that does a peeloff OR a accept while
 		 * reading from the main socket... actually its only the
 		 * peeloff() case, since I think read will fail on a
 		 * listening socket..
 		 */
 		return;
 	}
 	/* lock the socket buffers */
 	SCTP_INP_READ_LOCK(old_inp);
 	TAILQ_FOREACH_SAFE(control, &old_inp->read_queue, next, nctl) {
 		/* Pull off all for out target stcb */
 		if (control->stcb == stcb) {
 			/* remove it we want it */
 			TAILQ_REMOVE(&old_inp->read_queue, control, next);
 			TAILQ_INSERT_TAIL(&tmp_queue, control, next);
 			m = control->data;
 			while (m) {
 				if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
 					sctp_sblog(&old_so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, SCTP_BUF_LEN(m));
 				}
 				sctp_sbfree(control, stcb, &old_so->so_rcv, m);
 				if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
 					sctp_sblog(&old_so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0);
 				}
 				m = SCTP_BUF_NEXT(m);
 			}
 		}
 	}
 	SCTP_INP_READ_UNLOCK(old_inp);
 	/* Remove the sb-lock on the old socket */
 
 	sbunlock(&old_so->so_rcv);
 	/* Now we move them over to the new socket buffer */
 	SCTP_INP_READ_LOCK(new_inp);
 	TAILQ_FOREACH_SAFE(control, &tmp_queue, next, nctl) {
 		TAILQ_INSERT_TAIL(&new_inp->read_queue, control, next);
 		m = control->data;
 		while (m) {
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
 				sctp_sblog(&new_so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(m));
 			}
 			sctp_sballoc(stcb, &new_so->so_rcv, m);
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
 				sctp_sblog(&new_so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0);
 			}
 			m = SCTP_BUF_NEXT(m);
 		}
 	}
 	SCTP_INP_READ_UNLOCK(new_inp);
 }
 
 /*
  * Wake up readers of the endpoint's socket.  Nothing happens when there is
  * no endpoint or the endpoint has no socket; stcb and so_locked are not
  * used by this implementation.
  */
 void
 sctp_wakeup_the_read_socket(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     int so_locked
     SCTP_UNUSED
 )
 {
 	if (inp == NULL) {
 		return;
 	}
 	if (inp->sctp_socket == NULL) {
 		return;
 	}
 	sctp_sorwakeup(inp, inp->sctp_socket);
 }
 
 void
 sctp_add_to_readq(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     struct sctp_queued_to_read *control,
     struct sockbuf *sb,
     int end,
     int inp_read_lock_held,
     int so_locked)
 {
 	/*
 	 * Here we must place the control on the end of the socket read
 	 * queue AND increment sb_cc so that select will work properly on
 	 * read.
 	 */
 	struct mbuf *m, *prev = NULL;
 
 	if (inp == NULL) {
 		/* Gak, TSNH!! */
 #ifdef INVARIANTS
 		panic("Gak, inp NULL on add_to_readq");
 #endif
 		return;
 	}
 	if (inp_read_lock_held == 0)
 		SCTP_INP_READ_LOCK(inp);
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ) {
 		if (!control->on_strm_q) {
 			sctp_free_remote_addr(control->whoFrom);
 			if (control->data) {
 				sctp_m_freem(control->data);
 				control->data = NULL;
 			}
 			sctp_free_a_readq(stcb, control);
 		}
 		if (inp_read_lock_held == 0)
 			SCTP_INP_READ_UNLOCK(inp);
 		return;
 	}
 	if (!(control->spec_flags & M_NOTIFICATION)) {
 		atomic_add_int(&inp->total_recvs, 1);
 		if (!control->do_not_ref_stcb) {
 			atomic_add_int(&stcb->total_recvs, 1);
 		}
 	}
 	m = control->data;
 	control->held_length = 0;
 	control->length = 0;
 	while (m) {
 		if (SCTP_BUF_LEN(m) == 0) {
 			/* Skip mbufs with NO length */
 			if (prev == NULL) {
 				/* First one */
 				control->data = sctp_m_free(m);
 				m = control->data;
 			} else {
 				SCTP_BUF_NEXT(prev) = sctp_m_free(m);
 				m = SCTP_BUF_NEXT(prev);
 			}
 			if (m == NULL) {
 				control->tail_mbuf = prev;
 			}
 			continue;
 		}
 		prev = m;
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
 			sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(m));
 		}
 		sctp_sballoc(stcb, sb, m);
 		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
 			sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0);
 		}
 		atomic_add_int(&control->length, SCTP_BUF_LEN(m));
 		m = SCTP_BUF_NEXT(m);
 	}
 	if (prev != NULL) {
 		control->tail_mbuf = prev;
 	} else {
 		/* Everything got collapsed out?? */
 		if (!control->on_strm_q) {
 			sctp_free_remote_addr(control->whoFrom);
 			sctp_free_a_readq(stcb, control);
 		}
 		if (inp_read_lock_held == 0)
 			SCTP_INP_READ_UNLOCK(inp);
 		return;
 	}
 	if (end) {
 		control->end_added = 1;
 	}
 	TAILQ_INSERT_TAIL(&inp->read_queue, control, next);
 	control->on_read_q = 1;
 	if (inp_read_lock_held == 0)
 		SCTP_INP_READ_UNLOCK(inp);
 	if (inp && inp->sctp_socket) {
 		sctp_wakeup_the_read_socket(inp, stcb, so_locked);
 	}
 }
 
 /*************HOLD THIS COMMENT FOR PATCH FILE OF
  *************ALTERNATE ROUTING CODE
  */
 
 /*************HOLD THIS COMMENT FOR END OF PATCH FILE OF
  *************ALTERNATE ROUTING CODE
  */
 
 /*
  * Build an mbuf holding a generic error cause: a header with 'code' and
  * the total length (both in network byte order) followed by the bytes of
  * the string 'info' without its terminating NUL.
  *
  * Returns NULL when code is 0, info is NULL, info does not fit into
  * SCTP_MAX_CAUSE_LENGTH together with the header, or no mbuf is available.
  */
 struct mbuf *
 sctp_generate_cause(uint16_t code, char *info)
 {
 	struct sctp_gen_error_cause *cause;
 	struct mbuf *m;
 	size_t info_len;
 	uint16_t len;
 
 	if (code == 0) {
 		return (NULL);
 	}
 	if (info == NULL) {
 		return (NULL);
 	}
 	info_len = strlen(info);
 	if (info_len > (SCTP_MAX_CAUSE_LENGTH - sizeof(struct sctp_paramhdr))) {
 		return (NULL);
 	}
 	len = (uint16_t)(sizeof(struct sctp_paramhdr) + info_len);
 	m = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA);
 	if (m == NULL) {
 		return (NULL);
 	}
 	SCTP_BUF_LEN(m) = len;
 	cause = mtod(m, struct sctp_gen_error_cause *);
 	cause->code = htons(code);
 	cause->length = htons(len);
 	memcpy(cause->info, info, info_len);
 	return (m);
 }
 
 /*
  * Build an mbuf holding a "no user data" error cause for the given TSN.
  * Code, length and TSN are stored in network byte order.  Returns NULL when
  * no mbuf is available.
  */
 struct mbuf *
 sctp_generate_no_user_data_cause(uint32_t tsn)
 {
 	struct sctp_error_no_user_data *nud;
 	struct mbuf *m;
 	uint16_t len;
 
 	len = (uint16_t)sizeof(struct sctp_error_no_user_data);
 	m = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA);
 	if (m == NULL) {
 		return (NULL);
 	}
 	SCTP_BUF_LEN(m) = len;
 	nud = mtod(m, struct sctp_error_no_user_data *);
 	nud->cause.code = htons(SCTP_CAUSE_NO_USER_DATA);
 	nud->cause.length = htons(len);
 	nud->tsn = htonl(tsn);
 	return (m);
 }
 
 #ifdef SCTP_MBCNT_LOGGING
 /*
  * Give back the output-queue accounting held by chunk tp1: lower the chunk
  * count by chk_cnt and subtract tp1->book_size from the association's
  * total output queue size and, for TCP-model endpoints with a socket, from
  * the send buffer's sb_cc.  Both byte counters are clamped at zero.  A
  * chunk without data is ignored.
  */
 void
 sctp_free_bufspace(struct sctp_tcb *stcb, struct sctp_association *asoc,
     struct sctp_tmit_chunk *tp1, int chk_cnt)
 {
 	if (tp1->data == NULL) {
 		return;
 	}
 	asoc->chunks_on_out_queue -= chk_cnt;
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBCNT_LOGGING_ENABLE) {
 		sctp_log_mbcnt(SCTP_LOG_MBCNT_DECREASE,
 		    asoc->total_output_queue_size,
 		    tp1->book_size,
 		    0,
 		    tp1->mbcnt);
 	}
 	if (asoc->total_output_queue_size < tp1->book_size) {
 		/* would underflow, clamp instead */
 		asoc->total_output_queue_size = 0;
 	} else {
 		atomic_add_int(&asoc->total_output_queue_size, -tp1->book_size);
 	}
 
 	/* the socket send buffer is only adjusted for TCP-model endpoints */
 	if ((stcb->sctp_socket == NULL) ||
 	    ((stcb->sctp_ep->sctp_flags &
 	    (SCTP_PCB_FLAGS_IN_TCPPOOL | SCTP_PCB_FLAGS_TCPTYPE)) == 0)) {
 		return;
 	}
 	if (stcb->sctp_socket->so_snd.sb_cc < tp1->book_size) {
 		stcb->sctp_socket->so_snd.sb_cc = 0;
 	} else {
 		stcb->sctp_socket->so_snd.sb_cc -= tp1->book_size;
 	}
 }
 
 #endif
 
 /*
  * Abandon the message that chunk tp1 belongs to.
  *
  * Starting at tp1 and following the queue linkage, every fragment up to the
  * end of the message is marked SCTP_FORWARD_TSN_SKIP; its data is freed,
  * its buffer space is released and the ULP is told about the failure.  If
  * the end of the message is not reached that way, the search continues on
  * the send queue (matching chunks are moved to the sent queue) and finally
  * on the stream's out queue, where the first pending entry is marked
  * discard_rest and a chunk carrying the LAST_FRAG flag is set up if none
  * was found before.
  *
  * 'sent' selects the SENT vs. UNSENT flavour of the ULP notification and,
  * together with the FIRST_FRAG flag of tp1, which abandoned counters are
  * incremented.
  *
  * Returns the number of bytes released: the sum of the book_size of the
  * chunks visited plus the length of any stream-queue data discarded.  The
  * socket is woken up for writing if anything was freed.
  */
 int
 sctp_release_pr_sctp_chunk(struct sctp_tcb *stcb, struct sctp_tmit_chunk *tp1,
     uint8_t sent, int so_locked)
 {
 	struct sctp_stream_out *strq;
 	struct sctp_tmit_chunk *chk = NULL, *tp2;
 	struct sctp_stream_queue_pending *sp;
 	uint32_t mid;
 	uint16_t sid;
 	uint8_t foundeom = 0;
 	int ret_sz = 0;
 	int notdone;
 	int do_wakeup_routine = 0;
 
 	/* remember which message we are abandoning before tp1 is advanced */
 	sid = tp1->rec.data.sid;
 	mid = tp1->rec.data.mid;
 	/* slot 0 counts all abandoned messages, the policy slot counts per policy */
 	if (sent || !(tp1->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG)) {
 		stcb->asoc.abandoned_sent[0]++;
 		stcb->asoc.abandoned_sent[PR_SCTP_POLICY(tp1->flags)]++;
 		stcb->asoc.strmout[sid].abandoned_sent[0]++;
 #if defined(SCTP_DETAILED_STR_STATS)
 		stcb->asoc.strmout[sid].abandoned_sent[PR_SCTP_POLICY(tp1->flags)]++;
 #endif
 	} else {
 		stcb->asoc.abandoned_unsent[0]++;
 		stcb->asoc.abandoned_unsent[PR_SCTP_POLICY(tp1->flags)]++;
 		stcb->asoc.strmout[sid].abandoned_unsent[0]++;
 #if defined(SCTP_DETAILED_STR_STATS)
 		stcb->asoc.strmout[sid].abandoned_unsent[PR_SCTP_POLICY(tp1->flags)]++;
 #endif
 	}
 	/* Pass 1: walk forward from tp1 until the end of the message. */
 	do {
 		ret_sz += tp1->book_size;
 		if (tp1->data != NULL) {
 			if (tp1->sent < SCTP_DATAGRAM_RESEND) {
 				sctp_flight_size_decrease(tp1);
 				sctp_total_flight_decrease(stcb, tp1);
 			}
 			sctp_free_bufspace(stcb, &stcb->asoc, tp1, 1);
 			stcb->asoc.peers_rwnd += tp1->send_size;
 			stcb->asoc.peers_rwnd += SCTP_BASE_SYSCTL(sctp_peer_chunk_oh);
 			if (sent) {
 				sctp_ulp_notify(SCTP_NOTIFY_SENT_DG_FAIL, stcb, 0, tp1, so_locked);
 			} else {
 				sctp_ulp_notify(SCTP_NOTIFY_UNSENT_DG_FAIL, stcb, 0, tp1, so_locked);
 			}
 			/* re-check: data is tested again after the notification */
 			if (tp1->data) {
 				sctp_m_freem(tp1->data);
 				tp1->data = NULL;
 			}
 			do_wakeup_routine = 1;
 			if (PR_SCTP_BUF_ENABLED(tp1->flags)) {
 				stcb->asoc.sent_queue_cnt_removeable--;
 			}
 		}
 		tp1->sent = SCTP_FORWARD_TSN_SKIP;
 		if ((tp1->rec.data.rcv_flags & SCTP_DATA_NOT_FRAG) ==
 		    SCTP_DATA_NOT_FRAG) {
 			/* not fragmented, we are done */
 			notdone = 0;
 			foundeom = 1;
 		} else if (tp1->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) {
 			/* end of frag, we are done */
 			notdone = 0;
 			foundeom = 1;
 		} else {
 			/*
 			 * Its a begin or middle piece, we must mark all of
 			 * it
 			 */
 			notdone = 1;
 			tp1 = TAILQ_NEXT(tp1, sctp_next);
 		}
 	} while (tp1 && notdone);
 	/* Pass 2: the rest of the message may still sit on the send queue. */
 	if (foundeom == 0) {
 		/*
 		 * The multi-part message was scattered across the send and
 		 * sent queue.
 		 */
 		TAILQ_FOREACH_SAFE(tp1, &stcb->asoc.send_queue, sctp_next, tp2) {
 			/* stop at the first chunk of a different stream or message */
 			if ((tp1->rec.data.sid != sid) ||
 			    (!SCTP_MID_EQ(stcb->asoc.idata_supported, tp1->rec.data.mid, mid))) {
 				break;
 			}
 			/*
 			 * save to chk in case we have some on stream out
 			 * queue. If so and we have an un-transmitted one we
 			 * don't have to fudge the TSN.
 			 */
 			chk = tp1;
 			ret_sz += tp1->book_size;
 			sctp_free_bufspace(stcb, &stcb->asoc, tp1, 1);
 			if (sent) {
 				sctp_ulp_notify(SCTP_NOTIFY_SENT_DG_FAIL, stcb, 0, tp1, so_locked);
 			} else {
 				sctp_ulp_notify(SCTP_NOTIFY_UNSENT_DG_FAIL, stcb, 0, tp1, so_locked);
 			}
 			if (tp1->data) {
 				sctp_m_freem(tp1->data);
 				tp1->data = NULL;
 			}
 			/* No flight involved here book the size to 0 */
 			tp1->book_size = 0;
 			if (tp1->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) {
 				foundeom = 1;
 			}
 			do_wakeup_routine = 1;
 			tp1->sent = SCTP_FORWARD_TSN_SKIP;
 			TAILQ_REMOVE(&stcb->asoc.send_queue, tp1, sctp_next);
 			/*
 			 * on to the sent queue so we can wait for it to be
 			 * passed by.
 			 */
 			TAILQ_INSERT_TAIL(&stcb->asoc.sent_queue, tp1,
 			    sctp_next);
 			stcb->asoc.send_queue_cnt--;
 			stcb->asoc.sent_queue_cnt++;
 		}
 	}
 	/* Pass 3: the tail of the message was never turned into chunks. */
 	if (foundeom == 0) {
 		/*
 		 * Still no eom found. That means there is stuff left on the
 		 * stream out queue.. yuck.
 		 */
 		SCTP_TCB_SEND_LOCK(stcb);
 		strq = &stcb->asoc.strmout[sid];
 		sp = TAILQ_FIRST(&strq->outqueue);
 		if (sp != NULL) {
 			sp->discard_rest = 1;
 			/*
 			 * We may need to put a chunk on the queue that
 			 * holds the TSN that would have been sent with the
 			 * LAST bit.
 			 */
 			if (chk == NULL) {
 				/* Yep, we have to */
 				sctp_alloc_a_chunk(stcb, chk);
 				if (chk == NULL) {
 					/*
 					 * we are hosed. All we can do is
 					 * nothing.. which will cause an
 					 * abort if the peer is paying
 					 * attention.
 					 */
 					goto oh_well;
 				}
 				memset(chk, 0, sizeof(*chk));
 				chk->rec.data.rcv_flags = 0;
 				chk->sent = SCTP_FORWARD_TSN_SKIP;
 				chk->asoc = &stcb->asoc;
 				/* pick the message id the pending message would have used */
 				if (stcb->asoc.idata_supported == 0) {
 					if (sp->sinfo_flags & SCTP_UNORDERED) {
 						chk->rec.data.mid = 0;
 					} else {
 						chk->rec.data.mid = strq->next_mid_ordered;
 					}
 				} else {
 					if (sp->sinfo_flags & SCTP_UNORDERED) {
 						chk->rec.data.mid = strq->next_mid_unordered;
 					} else {
 						chk->rec.data.mid = strq->next_mid_ordered;
 					}
 				}
 				chk->rec.data.sid = sp->sid;
 				chk->rec.data.ppid = sp->ppid;
 				chk->rec.data.context = sp->context;
 				chk->flags = sp->act_flags;
 				chk->whoTo = NULL;
 				/* consume the next sending sequence number as this chunk's TSN */
 				chk->rec.data.tsn = atomic_fetchadd_int(&stcb->asoc.sending_seq, 1);
 				strq->chunks_on_queues++;
 				TAILQ_INSERT_TAIL(&stcb->asoc.sent_queue, chk, sctp_next);
 				stcb->asoc.sent_queue_cnt++;
 				stcb->asoc.pr_sctp_cnt++;
 			}
 			chk->rec.data.rcv_flags |= SCTP_DATA_LAST_FRAG;
 			if (sp->sinfo_flags & SCTP_UNORDERED) {
 				chk->rec.data.rcv_flags |= SCTP_DATA_UNORDERED;
 			}
 			/* the abandoned message used up its message id */
 			if (stcb->asoc.idata_supported == 0) {
 				if ((sp->sinfo_flags & SCTP_UNORDERED) == 0) {
 					strq->next_mid_ordered++;
 				}
 			} else {
 				if (sp->sinfo_flags & SCTP_UNORDERED) {
 					strq->next_mid_unordered++;
 				} else {
 					strq->next_mid_ordered++;
 				}
 			}
 	oh_well:
 			if (sp->data) {
 				/*
 				 * Pull any data to free up the SB and allow
 				 * sender to "add more" while we will throw
 				 * away :-)
 				 */
 				sctp_free_spbufspace(stcb, &stcb->asoc, sp);
 				ret_sz += sp->length;
 				do_wakeup_routine = 1;
 				sp->some_taken = 1;
 				sctp_m_freem(sp->data);
 				sp->data = NULL;
 				sp->tail_mbuf = NULL;
 				sp->length = 0;
 			}
 		}
 		SCTP_TCB_SEND_UNLOCK(stcb);
 	}
 	if (do_wakeup_routine) {
 		sctp_sowwakeup(stcb->sctp_ep, stcb->sctp_socket);
 	}
 	return (ret_sz);
 }
 
 /*
  * Checks to see if the given address, addr, is one that is currently known
  * on the endpoint's address list and returns the matching sctp_ifa, or NULL
  * if there is none.  The INP read lock is taken around the search unless
  * holds_lock is non-zero.
  * Note: can't distinguish the same address on multiple interfaces and
  * doesn't handle multiple addresses with different zone/scope id's.
  * Note: ifa_ifwithaddr() compares the entire sockaddr struct.
  */
 struct sctp_ifa *
 sctp_find_ifa_in_ep(struct sctp_inpcb *inp, struct sockaddr *addr,
     int holds_lock)
 {
 	struct sctp_laddr *entry;
 
 	if (holds_lock == 0) {
 		SCTP_INP_RLOCK(inp);
 	}
 
 	LIST_FOREACH(entry, &inp->sctp_addr_list, sctp_nxt_addr) {
 		if ((entry->ifa == NULL) ||
 		    (addr->sa_family != entry->ifa->address.sa.sa_family)) {
 			continue;
 		}
 #ifdef INET
 		if ((addr->sa_family == AF_INET) &&
 		    (((struct sockaddr_in *)addr)->sin_addr.s_addr ==
 		    entry->ifa->address.sin.sin_addr.s_addr)) {
 			/* found him. */
 			break;
 		}
 #endif
 #ifdef INET6
 		if ((addr->sa_family == AF_INET6) &&
 		    SCTP6_ARE_ADDR_EQUAL((struct sockaddr_in6 *)addr,
 		    &entry->ifa->address.sin6)) {
 			/* found him. */
 			break;
 		}
 #endif
 	}
 	if (holds_lock == 0) {
 		SCTP_INP_RUNLOCK(inp);
 	}
 	/* entry is NULL when the list was exhausted without a match */
 	return ((entry != NULL) ? entry->ifa : NULL);
 }
 
 /*
  * Reduce the network address stored in addr to a 32-bit hash value.
  *
  * AF_INET:  the 32-bit address XORed with itself shifted right by 16.
  * AF_INET6: the four 32-bit words of the address are summed (wrapping) and
  *           the sum is folded the same way.
  * Any other family, or a family that is compiled out, hashes to 0.
  */
 uint32_t
 sctp_get_ifa_hash_val(struct sockaddr *addr)
 {
 	uint32_t hash = 0;
 
 	switch (addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		hash = ((struct sockaddr_in *)addr)->sin_addr.s_addr;
 		hash ^= (hash >> 16);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		{
 			struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)addr;
 
 			hash = v6->sin6_addr.s6_addr32[0];
 			hash += v6->sin6_addr.s6_addr32[1];
 			hash += v6->sin6_addr.s6_addr32[2];
 			hash += v6->sin6_addr.s6_addr32[3];
 			hash ^= (hash >> 16);
 			break;
 		}
 #endif
 	default:
 		break;
 	}
 	return (hash);
 }
 
 /*
  * Look up addr in the per-VRF address hash table of the VRF identified by
  * vrf_id and return the matching sctp_ifa, or NULL when the VRF does not
  * exist or no entry in the selected bucket matches.
  *
  * The bucket is chosen with sctp_get_ifa_hash_val() masked by the VRF's
  * hash mark; entries are compared by address family and raw IPv4/IPv6
  * address only.
  *
  * When holds_lock is zero the global address read lock is taken for the
  * lookup and dropped before returning; otherwise lock ownership is only
  * asserted.
  */
 struct sctp_ifa *
 sctp_find_ifa_by_addr(struct sockaddr *addr, uint32_t vrf_id, int holds_lock)
 {
 	struct sctp_ifa *match = NULL;
 	struct sctp_ifalist *bucket;
 	struct sctp_vrf *vrf;
 	uint32_t hash;
 
 	if (holds_lock == 0) {
 		SCTP_IPI_ADDR_RLOCK();
 	} else {
 		SCTP_IPI_ADDR_LOCK_ASSERT();
 	}
 
 	vrf = sctp_find_vrf(vrf_id);
 	if (vrf == NULL) {
 		goto done;
 	}
 
 	hash = sctp_get_ifa_hash_val(addr);
 	bucket = &vrf->vrf_addr_hash[(hash & vrf->vrf_addr_hashmark)];
 	if (bucket == NULL) {
 		/* Defensive check kept as is: report the lookup and give up. */
 		SCTP_PRINTF("hash_of_addr:%x mask:%x table:%x - ",
 		    hash, (uint32_t)vrf->vrf_addr_hashmark,
 		    (uint32_t)(hash & vrf->vrf_addr_hashmark));
 		sctp_print_address(addr);
 		SCTP_PRINTF("No such bucket for address\n");
 		goto done;
 	}
 
 	/*
 	 * A "break" leaves match on the entry that was found; a complete
 	 * walk of the bucket ends with match == NULL.
 	 */
 	LIST_FOREACH(match, bucket, next_bucket) {
 		if (match->address.sa.sa_family != addr->sa_family) {
 			continue;
 		}
 #ifdef INET
 		if ((addr->sa_family == AF_INET) &&
 		    (((struct sockaddr_in *)addr)->sin_addr.s_addr ==
 		    match->address.sin.sin_addr.s_addr)) {
 			/* IPv4 address matches. */
 			break;
 		}
 #endif
 #ifdef INET6
 		if ((addr->sa_family == AF_INET6) &&
 		    SCTP6_ARE_ADDR_EQUAL((struct sockaddr_in6 *)addr,
 		    &match->address.sin6)) {
 			/* IPv6 address matches. */
 			break;
 		}
 #endif
 	}
 done:
 	if (holds_lock == 0) {
 		SCTP_IPI_ADDR_RUNLOCK();
 	}
 	return (match);
 }
 
 /*
  * The reader has pulled data out of the receive buffer; decide whether the
  * receive window has grown enough to be worth announcing to the peer.
  *
  * stcb         - association the data belonged to (NULL is a no-op).
  * freed_so_far - bytes freed by the reader; folded into
  *                stcb->freed_by_sorcv_sincelast and reset to 0 once the
  *                preliminary state checks pass.
  * hold_rlock   - non-zero when the caller holds the INP read lock; it is
  *                dropped while the SACK is sent and re-taken before return.
  * rwnd_req     - minimum growth of the window that triggers an update.
  *
  * A reference on the association is held for the duration of the call.
  */
 static void
 sctp_user_rcvd(struct sctp_tcb *stcb, uint32_t *freed_so_far, int hold_rlock,
     uint32_t rwnd_req)
 {
 	struct epoch_tracker et;
 	struct socket *so = NULL;
 	uint32_t new_rwnd, growth;
 	int rlock_dropped = 0;
 
 	if (stcb == NULL) {
 		return;
 	}
 
 	atomic_add_int(&stcb->asoc.refcnt, 1);
 
 	/* Pre-check: no window update for an association that is going away. */
 	if ((SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) ||
 	    (stcb->asoc.state & (SCTP_STATE_ABOUT_TO_BE_FREED | SCTP_STATE_SHUTDOWN_RECEIVED))) {
 		goto no_lock;
 	}
 	SCTP_INP_INCR_REF(stcb->sctp_ep);
 	if ((stcb->sctp_ep->sctp_flags &
 	    (SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_SOCKET_ALLGONE)) != 0) {
 		goto out;
 	}
 	so = stcb->sctp_socket;
 	if (so == NULL) {
 		goto out;
 	}
 
 	/* Hand the caller's running count over to the association. */
 	atomic_add_int(&stcb->freed_by_sorcv_sincelast, *freed_so_far);
 	*freed_so_far = 0;
 
 	/* How far has the window opened since we last reported it? */
 	new_rwnd = sctp_calc_rwnd(stcb, &stcb->asoc);
 	growth = 0;
 	if (new_rwnd >= stcb->asoc.my_last_reported_rwnd) {
 		growth = new_rwnd - stcb->asoc.my_last_reported_rwnd;
 	}
 	if (growth < rwnd_req) {
 		/* Not worth a SACK yet; just record how much is pending. */
 		stcb->freed_by_sorcv_sincelast = growth;
 		goto out;
 	}
 
 	if (hold_rlock) {
 		SCTP_INP_READ_UNLOCK(stcb->sctp_ep);
 		rlock_dropped = 1;
 	}
 	/*
 	 * Unlocked last look before taking the TCB lock: the association may
 	 * have started being freed in the meantime.
 	 */
 	if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 		goto out;
 	}
 	SCTP_TCB_LOCK(stcb);
 	if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 		/* Same check again under the lock; nothing gets reported. */
 		SCTP_TCB_UNLOCK(stcb);
 		goto out;
 	}
 	SCTP_STAT_INCR(sctps_wu_sacks_sent);
 	NET_EPOCH_ENTER(et);
 	sctp_send_sack(stcb, SCTP_SO_LOCKED);
 
 	sctp_chunk_output(stcb->sctp_ep, stcb,
 	    SCTP_OUTPUT_FROM_USR_RCVD, SCTP_SO_LOCKED);
 	NET_EPOCH_EXIT(et);
 	/* The SACK is out, so make sure no receive timer is left running. */
 	sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL,
 	    SCTP_FROM_SCTPUTIL + SCTP_LOC_6);
 	SCTP_TCB_UNLOCK(stcb);
 out:
 	/* Give the caller back the read lock it entered with. */
 	if (so && rlock_dropped && hold_rlock) {
 		SCTP_INP_READ_LOCK(stcb->sctp_ep);
 	}
 
 	SCTP_INP_DECR_REF(stcb->sctp_ep);
 no_lock:
 	atomic_add_int(&stcb->asoc.refcnt, -1);
 }
 
 /*
  * SCTP receive path: deliver (part of) one entry of the endpoint's read
  * queue to the caller.
  *
  * so            - socket to read from; so->so_pcb supplies the sctp_inpcb.
  * uio           - must be non-NULL. In copy mode (mp == NULL) data is moved
  *                 out with uiomove() until uio_resid is used up or the end
  *                 of the message is reached. In mbuf mode uio_resid is
  *                 overwritten with the length of the chain handed back.
  * mp            - when non-NULL, receives the selected entry's whole mbuf
  *                 chain instead of a copy; cannot be combined with MSG_PEEK.
  * from, fromlen - when from != NULL and fromlen > 0 the sender's address is
  *                 copied out, truncated to fromlen bytes.
  * msg_flags     - optional. On input MSG_PEEK, MSG_OOB, MSG_DONTWAIT and
  *                 MSG_NBIO are examined. On every exit that passes the
  *                 "out" label it is overwritten with the output flags
  *                 (a combination of MSG_EOR, MSG_NOTIFICATION, MSG_TRUNC).
  * sinfo         - when non-NULL and filling_sinfo != 0 it is filled from the
  *                 selected entry. With the EXT_RCVINFO or RECVNXTINFO
  *                 feature on, the buffer is accessed as a
  *                 struct sctp_extrcvinfo (presumably the caller must supply
  *                 one that large - TODO confirm against the callers).
  * filling_sinfo - see sinfo; also gates the search for an alternative entry
  *                 when fragment interleave is on.
  *
  * Returns 0 on success (including EOF), otherwise an errno value: EINVAL,
  * EOPNOTSUPP, EFAULT, ECONNRESET, ENOTCONN, EWOULDBLOCK, a pending
  * so->so_error, or whatever sblock(), sbwait() or uiomove() reported.
  *
  * Lock bookkeeping: hold_sblock tracks SOCKBUF_LOCK, hold_rlock tracks
  * SCTP_INP_READ_LOCK and sockbuf_lock tracks the sblock() taken on entry;
  * the exit labels release whatever is still flagged as held.
  */
 int
 sctp_sorecvmsg(struct socket *so,
     struct uio *uio,
     struct mbuf **mp,
     struct sockaddr *from,
     int fromlen,
     int *msg_flags,
     struct sctp_sndrcvinfo *sinfo,
     int filling_sinfo)
 {
 	/*
 	 * MSG flags we will look at MSG_DONTWAIT - non-blocking IO.
 	 * MSG_PEEK - Look don't touch :-D (only valid with OUT mbuf copy
 	 * mp=NULL thus uio is the copy method to userland) MSG_WAITALL - ??
 	 * On the way out we may send out any combination of:
 	 * MSG_NOTIFICATION MSG_EOR
 	 *
 	 */
 	struct sctp_inpcb *inp = NULL;
 	ssize_t my_len = 0;
 	ssize_t cp_len = 0;
 	int error = 0;
 	struct sctp_queued_to_read *control = NULL, *ctl = NULL, *nxt = NULL;
 	struct mbuf *m = NULL;
 	struct sctp_tcb *stcb = NULL;
 	int wakeup_read_socket = 0;
 	int freecnt_applied = 0;
 	int out_flags = 0, in_flags = 0;
 	int block_allowed = 1;
 	uint32_t freed_so_far = 0;
 	ssize_t copied_so_far = 0;
 	int in_eeor_mode = 0;
 	int no_rcv_needed = 0;
 	uint32_t rwnd_req = 0;
 	int hold_sblock = 0;
 	int hold_rlock = 0;
 	ssize_t slen = 0;
 	uint32_t held_length = 0;
 	int sockbuf_lock = 0;
 
 	if (uio == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 		return (EINVAL);
 	}
 
 	if (msg_flags) {
 		in_flags = *msg_flags;
 		if (in_flags & MSG_PEEK)
 			SCTP_STAT_INCR(sctps_read_peeks);
 	} else {
 		in_flags = 0;
 	}
 	/* Remember the requested size; only used for the exit-time log entry. */
 	slen = uio->uio_resid;
 
 	/* Pull in and set up our int flags */
 	if (in_flags & MSG_OOB) {
 		/* Out of band's NOT supported */
 		return (EOPNOTSUPP);
 	}
 	/* Peeking is only possible in copy mode, not when handing back mbufs. */
 	if ((in_flags & MSG_PEEK) && (mp != NULL)) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 		return (EINVAL);
 	}
 	if ((in_flags & (MSG_DONTWAIT
 	    | MSG_NBIO
 	    )) ||
 	    SCTP_SO_IS_NBIO(so)) {
 		block_allowed = 0;
 	}
 	/* setup the endpoint */
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, EFAULT);
 		return (EFAULT);
 	}
 	/*
 	 * rwnd_req is the amount of freed buffer space after which
 	 * sctp_user_rcvd() is asked to consider a window update.
 	 */
 	rwnd_req = (SCTP_SB_LIMIT_RCV(so) >> SCTP_RWND_HIWAT_SHIFT);
 	/* Must be at least a MTU's worth */
 	if (rwnd_req < SCTP_MIN_RWND)
 		rwnd_req = SCTP_MIN_RWND;
 	in_eeor_mode = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR);
 	/*
 	 * NOTE(review): the two log calls below sit under the same condition
 	 * and differ only in the event id and the third argument.
 	 */
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) {
 		sctp_misc_ints(SCTP_SORECV_ENTER,
 		    rwnd_req, in_eeor_mode, so->so_rcv.sb_cc, (uint32_t)uio->uio_resid);
 	}
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) {
 		sctp_misc_ints(SCTP_SORECV_ENTERPL,
 		    rwnd_req, block_allowed, so->so_rcv.sb_cc, (uint32_t)uio->uio_resid);
 	}
 
 
 	/*
 	 * Take the receive buffer's sblock(); it is kept until the release
 	 * label. SBL_WAIT is only passed when blocking is allowed.
 	 */
 	error = sblock(&so->so_rcv, (block_allowed ? SBL_WAIT : 0));
 	if (error) {
 		goto release_unlocked;
 	}
 	sockbuf_lock = 1;
 restart:
 
 restart_nosblocks:
 	/*
 	 * Phase 1: with the sockbuf mutex held, decide whether there is
 	 * anything to read, an error/EOF to report, or whether to sleep.
 	 */
 	if (hold_sblock == 0) {
 		SOCKBUF_LOCK(&so->so_rcv);
 		hold_sblock = 1;
 	}
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) {
 		goto out;
 	}
 	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) && (so->so_rcv.sb_cc == 0)) {
 		if (so->so_error) {
 			error = so->so_error;
 			/* A peek must not consume the pending error. */
 			if ((in_flags & MSG_PEEK) == 0)
 				so->so_error = 0;
 			goto out;
 		} else {
 			if (so->so_rcv.sb_cc == 0) {
 				/* indicate EOF */
 				error = 0;
 				goto out;
 			}
 		}
 	}
 	/*
 	 * held_length is the byte count we previously decided not to deliver
 	 * (see the "control->length == 0" case below); nothing beyond that
 	 * means there is nothing new to read.
 	 */
 	if (so->so_rcv.sb_cc <= held_length) {
 		if (so->so_error) {
 			error = so->so_error;
 			if ((in_flags & MSG_PEEK) == 0) {
 				so->so_error = 0;
 			}
 			goto out;
 		}
 		if ((so->so_rcv.sb_cc == 0) &&
 		    ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 		    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) {
 			if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0) {
 				/*
 				 * For active open side clear flags for
 				 * re-use passive open is blocked by
 				 * connect.
 				 */
 				if (inp->sctp_flags & SCTP_PCB_FLAGS_WAS_ABORTED) {
 					/*
 					 * You were aborted, passive side
 					 * always hits here
 					 */
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ECONNRESET);
 					error = ECONNRESET;
 				}
 				so->so_state &= ~(SS_ISCONNECTING |
 				    SS_ISDISCONNECTING |
 				    SS_ISCONFIRMING |
 				    SS_ISCONNECTED);
 				if (error == 0) {
 					if ((inp->sctp_flags & SCTP_PCB_FLAGS_WAS_CONNECTED) == 0) {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOTCONN);
 						error = ENOTCONN;
 					}
 				}
 				goto out;
 			}
 		}
 		if (block_allowed) {
 			/* Sleep until the receive buffer changes, then re-evaluate. */
 			error = sbwait(&so->so_rcv);
 			if (error) {
 				goto out;
 			}
 			held_length = 0;
 			goto restart_nosblocks;
 		} else {
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EWOULDBLOCK);
 			error = EWOULDBLOCK;
 			goto out;
 		}
 	}
 	if (hold_sblock == 1) {
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		hold_sblock = 0;
 	}
 	/*
 	 * Phase 2: pick the read-queue entry ("control") to deliver from.
 	 */
 	/* we possibly have data we can read */
 	/* sa_ignore FREED_MEMORY */
 	control = TAILQ_FIRST(&inp->read_queue);
 	if (control == NULL) {
 		/*
 		 * This could be happening since the appender did the
 		 * increment but has not yet done the tailq insert onto the
 		 * read_queue
 		 */
 		/*
 		 * NOTE(review): hold_rlock is not set here although the lock
 		 * is taken; the unconditional unlock below balances it.
 		 */
 		if (hold_rlock == 0) {
 			SCTP_INP_READ_LOCK(inp);
 		}
 		control = TAILQ_FIRST(&inp->read_queue);
 		if ((control == NULL) && (so->so_rcv.sb_cc != 0)) {
 #ifdef INVARIANTS
 			panic("Huh, its non zero and nothing on control?");
 #endif
 			so->so_rcv.sb_cc = 0;
 		}
 		SCTP_INP_READ_UNLOCK(inp);
 		hold_rlock = 0;
 		goto restart;
 	}
 
 	if ((control->length == 0) &&
 	    (control->do_not_ref_stcb)) {
 		/*
 		 * Clean up code for freeing assoc that left behind a
 		 * pdapi.. maybe a peer in EEOR that just closed after
 		 * sending and never indicated a EOR.
 		 */
 		if (hold_rlock == 0) {
 			hold_rlock = 1;
 			SCTP_INP_READ_LOCK(inp);
 		}
 		control->held_length = 0;
 		if (control->data) {
 			/* Hmm there is data here .. fix */
 			struct mbuf *m_tmp;
 			int cnt = 0;
 
 			/*
 			 * Recompute the length from the mbuf chain and mark
 			 * the entry complete at its last mbuf.
 			 */
 			m_tmp = control->data;
 			while (m_tmp) {
 				cnt += SCTP_BUF_LEN(m_tmp);
 				if (SCTP_BUF_NEXT(m_tmp) == NULL) {
 					control->tail_mbuf = m_tmp;
 					control->end_added = 1;
 				}
 				m_tmp = SCTP_BUF_NEXT(m_tmp);
 			}
 			control->length = cnt;
 		} else {
 			/* remove it */
 			TAILQ_REMOVE(&inp->read_queue, control, next);
 			/* Add back any hidden data */
 			sctp_free_remote_addr(control->whoFrom);
 			sctp_free_a_readq(stcb, control);
 		}
 		if (hold_rlock) {
 			hold_rlock = 0;
 			SCTP_INP_READ_UNLOCK(inp);
 		}
 		goto restart;
 	}
 	if ((control->length == 0) &&
 	    (control->end_added == 1)) {
 		/*
 		 * Do we also need to check for (control->pdapi_aborted ==
 		 * 1)?
 		 */
 		/* Empty but complete entry: drop it from the queue and retry. */
 		if (hold_rlock == 0) {
 			hold_rlock = 1;
 			SCTP_INP_READ_LOCK(inp);
 		}
 		TAILQ_REMOVE(&inp->read_queue, control, next);
 		if (control->data) {
 #ifdef INVARIANTS
 			panic("control->data not null but control->length == 0");
 #else
 			SCTP_PRINTF("Strange, data left in the control buffer. Cleaning up.\n");
 			sctp_m_freem(control->data);
 			control->data = NULL;
 #endif
 		}
 		if (control->aux_data) {
 			sctp_m_free(control->aux_data);
 			control->aux_data = NULL;
 		}
 #ifdef INVARIANTS
 		if (control->on_strm_q) {
 			panic("About to free ctl:%p so:%p and its in %d",
 			    control, so, control->on_strm_q);
 		}
 #endif
 		sctp_free_remote_addr(control->whoFrom);
 		sctp_free_a_readq(stcb, control);
 		if (hold_rlock) {
 			hold_rlock = 0;
 			SCTP_INP_READ_UNLOCK(inp);
 		}
 		goto restart;
 	}
 	if (control->length == 0) {
 		/*
 		 * The head entry is an unfinished message with nothing to
 		 * read right now; with fragment interleave on, look further
 		 * down the queue for something deliverable.
 		 */
 		if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE)) &&
 		    (filling_sinfo)) {
 			/* find a more suitable one than this */
 			ctl = TAILQ_NEXT(control, next);
 			while (ctl) {
 				if ((ctl->stcb != control->stcb) && (ctl->length) &&
 				    (ctl->some_taken ||
 				    (ctl->spec_flags & M_NOTIFICATION) ||
 				    ((ctl->do_not_ref_stcb == 0) &&
 				    (ctl->stcb->asoc.strmin[ctl->sinfo_stream].delivery_started == 0)))
 				    ) {
 					/*-
 					 * If we have a different TCB next, and there is data
 					 * present. If we have already taken some (pdapi), OR we can
 					 * ref the tcb and no delivery as started on this stream, we
 					 * take it. Note we allow a notification on a different
 					 * assoc to be delivered..
 					 */
 					control = ctl;
 					goto found_one;
 				} else if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS)) &&
 					    (ctl->length) &&
 					    ((ctl->some_taken) ||
 					    ((ctl->do_not_ref_stcb == 0) &&
 					    ((ctl->spec_flags & M_NOTIFICATION) == 0) &&
 				    (ctl->stcb->asoc.strmin[ctl->sinfo_stream].delivery_started == 0)))) {
 					/*-
 					 * If we have the same tcb, and there is data present, and we
 					 * have the strm interleave feature present. Then if we have
 					 * taken some (pdapi) or we can refer to that tcb AND we have
 					 * not started a delivery for this stream, we can take it.
 					 * Note we do NOT allow a notification on the same assoc to
 					 * be delivered.
 					 */
 					control = ctl;
 					goto found_one;
 				}
 				ctl = TAILQ_NEXT(ctl, next);
 			}
 		}
 		/*
 		 * if we reach here, not suitable replacement is available
 		 * <or> fragment interleave is NOT on. So stuff the sb_cc
 		 * into the our held count, and its time to sleep again.
 		 */
 		held_length = so->so_rcv.sb_cc;
 		control->held_length = so->so_rcv.sb_cc;
 		goto restart;
 	}
 	/* Clear the held length since there is something to read */
 	control->held_length = 0;
 found_one:
 	/*
 	 * If we reach here, control has a some data for us to read off.
 	 * Note that stcb COULD be NULL.
 	 */
 	if (hold_rlock == 0) {
 		hold_rlock = 1;
 		SCTP_INP_READ_LOCK(inp);
 	}
 	control->some_taken++;
 	stcb = control->stcb;
 	if (stcb) {
 		if ((control->do_not_ref_stcb == 0) &&
 		    (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED)) {
 			/* Association is going away: do not touch it unless we already hold a reference. */
 			if (freecnt_applied == 0)
 				stcb = NULL;
 		} else if (control->do_not_ref_stcb == 0) {
 			/* you can't free it on me please */
 			/*
 			 * The lock on the socket buffer protects us so the
 			 * free code will stop. But since we used the
 			 * socketbuf lock and the sender uses the tcb_lock
 			 * to increment, we need to use the atomic add to
 			 * the refcnt
 			 */
 			if (freecnt_applied) {
 #ifdef INVARIANTS
 				panic("refcnt already incremented");
 #else
 				SCTP_PRINTF("refcnt already incremented?\n");
 #endif
 			} else {
 				atomic_add_int(&stcb->asoc.refcnt, 1);
 				freecnt_applied = 1;
 			}
 			/*
 			 * Setup to remember how much we have not yet told
 			 * the peer our rwnd has opened up. Note we grab the
 			 * value from the tcb from last time. Note too that
 			 * sack sending clears this when a sack is sent,
 			 * which is fine. Once we hit the rwnd_req, we then
 			 * will go to the sctp_user_rcvd() that will not
 			 * lock until it KNOWs it MUST send a WUP-SACK.
 			 */
 			freed_so_far = (uint32_t)stcb->freed_by_sorcv_sincelast;
 			stcb->freed_by_sorcv_sincelast = 0;
 		}
 	}
 	/* Mark the stream as being in the middle of a delivery. */
 	if (stcb &&
 	    ((control->spec_flags & M_NOTIFICATION) == 0) &&
 	    control->do_not_ref_stcb == 0) {
 		stcb->asoc.strmin[control->sinfo_stream].delivery_started = 1;
 	}
 
 	/* First lets get off the sinfo and sockaddr info */
 	if ((sinfo != NULL) && (filling_sinfo != 0)) {
 		sinfo->sinfo_stream = control->sinfo_stream;
 		sinfo->sinfo_ssn = (uint16_t)control->mid;
 		sinfo->sinfo_flags = control->sinfo_flags;
 		sinfo->sinfo_ppid = control->sinfo_ppid;
 		sinfo->sinfo_context = control->sinfo_context;
 		sinfo->sinfo_timetolive = control->sinfo_timetolive;
 		sinfo->sinfo_tsn = control->sinfo_tsn;
 		sinfo->sinfo_cumtsn = control->sinfo_cumtsn;
 		sinfo->sinfo_assoc_id = control->sinfo_assoc_id;
 		nxt = TAILQ_NEXT(control, next);
 		if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO) ||
 		    sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO)) {
 			struct sctp_extrcvinfo *s_extra;
 
 			/* Describe the following queue entry, if it has data. */
 			s_extra = (struct sctp_extrcvinfo *)sinfo;
 			if ((nxt) &&
 			    (nxt->length)) {
 				s_extra->serinfo_next_flags = SCTP_NEXT_MSG_AVAIL;
 				if (nxt->sinfo_flags & SCTP_UNORDERED) {
 					s_extra->serinfo_next_flags |= SCTP_NEXT_MSG_IS_UNORDERED;
 				}
 				if (nxt->spec_flags & M_NOTIFICATION) {
 					s_extra->serinfo_next_flags |= SCTP_NEXT_MSG_IS_NOTIFICATION;
 				}
 				s_extra->serinfo_next_aid = nxt->sinfo_assoc_id;
 				s_extra->serinfo_next_length = nxt->length;
 				s_extra->serinfo_next_ppid = nxt->sinfo_ppid;
 				s_extra->serinfo_next_stream = nxt->sinfo_stream;
 				if (nxt->tail_mbuf != NULL) {
 					if (nxt->end_added) {
 						s_extra->serinfo_next_flags |= SCTP_NEXT_MSG_ISCOMPLETE;
 					}
 				}
 			} else {
 				/*
 				 * we explicitly 0 this, since the memcpy
 				 * got some other things beyond the older
 				 * sinfo_ that is on the control's structure
 				 * :-D
 				 */
 				nxt = NULL;
 				s_extra->serinfo_next_flags = SCTP_NO_NEXT_MSG;
 				s_extra->serinfo_next_aid = 0;
 				s_extra->serinfo_next_length = 0;
 				s_extra->serinfo_next_ppid = 0;
 				s_extra->serinfo_next_stream = 0;
 			}
 		}
 		/*
 		 * update off the real current cum-ack, if we have an stcb.
 		 */
 		if ((control->do_not_ref_stcb == 0) && stcb)
 			sinfo->sinfo_cumtsn = stcb->asoc.cumulative_tsn;
 		/*
 		 * mask off the high bits, we keep the actual chunk bits in
 		 * there.
 		 */
 		sinfo->sinfo_flags &= 0x00ff;
 		if ((control->sinfo_flags >> 8) & SCTP_DATA_UNORDERED) {
 			sinfo->sinfo_flags |= SCTP_UNORDERED;
 		}
 	}
 #ifdef SCTP_ASOCLOG_OF_TSNS
 	{
 		int index, newindex;
 		struct sctp_pcbtsn_rlog *entry;
 
 		/* Claim the next slot of the circular read log with a CAS loop. */
 		do {
 			index = inp->readlog_index;
 			newindex = index + 1;
 			if (newindex >= SCTP_READ_LOG_SIZE) {
 				newindex = 0;
 			}
 		} while (atomic_cmpset_int(&inp->readlog_index, index, newindex) == 0);
 		entry = &inp->readlog[index];
 		entry->vtag = control->sinfo_assoc_id;
 		entry->strm = control->sinfo_stream;
 		entry->seq = (uint16_t)control->mid;
 		entry->sz = control->length;
 		entry->flgs = control->sinfo_flags;
 	}
 #endif
 	/* Copy out the sender's address, truncated to the caller's buffer. */
 	if ((fromlen > 0) && (from != NULL)) {
 		union sctp_sockstore store;
 		size_t len;
 
 		switch (control->whoFrom->ro._l_addr.sa.sa_family) {
 #ifdef INET6
 		case AF_INET6:
 			len = sizeof(struct sockaddr_in6);
 			store.sin6 = control->whoFrom->ro._l_addr.sin6;
 			store.sin6.sin6_port = control->port_from;
 			break;
 #endif
 #ifdef INET
 		case AF_INET:
 #ifdef INET6
 			/* Present the IPv4 peer as a v4-mapped IPv6 address if the endpoint asks for it. */
 			if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) {
 				len = sizeof(struct sockaddr_in6);
 				in6_sin_2_v4mapsin6(&control->whoFrom->ro._l_addr.sin,
 				    &store.sin6);
 				store.sin6.sin6_port = control->port_from;
 			} else {
 				len = sizeof(struct sockaddr_in);
 				store.sin = control->whoFrom->ro._l_addr.sin;
 				store.sin.sin_port = control->port_from;
 			}
 #else
 			len = sizeof(struct sockaddr_in);
 			store.sin = control->whoFrom->ro._l_addr.sin;
 			store.sin.sin_port = control->port_from;
 #endif
 			break;
 #endif
 		default:
 			/* Unknown family: copy nothing. */
 			len = 0;
 			break;
 		}
 		memcpy(from, &store, min((size_t)fromlen, len));
 #ifdef INET6
 		{
 			struct sockaddr_in6 lsa6, *from6;
 
 			from6 = (struct sockaddr_in6 *)from;
 			sctp_recover_scope_mac(from6, (&lsa6));
 		}
 #endif
 	}
 	if (hold_rlock) {
 		SCTP_INP_READ_UNLOCK(inp);
 		hold_rlock = 0;
 	}
 	if (hold_sblock) {
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		hold_sblock = 0;
 	}
 	/*
 	 * Phase 3: move the data. Copy mode (mp == NULL) walks the mbuf
 	 * chain with uiomove(); mbuf mode hands the whole chain to the caller.
 	 */
 	/* now copy out what data we can */
 	if (mp == NULL) {
 		/* copy out each mbuf in the chain up to length */
 get_more_data:
 		m = control->data;
 		while (m) {
 			/* Move out all we can */
 			cp_len = uio->uio_resid;
 			my_len = SCTP_BUF_LEN(m);
 			if (cp_len > my_len) {
 				/* not enough in this buf */
 				cp_len = my_len;
 			}
 			/* The read lock is not held across uiomove(). */
 			if (hold_rlock) {
 				SCTP_INP_READ_UNLOCK(inp);
 				hold_rlock = 0;
 			}
 			if (cp_len > 0)
 				error = uiomove(mtod(m, char *), (int)cp_len, uio);
 			/* re-read */
 			if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
 				goto release;
 			}
 
 			if ((control->do_not_ref_stcb == 0) && stcb &&
 			    stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 				no_rcv_needed = 1;
 			}
 			if (error) {
 				/* error we are out of here */
 				goto release;
 			}
 			SCTP_INP_READ_LOCK(inp);
 			hold_rlock = 1;
 			if (cp_len == SCTP_BUF_LEN(m)) {
 				/* The whole mbuf was consumed. */
 				if ((SCTP_BUF_NEXT(m) == NULL) &&
 				    (control->end_added)) {
 					/* Last mbuf of a complete message: end of record. */
 					out_flags |= MSG_EOR;
 					if ((control->do_not_ref_stcb == 0) &&
 					    (control->stcb != NULL) &&
 					    ((control->spec_flags & M_NOTIFICATION) == 0))
 						control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0;
 				}
 				if (control->spec_flags & M_NOTIFICATION) {
 					out_flags |= MSG_NOTIFICATION;
 				}
 				/* we ate up the mbuf */
 				if (in_flags & MSG_PEEK) {
 					/* just looking */
 					m = SCTP_BUF_NEXT(m);
 					copied_so_far += cp_len;
 				} else {
 					/* dispose of the mbuf */
 					if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
 						sctp_sblog(&so->so_rcv,
 						    control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, SCTP_BUF_LEN(m));
 					}
 					sctp_sbfree(control, stcb, &so->so_rcv, m);
 					if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
 						sctp_sblog(&so->so_rcv,
 						    control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0);
 					}
 					copied_so_far += cp_len;
 					/* Account for the payload plus one mbuf's worth (MSIZE) of freed space. */
 					freed_so_far += (uint32_t)cp_len;
 					freed_so_far += MSIZE;
 					atomic_subtract_int(&control->length, cp_len);
 					control->data = sctp_m_free(m);
 					m = control->data;
 					/*
 					 * been through it all, must hold sb
 					 * lock ok to null tail
 					 */
 					if (control->data == NULL) {
 #ifdef INVARIANTS
 						if ((control->end_added == 0) ||
 						    (TAILQ_NEXT(control, next) == NULL)) {
 							/*
 							 * If the end is not
 							 * added, OR the
 							 * next is NOT null
 							 * we MUST have the
 							 * lock.
 							 */
 							if (mtx_owned(&inp->inp_rdata_mtx) == 0) {
 								panic("Hmm we don't own the lock?");
 							}
 						}
 #endif
 						control->tail_mbuf = NULL;
 #ifdef INVARIANTS
 						if ((control->end_added) && ((out_flags & MSG_EOR) == 0)) {
 							panic("end_added, nothing left and no MSG_EOR");
 						}
 #endif
 					}
 				}
 			} else {
 				/* Only part of the mbuf was consumed. */
 				/* Do we need to trim the mbuf? */
 				if (control->spec_flags & M_NOTIFICATION) {
 					out_flags |= MSG_NOTIFICATION;
 				}
 				if ((in_flags & MSG_PEEK) == 0) {
 					SCTP_BUF_RESV_UF(m, cp_len);
 					SCTP_BUF_LEN(m) -= (int)cp_len;
 					if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
 						sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, (int)cp_len);
 					}
 					atomic_subtract_int(&so->so_rcv.sb_cc, cp_len);
 					if ((control->do_not_ref_stcb == 0) &&
 					    stcb) {
 						atomic_subtract_int(&stcb->asoc.sb_cc, cp_len);
 					}
 					copied_so_far += cp_len;
 					freed_so_far += (uint32_t)cp_len;
 					freed_so_far += MSIZE;
 					if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
 						sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb,
 						    SCTP_LOG_SBRESULT, 0);
 					}
 					atomic_subtract_int(&control->length, cp_len);
 				} else {
 					copied_so_far += cp_len;
 				}
 			}
 			if ((out_flags & MSG_EOR) || (uio->uio_resid == 0)) {
 				break;
 			}
 			/* Enough space freed: let the transport consider a window update. */
 			if (((stcb) && (in_flags & MSG_PEEK) == 0) &&
 			    (control->do_not_ref_stcb == 0) &&
 			    (freed_so_far >= rwnd_req)) {
 				sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req);
 			}
 		}		/* end while(m) */
 		/*
 		 * At this point we have looked at it all and we either have
 		 * a MSG_EOR/or read all the user wants... <OR>
 		 * control->length == 0.
 		 */
 		if ((out_flags & MSG_EOR) && ((in_flags & MSG_PEEK) == 0)) {
 			/* we are done with this control */
 			if (control->length == 0) {
 				if (control->data) {
 #ifdef INVARIANTS
 					panic("control->data not null at read eor?");
 #else
 					SCTP_PRINTF("Strange, data left in the control buffer .. invarients would panic?\n");
 					sctp_m_freem(control->data);
 					control->data = NULL;
 #endif
 				}
 		done_with_control:
 				/*
 				 * Common teardown of a finished entry; also the
 				 * target of gotos from the wait loop and from
 				 * the mbuf-mode branch below.
 				 */
 				if (hold_rlock == 0) {
 					SCTP_INP_READ_LOCK(inp);
 					hold_rlock = 1;
 				}
 				TAILQ_REMOVE(&inp->read_queue, control, next);
 				/* Add back any hidden data */
 				if (control->held_length) {
 					held_length = 0;
 					control->held_length = 0;
 					wakeup_read_socket = 1;
 				}
 				if (control->aux_data) {
 					sctp_m_free(control->aux_data);
 					control->aux_data = NULL;
 				}
 				no_rcv_needed = control->do_not_ref_stcb;
 				sctp_free_remote_addr(control->whoFrom);
 				control->data = NULL;
 #ifdef INVARIANTS
 				if (control->on_strm_q) {
 					panic("About to free ctl:%p so:%p and its in %d",
 					    control, so, control->on_strm_q);
 				}
 #endif
 				sctp_free_a_readq(stcb, control);
 				control = NULL;
 				if ((freed_so_far >= rwnd_req) &&
 				    (no_rcv_needed == 0))
 					sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req);
 
 			} else {
 				/*
 				 * The user did not read all of this
 				 * message, turn off the returned MSG_EOR
 				 * since we are leaving more behind on the
 				 * control to read.
 				 */
 #ifdef INVARIANTS
 				if (control->end_added &&
 				    (control->data == NULL) &&
 				    (control->tail_mbuf == NULL)) {
 					panic("Gak, control->length is corrupt?");
 				}
 #endif
 				no_rcv_needed = control->do_not_ref_stcb;
 				out_flags &= ~MSG_EOR;
 			}
 		}
 		if (out_flags & MSG_EOR) {
 			goto release;
 		}
 		/*
 		 * Stop when the caller's buffer is full, or in explicit-EOR
 		 * mode once at least max(sb_lowat, 1) bytes were copied.
 		 */
 		if ((uio->uio_resid == 0) ||
 		    ((in_eeor_mode) &&
 		    (copied_so_far >= max(so->so_rcv.sb_lowat, 1)))) {
 			goto release;
 		}
 		/*
 		 * If I hit here the receiver wants more and this message is
 		 * NOT done (pd-api). So two questions. Can we block? if not
 		 * we are done. Did the user NOT set MSG_WAITALL?
 		 */
 		if (block_allowed == 0) {
 			goto release;
 		}
 		/*
 		 * We need to wait for more data a few things: - We don't
 		 * sbunlock() so we don't get someone else reading. - We
 		 * must be sure to account for the case where what is added
 		 * is NOT to our control when we wakeup.
 		 */
 
 		/*
 		 * Do we need to tell the transport a rwnd update might be
 		 * needed before we go to sleep?
 		 */
 		if (((stcb) && (in_flags & MSG_PEEK) == 0) &&
 		    ((freed_so_far >= rwnd_req) &&
 		    (control->do_not_ref_stcb == 0) &&
 		    (no_rcv_needed == 0))) {
 			sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req);
 		}
 wait_some_more:
 		/* Wait for more of the partially delivered message to arrive. */
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 			goto release;
 		}
 
 		if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)
 			goto release;
 
 		if (hold_rlock == 1) {
 			SCTP_INP_READ_UNLOCK(inp);
 			hold_rlock = 0;
 		}
 		if (hold_sblock == 0) {
 			SOCKBUF_LOCK(&so->so_rcv);
 			hold_sblock = 1;
 		}
 		/* With fragment interleave, return what we have instead of waiting. */
 		if ((copied_so_far) && (control->length == 0) &&
 		    (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE))) {
 			goto release;
 		}
 		if (so->so_rcv.sb_cc <= control->held_length) {
 			error = sbwait(&so->so_rcv);
 			if (error) {
 				goto release;
 			}
 			control->held_length = 0;
 		}
 		if (hold_sblock) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			hold_sblock = 0;
 		}
 		if (control->length == 0) {
 			/* still nothing here */
 			if (control->end_added == 1) {
 				/* he aborted, or is done i.e.did a shutdown */
 				out_flags |= MSG_EOR;
 				/*
 				 * NOTE(review): both branches below clear
 				 * delivery_started identically, and
 				 * control->stcb is dereferenced without the
 				 * NULL check used at the other sites that
 				 * clear delivery_started - confirm it cannot
 				 * be NULL here.
 				 */
 				if (control->pdapi_aborted) {
 					if ((control->do_not_ref_stcb == 0) && ((control->spec_flags & M_NOTIFICATION) == 0))
 						control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0;
 
 					out_flags |= MSG_TRUNC;
 				} else {
 					if ((control->do_not_ref_stcb == 0) && ((control->spec_flags & M_NOTIFICATION) == 0))
 						control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0;
 				}
 				goto done_with_control;
 			}
 			if (so->so_rcv.sb_cc > held_length) {
 				control->held_length = so->so_rcv.sb_cc;
 				held_length = 0;
 			}
 			goto wait_some_more;
 		} else if (control->data == NULL) {
 			/*
 			 * we must re-sync since data is probably being
 			 * added
 			 */
 			SCTP_INP_READ_LOCK(inp);
 			if ((control->length > 0) && (control->data == NULL)) {
 				/*
 				 * big trouble.. we have the lock and its
 				 * corrupt?
 				 */
 #ifdef INVARIANTS
 				panic("Impossible data==NULL length !=0");
 #endif
 				out_flags |= MSG_EOR;
 				out_flags |= MSG_TRUNC;
 				control->length = 0;
 				SCTP_INP_READ_UNLOCK(inp);
 				goto done_with_control;
 			}
 			SCTP_INP_READ_UNLOCK(inp);
 			/* We will fall around to get more data */
 		}
 		goto get_more_data;
 	} else {
 		/*-
 		 * Give caller back the mbuf chain,
 		 * store in uio_resid the length
 		 */
 		wakeup_read_socket = 0;
 		if ((control->end_added == 0) ||
 		    (TAILQ_NEXT(control, next) == NULL)) {
 			/* Need to get rlock */
 			if (hold_rlock == 0) {
 				SCTP_INP_READ_LOCK(inp);
 				hold_rlock = 1;
 			}
 		}
 		if (control->end_added) {
 			out_flags |= MSG_EOR;
 			if ((control->do_not_ref_stcb == 0) &&
 			    (control->stcb != NULL) &&
 			    ((control->spec_flags & M_NOTIFICATION) == 0))
 				control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0;
 		}
 		if (control->spec_flags & M_NOTIFICATION) {
 			out_flags |= MSG_NOTIFICATION;
 		}
 		uio->uio_resid = control->length;
 		*mp = control->data;
 		/* Take every mbuf of the chain out of the sockbuf accounting. */
 		m = control->data;
 		while (m) {
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
 				sctp_sblog(&so->so_rcv,
 				    control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, SCTP_BUF_LEN(m));
 			}
 			sctp_sbfree(control, stcb, &so->so_rcv, m);
 			freed_so_far += (uint32_t)SCTP_BUF_LEN(m);
 			freed_so_far += MSIZE;
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
 				sctp_sblog(&so->so_rcv,
 				    control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0);
 			}
 			m = SCTP_BUF_NEXT(m);
 		}
 		/* The chain now belongs to the caller (*mp). */
 		control->data = control->tail_mbuf = NULL;
 		control->length = 0;
 		if (out_flags & MSG_EOR) {
 			/* Done with this control */
 			goto done_with_control;
 		}
 	}
 release:
 	/* Normal exit: drop the mutexes, then the sblock() taken on entry. */
 	if (hold_rlock == 1) {
 		SCTP_INP_READ_UNLOCK(inp);
 		hold_rlock = 0;
 	}
 	if (hold_sblock == 1) {
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		hold_sblock = 0;
 	}
 
 	sbunlock(&so->so_rcv);
 	sockbuf_lock = 0;
 
 release_unlocked:
 	/* Also reached directly when sblock() failed on entry. */
 	if (hold_sblock) {
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		hold_sblock = 0;
 	}
 	/* Final chance to trigger a window update for what was freed. */
 	if ((stcb) && (in_flags & MSG_PEEK) == 0) {
 		if ((freed_so_far >= rwnd_req) &&
 		    (control && (control->do_not_ref_stcb == 0)) &&
 		    (no_rcv_needed == 0))
 			sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req);
 	}
 out:
 	/* Early-error exit: may still hold any of the three locks. */
 	if (msg_flags) {
 		*msg_flags = out_flags;
 	}
 	/* A partial (non-EOR) read does not advertise a next message. */
 	if (((out_flags & MSG_EOR) == 0) &&
 	    ((in_flags & MSG_PEEK) == 0) &&
 	    (sinfo) &&
 	    (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO) ||
 	    sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO))) {
 		struct sctp_extrcvinfo *s_extra;
 
 		s_extra = (struct sctp_extrcvinfo *)sinfo;
 		s_extra->serinfo_next_flags = SCTP_NO_NEXT_MSG;
 	}
 	if (hold_rlock == 1) {
 		SCTP_INP_READ_UNLOCK(inp);
 	}
 	if (hold_sblock) {
 		SOCKBUF_UNLOCK(&so->so_rcv);
 	}
 	if (sockbuf_lock) {
 		sbunlock(&so->so_rcv);
 	}
 
 	if (freecnt_applied) {
 		/*
 		 * The lock on the socket buffer protects us so the free
 		 * code will stop. But since we used the socketbuf lock and
 		 * the sender uses the tcb_lock to increment, we need to use
 		 * the atomic add to the refcnt.
 		 */
 		if (stcb == NULL) {
 #ifdef INVARIANTS
 			panic("stcb for refcnt has gone NULL?");
 			goto stage_left;
 #else
 			goto stage_left;
 #endif
 		}
 		/* Save the value back for next time */
 		stcb->freed_by_sorcv_sincelast = freed_so_far;
 		atomic_add_int(&stcb->asoc.refcnt, -1);
 	}
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) {
 		if (stcb) {
 			sctp_misc_ints(SCTP_SORECV_DONE,
 			    freed_so_far,
 			    (uint32_t)((uio) ? (slen - uio->uio_resid) : slen),
 			    stcb->asoc.my_rwnd,
 			    so->so_rcv.sb_cc);
 		} else {
 			sctp_misc_ints(SCTP_SORECV_DONE,
 			    freed_so_far,
 			    (uint32_t)((uio) ? (slen - uio->uio_resid) : slen),
 			    0,
 			    so->so_rcv.sb_cc);
 		}
 	}
 stage_left:
 	/* Set when an entry with a non-zero held_length was torn down. */
 	if (wakeup_read_socket) {
 		sctp_sorwakeup(inp, so);
 	}
 	return (error);
 }
 
 
 #ifdef SCTP_MBUF_LOGGING
 /*
  * Release a single mbuf.  When mbuf logging is enabled in the SCTP logging
  * level, the free is recorded with sctp_log_mb() first.  The value returned
  * by m_free() is handed straight back to the caller.
  */
 struct mbuf *
 sctp_m_free(struct mbuf *m)
 {
 	const int log_enabled =
 	    (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) != 0;
 
 	if (log_enabled)
 		sctp_log_mb(m, SCTP_MBUF_IFREE);
 	return (m_free(m));
 }
 
 /*
  * Free an mbuf chain one element at a time via sctp_m_free(), continuing
  * with whatever mbuf each call returns until that is NULL.
  */
 void
 sctp_m_freem(struct mbuf *mb)
 {
 	struct mbuf *cur;
 
 	for (cur = mb; cur != NULL; cur = sctp_m_free(cur))
 		continue;
 }
 
 #endif
 
 /*
  * Queue a "set primary address" request for the local address sa.
  *
  * The address is looked up in the given VRF, wrapped in a work-queue entry
  * tagged SCTP_SET_PRIM_ADDR, inserted at the head of the global address
  * work queue, and the SCTP_TIMER_TYPE_ADDR_WQ timer is started.
  *
  * Returns 0 on success, EADDRNOTAVAIL if the address lookup fails, or
  * ENOMEM if no work-queue entry could be allocated.
  */
 int
 sctp_dynamic_set_primary(struct sockaddr *sa, uint32_t vrf_id)
 {
 	/*
 	 * Given a local address. For all associations that hold the
 	 * address, request a peer-set-primary.
 	 */
 	struct sctp_ifa *ifa;
 	struct sctp_laddr *wi;
 
 	ifa = sctp_find_ifa_by_addr(sa, vrf_id, SCTP_ADDR_NOT_LOCKED);
 	if (ifa == NULL) {
 		SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, EADDRNOTAVAIL);
 		return (EADDRNOTAVAIL);
 	}
 	/*
 	 * Now that we have the ifa we must awaken the iterator with this
 	 * message.
 	 */
 	wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr);
 	if (wi == NULL) {
 		SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOMEM);
 		return (ENOMEM);
 	}
 	/* Now incr the count and init the wi structure */
 	SCTP_INCR_LADDR_COUNT();
 	memset(wi, 0, sizeof(*wi));
 	(void)SCTP_GETTIME_TIMEVAL(&wi->start_time);
 	wi->ifa = ifa;
 	wi->action = SCTP_SET_PRIM_ADDR;
 	/* The queued entry takes its own reference on the ifa. */
 	atomic_add_int(&ifa->refcount, 1);
 
 	/* Now add it to the work queue */
 	SCTP_WQ_ADDR_LOCK();
 	/*
 	 * Should this really be a tailq? As it is we will process the
 	 * newest first :-0
 	 */
 	LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr);
 	/* The timer is started while the work-queue lock is still held. */
 	sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ,
 	    (struct sctp_inpcb *)NULL,
 	    (struct sctp_tcb *)NULL,
 	    (struct sctp_nets *)NULL);
 	SCTP_WQ_ADDR_UNLOCK();
 	return (0);
 }
 
 
 /*
  * Socket-layer receive entry point: a thin wrapper around sctp_sorecvmsg().
  *
  * It stages the optional outputs in locals (peer address in an on-stack
  * buffer, receive info in sinfo, message flags in flags), performs the
  * receive, and then copies the results back to whichever of psa, controlp
  * and flagsp the caller supplied.
  *
  * Returns EINVAL if the socket has no SCTP pcb, otherwise the error code
  * from sctp_sorecvmsg().
  */
 int
 sctp_soreceive(struct socket *so,
     struct sockaddr **psa,
     struct uio *uio,
     struct mbuf **mp0,
     struct mbuf **controlp,
     int *flagsp)
 {
 	struct sctp_extrcvinfo sinfo;
 	uint8_t sockbuf[256];
 	struct sockaddr *from = NULL;
 	struct sctp_inpcb *inp;
 	int fromlen = 0;
 	int filling_sinfo;
 	int flags;
 	int error;
 
 	/* pickup the endpoint we are reading from */
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 		return (EINVAL);
 	}
 	/*
 	 * Receive info is only collected when at least one of the three
 	 * rcvinfo features is not off and there is somewhere to put the
 	 * resulting control message.
 	 */
 	filling_sinfo = !(sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT) &&
 	    sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVRCVINFO) &&
 	    sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVNXTINFO)) &&
 	    (controlp != NULL);
 	if (filling_sinfo) {
 		memset(&sinfo, 0, sizeof(struct sctp_extrcvinfo));
 	}
 	/* The source address is gathered only when the caller asked for it. */
 	if (psa != NULL) {
 		from = (struct sockaddr *)sockbuf;
 		fromlen = sizeof(sockbuf);
 		from->sa_len = 0;
 	}
 	flags = (flagsp != NULL) ? *flagsp : 0;
 
 	error = sctp_sorecvmsg(so, uio, mp0, from, fromlen, &flags,
 	    (struct sctp_sndrcvinfo *)&sinfo, filling_sinfo);
 
 	if (flagsp != NULL) {
 		*flagsp = flags;
 	}
 	if (controlp != NULL) {
 		/* notifications never carry rcvinfo back in CMSG format */
 		if (!filling_sinfo || ((flags & MSG_NOTIFICATION) != 0)) {
 			*controlp = NULL;
 		} else {
 			*controlp = sctp_build_ctl_nchunk(inp,
 			    (struct sctp_sndrcvinfo *)&sinfo);
 		}
 	}
 	if (psa != NULL) {
 		/* duplicate the address only if one was actually filled in */
 		*psa = ((from != NULL) && (from->sa_len != 0)) ?
 		    sodupsockaddr(from, M_NOWAIT) : NULL;
 	}
 	return (error);
 }
 
 
 
 
 
 /*
  * Add the remote addresses packed back to back at addr (totaddr entries)
  * to the association stcb.
  *
  * An AF_INET entry is rejected with EINVAL when it is INADDR_ANY,
  * INADDR_BROADCAST or multicast; an AF_INET6 entry when it is unspecified
  * or multicast.  A failure of sctp_add_remote_addr() is reported as
  * ENOBUFS.  On any of these failures the association is released with
  * sctp_free_assoc(), *error is set and the walk stops immediately.
  *
  * Returns the number of addresses added before the walk ended; *error is
  * 0 when no failure occurred.
  */
 int
 sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr,
     int totaddr, int *error)
 {
 	int added = 0;
 	int i;
 	struct sctp_inpcb *inp;
 	struct sockaddr *sa;
 	size_t incr = 0;
 #ifdef INET
 	struct sockaddr_in *sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
 
 	sa = addr;
 	inp = stcb->sctp_ep;
 	*error = 0;
 	for (i = 0; i < totaddr; i++) {
 		switch (sa->sa_family) {
 #ifdef INET
 		case AF_INET:
 			incr = sizeof(struct sockaddr_in);
 			sin = (struct sockaddr_in *)sa;
 			/* wildcard, broadcast and multicast peers are refused */
 			if ((sin->sin_addr.s_addr == INADDR_ANY) ||
 			    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
 			    IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
 				SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 				(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 				    SCTP_FROM_SCTPUTIL + SCTP_LOC_7);
 				*error = EINVAL;
 				goto out_now;
 			}
 			if (sctp_add_remote_addr(stcb, sa, NULL, stcb->asoc.port,
 			    SCTP_DONOT_SETSCOPE,
 			    SCTP_ADDR_IS_CONFIRMED)) {
 				/* assoc gone no un-lock */
 				SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS);
 				(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 				    SCTP_FROM_SCTPUTIL + SCTP_LOC_8);
 				*error = ENOBUFS;
 				goto out_now;
 			}
 			added++;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			incr = sizeof(struct sockaddr_in6);
 			sin6 = (struct sockaddr_in6 *)sa;
 			/* unspecified and multicast peers are refused */
 			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
 			    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
 				SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 				(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 				    SCTP_FROM_SCTPUTIL + SCTP_LOC_9);
 				*error = EINVAL;
 				goto out_now;
 			}
 			if (sctp_add_remote_addr(stcb, sa, NULL, stcb->asoc.port,
 			    SCTP_DONOT_SETSCOPE,
 			    SCTP_ADDR_IS_CONFIRMED)) {
 				/* assoc gone no un-lock */
 				SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS);
 				(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 				    SCTP_FROM_SCTPUTIL + SCTP_LOC_10);
 				*error = ENOBUFS;
 				goto out_now;
 			}
 			added++;
 			break;
 #endif
 		default:
 			/*
 			 * incr is left untouched here, so the cursor below
 			 * moves by the previous entry's size (or not at all
 			 * if this is the first entry).
 			 * NOTE(review): presumably callers have already
 			 * validated the list so this is never hit - confirm.
 			 */
 			break;
 		}
 		/* step to the next packed sockaddr */
 		sa = (struct sockaddr *)((caddr_t)sa + incr);
 	}
 out_now:
 	return (added);
 }
 
 /*
  * Validate and count a packed list of totaddr socket addresses starting
  * at addr, where limit is the number of bytes that may be read.
  *
  * *num_v4 and *num_v6 are reset and then incremented for each AF_INET /
  * AF_INET6 entry seen.  Every entry is also looked up on the endpoint.
  *
  * Returns:
  *   0         every entry is well formed and none matches an existing
  *             association,
  *   EINVAL    totaddr is 0, an entry would extend past limit, sa_len
  *             does not match the family's sockaddr size, the family is
  *             not handled, or an IPv6 entry is a v4-mapped address,
  *   EALREADY  an entry matches an existing association.
  *
  * The counters reflect only the entries processed before an early return.
  */
 int
 sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr,
     unsigned int totaddr,
     unsigned int *num_v4, unsigned int *num_v6,
     unsigned int limit)
 {
 	struct sockaddr *sa;
 	struct sctp_tcb *stcb;
 	unsigned int incr, at, i;
 
 	at = 0;
 	sa = addr;
 	*num_v6 = *num_v4 = 0;
 	/* account and validate addresses */
 	if (totaddr == 0) {
 		return (EINVAL);
 	}
 	for (i = 0; i < totaddr; i++) {
 		/* the generic sockaddr header must fit before it is read */
 		if (at + sizeof(struct sockaddr) > limit) {
 			return (EINVAL);
 		}
 		switch (sa->sa_family) {
 #ifdef INET
 		case AF_INET:
 			incr = (unsigned int)sizeof(struct sockaddr_in);
 			if (sa->sa_len != incr) {
 				return (EINVAL);
 			}
 			(*num_v4) += 1;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			{
 				struct sockaddr_in6 *sin6;
 
 				sin6 = (struct sockaddr_in6 *)sa;
 				if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 					/* Must be non-mapped for connectx */
 					return (EINVAL);
 				}
 				incr = (unsigned int)sizeof(struct sockaddr_in6);
 				if (sa->sa_len != incr) {
 					return (EINVAL);
 				}
 				(*num_v6) += 1;
 				break;
 			}
 #endif
 		default:
 			return (EINVAL);
 		}
 		/* the full family-specific sockaddr must fit as well */
 		if ((at + incr) > limit) {
 			return (EINVAL);
 		}
 		/*
 		 * A reference is taken on inp before the lookup and dropped
 		 * again here only when nothing is found.
 		 * NOTE(review): presumably the lookup consumes the reference
 		 * when it returns an association - confirm against
 		 * sctp_findassociation_ep_addr().
 		 */
 		SCTP_INP_INCR_REF(inp);
 		stcb = sctp_findassociation_ep_addr(&inp, sa, NULL, NULL, NULL);
 		if (stcb != NULL) {
 			SCTP_TCB_UNLOCK(stcb);
 			return (EALREADY);
 		} else {
 			SCTP_INP_DECR_REF(inp);
 		}
 		at += incr;
 		sa = (struct sockaddr *)((caddr_t)sa + incr);
 	}
 	return (0);
 }
 
 /*
  * sctp_bindx(ADD) for one address.
  * assumes all arguments are valid/checked by caller.
  *
  * The outcome is reported through *error.  EINVAL is stored when the
  * endpoint is bound to all addresses, the address length or family is not
  * acceptable for this socket, the endpoint is unbound and p is NULL, or
  * the address carries a non-zero port different from the endpoint's local
  * port.  EADDRINUSE is stored when another endpoint already owns the
  * address.  Otherwise *error receives the result of sctp_inpcb_bind()
  * (unbound endpoint) or sctp_addr_mgmt_ep_sa() (already bound endpoint).
  * When this endpoint itself already owns the address, *error is left as
  * the caller set it.
  */
 void
 sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp,
     struct sockaddr *sa, uint32_t vrf_id, int *error,
     void *p)
 {
 #if defined(INET) && defined(INET6)
 	struct sockaddr_in sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
 #ifdef INET
 	struct sockaddr_in *sinp;
 #endif
 	struct sockaddr *addr_to_use;
 	struct sctp_inpcb *lep;
 	uint16_t port;
 
 	/* see if we're bound all already! */
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 		*error = EINVAL;
 		return;
 	}
 	switch (sa->sa_family) {
 #ifdef INET6
 	case AF_INET6:
 		if (sa->sa_len != sizeof(struct sockaddr_in6)) {
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 			*error = EINVAL;
 			return;
 		}
 		if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) {
 			/* can only bind v6 on PF_INET6 sockets */
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 			*error = EINVAL;
 			return;
 		}
 		sin6 = (struct sockaddr_in6 *)sa;
 		port = sin6->sin6_port;
 #ifdef INET
 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 			if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
 			    SCTP_IPV6_V6ONLY(inp)) {
 				/*
 				 * v4-mapped addresses are refused when the
 				 * v6 endpoint has SCTP_IPV6_V6ONLY() set
 				 */
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 				*error = EINVAL;
 				return;
 			}
 			/* continue with the embedded IPv4 address */
 			in6_sin6_2_sin(&sin, sin6);
 			addr_to_use = (struct sockaddr *)&sin;
 		} else {
 			addr_to_use = sa;
 		}
 #else
 		addr_to_use = sa;
 #endif
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		if (sa->sa_len != sizeof(struct sockaddr_in)) {
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 			*error = EINVAL;
 			return;
 		}
 		if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
 		    SCTP_IPV6_V6ONLY(inp)) {
 			/*
 			 * v4 addresses are refused when the v6 endpoint has
 			 * SCTP_IPV6_V6ONLY() set
 			 */
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 			*error = EINVAL;
 			return;
 		}
 		sinp = (struct sockaddr_in *)sa;
 		port = sinp->sin_port;
 		addr_to_use = sa;
 		break;
 #endif
 	default:
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 		*error = EINVAL;
 		return;
 	}
 	/* An endpoint that is still unbound gets a regular bind instead. */
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) {
 		if (p == NULL) {
 			/* Can't get proc for Net/Open BSD */
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 			*error = EINVAL;
 			return;
 		}
 		*error = sctp_inpcb_bind(so, addr_to_use, NULL, p);
 		return;
 	}
 	/* Validate the incoming port. */
 	if ((port != 0) && (port != inp->sctp_lport)) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 		*error = EINVAL;
 		return;
 	}
 	/* Check whether some endpoint already owns this address. */
 	lep = sctp_pcb_findep(addr_to_use, 1, 0, vrf_id);
 	if (lep == NULL) {
 		/* add the address */
 		*error = sctp_addr_mgmt_ep_sa(inp, addr_to_use,
 		    SCTP_ADD_IP_ADDRESS, vrf_id);
 	} else {
 		/* owned by us: nothing to do; owned by someone else: fail */
 		if (lep != inp) {
 			*error = EADDRINUSE;
 		}
 		/* drop the reference on the endpoint that was found */
 		SCTP_INP_DECR_REF(lep);
 	}
 }
 
 /*
  * sctp_bindx(DELETE) for one address.
  * assumes all arguments are valid/checked by caller.
  *
  * The outcome is reported through *error: EINVAL when the endpoint is
  * bound to all addresses or the address length/family is not acceptable
  * for this socket, otherwise the result of sctp_addr_mgmt_ep_sa() with
  * SCTP_DEL_IP_ADDRESS.
  */
 void
 sctp_bindx_delete_address(struct sctp_inpcb *inp,
     struct sockaddr *sa, uint32_t vrf_id, int *error)
 {
 	struct sockaddr *addr_to_use;
 #if defined(INET) && defined(INET6)
 	struct sockaddr_in6 *sin6;
 	struct sockaddr_in sin;
 #endif
 
 	/* see if we're bound all already! */
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 		*error = EINVAL;
 		return;
 	}
 	switch (sa->sa_family) {
 #ifdef INET6
 	case AF_INET6:
 		if (sa->sa_len != sizeof(struct sockaddr_in6)) {
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 			*error = EINVAL;
 			return;
 		}
 		if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) {
 			/* v6 addresses are only handled on PF_INET6 sockets */
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 			*error = EINVAL;
 			return;
 		}
 #ifdef INET
 		sin6 = (struct sockaddr_in6 *)sa;
 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 			if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
 			    SCTP_IPV6_V6ONLY(inp)) {
 				/*
 				 * v4-mapped addresses are refused when the
 				 * v6 endpoint has SCTP_IPV6_V6ONLY() set
 				 */
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 				*error = EINVAL;
 				return;
 			}
 			/* continue with the embedded IPv4 address */
 			in6_sin6_2_sin(&sin, sin6);
 			addr_to_use = (struct sockaddr *)&sin;
 		} else {
 			addr_to_use = sa;
 		}
 #else
 		addr_to_use = sa;
 #endif
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		if (sa->sa_len != sizeof(struct sockaddr_in)) {
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 			*error = EINVAL;
 			return;
 		}
 		if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
 		    SCTP_IPV6_V6ONLY(inp)) {
 			/*
 			 * v4 addresses are refused when the v6 endpoint has
 			 * SCTP_IPV6_V6ONLY() set
 			 */
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 			*error = EINVAL;
 			return;
 		}
 		addr_to_use = sa;
 		break;
 #endif
 	default:
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
 		*error = EINVAL;
 		return;
 	}
 	/* No lock required mgmt_ep_sa does its own locking. */
 	*error = sctp_addr_mgmt_ep_sa(inp, addr_to_use, SCTP_DEL_IP_ADDRESS,
 	    vrf_id);
 }
 
 /*
  * returns the valid local address count for an assoc, taking into account
  * all scoping rules
  *
  * For an endpoint bound to all addresses, every interface address in the
  * association's VRF is examined and counted unless it is restricted for
  * the association, fails the scope settings copied from stcb->asoc.scope,
  * is unspecified, or is rejected by prison_check_ip4()/prison_check_ip6()
  * for the endpoint's credential.  For an endpoint bound to a subset, the
  * endpoint's own address list is counted, minus restricted entries.
  *
  * The walk runs under the SCTP address read lock.  Returns 0 when the VRF
  * cannot be found.
  */
 int
 sctp_local_addr_count(struct sctp_tcb *stcb)
 {
 	int loopback_scope;
 #if defined(INET)
 	int ipv4_local_scope, ipv4_addr_legal;
 #endif
 #if defined(INET6)
 	int local_scope, site_scope, ipv6_addr_legal;
 #endif
 	struct sctp_vrf *vrf;
 	struct sctp_ifn *sctp_ifn;
 	struct sctp_ifa *sctp_ifa;
 	int count = 0;
 
 	/* Turn on all the appropriate scopes */
 	loopback_scope = stcb->asoc.scope.loopback_scope;
 #if defined(INET)
 	ipv4_local_scope = stcb->asoc.scope.ipv4_local_scope;
 	ipv4_addr_legal = stcb->asoc.scope.ipv4_addr_legal;
 #endif
 #if defined(INET6)
 	local_scope = stcb->asoc.scope.local_scope;
 	site_scope = stcb->asoc.scope.site_scope;
 	ipv6_addr_legal = stcb->asoc.scope.ipv6_addr_legal;
 #endif
 	SCTP_IPI_ADDR_RLOCK();
 	vrf = sctp_find_vrf(stcb->asoc.vrf_id);
 	if (vrf == NULL) {
 		/* no vrf, no addresses */
 		SCTP_IPI_ADDR_RUNLOCK();
 		return (0);
 	}
 
 	if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
 		/*
 		 * bound all case: go through all ifns on the vrf
 		 */
 		LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
 			/* loopback interfaces only count with loopback scope */
 			if ((loopback_scope == 0) &&
 			    SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) {
 				continue;
 			}
 			LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
 				if (sctp_is_addr_restricted(stcb, sctp_ifa))
 					continue;
 				switch (sctp_ifa->address.sa.sa_family) {
 #ifdef INET
 				case AF_INET:
 					if (ipv4_addr_legal) {
 						struct sockaddr_in *sin;
 
 						sin = &sctp_ifa->address.sin;
 						if (sin->sin_addr.s_addr == 0) {
 							/*
 							 * skip unspecified
 							 * addrs
 							 */
 							continue;
 						}
 						/* non-zero result: skip this address */
 						if (prison_check_ip4(stcb->sctp_ep->ip_inp.inp.inp_cred,
 						    &sin->sin_addr) != 0) {
 							continue;
 						}
 						/* private addrs need ipv4 local scope */
 						if ((ipv4_local_scope == 0) &&
 						    (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) {
 							continue;
 						}
 						/* count this one */
 						count++;
 					} else {
 						continue;
 					}
 					break;
 #endif
 #ifdef INET6
 				case AF_INET6:
 					if (ipv6_addr_legal) {
 						struct sockaddr_in6 *sin6;
 
 						sin6 = &sctp_ifa->address.sin6;
 						if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 							continue;
 						}
 						/* non-zero result: skip this address */
 						if (prison_check_ip6(stcb->sctp_ep->ip_inp.inp.inp_cred,
 						    &sin6->sin6_addr) != 0) {
 							continue;
 						}
 						if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
 							if (local_scope == 0)
 								continue;
 							if (sin6->sin6_scope_id == 0) {
 								if (sa6_recoverscope(sin6) != 0)
 									/* bad link-local address */
 									continue;
 							}
 						}
 						/* site-local addrs need site scope */
 						if ((site_scope == 0) &&
 						    (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) {
 							continue;
 						}
 						/* count this one */
 						count++;
 					}
 					break;
 #endif
 				default:
 					/* TSNH */
 					break;
 				}
 			}
 		}
 	} else {
 		/*
 		 * subset bound case
 		 */
 		struct sctp_laddr *laddr;
 
 		LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list,
 		    sctp_nxt_addr) {
 			if (sctp_is_addr_restricted(stcb, laddr->ifa)) {
 				continue;
 			}
 			/* count this one */
 			count++;
 		}
 	}
 	SCTP_IPI_ADDR_RUNLOCK();
 	return (count);
 }
 
 #if defined(SCTP_LOCAL_TRACE_BUF)
 
 /*
  * Append one record to the SCTP trace log.
  *
  * A slot is claimed by advancing the shared log index with a compare-and-
  * set retry loop; an index at or beyond SCTP_MAX_LOGGING_SIZE wraps so that
  * slot 0 is used and the index continues at 1.  The claimed slot is then
  * filled with the cycle counter, the subsystem id and the six parameters.
  * The str argument is not used.
  */
 void
 sctp_log_trace(uint32_t subsys, const char *str SCTP_UNUSED, uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e, uint32_t f)
 {
 	const uint32_t args[6] = {a, b, c, d, e, f};
 	uint32_t claimed, next, slot;
 	unsigned int i;
 
 	for (;;) {
 		claimed = SCTP_BASE_SYSCTL(sctp_log).index;
 		next = (claimed >= SCTP_MAX_LOGGING_SIZE) ? 1 : claimed + 1;
 		/* retry until nobody else moved the index underneath us */
 		if (atomic_cmpset_int(&SCTP_BASE_SYSCTL(sctp_log).index, claimed, next) != 0)
 			break;
 	}
 	slot = (claimed >= SCTP_MAX_LOGGING_SIZE) ? 0 : claimed;
 
 	SCTP_BASE_SYSCTL(sctp_log).entry[slot].timestamp = SCTP_GET_CYCLECOUNT;
 	SCTP_BASE_SYSCTL(sctp_log).entry[slot].subsys = subsys;
 	for (i = 0; i < sizeof(args) / sizeof(args[0]); i++)
 		SCTP_BASE_SYSCTL(sctp_log).entry[slot].params[i] = args[i];
 }
 
 #endif
 /*
  * Receive hook for SCTP packets that arrive encapsulated in UDP; it is
  * registered via udp_set_kernel_tunneling() in sctp_over_udp_start().
  *
  * m holds the whole packet and off is the offset of the UDP header.  The
  * UDP source port is remembered, the UDP header is cut out of the chain,
  * the IP length field is reduced accordingly, and the packet is passed to
  * sctp_input_with_port() or sctp6_input_with_port() according to the IP
  * version.  On any failure the packet is freed.  The inp, sa and ctx
  * arguments are not used.
  */
 static void
 sctp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp,
     const struct sockaddr *sa SCTP_UNUSED, void *ctx SCTP_UNUSED)
 {
 	struct ip *iph;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 	struct mbuf *sp, *last;
 	struct udphdr *uhdr;
 	uint16_t port;
 
 	if ((m->m_flags & M_PKTHDR) == 0) {
 		/* Can't handle one that is not a pkt hdr */
 		goto out;
 	}
 	/* Pull the src port */
 	/*
 	 * NOTE(review): this reads the UDP header through the first mbuf's
 	 * data pointer, which assumes the first off + sizeof(struct udphdr)
 	 * bytes are contiguous there - confirm the caller guarantees it.
 	 */
 	iph = mtod(m, struct ip *);
 	uhdr = (struct udphdr *)((caddr_t)iph + off);
 	port = uhdr->uh_sport;
 	/*
 	 * Split out the mbuf chain. Leave the IP header in m, place the
 	 * rest in the sp.
 	 */
 	sp = m_split(m, off, M_NOWAIT);
 	if (sp == NULL) {
 		/* Gak, drop packet, we can't do a split */
 		goto out;
 	}
 	if (sp->m_pkthdr.len < sizeof(struct udphdr) + sizeof(struct sctphdr)) {
 		/* Gak, packet can't have an SCTP header in it - too small */
 		m_freem(sp);
 		goto out;
 	}
 	/* Now pull up the UDP header and SCTP header together */
 	sp = m_pullup(sp, sizeof(struct udphdr) + sizeof(struct sctphdr));
 	if (sp == NULL) {
 		/*
 		 * Gak pullup failed.  m_pullup() frees the chain on failure
 		 * (see mbuf(9)), so only m is left to release.
 		 */
 		goto out;
 	}
 	/* Trim out the UDP header */
 	m_adj(sp, sizeof(struct udphdr));
 
 	/* Now reconstruct the mbuf chain */
 	for (last = m; last->m_next; last = last->m_next);
 	last->m_next = sp;
 	m->m_pkthdr.len += sp->m_pkthdr.len;
 	/*
 	 * The CSUM_DATA_VALID flags indicates that the HW checked the UDP
 	 * checksum and it was valid. Since CSUM_DATA_VALID ==
 	 * CSUM_SCTP_VALID this would imply that the HW also verified the
 	 * SCTP checksum. Therefore, clear the bit.
 	 */
 	SCTPDBG(SCTP_DEBUG_CRCOFFLOAD,
 	    "sctp_recv_udp_tunneled_packet(): Packet of length %d received on %s with csum_flags 0x%b.\n",
 	    m->m_pkthdr.len,
 	    if_name(m->m_pkthdr.rcvif),
 	    (int)m->m_pkthdr.csum_flags, CSUM_BITS);
 	m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
 	iph = mtod(m, struct ip *);
 	/* Fix up the IP length for the removed UDP header and dispatch. */
 	switch (iph->ip_v) {
 #ifdef INET
 	case IPVERSION:
 		iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr));
 		sctp_input_with_port(m, off, port);
 		break;
 #endif
 #ifdef INET6
 	case IPV6_VERSION >> 4:
 		ip6 = mtod(m, struct ip6_hdr *);
 		ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr));
 		sctp6_input_with_port(&m, &off, port);
 		break;
 #endif
 	default:
 		goto out;
 		break;
 	}
 	/* The packet was handed to the input routine; it is not freed here. */
 	return;
 out:
 	m_freem(m);
 }
 
 #ifdef INET
 /*
  * ICMP hook for the IPv4 UDP tunneling socket; it is registered via
  * udp_set_kernel_tunneling() in sctp_over_udp_start().
  *
  * vip points at the inner IP header quoted in the ICMP message, i.e. the
  * header of the packet we sent that triggered the error.  The handler
  * finds the matching association, checks that the quoted UDP ports and
  * SCTP verification tag (or, for a zero tag, the INIT's initiate tag)
  * belong to it, and then forwards the event to sctp_notify().  Messages
  * that fail any check are dropped silently.  The cmd, sa and ctx arguments
  * are not used.
  */
 static void
 sctp_recv_icmp_tunneled_packet(int cmd, struct sockaddr *sa, void *vip, void *ctx SCTP_UNUSED)
 {
 	struct ip *outer_ip, *inner_ip;
 	struct sctphdr *sh;
 	struct icmp *icmp;
 	struct udphdr *udp;
 	struct sctp_inpcb *inp;
 	struct sctp_tcb *stcb;
 	struct sctp_nets *net;
 	struct sctp_init_chunk *ch;
 	struct sockaddr_in src, dst;
 	uint8_t type, code;
 
 	/*
 	 * Step backwards from the quoted inner IP header to the ICMP header
 	 * and from there to the outer IP header, which is taken to sit
 	 * exactly sizeof(struct ip) bytes before the ICMP header.
 	 */
 	inner_ip = (struct ip *)vip;
 	icmp = (struct icmp *)((caddr_t)inner_ip -
 	    (sizeof(struct icmp) - sizeof(struct ip)));
 	outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip));
 	/*
 	 * Require outer header + 8 + inner header + UDP header + 8 bytes.
 	 * NOTE(review): presumably the first 8 is the ICMP header and the
 	 * last 8 the SCTP ports and verification tag - confirm.
 	 */
 	if (ntohs(outer_ip->ip_len) <
 	    sizeof(struct ip) + 8 + (inner_ip->ip_hl << 2) + sizeof(struct udphdr) + 8) {
 		return;
 	}
 	udp = (struct udphdr *)((caddr_t)inner_ip + (inner_ip->ip_hl << 2));
 	sh = (struct sctphdr *)(udp + 1);
 	memset(&src, 0, sizeof(struct sockaddr_in));
 	src.sin_family = AF_INET;
 	src.sin_len = sizeof(struct sockaddr_in);
 	src.sin_port = sh->src_port;
 	src.sin_addr = inner_ip->ip_src;
 	memset(&dst, 0, sizeof(struct sockaddr_in));
 	dst.sin_family = AF_INET;
 	dst.sin_len = sizeof(struct sockaddr_in);
 	dst.sin_port = sh->dest_port;
 	dst.sin_addr = inner_ip->ip_dst;
 	/*
 	 * 'dst' holds the dest of the packet that failed to be sent. 'src'
 	 * holds our local endpoint address. Thus we reverse the dst and the
 	 * src in the lookup.
 	 */
 	inp = NULL;
 	net = NULL;
 	stcb = sctp_findassociation_addr_sa((struct sockaddr *)&dst,
 	    (struct sockaddr *)&src,
 	    &inp, &net, 1,
 	    SCTP_DEFAULT_VRFID);
 	if ((stcb != NULL) &&
 	    (net != NULL) &&
 	    (inp != NULL)) {
 		/* Check the UDP port numbers */
 		if ((udp->uh_dport != net->port) ||
 		    (udp->uh_sport != htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)))) {
 			SCTP_TCB_UNLOCK(stcb);
 			return;
 		}
 		/* Check the verification tag */
 		if (ntohl(sh->v_tag) != 0) {
 			/*
 			 * This must be the verification tag used for
 			 * sending out packets. We don't consider packets
 			 * reflecting the verification tag.
 			 */
 			if (ntohl(sh->v_tag) != stcb->asoc.peer_vtag) {
 				SCTP_TCB_UNLOCK(stcb);
 				return;
 			}
 		} else {
 			/*
 			 * NOTE(review): here the trailing "8 + 20" presumably
 			 * stands for the UDP header plus SCTP common header,
 			 * chunk header and initiate tag - confirm.
 			 */
 			if (ntohs(outer_ip->ip_len) >=
 			    sizeof(struct ip) +
 			    8 + (inner_ip->ip_hl << 2) + 8 + 20) {
 				/*
 				 * In this case we can check if we got an
 				 * INIT chunk and if the initiate tag
 				 * matches.
 				 */
 				ch = (struct sctp_init_chunk *)(sh + 1);
 				if ((ch->ch.chunk_type != SCTP_INITIATION) ||
 				    (ntohl(ch->init.initiate_tag) != stcb->asoc.my_vtag)) {
 					SCTP_TCB_UNLOCK(stcb);
 					return;
 				}
 			} else {
 				SCTP_TCB_UNLOCK(stcb);
 				return;
 			}
 		}
 		type = icmp->icmp_type;
 		code = icmp->icmp_code;
 		/* A port-unreachable is passed on as protocol-unreachable. */
 		if ((type == ICMP_UNREACH) &&
 		    (code == ICMP_UNREACH_PORT)) {
 			code = ICMP_UNREACH_PROTOCOL;
 		}
 		/*
 		 * The TCB is not unlocked on this path.
 		 * NOTE(review): presumably sctp_notify() releases it -
 		 * confirm.
 		 */
 		sctp_notify(inp, stcb, net, type, code,
 		    ntohs(inner_ip->ip_len),
 		    (uint32_t)ntohs(icmp->icmp_nextmtu));
 	} else {
 		if ((stcb == NULL) && (inp != NULL)) {
 			/* reduce ref-count */
 			SCTP_INP_WLOCK(inp);
 			SCTP_INP_DECR_REF(inp);
 			SCTP_INP_WUNLOCK(inp);
 		}
 		if (stcb) {
 			SCTP_TCB_UNLOCK(stcb);
 		}
 	}
 	return;
 }
 #endif
 
 #ifdef INET6
 /*
  * ICMPv6 hook for the IPv6 UDP tunneling socket; it is registered via
  * udp_set_kernel_tunneling() in sctp_over_udp_start().
  *
  * d is a struct ip6ctlparam describing the ICMPv6 message and the quoted
  * packet.  The quoted UDP header and the leading part of the SCTP common
  * header are copied out of the mbuf chain, the matching association is
  * looked up, the UDP ports and verification tag (or, for a zero tag, the
  * INIT's initiate tag) are checked, and the event is then forwarded to
  * sctp6_notify().  Messages that fail any check are dropped silently.
  * The cmd, sa and ctx arguments are not used.
  */
 static void
 sctp_recv_icmp6_tunneled_packet(int cmd, struct sockaddr *sa, void *d, void *ctx SCTP_UNUSED)
 {
 	struct ip6ctlparam *ip6cp;
 	struct sctp_inpcb *inp;
 	struct sctp_tcb *stcb;
 	struct sctp_nets *net;
 	struct sctphdr sh;
 	struct udphdr udp;
 	struct sockaddr_in6 src, dst;
 	uint8_t type, code;
 
 	ip6cp = (struct ip6ctlparam *)d;
 	/*
 	 * XXX: We assume that when IPV6 is non NULL, M and OFF are valid.
 	 */
 	if (ip6cp->ip6c_m == NULL) {
 		return;
 	}
 	/*
 	 * Check if we can safely examine the ports and the verification tag
 	 * of the SCTP common header.
 	 */
 	if (ip6cp->ip6c_m->m_pkthdr.len <
 	    ip6cp->ip6c_off + sizeof(struct udphdr) + offsetof(struct sctphdr, checksum)) {
 		return;
 	}
 	/* Copy out the UDP header. */
 	memset(&udp, 0, sizeof(struct udphdr));
 	m_copydata(ip6cp->ip6c_m,
 	    ip6cp->ip6c_off,
 	    sizeof(struct udphdr),
 	    (caddr_t)&udp);
 	/* Copy out the port numbers and the verification tag. */
 	memset(&sh, 0, sizeof(struct sctphdr));
 	m_copydata(ip6cp->ip6c_m,
 	    ip6cp->ip6c_off + sizeof(struct udphdr),
 	    sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t),
 	    (caddr_t)&sh);
 	/* Build scoped source and destination addresses of the quoted packet. */
 	memset(&src, 0, sizeof(struct sockaddr_in6));
 	src.sin6_family = AF_INET6;
 	src.sin6_len = sizeof(struct sockaddr_in6);
 	src.sin6_port = sh.src_port;
 	src.sin6_addr = ip6cp->ip6c_ip6->ip6_src;
 	if (in6_setscope(&src.sin6_addr, ip6cp->ip6c_m->m_pkthdr.rcvif, NULL) != 0) {
 		return;
 	}
 	memset(&dst, 0, sizeof(struct sockaddr_in6));
 	dst.sin6_family = AF_INET6;
 	dst.sin6_len = sizeof(struct sockaddr_in6);
 	dst.sin6_port = sh.dest_port;
 	dst.sin6_addr = ip6cp->ip6c_ip6->ip6_dst;
 	if (in6_setscope(&dst.sin6_addr, ip6cp->ip6c_m->m_pkthdr.rcvif, NULL) != 0) {
 		return;
 	}
 	inp = NULL;
 	net = NULL;
 	/* dst is passed first and src second, i.e. reversed for the lookup */
 	stcb = sctp_findassociation_addr_sa((struct sockaddr *)&dst,
 	    (struct sockaddr *)&src,
 	    &inp, &net, 1, SCTP_DEFAULT_VRFID);
 	if ((stcb != NULL) &&
 	    (net != NULL) &&
 	    (inp != NULL)) {
 		/* Check the UDP port numbers */
 		if ((udp.uh_dport != net->port) ||
 		    (udp.uh_sport != htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)))) {
 			SCTP_TCB_UNLOCK(stcb);
 			return;
 		}
 		/* Check the verification tag */
 		if (ntohl(sh.v_tag) != 0) {
 			/*
 			 * This must be the verification tag used for
 			 * sending out packets. We don't consider packets
 			 * reflecting the verification tag.
 			 */
 			if (ntohl(sh.v_tag) != stcb->asoc.peer_vtag) {
 				SCTP_TCB_UNLOCK(stcb);
 				return;
 			}
 		} else {
 			/* enough data quoted to reach the initiate tag? */
 			if (ip6cp->ip6c_m->m_pkthdr.len >=
 			    ip6cp->ip6c_off + sizeof(struct udphdr) +
 			    sizeof(struct sctphdr) +
 			    sizeof(struct sctp_chunkhdr) +
 			    offsetof(struct sctp_init, a_rwnd)) {
 				/*
 				 * In this case we can check if we got an
 				 * INIT chunk and if the initiate tag
 				 * matches.
 				 */
 				uint32_t initiate_tag;
 				uint8_t chunk_type;
 
 				m_copydata(ip6cp->ip6c_m,
 				    ip6cp->ip6c_off +
 				    sizeof(struct udphdr) +
 				    sizeof(struct sctphdr),
 				    sizeof(uint8_t),
 				    (caddr_t)&chunk_type);
 				m_copydata(ip6cp->ip6c_m,
 				    ip6cp->ip6c_off +
 				    sizeof(struct udphdr) +
 				    sizeof(struct sctphdr) +
 				    sizeof(struct sctp_chunkhdr),
 				    sizeof(uint32_t),
 				    (caddr_t)&initiate_tag);
 				if ((chunk_type != SCTP_INITIATION) ||
 				    (ntohl(initiate_tag) != stcb->asoc.my_vtag)) {
 					SCTP_TCB_UNLOCK(stcb);
 					return;
 				}
 			} else {
 				SCTP_TCB_UNLOCK(stcb);
 				return;
 			}
 		}
 		type = ip6cp->ip6c_icmp6->icmp6_type;
 		code = ip6cp->ip6c_icmp6->icmp6_code;
 		/*
 		 * A "no port" destination-unreachable is passed on as a
 		 * parameter problem with an unrecognised next header.
 		 */
 		if ((type == ICMP6_DST_UNREACH) &&
 		    (code == ICMP6_DST_UNREACH_NOPORT)) {
 			type = ICMP6_PARAM_PROB;
 			code = ICMP6_PARAMPROB_NEXTHEADER;
 		}
 		/*
 		 * The TCB is not unlocked on this path.
 		 * NOTE(review): presumably sctp6_notify() releases it -
 		 * confirm.
 		 */
 		sctp6_notify(inp, stcb, net, type, code,
 		    ntohl(ip6cp->ip6c_icmp6->icmp6_mtu));
 	} else {
 		if ((stcb == NULL) && (inp != NULL)) {
 			/* reduce inp's ref-count */
 			SCTP_INP_WLOCK(inp);
 			SCTP_INP_DECR_REF(inp);
 			SCTP_INP_WUNLOCK(inp);
 		}
 		if (stcb) {
 			SCTP_TCB_UNLOCK(stcb);
 		}
 	}
 }
 #endif
 
 /*
  * Shut down SCTP-over-UDP tunneling: close whichever of the IPv4 and IPv6
  * tunneling sockets exist and clear their global slots.  Calling this when
  * no tunneling socket is open does nothing.
  */
 void
 sctp_over_udp_stop(void)
 {
 #ifdef INET
 	struct socket *tun4;
 #endif
 #ifdef INET6
 	struct socket *tun6;
 #endif
 
 	/*
 	 * This function assumes sysctl caller holds sctp_sysctl_info_lock()
 	 * for writing!
 	 */
 #ifdef INET
 	tun4 = SCTP_BASE_INFO(udp4_tun_socket);
 	if (tun4 != NULL) {
 		soclose(tun4);
 		SCTP_BASE_INFO(udp4_tun_socket) = NULL;
 	}
 #endif
 #ifdef INET6
 	tun6 = SCTP_BASE_INFO(udp6_tun_socket);
 	if (tun6 != NULL) {
 		soclose(tun6);
 		SCTP_BASE_INFO(udp6_tun_socket) = NULL;
 	}
 #endif
 }
 
 /*
  * Start SCTP-over-UDP tunneling on the port configured in the
  * sctp_udp_tunneling_port sysctl.
  *
  * For each compiled-in address family a kernel UDP socket is created, the
  * SCTP receive and ICMP handlers are installed on it with
  * udp_set_kernel_tunneling(), and it is bound to the tunneling port on
  * the wildcard address.  If any step fails, sctp_over_udp_stop() closes
  * whatever was opened so far and the failing call's error is returned.
  *
  * Returns 0 on success, EINVAL if no port is configured, EALREADY if a
  * tunneling socket is already open, or the error from socreate(),
  * udp_set_kernel_tunneling() or sobind().
  */
 int
 sctp_over_udp_start(void)
 {
 	uint16_t port;
 	int ret;
 #ifdef INET
 	struct sockaddr_in sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 sin6;
 #endif
 	/*
 	 * This function assumes sysctl caller holds sctp_sysctl_info_lock()
 	 * for writing!
 	 */
 	port = SCTP_BASE_SYSCTL(sctp_udp_tunneling_port);
 	/* ntohs() only matters as a zero test here; port is htons()'d below */
 	if (ntohs(port) == 0) {
 		/* Must have a port set */
 		return (EINVAL);
 	}
 #ifdef INET
 	if (SCTP_BASE_INFO(udp4_tun_socket) != NULL) {
 		/* Already running -- must stop first */
 		return (EALREADY);
 	}
 #endif
 #ifdef INET6
 	if (SCTP_BASE_INFO(udp6_tun_socket) != NULL) {
 		/* Already running -- must stop first */
 		return (EALREADY);
 	}
 #endif
 #ifdef INET
 	if ((ret = socreate(PF_INET, &SCTP_BASE_INFO(udp4_tun_socket),
 	    SOCK_DGRAM, IPPROTO_UDP,
 	    curthread->td_ucred, curthread))) {
 		sctp_over_udp_stop();
 		return (ret);
 	}
 	/* Call the special UDP hook. */
 	if ((ret = udp_set_kernel_tunneling(SCTP_BASE_INFO(udp4_tun_socket),
 	    sctp_recv_udp_tunneled_packet,
 	    sctp_recv_icmp_tunneled_packet,
 	    NULL))) {
 		sctp_over_udp_stop();
 		return (ret);
 	}
 	/* Ok, we have a socket, bind it to the port. */
 	memset(&sin, 0, sizeof(struct sockaddr_in));
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_family = AF_INET;
 	sin.sin_port = htons(port);
 	if ((ret = sobind(SCTP_BASE_INFO(udp4_tun_socket),
 	    (struct sockaddr *)&sin, curthread))) {
 		sctp_over_udp_stop();
 		return (ret);
 	}
 #endif
 #ifdef INET6
 	if ((ret = socreate(PF_INET6, &SCTP_BASE_INFO(udp6_tun_socket),
 	    SOCK_DGRAM, IPPROTO_UDP,
 	    curthread->td_ucred, curthread))) {
 		/* this also closes the IPv4 socket opened above, if any */
 		sctp_over_udp_stop();
 		return (ret);
 	}
 	/* Call the special UDP hook. */
 	if ((ret = udp_set_kernel_tunneling(SCTP_BASE_INFO(udp6_tun_socket),
 	    sctp_recv_udp_tunneled_packet,
 	    sctp_recv_icmp6_tunneled_packet,
 	    NULL))) {
 		sctp_over_udp_stop();
 		return (ret);
 	}
 	/* Ok, we have a socket, bind it to the port. */
 	memset(&sin6, 0, sizeof(struct sockaddr_in6));
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	sin6.sin6_family = AF_INET6;
 	sin6.sin6_port = htons(port);
 	if ((ret = sobind(SCTP_BASE_INFO(udp6_tun_socket),
 	    (struct sockaddr *)&sin6, curthread))) {
 		sctp_over_udp_stop();
 		return (ret);
 	}
 #endif
 	return (0);
 }
 
 /*
  * sctp_min_mtu() returns the smallest of its non-zero arguments.
  * Arguments equal to zero are ignored; if all three are zero, zero is
  * returned.
  */
 uint32_t
 sctp_min_mtu(uint32_t mtu1, uint32_t mtu2, uint32_t mtu3)
 {
 	uint32_t smallest;
 
 	/* Seed with the first value; zero means nothing usable seen yet. */
 	smallest = mtu1;
 	if (mtu2 > 0) {
 		smallest = (smallest > 0) ? min(smallest, mtu2) : mtu2;
 	}
 	if (mtu3 > 0) {
 		smallest = (smallest > 0) ? min(smallest, mtu3) : mtu3;
 	}
 	return (smallest);
 }
 
 void
 sctp_hc_set_mtu(union sctp_sockstore *addr, uint16_t fibnum, uint32_t mtu)
 {
 	struct in_conninfo inc;
 
 	memset(&inc, 0, sizeof(struct in_conninfo));
 	inc.inc_fibnum = fibnum;
 	switch (addr->sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		inc.inc_faddr = addr->sin.sin_addr;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		inc.inc_flags |= INC_ISIPV6;
 		inc.inc6_faddr = addr->sin6.sin6_addr;
 		break;
 #endif
 	default:
 		return;
 	}
 	tcp_hc_updatemtu(&inc, (u_long)mtu);
 }
 
 uint32_t
 sctp_hc_get_mtu(union sctp_sockstore *addr, uint16_t fibnum)
 {
 	struct in_conninfo inc;
 
 	memset(&inc, 0, sizeof(struct in_conninfo));
 	inc.inc_fibnum = fibnum;
 	switch (addr->sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		inc.inc_faddr = addr->sin.sin_addr;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		inc.inc_flags |= INC_ISIPV6;
 		inc.inc6_faddr = addr->sin6.sin6_addr;
 		break;
 #endif
 	default:
 		return (0);
 	}
 	return ((uint32_t)tcp_hc_getmtu(&inc));
 }
 
 /*
  * Replace the base state of an association with new_state while keeping
  * the bits outside SCTP_STATE_MASK.  new_state must not contain any bits
  * outside SCTP_STATE_MASK.
  *
  * Moving to SHUTDOWN_RECEIVED, SHUTDOWN_SENT or SHUTDOWN_ACK_SENT also
  * clears the SHUTDOWN_PENDING substate.  With KDTRACE_HOOKS a
  * state__change probe fires when the base state actually changes, except
  * for the EMPTY -> INUSE transition.
  */
 void
 sctp_set_state(struct sctp_tcb *stcb, int new_state)
 {
 #if defined(KDTRACE_HOOKS)
 	int prev_state = stcb->asoc.state;
 	int prev_base;
 #endif
 	int kept_bits;
 	int enters_shutdown;
 
 	KASSERT((new_state & ~SCTP_STATE_MASK) == 0,
 	    ("sctp_set_state: Can't set substate (new_state = %x)",
 	    new_state));
 	kept_bits = stcb->asoc.state & ~SCTP_STATE_MASK;
 	stcb->asoc.state = kept_bits | new_state;
 
 	enters_shutdown = (new_state == SCTP_STATE_SHUTDOWN_RECEIVED) ||
 	    (new_state == SCTP_STATE_SHUTDOWN_SENT) ||
 	    (new_state == SCTP_STATE_SHUTDOWN_ACK_SENT);
 	if (enters_shutdown) {
 		SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_SHUTDOWN_PENDING);
 	}
 #if defined(KDTRACE_HOOKS)
 	prev_base = prev_state & SCTP_STATE_MASK;
 	if ((prev_base != new_state) &&
 	    ((prev_base != SCTP_STATE_EMPTY) ||
 	    (new_state != SCTP_STATE_INUSE))) {
 		SCTP_PROBE6(state__change, NULL, stcb, NULL, stcb, NULL, prev_state);
 	}
 #endif
 }
 
 /*
  * OR the given substate bits into an association's state.  substate must
  * not contain any bits inside SCTP_STATE_MASK.
  *
  * With KDTRACE_HOOKS a state__change probe fires when the call requests
  * ABOUT_TO_BE_FREED or SHUTDOWN_PENDING and that substate was not set
  * before.
  */
 void
 sctp_add_substate(struct sctp_tcb *stcb, int substate)
 {
 #if defined(KDTRACE_HOOKS)
 	int prev_state = stcb->asoc.state;
 	int freed_now, pending_now;
 #endif
 
 	KASSERT((substate & SCTP_STATE_MASK) == 0,
 	    ("sctp_add_substate: Can't set state (substate = %x)",
 	    substate));
 	stcb->asoc.state |= substate;
 #if defined(KDTRACE_HOOKS)
 	freed_now = (substate & SCTP_STATE_ABOUT_TO_BE_FREED) &&
 	    ((prev_state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0);
 	pending_now = (substate & SCTP_STATE_SHUTDOWN_PENDING) &&
 	    ((prev_state & SCTP_STATE_SHUTDOWN_PENDING) == 0);
 	if (freed_now || pending_now) {
 		SCTP_PROBE6(state__change, NULL, stcb, NULL, stcb, NULL, prev_state);
 	}
 #endif
 }
Index: projects/clang1100-import/sys/netinet/sctputil.h
===================================================================
--- projects/clang1100-import/sys/netinet/sctputil.h	(revision 364278)
+++ projects/clang1100-import/sys/netinet/sctputil.h	(revision 364279)
@@ -1,380 +1,380 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
  * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * a) Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
  *
  * b) Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the distribution.
  *
  * c) Neither the name of Cisco Systems, Inc. nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifndef _NETINET_SCTP_UTIL_H_
 #define _NETINET_SCTP_UTIL_H_
 
 #if defined(_KERNEL) || defined(__Userspace__)
 
 #define SCTP_READ_LOCK_HELD 1
 #define SCTP_READ_LOCK_NOT_HELD 0
 
 #ifdef SCTP_ASOCLOG_OF_TSNS
 void sctp_print_out_track_log(struct sctp_tcb *stcb);
 #endif
 
 #ifdef SCTP_MBUF_LOGGING
 struct mbuf *sctp_m_free(struct mbuf *m);
 void sctp_m_freem(struct mbuf *m);
 #else
 #define sctp_m_free m_free
 #define sctp_m_freem m_freem
 #endif
 
 #if defined(SCTP_LOCAL_TRACE_BUF)
 void
      sctp_log_trace(uint32_t fr, const char *str SCTP_UNUSED, uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e, uint32_t f);
 #endif
 
 /*
  * Association id stored in a TCB, cast to sctp_assoc_t.  The argument is
  * parenthesized so that any pointer-valued expression (e.g. &tcb) can be
  * passed, not just a plain identifier.
  */
 #define sctp_get_associd(stcb) ((sctp_assoc_t)(stcb)->asoc.assoc_id)
 
 
 /*
  * Function prototypes
  */
 int32_t
         sctp_map_assoc_state(int);
 
 uint32_t
          sctp_get_ifa_hash_val(struct sockaddr *addr);
 
 struct sctp_ifa *sctp_find_ifa_in_ep(struct sctp_inpcb *inp, struct sockaddr *addr, int hold_lock);
 
 struct sctp_ifa *sctp_find_ifa_by_addr(struct sockaddr *addr, uint32_t vrf_id, int holds_lock);
 
 uint32_t sctp_select_initial_TSN(struct sctp_pcb *);
 
 uint32_t sctp_select_a_tag(struct sctp_inpcb *, uint16_t lport, uint16_t rport, int);
 
 int sctp_init_asoc(struct sctp_inpcb *, struct sctp_tcb *, uint32_t, uint32_t, uint16_t);
 
 void sctp_fill_random_store(struct sctp_pcb *);
 
 void
 sctp_notify_stream_reset_add(struct sctp_tcb *stcb, uint16_t numberin,
     uint16_t numberout, int flag);
 void
      sctp_notify_stream_reset_tsn(struct sctp_tcb *stcb, uint32_t sending_tsn, uint32_t recv_tsn, int flag);
 
 /*
  * NOTE: sctp_timer_start() will increment the reference count of any relevant
  * structure the timer is referencing, in order to prevent a race condition
  * between the timer executing and the structure being freed.
  *
  * When the timer fires or sctp_timer_stop() is called, these references are
  * removed.
  */
 void
 sctp_timer_start(int, struct sctp_inpcb *, struct sctp_tcb *,
     struct sctp_nets *);
 
 void
 sctp_timer_stop(int, struct sctp_inpcb *, struct sctp_tcb *,
     struct sctp_nets *, uint32_t);
 
 int
     sctp_dynamic_set_primary(struct sockaddr *sa, uint32_t vrf_id);
 
 void
      sctp_mtu_size_reset(struct sctp_inpcb *, struct sctp_association *, uint32_t);
 
 void
 sctp_wakeup_the_read_socket(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
     int so_locked
     SCTP_UNUSED
 );
 
 void
 sctp_add_to_readq(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     struct sctp_queued_to_read *control,
     struct sockbuf *sb,
     int end,
     int inpread_locked,
     int so_locked);
 
 void sctp_iterator_worker(void);
 
 uint32_t sctp_get_prev_mtu(uint32_t);
 uint32_t sctp_get_next_mtu(uint32_t);
 
 void
      sctp_timeout_handler(void *);
 
 int
 sctp_calculate_rto(struct sctp_tcb *, struct sctp_association *,
     struct sctp_nets *, struct timeval *, int);
 
 uint32_t sctp_calculate_len(struct mbuf *);
 
 caddr_t sctp_m_getptr(struct mbuf *, int, int, uint8_t *);
 
 struct sctp_paramhdr *
 sctp_get_next_param(struct mbuf *, int,
     struct sctp_paramhdr *, int);
 
 struct mbuf *sctp_add_pad_tombuf(struct mbuf *, int);
 
 struct mbuf *sctp_pad_lastmbuf(struct mbuf *, int, struct mbuf *);
 
 void sctp_ulp_notify(uint32_t, struct sctp_tcb *, uint32_t, void *, int);
 
 void
 sctp_pull_off_control_to_new_inp(struct sctp_inpcb *old_inp,
     struct sctp_inpcb *new_inp,
     struct sctp_tcb *stcb, int waitflags);
 
 
 void sctp_stop_timers_for_shutdown(struct sctp_tcb *);
 
 /* Stop all timers for association and remote addresses. */
 void sctp_stop_association_timers(struct sctp_tcb *, bool);
 
-void sctp_report_all_outbound(struct sctp_tcb *, uint16_t, int, int);
+void sctp_report_all_outbound(struct sctp_tcb *, uint16_t, int);
 
 int sctp_expand_mapping_array(struct sctp_association *, uint32_t);
 
 void
 sctp_abort_notification(struct sctp_tcb *, uint8_t, uint16_t,
     struct sctp_abort_chunk *, int);
 
 /* We abort responding to an IP packet for some reason */
 void
 sctp_abort_association(struct sctp_inpcb *, struct sctp_tcb *, struct mbuf *,
     int, struct sockaddr *, struct sockaddr *,
     struct sctphdr *, struct mbuf *,
     uint8_t, uint32_t,
     uint32_t, uint16_t);
 
 
 /* We choose to abort via user input */
 void
 sctp_abort_an_association(struct sctp_inpcb *, struct sctp_tcb *,
     struct mbuf *, int);
 
 void
 sctp_handle_ootb(struct mbuf *, int, int,
     struct sockaddr *, struct sockaddr *,
     struct sctphdr *, struct sctp_inpcb *,
     struct mbuf *,
     uint8_t, uint32_t, uint16_t,
     uint32_t, uint16_t);
 
 int
 sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr,
     int totaddr, int *error);
 
 int
 sctp_connectx_helper_find(struct sctp_inpcb *, struct sockaddr *,
     unsigned int, unsigned int *, unsigned int *, unsigned int);
 
 int sctp_is_there_an_abort_here(struct mbuf *, int, uint32_t *);
 #ifdef INET6
 uint32_t sctp_is_same_scope(struct sockaddr_in6 *, struct sockaddr_in6 *);
 
 struct sockaddr_in6 *sctp_recover_scope(struct sockaddr_in6 *, struct sockaddr_in6 *);
 
 /*
  * For an AF_INET6 link-local address, copy *addr into *store and redirect
  * addr to store:
  *  - if addr's sin6_scope_id is 0, sa6_recoverscope() is called on store and
  *    addr is redirected only when it returns 0;
  *  - otherwise in6_clearscope() is applied to the original addr's sin6_addr
  *    (after the copy was taken) and addr is always redirected.
  * Any other address leaves both arguments untouched.
  *
  * addr must be a modifiable pointer lvalue.  Both arguments are evaluated
  * more than once, so they must be free of side effects.
  */
 #define sctp_recover_scope_mac(addr, store) do { \
 	 if (((addr)->sin6_family == AF_INET6) && \
 	     (IN6_IS_SCOPE_LINKLOCAL(&(addr)->sin6_addr))) { \
 		*(store) = *(addr); \
 		if ((addr)->sin6_scope_id == 0) { \
 			if (!sa6_recoverscope(store)) { \
 				(addr) = (store); \
 			} \
 		} else { \
 			in6_clearscope(&(addr)->sin6_addr); \
 			(addr) = (store); \
 		} \
 	 } \
 } while (0)
 #endif
 
 int sctp_cmpaddr(struct sockaddr *, struct sockaddr *);
 
 void sctp_print_address(struct sockaddr *);
 
 int
 sctp_release_pr_sctp_chunk(struct sctp_tcb *, struct sctp_tmit_chunk *,
     uint8_t, int);
 
 struct mbuf *sctp_generate_cause(uint16_t, char *);
 struct mbuf *sctp_generate_no_user_data_cause(uint32_t);
 
 void
 sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp,
     struct sockaddr *sa, uint32_t vrf_id, int *error,
     void *p);
 void
 sctp_bindx_delete_address(struct sctp_inpcb *inp, struct sockaddr *sa,
     uint32_t vrf_id, int *error);
 
 int sctp_local_addr_count(struct sctp_tcb *stcb);
 
 #ifdef SCTP_MBCNT_LOGGING
 void
 sctp_free_bufspace(struct sctp_tcb *, struct sctp_association *,
     struct sctp_tmit_chunk *, int);
 
 #else
 /*
  * Macro form used when SCTP_MBCNT_LOGGING is not defined.
  *
  * If tp1->data is non-NULL: subtract chk_cnt from asoc->chunks_on_out_queue
  * and tp1->book_size from asoc->total_output_queue_size (set to 0 instead
  * when the counter is smaller than book_size).  When the TCB has a socket
  * and its endpoint has SCTP_PCB_FLAGS_TCPTYPE or SCTP_PCB_FLAGS_IN_TCPPOOL
  * set, the socket's so_snd.sb_cc is reduced the same way.
  *
  * Every argument is parenthesized so pointer expressions such as &chk are
  * accepted.  Arguments are evaluated more than once and must be free of
  * side effects.
  */
 #define sctp_free_bufspace(stcb, asoc, tp1, chk_cnt)  \
 do { \
 	if ((tp1)->data != NULL) { \
 		atomic_subtract_int(&((asoc)->chunks_on_out_queue), (chk_cnt)); \
 		if ((asoc)->total_output_queue_size >= (tp1)->book_size) { \
 			atomic_subtract_int(&((asoc)->total_output_queue_size), (tp1)->book_size); \
 		} else { \
 			(asoc)->total_output_queue_size = 0; \
 		} \
 		if ((stcb)->sctp_socket && (((stcb)->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \
 		    ((stcb)->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \
 			if ((stcb)->sctp_socket->so_snd.sb_cc >= (tp1)->book_size) { \
 				atomic_subtract_int(&((stcb)->sctp_socket->so_snd.sb_cc), (tp1)->book_size); \
 			} else { \
 				(stcb)->sctp_socket->so_snd.sb_cc = 0; \
 			} \
 		} \
 	} \
 } while (0)
 
 #endif
 
 /*
  * If sp->data is non-NULL: subtract sp->length from
  * asoc->total_output_queue_size (set to 0 instead when the counter is
  * smaller than sp->length).  When the TCB has a socket and its endpoint has
  * SCTP_PCB_FLAGS_TCPTYPE or SCTP_PCB_FLAGS_IN_TCPPOOL set, the socket's
  * so_snd.sb_cc is reduced the same way.
  *
  * Every argument is parenthesized so pointer expressions such as &sp are
  * accepted.  Arguments are evaluated more than once and must be free of
  * side effects.
  */
 #define sctp_free_spbufspace(stcb, asoc, sp)  \
 do { \
 	if ((sp)->data != NULL) { \
 		if ((asoc)->total_output_queue_size >= (sp)->length) { \
 			atomic_subtract_int(&(asoc)->total_output_queue_size, (sp)->length); \
 		} else { \
 			(asoc)->total_output_queue_size = 0; \
 		} \
 		if ((stcb)->sctp_socket && (((stcb)->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \
 		    ((stcb)->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \
 			if ((stcb)->sctp_socket->so_snd.sb_cc >= (sp)->length) { \
 				atomic_subtract_int(&(stcb)->sctp_socket->so_snd.sb_cc, (sp)->length); \
 			} else { \
 				(stcb)->sctp_socket->so_snd.sb_cc = 0; \
 			} \
 		} \
 	} \
 } while (0)
 
 /*
  * Add sz to stcb->asoc.total_output_queue_size and, when the TCB has a
  * socket and its endpoint has SCTP_PCB_FLAGS_TCPTYPE or
  * SCTP_PCB_FLAGS_IN_TCPPOOL set, to the socket's so_snd.sb_cc as well.
  *
  * Both arguments are parenthesized so expressions such as &tcb are accepted.
  * They are evaluated more than once and must be free of side effects.
  */
 #define sctp_snd_sb_alloc(stcb, sz)  \
 do { \
 	atomic_add_int(&(stcb)->asoc.total_output_queue_size, (sz)); \
 	if (((stcb)->sctp_socket != NULL) && \
 	    (((stcb)->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \
 	     ((stcb)->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \
 		atomic_add_int(&(stcb)->sctp_socket->so_snd.sb_cc, (sz)); \
 	} \
 } while (0)
 
 /* functions to start/stop udp tunneling */
 void sctp_over_udp_stop(void);
 int sctp_over_udp_start(void);
 
 int
 sctp_soreceive(struct socket *so, struct sockaddr **psa,
     struct uio *uio,
     struct mbuf **mp0,
     struct mbuf **controlp,
     int *flagsp);
 
 void
      sctp_misc_ints(uint8_t from, uint32_t a, uint32_t b, uint32_t c, uint32_t d);
 
 void
 sctp_wakeup_log(struct sctp_tcb *stcb,
     uint32_t wake_cnt, int from);
 
 void sctp_log_strm_del_alt(struct sctp_tcb *stcb, uint32_t, uint16_t, uint16_t, int);
 
 void sctp_log_nagle_event(struct sctp_tcb *stcb, int action);
 
 
 #ifdef SCTP_MBUF_LOGGING
 void
      sctp_log_mb(struct mbuf *m, int from);
 
 void
      sctp_log_mbc(struct mbuf *m, int from);
 #endif
 
 void
 sctp_sblog(struct sockbuf *sb,
     struct sctp_tcb *stcb, int from, int incr);
 
 void
 sctp_log_strm_del(struct sctp_queued_to_read *control,
     struct sctp_queued_to_read *poschk,
     int from);
 void sctp_log_cwnd(struct sctp_tcb *stcb, struct sctp_nets *, int, uint8_t);
 void rto_logging(struct sctp_nets *net, int from);
 
 void sctp_log_closing(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int16_t loc);
 
 void sctp_log_lock(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint8_t from);
 void sctp_log_maxburst(struct sctp_tcb *stcb, struct sctp_nets *, int, int, uint8_t);
 void sctp_log_block(uint8_t, struct sctp_association *, ssize_t);
 void sctp_log_rwnd(uint8_t, uint32_t, uint32_t, uint32_t);
 void sctp_log_rwnd_set(uint8_t, uint32_t, uint32_t, uint32_t, uint32_t);
 int sctp_fill_stat_log(void *, size_t *);
 void sctp_log_fr(uint32_t, uint32_t, uint32_t, int);
 void sctp_log_sack(uint32_t, uint32_t, uint32_t, uint16_t, uint16_t, int);
 void sctp_log_map(uint32_t, uint32_t, uint32_t, int);
 void sctp_print_mapping_array(struct sctp_association *asoc);
 void sctp_clr_stat_log(void);
 
 
 #ifdef SCTP_AUDITING_ENABLED
 void
 sctp_auditing(int, struct sctp_inpcb *, struct sctp_tcb *,
     struct sctp_nets *);
 void sctp_audit_log(uint8_t, uint8_t);
 
 #endif
 uint32_t sctp_min_mtu(uint32_t, uint32_t, uint32_t);
 void sctp_hc_set_mtu(union sctp_sockstore *, uint16_t, uint32_t);
 uint32_t sctp_hc_get_mtu(union sctp_sockstore *, uint16_t);
 void sctp_set_state(struct sctp_tcb *, int);
 void sctp_add_substate(struct sctp_tcb *, int);
 uint32_t sctp_ticks_to_msecs(uint32_t);
 uint32_t sctp_msecs_to_ticks(uint32_t);
 uint32_t sctp_ticks_to_secs(uint32_t);
 uint32_t sctp_secs_to_ticks(uint32_t);
 
 #endif				/* _KERNEL */
 #endif
Index: projects/clang1100-import/sys/sys/namei.h
===================================================================
--- projects/clang1100-import/sys/sys/namei.h	(revision 364278)
+++ projects/clang1100-import/sys/sys/namei.h	(revision 364279)
@@ -1,276 +1,277 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1985, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)namei.h	8.5 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_NAMEI_H_
 #define	_SYS_NAMEI_H_
 
 #include <sys/caprights.h>
 #include <sys/filedesc.h>
 #include <sys/queue.h>
 #include <sys/_uio.h>
 
 /*
  * Per-component lookup parameters.  struct nameidata embeds one of these as
  * ni_cnd; it is the subset of the nameidata information that is passed
  * through the VOP interface.
  */
 struct componentname {
 	/*
 	 * Arguments to lookup.
 	 */
 	u_long	cn_nameiop;	/* namei operation */
 	u_int64_t cn_flags;	/* flags to namei */
 	struct	thread *cn_thread;/* thread requesting lookup */
 	struct	ucred *cn_cred;	/* credentials */
 	int	cn_lkflags;	/* Lock flags LK_EXCLUSIVE or LK_SHARED */
 	/*
 	 * Shared between lookup and commit routines.
 	 */
 	char	*cn_pnbuf;	/* pathname buffer */
 	char	*cn_nameptr;	/* pointer to looked up name */
 	long	cn_namelen;	/* length of looked up component */
 };
 
 struct nameicap_tracker;
 TAILQ_HEAD(nameicap_tracker_head, nameicap_tracker);
 
 /*
  * Encapsulation of namei parameters.
  *
  * Set up with one of the NDINIT*() macros before being handed to namei();
  * the per-component part that goes through the VOP interface lives in the
  * embedded ni_cnd.
  */
 struct nameidata {
 	/*
 	 * Arguments to namei/lookup.
 	 */
 	const	char *ni_dirp;		/* pathname pointer */
 	enum	uio_seg ni_segflg;	/* location of pathname */
 	cap_rights_t *ni_rightsneeded;	/* rights required to look up vnode */
 	/*
 	 * Arguments to lookup.
 	 */
 	struct  vnode *ni_startdir;	/* starting directory */
 	struct	vnode *ni_rootdir;	/* logical root directory */
 	struct	vnode *ni_topdir;	/* logical top directory */
 	int	ni_dirfd;		/* starting directory for *at functions */
 	int	ni_lcf;			/* local call flags */
 	/*
 	 * Results: returned from namei
 	 */
 	struct filecaps ni_filecaps;	/* rights the *at base has */
 	/*
 	 * Results: returned from/manipulated by lookup
 	 */
 	struct	vnode *ni_vp;		/* vnode of result */
 	struct	vnode *ni_dvp;		/* vnode of intermediate directory */
 	/*
 	 * Results: flags returned from namei
 	 */
 	u_int	ni_resflags;		/* NIRES_* flags */
 	/*
 	 * Shared between namei and lookup/commit routines.
 	 */
 	size_t	ni_pathlen;		/* remaining chars in path */
 	char	*ni_next;		/* next location in pathname */
 	u_int	ni_loopcnt;		/* count of symlinks encountered */
 	/*
 	 * Lookup parameters: this structure describes the subset of
 	 * information from the nameidata structure that is passed
 	 * through the VOP interface.
 	 */
 	struct componentname ni_cnd;
 	/* Tail queue head of struct nameicap_tracker entries. */
 	struct nameicap_tracker_head ni_cap_tracker;
 	/* Valid only while NI_LCF_LATCH is set in ni_lcf. */
 	struct vnode *ni_beneath_latch;
 };
 
 #ifdef _KERNEL
 
 enum cache_fpl_status { CACHE_FPL_STATUS_ABORTED, CACHE_FPL_STATUS_PARTIAL,
     CACHE_FPL_STATUS_HANDLED, CACHE_FPL_STATUS_UNSET };
 int	cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
     struct pwd **pwdp);
 
 /*
  * namei operations
  */
 #define	LOOKUP		0	/* perform name lookup only */
 #define	CREATE		1	/* setup for file creation */
 #define	DELETE		2	/* setup for file deletion */
 #define	RENAME		3	/* setup for file renaming */
 #define	OPMASK		3	/* mask for operation */
 /*
  * namei operational modifier flags, stored in ni_cnd.cn_flags
  */
 #define	LOCKLEAF	0x0004	/* lock vnode on return */
 #define	LOCKPARENT	0x0008	/* want parent vnode returned locked */
 #define	WANTPARENT	0x0010	/* want parent vnode returned unlocked */
 #define	NOCACHE		0x0020	/* name must not be left in cache */
 #define	FOLLOW		0x0040	/* follow symbolic links */
 #define	BENEATH		0x0080	/* No escape from the start dir */
 #define	LOCKSHARED	0x0100	/* Shared lock leaf */
 #define	NOFOLLOW	0x0000	/* do not follow symbolic links (pseudo) */
 #define	MODMASK		0x01fc	/* mask of operational modifiers */
 /*
  * Namei parameter descriptors.
  *
  * SAVENAME may be set by either the callers of namei or by VOP_LOOKUP.
  * If the caller of namei sets the flag (for example execve wants to
  * know the name of the program that is being executed), then it must
  * free the buffer. If VOP_LOOKUP sets the flag, then the buffer must
  * be freed by either the commit routine or the VOP_ABORT routine.
  * SAVESTART is set only by the callers of namei. It implies SAVENAME
  * plus the addition of saving the parent directory that contains the
  * name in ni_startdir. It allows repeated calls to lookup for the
  * name being sought. The caller is responsible for releasing the
  * buffer and for vrele'ing ni_startdir.
  */
 #define	RDONLY		0x00000200 /* lookup with read-only semantics */
-#define	HASBUF		0x00000400 /* has allocated pathname buffer */
-#define	SAVENAME	0x00000800 /* save pathname buffer */
-#define	SAVESTART	0x00001000 /* save starting directory */
-#define	ISWHITEOUT	0x00002000 /* found whiteout */
-#define	DOWHITEOUT	0x00004000 /* do whiteouts */
-#define	WILLBEDIR	0x00008000 /* new files will be dirs; allow trailing / */
-#define	ISOPEN		0x00010000 /* caller is opening; return a real vnode. */
-#define	NOCROSSMOUNT	0x00020000 /* do not cross mount points */
-#define	NOMACCHECK	0x00040000 /* do not perform MAC checks */
-#define	AUDITVNODE1	0x00080000 /* audit the looked up vnode information */
-#define	AUDITVNODE2	0x00100000 /* audit the looked up vnode information */
-#define	NOCAPCHECK	0x00200000 /* do not perform capability checks */
+#define	SAVENAME	0x00000400 /* save pathname buffer */
+#define	SAVESTART	0x00000800 /* save starting directory */
+#define	ISWHITEOUT	0x00001000 /* found whiteout */
+#define	DOWHITEOUT	0x00002000 /* do whiteouts */
+#define	WILLBEDIR	0x00004000 /* new files will be dirs; allow trailing / */
+#define	ISOPEN		0x00008000 /* caller is opening; return a real vnode. */
+#define	NOCROSSMOUNT	0x00010000 /* do not cross mount points */
+#define	NOMACCHECK	0x00020000 /* do not perform MAC checks */
+#define	AUDITVNODE1	0x00040000 /* audit the looked up vnode information */
+#define	AUDITVNODE2	0x00080000 /* audit the looked up vnode information */
+#define	NOCAPCHECK	0x00100000 /* do not perform capability checks */
 /* UNUSED		0x00400000 */
+/* UNUSED		0x00200000 */
 /* UNUSED		0x00800000 */
-/* UNUSED		0x01000000 */
+#define	HASBUF		0x01000000 /* has allocated pathname buffer */
 #define	NOEXECCHECK	0x02000000 /* do not perform exec check on dir */
 #define	MAKEENTRY	0x04000000 /* entry is to be added to name cache */
 #define	ISSYMLINK	0x08000000 /* symlink needs interpretation */
 #define	ISLASTCN	0x10000000 /* this is last component of pathname */
 #define	ISDOTDOT	0x20000000 /* current component name is .. */
 #define	TRAILINGSLASH	0x40000000 /* path ended in a slash */
 #define	PARAMASK	0x7ffffe00 /* mask of parameter descriptors */
 
 /*
  * Flags which must not be passed in by callers.
  */
 #define NAMEI_INTERNAL_FLAGS	\
-	(NOEXECCHECK | MAKEENTRY | ISSYMLINK | ISLASTCN | ISDOTDOT | TRAILINGSLASH)
+	(HASBUF | NOEXECCHECK | MAKEENTRY | ISSYMLINK | ISLASTCN | ISDOTDOT | \
+	 TRAILINGSLASH)
 
 /*
  * Namei results flags
  */
 #define	NIRES_ABS	0x00000001 /* Path was absolute */
 
 /*
  * Flags in ni_lcf, valid for the duration of the namei call.
  */
 #define	NI_LCF_STRICTRELATIVE	0x0001	/* relative lookup only */
 #define	NI_LCF_CAP_DOTDOT	0x0002	/* ".." in strictrelative case */
 #define	NI_LCF_BENEATH_ABS	0x0004	/* BENEATH with absolute path */
 #define	NI_LCF_BENEATH_LATCHED	0x0008	/* BENEATH_ABS traversed starting dir */
 #define	NI_LCF_LATCH		0x0010	/* ni_beneath_latch valid */
 
 /*
  * Initialization of a nameidata structure.
  */
 #define	NDINIT(ndp, op, flags, segflg, namep, td)			\
 	NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, NULL, &cap_no_rights, td)
 #define	NDINIT_AT(ndp, op, flags, segflg, namep, dirfd, td)		\
 	NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, &cap_no_rights, td)
 #define	NDINIT_ATRIGHTS(ndp, op, flags, segflg, namep, dirfd, rightsp, td) \
 	NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, rightsp, td)
 #define	NDINIT_ATVP(ndp, op, flags, segflg, namep, vp, td)		\
 	NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, vp, &cap_no_rights, td)
 
 /*
  * Note the constant pattern may *hide* bugs.
  */
 #ifdef INVARIANTS
 #define NDINIT_PREFILL(arg)	memset(arg, 0xff, sizeof(*arg))
 #else
 #define NDINIT_PREFILL(arg)	do { } while (0)
 #endif
 
 #define NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, startdir, rightsp, td)	\
 do {										\
 	struct nameidata *_ndp = (ndp);						\
 	cap_rights_t *_rightsp = (rightsp);					\
 	MPASS(_rightsp != NULL);						\
 	NDINIT_PREFILL(_ndp);							\
 	_ndp->ni_cnd.cn_nameiop = op;						\
 	_ndp->ni_cnd.cn_flags = flags;						\
 	_ndp->ni_segflg = segflg;						\
 	_ndp->ni_dirp = namep;							\
 	_ndp->ni_dirfd = dirfd;							\
 	_ndp->ni_startdir = startdir;						\
 	_ndp->ni_resflags = 0;							\
 	filecaps_init(&_ndp->ni_filecaps);					\
 	_ndp->ni_cnd.cn_thread = td;						\
 	_ndp->ni_rightsneeded = _rightsp;					\
 } while (0)
 
 #define NDF_NO_DVP_RELE		0x00000001
 #define NDF_NO_DVP_UNLOCK	0x00000002
 #define NDF_NO_DVP_PUT		0x00000003
 #define NDF_NO_VP_RELE		0x00000004
 #define NDF_NO_VP_UNLOCK	0x00000008
 #define NDF_NO_VP_PUT		0x0000000c
 #define NDF_NO_STARTDIR_RELE	0x00000010
 #define NDF_NO_FREE_PNBUF	0x00000020
 #define NDF_ONLY_PNBUF		(~NDF_NO_FREE_PNBUF)
 
 void NDFREE_PNBUF(struct nameidata *);
 void NDFREE(struct nameidata *, const u_int);
 /*
  * Macro wrapper around the NDFREE() function: when the flags argument is a
  * compile-time constant equal to NDF_ONLY_PNBUF, call NDFREE_PNBUF()
  * directly; otherwise call NDFREE() with the given flags.
  *
  * flags is parenthesized in the comparison so that a compound argument such
  * as (A | B) is compared as a whole.  Without the parentheses it would parse
  * as A | (B == NDF_ONLY_PNBUF), which is non-zero whenever A is and would
  * wrongly select the NDFREE_PNBUF() path.
  */
 #define NDFREE(ndp, flags) do {						\
 	struct nameidata *_ndp = (ndp);					\
 	if (__builtin_constant_p(flags) && (flags) == NDF_ONLY_PNBUF)	\
 		NDFREE_PNBUF(_ndp);					\
 	else								\
 		NDFREE(_ndp, flags);					\
 } while (0)
 
 int	namei(struct nameidata *ndp);
 int	lookup(struct nameidata *ndp);
 int	relookup(struct vnode *dvp, struct vnode **vpp,
 	    struct componentname *cnp);
 #endif
 
 /*
  * Stats on usefulness of namei caches.
  */
 struct nchstats {
 	long	ncs_goodhits;		/* hits that we can really use */
 	long	ncs_neghits;		/* negative hits that we can use */
 	long	ncs_badhits;		/* hits we must drop */
 	long	ncs_falsehits;		/* hits with id mismatch */
 	long	ncs_miss;		/* misses */
 	long	ncs_long;		/* long names that ignore cache */
 	long	ncs_pass2;		/* names found with passes == 2 */
 	long	ncs_2passes;		/* number of times we attempt it */
 };
 
 extern struct nchstats nchstats;
 
 #endif /* !_SYS_NAMEI_H_ */
Index: projects/clang1100-import/sys/sys/param.h
===================================================================
--- projects/clang1100-import/sys/sys/param.h	(revision 364278)
+++ projects/clang1100-import/sys/sys/param.h	(revision 364279)
@@ -1,370 +1,370 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)param.h	8.3 (Berkeley) 4/4/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_PARAM_H_
 #define _SYS_PARAM_H_
 
 #include <sys/_null.h>
 
 #define	BSD	199506		/* System version (year & month). */
 #define BSD4_3	1
 #define BSD4_4	1
 
 /*
  * __FreeBSD_version numbers are documented in the Porter's Handbook.
  * If you bump the version for any reason, you should update the documentation
  * there.
  * Currently this lives here in the doc/ repository:
  *
  *	head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml
  *
  * scheme is:  <major><two digit minor>Rxx
  *		'R' is in the range 0 to 4 if this is a release branch or
  *		X.0-CURRENT before releng/X.0 is created, otherwise 'R' is
  *		in the range 5 to 9.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 1300108	/* Master, propagated to newvers */
+#define __FreeBSD_version 1300109	/* Master, propagated to newvers */
 
 /*
  * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
  * which by definition is always true on FreeBSD. This macro is also defined
  * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD.
  *
  * It is tempting to use this macro in userland code when we want to enable
  * kernel-specific routines, and in fact it's fine to do this in code that
  * is part of FreeBSD itself.  However, be aware that as presence of this
  * macro is still not widespread (e.g. older FreeBSD versions, 3rd party
  * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in
  * external applications without also checking for __FreeBSD__ as an
  * alternative.
  */
 #undef __FreeBSD_kernel__
 #define __FreeBSD_kernel__
 
 #if defined(_KERNEL) || defined(IN_RTLD)
 #define	P_OSREL_SIGWAIT			700000
 #define	P_OSREL_SIGSEGV			700004
 #define	P_OSREL_MAP_ANON		800104
 #define	P_OSREL_MAP_FSTRICT		1100036
 #define	P_OSREL_SHUTDOWN_ENOTCONN	1100077
 #define	P_OSREL_MAP_GUARD		1200035
 #define	P_OSREL_WRFSBASE		1200041
 #define	P_OSREL_CK_CYLGRP		1200046
 #define	P_OSREL_VMTOTAL64		1200054
 #define	P_OSREL_CK_SUPERBLOCK		1300000
 #define	P_OSREL_CK_INODE		1300005
 #define	P_OSREL_POWERPC_NEW_AUX_ARGS	1300070
 
 #define	P_OSREL_MAJOR(x)		((x) / 100000)
 #endif
 
 #ifndef LOCORE
 #include <sys/types.h>
 #endif
 
 /*
  * Machine-independent constants (some used in following include files).
  * Redefined constants are from POSIX 1003.1 limits file.
  *
  * MAXCOMLEN should be >= sizeof(ac_comm) (see <acct.h>)
  */
 #include <sys/syslimits.h>
 
 #define	MAXCOMLEN	19		/* max command name remembered */
 #define	MAXINTERP	PATH_MAX	/* max interpreter file name length */
 #define	MAXLOGNAME	33		/* max login name length (incl. NUL) */
 #define	MAXUPRC		CHILD_MAX	/* max simultaneous processes */
 #define	NCARGS		ARG_MAX		/* max bytes for an exec function */
 #define	NGROUPS		(NGROUPS_MAX+1)	/* max number groups */
 #define	NOFILE		OPEN_MAX	/* max open files per process */
 #define	NOGROUP		65535		/* marker for empty group set member */
 #define MAXHOSTNAMELEN	256		/* max hostname size */
 #define SPECNAMELEN	255		/* max length of devicename */
 
 /* More types and definitions used throughout the kernel. */
 #ifdef _KERNEL
 #include <sys/cdefs.h>
 #include <sys/errno.h>
 #ifndef LOCORE
 #include <sys/time.h>
 #include <sys/priority.h>
 #endif
 
 #ifndef FALSE
 #define	FALSE	0
 #endif
 #ifndef TRUE
 #define	TRUE	1
 #endif
 #endif
 
 #ifndef _KERNEL
 #ifndef LOCORE
 /* Signals. */
 #include <sys/signal.h>
 #endif
 #endif
 
 /* Machine type dependent parameters. */
 #include <machine/param.h>
 #ifndef _KERNEL
 #include <sys/limits.h>
 #endif
 
 #ifndef DEV_BSHIFT
 #define	DEV_BSHIFT	9		/* log2(DEV_BSIZE) */
 #endif
 #define	DEV_BSIZE	(1<<DEV_BSHIFT)
 
 #ifndef BLKDEV_IOSIZE
 #define BLKDEV_IOSIZE  PAGE_SIZE	/* default block device I/O size */
 #endif
 #ifndef DFLTPHYS
 #define DFLTPHYS	(64 * 1024)	/* default max raw I/O transfer size */
 #endif
 #ifndef MAXPHYS
 #define MAXPHYS		(128 * 1024)	/* max raw I/O transfer size */
 #endif
 #ifndef MAXDUMPPGS
 #define MAXDUMPPGS	(DFLTPHYS/PAGE_SIZE)
 #endif
 
 /*
  * Constants related to network buffer management.
  * MCLBYTES must be no larger than PAGE_SIZE.
  */
 #ifndef	MSIZE
 #define	MSIZE		256		/* size of an mbuf */
 #endif
 
 #ifndef	MCLSHIFT
 #define MCLSHIFT	11		/* convert bytes to mbuf clusters */
 #endif	/* MCLSHIFT */
 
 #define MCLBYTES	(1 << MCLSHIFT)	/* size of an mbuf cluster */
 
 #if PAGE_SIZE < 2048
 #define	MJUMPAGESIZE	MCLBYTES
 #elif PAGE_SIZE <= 8192
 #define	MJUMPAGESIZE	PAGE_SIZE
 #else
 #define	MJUMPAGESIZE	(8 * 1024)
 #endif
 
 #define	MJUM9BYTES	(9 * 1024)	/* jumbo cluster 9k */
 #define	MJUM16BYTES	(16 * 1024)	/* jumbo cluster 16k */
 
 /*
  * Some macros for units conversion
  */
 
 /* clicks to bytes */
 #ifndef ctob
 #define ctob(x)	((x)<<PAGE_SHIFT)
 #endif
 
 /* bytes to clicks */
 #ifndef btoc
 #define btoc(x)	(((vm_offset_t)(x)+PAGE_MASK)>>PAGE_SHIFT)
 #endif
 
 /*
  * btodb() is messy and perhaps slow because `bytes' may be an off_t.  We
  * want to shift an unsigned type to avoid sign extension and we don't
  * want to widen `bytes' unnecessarily.  Assume that the result fits in
  * a daddr_t.
  */
 #ifndef btodb
 #define btodb(bytes)	 		/* calculates (bytes / DEV_BSIZE) */ \
 	(sizeof (bytes) > sizeof(long) \
 	 ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \
 	 : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT))
 #endif
 
 #ifndef dbtob
 #define dbtob(db)			/* calculates (db * DEV_BSIZE) */ \
 	((off_t)(db) << DEV_BSHIFT)
 #endif
 
 #define	PRIMASK	0x0ff
 #define	PCATCH	0x100		/* OR'd with pri for tsleep to check signals */
 #define	PDROP	0x200	/* OR'd with pri to stop re-entry of interlock mutex */
 
 #define	NZERO	0		/* default "nice" */
 
 #define	NBBY	8		/* number of bits in a byte */
 #define	NBPW	sizeof(int)	/* number of bytes per word (integer) */
 
 #define	CMASK	022		/* default file mask: S_IWGRP|S_IWOTH */
 
 #define	NODEV	(dev_t)(-1)	/* non-existent device */
 
 /*
  * File system parameters and macros.
  *
  * MAXBSIZE -	Filesystems are made out of blocks of at most MAXBSIZE bytes
  *		per block.  MAXBSIZE may be made larger without affecting
  *		any existing filesystems as long as it does not exceed MAXPHYS,
  *		and may be made smaller at the risk of not being able to use
  *		filesystems which require a block size exceeding MAXBSIZE.
  *
  * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache.  This must
  *		be >= MAXBSIZE and can be set differently for different
  *		architectures by defining it in <machine/param.h>.
  *		Making this larger allows NFS to do larger reads/writes.
  *
  * BKVASIZE -	Nominal buffer space per buffer, in bytes.  BKVASIZE is the
  *		minimum KVM memory reservation the kernel is willing to make.
  *		Filesystems can of course request smaller chunks.  Actual
  *		backing memory uses a chunk size of a page (PAGE_SIZE).
  *		The default value here can be overridden on a per-architecture
  *		basis by defining it in <machine/param.h>.
  *
  *		If you make BKVASIZE too small you risk seriously fragmenting
  *		the buffer KVM map which may slow things down a bit.  If you
  *		make it too big the kernel will not be able to optimally use
  *		the KVM memory reserved for the buffer cache and will wind
  *		up with too-few buffers.
  *
  *		The default is 16384, roughly 2x the block size used by a
  *		normal UFS filesystem.
  */
 #define MAXBSIZE	65536	/* must be power of 2 */
 #ifndef	MAXBCACHEBUF
 #define	MAXBCACHEBUF	MAXBSIZE /* must be a power of 2 >= MAXBSIZE */
 #endif
 #ifndef	BKVASIZE
 #define BKVASIZE	16384	/* must be power of 2 */
 #endif
 #define BKVAMASK	(BKVASIZE-1)
 
 /*
  * MAXPATHLEN defines the longest permissible path length after expanding
  * symbolic links. It is used to allocate a temporary buffer from the buffer
  * pool in which to do the name expansion, hence should be a power of two,
  * and must be less than or equal to MAXBSIZE.  MAXSYMLINKS defines the
  * maximum number of symbolic links that may be expanded in a path name.
  * It should be set high enough to allow all legitimate uses, but halt
  * infinite loops reasonably quickly.
  */
 #define	MAXPATHLEN	PATH_MAX
 #define MAXSYMLINKS	32
 
 /*
  * Bit map related macros.
  *
  * `a' is accessed as an array of unsigned char and `i' is a bit index into
  * it: byte i / NBBY, bit i % NBBY within that byte.  setbit() and clrbit()
  * modify the byte in place; isset() yields the masked byte (non-zero when
  * the bit is set, not necessarily 1) and isclr() yields 1 when it is clear.
  * `i' is evaluated twice by every macro here.
  */
 #define	setbit(a,i)	(((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY))
 #define	clrbit(a,i)	(((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY)))
 #define	isset(a,i)							\
 	(((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY)))
 #define	isclr(a,i)							\
 	((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0)
 
 /*
  * Macros for counting and rounding.
  *
  * These are textual macros: an argument may be expanded (and therefore
  * evaluated) more than once, so avoid arguments with side effects.
  *
  *	howmany(x, y)	- (x + y - 1) / y, i.e. number of y-sized units
  *			  needed to hold x.
  *	nitems(x)	- element count of x; x must be a real array, not a
  *			  pointer.
  *	rounddown/up	- round x down/up to a multiple of y; the "2" variants
  *			  use masking and require y to be a power of two.
  *	powerof2(x)	- true when x has at most one bit set; note that this
  *			  expression is also true for x == 0.
  */
 #ifndef howmany
 #define	howmany(x, y)	(((x)+((y)-1))/(y))
 #endif
 #define	nitems(x)	(sizeof((x)) / sizeof((x)[0]))
 #define	rounddown(x, y)	(((x)/(y))*(y))
 #define	rounddown2(x, y) ((x)&(~((y)-1)))          /* if y is power of two */
 #define	roundup(x, y)	((((x)+((y)-1))/(y))*(y))  /* to any y */
 #define	roundup2(x, y)	(((x)+((y)-1))&(~((y)-1))) /* if y is power of two */
 #define powerof2(x)	((((x)-1)&(x))==0)
 
 /*
  * Macros for min/max.
  *
  * The selected argument is expanded twice (once in the comparison, once as
  * the result), so do not pass expressions with side effects.
  */
 #define	MIN(a,b) (((a)<(b))?(a):(b))
 #define	MAX(a,b) (((a)>(b))?(a):(b))
 
 #ifdef _KERNEL
 /*
  * Basic byte order function prototypes for non-inline functions.
  */
 #ifndef LOCORE
 #ifndef _BYTEORDER_PROTOTYPED
 #define	_BYTEORDER_PROTOTYPED
 __BEGIN_DECLS
 __uint32_t	 htonl(__uint32_t);
 __uint16_t	 htons(__uint16_t);
 __uint32_t	 ntohl(__uint32_t);
 __uint16_t	 ntohs(__uint16_t);
 __END_DECLS
 #endif
 #endif
 
 #ifndef _BYTEORDER_FUNC_DEFINED
 #define	_BYTEORDER_FUNC_DEFINED
 #define	htonl(x)	__htonl(x)
 #define	htons(x)	__htons(x)
 #define	ntohl(x)	__ntohl(x)
 #define	ntohs(x)	__ntohs(x)
 #endif /* !_BYTEORDER_FUNC_DEFINED */
 #endif /* _KERNEL */
 
 /*
  * Scale factor for scaled integers used to count %cpu time and load avgs.
  *
  * The number of CPU `tick's that map to a unique `%age' can be expressed
  * by the formula (1 / (2 ^ (FSHIFT - 11))).  The maximum load average that
  * can be calculated (assuming 32 bits) can be closely approximated using
  * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15).
  *
  * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age',
  * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024.
  */
 #define	FSHIFT	11		/* bits to right of fixed binary point */
 #define FSCALE	(1<<FSHIFT)
 
 /*
  * Conversions between device blocks (units of 1 << DEV_BSHIFT bytes) and
  * pages (units of 1 << PAGE_SHIFT bytes).
  *
  * dbtoc() rounds up: a partial page's worth of device blocks counts as a
  * whole page.  ctodb() is an exact shift.  Both macros parenthesize their
  * argument so that an expression using a low-precedence operator (for
  * example `a | b') is converted as a whole.
  */
 #define dbtoc(db)			/* calculates devblks to pages */ \
 	(((db) + (ctodb(1) - 1)) >> (PAGE_SHIFT - DEV_BSHIFT))
 
 #define ctodb(db)			/* calculates pages to devblks */ \
 	((db) << (PAGE_SHIFT - DEV_BSHIFT))
 
 /*
  * Old spelling of __containerof().
  */
 #define	member2struct(s, m, x)						\
 	((struct s *)(void *)((char *)(x) - offsetof(struct s, m)))
 
 /*
  * Access a variable length array that has been declared as a fixed
  * length array.
  */
 #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset])
 
 #endif	/* _SYS_PARAM_H_ */
Index: projects/clang1100-import/sys/sys/vnode.h
===================================================================
--- projects/clang1100-import/sys/sys/vnode.h	(revision 364278)
+++ projects/clang1100-import/sys/sys/vnode.h	(revision 364279)
@@ -1,1070 +1,1070 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vnode.h	8.7 (Berkeley) 2/4/94
  * $FreeBSD$
  */
 
 #ifndef _SYS_VNODE_H_
 #define	_SYS_VNODE_H_
 
 #include <sys/bufobj.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/lockmgr.h>
 #include <sys/mutex.h>
 #include <sys/rangelock.h>
 #include <sys/selinfo.h>
 #include <sys/uio.h>
 #include <sys/acl.h>
 #include <sys/ktr.h>
 #include <sys/_seqc.h>
 
 /*
  * The vnode is the focus of all file activity in UNIX.  There is a
  * unique vnode allocated for each active file, each current directory,
  * each mounted-on file, text file, and the root.
  */
 
 /*
  * Vnode types.  VNON means no type.
  */
 enum vtype	{ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD,
 		  VMARKER };
 
 enum vgetstate	{ VGET_NONE, VGET_HOLDCNT, VGET_USECOUNT };
 /*
  * Each underlying filesystem allocates its own private area and hangs
  * it from v_data.  If non-null, this area is freed in getnewvnode().
  */
 
 struct namecache;
 
 /*
  * Poll/select bookkeeping for a vnode.  struct vnode refers to one of these
  * through its v_pollinfo pointer; vpi_lock protects the fields that follow
  * it.
  */
 struct vpollinfo {
 	struct	mtx vpi_lock;		/* lock to protect below */
 	struct	selinfo vpi_selinfo;	/* identity of poller(s) */
 	short	vpi_events;		/* what they are looking for */
 	short	vpi_revents;		/* what has happened */
 };
 
 /*
  * Reading or writing any of these items requires holding the appropriate lock.
  *
  * Lock reference:
  *	c - namecache mutex
  *	i - interlock
  *	l - mp mnt_listmtx or freelist mutex
  *	I - updated with atomics, 0->1 and 1->0 transitions with interlock held
  *	m - mount point interlock
  *	p - pollinfo lock
  *	u - Only a reference to the vnode is needed to read.
  *	v - vnode lock
  *
  * Vnodes may be found on many lists.  The general way to deal with operating
  * on a vnode that is on a list is:
  *	1) Lock the list and find the vnode.
  *	2) Lock interlock so that the vnode does not go away.
  *	3) Unlock the list to avoid lock order reversals.
  *	4) vget with LK_INTERLOCK and check for ENOENT, or
  *	5) Check for DOOMED if the vnode lock is not required.
  *	6) Perform your operation, then vput().
  */
 
 #if defined(_KERNEL) || defined(_KVM_VNODE)
 
 struct vnode {
 	/*
 	 * Fields which define the identity of the vnode.  These fields are
 	 * owned by the filesystem (XXX: and vgone() ?)
 	 */
 	enum	vtype v_type:8;			/* u vnode type */
 	short	v_irflag;			/* i frequently read flags */
 	seqc_t	v_seqc;				/* i modification count */
 	uint32_t v_nchash;			/* u namecache hash */
 	struct	vop_vector *v_op;		/* u vnode operations vector */
 	void	*v_data;			/* u private data for fs */
 
 	/*
 	 * Filesystem instance stuff
 	 */
 	struct	mount *v_mount;			/* u ptr to vfs we are in */
 	TAILQ_ENTRY(vnode) v_nmntvnodes;	/* m vnodes for mount point */
 
 	/*
 	 * Type specific fields, only one applies to any given vnode.
 	 */
 	union {
 		struct mount	*v_mountedhere;	/* v ptr to mountpoint (VDIR) */
 		struct unpcb	*v_unpcb;	/* v unix domain net (VSOCK) */
 		struct cdev	*v_rdev; 	/* v device (VCHR, VBLK) */
 		struct fifoinfo	*v_fifoinfo;	/* v fifo (VFIFO) */
 	};
 
 	/*
 	 * vfs_hash: (mount + inode) -> vnode hash.  The hash value
 	 * itself is grouped with other int fields, to avoid padding.
 	 */
 	LIST_ENTRY(vnode)	v_hashlist;
 
 	/*
 	 * VFS_namecache stuff
 	 */
 	LIST_HEAD(, namecache) v_cache_src;	/* c Cache entries from us */
 	TAILQ_HEAD(, namecache) v_cache_dst;	/* c Cache entries to us */
 	struct namecache *v_cache_dd;		/* c Cache entry for .. vnode */
 
 	/*
 	 * Locking
 	 */
 	struct	lock v_lock;			/* u (if fs don't have one) */
 	struct	mtx v_interlock;		/* lock for "i" things */
 	struct	lock *v_vnlock;			/* u pointer to vnode lock */
 
 	/*
 	 * The machinery of being a vnode
 	 */
 	TAILQ_ENTRY(vnode) v_vnodelist;		/* l vnode lists */
 	TAILQ_ENTRY(vnode) v_lazylist;		/* l vnode lazy list */
 	struct bufobj	v_bufobj;		/* * Buffer cache object */
 
 	/*
 	 * Hooks for various subsystems and features.
 	 */
 	struct vpollinfo *v_pollinfo;		/* i Poll events, p for *v_pi */
 	struct label *v_label;			/* MAC label for vnode */
 	struct lockf *v_lockf;		/* Byte-level advisory lock list */
 	struct rangelock v_rl;			/* Byte-range lock */
 
 	/*
 	 * clustering stuff
 	 */
 	daddr_t	v_cstart;			/* v start block of cluster */
 	daddr_t	v_lasta;			/* v last allocation  */
 	daddr_t	v_lastw;			/* v last write  */
 	int	v_clen;				/* v length of cur. cluster */
 
 	u_int	v_holdcnt;			/* I prevents recycling. */
 	u_int	v_usecount;			/* I ref count of users */
 	u_short	v_iflag;			/* i vnode flags (see below) */
 	u_short	v_vflag;			/* v vnode flags */
 	u_short	v_mflag;			/* l mnt-specific vnode flags */
 	short	v_dbatchcpu;			/* i LRU requeue deferral batch */
 	int	v_writecount;			/* I ref count of writers or
 						   (negative) text users */
 	int	v_seqc_users;			/* i modifications pending */
 	u_int	v_hash;
 };
 
 #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */
 
 #define	bo2vnode(bo)	__containerof((bo), struct vnode, v_bufobj)
 
 /* XXX: These are temporary to avoid a source sweep at this time */
 #define v_object	v_bufobj.bo_object
 
 /*
  * Userland version of struct vnode, for sysctl.
  */
 struct xvnode {
 	size_t	xv_size;			/* sizeof(struct xvnode) */
 	void	*xv_vnode;			/* address of real vnode */
 	u_long	xv_flag;			/* vnode vflags */
 	int	xv_usecount;			/* reference count of users */
 	int	xv_writecount;			/* reference count of writers */
 	int	xv_holdcnt;			/* page & buffer references */
 	u_long	xv_id;				/* capability identifier */
 	void	*xv_mount;			/* address of parent mount */
 	long	xv_numoutput;			/* num of writes in progress */
 	enum	vtype xv_type;			/* vnode type */
 	union {
 		void	*xvu_socket;		/* unpcb, if VSOCK */
 		void	*xvu_fifo;		/* fifo, if VFIFO */
 		dev_t	xvu_rdev;		/* maj/min, if VBLK/VCHR */
 		struct {
 			dev_t	xvu_dev;	/* device, if VDIR/VREG/VLNK */
 			ino_t	xvu_ino;	/* id, if VDIR/VREG/VLNK */
 		} xv_uns;
 	} xv_un;
 };
 #define xv_socket	xv_un.xvu_socket
 #define xv_fifo		xv_un.xvu_fifo
 #define xv_rdev		xv_un.xvu_rdev
 #define xv_dev		xv_un.xv_uns.xvu_dev
 #define xv_ino		xv_un.xv_uns.xvu_ino
 
 /*
  * VN_KNLIST_EMPTY() is true when the vnode has no v_pollinfo or when the
  * knote list embedded in it is empty.  We don't need to lock the knlist.
  */
 #define	VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL ||	\
 	    KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note))
 
 /*
  * Invoke KNOTE() on the vnode's knote list with hint `b' and flags
  * `a' | KNF_NOKQLOCK, but only when the list is non-empty.  `vp' is
  * parenthesized at every use so that any pointer expression (e.g. `v + 1')
  * may be passed; it is still evaluated more than once.
  *
  * VN_KNOTE_LOCKED() passes KNF_LISTLOCKED, VN_KNOTE_UNLOCKED() passes no
  * extra flags.
  */
 #define VN_KNOTE(vp, b, a)					\
 	do {							\
 		if (!VN_KNLIST_EMPTY(vp))			\
 			KNOTE(&(vp)->v_pollinfo->vpi_selinfo.si_note, (b), \
 			    (a) | KNF_NOKQLOCK);		\
 	} while (0)
 #define	VN_KNOTE_LOCKED(vp, b)		VN_KNOTE(vp, b, KNF_LISTLOCKED)
 #define	VN_KNOTE_UNLOCKED(vp, b)	VN_KNOTE(vp, b, 0)
 
 /*
  * Vnode flags.
  *	VI flags are protected by interlock and live in v_iflag
  *	VV flags are protected by the vnode lock and live in v_vflag
  *
  *	VIRF_DOOMED is doubly protected by the interlock and vnode lock.  Both
  *	are required for writing but the status may be checked with either.
  */
 #define	VHOLD_NO_SMR	(1<<29)	/* Disable vhold_smr */
 #define VHOLD_ALL_FLAGS (VHOLD_NO_SMR)
 
 #define	VIRF_DOOMED	0x0001	/* This vnode is being recycled */
 
 #define	VI_TEXT_REF	0x0001	/* Text ref grabbed use ref */
 #define	VI_MOUNT	0x0002	/* Mount in progress */
 #define	VI_DOINGINACT	0x0004	/* VOP_INACTIVE is in progress */
 #define	VI_OWEINACT	0x0008	/* Need to call inactive */
 #define	VI_DEFINACT	0x0010	/* deferred inactive */
 
 #define	VV_ROOT		0x0001	/* root of its filesystem */
 #define	VV_ISTTY	0x0002	/* vnode represents a tty */
 #define	VV_NOSYNC	0x0004	/* unlinked, stop syncing */
 #define	VV_ETERNALDEV	0x0008	/* device that is never destroyed */
 #define	VV_CACHEDLABEL	0x0010	/* Vnode has valid cached MAC label */
 #define	VV_VMSIZEVNLOCK	0x0020	/* object size check requires vnode lock */
 #define	VV_COPYONWRITE	0x0040	/* vnode is doing copy-on-write */
 #define	VV_SYSTEM	0x0080	/* vnode being used by kernel */
 #define	VV_PROCDEP	0x0100	/* vnode is process dependent */
 #define	VV_NOKNOTE	0x0200	/* don't activate knotes on this vnode */
 #define	VV_DELETED	0x0400	/* should be removed */
 #define	VV_MD		0x0800	/* vnode backs the md device */
 #define	VV_FORCEINSMQ	0x1000	/* force the insmntque to succeed */
 #define	VV_READLINK	0x2000	/* fdescfs linux vnode */
 
 #define	VMP_LAZYLIST	0x0001	/* Vnode is on mnt's lazy list */
 
 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value
  * is unavailable (getattr) or which is not to be changed (setattr).
  */
 struct vattr {
 	enum vtype	va_type;	/* vnode type (for create) */
 	u_short		va_mode;	/* files access mode and type */
 	u_short		va_padding0;
 	uid_t		va_uid;		/* owner user id */
 	gid_t		va_gid;		/* owner group id */
 	nlink_t		va_nlink;	/* number of references to file */
 	dev_t		va_fsid;	/* filesystem id */
 	ino_t		va_fileid;	/* file id */
 	u_quad_t	va_size;	/* file size in bytes */
 	long		va_blocksize;	/* blocksize preferred for i/o */
 	struct timespec	va_atime;	/* time of last access */
 	struct timespec	va_mtime;	/* time of last modification */
 	struct timespec	va_ctime;	/* time file changed */
 	struct timespec	va_birthtime;	/* time file created */
 	u_long		va_gen;		/* generation number of file */
 	u_long		va_flags;	/* flags defined for file */
 	dev_t		va_rdev;	/* device the special file represents */
 	u_quad_t	va_bytes;	/* bytes of disk space held by file */
 	u_quad_t	va_filerev;	/* file modification number */
 	u_int		va_vaflags;	/* operations flags, see below */
 	long		va_spare;	/* remain quad aligned */
 };
 
 /*
  * Flags for va_vaflags.
  */
 #define	VA_UTIMES_NULL	0x01		/* utimes argument was NULL */
 #define	VA_EXCLUSIVE	0x02		/* exclusive create request */
 #define	VA_SYNC		0x04		/* O_SYNC truncation */
 
 /*
  * Flags for ioflag. (high 16 bits used to ask for read-ahead and
  * help with write clustering)
  * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h
  */
 #define	IO_UNIT		0x0001		/* do I/O as atomic unit */
 #define	IO_APPEND	0x0002		/* append write to end */
 #define	IO_NDELAY	0x0004		/* FNDELAY flag set in file table */
 #define	IO_NODELOCKED	0x0008		/* underlying node already locked */
 #define	IO_ASYNC	0x0010		/* bawrite rather than bdwrite */
 #define	IO_VMIO		0x0020		/* data already in VMIO space */
 #define	IO_INVAL	0x0040		/* invalidate after I/O */
 #define	IO_SYNC		0x0080		/* do I/O synchronously */
 #define	IO_DIRECT	0x0100		/* attempt to bypass buffer cache */
 #define	IO_NOREUSE	0x0200		/* VMIO data won't be reused */
 #define	IO_EXT		0x0400		/* operate on external attributes */
 #define	IO_NORMAL	0x0800		/* operate on regular data */
 #define	IO_NOMACCHECK	0x1000		/* MAC checks unnecessary */
 #define	IO_BUFLOCKED	0x2000		/* ffs flag; indir buf is locked */
 #define	IO_RANGELOCKED	0x4000		/* range locked */
 
 #define IO_SEQMAX	0x7F		/* seq heuristic max value */
 #define IO_SEQSHIFT	16		/* seq heuristic in upper 16 bits */
 
 /*
  * Flags for accmode_t.
  */
 #define	VEXEC			000000000100 /* execute/search permission */
 #define	VWRITE			000000000200 /* write permission */
 #define	VREAD			000000000400 /* read permission */
 #define	VADMIN			000000010000 /* being the file owner */
 #define	VAPPEND			000000040000 /* permission to write/append */
 /*
  * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only
  * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL,
  * and 0 otherwise.  This never happens with ordinary unix access rights
  * or POSIX.1e ACLs.  Obviously, VEXPLICIT_DENY must be OR-ed with
  * some other V* constant.
  */
 #define	VEXPLICIT_DENY		000000100000
 #define	VREAD_NAMED_ATTRS 	000000200000 /* not used */
 #define	VWRITE_NAMED_ATTRS 	000000400000 /* not used */
 #define	VDELETE_CHILD	 	000001000000
 #define	VREAD_ATTRIBUTES 	000002000000 /* permission to stat(2) */
 #define	VWRITE_ATTRIBUTES 	000004000000 /* change {m,c,a}time */
 #define	VDELETE		 	000010000000
 #define	VREAD_ACL	 	000020000000 /* read ACL and file mode */
 #define	VWRITE_ACL	 	000040000000 /* change ACL and/or file mode */
 #define	VWRITE_OWNER	 	000100000000 /* change file owner */
 #define	VSYNCHRONIZE	 	000200000000 /* not used */
 #define	VCREAT			000400000000 /* creating new file */
 #define	VVERIFY			001000000000 /* verification required */
 
 /*
  * Permissions that were traditionally granted only to the file owner.
  */
 #define VADMIN_PERMS	(VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \
     VWRITE_OWNER)
 
 /*
  * Permissions that were traditionally granted to everyone.
  */
 #define VSTAT_PERMS	(VREAD_ATTRIBUTES | VREAD_ACL)
 
 /*
  * Permissions that allow changing the state of the file in any way.
  */
 #define VMODIFY_PERMS	(VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \
     VDELETE)
 
 /*
  * Token indicating no attribute value yet assigned.
  */
 #define	VNOVAL	(-1)
 
 /*
  * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon)
  */
 #define VLKTIMEOUT	(hz / 20 + 1)
 
 #ifdef _KERNEL
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_VNODE);
 #endif
 
 extern u_int ncsizefactor;
 
 /*
  * Convert between vnode types and inode formats (since POSIX.1
  * defines mode word of stat structure in terms of inode formats).
  */
 extern enum vtype	iftovt_tab[];
 extern int		vttoif_tab[];
 #define	IFTOVT(mode)	(iftovt_tab[((mode) & S_IFMT) >> 12])
 #define	VTTOIF(indx)	(vttoif_tab[(int)(indx)])
 #define	MAKEIMODE(indx, mode)	(int)(VTTOIF(indx) | (mode))
 
 /*
  * Flags to various vnode functions.  The values are only unique per
  * function: each comment names the function the flag is passed to.
  */
 #define	SKIPSYSTEM	0x0001	/* vflush: skip vnodes marked VV_SYSTEM */
 #define	FORCECLOSE	0x0002	/* vflush: force file closure */
 #define	WRITECLOSE	0x0004	/* vflush: only close writable files */
 #define	EARLYFLUSH	0x0008	/* vflush: early call for ffs_flushfiles */
 #define	V_SAVE		0x0001	/* vinvalbuf: sync file first */
 #define	V_ALT		0x0002	/* vinvalbuf: invalidate only alternate bufs */
 #define	V_NORMAL	0x0004	/* vinvalbuf: invalidate only regular bufs */
 #define	V_CLEANONLY	0x0008	/* vinvalbuf: invalidate only clean bufs */
 #define	V_VMIO		0x0010	/* vinvalbuf: called during pageout */
 #define	V_ALLOWCLEAN	0x0020	/* vinvalbuf: allow clean buffers after flush */
 #define	REVOKEALL	0x0001	/* vop_revoke: revoke all aliases */
 #define	V_WAIT		0x0001	/* vn_start_write: sleep for suspend */
 #define	V_NOWAIT	0x0002	/* vn_start_write: don't sleep for suspend */
 #define	V_XSLEEP	0x0004	/* vn_start_write: just return after sleep */
 #define	V_MNTREF	0x0010	/* vn_start_write: mp is already ref-ed */
 
 #define	VR_START_WRITE	0x0001	/* vfs_write_resume: start write atomically */
 #define	VR_NO_SUSPCLR	0x0002	/* vfs_write_resume: do not clear suspension */
 
 #define	VS_SKIP_UNMOUNT	0x0001	/* vfs_write_suspend: fail if the
 				   filesystem is being unmounted */
 
 #define	VREF(vp)	vref(vp)
 
 #ifdef DIAGNOSTIC
 #define	VATTR_NULL(vap)	vattr_null(vap)
 #else
 #define	VATTR_NULL(vap)	(*(vap) = va_null)	/* initialize a vattr */
 #endif /* DIAGNOSTIC */
 
 #define	NULLVP	((struct vnode *)NULL)
 
 /*
  * Global vnode data.
  */
 extern	struct vnode *rootvnode;	/* root (i.e. "/") vnode */
 extern	struct mount *rootdevmp;	/* "/dev" mount */
 extern	u_long desiredvnodes;		/* number of vnodes desired */
 extern	struct uma_zone *namei_zone;
 extern	struct vattr va_null;		/* predefined null vattr structure */
 
 #define	VI_LOCK(vp)	mtx_lock(&(vp)->v_interlock)
 #define	VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags))
 #define	VI_TRYLOCK(vp)	mtx_trylock(&(vp)->v_interlock)
 #define	VI_UNLOCK(vp)	mtx_unlock(&(vp)->v_interlock)
 #define	VI_MTX(vp)	(&(vp)->v_interlock)
 
 #define	VN_LOCK_AREC(vp)	lockallowrecurse((vp)->v_vnlock)
 #define	VN_LOCK_ASHARE(vp)	lockallowshare((vp)->v_vnlock)
 #define	VN_LOCK_DSHARE(vp)	lockdisableshare((vp)->v_vnlock)
 
 #endif /* _KERNEL */
 
 /*
  * Mods for extensibility.
  */
 
 /*
  * Flags for vdesc_flags:
  */
 #define	VDESC_MAX_VPS		16
 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */
 #define	VDESC_VP0_WILLRELE	0x0001
 #define	VDESC_VP1_WILLRELE	0x0002
 #define	VDESC_VP2_WILLRELE	0x0004
 #define	VDESC_VP3_WILLRELE	0x0008
 
 /*
  * A generic structure.
  * This can be used by bypass routines to identify generic arguments.
  */
 struct vop_generic_args {
 	struct vnodeop_desc *a_desc;
 	/* other random data follows, presumably */
 };
 
 typedef int vop_bypass_t(struct vop_generic_args *);
 
 /*
  * VDESC_NO_OFFSET is used to identify the end of the offset list
  * and in places where no such field exists.
  */
 #define VDESC_NO_OFFSET -1
 
 /*
  * This structure describes the vnode operation taking place.
  */
 struct vnodeop_desc {
 	char	*vdesc_name;		/* a readable name for debugging */
 	int	 vdesc_flags;		/* VDESC_* flags */
 	int	vdesc_vop_offset;
 	vop_bypass_t	*vdesc_call;	/* Function to call */
 
 	/*
 	 * These ops are used by bypass routines to map and locate arguments.
 	 * Creds and procs are not needed in bypass routines, but sometimes
 	 * they are useful to (for example) transport layers.
 	 * Nameidata is useful because it has a cred in it.
 	 */
 	int	*vdesc_vp_offsets;	/* list ended by VDESC_NO_OFFSET */
 	int	vdesc_vpp_offset;	/* return vpp location */
 	int	vdesc_cred_offset;	/* cred location, if any */
 	int	vdesc_thread_offset;	/* thread location, if any */
 	int	vdesc_componentname_offset; /* if any */
 };
 
 #ifdef _KERNEL
 /*
  * A list of all the operation descs.
  */
 extern struct vnodeop_desc *vnodeop_descs[];
 
 #define	VOPARG_OFFSETOF(s_type, field)	__offsetof(s_type, field)
 #define	VOPARG_OFFSETTO(s_type, s_offset, struct_p) \
     ((s_type)(((char*)(struct_p)) + (s_offset)))
 
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * Support code to aid in debugging VFS locking problems.  Not totally
  * reliable since if the thread sleeps between changing the lock
  * state and checking it with the assert, some other thread could
  * change the state.  They are good enough for debugging a single
  * filesystem using a single-threaded test.  Note that the unreliability is
  * limited to false negatives; efforts were made to ensure that false
  * positives cannot occur.
  */
 void	assert_vi_locked(struct vnode *vp, const char *str);
 void	assert_vi_unlocked(struct vnode *vp, const char *str);
 void	assert_vop_elocked(struct vnode *vp, const char *str);
 void	assert_vop_locked(struct vnode *vp, const char *str);
 void	assert_vop_unlocked(struct vnode *vp, const char *str);
 
 #define	ASSERT_VI_LOCKED(vp, str)	assert_vi_locked((vp), (str))
 #define	ASSERT_VI_UNLOCKED(vp, str)	assert_vi_unlocked((vp), (str))
 #define	ASSERT_VOP_ELOCKED(vp, str)	assert_vop_elocked((vp), (str))
 #define	ASSERT_VOP_LOCKED(vp, str)	assert_vop_locked((vp), (str))
 #define	ASSERT_VOP_UNLOCKED(vp, str)	assert_vop_unlocked((vp), (str))
 
 #define ASSERT_VOP_IN_SEQC(vp)	do {				\
 	struct vnode *_vp = (vp);				\
 								\
 	VNPASS(seqc_in_modify(_vp->v_seqc), _vp);		\
 } while (0)
 
 #define ASSERT_VOP_NOT_IN_SEQC(vp)	do {			\
 	struct vnode *_vp = (vp);				\
 								\
 	VNPASS(!seqc_in_modify(_vp->v_seqc), _vp);		\
 } while (0)
 
 #else /* !DEBUG_VFS_LOCKS */
 
 #define	ASSERT_VI_LOCKED(vp, str)	((void)0)
 #define	ASSERT_VI_UNLOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_ELOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_LOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_UNLOCKED(vp, str)	((void)0)
 
 #define ASSERT_VOP_IN_SEQC(vp)		((void)0)
 #define ASSERT_VOP_NOT_IN_SEQC(vp)	((void)0)
 
 #endif /* DEBUG_VFS_LOCKS */
 
 
 /*
  * This call works for vnodes in the kernel.
  */
 #define VCALL(c) ((c)->a_desc->vdesc_call(c))
 
 #define DOINGASYNC(vp)	   					\
 	(((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) != 0 &&	\
 	 ((curthread->td_pflags & TDP_SYNCIO) == 0))
 
 /*
  * VMIO support inline
  */
 
 extern int vmiodirenable;
 
 /*
  * Report whether a vnode is eligible for VMIO: it must be non-NULL and
  * either a regular file, or a directory while vmiodirenable is set.
  * Returns TRUE or FALSE.
  */
 static __inline int
 vn_canvmio(struct vnode *vp)
 {
 	if (vp == NULL)
 		return (FALSE);
 	if (vp->v_type == VREG)
 		return (TRUE);
 	if (vmiodirenable && vp->v_type == VDIR)
 		return (TRUE);
 	return (FALSE);
 }
 
 /*
  * Finally, include the default set of vnode operations.
  */
 typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int);
 #include "vnode_if.h"
 
 /* vn_open_flags */
 #define	VN_OPEN_NOAUDIT		0x00000001
 #define	VN_OPEN_NOCAPCHECK	0x00000002
 #define	VN_OPEN_NAMECACHE	0x00000004
 #define	VN_OPEN_INVFS		0x00000008
 
 /*
  * Public vnode manipulation functions.
  */
 struct componentname;
 struct file;
 struct mount;
 struct nameidata;
 struct ostat;
 struct freebsd11_stat;
 struct thread;
 struct proc;
 struct stat;
 struct nstat;
 struct ucred;
 struct uio;
 struct vattr;
 struct vfsops;
 struct vnode;
 
 typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **);
 
 int	bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn,
 	    daddr_t endn);
 /* cache_* may belong in namei.h. */
 void	cache_changesize(u_long newhashsize);
 #define	cache_enter(dvp, vp, cnp)					\
 	cache_enter_time(dvp, vp, cnp, NULL, NULL)
 void	cache_enter_time(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp, struct timespec *tsp,
 	    struct timespec *dtsp);
 int	cache_lookup(struct vnode *dvp, struct vnode **vpp,
 	    struct componentname *cnp, struct timespec *tsp, int *ticksp);
 void	cache_vnode_init(struct vnode *vp);
 void	cache_purge(struct vnode *vp);
 void	cache_purge_vgone(struct vnode *vp);
 void	cache_purge_negative(struct vnode *vp);
 void	cache_purgevfs(struct mount *mp, bool force);
 int	change_dir(struct vnode *vp, struct thread *td);
 void	cvtstat(struct stat *st, struct ostat *ost);
 void	freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb);
 int	freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost);
 int	getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
 	    struct vnode **vpp);
 void	getnewvnode_reserve(void);
 void	getnewvnode_drop_reserve(void);
 int	insmntque1(struct vnode *vp, struct mount *mp,
 	    void (*dtr)(struct vnode *, void *), void *dtr_arg);
 int	insmntque(struct vnode *vp, struct mount *mp);
 u_quad_t init_va_filerev(void);
 int	speedup_syncer(void);
 int	vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf,
 	    size_t *buflen);
 int	vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen);
 int	vn_fullpath(struct thread *td, struct vnode *vn,
 	    char **retbuf, char **freebuf);
 int	vn_fullpath_global(struct thread *td, struct vnode *vn,
 	    char **retbuf, char **freebuf);
 struct vnode *
 	vn_dir_dd_ino(struct vnode *vp);
 int	vn_commname(struct vnode *vn, char *buf, u_int buflen);
 int	vn_path_to_global_path(struct thread *td, struct vnode *vp,
 	    char *path, u_int pathlen);
 int	vaccess(enum vtype type, mode_t file_mode, uid_t file_uid,
 	    gid_t file_gid, accmode_t accmode, struct ucred *cred);
 int	vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid,
 	    struct ucred *cred);
 int	vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid,
 	    struct acl *aclp, accmode_t accmode, struct ucred *cred);
 int	vaccess_acl_posix1e(enum vtype type, uid_t file_uid,
 	    gid_t file_gid, struct acl *acl, accmode_t accmode,
 	    struct ucred *cred);
 void	vattr_null(struct vattr *vap);
 void	vlazy(struct vnode *);
 void	vdrop(struct vnode *);
 void	vdropl(struct vnode *);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
-int	vget(struct vnode *vp, int flags, struct thread *td);
+int	vget(struct vnode *vp, int flags);
 enum vgetstate	vget_prep_smr(struct vnode *vp);
 enum vgetstate	vget_prep(struct vnode *vp);
 int	vget_finish(struct vnode *vp, int flags, enum vgetstate vs);
 void	vget_finish_ref(struct vnode *vp, enum vgetstate vs);
 void	vget_abort(struct vnode *vp, enum vgetstate vs);
 void	vgone(struct vnode *vp);
 void	vhold(struct vnode *);
 void	vholdl(struct vnode *);
 void	vholdnz(struct vnode *);
 bool	vhold_smr(struct vnode *);
 void	vinactive(struct vnode *vp);
 int	vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
 int	vtruncbuf(struct vnode *vp, off_t length, int blksize);
 void	v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
 	    int blksize);
 void	vunref(struct vnode *);
 void	vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3);
 int	vrecycle(struct vnode *vp);
 int	vrecyclel(struct vnode *vp);
 int	vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off,
 	    struct ucred *cred);
 int	vn_close(struct vnode *vp,
 	    int flags, struct ucred *file_cred, struct thread *td);
 int	vn_copy_file_range(struct vnode *invp, off_t *inoffp,
 	    struct vnode *outvp, off_t *outoffp, size_t *lenp,
 	    unsigned int flags, struct ucred *incred, struct ucred *outcred,
 	    struct thread *fsize_td);
 void	vn_finished_write(struct mount *mp);
 void	vn_finished_secondary_write(struct mount *mp);
 int	vn_fsync_buf(struct vnode *vp, int waitfor);
 int	vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
 	    struct vnode *outvp, off_t *outoffp, size_t *lenp,
 	    unsigned int flags, struct ucred *incred, struct ucred *outcred,
 	    struct thread *fsize_td);
 int	vn_need_pageq_flush(struct vnode *vp);
 int	vn_isdisk(struct vnode *vp, int *errp);
 int	_vn_lock(struct vnode *vp, int flags, const char *file, int line);
 #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__)
 int	vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp);
 int	vn_open_cred(struct nameidata *ndp, int *flagp, int cmode,
 	    u_int vn_open_flags, struct ucred *cred, struct file *fp);
 int	vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
 	    struct thread *td, struct file *fp);
 void	vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end);
 int	vn_pollrecord(struct vnode *vp, struct thread *p, int events);
 int	vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base,
 	    int len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid,
 	    struct thread *td);
 int	vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base,
 	    size_t len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, size_t *aresid,
 	    struct thread *td);
 int	vn_rlimit_fsize(const struct vnode *vn, const struct uio *uio,
 	    struct thread *td);
 int	vn_start_write(struct vnode *vp, struct mount **mpp, int flags);
 int	vn_start_secondary_write(struct vnode *vp, struct mount **mpp,
 	    int flags);
 int	vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
 	    struct ucred *cred);
 int	vn_writechk(struct vnode *vp);
 int	vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int *buflen, char *buf, struct thread *td);
 int	vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int buflen, char *buf, struct thread *td);
 int	vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, struct thread *td);
 int	vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags,
 	    struct vnode **rvp);
 int	vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc,
 	    void *alloc_arg, int lkflags, struct vnode **rvp);
 int	vn_utimes_perm(struct vnode *vp, struct vattr *vap,
 	    struct ucred *cred, struct thread *td);
 
 int	vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio);
 int	vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
 	    struct uio *uio);
 
 void	vn_seqc_write_begin_unheld_locked(struct vnode *vp);
 void	vn_seqc_write_begin_unheld(struct vnode *vp);
 void	vn_seqc_write_begin_locked(struct vnode *vp);
 void	vn_seqc_write_begin(struct vnode *vp);
 void	vn_seqc_write_end_locked(struct vnode *vp);
 void	vn_seqc_write_end(struct vnode *vp);
 #define	vn_seqc_read_any(vp)		seqc_read_any(&(vp)->v_seqc)
 #define	vn_seqc_consistent(vp, seq)	seqc_consistent(&(vp)->v_seqc, seq)
 
 /*
  * Per-vnode wrappers around the rangelock_*() interfaces.  Each one
  * operates on the vnode's embedded v_rl and passes VI_MTX(vp) as the
  * trailing argument.  "vp" is expanded twice in every macro, so it
  * should be an expression without side effects.
  */
 #define	vn_rangelock_unlock(vp, cookie)					\
 	rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp))
 #define	vn_rangelock_unlock_range(vp, cookie, start, end)		\
 	rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), 	\
 	    VI_MTX(vp))
 #define	vn_rangelock_rlock(vp, start, end)				\
 	rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 #define	vn_rangelock_tryrlock(vp, start, end)				\
 	rangelock_tryrlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 #define	vn_rangelock_wlock(vp, start, end)				\
 	rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 #define	vn_rangelock_trywlock(vp, start, end)				\
 	rangelock_trywlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 
 int	vfs_cache_lookup(struct vop_lookup_args *ap);
 int	vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp);
 void	vfs_timestamp(struct timespec *);
 void	vfs_write_resume(struct mount *mp, int flags);
 int	vfs_write_suspend(struct mount *mp, int flags);
 int	vfs_write_suspend_umnt(struct mount *mp);
 void	vnlru_free(int, struct vfsops *);
 int	vop_stdbmap(struct vop_bmap_args *);
 int	vop_stdfdatasync_buf(struct vop_fdatasync_args *);
 int	vop_stdfsync(struct vop_fsync_args *);
 int	vop_stdgetwritemount(struct vop_getwritemount_args *);
 int	vop_stdgetpages(struct vop_getpages_args *);
 int	vop_stdinactive(struct vop_inactive_args *);
 int	vop_stdioctl(struct vop_ioctl_args *);
 int	vop_stdneed_inactive(struct vop_need_inactive_args *);
 int	vop_stdkqfilter(struct vop_kqfilter_args *);
 int	vop_stdlock(struct vop_lock1_args *);
 int	vop_stdunlock(struct vop_unlock_args *);
 int	vop_stdislocked(struct vop_islocked_args *);
 int	vop_lock(struct vop_lock1_args *);
 int	vop_unlock(struct vop_unlock_args *);
 int	vop_islocked(struct vop_islocked_args *);
 int	vop_stdputpages(struct vop_putpages_args *);
 int	vop_nopoll(struct vop_poll_args *);
 int	vop_stdaccess(struct vop_access_args *ap);
 int	vop_stdaccessx(struct vop_accessx_args *ap);
 int	vop_stdadvise(struct vop_advise_args *ap);
 int	vop_stdadvlock(struct vop_advlock_args *ap);
 int	vop_stdadvlockasync(struct vop_advlockasync_args *ap);
 int	vop_stdadvlockpurge(struct vop_advlockpurge_args *ap);
 int	vop_stdallocate(struct vop_allocate_args *ap);
 int	vop_stdset_text(struct vop_set_text_args *ap);
 int	vop_stdpathconf(struct vop_pathconf_args *);
 int	vop_stdpoll(struct vop_poll_args *);
 int	vop_stdvptocnp(struct vop_vptocnp_args *ap);
 int	vop_stdvptofh(struct vop_vptofh_args *ap);
 int	vop_stdunp_bind(struct vop_unp_bind_args *ap);
 int	vop_stdunp_connect(struct vop_unp_connect_args *ap);
 int	vop_stdunp_detach(struct vop_unp_detach_args *ap);
 int	vop_eopnotsupp(struct vop_generic_args *ap);
 int	vop_ebadf(struct vop_generic_args *ap);
 int	vop_einval(struct vop_generic_args *ap);
 int	vop_enoent(struct vop_generic_args *ap);
 int	vop_enotty(struct vop_generic_args *ap);
 int	vop_null(struct vop_generic_args *ap);
 int	vop_panic(struct vop_generic_args *ap);
 int	dead_poll(struct vop_poll_args *ap);
 int	dead_read(struct vop_read_args *ap);
 int	dead_write(struct vop_write_args *ap);
 
 /* These are called from within the actual VOPS. */
 void	vop_close_post(void *a, int rc);
 void	vop_create_pre(void *a);
 void	vop_create_post(void *a, int rc);
 void	vop_whiteout_pre(void *a);
 void	vop_whiteout_post(void *a, int rc);
 void	vop_deleteextattr_pre(void *a);
 void	vop_deleteextattr_post(void *a, int rc);
 void	vop_link_pre(void *a);
 void	vop_link_post(void *a, int rc);
 void	vop_lookup_post(void *a, int rc);
 void	vop_lookup_pre(void *a);
 void	vop_mkdir_pre(void *a);
 void	vop_mkdir_post(void *a, int rc);
 void	vop_mknod_pre(void *a);
 void	vop_mknod_post(void *a, int rc);
 void	vop_open_post(void *a, int rc);
 void	vop_read_post(void *a, int rc);
 void	vop_readdir_post(void *a, int rc);
 void	vop_reclaim_post(void *a, int rc);
 void	vop_remove_pre(void *a);
 void	vop_remove_post(void *a, int rc);
 void	vop_rename_post(void *a, int rc);
 void	vop_rename_pre(void *a);
 void	vop_rmdir_pre(void *a);
 void	vop_rmdir_post(void *a, int rc);
 void	vop_setattr_pre(void *a);
 void	vop_setattr_post(void *a, int rc);
 void	vop_setacl_pre(void *a);
 void	vop_setacl_post(void *a, int rc);
 void	vop_setextattr_pre(void *a);
 void	vop_setextattr_post(void *a, int rc);
 void	vop_symlink_pre(void *a);
 void	vop_symlink_post(void *a, int rc);
 int	vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a);
 
 /*
  * Debugging pre/post hooks.  When DEBUG_VFS_LOCKS is defined they are
  * real functions declared here; otherwise every hook expands to an empty
  * statement, so callers may invoke them unconditionally.
  */
 #ifdef DEBUG_VFS_LOCKS
 void	vop_fplookup_vexec_debugpre(void *a);
 void	vop_fplookup_vexec_debugpost(void *a, int rc);
 void	vop_strategy_debugpre(void *a);
 void	vop_lock_debugpre(void *a);
 void	vop_lock_debugpost(void *a, int rc);
 void	vop_unlock_debugpre(void *a);
 void	vop_need_inactive_debugpre(void *a);
 void	vop_need_inactive_debugpost(void *a, int rc);
 #else
 #define	vop_fplookup_vexec_debugpre(x)		do { } while (0)
 #define	vop_fplookup_vexec_debugpost(x, y)	do { } while (0)
 #define	vop_strategy_debugpre(x)		do { } while (0)
 #define	vop_lock_debugpre(x)			do { } while (0)
 #define	vop_lock_debugpost(x, y)		do { } while (0)
 #define	vop_unlock_debugpre(x)			do { } while (0)
 #define	vop_need_inactive_debugpre(x)		do { } while (0)
 #define	vop_need_inactive_debugpost(x, y)	do { } while (0)
 #endif
 
 void	vop_rename_fail(struct vop_rename_args *ap);
 
 /*
  * Pre-processing for a stat-style VOP; the statement expression evaluates
  * to an int error code.
  *
  * ap->a_vp is handed to AUDIT_ARG_VNODE1(), then mac_vnode_check_stat()
  * is called with ap->a_active_cred, ap->a_file_cred and ap->a_vp.  Only
  * when that check returns 0 is *ap->a_sb zeroed.  The value of the macro
  * is the result of the MAC check.
  *
  * "ap" is expanded several times and is not parenthesized, so pass a
  * plain identifier.
  */
 #define	vop_stat_helper_pre(ap)	({						\
 	int _error;								\
 	AUDIT_ARG_VNODE1(ap->a_vp);						\
 	_error = mac_vnode_check_stat(ap->a_active_cred, ap->a_file_cred, ap->a_vp);\
 	if (__predict_true(_error == 0))					\
 		bzero(ap->a_sb, sizeof(*ap->a_sb));				\
 	_error;									\
 })
 
 /*
  * Post-processing for a stat-style VOP; evaluates to "error" unchanged.
  *
  * "error" is captured once into _error.  If
  * priv_check_cred_vfs_generation() returns non-zero for
  * ap->a_td->td_ucred, ap->a_sb->st_gen is cleared to 0.  That clearing is
  * done regardless of the value of "error".
  *
  * "ap" is expanded more than once and is not parenthesized, so pass a
  * plain identifier.
  */
 #define	vop_stat_helper_post(ap, error)	({					\
 	int _error = (error);							\
 	if (priv_check_cred_vfs_generation(ap->a_td->td_ucred))			\
 		ap->a_sb->st_gen = 0;						\
 	_error;									\
 })
 
 /*
  * VOP_WRITE_PRE: bookkeeping performed before a write VOP.
  *
  * This macro is not an expression.  It declares the locals va, error,
  * osize, ooffset and noffset directly in the enclosing scope, so it must
  * appear where declarations are allowed and may be used only once per
  * scope.  The three offsets start at 0.  When the vnode's knote list is
  * not empty (VN_KNLIST_EMPTY() is false) the attributes are fetched with
  * VOP_GETATTR(); a failure there makes the *enclosing function* return
  * that error.  Otherwise the current uio offset and the file size are
  * remembered in ooffset and osize.
  */
 #define	VOP_WRITE_PRE(ap)						\
 	struct vattr va;						\
 	int error;							\
 	off_t osize, ooffset, noffset;					\
 									\
 	osize = ooffset = noffset = 0;					\
 	if (!VN_KNLIST_EMPTY((ap)->a_vp)) {				\
 		error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred);	\
 		if (error)						\
 			return (error);					\
 		ooffset = (ap)->a_uio->uio_offset;			\
 		osize = (off_t)va.va_size;				\
 	}
 
 /*
  * VOP_WRITE_POST: counterpart of VOP_WRITE_PRE, relying on the noffset,
  * ooffset and osize locals that VOP_WRITE_PRE declared in the same scope.
  *
  * If the uio offset advanced and the vnode's knote list is not empty,
  * VFS_KNOTE_LOCKED() is invoked with NOTE_WRITE, plus NOTE_EXTEND when
  * the new offset lies beyond the size recorded before the write.  The
  * "ret" argument is not referenced by the expansion.
  */
 #define VOP_WRITE_POST(ap, ret)						\
 	noffset = (ap)->a_uio->uio_offset;				\
 	if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) {	\
 		VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE			\
 		    | (noffset > osize ? NOTE_EXTEND : 0));		\
 	}
 
 #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__)
 
 /*
  * "_CHECKED" variants of VOP_ADD_WRITECOUNT, VOP_SET_TEXT and
  * VOP_UNSET_TEXT.
  *
  * With INVARIANTS each wrapper calls the underlying VOP and uses
  * VNASSERT() to check that it returned 0; in that configuration the
  * wrapper is a do { } while (0) statement and yields no value.  Without
  * INVARIANTS the wrapper expands to the plain VOP call.  Code using these
  * macros therefore must not depend on a result value.
  */
 #ifdef INVARIANTS
 #define	VOP_ADD_WRITECOUNT_CHECKED(vp, cnt)				\
 do {									\
 	int error_;							\
 									\
 	error_ = VOP_ADD_WRITECOUNT((vp), (cnt));			\
 	VNASSERT(error_ == 0, (vp), ("VOP_ADD_WRITECOUNT returned %d",	\
 	    error_));							\
 } while (0)
 #define	VOP_SET_TEXT_CHECKED(vp)					\
 do {									\
 	int error_;							\
 									\
 	error_ = VOP_SET_TEXT((vp));					\
 	VNASSERT(error_ == 0, (vp), ("VOP_SET_TEXT returned %d",	\
 	    error_));							\
 } while (0)
 #define	VOP_UNSET_TEXT_CHECKED(vp)					\
 do {									\
 	int error_;							\
 									\
 	error_ = VOP_UNSET_TEXT((vp));					\
 	VNASSERT(error_ == 0, (vp), ("VOP_UNSET_TEXT returned %d",	\
 	    error_));							\
 } while (0)
 #else
 #define	VOP_ADD_WRITECOUNT_CHECKED(vp, cnt)	VOP_ADD_WRITECOUNT((vp), (cnt))
 #define	VOP_SET_TEXT_CHECKED(vp)		VOP_SET_TEXT((vp))
 #define	VOP_UNSET_TEXT_CHECKED(vp)		VOP_UNSET_TEXT((vp))
 #endif
 
 #define	VN_IS_DOOMED(vp)	__predict_false((vp)->v_irflag & VIRF_DOOMED)
 
 void	vput(struct vnode *vp);
 void	vrele(struct vnode *vp);
 void	vref(struct vnode *vp);
 void	vrefl(struct vnode *vp);
 void	vrefact(struct vnode *vp);
 void 	v_addpollinfo(struct vnode *vp);
 /*
  * Return the vnode's v_usecount field, converted to int.  The field is
  * read directly; nothing is locked or modified here.
  */
 static __inline int
 vrefcnt(struct vnode *vp)
 {
 	int usecount;

 	usecount = vp->v_usecount;
 	return (usecount);
 }
 
 int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td);
 void vnode_destroy_vobject(struct vnode *vp);
 
 extern struct vop_vector fifo_specops;
 extern struct vop_vector dead_vnodeops;
 extern struct vop_vector default_vnodeops;
 
 /*
  * Addresses of the generic vop_*() handlers declared above, converted to
  * void * by way of uintptr_t.
  */
 #define VOP_PANIC	((void*)(uintptr_t)vop_panic)
 #define VOP_NULL	((void*)(uintptr_t)vop_null)
 #define VOP_EBADF	((void*)(uintptr_t)vop_ebadf)
 #define VOP_ENOTTY	((void*)(uintptr_t)vop_enotty)
 #define VOP_EINVAL	((void*)(uintptr_t)vop_einval)
 #define VOP_ENOENT	((void*)(uintptr_t)vop_enoent)
 #define VOP_EOPNOTSUPP	((void*)(uintptr_t)vop_eopnotsupp)
 
 /* fifo_vnops.c */
 int	fifo_printinfo(struct vnode *);
 
 /* vfs_hash.c */
 typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg);
 
 void vfs_hash_changesize(u_long newhashsize);
 int vfs_hash_get(const struct mount *mp, u_int hash, int flags,
     struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 u_int vfs_hash_index(struct vnode *vp);
 int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td,
     struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td,
     struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 void vfs_hash_rehash(struct vnode *vp, u_int hash);
 void vfs_hash_remove(struct vnode *vp);
 
 int vfs_kqfilter(struct vop_kqfilter_args *);
 struct dirent;
 int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off);
 int vfs_emptydir(struct vnode *vp);
 
 int vfs_unixify_accmode(accmode_t *accmode);
 
 void vfs_unp_reclaim(struct vnode *vp);
 
 int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode);
 int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid,
     gid_t gid);
 int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td);
 int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td);
 
 void vn_fsid(struct vnode *vp, struct vattr *va);
 
 int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp);
 
 /*
  * Unlock a vnode and, if requested, release VI_UNLOCK() on it as well.
  *
  * vp     vnode to unlock; evaluated exactly once
  * flags  only LK_INTERLOCK and LK_RELEASE are accepted, any other bit is
  *        a panic; evaluated exactly once
  *
  * The statement expression yields the value returned by VOP_UNLOCK().
  * When LK_INTERLOCK is set, VI_UNLOCK() is called after VOP_UNLOCK().
  *
  * The panic message prints the captured _flags rather than re-expanding
  * the macro argument, so an argument with side effects is not evaluated
  * a second time on the error path.
  */
 #define VOP_UNLOCK_FLAGS(vp, flags)	({				\
 	struct vnode *_vp = (vp);					\
 	int _flags = (flags);						\
 	int _error;							\
 									\
 	if ((_flags & ~(LK_INTERLOCK | LK_RELEASE)) != 0)		\
 		panic("%s: unsupported flags %x\n", __func__, _flags);	\
 	_error = VOP_UNLOCK(_vp);					\
 	if (_flags & LK_INTERLOCK)					\
 		VI_UNLOCK(_vp);						\
 	_error;								\
 })
 
 #include <sys/kernel.h>
 
 /*
  * Emit a SYSINIT() entry named vfs_vector_<vnodeops>_f, placed at
  * SI_SUB_VFS / SI_ORDER_ANY, whose function is vfs_vector_op_register
  * and whose argument is &vnodeops.
  */
 #define VFS_VOP_VECTOR_REGISTER(vnodeops) \
 	SYSINIT(vfs_vector_##vnodeops##_f, SI_SUB_VFS, SI_ORDER_ANY, \
 	    vfs_vector_op_register, &vnodeops)
 
 /*
  * Shorthands that bind the generic smr_*(), SMR_ASSERT_*() and
  * uma_zone_set_smr() interfaces to the vfs_smr instance.
  *
  * VFS_SMR_DECLARE expands to the extern declaration of vfs_smr without a
  * trailing semicolon; the user supplies it.
  */
 #define VFS_SMR_DECLARE				\
 	extern smr_t vfs_smr
 
 #define VFS_SMR()	vfs_smr
 #define vfs_smr_enter()	smr_enter(VFS_SMR())
 #define vfs_smr_exit()	smr_exit(VFS_SMR())
 #define vfs_smr_entered_load(ptr)	smr_entered_load((ptr), VFS_SMR())
 #define VFS_SMR_ASSERT_ENTERED()	SMR_ASSERT_ENTERED(VFS_SMR())
 #define VFS_SMR_ASSERT_NOT_ENTERED()	SMR_ASSERT_NOT_ENTERED(VFS_SMR())
 #define VFS_SMR_ZONE_SET(zone)	uma_zone_set_smr((zone), VFS_SMR())
 
 /*
  * Read vp->v_data with atomic_load_ptr() after asserting, through
  * VFS_SMR_ASSERT_ENTERED(), that the caller has entered the vfs_smr
  * section.  "vp" is evaluated once.  The statement expression yields the
  * loaded pointer.
  */
 #define vn_load_v_data_smr(vp)	({		\
 	struct vnode *_vp = (vp);		\
 						\
 	VFS_SMR_ASSERT_ENTERED();		\
 	atomic_load_ptr(&(_vp)->v_data);	\
 })
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_VNODE_H_ */
Index: projects/clang1100-import/sys/ufs/ffs/ffs_alloc.c
===================================================================
--- projects/clang1100-import/sys/ufs/ffs/ffs_alloc.c	(revision 364278)
+++ projects/clang1100-import/sys/ufs/ffs/ffs_alloc.c	(revision 364279)
@@ -1,3519 +1,3519 @@
 /*-
  * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause)
  *
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Marshall
  * Kirk McKusick and Network Associates Laboratories, the Security
  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
  * research program
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_alloc.c	8.18 (Berkeley) 5/26/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_quota.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/gsb_crc32.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/kernel.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/taskqueue.h>
 
 #include <security/audit/audit.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
 
 #include <ufs/ufs/dir.h>
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufs_extern.h>
 #include <ufs/ufs/ufsmount.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
 #include <ufs/ffs/softdep.h>
 
 /*
  * Signature of the allocation callback that ffs_hashalloc() takes as its
  * last argument.  In the code below ffs_alloccg is the function passed in
  * that role.
  */
 typedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref,
 				  int size, int rsize);
 
 static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int);
 static ufs2_daddr_t
 	      ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int);
 static void	ffs_blkfree_cg(struct ufsmount *, struct fs *,
 		    struct vnode *, ufs2_daddr_t, long, ino_t,
 		    struct workhead *);
 #ifdef INVARIANTS
 static int	ffs_checkblk(struct inode *, ufs2_daddr_t, long);
 #endif
 static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int);
 static ino_t	ffs_dirpref(struct inode *);
 static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t,
 		    int, int);
 static ufs2_daddr_t	ffs_hashalloc
 		(struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *);
 static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int,
 		    int);
 static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int);
 static int	ffs_reallocblks_ufs1(struct vop_reallocblks_args *);
 static int	ffs_reallocblks_ufs2(struct vop_reallocblks_args *);
 static void	ffs_ckhash_cg(struct buf *);
 
 /*
  * Allocate a block in the filesystem.
  *
  * The size of the requested block is given, which must be some
  * multiple of fs_fsize and <= fs_bsize.
  * A preference may be optionally specified. If a preference is given
  * the following hierarchy is used to allocate a block:
  *   1) allocate the requested block.
  *   2) allocate a rotationally optimal block in the same cylinder.
  *   3) allocate a block in the same cylinder group.
  *   4) quadratically rehash into other cylinder groups, until an
  *      available block is located.
  * If no block preference is given the following hierarchy is used
  * to allocate a block:
  *   1) allocate a block in the cylinder group that contains the
  *      inode for the file.
  *   2) quadratically rehash into other cylinder groups, until an
  *      available block is located.
  */
 /*
  * ffs_alloc: allocate `size' bytes of disk space for inode ip.
  *
  *	ip	inode the space is accounted to
  *	lbn	logical block number (not referenced in this function body)
  *	bpref	preferred block; a value >= fs_size is treated as "none"
  *	size	number of bytes requested
  *	flags	IO_EXT selects which inode flags get set on success;
  *		IO_BUFLOCKED suppresses the cleanup-and-retry pass
  *	cred	credential used for quota and PRIV_VFS_BLOCKRESERVE checks
  *	bnp	out: allocated block number on success, 0 otherwise
  *
  * The UFS mutex must be held on entry (asserted below).  Returns 0 on
  * success, ENOSPC when no space was found, ENXIO when
  * ffs_fsfail_cleanup_locked() returns non-zero, or (QUOTA only) the error
  * from chkdq().  Every failure return in this function is preceded by
  * UFS_UNLOCK().
  * NOTE(review): the success path does not call UFS_UNLOCK() here;
  * presumably the mutex is dropped further down the ffs_hashalloc() call
  * chain -- confirm against that code.
  */
 int
 ffs_alloc(ip, lbn, bpref, size, flags, cred, bnp)
 	struct inode *ip;
 	ufs2_daddr_t lbn, bpref;
 	int size, flags;
 	struct ucred *cred;
 	ufs2_daddr_t *bnp;
 {
 	struct fs *fs;
 	struct ufsmount *ump;
 	ufs2_daddr_t bno;
 	u_int cg, reclaimed;
 	int64_t delta;
 #ifdef QUOTA
 	int error;
 #endif
 
 	*bnp = 0;
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 #ifdef INVARIANTS
 	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
 		printf("dev = %s, bsize = %ld, size = %d, fs = %s\n",
 		    devtoname(ump->um_dev), (long)fs->fs_bsize, size,
 		    fs->fs_fsmnt);
 		panic("ffs_alloc: bad size");
 	}
 	if (cred == NOCRED)
 		panic("ffs_alloc: missing credential");
 #endif /* INVARIANTS */
 	/* Set to 1 once the single cleanup-and-retry pass has been used. */
 	reclaimed = 0;
 retry:
 #ifdef QUOTA
 	/* chkdq() is called with the UFS mutex dropped. */
 	UFS_UNLOCK(ump);
 	error = chkdq(ip, btodb(size), cred, 0);
 	if (error)
 		return (error);
 	UFS_LOCK(ump);
 #endif
 	/* A full-sized block was requested but no free blocks are counted. */
 	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
 		goto nospace;
 	/*
 	 * When priv_check_cred() returns non-zero, refuse a request that
 	 * would take free space below the fs_minfree reserve.
 	 */
 	if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) &&
 	    freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0)
 		goto nospace;
 	if (bpref >= fs->fs_size)
 		bpref = 0;
 	/* Start in the inode's cylinder group unless a preference is given. */
 	if (bpref == 0)
 		cg = ino_to_cg(fs, ip->i_number);
 	else
 		cg = dtog(fs, bpref);
 	bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg);
 	if (bno > 0) {
 		/* Account for the new space in the inode's block count. */
 		delta = btodb(size);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
 		if (flags & IO_EXT)
 			UFS_INODE_SET_FLAG(ip, IN_CHANGE);
 		else
 			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
 		*bnp = bno;
 		return (0);
 	}
 nospace:
 #ifdef QUOTA
 	UFS_UNLOCK(ump);
 	/*
 	 * Restore user's disk quota because allocation failed.
 	 */
 	(void) chkdq(ip, -btodb(size), cred, FORCE);
 	UFS_LOCK(ump);
 #endif
 	/* One attempt to reclaim space, then start over from "retry". */
 	if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
 		reclaimed = 1;
 		softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT);
 		goto retry;
 	}
 	if (ffs_fsfail_cleanup_locked(ump, 0)) {
 		UFS_UNLOCK(ump);
 		return (ENXIO);
 	}
 	/*
 	 * The "filesystem full" messages are emitted only after a reclaim
 	 * pass was tried and only when ppsratecheck() allows it.
 	 */
 	if (reclaimed > 0 &&
 	    ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) {
 		UFS_UNLOCK(ump);
 		ffs_fserr(fs, ip->i_number, "filesystem full");
 		uprintf("\n%s: write failed, filesystem is full\n",
 		    fs->fs_fsmnt);
 	} else {
 		UFS_UNLOCK(ump);
 	}
 	return (ENOSPC);
 }
 
 /*
  * Reallocate a fragment to a bigger size
  *
  * The number and size of the old block is given, and a preference
  * and new size is also specified. The allocator attempts to extend
  * the original block. Failing that, the regular block allocator is
  * invoked to get an appropriate block.
  */
 /*
  * ffs_realloccg: grow the fragment backing logical block lbprev from
  * osize to nsize bytes.
  *
  *	ip	inode owning the fragment
  *	lbprev	logical block number of the fragment
  *	bprev	current disk block number of the fragment; 0 is a panic
  *	bpref	preferred location if the data has to move; a value
  *		>= fs_size is treated as "none"
  *	osize	current size in bytes
  *	nsize	requested size in bytes
  *	flags	BA_UNMAPPED, IO_EXT and IO_BUFLOCKED are examined
  *	cred	credential used for quota and PRIV_VFS_BLOCKRESERVE checks
  *	bpp	out: on success, the buffer for the block, resized to nsize
  *
  * The UFS mutex must be held on entry (asserted below).  Returns 0 on
  * success, the error from bread_gb() or (QUOTA only) chkdq(), ENXIO when
  * ffs_fsfail_cleanup_locked() returns non-zero, or ENOSPC.
  *
  * *bpp is first written (set to NULL) only after the read and the quota
  * check have succeeded; failure returns taken before that point leave
  * *bpp untouched.
  */
 int
 ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp)
 	struct inode *ip;
 	ufs2_daddr_t lbprev;
 	ufs2_daddr_t bprev;
 	ufs2_daddr_t bpref;
 	int osize, nsize, flags;
 	struct ucred *cred;
 	struct buf **bpp;
 {
 	struct vnode *vp;
 	struct fs *fs;
 	struct buf *bp;
 	struct ufsmount *ump;
 	u_int cg, request, reclaimed;
 	int error, gbflags;
 	ufs2_daddr_t bno;
 	int64_t delta;
 
 	vp = ITOV(ip);
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	bp = NULL;
 	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
 
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 #ifdef INVARIANTS
 	if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
 		panic("ffs_realloccg: allocation on suspended filesystem");
 	if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
 	    (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
 		printf(
 		"dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n",
 		    devtoname(ump->um_dev), (long)fs->fs_bsize, osize,
 		    nsize, fs->fs_fsmnt);
 		panic("ffs_realloccg: bad size");
 	}
 	if (cred == NOCRED)
 		panic("ffs_realloccg: missing credential");
 #endif /* INVARIANTS */
 	/* Set to 1 once the single cleanup-and-retry pass has been used. */
 	reclaimed = 0;
 retry:
 	/*
 	 * When priv_check_cred() returns non-zero, refuse growth that would
 	 * take free space below the fs_minfree reserve.
 	 */
 	if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) &&
 	    freespace(fs, fs->fs_minfree) -  numfrags(fs, nsize - osize) < 0) {
 		goto nospace;
 	}
 	/* This check is made in all kernels, not only under INVARIANTS. */
 	if (bprev == 0) {
 		printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n",
 		    devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev,
 		    fs->fs_fsmnt);
 		panic("ffs_realloccg: bad bprev");
 	}
 	UFS_UNLOCK(ump);
 	/*
 	 * Allocate the extra space in the buffer.
 	 */
 	error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp);
 	if (error) {
 		return (error);
 	}
 
 	/*
 	 * If the buffer's disk address still equals its logical block
 	 * number, fill in the physical address from bprev.
 	 */
 	if (bp->b_blkno == bp->b_lblkno) {
 		if (lbprev >= UFS_NDADDR)
 			panic("ffs_realloccg: lbprev out of range");
 		bp->b_blkno = fsbtodb(fs, bprev);
 	}
 
 #ifdef QUOTA
 	/* Charge only the growth (nsize - osize) against the quota. */
 	error = chkdq(ip, btodb(nsize - osize), cred, 0);
 	if (error) {
 		brelse(bp);
 		return (error);
 	}
 #endif
 	/*
 	 * Check for extension in the existing location.
 	 */
 	*bpp = NULL;
 	cg = dtog(fs, bprev);
 	UFS_LOCK(ump);
 	bno = ffs_fragextend(ip, cg, bprev, osize, nsize);
 	if (bno) {
 		if (bp->b_blkno != fsbtodb(fs, bno))
 			panic("ffs_realloccg: bad blockno");
 		delta = btodb(nsize - osize);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
 		if (flags & IO_EXT)
 			UFS_INODE_SET_FLAG(ip, IN_CHANGE);
 		else
 			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
 		/* Grow the buffer and zero the newly added tail. */
 		allocbuf(bp, nsize);
 		bp->b_flags |= B_DONE;
 		vfs_bio_bzero_buf(bp, osize, nsize - osize);
 		if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
 			vfs_bio_set_valid(bp, osize, nsize - osize);
 		*bpp = bp;
 		return (0);
 	}
 	/*
 	 * Allocate a new disk location.
 	 */
 	if (bpref >= fs->fs_size)
 		bpref = 0;
 	/* Pick the request size; this may also flip fs_optim for later calls. */
 	switch ((int)fs->fs_optim) {
 	case FS_OPTSPACE:
 		/*
 		 * Allocate an exact sized fragment. Although this makes
 		 * best use of space, we will waste time relocating it if
 		 * the file continues to grow. If the fragmentation is
 		 * less than half of the minimum free reserve, we choose
 		 * to begin optimizing for time.
 		 */
 		request = nsize;
 		if (fs->fs_minfree <= 5 ||
 		    fs->fs_cstotal.cs_nffree >
 		    (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100))
 			break;
 		log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n",
 			fs->fs_fsmnt);
 		fs->fs_optim = FS_OPTTIME;
 		break;
 	case FS_OPTTIME:
 		/*
 		 * At this point we have discovered a file that is trying to
 		 * grow a small fragment to a larger fragment. To save time,
 		 * we allocate a full sized block, then free the unused portion.
 		 * If the file continues to grow, the `ffs_fragextend' call
 		 * above will be able to grow it in place without further
 		 * copying. If aberrant programs cause disk fragmentation to
 		 * grow within 2% of the free reserve, we choose to begin
 		 * optimizing for space.
 		 */
 		request = fs->fs_bsize;
 		if (fs->fs_cstotal.cs_nffree <
 		    (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100)
 			break;
 		log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n",
 			fs->fs_fsmnt);
 		fs->fs_optim = FS_OPTSPACE;
 		break;
 	default:
 		printf("dev = %s, optim = %ld, fs = %s\n",
 		    devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt);
 		panic("ffs_realloccg: bad optim");
 		/* NOTREACHED */
 	}
 	bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg);
 	if (bno > 0) {
 		/* Point the buffer at the new location before freeing the old. */
 		bp->b_blkno = fsbtodb(fs, bno);
 		if (!DOINGSOFTDEP(vp))
 			/*
 			 * The usual case is that a smaller fragment that
 			 * was just allocated has been replaced with a bigger
 			 * fragment or a full-size block. If it is marked as
 			 * B_DELWRI, the current contents have not been written
 			 * to disk. It is possible that the block was written
 			 * earlier, but very uncommon. If the block has never
 			 * been written, there is no need to send a BIO_DELETE
 			 * for it when it is freed. The gain from avoiding the
 			 * TRIMs for the common case of unwritten blocks far
 			 * exceeds the cost of the write amplification for the
 			 * uncommon case of failing to send a TRIM for a block
 			 * that had been written.
 			 */
 			ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize,
 			    ip->i_number, vp->v_type, NULL,
 			    (bp->b_flags & B_DELWRI) != 0 ?
 			    NOTRIM_KEY : SINGLETON_KEY);
 		delta = btodb(nsize - osize);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
 		if (flags & IO_EXT)
 			UFS_INODE_SET_FLAG(ip, IN_CHANGE);
 		else
 			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
 		allocbuf(bp, nsize);
 		bp->b_flags |= B_DONE;
 		vfs_bio_bzero_buf(bp, osize, nsize - osize);
 		if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
 			vfs_bio_set_valid(bp, osize, nsize - osize);
 		*bpp = bp;
 		return (0);
 	}
 #ifdef QUOTA
 	UFS_UNLOCK(ump);
 	/*
 	 * Restore user's disk quota because allocation failed.
 	 */
 	(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
 	UFS_LOCK(ump);
 #endif
 nospace:
 	/*
 	 * no space available
 	 */
 	if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
 		reclaimed = 1;
 		/* The buffer is released with the UFS mutex dropped. */
 		UFS_UNLOCK(ump);
 		if (bp) {
 			brelse(bp);
 			bp = NULL;
 		}
 		UFS_LOCK(ump);
 		softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT);
 		goto retry;
 	}
 	if (bp)
 		brelse(bp);
 	if (ffs_fsfail_cleanup_locked(ump, 0)) {
 		UFS_UNLOCK(ump);
 		return (ENXIO);
 	}
 	/*
 	 * The "filesystem full" messages are emitted only after a reclaim
 	 * pass was tried and only when ppsratecheck() allows it.
 	 */
 	if (reclaimed > 0 &&
 	    ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) {
 		UFS_UNLOCK(ump);
 		ffs_fserr(fs, ip->i_number, "filesystem full");
 		uprintf("\n%s: write failed, filesystem is full\n",
 		    fs->fs_fsmnt);
 	} else {
 		UFS_UNLOCK(ump);
 	}
 	return (ENOSPC);
 }
 
 /*
  * Reallocate a sequence of blocks into a contiguous sequence of blocks.
  *
  * The vnode and an array of buffer pointers for a range of sequential
  * logical blocks to be made contiguous is given. The allocator attempts
  * to find a range of sequential blocks starting as close as possible
  * from the end of the allocation for the logical block immediately
  * preceding the current range. If successful, the physical block numbers
  * in the buffer pointers and in the inode are changed to reflect the new
  * allocation. If unsuccessful, the allocation is left unchanged. The
  * success in doing the reallocation is returned. Note that the error
  * return is not reflected back to the user. Rather the previous block
  * allocation will be used.
  */
 
 SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "FFS filesystem");
 
 static int doasyncfree = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0,
 "do not force synchronous writes when blocks are reallocated");
 
 static int doreallocblks = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0,
 "enable block reallocation");
 
 static int dotrimcons = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, dotrimcons, CTLFLAG_RWTUN, &dotrimcons, 0,
 "enable BIO_DELETE / TRIM consolidation");
 
 /*
  * Read-write integer knob registered under the _vfs_ffs sysctl node with
  * the name "maxclustersearch"; its initial value is 10.
  */
 static int maxclustersearch = 10;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch,
 0, "max number of cylinder groups to search for contiguous blocks");
 
 #ifdef DIAGNOSTIC
 static int prtrealloc = 0;
 SYSCTL_INT(_debug, OID_AUTO, ffs_prtrealloc, CTLFLAG_RW, &prtrealloc, 0,
 	"print out FFS filesystem block reallocation operations");
 #endif
 
 int
 ffs_reallocblks(ap)
 	struct vop_reallocblks_args /* {
 		struct vnode *a_vp;
 		struct cluster_save *a_buflist;
 	} */ *ap;
 {
 	struct ufsmount *ump;
 
 	/*
 	 * We used to skip reallocating the blocks of a file into a
 	 * contiguous sequence if the underlying flash device requested
 	 * BIO_DELETE notifications, because devices that benefit from
 	 * BIO_DELETE also benefit from not moving the data. However,
 	 * the destination for the data is usually moved before the data
 	 * is written to the initially allocated location, so we rarely
 	 * suffer the penalty of extra writes. With the addition of the
 	 * consolidation of contiguous blocks into single BIO_DELETE
 	 * operations, having fewer but larger contiguous blocks reduces
 	 * the number of (slow and expensive) BIO_DELETE operations. So
 	 * when doing BIO_DELETE consolidation, we do block reallocation.
 	 *
 	 * Skip if reallocblks has been disabled globally.
 	 */
 	ump = ap->a_vp->v_mount->mnt_data;
 	if ((((ump->um_flags) & UM_CANDELETE) != 0 && dotrimcons == 0) ||
 	    doreallocblks == 0)
 		return (ENOSPC);
 
 	/*
 	 * We can't wait in softdep prealloc as it may fsync and recurse
 	 * here.  Instead we simply fail to reallocate blocks if this
 	 * rare condition arises.
 	 */
 	if (DOINGSOFTDEP(ap->a_vp))
 		if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0)
 			return (ENOSPC);
 	if (ump->um_fstype == UFS1)
 		return (ffs_reallocblks_ufs1(ap));
 	return (ffs_reallocblks_ufs2(ap));
 }
 	
 static int
 ffs_reallocblks_ufs1(ap)
 	struct vop_reallocblks_args /* {
 		struct vnode *a_vp;
 		struct cluster_save *a_buflist;
 	} */ *ap;
 {
 	struct fs *fs;
 	struct inode *ip;
 	struct vnode *vp;
 	struct buf *sbp, *ebp, *bp;
 	ufs1_daddr_t *bap, *sbap, *ebap;
 	struct cluster_save *buflist;
 	struct ufsmount *ump;
 	ufs_lbn_t start_lbn, end_lbn;
 	ufs1_daddr_t soff, newblk, blkno;
 	ufs2_daddr_t pref;
 	struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp;
 	int i, cg, len, start_lvl, end_lvl, ssize;
 
 	vp = ap->a_vp;
 	ip = VTOI(vp);
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	/*
 	 * If we are not tracking block clusters or if we have less than 4%
 	 * free blocks left, then do not attempt to cluster. Running with
 	 * less than 5% free block reserve is not recommended and those that
 	 * choose to do so do not expect to have good file layout.
 	 */
 	if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0)
 		return (ENOSPC);
 	buflist = ap->a_buflist;
 	len = buflist->bs_nchildren;
 	start_lbn = buflist->bs_children[0]->b_lblkno;
 	end_lbn = start_lbn + len - 1;
 #ifdef INVARIANTS
 	for (i = 0; i < len; i++)
 		if (!ffs_checkblk(ip,
 		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 1");
 	for (i = 1; i < len; i++)
 		if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
 			panic("ffs_reallocblks: non-logical cluster");
 	blkno = buflist->bs_children[0]->b_blkno;
 	ssize = fsbtodb(fs, fs->fs_frag);
 	for (i = 1; i < len - 1; i++)
 		if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
 			panic("ffs_reallocblks: non-physical cluster %d", i);
 #endif
 	/*
 	 * If the cluster crosses the boundary for the first indirect
 	 * block, leave space for the indirect block. Indirect blocks
 	 * are initially laid out in a position after the last direct
 	 * block. Block reallocation would usually destroy locality by
 	 * moving the indirect block out of the way to make room for
 	 * data blocks if we didn't compensate here. We should also do
 	 * this for other indirect block boundaries, but it is only
 	 * important for the first one.
 	 */
 	if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR)
 		return (ENOSPC);
 	/*
 	 * If the latest allocation is in a new cylinder group, assume that
 	 * the filesystem has decided to move and do not force it back to
 	 * the previous cylinder group.
 	 */
 	if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
 	    dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
 		return (ENOSPC);
 	if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
 	    ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
 		return (ENOSPC);
 	/*
 	 * Get the starting offset and block map for the first block.
 	 */
 	if (start_lvl == 0) {
 		sbap = &ip->i_din1->di_db[0];
 		soff = start_lbn;
 	} else {
 		idp = &start_ap[start_lvl - 1];
 		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
 			brelse(sbp);
 			return (ENOSPC);
 		}
 		sbap = (ufs1_daddr_t *)sbp->b_data;
 		soff = idp->in_off;
 	}
 	/*
 	 * If the block range spans two block maps, get the second map.
 	 */
 	ebap = NULL;
 	if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
 		ssize = len;
 	} else {
 #ifdef INVARIANTS
 		if (start_lvl > 0 &&
 		    start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
 			panic("ffs_reallocblk: start == end");
 #endif
 		ssize = len - (idp->in_off + 1);
 		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
 			goto fail;
 		ebap = (ufs1_daddr_t *)ebp->b_data;
 	}
 	/*
 	 * Find the preferred location for the cluster. If we have not
 	 * previously failed at this endeavor, then follow our standard
 	 * preference calculation. If we have failed at it, then pick up
 	 * where we last ended our search.
 	 */
 	UFS_LOCK(ump);
 	if (ip->i_nextclustercg == -1)
 		pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap);
 	else
 		pref = cgdata(fs, ip->i_nextclustercg);
 	/*
 	 * Search the block map looking for an allocation of the desired size.
 	 * To avoid wasting too much time, we limit the number of cylinder
 	 * groups that we will search.
 	 */
 	cg = dtog(fs, pref);
 	for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) {
 		if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0)
 			break;
 		cg += 1;
 		if (cg >= fs->fs_ncg)
 			cg = 0;
 	}
 	/*
 	 * If we have failed in our search, record where we gave up for
 	 * next time. Otherwise, fall back to our usual search citerion.
 	 */
 	if (newblk == 0) {
 		ip->i_nextclustercg = cg;
 		UFS_UNLOCK(ump);
 		goto fail;
 	}
 	ip->i_nextclustercg = -1;
 	/*
 	 * We have found a new contiguous block.
 	 *
 	 * First we have to replace the old block pointers with the new
 	 * block pointers in the inode and indirect blocks associated
 	 * with the file.
 	 */
 #ifdef DIAGNOSTIC
 	if (prtrealloc)
 		printf("realloc: ino %ju, lbns %jd-%jd\n\told:",
 		    (uintmax_t)ip->i_number,
 		    (intmax_t)start_lbn, (intmax_t)end_lbn);
 #endif
 	blkno = newblk;
 	for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
 		if (i == ssize) {
 			bap = ebap;
 			soff = -i;
 		}
 #ifdef INVARIANTS
 		if (!ffs_checkblk(ip,
 		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 2");
 		if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
 			panic("ffs_reallocblks: alloc mismatch");
 #endif
 #ifdef DIAGNOSTIC
 		if (prtrealloc)
 			printf(" %d,", *bap);
 #endif
 		if (DOINGSOFTDEP(vp)) {
 			if (sbap == &ip->i_din1->di_db[0] && i < ssize)
 				softdep_setup_allocdirect(ip, start_lbn + i,
 				    blkno, *bap, fs->fs_bsize, fs->fs_bsize,
 				    buflist->bs_children[i]);
 			else
 				softdep_setup_allocindir_page(ip, start_lbn + i,
 				    i < ssize ? sbp : ebp, soff + i, blkno,
 				    *bap, buflist->bs_children[i]);
 		}
 		*bap++ = blkno;
 	}
 	/*
 	 * Next we must write out the modified inode and indirect blocks.
 	 * For strict correctness, the writes should be synchronous since
 	 * the old block values may have been written to disk. In practise
 	 * they are almost never written, but if we are concerned about
 	 * strict correctness, the `doasyncfree' flag should be set to zero.
 	 *
 	 * The test on `doasyncfree' should be changed to test a flag
 	 * that shows whether the associated buffers and inodes have
 	 * been written. The flag should be set when the cluster is
 	 * started and cleared whenever the buffer or inode is flushed.
 	 * We can then check below to see if it is set, and do the
 	 * synchronous write only when it has been cleared.
 	 */
 	if (sbap != &ip->i_din1->di_db[0]) {
 		if (doasyncfree)
 			bdwrite(sbp);
 		else
 			bwrite(sbp);
 	} else {
 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
 		if (!doasyncfree)
 			ffs_update(vp, 1);
 	}
 	if (ssize < len) {
 		if (doasyncfree)
 			bdwrite(ebp);
 		else
 			bwrite(ebp);
 	}
 	/*
 	 * Last, free the old blocks and assign the new blocks to the buffers.
 	 */
 #ifdef DIAGNOSTIC
 	if (prtrealloc)
 		printf("\n\tnew:");
 #endif
 	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
 		bp = buflist->bs_children[i];
 		if (!DOINGSOFTDEP(vp))
 			/*
 			 * The usual case is that a set of N-contiguous blocks
 			 * that was just allocated has been replaced with a
 			 * set of N+1-contiguous blocks. If they are marked as
 			 * B_DELWRI, the current contents have not been written
 			 * to disk. It is possible that the blocks were written
 			 * earlier, but very uncommon. If the blocks have never
 			 * been written, there is no need to send a BIO_DELETE
 			 * for them when they are freed. The gain from avoiding
 			 * the TRIMs for the common case of unwritten blocks
 			 * far exceeds the cost of the write amplification for
 			 * the uncommon case of failing to send a TRIM for the
 			 * blocks that had been written.
 			 */
 			ffs_blkfree(ump, fs, ump->um_devvp,
 			    dbtofsb(fs, bp->b_blkno),
 			    fs->fs_bsize, ip->i_number, vp->v_type, NULL,
 			    (bp->b_flags & B_DELWRI) != 0 ?
 			    NOTRIM_KEY : SINGLETON_KEY);
 		bp->b_blkno = fsbtodb(fs, blkno);
 #ifdef INVARIANTS
 		if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 3");
 #endif
 #ifdef DIAGNOSTIC
 		if (prtrealloc)
 			printf(" %d,", blkno);
 #endif
 	}
 #ifdef DIAGNOSTIC
 	if (prtrealloc) {
 		prtrealloc--;
 		printf("\n");
 	}
 #endif
 	return (0);
 
 fail:
 	if (ssize < len)
 		brelse(ebp);
 	if (sbap != &ip->i_din1->di_db[0])
 		brelse(sbp);
 	return (ENOSPC);
 }
 
 /*
  * UFS2 worker for ffs_reallocblks(): try to move the blocks behind the
  * buffers in ap->a_buflist into a single newly allocated contiguous
  * cluster.
  *
  * On success (return 0) the block pointers in the inode or indirect
  * block(s) and each buffer's b_blkno refer to the new cluster.  The old
  * blocks are released with ffs_blkfree() when soft updates are not in
  * use; otherwise the old block numbers are handed to the softdep_setup_*
  * routines.  Every refusal or failure returns ENOSPC and leaves the
  * existing block pointers untouched.
  */
 static int
 ffs_reallocblks_ufs2(ap)
 	struct vop_reallocblks_args /* {
 		struct vnode *a_vp;
 		struct cluster_save *a_buflist;
 	} */ *ap;
 {
 	struct fs *fs;
 	struct inode *ip;
 	struct vnode *vp;
 	struct buf *sbp, *ebp, *bp;
 	ufs2_daddr_t *bap, *sbap, *ebap;
 	struct cluster_save *buflist;
 	struct ufsmount *ump;
 	ufs_lbn_t start_lbn, end_lbn;
 	ufs2_daddr_t soff, newblk, blkno, pref;
 	struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp;
 	int i, cg, len, start_lvl, end_lvl, ssize;
 
 	vp = ap->a_vp;
 	ip = VTOI(vp);
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	/*
 	 * If we are not tracking block clusters or if we have less than 4%
 	 * free blocks left, then do not attempt to cluster. Running with
 	 * less than 5% free block reserve is not recommended and those that
 	 * choose to do so do not expect to have good file layout.
 	 */
 	if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0)
 		return (ENOSPC);
 	buflist = ap->a_buflist;
 	len = buflist->bs_nchildren;
 	start_lbn = buflist->bs_children[0]->b_lblkno;
 	end_lbn = start_lbn + len - 1;
 #ifdef INVARIANTS
 	/*
 	 * Sanity-check the incoming cluster: every block must be allocated
 	 * and logically consecutive, and all but the last must be physically
 	 * consecutive.  ssize is borrowed here as a scratch value; it gets
 	 * its real meaning further down.
 	 */
 	for (i = 0; i < len; i++)
 		if (!ffs_checkblk(ip,
 		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 1");
 	for (i = 1; i < len; i++)
 		if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
 			panic("ffs_reallocblks: non-logical cluster");
 	blkno = buflist->bs_children[0]->b_blkno;
 	ssize = fsbtodb(fs, fs->fs_frag);
 	for (i = 1; i < len - 1; i++)
 		if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
 			panic("ffs_reallocblks: non-physical cluster %d", i);
 #endif
 	/*
 	 * If the cluster crosses the boundary for the first indirect
 	 * block, do not move anything in it. Indirect blocks are
 	 * usually initially laid out in a position between the data
 	 * blocks. Block reallocation would usually destroy locality by
 	 * moving the indirect block out of the way to make room for
 	 * data blocks if we didn't compensate here. We should also do
 	 * this for other indirect block boundaries, but it is only
 	 * important for the first one.
 	 */
 	if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR)
 		return (ENOSPC);
 	/*
 	 * If the latest allocation is in a new cylinder group, assume that
 	 * the filesystem has decided to move and do not force it back to
 	 * the previous cylinder group.
 	 */
 	if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
 	    dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
 		return (ENOSPC);
 	if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
 	    ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
 		return (ENOSPC);
 	/*
 	 * Get the starting offset and block map for the first block.
 	 */
 	if (start_lvl == 0) {
 		sbap = &ip->i_din2->di_db[0];
 		soff = start_lbn;
 	} else {
 		idp = &start_ap[start_lvl - 1];
 		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
 			brelse(sbp);
 			return (ENOSPC);
 		}
 		sbap = (ufs2_daddr_t *)sbp->b_data;
 		soff = idp->in_off;
 	}
 	/*
 	 * If the block range spans two block maps, get the second map.
 	 * From here on ssize is the number of cluster entries that live in
 	 * the first map; ssize < len means a second map (ebp/ebap) is in use.
 	 */
 	ebap = NULL;
 	if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
 		ssize = len;
 	} else {
 #ifdef INVARIANTS
 		if (start_lvl > 0 &&
 		    start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
 			panic("ffs_reallocblk: start == end");
 #endif
 		ssize = len - (idp->in_off + 1);
 		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
 			goto fail;
 		ebap = (ufs2_daddr_t *)ebp->b_data;
 	}
 	/*
 	 * Find the preferred location for the cluster. If we have not
 	 * previously failed at this endeavor, then follow our standard
 	 * preference calculation. If we have failed at it, then pick up
 	 * where we last ended our search.
 	 */
 	UFS_LOCK(ump);
 	if (ip->i_nextclustercg == -1)
 		pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap);
 	else
 		pref = cgdata(fs, ip->i_nextclustercg);
 	/*
 	 * Search the block map looking for an allocation of the desired size.
 	 * To avoid wasting too much time, we limit the number of cylinder
 	 * groups that we will search.
 	 *
 	 * newblk is cleared first: if the maxclustersearch sysctl is set to
 	 * zero the loop body never runs, and the failure test below must
 	 * not read an uninitialized value.
 	 */
 	newblk = 0;
 	cg = dtog(fs, pref);
 	for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) {
 		if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0)
 			break;
 		cg += 1;
 		if (cg >= fs->fs_ncg)
 			cg = 0;
 	}
 	/*
 	 * If we have failed in our search, record where we gave up for
 	 * next time. Otherwise, fall back to our usual search criterion.
 	 */
 	if (newblk == 0) {
 		ip->i_nextclustercg = cg;
 		UFS_UNLOCK(ump);
 		goto fail;
 	}
 	ip->i_nextclustercg = -1;
 	/*
 	 * We have found a new contiguous block.
 	 *
 	 * First we have to replace the old block pointers with the new
 	 * block pointers in the inode and indirect blocks associated
 	 * with the file.
 	 */
 #ifdef DIAGNOSTIC
 	if (prtrealloc)
 		printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number,
 		    (intmax_t)start_lbn, (intmax_t)end_lbn);
 #endif
 	blkno = newblk;
 	for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
 		if (i == ssize) {
 			/*
 			 * Crossed into the second map: continue at its first
 			 * entry and rebase soff so that soff + i is the
 			 * offset within that map.
 			 */
 			bap = ebap;
 			soff = -i;
 		}
 #ifdef INVARIANTS
 		if (!ffs_checkblk(ip,
 		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 2");
 		if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
 			panic("ffs_reallocblks: alloc mismatch");
 #endif
 #ifdef DIAGNOSTIC
 		if (prtrealloc)
 			printf(" %jd,", (intmax_t)*bap);
 #endif
 		if (DOINGSOFTDEP(vp)) {
 			if (sbap == &ip->i_din2->di_db[0] && i < ssize)
 				softdep_setup_allocdirect(ip, start_lbn + i,
 				    blkno, *bap, fs->fs_bsize, fs->fs_bsize,
 				    buflist->bs_children[i]);
 			else
 				softdep_setup_allocindir_page(ip, start_lbn + i,
 				    i < ssize ? sbp : ebp, soff + i, blkno,
 				    *bap, buflist->bs_children[i]);
 		}
 		*bap++ = blkno;
 	}
 	/*
 	 * Next we must write out the modified inode and indirect blocks.
 	 * For strict correctness, the writes should be synchronous since
 	 * the old block values may have been written to disk. In practise
 	 * they are almost never written, but if we are concerned about
 	 * strict correctness, the `doasyncfree' flag should be set to zero.
 	 *
 	 * The test on `doasyncfree' should be changed to test a flag
 	 * that shows whether the associated buffers and inodes have
 	 * been written. The flag should be set when the cluster is
 	 * started and cleared whenever the buffer or inode is flushed.
 	 * We can then check below to see if it is set, and do the
 	 * synchronous write only when it has been cleared.
 	 */
 	if (sbap != &ip->i_din2->di_db[0]) {
 		if (doasyncfree)
 			bdwrite(sbp);
 		else
 			bwrite(sbp);
 	} else {
 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
 		if (!doasyncfree)
 			ffs_update(vp, 1);
 	}
 	if (ssize < len) {
 		if (doasyncfree)
 			bdwrite(ebp);
 		else
 			bwrite(ebp);
 	}
 	/*
 	 * Last, free the old blocks and assign the new blocks to the buffers.
 	 */
 #ifdef DIAGNOSTIC
 	if (prtrealloc)
 		printf("\n\tnew:");
 #endif
 	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
 		bp = buflist->bs_children[i];
 		if (!DOINGSOFTDEP(vp))
 			/*
 			 * The usual case is that a set of N-contiguous blocks
 			 * that was just allocated has been replaced with a
 			 * set of N+1-contiguous blocks. If they are marked as
 			 * B_DELWRI, the current contents have not been written
 			 * to disk. It is possible that the blocks were written
 			 * earlier, but very uncommon. If the blocks have never
 			 * been written, there is no need to send a BIO_DELETE
 			 * for them when they are freed. The gain from avoiding
 			 * the TRIMs for the common case of unwritten blocks
 			 * far exceeds the cost of the write amplification for
 			 * the uncommon case of failing to send a TRIM for the
 			 * blocks that had been written.
 			 */
 			ffs_blkfree(ump, fs, ump->um_devvp,
 			    dbtofsb(fs, bp->b_blkno),
 			    fs->fs_bsize, ip->i_number, vp->v_type, NULL,
 			    (bp->b_flags & B_DELWRI) != 0 ?
 			    NOTRIM_KEY : SINGLETON_KEY);
 		bp->b_blkno = fsbtodb(fs, blkno);
 #ifdef INVARIANTS
 		if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 3");
 #endif
 #ifdef DIAGNOSTIC
 		if (prtrealloc)
 			printf(" %jd,", (intmax_t)blkno);
 #endif
 	}
 #ifdef DIAGNOSTIC
 	if (prtrealloc) {
 		prtrealloc--;
 		printf("\n");
 	}
 #endif
 	return (0);
 
 fail:
 	/* Release whichever indirect-block buffers were obtained above. */
 	if (ssize < len)
 		brelse(ebp);
 	if (sbap != &ip->i_din2->di_db[0])
 		brelse(sbp);
 	return (ENOSPC);
 }
 
 /*
  * Allocate an inode in the filesystem.
  *
  * If allocating a directory, use ffs_dirpref to select the inode.
  * If allocating in a directory, the following hierarchy is followed:
  *   1) allocate the preferred inode.
  *   2) allocate an inode in the same cylinder group.
  *   3) quadratically rehash into other cylinder groups, until an
  *      available inode is located.
  * If no inode preference is given the following hierarchy is used
  * to allocate an inode:
  *   1) allocate an inode in cylinder group 0.
  *   2) quadratically rehash into other cylinder groups, until an
  *      available inode is located.
  */
 /*
  * pvp:  vnode of the parent directory; its inode supplies the placement
  *       preference and the filesystem to allocate from.
  * mode: file mode of the new inode; only the IFMT type bits are examined
  *       here, to tell directories from everything else.
  * cred: credentials passed through to softdep_request_cleanup().
  * vpp:  out parameter; set to NULL on entry and to the new vnode on
  *       success.
  *
  * Returns 0 on success, the ffs_vgetf() error if the vnode cannot be
  * obtained, ENXIO if ffs_fsfail_cleanup_locked() reports failure, and
  * ENOSPC when no inode can be allocated.
  */
 int
 ffs_valloc(pvp, mode, cred, vpp)
 	struct vnode *pvp;
 	int mode;
 	struct ucred *cred;
 	struct vnode **vpp;
 {
 	struct inode *pip;
 	struct fs *fs;
 	struct inode *ip;
 	struct timespec ts;
 	struct ufsmount *ump;
 	ino_t ino, ipref;
 	u_int cg;
 	int error, reclaimed;
 
 	*vpp = NULL;
 	pip = VTOI(pvp);
 	ump = ITOUMP(pip);
 	fs = ump->um_fs;
 
 	/*
 	 * NOTE(review): the success path below never calls UFS_UNLOCK()
 	 * itself; presumably the allocator invoked through ffs_hashalloc()
 	 * drops the lock when it succeeds -- confirm against
 	 * ffs_nodealloccg().
 	 */
 	UFS_LOCK(ump);
 	reclaimed = 0;
 retry:
 	/* Nothing to hand out if the filesystem-wide free inode count is 0. */
 	if (fs->fs_cstotal.cs_nifree == 0)
 		goto noinodes;
 
 	/*
 	 * Directories get their preference from ffs_dirpref(); everything
 	 * else prefers the parent's inode number.  A preference outside
 	 * the filesystem's inode range is replaced by 0.
 	 */
 	if ((mode & IFMT) == IFDIR)
 		ipref = ffs_dirpref(pip);
 	else
 		ipref = pip->i_number;
 	if (ipref >= fs->fs_ncg * fs->fs_ipg)
 		ipref = 0;
 	cg = ino_to_cg(fs, ipref);
 	/*
 	 * Track number of dirs created one after another
 	 * in a same cg without intervening by files.
 	 * The counter saturates at 255 and never drops below 0.
 	 */
 	if ((mode & IFMT) == IFDIR) {
 		if (fs->fs_contigdirs[cg] < 255)
 			fs->fs_contigdirs[cg]++;
 	} else {
 		if (fs->fs_contigdirs[cg] > 0)
 			fs->fs_contigdirs[cg]--;
 	}
 	ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0,
 					(allocfcn_t *)ffs_nodealloccg);
 	if (ino == 0)
 		goto noinodes;
 	/*
 	 * Get rid of the cached old vnode, force allocation of a new vnode
 	 * for this inode. If this fails, release the allocated ino and
 	 * return the error.
 	 */
 	if ((error = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp,
 	    FFSV_FORCEINSMQ | FFSV_REPLACE)) != 0) {
 		ffs_vfree(pvp, ino, mode);
 		return (error);
 	}
 	/*
 	 * We got an inode, so check mode and panic if it is already allocated.
 	 */
 	ip = VTOI(*vpp);
 	if (ip->i_mode) {
 		printf("mode = 0%o, inum = %ju, fs = %s\n",
 		    ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt);
 		panic("ffs_valloc: dup alloc");
 	}
 	/*
 	 * A free inode should own no blocks.  Unless the filesystem is
 	 * flagged FS_UNCLEAN, report a stale count and reset it to zero.
 	 */
 	if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) {  /* XXX */
 		printf("free inode %s/%lu had %ld blocks\n",
 		    fs->fs_fsmnt, (u_long)ino, (long)DIP(ip, i_blocks));
 		DIP_SET(ip, i_blocks, 0);
 	}
 	ip->i_flags = 0;
 	DIP_SET(ip, i_flags, 0);
 	/*
 	 * Set up a new generation number for this inode.
 	 * The generation is incremented; whenever it is zero beforehand or
 	 * wraps to zero, a random value is drawn and the step repeated, so
 	 * the value stored is never zero.
 	 */
 	while (ip->i_gen == 0 || ++ip->i_gen == 0)
 		ip->i_gen = arc4random();
 	DIP_SET(ip, i_gen, ip->i_gen);
 	/* Only the UFS2 on-disk inode is given a birth time here. */
 	if (fs->fs_magic == FS_UFS2_MAGIC) {
 		vfs_timestamp(&ts);
 		ip->i_din2->di_birthtime = ts.tv_sec;
 		ip->i_din2->di_birthnsec = ts.tv_nsec;
 	}
 	/* Reset the in-core state; the vnode type is left as VNON here. */
 	ip->i_flag = 0;
 	(*vpp)->v_vflag = 0;
 	(*vpp)->v_type = VNON;
 	if (fs->fs_magic == FS_UFS2_MAGIC) {
 		(*vpp)->v_op = &ffs_vnodeops2;
 		UFS_INODE_SET_FLAG(ip, IN_UFS2);
 	} else {
 		(*vpp)->v_op = &ffs_vnodeops1;
 	}
 	return (0);
 noinodes:
 	/*
 	 * First failure: ask soft updates to flush inodes, then retry the
 	 * allocation once.  NOTE(review): this path is entered with the UFS
 	 * lock held and jumps back to code that expects it held, so
 	 * softdep_request_cleanup() presumably returns with it held --
 	 * confirm.
 	 */
 	if (reclaimed == 0) {
 		reclaimed = 1;
 		softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT);
 		goto retry;
 	}
 	if (ffs_fsfail_cleanup_locked(ump, 0)) {
 		UFS_UNLOCK(ump);
 		return (ENXIO);
 	}
 	/*
 	 * Emit the "out of inodes" diagnostics only when ppsratecheck()
 	 * allows it; the lock is dropped before printing in either case.
 	 */
 	if (ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) {
 		UFS_UNLOCK(ump);
 		ffs_fserr(fs, pip->i_number, "out of inodes");
 		uprintf("\n%s: create/symlink failed, no inodes free\n",
 		    fs->fs_fsmnt);
 	} else {
 		UFS_UNLOCK(ump);
 	}
 	return (ENOSPC);
 }
 
 /*
  * Find a cylinder group to place a directory.
  *
  * The policy implemented by this algorithm is to allocate a
  * directory inode in the same cylinder group as its parent
  * directory, but also to reserve space for its files inodes
  * and data. Restrict the number of directories which may be
  * allocated one after another in the same cylinder group
  * without intervening allocation of files.
  *
  * If we allocate a first level directory then force allocation
  * in another cylinder group.
  */
 /*
  * Pick the inode number to use as the allocation preference for a new
  * directory created under pip.  The value returned is always the first
  * inode of the chosen cylinder group.  The caller must hold the UFS
  * mutex and have the parent vnode locked.
  */
 static ino_t
 ffs_dirpref(pip)
 	struct inode *pip;
 {
 	struct fs *fs;
 	int cg, prefcg, dirsize, cgsize;
 	u_int avgifree, avgbfree, avgndir, curdirsize;
 	u_int minifree, minbfree, maxndir;
 	u_int mincg, minndir;
 	u_int maxcontigdirs;
 
 	mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED);
 	fs = ITOFS(pip);
 
 	/* Per-cylinder-group averages of directories, free blocks, inodes. */
 	avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
 	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
 	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
 
 	/*
 	 * A directory created directly under the root is pushed into a
 	 * different group: start at a random group and, scanning once
 	 * around, keep the group with the fewest directories among those
 	 * with at least average free inodes and blocks.
 	 */
 	ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref");
 	if ((ITOV(pip)->v_vflag & VV_ROOT) != 0) {
 		prefcg = arc4random() % fs->fs_ncg;
 		mincg = prefcg;
 		minndir = fs->fs_ipg;
 		for (cg = prefcg; cg < fs->fs_ncg; cg++) {
 			if (fs->fs_cs(fs, cg).cs_ndir >= minndir ||
 			    fs->fs_cs(fs, cg).cs_nifree < avgifree ||
 			    fs->fs_cs(fs, cg).cs_nbfree < avgbfree)
 				continue;
 			mincg = cg;
 			minndir = fs->fs_cs(fs, cg).cs_ndir;
 		}
 		for (cg = 0; cg < prefcg; cg++) {
 			if (fs->fs_cs(fs, cg).cs_ndir >= minndir ||
 			    fs->fs_cs(fs, cg).cs_nifree < avgifree ||
 			    fs->fs_cs(fs, cg).cs_nbfree < avgbfree)
 				continue;
 			mincg = cg;
 			minndir = fs->fs_cs(fs, cg).cs_ndir;
 		}
 		return ((ino_t)(fs->fs_ipg * mincg));
 	}
 
 	/*
 	 * Derive the thresholds used to judge a candidate group.
 	 */
 	maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
 	minifree = avgifree - avgifree / 4;
 	if (minifree == 0)
 		minifree = 1;
 	minbfree = avgbfree - avgbfree / 4;
 	if (minbfree == 0)
 		minbfree = 1;
 	cgsize = fs->fs_fsize * fs->fs_fpg;
 	dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir;
 	if (avgndir != 0)
 		curdirsize = (cgsize - avgbfree * fs->fs_bsize) / avgndir;
 	else
 		curdirsize = 0;
 	if (dirsize < curdirsize)
 		dirsize = curdirsize;
 	/* A non-positive dirsize means the product above overflowed. */
 	maxcontigdirs = dirsize <= 0 ? 0 :
 	    min((avgbfree * fs->fs_bsize) / dirsize, 255);
 	if (fs->fs_avgfpdir > 0)
 		maxcontigdirs = min(maxcontigdirs,
 		    fs->fs_ipg / fs->fs_avgfpdir);
 	if (maxcontigdirs == 0)
 		maxcontigdirs = 1;
 
 	/*
 	 * Cap the number of directories per group and keep room for
 	 * regular files, as long as inodes and space are not scarce.
 	 *
 	 * The scan runs forward from the parent's group and, on reaching
 	 * the last group, wraps to the front of the filesystem and keeps
 	 * going forward.  Scanning backward, or alternating directions,
 	 * behaves badly on a nearly full filesystem: each request would
 	 * first revisit all the exhausted groups before trying the one
 	 * preceding them, and in the worst case walk the whole filesystem.
 	 * Always moving forward means later searches look at new groups
 	 * and every usable one is found within a single pass.
 	 */
 	prefcg = ino_to_cg(fs, pip->i_number);
 	for (cg = prefcg; cg < fs->fs_ncg; cg++) {
 		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
 		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
 		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree &&
 		    fs->fs_contigdirs[cg] < maxcontigdirs)
 			return ((ino_t)(fs->fs_ipg * cg));
 	}
 	for (cg = 0; cg < prefcg; cg++) {
 		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
 		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
 		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree &&
 		    fs->fs_contigdirs[cg] < maxcontigdirs)
 			return ((ino_t)(fs->fs_ipg * cg));
 	}
 	/*
 	 * Backstop for a space deficit: settle for any group with at
 	 * least the average number of free inodes.  If the wrapped scan
 	 * finds none, cg is left at the value where that scan stopped.
 	 */
 	for (cg = prefcg; cg < fs->fs_ncg; cg++) {
 		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
 			return ((ino_t)(fs->fs_ipg * cg));
 	}
 	for (cg = 0; cg < prefcg; cg++) {
 		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
 			return ((ino_t)(fs->fs_ipg * cg));
 	}
 	return ((ino_t)(fs->fs_ipg * cg));
 }
 
 /*
  * Select the desired position for the next block in a file.  The file is
  * logically divided into sections. The first section is composed of the
  * direct blocks and the next fs_maxbpg blocks. Each additional section
  * contains fs_maxbpg blocks.
  *
  * If no blocks have been allocated in the first section, the policy is to
  * request a block in the same cylinder group as the inode that describes
  * the file. The first indirect is allocated immediately following the last
  * direct block and the data blocks for the first indirect immediately
  * follow it.
  *
  * If no blocks have been allocated in any other section, the indirect 
  * block(s) are allocated in the same cylinder group as its inode in an
  * area reserved immediately following the inode blocks. The policy for
  * the data blocks is to place them in a cylinder group with a greater than
  * average number of free blocks. An appropriate cylinder group is found
  * by using a rotor that sweeps the cylinder groups. When a new group of
  * blocks is needed, the sweep begins in the cylinder group following the
  * cylinder group from which the previous allocation was made. The sweep
  * continues until a cylinder group with greater than the average number
  * of free blocks is found. If the allocation is for the first block in an
  * indirect block or the previous block is a hole, then the information on
  * the previous allocation is unavailable; here a best guess is made based
  * on the logical block number being allocated.
  *
  * If a section is already partially allocated, the policy is to
  * allocate blocks contiguously within the section if possible.
  */
 /*
  * UFS1 block placement preference.
  *
  * ip:   inode the block is being allocated for.
  * lbn:  logical block number being allocated.
  * indx: index of the block within bap, or a negative value when an
  *       indirect block is being allocated (-1 single, -2 double,
  *       -3 triple).
  * bap:  block pointer array containing the entry; may be NULL only when
  *       indx <= 0.
  *
  * Returns the preferred filesystem block number, or 0 when no cylinder
  * group qualifies.  The caller must hold the UFS mutex.
  */
 ufs2_daddr_t
 ffs_blkpref_ufs1(ip, lbn, indx, bap)
 	struct inode *ip;
 	ufs_lbn_t lbn;
 	int indx;
 	ufs1_daddr_t *bap;
 {
 	struct fs *fs;
 	ufs2_daddr_t prevbn, pref;
 	u_int inocg, startcg, avgbfree, cg;
 
 	KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
 	mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
 	fs = ITOFS(ip);
 	inocg = ino_to_cg(fs, ip->i_number);
 
 	/*
 	 * Indirect block allocation (negative indx).  The first indirect
 	 * is kept inline with the file data where possible; later ones go
 	 * to the area at the start of the inode's cylinder group data
 	 * region that the filesystem tries to reserve for them (the first
 	 * fs_metaspace blocks, typically half of fs_minfree), which helps
 	 * random file access and fsck.
 	 */
 	if (indx < 0) {
 		pref = cgmeta(fs, inocg);
 		if (indx != -1 || lbn >= UFS_NDADDR + NINDIR(fs))
 			return (pref);
 		if (ip->i_din1->di_db[UFS_NDADDR - 1] == 0)
 			return (pref);
 		/* First indirect: right behind the last direct block. */
 		pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag;
 		return (pref);
 	}
 
 	/*
 	 * First data block of the first indirect: if that indirect sits in
 	 * the data area of the inode's group, go right behind it.
 	 */
 	if (lbn == UFS_NDADDR) {
 		pref = ip->i_din1->di_ib[0];
 		if (pref != 0 && pref >= cgdata(fs, inocg) &&
 		    pref < cgbase(fs, inocg + 1))
 			return (pref + fs->fs_frag);
 	}
 
 	/*
 	 * Fetch the preceding block pointer, treating one that fails
 	 * UFS_CHECK_BLKNO() as if it were absent.
 	 */
 	prevbn = 0;
 	if (indx != 0) {
 		prevbn = bap[indx - 1];
 		if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn,
 		    fs->fs_bsize) != 0)
 			prevbn = 0;
 	}
 
 	/*
 	 * Mid-section with a known predecessor: stay contiguous.
 	 */
 	if (indx % fs->fs_maxbpg != 0 && prevbn != 0)
 		return (prevbn + fs->fs_frag);
 
 	/*
 	 * Otherwise a fresh starting point is needed: we are at the start
 	 * of a section or have no predecessor to follow.
 	 */
 
 	/* Directory data blocks belong in the metadata area. */
 	if ((ip->i_mode & IFMT) == IFDIR)
 		return (cgmeta(fs, inocg));
 
 	/*
 	 * While still within the direct blocks and the first indirect's
 	 * range, stay in the data area of the inode's own group.
 	 */
 	if (lbn < UFS_NDADDR + NINDIR(fs))
 		return (cgdata(fs, inocg));
 
 	/*
 	 * Sweep for a group holding at least the average number of free
 	 * blocks, starting after the previous allocation's group when it
 	 * is known, else at a guess derived from the logical block number.
 	 */
 	if (indx != 0 && prevbn != 0)
 		startcg = dtog(fs, prevbn) + 1;
 	else
 		startcg = inocg + lbn / fs->fs_maxbpg;
 	startcg %= fs->fs_ncg;
 	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
 	for (cg = startcg; cg < fs->fs_ncg; cg++) {
 		if (fs->fs_cs(fs, cg).cs_nbfree < avgbfree)
 			continue;
 		fs->fs_cgrotor = cg;
 		return (cgdata(fs, cg));
 	}
 	for (cg = 0; cg <= startcg; cg++) {
 		if (fs->fs_cs(fs, cg).cs_nbfree < avgbfree)
 			continue;
 		fs->fs_cgrotor = cg;
 		return (cgdata(fs, cg));
 	}
 	return (0);
 }
 
 /*
  * Pick a preferred disk address for the next allocation of a UFS2 inode.
  *
  * ip   - inode being extended.
  * lbn  - logical block number being allocated.
  * indx - index into bap of the block being allocated; a negative value
  *        asks for an indirect block (-1 single, -2 double, -3 triple).
  * bap  - array of already allocated block addresses; must be non-NULL
  *        whenever indx is positive.
  *
  * Returns the preferred block address, or 0 when no cylinder group has
  * at least the average number of free blocks.  The UFS mutex must be
  * held by the caller.
  */
 ufs2_daddr_t
 ffs_blkpref_ufs2(ip, lbn, indx, bap)
 	struct inode *ip;
 	ufs_lbn_t lbn;
 	int indx;
 	ufs2_daddr_t *bap;
 {
 	struct fs *fs;
 	u_int inocg, startcg, scan;
 	u_int avgbfree;
 	ufs2_daddr_t lastdirect, firstind, prevbn;
 
 	KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
 	mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
 	fs = ITOFS(ip);
 	inocg = ino_to_cg(fs, ip->i_number);
 
 	/*
 	 * Indirect blocks.  The filesystem reserves the first fs_metaspace
 	 * blocks (typically half of fs_minfree) of each cylinder group's
 	 * data area for them, which speeds up random access and fsck.
 	 * The exception is the first indirect block, which we try to keep
 	 * inline with the file data by placing it right after the last
 	 * direct block when that block exists.
 	 */
 	if (indx < 0) {
 		if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs)) {
 			lastdirect = ip->i_din2->di_db[UFS_NDADDR - 1];
 			if (lastdirect != 0)
 				return (lastdirect + fs->fs_frag);
 		}
 		return (cgmeta(fs, inocg));
 	}
 
 	/*
 	 * First data block of the first indirect block: when the indirect
 	 * block itself lives in the data area of the inode's cylinder
 	 * group, continue immediately after it.
 	 */
 	if (lbn == UFS_NDADDR) {
 		firstind = ip->i_din2->di_ib[0];
 		if (firstind != 0 && firstind >= cgdata(fs, inocg) &&
 		    firstind < cgbase(fs, inocg + 1))
 			return (firstind + fs->fs_frag);
 	}
 
 	/*
 	 * Look up the block that logically precedes this one.  It is
 	 * treated as absent when we are at index zero or when the stored
 	 * address fails the block number sanity check.
 	 */
 	prevbn = 0;
 	if (indx != 0) {
 		prevbn = bap[indx - 1];
 		if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn,
 		    fs->fs_bsize) != 0)
 			prevbn = 0;
 	}
 
 	/*
 	 * The common case: there is a usable predecessor and we have not
 	 * yet hit the per-cylinder-group block limit, so simply lay the
 	 * new block out contiguously.
 	 */
 	if (indx % fs->fs_maxbpg != 0 && prevbn != 0)
 		return (prevbn + fs->fs_frag);
 
 	/*
 	 * From here on a fresh starting point has to be chosen.
 	 * Directory data goes into the metadata area.
 	 */
 	if ((ip->i_mode & IFMT) == IFDIR)
 		return (cgmeta(fs, inocg));
 
 	/*
 	 * While the direct blocks and the first indirect block's worth of
 	 * data are still being filled, stay in the data area of the
 	 * inode's own cylinder group.
 	 */
 	if (lbn < UFS_NDADDR + NINDIR(fs))
 		return (cgdata(fs, inocg));
 
 	/*
 	 * Otherwise search, starting from a group derived from either the
 	 * logical block number or the predecessor's group, for a cylinder
 	 * group holding at least the average number of free blocks.
 	 */
 	if (indx == 0 || prevbn == 0)
 		startcg = inocg + lbn / fs->fs_maxbpg;
 	else
 		startcg = dtog(fs, prevbn) + 1;
 	startcg %= fs->fs_ncg;
 	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
 	for (scan = startcg; scan < fs->fs_ncg; scan++) {
 		if (fs->fs_cs(fs, scan).cs_nbfree < avgbfree)
 			continue;
 		fs->fs_cgrotor = scan;
 		return (cgdata(fs, scan));
 	}
 	for (scan = 0; scan <= startcg; scan++) {
 		if (fs->fs_cs(fs, scan).cs_nbfree < avgbfree)
 			continue;
 		fs->fs_cgrotor = scan;
 		return (cgdata(fs, scan));
 	}
 	return (0);
 }
 
 /*
  * Cylinder group overflow allocator.
  *
  * The supplied allocator is tried on a sequence of cylinder groups until
  * it returns a non-zero result:
  *   1) the requested cylinder group, together with the preference;
  *   2) the groups reached by quadratically rehashing the group number;
  *   3) a linear (brute force) sweep starting two groups past the
  *      requested one.
  * Steps 2 and 3 pass a preference of 0 to the allocator.
  *
  * Returns the allocator's non-zero result, or 0 if every attempt failed.
  *
  * Must be called with the UFS lock held.  Will release the lock on success
  * and return with it held on failure.
  */
 /*VARARGS5*/
 static ufs2_daddr_t
 ffs_hashalloc(ip, cg, pref, size, rsize, allocator)
 	struct inode *ip;
 	u_int cg;
 	ufs2_daddr_t pref;
 	int size;	/* Search size for data blocks, mode for inodes */
 	int rsize;	/* Real allocated size. */
 	allocfcn_t *allocator;
 {
 	struct fs *fs;
 	ufs2_daddr_t blk;
 	u_int stride, tries, firstcg;
 
 	firstcg = cg;
 	mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
 #ifdef INVARIANTS
 	if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
 		panic("ffs_hashalloc: allocation on suspended filesystem");
 #endif
 	fs = ITOFS(ip);
 
 	/* Step 1: the cylinder group the caller asked for. */
 	blk = (*allocator)(ip, cg, pref, size, rsize);
 	if (blk != 0)
 		return (blk);
 
 	/* Step 2: hop by 1, 2, 4, ... groups, wrapping at fs_ncg. */
 	for (stride = 1; stride < fs->fs_ncg; stride <<= 1) {
 		cg += stride;
 		if (cg >= fs->fs_ncg)
 			cg -= fs->fs_ncg;
 		blk = (*allocator)(ip, cg, 0, size, rsize);
 		if (blk != 0)
 			return (blk);
 	}
 
 	/*
 	 * Step 3: walk the groups one by one.  The walk begins at offset
 	 * 2 from the original group because offset 0 was tried in step 1
 	 * and offset 1 is always the first hop of step 2.
 	 */
 	cg = (firstcg + 2) % fs->fs_ncg;
 	for (tries = 2; tries < fs->fs_ncg; tries++) {
 		blk = (*allocator)(ip, cg, 0, size, rsize);
 		if (blk != 0)
 			return (blk);
 		if (++cg == fs->fs_ncg)
 			cg = 0;
 	}
 	return (0);
 }
 
 /*
  * Determine whether a fragment can be extended.
  *
  * Check to see if the necessary fragments are available, and
  * if they are, allocate them.
  *
  * ip    - inode owning the fragment.
  * cg    - cylinder group that holds the fragment.
  * bprev - address of the existing fragment.
  * osize - current size of the fragment.
  * nsize - size the fragment should grow to.
  *
  * Returns bprev when the fragment was extended in place, 0 otherwise.
  *
  * Locking: the function unlocks the UFS lock before reading the
  * cylinder group, so it is entered with the lock held.  On success
  * it returns with the lock released; on every failure path it returns
  * with the lock held.
  */
 static ufs2_daddr_t
 ffs_fragextend(ip, cg, bprev, osize, nsize)
 	struct inode *ip;
 	u_int cg;
 	ufs2_daddr_t bprev;
 	int osize, nsize;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp;
 	struct ufsmount *ump;
 	int nffree;
 	long bno;
 	int frags, bbase;
 	int i, error;
 	u_int8_t *blksfree;
 
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	/*
 	 * Quick rejection using the in-core summary: the group does not
 	 * even have as many free fragments as the extension needs.
 	 */
 	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
 		return (0);
 	frags = numfrags(fs, nsize);
 	bbase = fragnum(fs, bprev);
 	if (bbase > fragnum(fs, (bprev + frags - 1))) {
 		/* cannot extend across a block boundary */
 		return (0);
 	}
 	UFS_UNLOCK(ump);
 	if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0)
 		goto fail;
 	bno = dtogd(fs, bprev);
 	blksfree = cg_blksfree(cgp);
 	/*
 	 * Every fragment between the old and the new end must be free
 	 * (a clear bit in the free map means the fragment is in use).
 	 */
 	for (i = numfrags(fs, osize); i < frags; i++)
 		if (isclr(blksfree, bno + i))
 			goto fail;
 	/*
 	 * the current fragment can be extended
 	 * deduct the count on fragment being extended into
 	 * increase the count on the remaining fragment (if any)
 	 * allocate the extended piece
 	 */
 	/* Find the end of the free run that follows the old fragment. */
 	for (i = frags; i < fs->fs_frag - bbase; i++)
 		if (isclr(blksfree, bno + i))
 			break;
 	cgp->cg_frsum[i - numfrags(fs, osize)]--;
 	if (i != frags)
 		cgp->cg_frsum[i - frags]++;
 	/* Mark the newly claimed fragments as allocated. */
 	for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) {
 		clrbit(blksfree, bno + i);
 		cgp->cg_cs.cs_nffree--;
 		nffree++;
 	}
 	/* The filesystem-wide summaries are updated under the UFS lock. */
 	UFS_LOCK(ump);
 	fs->fs_cstotal.cs_nffree -= nffree;
 	fs->fs_cs(fs, cg).cs_nffree -= nffree;
 	fs->fs_fmod = 1;
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
 		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev,
 		    frags, numfrags(fs, osize));
 	bdwrite(bp);
 	return (bprev);
 
 fail:
 	/*
 	 * NOTE(review): this label is also reached when ffs_getcg() itself
 	 * failed; presumably bp is NULL in that case and brelse() tolerates
 	 * it -- confirm against ffs_getcg() and brelse().
 	 */
 	brelse(bp);
 	UFS_LOCK(ump);
 	return (0);
 
 }
 
 /*
  * Determine whether a block can be allocated.
  *
  * Check to see if a block of the appropriate size is available,
  * and if it is, allocate it.
  *
  * ip    - inode the allocation is made for.
  * cg    - cylinder group to allocate from.
  * bpref - preferred address, forwarded to ffs_alloccgblk() or
  *         ffs_mapsearch().
  * size  - size to search for; fs_bsize requests a full block.
  * rsize - size really handed out when a full block has to be taken;
  *         must equal size when existing fragments are used.
  *
  * Returns the allocated address, or 0 on failure.
  *
  * Locking: the function unlocks the UFS lock before reading the
  * cylinder group, so it is entered with the lock held.  It returns with
  * the lock held when it fails through the early return or the fail
  * label, and with the lock released on the paths that end in bdwrite().
  */
 static ufs2_daddr_t
 ffs_alloccg(ip, cg, bpref, size, rsize)
 	struct inode *ip;
 	u_int cg;
 	ufs2_daddr_t bpref;
 	int size;
 	int rsize;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp;
 	struct ufsmount *ump;
 	ufs1_daddr_t bno;
 	ufs2_daddr_t blkno;
 	int i, allocsiz, error, frags;
 	u_int8_t *blksfree;
 
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	/* A full block is wanted but the summary says none is free. */
 	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
 		return (0);
 	UFS_UNLOCK(ump);
 	/*
 	 * Repeat the free-block test against the on-disk cylinder group
 	 * once it has been read.
 	 */
 	if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0 ||
 	   (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
 		goto fail;
 	if (size == fs->fs_bsize) {
 		/* ffs_alloccgblk() is called with the UFS lock held. */
 		UFS_LOCK(ump);
 		blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
 		ACTIVECLEAR(fs, cg);
 		UFS_UNLOCK(ump);
 		bdwrite(bp);
 		return (blkno);
 	}
 	/*
 	 * check to see if any fragments are already available
 	 * allocsiz is the size which will be allocated, hacking
 	 * it down to a smaller size if necessary
 	 */
 	blksfree = cg_blksfree(cgp);
 	frags = numfrags(fs, size);
 	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
 		if (cgp->cg_frsum[allocsiz] != 0)
 			break;
 	if (allocsiz == fs->fs_frag) {
 		/*
 		 * no fragments were available, so a block will be
 		 * allocated, and hacked up
 		 */
 		if (cgp->cg_cs.cs_nbfree == 0)
 			goto fail;
 		UFS_LOCK(ump);
 		blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
 		ACTIVECLEAR(fs, cg);
 		UFS_UNLOCK(ump);
 		bdwrite(bp);
 		return (blkno);
 	}
 	KASSERT(size == rsize,
 	    ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize));
 	/* Locate a free run of allocsiz fragments and carve frags from it. */
 	bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
 	if (bno < 0)
 		goto fail;
 	for (i = 0; i < frags; i++)
 		clrbit(blksfree, bno + i);
 	cgp->cg_cs.cs_nffree -= frags;
 	/*
 	 * The run of allocsiz fragments is gone; whatever is left of it
 	 * is accounted as a shorter run.
 	 */
 	cgp->cg_frsum[allocsiz]--;
 	if (frags != allocsiz)
 		cgp->cg_frsum[allocsiz - frags]++;
 	UFS_LOCK(ump);
 	fs->fs_cstotal.cs_nffree -= frags;
 	fs->fs_cs(fs, cg).cs_nffree -= frags;
 	fs->fs_fmod = 1;
 	blkno = cgbase(fs, cg) + bno;
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
 		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0);
 	bdwrite(bp);
 	return (blkno);
 
 fail:
 	/*
 	 * NOTE(review): also reached when ffs_getcg() failed; presumably
 	 * bp is NULL then and brelse() accepts that -- confirm.
 	 */
 	brelse(bp);
 	UFS_LOCK(ump);
 	return (0);
 }
 
 /*
  * Allocate a block in a cylinder group.
  *
  * This algorithm implements the following policy:
  *   1) allocate the requested block.
  *   2) allocate a rotationally optimal block in the same cylinder.
  *   3) allocate the next available block on the block rotor for the
  *      specified cylinder group.
  * Note that this routine only allocates fs_bsize blocks; these
  * blocks may be fragmented by the routine that allocates them.
  *
  * NOTE(review): the code below performs steps 1 and 3 only; no separate
  * rotational placement step is visible in this function.
  *
  * ip    - inode the allocation is made for.
  * bp    - buffer holding the cylinder group (its b_data is the struct cg).
  * bpref - preferred address, or 0 to start from the group's rotor.
  * size  - number of bytes the caller really wants; when it is less than
  *         a full block the trailing fragments are returned to the free
  *         map.
  *
  * Returns the address of the allocated block, or 0 when ffs_mapsearch()
  * finds nothing.  The UFS lock must be held on entry and is held again
  * on return; it is dropped temporarily around the soft updates call.
  */
 static ufs2_daddr_t
 ffs_alloccgblk(ip, bp, bpref, size)
 	struct inode *ip;
 	struct buf *bp;
 	ufs2_daddr_t bpref;
 	int size;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct ufsmount *ump;
 	ufs1_daddr_t bno;
 	ufs2_daddr_t blkno;
 	u_int8_t *blksfree;
 	int i, cgbpref;
 
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 	cgp = (struct cg *)bp->b_data;
 	blksfree = cg_blksfree(cgp);
 	if (bpref == 0) {
 		/* No preference: continue one block past the rotor. */
 		bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag;
 	} else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) {
 		/* map bpref to correct zone in this cg */
 		if (bpref < cgdata(fs, cgbpref))
 			bpref = cgmeta(fs, cgp->cg_cgx);
 		else
 			bpref = cgdata(fs, cgp->cg_cgx);
 	}
 	/*
 	 * if the requested block is available, use it
 	 */
 	bno = dtogd(fs, blknum(fs, bpref));
 	if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
 		goto gotit;
 	/*
 	 * Take the next available block in this cylinder group.
 	 */
 	bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
 	if (bno < 0)
 		return (0);
 	/* Update cg_rotor only if allocated from the data zone */
 	if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx)))
 		cgp->cg_rotor = bno;
 gotit:
 	/*
 	 * Claim the whole block: clear it in the free map, update the
 	 * cluster accounting and decrement all three free-block counters.
 	 */
 	blkno = fragstoblks(fs, bno);
 	ffs_clrblock(fs, blksfree, (long)blkno);
 	ffs_clusteracct(fs, cgp, blkno, -1);
 	cgp->cg_cs.cs_nbfree--;
 	fs->fs_cstotal.cs_nbfree--;
 	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
 	fs->fs_fmod = 1;
 	/* From here on blkno is the filesystem-wide address. */
 	blkno = cgbase(fs, cgp->cg_cgx) + bno;
 	/*
 	 * If the caller didn't want the whole block free the frags here.
 	 */
 	/* size is converted from bytes to a fragment count. */
 	size = numfrags(fs, size);
 	if (size != fs->fs_frag) {
 		bno = dtogd(fs, blkno);
 		for (i = size; i < fs->fs_frag; i++)
 			setbit(blksfree, bno + i);
 		/* i becomes the number of fragments given back. */
 		i = fs->fs_frag - size;
 		cgp->cg_cs.cs_nffree += i;
 		fs->fs_cstotal.cs_nffree += i;
 		fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i;
 		fs->fs_fmod = 1;
 		cgp->cg_frsum[i]++;
 	}
 	/* XXX Fixme. */
 	/* The soft updates hook is called without the UFS lock. */
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
 		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0);
 	UFS_LOCK(ump);
 	return (blkno);
 }
 
 /*
  * Determine whether a cluster can be allocated.
  *
  * We do not currently check for optimal rotational layout if there
  * are multiple choices in the same cylinder group. Instead we just
  * take the first one that we find following bpref.
  *
  * ip    - inode the cluster is allocated for.
  * cg    - cylinder group to search.
  * bpref - preferred address; the search never looks before it.
  * len   - length of the wanted cluster, in blocks.
  *
  * Returns the address of the first block of the cluster, or 0 when no
  * suitable cluster is found.
  *
  * Locking: the function unlocks the UFS lock before reading the
  * cylinder group, so it is entered with the lock held.  It returns with
  * the lock held on failure and released on success.
  */
 static ufs2_daddr_t
 ffs_clusteralloc(ip, cg, bpref, len)
 	struct inode *ip;
 	u_int cg;
 	ufs2_daddr_t bpref;
 	int len;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp;
 	struct ufsmount *ump;
 	int i, run, bit, map, got, error;
 	ufs2_daddr_t bno;
 	u_char *mapp;
 	int32_t *lp;
 	u_int8_t *blksfree;
 
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	/* The cached maximum cluster size already rules this group out. */
 	if (fs->fs_maxcluster[cg] < len)
 		return (0);
 	UFS_UNLOCK(ump);
 	if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) {
 		UFS_LOCK(ump);
 		return (0);
 	}
 	/*
 	 * Check to see if a cluster of the needed size (or bigger) is
 	 * available in this cylinder group.
 	 */
 	lp = &cg_clustersum(cgp)[len];
 	for (i = len; i <= fs->fs_contigsumsize; i++)
 		if (*lp++ > 0)
 			break;
 	if (i > fs->fs_contigsumsize) {
 		/*
 		 * This is the first time looking for a cluster in this
 		 * cylinder group. Update the cluster summary information
 		 * to reflect the true maximum sized cluster so that
 		 * future cluster allocation requests can avoid reading
 		 * the cylinder group map only to find no clusters.
 		 */
 		lp = &cg_clustersum(cgp)[len - 1];
 		for (i = len - 1; i > 0; i--)
 			if (*lp-- > 0)
 				break;
 		UFS_LOCK(ump);
 		fs->fs_maxcluster[cg] = i;
 		brelse(bp);
 		return (0);
 	}
 	/*
 	 * Search the cluster map to find a big enough cluster.
 	 * We take the first one that we find, even if it is larger
 	 * than we need as we prefer to get one close to the previous
 	 * block allocation. We do not search before the current
 	 * preference point as we do not want to allocate a block
 	 * that is allocated before the previous one (as we will
 	 * then have to wait for another pass of the elevator
 	 * algorithm before it will be read). We prefer to fail and
 	 * be recalled to try an allocation in the next cylinder group.
 	 */
 	if (dtog(fs, bpref) != cg)
 		bpref = cgdata(fs, cg);
 	else
 		bpref = blknum(fs, bpref);
 	/* bpref becomes a block index relative to the cylinder group. */
 	bpref = fragstoblks(fs, dtogd(fs, bpref));
 	mapp = &cg_clustersfree(cgp)[bpref / NBBY];
 	map = *mapp++;
 	bit = 1 << (bpref % NBBY);
 	/*
 	 * Walk the cluster bitmap one bit per block, counting in run the
 	 * length of the current stretch of set bits; stop once the run
 	 * reaches len.  got is the index of the block being examined.
 	 */
 	for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) {
 		if ((map & bit) == 0) {
 			run = 0;
 		} else {
 			run++;
 			if (run == len)
 				break;
 		}
 		/* Advance to the next bit, loading the next map byte. */
 		if ((got & (NBBY - 1)) != (NBBY - 1)) {
 			bit <<= 1;
 		} else {
 			map = *mapp++;
 			bit = 1;
 		}
 	}
 	if (got >= cgp->cg_nclusterblks) {
 		UFS_LOCK(ump);
 		brelse(bp);
 		return (0);
 	}
 	/*
 	 * Allocate the cluster that we have found.
 	 */
 	/*
 	 * The run ends at got, so its blocks are got - run + 1 .. got.
 	 * Cross-check them against the block free map first.
 	 */
 	blksfree = cg_blksfree(cgp);
 	for (i = 1; i <= len; i++)
 		if (!ffs_isblock(fs, blksfree, got - run + i))
 			panic("ffs_clusteralloc: map mismatch");
 	bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1);
 	if (dtog(fs, bno) != cg)
 		panic("ffs_clusteralloc: allocated out of group");
 	/* len is converted from blocks to fragments. */
 	len = blkstofrags(fs, len);
 	UFS_LOCK(ump);
 	/* Allocate each block; every one must land exactly where asked. */
 	for (i = 0; i < len; i += fs->fs_frag)
 		if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i)
 			panic("ffs_clusteralloc: lost block");
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	bdwrite(bp);
 	return (bno);
 }
 
 /*
  * Get the buffer for the filesystem block that contains the inode at
  * offset cginoblk within cylinder group cg.  The buffer is fs_bsize
  * bytes long and gbflags is handed to getblk() unchanged.
  */
 static inline struct buf *
 getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags)
 {
 	struct fs *fs;
 	daddr_t inodbn;
 
 	fs = ITOFS(ip);
 	/* Device block address of the inode block being requested. */
 	inodbn = fsbtodb(fs, ino_to_fsba(fs, cg * fs->fs_ipg + cginoblk));
 	return (getblk(ITODEVVP(ip), inodbn, (int)fs->fs_bsize, 0, 0,
 	    gbflags));
 }
 
 /*
  * Synchronous inode initialization is needed only when barrier writes do not
  * work as advertised, and will impose a heavy cost on file creation in a newly
  * created filesystem.
  *
  * The flag is exported as the read/write tunable
  * vfs.ffs.doasyncinodeinit.  When it is non-zero (the default) newly
  * initialized inode blocks are written with babarrierwrite(); when it
  * is zero they are written with bwrite().
  */
 static int doasyncinodeinit = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN,
     &doasyncinodeinit, 0,
     "Perform inode block initialization using asynchronous writes");
 
 /*
  * Determine whether an inode can be allocated.
  *
  * Check to see if an inode is available, and if it is,
  * allocate it using the following policy:
  *   1) allocate the requested inode.
  *   2) allocate the next available inode after the requested
  *      inode in the specified cylinder group.
  *
  * ip     - inode used to reach the mount point and its vnode.
  * cg     - cylinder group to allocate from.
  * ipref  - preferred inode number, or 0 for none; it is reduced modulo
  *          fs_ipg before use.
  * mode   - mode of the new inode; used to maintain the directory
  *          counters and passed to soft updates.
  * unused - ignored; presumably present so that the signature matches
  *          the allocator callback used by ffs_hashalloc().
  *
  * Returns the allocated inode number, or 0 on failure.
  *
  * Locking: entered with the UFS lock held.  Returns with the lock
  * released on success and with the lock held on failure.
  */
 static ufs2_daddr_t
 ffs_nodealloccg(ip, cg, ipref, mode, unused)
 	struct inode *ip;
 	u_int cg;
 	ufs2_daddr_t ipref;
 	int mode;
 	int unused;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp, *ibp;
 	struct ufsmount *ump;
 	u_int8_t *inosused, *loc;
 	struct ufs2_dinode *dp2;
 	int error, start, len, i;
 	u_int32_t old_initediblk;
 
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 check_nifree:
 	/* Reached with the UFS lock held. */
 	if (fs->fs_cs(fs, cg).cs_nifree == 0)
 		return (0);
 	UFS_UNLOCK(ump);
 	if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) {
 		UFS_LOCK(ump);
 		return (0);
 	}
 restart:
 	/* Reached with the cylinder group buffer held and the lock dropped. */
 	if (cgp->cg_cs.cs_nifree == 0) {
 		brelse(bp);
 		UFS_LOCK(ump);
 		return (0);
 	}
 	inosused = cg_inosused(cgp);
 	if (ipref) {
 		ipref %= fs->fs_ipg;
 		if (isclr(inosused, ipref))
 			goto gotit;
 	}
 	/*
 	 * Search the in-use map for a byte that is not 0xff, first from
 	 * the rotor to the end of the group and then from the beginning
 	 * up to and including the rotor's byte.
 	 */
 	start = cgp->cg_irotor / NBBY;
 	len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
 	loc = memcchr(&inosused[start], 0xff, len);
 	if (loc == NULL) {
 		len = start + 1;
 		start = 0;
 		loc = memcchr(&inosused[start], 0xff, len);
 		if (loc == NULL) {
 			printf("cg = %d, irotor = %ld, fs = %s\n",
 			    cg, (long)cgp->cg_irotor, fs->fs_fsmnt);
 			panic("ffs_nodealloccg: map corrupted");
 			/* NOTREACHED */
 		}
 	}
 	/* The lowest clear bit of that byte is the inode to use. */
 	ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1;
 gotit:
 	/*
 	 * Check to see if we need to initialize more inodes.
 	 */
 	if (fs->fs_magic == FS_UFS2_MAGIC &&
 	    ipref + INOPB(fs) > cgp->cg_initediblk &&
 	    cgp->cg_initediblk < cgp->cg_niblk) {
 		old_initediblk = cgp->cg_initediblk;
 
 		/*
 		 * Free the cylinder group lock before writing the
 		 * initialized inode block.  Entering the
 		 * babarrierwrite() with the cylinder group lock
 		 * causes lock order violation between the lock and
 		 * snaplk.
 		 *
 		 * Another thread can decide to initialize the same
 		 * inode block, but whichever thread first gets the
 		 * cylinder group lock after writing the newly
 		 * allocated inode block will update it and the other
 		 * will realize that it has lost and leave the
 		 * cylinder group unchanged.
 		 */
 		ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT);
 		brelse(bp);
 		if (ibp == NULL) {
 			/*
 			 * The inode block buffer is already owned by
 			 * another thread, which must initialize it.
 			 * Wait on the buffer to allow another thread
 			 * to finish the updates, with dropped cg
 			 * buffer lock, then retry.
 			 */
 			ibp = getinobuf(ip, cg, old_initediblk, 0);
 			brelse(ibp);
 			UFS_LOCK(ump);
 			goto check_nifree;
 		}
 		/*
 		 * Zero the block and give every inode in it a non-zero
 		 * random generation number.
 		 */
 		bzero(ibp->b_data, (int)fs->fs_bsize);
 		dp2 = (struct ufs2_dinode *)(ibp->b_data);
 		for (i = 0; i < INOPB(fs); i++) {
 			while (dp2->di_gen == 0)
 				dp2->di_gen = arc4random();
 			dp2++;
 		}
 
 		/*
 		 * Rather than adding a soft updates dependency to ensure
 		 * that the new inode block is written before it is claimed
 		 * by the cylinder group map, we just do a barrier write
 		 * here. The barrier write will ensure that the inode block
 		 * gets written before the updated cylinder group map can be
 		 * written. The barrier write should only slow down bulk
 		 * loading of newly created filesystems.
 		 */
 		if (doasyncinodeinit)
 			babarrierwrite(ibp);
 		else
 			bwrite(ibp);
 
 		/*
 		 * After the inode block is written, try to update the
 		 * cg initediblk pointer.  If another thread beat us
 		 * to it, then leave it unchanged as the other thread
 		 * has already set it correctly.
 		 */
 		error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp);
 		UFS_LOCK(ump);
 		ACTIVECLEAR(fs, cg);
 		if (error != 0) {
 			/*
 			 * The cylinder group could not be re-read.  Fail
 			 * the same way as every other failure path of
 			 * this function: return 0 with the UFS lock held.
 			 * Returning the errno value would be taken for an
 			 * allocated inode number by the caller.
 			 */
 			return (0);
 		}
 		UFS_UNLOCK(ump);
 		if (cgp->cg_initediblk == old_initediblk)
 			cgp->cg_initediblk += INOPB(fs);
 		goto restart;
 	}
 	/*
 	 * Claim the inode: advance the rotor, mark it in use and update
 	 * the free inode (and, for directories, the directory) counters.
 	 */
 	cgp->cg_irotor = ipref;
 	UFS_LOCK(ump);
 	ACTIVECLEAR(fs, cg);
 	setbit(inosused, ipref);
 	cgp->cg_cs.cs_nifree--;
 	fs->fs_cstotal.cs_nifree--;
 	fs->fs_cs(fs, cg).cs_nifree--;
 	fs->fs_fmod = 1;
 	if ((mode & IFMT) == IFDIR) {
 		cgp->cg_cs.cs_ndir++;
 		fs->fs_cstotal.cs_ndir++;
 		fs->fs_cs(fs, cg).cs_ndir++;
 	}
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
 		softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode);
 	bdwrite(bp);
 	return ((ino_t)(cg * fs->fs_ipg + ipref));
 }
 
 /*
  * Free a block or fragment.
  *
  * The specified block or fragment is placed back in the
  * free map. If a fragment is deallocated, a possible
  * block reassembly is checked.
  */
 static void
 ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd)
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct vnode *devvp;
 	ufs2_daddr_t bno;
 	long size;
 	ino_t inum;
 	struct workhead *dephd;
 {
 	struct mount *mp;
 	struct cg *cgp;
 	struct buf *bp;
 	daddr_t dbn;
 	ufs1_daddr_t fragno, cgbno;
 	int i, blk, frags, bbase, error;
 	u_int cg;
 	u_int8_t *blksfree;
 	struct cdev *dev;
 
 	cg = dtog(fs, bno);
 	if (devvp->v_type == VREG) {
 		/* devvp is a snapshot */
 		MPASS(devvp->v_mount->mnt_data == ump);
 		dev = ump->um_devvp->v_rdev;
 	} else if (devvp->v_type == VCHR) {
 		/* devvp is a normal disk device */
 		dev = devvp->v_rdev;
 		ASSERT_VOP_LOCKED(devvp, "ffs_blkfree_cg");
 	} else
 		return;
 #ifdef INVARIANTS
 	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
 	    fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
 		printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n",
 		    devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize,
 		    size, fs->fs_fsmnt);
 		panic("ffs_blkfree_cg: bad size");
 	}
 #endif
 	if ((u_int)bno >= fs->fs_size) {
 		printf("bad block %jd, ino %lu\n", (intmax_t)bno,
 		    (u_long)inum);
 		ffs_fserr(fs, inum, "bad block");
 		return;
 	}
 	if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) {
 		if (!ffs_fsfail_cleanup(ump, error) ||
 		    !MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR)
 			return;
 		if (devvp->v_type == VREG)
 			dbn = fragstoblks(fs, cgtod(fs, cg));
 		else
 			dbn = fsbtodb(fs, cgtod(fs, cg));
 		error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp);
 		KASSERT(error == 0, ("getblkx failed"));
 		softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
 		    numfrags(fs, size), dephd);
 		bp->b_flags |= B_RELBUF | B_NOCACHE;
 		bp->b_flags &= ~B_CACHE;
 		bawrite(bp);
 		return;
 	}
 	cgbno = dtogd(fs, bno);
 	blksfree = cg_blksfree(cgp);
 	UFS_LOCK(ump);
 	if (size == fs->fs_bsize) {
 		fragno = fragstoblks(fs, cgbno);
 		if (!ffs_isfreeblock(fs, blksfree, fragno)) {
 			if (devvp->v_type == VREG) {
 				UFS_UNLOCK(ump);
 				/* devvp is a snapshot */
 				brelse(bp);
 				return;
 			}
 			printf("dev = %s, block = %jd, fs = %s\n",
 			    devtoname(dev), (intmax_t)bno, fs->fs_fsmnt);
 			panic("ffs_blkfree_cg: freeing free block");
 		}
 		ffs_setblock(fs, blksfree, fragno);
 		ffs_clusteracct(fs, cgp, fragno, 1);
 		cgp->cg_cs.cs_nbfree++;
 		fs->fs_cstotal.cs_nbfree++;
 		fs->fs_cs(fs, cg).cs_nbfree++;
 	} else {
 		bbase = cgbno - fragnum(fs, cgbno);
 		/*
 		 * decrement the counts associated with the old frags
 		 */
 		blk = blkmap(fs, blksfree, bbase);
 		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
 		/*
 		 * deallocate the fragment
 		 */
 		frags = numfrags(fs, size);
 		for (i = 0; i < frags; i++) {
 			if (isset(blksfree, cgbno + i)) {
 				printf("dev = %s, block = %jd, fs = %s\n",
 				    devtoname(dev), (intmax_t)(bno + i),
 				    fs->fs_fsmnt);
 				panic("ffs_blkfree_cg: freeing free frag");
 			}
 			setbit(blksfree, cgbno + i);
 		}
 		cgp->cg_cs.cs_nffree += i;
 		fs->fs_cstotal.cs_nffree += i;
 		fs->fs_cs(fs, cg).cs_nffree += i;
 		/*
 		 * add back in counts associated with the new frags
 		 */
 		blk = blkmap(fs, blksfree, bbase);
 		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
 		/*
 		 * if a complete block has been reassembled, account for it
 		 */
 		fragno = fragstoblks(fs, bbase);
 		if (ffs_isblock(fs, blksfree, fragno)) {
 			cgp->cg_cs.cs_nffree -= fs->fs_frag;
 			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
 			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
 			ffs_clusteracct(fs, cgp, fragno, 1);
 			cgp->cg_cs.cs_nbfree++;
 			fs->fs_cstotal.cs_nbfree++;
 			fs->fs_cs(fs, cg).cs_nbfree++;
 		}
 	}
 	fs->fs_fmod = 1;
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	mp = UFSTOVFS(ump);
 	if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR)
 		softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
 		    numfrags(fs, size), dephd);
 	bdwrite(bp);
 }
 
 /*
  * Structures and routines associated with trim management.
  *
  * The following requests are passed to trim_lookup to indicate
  * the actions that should be taken.
  */
 #define	NEW	1	/* if found, error else allocate and hash it */
 #define	OLD	2	/* if not found, error, else return it */
 #define	REPLACE	3	/* if not found, error else unhash and reallocate it */
 #define	DONE	4	/* if not found, error else unhash and return it */
 #define	SINGLE	5	/* don't look up, just allocate it and don't hash it */
 
 /* Malloc type used for every trim-related allocation below. */
 MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures");
 
 /*
  * Return a pointer to the um_trimhash bucket for key.  The key is
  * combined with um_trimlisthashsize using a bitwise AND, so that field
  * is presumably a mask (table size minus one) rather than a count --
  * confirm where the table is created.
  */
 #define	TRIMLIST_HASH(ump, key) \
 	(&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize])
 
 /*
  * These structures describe each of the block free requests aggregated
  * together to make up a trim request.
  */
 struct trim_blkreq {
 	/* Linkage on the owning trim request's blklist. */
 	TAILQ_ENTRY(trim_blkreq) blkreqlist;
 	/* Address and size later passed to ffs_blkfree_cg(). */
 	ufs2_daddr_t bno;
 	long size;
 	/* Either NULL or a pointer to dephd below. */
 	struct workhead *pdephd;
 	/* Dependency work list taken over from the caller of ffs_blkfree(). */
 	struct workhead dephd;
 };
 
 /*
  * Description of a trim request.
  */
 struct ffs_blkfree_trim_params {
 	/* Block free requests to carry out once the trim has completed. */
 	TAILQ_HEAD(, trim_blkreq) blklist;
 	/* Linkage in a um_trimhash bucket (unused for SINGLE requests). */
 	LIST_ENTRY(ffs_blkfree_trim_params) hashlist;
 	/* Task queued on um_trim_tq when the BIO_DELETE finishes. */
 	struct task task;
 	/* Mount and device vnode the request applies to. */
 	struct ufsmount *ump;
 	struct vnode *devvp;
 	/* Inode number passed on to ffs_blkfree_cg(). */
 	ino_t inum;
 	/* Start address and byte count of the range sent as BIO_DELETE. */
 	ufs2_daddr_t bno;
 	long size;
 	/* Key under which the request is hashed and looked up. */
 	long key;
 };
 
 /* Forward declarations for the trim routines defined below. */
 static void	ffs_blkfree_trim_completed(struct buf *);
 static void	ffs_blkfree_trim_task(void *ctx, int pending __unused);
 static struct	ffs_blkfree_trim_params *trim_lookup(struct ufsmount *,
 		    struct vnode *, ufs2_daddr_t, long, ino_t, u_long, int);
 static void	ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *);
 
 /*
  * I/O completion handler for a trim request.
  *
  * The trim request is recovered from the buffer's private pointer, the
  * buffer itself is released, and a task is queued on the mount's trim
  * task queue to free the associated block(s).
  */
 static void
 ffs_blkfree_trim_completed(bp)
 	struct buf *bp;
 {
 	struct ffs_blkfree_trim_params *req;
 
 	req = bp->b_fsprivate1;
 	/* The buffer is no longer needed once the request is in hand. */
 	free(bp, M_TRIM);
 
 	TASK_INIT(&req->task, 0, ffs_blkfree_trim_task, req);
 	taskqueue_enqueue(req->ump->um_trim_tq, &req->task);
 }
 
 /*
  * Task run after a trim has completed.
  *
  * Every block free request queued on the trim request is handed to
  * ffs_blkfree_cg() and then discarded.  Afterwards the secondary write
  * is finished, the in-flight trim statistics are reduced and the trim
  * request itself is freed.
  */
 static void
 ffs_blkfree_trim_task(ctx, pending)
 	void *ctx;
 	int pending;
 {
 	struct ffs_blkfree_trim_params *req;
 	struct trim_blkreq *piece;
 	struct ufsmount *ump;
 
 	req = ctx;
 	ump = req->ump;
 
 	/* Drain the list from the head; each piece is freed once used. */
 	for (;;) {
 		piece = TAILQ_FIRST(&req->blklist);
 		if (piece == NULL)
 			break;
 		ffs_blkfree_cg(ump, ump->um_fs, req->devvp, piece->bno,
 		    piece->size, req->inum, piece->pdephd);
 		TAILQ_REMOVE(&req->blklist, piece, blkreqlist);
 		free(piece, M_TRIM);
 	}
 	vn_finished_secondary_write(UFSTOVFS(ump));
 
 	/* The in-flight counters are maintained under the UFS lock. */
 	UFS_LOCK(ump);
 	ump->um_trim_inflight--;
 	ump->um_trim_inflight_blks -= numfrags(ump->um_fs, req->size);
 	UFS_UNLOCK(ump);
 
 	free(req, M_TRIM);
 }
 
 /*
  * Lookup a trim request by key.
  * Allocate if requested (NEW, REPLACE, SINGLE).
  *
  * ump, devvp, bno, size, inum, key - values stored in a newly
  *     allocated request; key also selects the hash bucket and
  *     identifies the request searched for.
  * alloctype - one of NEW, OLD, REPLACE, DONE or SINGLE.
  *
  * Returns:
  *   NEW     - a new request, inserted into the hash.
  *   OLD     - the existing request, left in the hash.
  *   REPLACE - a new request, inserted into the hash after the existing
  *             one has been removed from it.
  *   DONE    - the existing request, removed from the hash.
  *   SINGLE  - a new request that is not hashed.
  *
  * A candidate request is always allocated up front; it is freed again
  * for OLD and DONE.  For every type except SINGLE the UFS lock is taken
  * for the lookup and released before returning.
  */
 static struct ffs_blkfree_trim_params *
 trim_lookup(ump, devvp, bno, size, inum, key, alloctype)
 	struct ufsmount *ump;
 	struct vnode *devvp;
 	ufs2_daddr_t bno;
 	long size;
 	ino_t inum;
 	u_long key;
 	int alloctype;
 {
 	struct trimlist_hashhead *tphashhead;
 	struct ffs_blkfree_trim_params *tp, *ntp;
 
 	ntp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK);
 	if (alloctype != SINGLE) {
 		KASSERT(key >= FIRST_VALID_KEY, ("trim_lookup: invalid key"));
 		UFS_LOCK(ump);
 		tphashhead = TRIMLIST_HASH(ump, key);
 		/* tp is left NULL when no entry in the bucket matches. */
 		LIST_FOREACH(tp, tphashhead, hashlist)
 			if (key == tp->key)
 				break;
 	}
 	/*
 	 * SINGLE matches no case below, so tp and tphashhead, which are
 	 * not set for it, are not used on that path.
 	 */
 	switch (alloctype) {
 	case NEW:
 		KASSERT(tp == NULL, ("trim_lookup: found trim"));
 		break;
 	case OLD:
 		KASSERT(tp != NULL,
 		    ("trim_lookup: missing call to ffs_blkrelease_start()"));
 		UFS_UNLOCK(ump);
 		free(ntp, M_TRIM);
 		return (tp);
 	case REPLACE:
 		KASSERT(tp != NULL, ("trim_lookup: missing REPLACE trim"));
 		LIST_REMOVE(tp, hashlist);
 		/* tp will be freed by caller */
 		break;
 	case DONE:
 		KASSERT(tp != NULL, ("trim_lookup: missing DONE trim"));
 		LIST_REMOVE(tp, hashlist);
 		UFS_UNLOCK(ump);
 		free(ntp, M_TRIM);
 		return (tp);
 	}
 	/* NEW, REPLACE and SINGLE: fill in and return the new request. */
 	TAILQ_INIT(&ntp->blklist);
 	ntp->ump = ump;
 	ntp->devvp = devvp;
 	ntp->bno = bno;
 	ntp->size = size;
 	ntp->inum = inum;
 	ntp->key = key;
 	if (alloctype != SINGLE) {
 		LIST_INSERT_HEAD(tphashhead, ntp, hashlist);
 		UFS_UNLOCK(ump);
 	}
 	return (ntp);
 }
 
 /*
  * Issue the BIO_DELETE for a trim request.
  *
  * The free bit in the cylinder group bitmap is deliberately not set
  * here; that happens only after the BIO_DELETE has completed.  Were the
  * block marked free earlier, disk queue reordering could let the TRIM
  * reach the device after the block had been reused and rewritten.
  */
 static void
 ffs_blkfree_sendtrim(tp)
 	struct ffs_blkfree_trim_params *tp;
 {
 	struct ufsmount *ump;
 	struct mount *mp;
 	struct buf *trimbp;
 
 	ump = tp->ump;
 
 	/* Build a zeroed buffer describing the range to delete. */
 	trimbp = malloc(sizeof(*trimbp), M_TRIM, M_WAITOK | M_ZERO);
 	trimbp->b_fsprivate1 = tp;
 	trimbp->b_iodone = ffs_blkfree_trim_completed;
 	trimbp->b_iocmd = BIO_DELETE;
 	trimbp->b_iooffset = dbtob(fsbtodb(ump->um_fs, tp->bno));
 	trimbp->b_bcount = tp->size;
 
 	/* Account for the request in the mount's trim statistics. */
 	UFS_LOCK(ump);
 	ump->um_trim_total++;
 	ump->um_trim_inflight++;
 	ump->um_trim_inflight_blks += numfrags(ump->um_fs, tp->size);
 	ump->um_trim_total_blks += numfrags(ump->um_fs, tp->size);
 	UFS_UNLOCK(ump);
 
 	mp = UFSTOVFS(ump);
 	vn_start_secondary_write(NULL, &mp, 0);
 	g_vfs_strategy(ump->um_bo, trimbp);
 }
 
 /*
  * Hand out a key that identifies a range of blocks about to be freed.
  *
  * When the mount cannot issue deletes or trim consolidation is turned
  * off, SINGLETON_KEY is returned.  Otherwise a fresh key of at least
  * FIRST_VALID_KEY is drawn from a global counter and an empty trim
  * request is registered under it.
  */
 u_long
 ffs_blkrelease_start(ump, devvp, inum)
 	struct ufsmount *ump;
 	struct vnode *devvp;
 	ino_t inum;
 {
 	static u_long masterkey;
 	u_long newkey;
 
 	if ((ump->um_flags & UM_CANDELETE) != 0 && dotrimcons != 0) {
 		/* Skip counter values below the first valid key. */
 		for (;;) {
 			newkey = atomic_fetchadd_long(&masterkey, 1);
 			if (newkey >= FIRST_VALID_KEY)
 				break;
 		}
 		(void) trim_lookup(ump, devvp, 0, 0, inum, newkey, NEW);
 		return (newkey);
 	}
 	return (SINGLETON_KEY);
 }
 
 /*
  * Retire a key that was used to identify a range of blocks.
  *
  * Nothing is done when the mount cannot issue deletes or when trim
  * consolidation is turned off.
  */
 void
 ffs_blkrelease_finish(ump, key)
 	struct ufsmount *ump;
 	u_long key;
 {
 	struct ffs_blkfree_trim_params *req;
 
 	if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0)
 		return;
 	/*
 	 * The vfs.ffs.dotrimcons sysctl can be switched on in the middle
 	 * of a file deletion, that is, after ffs_blkrelease_start() has
 	 * run but before this function is called.  In that case
 	 * ffs_blkrelease_start() returned SINGLETON_KEY and never began
 	 * a collection sequence, so there is no sequence to finish and
 	 * we simply return.
 	 */
 	if (key == SINGLETON_KEY) {
 #ifdef INVARIANTS
 		printf("%s: vfs.ffs.dotrimcons enabled on active filesystem\n",
 		    ump->um_mountp->mnt_stat.f_mntonname);
 #endif
 		return;
 	}
 	/*
 	 * No more blocks will be sent under this key.  The DONE lookup
 	 * removes the request from the hash and returns it.  A non-zero
 	 * size means the request describes a block range, which
 	 * ffs_blkfree_sendtrim() issues (the request is freed later, when
 	 * the trim task has run).  A size of zero means the key was never
 	 * used, so the request is freed right away.
 	 */
 	req = trim_lookup(ump, NULL, 0, 0, 0, key, DONE);
 	if (req->size != 0) {
 		ffs_blkfree_sendtrim(req);
 		return;
 	}
 	free(req, M_TRIM);
 }
 
 /*
  * Setup to free a block or fragment.
  *
  * Check for snapshots that might want to claim the block.
  * If trims are requested, prepare a trim request. Attempt to
  * aggregate consecutive blocks into a single trim request.
  *
  * The key selects the trim handling: NOTRIM_KEY frees the block
  * immediately through ffs_blkfree_cg(), SINGLETON_KEY issues a trim
  * for this piece alone, and any other key adds the piece to the
  * range being collected under that key.
  */
 void
 ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd, key)
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct vnode *devvp;
 	ufs2_daddr_t bno;
 	long size;
 	ino_t inum;
 	enum vtype vtype;
 	struct workhead *dephd;
 	u_long key;
 {
 	struct ffs_blkfree_trim_params *tp, *ntp;
 	struct trim_blkreq *blkelm;
 
 	/*
 	 * Check to see if a snapshot wants to claim the block.
 	 * Check that devvp is a normal disk device, not a snapshot,
 	 * it has a snapshot(s) associated with it, and one of the
 	 * snapshots wants to claim the block.
 	 */
 	if (devvp->v_type == VCHR &&
 	    (devvp->v_vflag & VV_COPYONWRITE) &&
 	    ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) {
 		return;
 	}
 	/*
 	 * Nothing to delay if TRIM is not required for this block or TRIM
 	 * is disabled or the operation is performed on a snapshot.
 	 */
 	if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) ||
 	    devvp->v_type == VREG) {
 		ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd);
 		return;
 	}
 	/* Record this piece so it can be queued on a trim request. */
 	blkelm = malloc(sizeof(struct trim_blkreq), M_TRIM, M_WAITOK);
 	blkelm->bno = bno;
 	blkelm->size = size;
 	if (dephd == NULL) {
 		blkelm->pdephd = NULL;
 	} else {
 		/*
 		 * Take over the caller's work list: after the swap it
 		 * lives in blkelm and dephd holds the freshly
 		 * initialized (empty) list.
 		 */
 		LIST_INIT(&blkelm->dephd);
 		LIST_SWAP(dephd, &blkelm->dephd, worklist, wk_list);
 		blkelm->pdephd = &blkelm->dephd;
 	}
 	if (key == SINGLETON_KEY) {
 		/*
 		 * Just a single non-contiguous piece. Use the SINGLE
 		 * alloctype to return a trim request that will not be
 		 * hashed for future lookup.
 		 */
 		tp = trim_lookup(ump, devvp, bno, size, inum, key, SINGLE);
 		TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist);
 		ffs_blkfree_sendtrim(tp);
 		return;
 	}
 	/*
 	 * The callers of this function are not tracking whether or not
 	 * the blocks are contiguous. They are just saying that they
 	 * are freeing a set of blocks. It is this code that determines
 	 * the pieces of that range that are actually contiguous.
 	 *
 	 * Calling ffs_blkrelease_start() will have created an entry
 	 * that we will use.
 	 */
 	tp = trim_lookup(ump, devvp, bno, size, inum, key, OLD);
 	if (tp->size == 0) {
 		/*
 		 * First block of a potential range, set block and size
 		 * for the trim block.
 		 */
 		tp->bno = bno;
 		tp->size = size;
 		TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist);
 		return;
 	}
 	/*
 	 * If this block is a continuation of the range (either
 	 * follows at the end or precedes in the front) then we
 	 * add it to the front or back of the list and return.
 	 *
 	 * If it is not a continuation of the trim that we were
 	 * building, using the REPLACE alloctype, we request that
 	 * the old trim request (still in tp) be unhashed and a
 	 * new range started (in ntp). The ffs_blkfree_sendtrim(tp)
 	 * call causes the block range described by tp to be issued
 	 * (and then tp to be freed).
 	 */
 	if (bno + numfrags(fs, size) == tp->bno) {
 		/* New piece ends exactly where the range begins. */
 		TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist);
 		tp->bno = bno;
 		tp->size += size;
 		return;
 	} else if (bno == tp->bno + numfrags(fs, tp->size)) {
 		/* New piece begins exactly where the range ends. */
 		TAILQ_INSERT_TAIL(&tp->blklist, blkelm, blkreqlist);
 		tp->size += size;
 		return;
 	}
 	ntp = trim_lookup(ump, devvp, bno, size, inum, key, REPLACE);
 	TAILQ_INSERT_HEAD(&ntp->blklist, blkelm, blkreqlist);
 	ffs_blkfree_sendtrim(tp);
 }
 
 #ifdef INVARIANTS
 /*
  * Verify allocation of a block or fragment. Returns true if block or
  * fragment is allocated, false if it is free.
  *
  * Panics if size is not a whole number of fragments no larger than a
  * block, if bno lies outside the filesystem, if the cylinder group
  * cannot be read, or if the fragment run is only partially free.
  */
 static int
 ffs_checkblk(ip, bno, size)
 	struct inode *ip;
 	ufs2_daddr_t bno;
 	long size;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp;
 	ufs1_daddr_t cgbno;
 	int i, error, frags, nfree;
 	u_int8_t *blksfree;
 
 	fs = ITOFS(ip);
 	/*
 	 * Range-check size and bno at their own width. Casting them to
 	 * u_int first discards any high-order bits that do not fit in
 	 * an unsigned int, which would let an out-of-range value pass.
 	 */
 	if (size < 0 || size > fs->fs_bsize || fragoff(fs, size) != 0) {
 		printf("bsize = %ld, size = %ld, fs = %s\n",
 		    (long)fs->fs_bsize, size, fs->fs_fsmnt);
 		panic("ffs_checkblk: bad size");
 	}
 	if (bno < 0 || bno >= fs->fs_size)
 		panic("ffs_checkblk: bad block %jd", (intmax_t)bno);
 	error = ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), 0, &bp, &cgp);
 	if (error)
 		panic("ffs_checkblk: cylinder group read failed");
 	blksfree = cg_blksfree(cgp);
 	cgbno = dtogd(fs, bno);
 	if (size == fs->fs_bsize) {
 		/* Whole block: a single lookup in the free map. */
 		nfree = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno));
 	} else {
 		/* Fragment run: every fragment must agree. */
 		frags = numfrags(fs, size);
 		for (nfree = 0, i = 0; i < frags; i++)
 			if (isset(blksfree, cgbno + i))
 				nfree++;
 		if (nfree != 0 && nfree != frags)
 			panic("ffs_checkblk: partially free fragment");
 	}
 	brelse(bp);
 	return (!nfree);
 }
 #endif /* INVARIANTS */
 
 /*
  * Release an inode. When soft updates are active on pvp the request
  * is passed to softdep_freefile(); otherwise the inode is returned
  * to the free map right away through ffs_freefile().
  */
 int
 ffs_vfree(pvp, ino, mode)
 	struct vnode *pvp;
 	ino_t ino;
 	int mode;
 {
 	struct ufsmount *mnt;
 
 	if (!DOINGSOFTDEP(pvp)) {
 		mnt = VFSTOUFS(pvp->v_mount);
 		return (ffs_freefile(mnt, mnt->um_fs, mnt->um_devvp, ino,
 		    mode, NULL));
 	}
 	softdep_freefile(pvp, ino, mode);
 	return (0);
 }
 
 /*
  * Do the actual free operation.
  * The specified inode is placed back in the free map.
  *
  * devvp is either the disk device (VCHR) or a snapshot file (VREG);
  * for any other vnode type nothing is done and 0 is returned.
  * mode is only examined to tell whether the inode was a directory.
  * wkhd is passed through to softdep_setup_inofree().
  *
  * Returns 0 on success, otherwise the error from fetching the
  * cylinder group buffer.
  */
 int
 ffs_freefile(ump, fs, devvp, ino, mode, wkhd)
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct vnode *devvp;
 	ino_t ino;
 	int mode;
 	struct workhead *wkhd;
 {
 	struct cg *cgp;
 	struct buf *bp;
 	daddr_t dbn;
 	int error;
 	u_int cg;
 	u_int8_t *inosused;
 	struct cdev *dev;
 	ino_t cgino;
 
 	cg = ino_to_cg(fs, ino);
 	if (devvp->v_type == VREG) {
 		/* devvp is a snapshot */
 		MPASS(devvp->v_mount->mnt_data == ump);
 		dev = ump->um_devvp->v_rdev;
 	} else if (devvp->v_type == VCHR) {
 		/* devvp is a normal disk device */
 		dev = devvp->v_rdev;
 	} else {
 		/*
 		 * Neither a device nor a snapshot: nothing to update.
 		 * NOTE(review): the store to bp is never read.
 		 */
 		bp = NULL;
 		return (0);
 	}
 	if (ino >= fs->fs_ipg * fs->fs_ncg)
 		panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s",
 		    devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt);
 	if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) {
 		/*
 		 * Continue past this point only when
 		 * ffs_fsfail_cleanup() accepts the error, soft updates
 		 * are mounted and devvp is the disk device.
 		 */
 		if (!ffs_fsfail_cleanup(ump, error) ||
 		    !MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR)
 			return (error);
 		/*
 		 * NOTE(review): the test above returns unless
 		 * devvp->v_type == VCHR, so the VREG arm below looks
 		 * unreachable.
 		 */
 		if (devvp->v_type == VREG)
 			dbn = fragstoblks(fs, cgtod(fs, cg));
 		else
 			dbn = fsbtodb(fs, cgtod(fs, cg));
 		error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp);
 		KASSERT(error == 0, ("getblkx failed"));
 		softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd);
 		bp->b_flags |= B_RELBUF | B_NOCACHE;
 		bp->b_flags &= ~B_CACHE;
 		bawrite(bp);
 		return (error);
 	}
 	inosused = cg_inosused(cgp);
 	cgino = ino % fs->fs_ipg;
 	if (isclr(inosused, cgino)) {
 		/* Already free: fatal unless the filesystem is read-only. */
 		printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev),
 		    (uintmax_t)ino, fs->fs_fsmnt);
 		if (fs->fs_ronly == 0)
 			panic("ffs_freefile: freeing free inode");
 	}
 	clrbit(inosused, cgino);
 	/* Lower the rotor if the freed inode precedes it. */
 	if (cgino < cgp->cg_irotor)
 		cgp->cg_irotor = cgino;
 	cgp->cg_cs.cs_nifree++;
 	/* The superblock summary counts are updated under the UFS lock. */
 	UFS_LOCK(ump);
 	fs->fs_cstotal.cs_nifree++;
 	fs->fs_cs(fs, cg).cs_nifree++;
 	if ((mode & IFMT) == IFDIR) {
 		cgp->cg_cs.cs_ndir--;
 		fs->fs_cstotal.cs_ndir--;
 		fs->fs_cs(fs, cg).cs_ndir--;
 	}
 	fs->fs_fmod = 1;
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR)
 		softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd);
 	bdwrite(bp);
 	return (0);
 }
 
 /*
  * Report whether inode ino is marked free in the inode map of its
  * cylinder group. Used to check for allocated files in snapshots.
  *
  * Returns non-zero when the inode is free, and also when devvp is
  * neither a regular file nor a character device, when ino is out of
  * range, or when the cylinder group cannot be fetched.
  */
 int
 ffs_checkfreefile(fs, devvp, ino)
 	struct fs *fs;
 	struct vnode *devvp;
 	ino_t ino;
 {
 	struct cg *cgp;
 	struct buf *bp;
 	u_int cg;
 	int isfree;
 
 	cg = ino_to_cg(fs, ino);
 	switch (devvp->v_type) {
 	case VREG:
 	case VCHR:
 		break;
 	default:
 		return (1);
 	}
 	if (ino >= fs->fs_ipg * fs->fs_ncg)
 		return (1);
 	if (ffs_getcg(fs, devvp, cg, 0, &bp, &cgp) != 0)
 		return (1);
 	/* Test the bit at the inode's index within its own group. */
 	isfree = isclr(cg_inosused(cgp), ino % fs->fs_ipg);
 	brelse(bp);
 	return (isfree);
 }
 
 /*
  * Find a block of the specified size in the specified cylinder group.
  *
  * It is a panic if a request is made to find a block if none are
  * available.
  *
  * bpref, when non-zero, selects the byte of the free map at which the
  * search starts; otherwise the search starts at cg_frotor. allocsiz
  * is the number of fragments wanted. Returns the fragment number,
  * relative to the cylinder group, of the run that was found, and
  * leaves cg_frotor at the first fragment of the map byte that
  * matched.
  */
 static ufs1_daddr_t
 ffs_mapsearch(fs, cgp, bpref, allocsiz)
 	struct fs *fs;
 	struct cg *cgp;
 	ufs2_daddr_t bpref;
 	int allocsiz;
 {
 	ufs1_daddr_t bno;
 	int start, len, loc, i;
 	int blk, field, subfield, pos;
 	u_int8_t *blksfree;
 
 	/*
 	 * find the fragment by searching through the free block
 	 * map for an appropriate bit pattern
 	 */
 	if (bpref)
 		start = dtogd(fs, bpref) / NBBY;
 	else
 		start = cgp->cg_frotor / NBBY;
 	blksfree = cg_blksfree(cgp);
 	/* First pass: from the starting byte to the end of the map. */
 	len = howmany(fs->fs_fpg, NBBY) - start;
 	loc = scanc((u_int)len, (u_char *)&blksfree[start],
 		fragtbl[fs->fs_frag],
 		(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
 	if (loc == 0) {
 		/*
 		 * Second pass: wrap around and scan from the beginning
 		 * of the map up to and including the starting byte.
 		 */
 		len = start + 1;
 		start = 0;
 		loc = scanc((u_int)len, (u_char *)&blksfree[0],
 			fragtbl[fs->fs_frag],
 			(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
 		if (loc == 0) {
 			printf("start = %d, len = %d, fs = %s\n",
 			    start, len, fs->fs_fsmnt);
 			panic("ffs_alloccg: map corrupted");
 			/* NOTREACHED */
 		}
 	}
 	/*
 	 * NOTE(review): presumably scanc() returns the number of bytes
 	 * left unscanned when it stops on a match, which would make
 	 * start + len - loc the index of the matching byte -- confirm
 	 * against scanc().
 	 */
 	bno = (start + len - loc) * NBBY;
 	cgp->cg_frotor = bno;
 	/*
 	 * found the byte in the map
 	 * sift through the bits to find the selected frag
 	 */
 	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
 		blk = blkmap(fs, blksfree, bno);
 		blk <<= 1;
 		field = around[allocsiz];
 		subfield = inside[allocsiz];
 		/* Slide the pattern across each position in the block. */
 		for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
 			if ((blk & field) == subfield)
 				return (bno + pos);
 			field <<= 1;
 			subfield <<= 1;
 		}
 	}
 	printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt);
 	panic("ffs_alloccg: block not in map");
 	return (-1);
 }
 
 /*
  * Return the mount statistics recorded for the device underlying
  * devvp. A vnode that is not a character device is replaced by the
  * um_devvp of the mount it lives on until a device vnode is reached.
  */
 static const struct statfs *
 ffs_getmntstat(struct vnode *devvp)
 {
 	struct vnode *vp;
 
 	for (vp = devvp; vp->v_type != VCHR;
 	    vp = VFSTOUFS(vp->v_mount)->um_devvp)
 		continue;
 	return (&vp->v_rdev->si_mountpt->mnt_stat);
 }
 
 /*
  * Fetch and verify a cylinder group.
  *
  * Reads cylinder group cg of fs through devvp, which may be the disk
  * device or a snapshot file (VREG). flags are passed to
  * breadn_flags(), with GB_CKHASH added when the filesystem has
  * CK_CYLGRP set in fs_metackhash.
  *
  * On success returns 0 with the buffer in *bpp and the cylinder group
  * it holds in *cgpp. On failure both are left NULL and the return
  * value is the read error, or EIO when the check hash, the magic
  * number or the group index does not match.
  */
 int
 ffs_getcg(fs, devvp, cg, flags, bpp, cgpp)
 	struct fs *fs;
 	struct vnode *devvp;
 	u_int cg;
 	int flags;
 	struct buf **bpp;
 	struct cg **cgpp;
 {
 	struct buf *bp;
 	struct cg *cgp;
 	const struct statfs *sfs;
 	daddr_t blkno;
 	int error;
 
 	*bpp = NULL;
 	*cgpp = NULL;
 	if ((fs->fs_metackhash & CK_CYLGRP) != 0)
 		flags |= GB_CKHASH;
 	/* The block address is computed differently for a snapshot file. */
 	if (devvp->v_type == VREG)
 		blkno = fragstoblks(fs, cgtod(fs, cg));
 	else
 		blkno = fsbtodb(fs, cgtod(fs, cg));
 	error = breadn_flags(devvp, blkno, blkno, (int)fs->fs_cgsize, NULL,
 	    NULL, 0, NOCRED, flags, ffs_ckhash_cg, &bp);
 	if (error != 0)
 		return (error);
 	cgp = (struct cg *)bp->b_data;
 	/*
 	 * Compare the stored hash with the one in the buffer, but only
 	 * when the buffer is flagged B_CKHASH.
 	 */
 	if ((fs->fs_metackhash & CK_CYLGRP) != 0 &&
 	    (bp->b_flags & B_CKHASH) != 0 &&
 	    cgp->cg_ckhash != bp->b_ckhash) {
 		sfs = ffs_getmntstat(devvp);
 		printf("UFS %s%s (%s) cylinder checksum failed: cg %u, cgp: "
 		    "0x%x != bp: 0x%jx\n",
 		    devvp->v_type == VCHR ? "" : "snapshot of ",
 		    sfs->f_mntfromname, sfs->f_mntonname,
 		    cg, cgp->cg_ckhash, (uintmax_t)bp->b_ckhash);
 		/* Invalidate the buffer so the bad data is not cached. */
 		bp->b_flags &= ~B_CKHASH;
 		bp->b_flags |= B_INVAL | B_NOCACHE;
 		brelse(bp);
 		return (EIO);
 	}
 	if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) {
 		sfs = ffs_getmntstat(devvp);
 		printf("UFS %s%s (%s)",
 		    devvp->v_type == VCHR ? "" : "snapshot of ",
 		    sfs->f_mntfromname, sfs->f_mntonname);
 		if (!cg_chkmagic(cgp))
 			printf(" cg %u: bad magic number 0x%x should be 0x%x\n",
 			    cg, cgp->cg_magic, CG_MAGIC);
 		else
 			printf(": wrong cylinder group cg %u != cgx %u\n", cg,
 			    cgp->cg_cgx);
 		bp->b_flags &= ~B_CKHASH;
 		bp->b_flags |= B_INVAL | B_NOCACHE;
 		brelse(bp);
 		return (EIO);
 	}
 	bp->b_flags &= ~B_CKHASH;
 	bp->b_xflags |= BX_BKGRDWRITE;
 	/*
 	 * If we are using check hashes on the cylinder group then we want
 	 * to limit changing the cylinder group time to when we are actually
 	 * going to write it to disk so that its check hash remains correct
 	 * in memory. If the CK_CYLGRP flag is set the time is updated in
 	 * ffs_bufwrite() as the buffer is queued for writing. Otherwise we
 	 * update the time here as we have done historically.
 	 */
 	if ((fs->fs_metackhash & CK_CYLGRP) != 0)
 		bp->b_xflags |= BX_CYLGRP;
 	else
 		cgp->cg_old_time = cgp->cg_time = time_second;
 	*bpp = bp;
 	*cgpp = cgp;
 	return (0);
 }
 
 /*
  * Compute the check hash of the cylinder group held in bp and leave
  * it in bp->b_ckhash. The cg_ckhash field stored in the group is
  * zeroed while the CRC is taken and put back afterwards.
  */
 static void
 ffs_ckhash_cg(bp)
 	struct buf *bp;
 {
 	struct cg *group = (struct cg *)bp->b_data;
 	uint32_t saved = group->cg_ckhash;
 
 	group->cg_ckhash = 0;
 	bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount);
 	group->cg_ckhash = saved;
 }
 
 /*
  * Log a filesystem error at LOG_ERR. The message identifies the
  * current process (pid and command name), its uid, the inode number
  * and the filesystem's mount name, followed by the caller's text:
  *	pid P (comm), uid U inumber I on fs: error message
  */
 void
 ffs_fserr(fs, inum, cp)
 	struct fs *fs;
 	ino_t inum;
 	char *cp;
 {
 	struct thread *self;
 	struct proc *owner;
 
 	self = curthread;	/* XXX */
 	owner = self->td_proc;
 	log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n",
 	    owner->p_pid, owner->p_comm, self->td_ucred->cr_uid,
 	    (uintmax_t)inum, fs->fs_fsmnt, cp);
 }
 
 /*
  * This function provides the capability for the fsck program to
  * update an active filesystem. Fifteen operations are provided:
  *
  * adjrefcnt(inode, amt) - adjusts the reference count on the
  *	specified inode by the specified amount. Under normal
  *	operation the count should always go down. Decrementing
  *	the count to zero will cause the inode to be freed.
  * adjblkcnt(inode, amt) - adjust the number of blocks used by the
  *	inode by the specified amount.
  * setsize(inode, size) - set the size of the inode to the
  *	specified size.
  * adjndir, adjnbfree, adjnifree, adjnffree, adjnumclusters(amt) -
  *	adjust the superblock summary.
  * freedirs(inode, count) - directory inodes [inode..inode + count - 1]
  *	are marked as free. Inodes should never have to be marked
  *	as in use.
  * freefiles(inode, count) - file inodes [inode..inode + count - 1]
  *	are marked as free. Inodes should never have to be marked
  *	as in use.
  * freeblks(blockno, size) - blocks [blockno..blockno + size - 1]
  *	are marked as free. Blocks should never have to be marked
  *	as in use.
  * setflags(flags, set/clear) - the fs_flags field has the specified
  *	flags set (second parameter +1) or cleared (second parameter -1).
  * setcwd(dirinode) - set the current directory to dirinode in the
  *	filesystem associated with the snapshot.
  * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".."
  *	in the current directory is oldvalue then change it to newvalue.
  * unlink(nameptr, oldvalue) - Verify that the inode number associated
  *	with nameptr in the current directory is oldvalue then unlink it.
  */
 
 /*
  * Every OID declared below is write-only (CTLFLAG_WR) and is serviced
  * by sysctl_ffs_fsck(), which tells the requests apart by OID number.
  */
 static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt,
     CTLFLAG_WR | CTLTYPE_STRUCT | CTLFLAG_NEEDGIANT,
     0, 0, sysctl_ffs_fsck, "S,fsck",
     "Adjust Inode Reference Count");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Adjust Inode Used Blocks Count");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_SET_SIZE, setsize,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Set the inode size");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Adjust number of directories");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Adjust number of free blocks");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Adjust number of free inodes");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Adjust number of free frags");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Adjust number of free clusters");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Free Range of Directory Inodes");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Free Range of File Inodes");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Free Range of Blocks");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Change Filesystem Flags");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Set Current Working Directory");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Change Value of .. Entry");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Unlink a Duplicate Name");
 
 #ifdef DIAGNOSTIC
 /* When non-zero, each fsck request is reported with printf(). */
 static int fsckcmds = 0;
 SYSCTL_INT(_debug, OID_AUTO, ffs_fsckcmds, CTLFLAG_RW, &fsckcmds, 0,
 	"print out fsck_ffs-based filesystem update commands");
 #endif /* DIAGNOSTIC */
 
 /*
  * Handler shared by the fsck update sysctls.
  *
  * Copies in a struct fsck_cmd, checks its version, resolves cmd.handle
  * to a file (requiring CAP_FSCK) whose vnode must be a regular file or
  * a directory on a "ufs" mount, and then performs the operation
  * selected by oidp->oid_number with cmd.value and cmd.size as its
  * operands.
  *
  * Returns 0 or an errno: EBADRPC for an oversized request,
  * ERPCMISMATCH for a version mismatch, EINVAL for an unsuitable vnode
  * or mount or an unknown request, EROFS for a read-only mount whose
  * um_fsckpid is not the calling process, or the error reported by
  * the operation itself.
  */
 static int
 sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 {
 	struct thread *td = curthread;
 	struct fsck_cmd cmd;
 	struct ufsmount *ump;
 	struct vnode *vp, *dvp, *fdvp;
 	struct inode *ip, *dp;
 	struct mount *mp;
 	struct fs *fs;
 	struct pwd *pwd;
 	ufs2_daddr_t blkno;
 	long blkcnt, blksize;
 	u_long key;
 	struct file *fp;
 	cap_rights_t rights;
 	int filetype, error;
 
 	if (req->newlen > sizeof cmd)
 		return (EBADRPC);
 	if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0)
 		return (error);
 	if (cmd.version != FFS_CMD_VERSION)
 		return (ERPCMISMATCH);
 	if ((error = getvnode(td, cmd.handle,
 	    cap_rights_init(&rights, CAP_FSCK), &fp)) != 0)
 		return (error);
 	vp = fp->f_data;
 	if (vp->v_type != VREG && vp->v_type != VDIR) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	vn_start_write(vp, &mp, V_WAIT);
 	if (mp == NULL ||
 	    strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) {
 		vn_finished_write(mp);
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	ump = VFSTOUFS(mp);
 	/* A read-only mount may only be updated by its registered fsck. */
 	if ((mp->mnt_flag & MNT_RDONLY) &&
 	    ump->um_fsckpid != td->td_proc->p_pid) {
 		vn_finished_write(mp);
 		fdrop(fp, td);
 		return (EROFS);
 	}
 	fs = ump->um_fs;
 	/* FFS_DIR_FREE overrides this before falling into FFS_FILE_FREE. */
 	filetype = IFREG;
 
 	switch (oidp->oid_number) {
 
 	case FFS_SET_FLAGS:
 #ifdef DIAGNOSTIC
 		if (fsckcmds)
 			printf("%s: %s flags\n", mp->mnt_stat.f_mntonname,
 			    cmd.size > 0 ? "set" : "clear");
 #endif /* DIAGNOSTIC */
 		/* cmd.size > 0 sets the flags in cmd.value, else clears. */
 		if (cmd.size > 0)
 			fs->fs_flags |= (long)cmd.value;
 		else
 			fs->fs_flags &= ~(long)cmd.value;
 		break;
 
 	case FFS_ADJ_REFCNT:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: adjust inode %jd link count by %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
 			    (intmax_t)cmd.size);
 		}
 #endif /* DIAGNOSTIC */
 		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
 			break;
 		ip = VTOI(vp);
 		ip->i_nlink += cmd.size;
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_effnlink += cmd.size;
 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED);
 		error = ffs_update(vp, 1);
 		if (DOINGSOFTDEP(vp))
 			softdep_change_linkcnt(ip);
 		vput(vp);
 		break;
 
 	case FFS_ADJ_BLKCNT:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: adjust inode %jd block count by %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
 			    (intmax_t)cmd.size);
 		}
 #endif /* DIAGNOSTIC */
 		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
 			break;
 		ip = VTOI(vp);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size);
 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED);
 		error = ffs_update(vp, 1);
 		vput(vp);
 		break;
 
 	case FFS_SET_SIZE:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: set inode %jd size to %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
 			    (intmax_t)cmd.size);
 		}
 #endif /* DIAGNOSTIC */
 		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
 			break;
 		ip = VTOI(vp);
 		DIP_SET(ip, i_size, cmd.size);
 		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_MODIFIED);
 		error = ffs_update(vp, 1);
 		vput(vp);
 		break;
 
 	case FFS_DIR_FREE:
 		filetype = IFDIR;
 		/* fall through */
 
 	case FFS_FILE_FREE:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			if (cmd.size == 1)
 				printf("%s: free %s inode %ju\n",
 				    mp->mnt_stat.f_mntonname,
 				    filetype == IFDIR ? "directory" : "file",
 				    (uintmax_t)cmd.value);
 			else
 				printf("%s: free %s inodes %ju-%ju\n",
 				    mp->mnt_stat.f_mntonname,
 				    filetype == IFDIR ? "directory" : "file",
 				    (uintmax_t)cmd.value,
 				    (uintmax_t)(cmd.value + cmd.size - 1));
 		}
 #endif /* DIAGNOSTIC */
 		/* Free cmd.size inodes starting at cmd.value, one by one. */
 		while (cmd.size > 0) {
 			if ((error = ffs_freefile(ump, fs, ump->um_devvp,
 			    cmd.value, filetype, NULL)))
 				break;
 			cmd.size -= 1;
 			cmd.value += 1;
 		}
 		break;
 
 	case FFS_BLK_FREE:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			if (cmd.size == 1)
 				printf("%s: free block %jd\n",
 				    mp->mnt_stat.f_mntonname,
 				    (intmax_t)cmd.value);
 			else
 				printf("%s: free blocks %jd-%jd\n",
 				    mp->mnt_stat.f_mntonname, 
 				    (intmax_t)cmd.value,
 				    (intmax_t)cmd.value + cmd.size - 1);
 		}
 #endif /* DIAGNOSTIC */
 		blkno = cmd.value;
 		blkcnt = cmd.size;
 		/*
 		 * The first piece runs only to the next fs_frag boundary;
 		 * every later piece is a full fs_frag fragments, trimmed
 		 * to the count that remains.
 		 */
 		blksize = fs->fs_frag - (blkno % fs->fs_frag);
 		key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO);
 		while (blkcnt > 0) {
 			if (blkcnt < blksize)
 				blksize = blkcnt;
 			ffs_blkfree(ump, fs, ump->um_devvp, blkno,
 			    blksize * fs->fs_fsize, UFS_ROOTINO, 
 			    VDIR, NULL, key);
 			blkno += blksize;
 			blkcnt -= blksize;
 			blksize = fs->fs_frag;
 		}
 		ffs_blkrelease_finish(ump, key);
 		break;
 
 	/*
 	 * Adjust superblock summaries.  fsck(8) is expected to
 	 * submit deltas when necessary.
 	 */
 	case FFS_ADJ_NDIR:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: adjust number of directories by %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DIAGNOSTIC */
 		fs->fs_cstotal.cs_ndir += cmd.value;
 		break;
 
 	case FFS_ADJ_NBFREE:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: adjust number of free blocks by %+jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DIAGNOSTIC */
 		fs->fs_cstotal.cs_nbfree += cmd.value;
 		break;
 
 	case FFS_ADJ_NIFREE:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: adjust number of free inodes by %+jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DIAGNOSTIC */
 		fs->fs_cstotal.cs_nifree += cmd.value;
 		break;
 
 	case FFS_ADJ_NFFREE:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: adjust number of free frags by %+jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DIAGNOSTIC */
 		fs->fs_cstotal.cs_nffree += cmd.value;
 		break;
 
 	case FFS_ADJ_NUMCLUSTERS:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: adjust number of free clusters by %+jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DIAGNOSTIC */
 		fs->fs_cstotal.cs_numclusters += cmd.value;
 		break;
 
 	case FFS_SET_CWD:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: set current directory to inode %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DIAGNOSTIC */
 		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp)))
 			break;
 		AUDIT_ARG_VNODE1(vp);
 		if ((error = change_dir(vp, td)) != 0) {
 			vput(vp);
 			break;
 		}
 		/* Unlock only; the vnode is handed on to pwd_chdir(). */
 		VOP_UNLOCK(vp);
 		pwd_chdir(td, vp);
 		break;
 
 	case FFS_SET_DOTDOT:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: change .. in cwd from %jd to %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
 			    (intmax_t)cmd.size);
 		}
 #endif /* DIAGNOSTIC */
 		/*
 		 * First we have to get and lock the parent directory
 		 * to which ".." points.
 		 */
 		error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp);
 		if (error)
 			break;
 		/*
 		 * Now we get and lock the child directory containing "..".
 		 */
 		pwd = pwd_hold(td);
 		dvp = pwd->pwd_cdir;
-		if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) {
+		if ((error = vget(dvp, LK_EXCLUSIVE)) != 0) {
 			vput(fdvp);
 			pwd_drop(pwd);
 			break;
 		}
 		dp = VTOI(dvp);
 		dp->i_offset = 12;	/* XXX mastertemplate.dot_reclen */
 		error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size,
 		    DT_DIR, 0);
 		cache_purge(fdvp);
 		cache_purge(dvp);
 		vput(dvp);
 		vput(fdvp);
 		pwd_drop(pwd);
 		break;
 
 	case FFS_UNLINK:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			char buf[32];
 
 			if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL))
 				strncpy(buf, "Name_too_long", 32);
 			printf("%s: unlink %s (inode %jd)\n",
 			    mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size);
 		}
 #endif /* DIAGNOSTIC */
 		/*
 		 * kern_funlinkat will do its own start/finish writes and
 		 * they do not nest, so drop ours here. Setting mp == NULL
 		 * indicates that vn_finished_write is not needed down below.
 		 */
 		vn_finished_write(mp);
 		mp = NULL;
 		error = kern_funlinkat(td, AT_FDCWD,
 		    (char *)(intptr_t)cmd.value, FD_NONE, UIO_USERSPACE,
 		    0, (ino_t)cmd.size);
 		break;
 
 	default:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("Invalid request %d from fsck\n",
 			    oidp->oid_number);
 		}
 #endif /* DIAGNOSTIC */
 		error = EINVAL;
 		break;
 
 	}
 	/* Common exit: release the file and end the write started above. */
 	fdrop(fp, td);
 	vn_finished_write(mp);
 	return (error);
 }
Index: projects/clang1100-import/sys/ufs/ffs/ffs_softdep.c
===================================================================
--- projects/clang1100-import/sys/ufs/ffs/ffs_softdep.c	(revision 364278)
+++ projects/clang1100-import/sys/ufs/ffs/ffs_softdep.c	(revision 364279)
@@ -1,14827 +1,14826 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright 1998, 2000 Marshall Kirk McKusick.
  * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
  * All rights reserved.
  *
  * The soft updates code is derived from the appendix of a University
  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
  * "Soft Updates: A Solution to the Metadata Update Problem in File
  * Systems", CSE-TR-254-95, August 1995).
  *
  * Further information about soft updates can be obtained from:
  *
  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
  *	1614 Oxford Street		mckusick@mckusick.com
  *	Berkeley, CA 94709-1608		+1-510-843-9542
  *	USA
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ffs.h"
 #include "opt_quota.h"
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/kdb.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/rwlock.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/vnode.h>
 #include <sys/conf.h>
 
 #include <ufs/ufs/dir.h>
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/softdep.h>
 #include <ufs/ffs/ffs_extern.h>
 #include <ufs/ufs/ufs_extern.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
 
 #include <ddb/ddb.h>
 
 #define	KTR_SUJ	0	/* Define to KTR_SPARE. */
 
 #ifndef SOFTUPDATES
 
 /*
  * Stub compiled when SOFTUPDATES is not defined; panics if called.
  * No value is returned because panic() is the only statement.
  */
 int
 softdep_flushfiles(oldmnt, flags, td)
 	struct mount *oldmnt;
 	int flags;
 	struct thread *td;
 {
 
 	panic("softdep_flushfiles called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined; does nothing and
  * returns 0.
  */
 int
 softdep_mount(devvp, mp, fs, cred)
 	struct vnode *devvp;
 	struct mount *mp;
 	struct fs *fs;
 	struct ucred *cred;
 {
 
 	return (0);
 }
 
 /* Stub compiled when SOFTUPDATES is not defined; does nothing. */
 void
 softdep_initialize()
 {
 
 	return;
 }
 
 /* Stub compiled when SOFTUPDATES is not defined; does nothing. */
 void
 softdep_uninitialize()
 {
 
 	return;
 }
 
 /* Stub compiled when SOFTUPDATES is not defined; panics if called. */
 void
 softdep_unmount(mp)
 	struct mount *mp;
 {
 
 	panic("softdep_unmount called");
 }
 
 /* Stub compiled when SOFTUPDATES is not defined; panics if called. */
 void
 softdep_setup_sbupdate(ump, fs, bp)
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct buf *bp;
 {
 
 	panic("softdep_setup_sbupdate called");
 }
 
 /* Stub compiled when SOFTUPDATES is not defined; panics if called. */
 void
 softdep_setup_inomapdep(bp, ip, newinum, mode)
 	struct buf *bp;
 	struct inode *ip;
 	ino_t newinum;
 	int mode;
 {
 
 	panic("softdep_setup_inomapdep called");
 }
 
 /* Stub compiled when SOFTUPDATES is not defined; panics if called. */
 void
 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
 	struct buf *bp;
 	struct mount *mp;
 	ufs2_daddr_t newblkno;
 	int frags;
 	int oldfrags;
 {
 
 	panic("softdep_setup_blkmapdep called");
 }
 
 /* Stub compiled when SOFTUPDATES is not defined; panics if called. */
 void
 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 	struct inode *ip;
 	ufs_lbn_t lbn;
 	ufs2_daddr_t newblkno;
 	ufs2_daddr_t oldblkno;
 	long newsize;
 	long oldsize;
 	struct buf *bp;
 {
 	
 	panic("softdep_setup_allocdirect called");
 }
 
 /* Stub compiled when SOFTUPDATES is not defined; panics if called. */
 void
 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 	struct inode *ip;
 	ufs_lbn_t lbn;
 	ufs2_daddr_t newblkno;
 	ufs2_daddr_t oldblkno;
 	long newsize;
 	long oldsize;
 	struct buf *bp;
 {
 	
 	panic("softdep_setup_allocext called");
 }
 
 /* Stub compiled when SOFTUPDATES is not defined; panics if called. */
 void
 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
 	struct inode *ip;
 	ufs_lbn_t lbn;
 	struct buf *bp;
 	int ptrno;
 	ufs2_daddr_t newblkno;
 	ufs2_daddr_t oldblkno;
 	struct buf *nbp;
 {
 
 	panic("softdep_setup_allocindir_page called");
 }
 
 /* Stub compiled when SOFTUPDATES is not defined; panics if called. */
 void
 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
 	struct buf *nbp;
 	struct inode *ip;
 	struct buf *bp;
 	int ptrno;
 	ufs2_daddr_t newblkno;
 {
 
 	panic("softdep_setup_allocindir_meta called");
 }
 
 /* Stub compiled when SOFTUPDATES is not defined; panics if called. */
 void
 softdep_journal_freeblocks(ip, cred, length, flags)
 	struct inode *ip;
 	struct ucred *cred;
 	off_t length;
 	int flags;
 {
 	
 	panic("softdep_journal_freeblocks called");
 }
 
 /* Stub compiled when SOFTUPDATES is not defined; panics if called. */
 void
 softdep_journal_fsync(ip)
 	struct inode *ip;
 {
 
 	panic("softdep_journal_fsync called");
 }
 
 /* Stub compiled when SOFTUPDATES is not defined; panics if called. */
 void
 softdep_setup_freeblocks(ip, length, flags)
 	struct inode *ip;
 	off_t length;
 	int flags;
 {
 	
 	panic("softdep_setup_freeblocks called");
 }
 
 /* Stub compiled when SOFTUPDATES is not defined; panics if called. */
 void
 softdep_freefile(pvp, ino, mode)
 		struct vnode *pvp;
 		ino_t ino;
 		int mode;
 {
 
 	panic("softdep_freefile called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined; panics if called.
  * No value is returned because panic() is the only statement.
  */
 int
 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
 	struct buf *bp;
 	struct inode *dp;
 	off_t diroffset;
 	ino_t newinum;
 	struct buf *newdirbp;
 	int isnewblk;
 {
 
 	panic("softdep_setup_directory_add called");
 }
 
 /* Stub compiled when SOFTUPDATES is not defined; panics if called. */
 void
 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
 	struct buf *bp;
 	struct inode *dp;
 	caddr_t base;
 	caddr_t oldloc;
 	caddr_t newloc;
 	int entrysize;
 {
 
 	panic("softdep_change_directoryentry_offset called");
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  All arguments are ignored.
  */
 void
 softdep_setup_remove(struct buf *bp, struct inode *dp, struct inode *ip,
     int isrmdir)
 {
 	panic("softdep_setup_remove called");
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  All arguments are ignored.
  */
 void
 softdep_setup_directory_change(struct buf *bp, struct inode *dp,
     struct inode *ip, ino_t newinum, int isrmdir)
 {
 	panic("softdep_setup_directory_change called");
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  All arguments are ignored.  The panic
  * message is built from the standard __func__ identifier.
  */
 void
 softdep_setup_blkfree(struct mount *mp, struct buf *bp, ufs2_daddr_t blkno,
     int frags, struct workhead *wkhd)
 {
 	panic("%s called", __func__);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  All arguments are ignored.  The panic
  * message is built from the standard __func__ identifier.
  */
 void
 softdep_setup_inofree(struct mount *mp, struct buf *bp, ino_t ino,
     struct workhead *wkhd)
 {
 	panic("%s called", __func__);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  Both arguments are ignored.
  */
 void
 softdep_setup_unlink(struct inode *dp, struct inode *ip)
 {
 	panic("%s called", __func__);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  Both arguments are ignored.
  */
 void
 softdep_setup_link(struct inode *dp, struct inode *ip)
 {
 	panic("%s called", __func__);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  Both arguments are ignored.
  */
 void
 softdep_revert_link(struct inode *dp, struct inode *ip)
 {
 	panic("%s called", __func__);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  Both arguments are ignored.
  */
 void
 softdep_setup_rmdir(struct inode *dp, struct inode *ip)
 {
 	panic("%s called", __func__);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  Both arguments are ignored.
  */
 void
 softdep_revert_rmdir(struct inode *dp, struct inode *ip)
 {
 	panic("%s called", __func__);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  Both arguments are ignored.
  */
 void
 softdep_setup_create(struct inode *dp, struct inode *ip)
 {
 	panic("%s called", __func__);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  Both arguments are ignored.
  */
 void
 softdep_revert_create(struct inode *dp, struct inode *ip)
 {
 	panic("%s called", __func__);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  Both arguments are ignored.
  */
 void
 softdep_setup_mkdir(struct inode *dp, struct inode *ip)
 {
 	panic("%s called", __func__);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  Both arguments are ignored.
  */
 void
 softdep_revert_mkdir(struct inode *dp, struct inode *ip)
 {
 	panic("%s called", __func__);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  Both arguments are ignored.
  */
 void
 softdep_setup_dotdot_link(struct inode *dp, struct inode *ip)
 {
 	panic("%s called", __func__);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  Both arguments are ignored.  There is no
  * return statement: this relies on panic() not returning.
  */
 int
 softdep_prealloc(struct vnode *vp, int waitok)
 {
 	panic("%s called", __func__);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation.
  * Always fails with ENOENT; mp is ignored and *vpp is never written.
  */
 int
 softdep_journal_lookup(struct mount *mp, struct vnode **vpp)
 {
 	return (ENOENT);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  The argument is ignored.
  */
 void
 softdep_change_linkcnt(struct inode *ip)
 {
 	panic("softdep_change_linkcnt called");
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  The argument is ignored.
  */
 void
 softdep_load_inodeblock(struct inode *ip)
 {
 	panic("softdep_load_inodeblock called");
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  All arguments are ignored.
  */
 void
 softdep_update_inodeblock(struct inode *ip, struct buf *bp, int waitfor)
 {
 	panic("softdep_update_inodeblock called");
 }
 
 /*
  * Stub from the branch built without the soft updates implementation.
  * Does nothing and reports success (0).
  *
  * vp: the "in_core" copy of the inode; ignored here.
  */
 int
 softdep_fsync(struct vnode *vp)
 {
 	return (0);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation.
  * Deliberately a no-op; vp is ignored.
  */
 void
 softdep_fsync_mountdev(struct vnode *vp)
 {
 }
 
 /*
  * Stub from the branch built without the soft updates implementation.
  * Stores 0 in *countp and returns 0; oldmnt and td are ignored.  countp
  * must be a valid pointer, as it is written unconditionally.
  */
 int
 softdep_flushworklist(struct mount *oldmnt, int *countp, struct thread *td)
 {
 	*countp = 0;
 	return (0);
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  vp is ignored, and nothing is returned
  * because the body consists solely of the panic() call.
  */
 int
 softdep_sync_metadata(struct vnode *vp)
 {
 	panic("softdep_sync_metadata called");
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  All arguments are ignored, and nothing is
  * returned because the body consists solely of the panic() call.
  */
 int
 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
 {
 	panic("softdep_sync_buf called");
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  vp is ignored.  There is no return
  * statement: this relies on panic() not returning.
  */
 int
 softdep_slowdown(struct vnode *vp)
 {
 	panic("softdep_slowdown called");
 }
 
 /*
  * Stub from the branch built without the soft updates implementation.
  * Always returns 0; all arguments are ignored.
  */
 int
 softdep_request_cleanup(struct fs *fs, struct vnode *vp, struct ucred *cred,
     int resource)
 {
 	return (0);
 }
 
 /*
  * Variant of the suspend check from the branch built without the soft
  * updates implementation.  The two softdep counters are unused here.
  *
  * Must be entered with devvp's bufobj write-locked (asserted).  Takes the
  * mount interlock, waits until mp->mnt_secondary_writes drains to zero,
  * and then returns EAGAIN if more work is needed before suspending, or 0
  * otherwise.  On return the bufobj lock has been released; the mount
  * interlock taken here is NOT released by this function.
  */
 int
 softdep_check_suspend(struct mount *mp,
 		      struct vnode *devvp,
 		      int softdep_depcnt,
 		      int softdep_accdepcnt,
 		      int secondary_writes,
 		      int secondary_accwrites)
 {
 	struct bufobj *bo;
 	int error;
 
 	(void) softdep_depcnt;
 	(void) softdep_accdepcnt;
 
 	bo = &devvp->v_bufobj;
 	ASSERT_BO_WLOCKED(bo);
 
 	MNT_ILOCK(mp);
 	while (mp->mnt_secondary_writes != 0) {
 		/*
 		 * Drop the bufobj lock while sleeping.  msleep() is passed
 		 * PDROP, so both locks are taken again afterwards, bufobj
 		 * first and then the mount interlock.
 		 */
 		BO_UNLOCK(bo);
 		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
 		    (PUSER - 1) | PDROP, "secwr", 0);
 		BO_LOCK(bo);
 		MNT_ILOCK(mp);
 	}
 
 	/*
 	 * Reasons for needing more work before suspend:
 	 * - Dirty buffers on devvp.
 	 * - Secondary writes occurred after start of vnode sync loop
 	 */
 	error = 0;
 	if (bo->bo_numoutput > 0 ||
 	    bo->bo_dirty.bv_cnt > 0 ||
 	    secondary_writes != 0 ||
 	    mp->mnt_secondary_writes != 0 ||
 	    secondary_accwrites != mp->mnt_secondary_accwrites)
 		error = EAGAIN;
 	BO_UNLOCK(bo);
 	return (error);
 }
 
 /*
  * Variant from the branch built without the soft updates implementation:
  * report zero for both counts.  mp is unused; both output pointers are
  * written unconditionally.
  */
 void
 softdep_get_depcounts(struct mount *mp, int *softdepactivep,
     int *softdepactiveaccp)
 {
 	(void)mp;
 	*softdepactivep = *softdepactiveaccp = 0;
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  Both arguments are ignored.  The panic
  * message names this function.
  */
 void
 softdep_buf_append(struct buf *bp, struct workhead *wkhd)
 {
 	panic("softdep_buf_append called");
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  All arguments are ignored.  The panic
  * message names this function.
  */
 void
 softdep_inode_append(struct inode *ip, struct ucred *cred,
     struct workhead *wkhd)
 {
 	panic("softdep_inode_append called");
 }
 
 /*
  * Stub from the branch built without the soft updates implementation;
  * calling it is a fatal error.  The argument is ignored.
  */
 void
 softdep_freework(struct workhead *wkhd)
 {
 	panic("softdep_freework called");
 }
 
 #else
 
 /* Register the "softupdates" feature with its description string. */
 FEATURE(softupdates, "FFS soft-updates support");
 
 /*
  * Sysctl nodes: debug.softdep, plus one child node per statistic kind
  * (total, highuse, current, write).  The per-type leaves are attached to
  * the child nodes by the SOFTDEP_TYPE() macro.
  */
 static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "soft updates stats");
 static SYSCTL_NODE(_debug_softdep, OID_AUTO, total,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "total dependencies allocated");
 static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "high use dependencies allocated");
 static SYSCTL_NODE(_debug_softdep, OID_AUTO, current,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "current dependencies allocated");
 static SYSCTL_NODE(_debug_softdep, OID_AUTO, write,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "current dependencies written");
 
 /*
  * Global per-type counters, indexed by workitem type (D_XXX) and sized
  * D_LAST + 1.  Each slot is exported read-only through the matching
  * debug.softdep.{current,highuse,total,write} sysctl by SOFTDEP_TYPE().
  */
 unsigned long dep_current[D_LAST + 1];
 unsigned long dep_highuse[D_LAST + 1];
 unsigned long dep_total[D_LAST + 1];
 unsigned long dep_write[D_LAST + 1];
 
 /*
  * For one dependency type, define the malloc type M_<type> (short name
  * <str>, long description longdesc) and export that type's slots of
  * dep_total, dep_current, dep_highuse and dep_write as read-only sysctls
  * named <str> under the corresponding debug.softdep child node.
  *
  * The expansion has no trailing semicolon; the invocation supplies it.
  */
 #define	SOFTDEP_TYPE(type, str, longdesc)				\
     static MALLOC_DEFINE(M_ ## type, #str, longdesc);			\
     SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
 	&dep_total[D_ ## type], 0, "");					\
     SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD,	\
 	&dep_current[D_ ## type], 0, "");				\
     SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD,	\
 	&dep_highuse[D_ ## type], 0, "");				\
     SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD,	\
 	&dep_write[D_ ## type], 0, "")
 
 /*
  * One SOFTDEP_TYPE() instance per dependency type: each defines M_<TYPE>
  * and the four per-type statistics sysctls.
  */
 SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); 
 SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
 SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
     "Block or frag allocated from cyl group map");
 SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
 SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
 SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
 SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
 SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
 SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
 SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
 SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
 SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
 SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
 SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
 SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
 SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
 SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
 SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
 SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
 SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
 SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
 SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
 SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
 SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
 SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
 SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
 SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
 
 /*
  * Malloc types defined directly rather than through SOFTDEP_TYPE(), so
  * they get no per-type statistics sysctls.
  */
 static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
 
 static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
 static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
 static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");
 
 /* Allocation flags for soft updates allocations: M_WAITOK only. */
 #define M_SOFTDEP_FLAGS	(M_WAITOK)
 
 /*
  * Translate from workitem type to memory type.
  * The order MUST match the D_XXX numbering, such that
  * memtype[D_XXX] == M_XXX.  Slot 0 is NULL; TYPENAME() only indexes the
  * table for types in the range D_FIRST..D_LAST.
  * NOTE(review): presumably D_FIRST is 1 and D_LAST is the sentinel type;
  * confirm against the D_XXX definitions.
  */
 static struct malloc_type *memtype[] = {
 	NULL,
 	M_PAGEDEP,
 	M_INODEDEP,
 	M_BMSAFEMAP,
 	M_NEWBLK,
 	M_ALLOCDIRECT,
 	M_INDIRDEP,
 	M_ALLOCINDIR,
 	M_FREEFRAG,
 	M_FREEBLKS,
 	M_FREEFILE,
 	M_DIRADD,
 	M_MKDIR,
 	M_DIRREM,
 	M_NEWDIRBLK,
 	M_FREEWORK,
 	M_FREEDEP,
 	M_JADDREF,
 	M_JREMREF,
 	M_JMVREF,
 	M_JNEWBLK,
 	M_JFREEBLK,
 	M_JFREEFRAG,
 	M_JSEG,
 	M_JSEGDEP,
 	M_SBDEP,
 	M_JTRUNC,
 	M_JFSYNC,
 	M_SENTINEL
 };
 
 /* Map a workitem type (D_XXX) to its malloc type; no range checking. */
 #define DtoM(type) (memtype[type])
 
 /*
  * Printable name of a workitem type: the short description of its malloc
  * type, or "???" when the type lies outside D_FIRST..D_LAST.  The
  * argument is evaluated more than once.
  */
 #define TYPENAME(type)  \
 	((unsigned)(type) <= D_LAST && (unsigned)(type) >= D_FIRST ? \
 	memtype[type]->ks_shortdesc : "???")
 /*
  * End system adaptation definitions.
  */
 
 #define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
 #define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
 
 /*
  * Internal function prototypes.
  */
 static	void check_clear_deps(struct mount *);
 static	void softdep_error(char *, int);
 static	int softdep_process_worklist(struct mount *, int);
 static	int softdep_waitidle(struct mount *, int);
 static	void drain_output(struct vnode *);
 static	struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
 static	int check_inodedep_free(struct inodedep *);
 static	void clear_remove(struct mount *);
 static	void clear_inodedeps(struct mount *);
 static	void unlinked_inodedep(struct mount *, struct inodedep *);
 static	void clear_unlinked_inodedep(struct inodedep *);
 static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
 static	int flush_pagedep_deps(struct vnode *, struct mount *,
 	    struct diraddhd *);
 static	int free_pagedep(struct pagedep *);
 static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
 static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
 static	int flush_deplist(struct allocdirectlst *, int, int *);
 static	int sync_cgs(struct mount *, int);
 static	int handle_written_filepage(struct pagedep *, struct buf *, int);
 static	int handle_written_sbdep(struct sbdep *, struct buf *);
 static	void initiate_write_sbdep(struct sbdep *);
 static	void diradd_inode_written(struct diradd *, struct inodedep *);
 static	int handle_written_indirdep(struct indirdep *, struct buf *,
 	    struct buf**, int);
 static	int handle_written_inodeblock(struct inodedep *, struct buf *, int);
 static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
 	    uint8_t *);
 static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
 static	void handle_written_jaddref(struct jaddref *);
 static	void handle_written_jremref(struct jremref *);
 static	void handle_written_jseg(struct jseg *, struct buf *);
 static	void handle_written_jnewblk(struct jnewblk *);
 static	void handle_written_jblkdep(struct jblkdep *);
 static	void handle_written_jfreefrag(struct jfreefrag *);
 static	void complete_jseg(struct jseg *);
 static	void complete_jsegs(struct jseg *);
 static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
 static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
 static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
 static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
 static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
 static	void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
 static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
 static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
 static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
 static	inline void inoref_write(struct inoref *, struct jseg *,
 	    struct jrefrec *);
 static	void handle_allocdirect_partdone(struct allocdirect *,
 	    struct workhead *);
 static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
 	    struct workhead *);
 static	void indirdep_complete(struct indirdep *);
 static	int indirblk_lookup(struct mount *, ufs2_daddr_t);
 static	void indirblk_insert(struct freework *);
 static	void indirblk_remove(struct freework *);
 static	void handle_allocindir_partdone(struct allocindir *);
 static	void initiate_write_filepage(struct pagedep *, struct buf *);
 static	void initiate_write_indirdep(struct indirdep*, struct buf *);
 static	void handle_written_mkdir(struct mkdir *, int);
 static	int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
 	    uint8_t *);
 static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
 static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
 static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
 static	void handle_workitem_freefile(struct freefile *);
 static	int handle_workitem_remove(struct dirrem *, int);
 static	struct dirrem *newdirrem(struct buf *, struct inode *,
 	    struct inode *, int, struct dirrem **);
 static	struct indirdep *indirdep_lookup(struct mount *, struct inode *,
 	    struct buf *);
 static	void cancel_indirdep(struct indirdep *, struct buf *,
 	    struct freeblks *);
 static	void free_indirdep(struct indirdep *);
 static	void free_diradd(struct diradd *, struct workhead *);
 static	void merge_diradd(struct inodedep *, struct diradd *);
 static	void complete_diradd(struct diradd *);
 static	struct diradd *diradd_lookup(struct pagedep *, int);
 static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
 	    struct jremref *);
 static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
 	    struct jremref *);
 static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
 	    struct jremref *, struct jremref *);
 static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
 	    struct jremref *);
 static	void cancel_allocindir(struct allocindir *, struct buf *bp,
 	    struct freeblks *, int);
 static	int setup_trunc_indir(struct freeblks *, struct inode *,
 	    ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
 static	void complete_trunc_indir(struct freework *);
 static	void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
 	    int);
 static	void complete_mkdir(struct mkdir *);
 static	void free_newdirblk(struct newdirblk *);
 static	void free_jremref(struct jremref *);
 static	void free_jaddref(struct jaddref *);
 static	void free_jsegdep(struct jsegdep *);
 static	void free_jsegs(struct jblocks *);
 static	void rele_jseg(struct jseg *);
 static	void free_jseg(struct jseg *, struct jblocks *);
 static	void free_jnewblk(struct jnewblk *);
 static	void free_jblkdep(struct jblkdep *);
 static	void free_jfreefrag(struct jfreefrag *);
 static	void free_freedep(struct freedep *);
 static	void journal_jremref(struct dirrem *, struct jremref *,
 	    struct inodedep *);
 static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
 static	int cancel_jaddref(struct jaddref *, struct inodedep *,
 	    struct workhead *);
 static	void cancel_jfreefrag(struct jfreefrag *);
 static	inline void setup_freedirect(struct freeblks *, struct inode *,
 	    int, int);
 static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
 static	inline void setup_freeindir(struct freeblks *, struct inode *, int,
 	    ufs_lbn_t, int);
 static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
 static	void freeblks_free(struct ufsmount *, struct freeblks *, int);
 static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
 static	ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
 static	int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
 static	void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
 	    int, int);
 static	void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
 static 	int cancel_pagedep(struct pagedep *, struct freeblks *, int);
 static	int deallocate_dependencies(struct buf *, struct freeblks *, int);
 static	void newblk_freefrag(struct newblk*);
 static	void free_newblk(struct newblk *);
 static	void cancel_allocdirect(struct allocdirectlst *,
 	    struct allocdirect *, struct freeblks *);
 static	int check_inode_unwritten(struct inodedep *);
 static	int free_inodedep(struct inodedep *);
 static	void freework_freeblock(struct freework *, u_long);
 static	void freework_enqueue(struct freework *);
 static	int handle_workitem_freeblocks(struct freeblks *, int);
 static	int handle_complete_freeblocks(struct freeblks *, int);
 static	void handle_workitem_indirblk(struct freework *);
 static	void handle_written_freework(struct freework *);
 static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
 static	struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
 	    struct workhead *);
 static	struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
 	    struct inodedep *, struct allocindir *, ufs_lbn_t);
 static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
 	    ufs2_daddr_t, ufs_lbn_t);
 static	void handle_workitem_freefrag(struct freefrag *);
 static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
 	    ufs_lbn_t, u_long);
 static	void allocdirect_merge(struct allocdirectlst *,
 	    struct allocdirect *, struct allocdirect *);
 static	struct freefrag *allocindir_merge(struct allocindir *,
 	    struct allocindir *);
 static	int bmsafemap_find(struct bmsafemap_hashhead *, int,
 	    struct bmsafemap **);
 static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
 	    int cg, struct bmsafemap *);
 static	int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
 	    struct newblk **);
 static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
 static	int inodedep_find(struct inodedep_hashhead *, ino_t,
 	    struct inodedep **);
 static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
 static	int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
 	    int, struct pagedep **);
 static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
 	    struct pagedep **);
 static	void pause_timer(void *);
 static	int request_cleanup(struct mount *, int);
 static	int softdep_request_cleanup_flush(struct mount *, struct ufsmount *);
 static	void schedule_cleanup(struct mount *);
 static void softdep_ast_cleanup_proc(struct thread *);
 static struct ufsmount *softdep_bp_to_mp(struct buf *bp);
 static	int process_worklist_item(struct mount *, int, int);
 static	void process_removes(struct vnode *);
 static	void process_truncates(struct vnode *);
 static	void jwork_move(struct workhead *, struct workhead *);
 static	void jwork_insert(struct workhead *, struct jsegdep *);
 static	void add_to_worklist(struct worklist *, int);
 static	void wake_worklist(struct worklist *);
 static	void wait_worklist(struct worklist *, char *);
 static	void remove_from_worklist(struct worklist *);
 static	void softdep_flush(void *);
 static	void softdep_flushjournal(struct mount *);
 static	int softdep_speedup(struct ufsmount *);
 static	void worklist_speedup(struct mount *);
 static	int journal_mount(struct mount *, struct fs *, struct ucred *);
 static	void journal_unmount(struct ufsmount *);
 static	int journal_space(struct ufsmount *, int);
 static	void journal_suspend(struct ufsmount *);
 static	int journal_unsuspend(struct ufsmount *ump);
 static	void softdep_prelink(struct vnode *, struct vnode *);
 static	void add_to_journal(struct worklist *);
 static	void remove_from_journal(struct worklist *);
 static	bool softdep_excess_items(struct ufsmount *, int);
 static	void softdep_process_journal(struct mount *, struct worklist *, int);
 static	struct jremref *newjremref(struct dirrem *, struct inode *,
 	    struct inode *ip, off_t, nlink_t);
 static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
 	    uint16_t);
 static	inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
 	    uint16_t);
 static	inline struct jsegdep *inoref_jseg(struct inoref *);
 static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
 static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
 	    ufs2_daddr_t, int);
 static	void adjust_newfreework(struct freeblks *, int);
 static	struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
 static	void move_newblock_dep(struct jaddref *, struct inodedep *);
 static	void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
 static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
 	    ufs2_daddr_t, long, ufs_lbn_t);
 static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
 	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
 static	int jwait(struct worklist *, int);
 static	struct inodedep *inodedep_lookup_ip(struct inode *);
 static	int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
 static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
 static	void handle_jwork(struct workhead *);
 static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
 	    struct mkdir **);
 static	struct jblocks *jblocks_create(void);
 static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
 static	void jblocks_free(struct jblocks *, struct mount *, int);
 static	void jblocks_destroy(struct jblocks *);
 static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
 
 /*
  * Exported softdep operations.
  */
 static	void softdep_disk_io_initiation(struct buf *);
 static	void softdep_disk_write_complete(struct buf *);
 static	void softdep_deallocate_dependencies(struct buf *);
 static	int softdep_count_dependencies(struct buf *bp, int);
 
 /*
  * Global lock over all of soft updates.
  */
 /* The global mutex, initialized at boot through MTX_SYSINIT. */
 static struct mtx lk;
 MTX_SYSINIT(softdep_lock, &lk, "global softdep", MTX_DEF);
 
 /* Lock, unlock and assert-owned wrappers; each takes a pointer to a mutex. */
 #define ACQUIRE_GBLLOCK(lk)	mtx_lock(lk)
 #define FREE_GBLLOCK(lk)	mtx_unlock(lk)
 #define GBLLOCK_OWNED(lk)	mtx_assert((lk), MA_OWNED)
 
 /*
  * Per-filesystem soft-updates locking.
  */
 /*
  * All of these operate on the rwlock um_softdep->sd_fslock of the given
  * ufsmount, always in write mode (try-lock, lock, unlock, assert).
  */
 #define LOCK_PTR(ump)		(&(ump)->um_softdep->sd_fslock)
 #define TRY_ACQUIRE_LOCK(ump)	rw_try_wlock(&(ump)->um_softdep->sd_fslock)
 #define ACQUIRE_LOCK(ump)	rw_wlock(&(ump)->um_softdep->sd_fslock)
 #define FREE_LOCK(ump)		rw_wunlock(&(ump)->um_softdep->sd_fslock)
 #define LOCK_OWNED(ump)		rw_assert(&(ump)->um_softdep->sd_fslock, \
 				    RA_WLOCKED)
 
 /* Allow / disallow recursion on a buffer's b_lock. */
 #define	BUF_AREC(bp)		lockallowrecurse(&(bp)->b_lock)
 #define	BUF_NOREC(bp)		lockdisablerecurse(&(bp)->b_lock)
 
 /*
  * Worklist queue management.
  * These routines require that the lock be held.
  */
 #ifndef /* NOT */ INVARIANTS
 /*
  * Without INVARIANTS the list operations are plain macros that set or
  * clear ONWORKLIST and link or unlink the item; the _UNLOCKED variants
  * are aliases because no lock assertion is made.
  */
 #define WORKLIST_INSERT(head, item) do {	\
 	(item)->wk_state |= ONWORKLIST;		\
 	LIST_INSERT_HEAD(head, item, wk_list);	\
 } while (0)
 #define WORKLIST_REMOVE(item) do {		\
 	(item)->wk_state &= ~ONWORKLIST;	\
 	LIST_REMOVE(item, wk_list);		\
 } while (0)
 #define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
 #define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
 
 #else /* INVARIANTS */
 /*
  * With INVARIANTS the operations are checking functions.  The macros pass
  * the caller's function name and line number, plus a flag telling the
  * function whether to assert that the per-filesystem lock is held (1) or
  * not (0, the _UNLOCKED variants).
  */
 static	void worklist_insert(struct workhead *, struct worklist *, int,
 	const char *, int);
 static	void worklist_remove(struct worklist *, int, const char *, int);
 
 #define WORKLIST_INSERT(head, item) \
 	worklist_insert(head, item, 1, __func__, __LINE__)
 #define WORKLIST_INSERT_UNLOCKED(head, item)\
 	worklist_insert(head, item, 0, __func__, __LINE__)
 #define WORKLIST_REMOVE(item)\
 	worklist_remove(item, 1, __func__, __LINE__)
 #define WORKLIST_REMOVE_UNLOCKED(item)\
 	worklist_remove(item, 0, __func__, __LINE__)
 
 /*
  * INVARIANTS version of WORKLIST_INSERT: put item at the head of the list.
  *
  * If locked is non-zero, assert that the per-filesystem lock of the item's
  * mount is held.  Panics if the item is already marked ONWORKLIST,
  * reporting where it was previously recorded.  Otherwise marks the item
  * ONWORKLIST and records the caller's func/line in it.
  */
 static void
 worklist_insert(struct workhead *head, struct worklist *item, int locked,
     const char *func, int line)
 {
 	if (locked)
 		LOCK_OWNED(VFSTOUFS(item->wk_mp));
 	if (item->wk_state & ONWORKLIST)
 		panic("worklist_insert: %p %s(0x%X) already on list, "
 		    "added in function %s at line %d",
 		    item, TYPENAME(item->wk_type), item->wk_state,
 		    item->wk_func, item->wk_line);
 	item->wk_state |= ONWORKLIST;
 	item->wk_func = func;
 	item->wk_line = line;
 	LIST_INSERT_HEAD(head, item, wk_list);
 }
 
 /*
  * INVARIANTS version of WORKLIST_REMOVE: unlink item from its list.
  *
  * If locked is non-zero, assert that the per-filesystem lock of the item's
  * mount is held.  Panics if the item is not marked ONWORKLIST, reporting
  * the func/line last recorded in it.  Otherwise clears ONWORKLIST and
  * records the caller's func/line in the item.
  */
 static void
 worklist_remove(struct worklist *item, int locked, const char *func, int line)
 {
 	if (locked)
 		LOCK_OWNED(VFSTOUFS(item->wk_mp));
 	if ((item->wk_state & ONWORKLIST) == 0)
 		panic("worklist_remove: %p %s(0x%X) not on list, "
 		    "removed in function %s at line %d",
 		    item, TYPENAME(item->wk_type), item->wk_state,
 		    item->wk_func, item->wk_line);
 	item->wk_state &= ~ONWORKLIST;
 	item->wk_func = func;
 	item->wk_line = line;
 	LIST_REMOVE(item, wk_list);
 }
 #endif /* INVARIANTS */
 
 /*
  * Merge two jsegdeps keeping only the oldest one as newer references
  * can't be discarded until after older references.
  */
 static inline struct jsegdep *
 jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
 {
 	struct jsegdep *keep, *drop;
 
 	/* Nothing to merge with: "one" survives unchanged. */
 	if (two == NULL)
 		return (one);
 
 	/*
 	 * Keep the jsegdep with the lower segment sequence number; on a
 	 * tie "one" is kept.  The other is unlinked and freed.
 	 */
 	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
 		keep = two;
 		drop = one;
 	} else {
 		keep = one;
 		drop = two;
 	}
 	WORKLIST_REMOVE(&drop->jd_list);
 	free_jsegdep(drop);
 
 	return (keep);
 }
 
 /*
  * If two freedeps are compatible free one to reduce list size.
  */
 static inline struct freedep *
 freedep_merge(struct freedep *one, struct freedep *two)
 {
 	/*
 	 * "two" is redundant only when it exists and refers to the same
 	 * freework as "one"; in that case unlink and free it.  "one" is
 	 * returned in every case.
 	 */
 	if (two != NULL && one->fd_freework == two->fd_freework) {
 		WORKLIST_REMOVE(&two->fd_list);
 		free_freedep(two);
 	}
 	return (one);
 }
 
 /*
  * Move journal work from one list to another.  Duplicate freedeps and
  * jsegdeps are coalesced to keep the lists as small as possible.
  */
 static void
 jwork_move(struct workhead *dst, struct workhead *src)
 {
 	struct freedep *freedep;
 	struct jsegdep *jsegdep;
 	struct worklist *wkn;
 	struct worklist *wk;
 
 	KASSERT(dst != src,
 	    ("jwork_move: dst == src"));
 	freedep = NULL;
 	jsegdep = NULL;
 	/*
 	 * First coalesce what is already on dst, remembering the surviving
 	 * jsegdep and freedep.  The _SAFE iterator is needed because a
 	 * merge may free the item currently being visited.
 	 */
 	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
 		if (wk->wk_type == D_JSEGDEP)
 			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
 		else if (wk->wk_type == D_FREEDEP)
 			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
 	}
 
 	/*
 	 * Then drain src onto dst one item at a time, merging each moved
 	 * jsegdep or freedep with the current survivor.
 	 */
 	while ((wk = LIST_FIRST(src)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		WORKLIST_INSERT(dst, wk);
 		if (wk->wk_type == D_JSEGDEP)
 			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
 		else if (wk->wk_type == D_FREEDEP)
 			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
 	}
 }
 
 /*
  * Add jsegdep to dst while keeping at most the older of it and the first
  * jsegdep already found on dst.
  *
  * If dst holds no jsegdep, jsegdep is simply inserted.  Otherwise the one
  * with the lower segment sequence number stays on the list and the other
  * is freed; on a tie the existing entry stays.  The caller must therefore
  * not use jsegdep after this call.
  */
 static void
 jwork_insert(struct workhead *dst, struct jsegdep *jsegdep)
 {
 	struct jsegdep *jsegdepn;
 	struct worklist *wk;
 
 	/* Find the first jsegdep on dst; wk is NULL if there is none. */
 	LIST_FOREACH(wk, dst, wk_list)
 		if (wk->wk_type == D_JSEGDEP)
 			break;
 	if (wk == NULL) {
 		WORKLIST_INSERT(dst, &jsegdep->jd_list);
 		return;
 	}
 	jsegdepn = WK_JSEGDEP(wk);
 	if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
 		/* The new one is older: it replaces the existing entry. */
 		WORKLIST_REMOVE(wk);
 		free_jsegdep(jsegdepn);
 		WORKLIST_INSERT(dst, &jsegdep->jd_list);
 	} else
 		free_jsegdep(jsegdep);
 }
 
 /*
  * Routines for tracking and managing workitems.
  */
 static	void workitem_free(struct worklist *, int);
 static	void workitem_alloc(struct worklist *, int, struct mount *);
 static	void workitem_reassign(struct worklist *, int);
 
 /*
  * Convenience wrappers that cast a dependency structure pointer to its
  * struct worklist header before calling the function.
  */
 #define	WORKITEM_FREE(item, type) \
 	workitem_free((struct worklist *)(item), (type))
 #define	WORKITEM_REASSIGN(item, type) \
 	workitem_reassign((struct worklist *)(item), (type))
 
 /*
  * Release a work item and its accounting.
  *
  * The per-filesystem lock of the item's mount must be held (asserted).
  * Wakes any thread sleeping on the item if IOWAITING is set, decrements
  * the per-mount and global counters for the item's type, wakes waiters on
  * softdep_deps when the per-mount total reaches zero with softdep_req
  * set, and frees the memory using the malloc type for "type".
  *
  * Under INVARIANTS it panics if the item is still on a worklist, or if
  * "type" differs from the item's recorded type (a "type" of D_NEWBLK is
  * exempt from that check).
  */
 static void
 workitem_free(struct worklist *item, int type)
 {
 	struct ufsmount *ump;
 
 #ifdef INVARIANTS
 	if (item->wk_state & ONWORKLIST)
 		panic("workitem_free: %s(0x%X) still on list, "
 		    "added in function %s at line %d",
 		    TYPENAME(item->wk_type), item->wk_state,
 		    item->wk_func, item->wk_line);
 	if (item->wk_type != type && type != D_NEWBLK)
 		panic("workitem_free: type mismatch %s != %s",
 		    TYPENAME(item->wk_type), TYPENAME(type));
 #endif
 	if (item->wk_state & IOWAITING)
 		wakeup(item);
 	ump = VFSTOUFS(item->wk_mp);
 	LOCK_OWNED(ump);
 	KASSERT(ump->softdep_deps > 0,
 	    ("workitem_free: %s: softdep_deps going negative",
 	    ump->um_fs->fs_fsmnt));
 	if (--ump->softdep_deps == 0 && ump->softdep_req)
 		wakeup(&ump->softdep_deps);
 	KASSERT(dep_current[item->wk_type] > 0,
 	    ("workitem_free: %s: dep_current[%s] going negative",
 	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
 	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
 	    ("workitem_free: %s: softdep_curdeps[%s] going negative",
 	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
 	/* The global counter is adjusted atomically, without taking lk. */
 	atomic_subtract_long(&dep_current[item->wk_type], 1);
 	ump->softdep_curdeps[item->wk_type] -= 1;
 #ifdef INVARIANTS
 	LIST_REMOVE(item, wk_all);
 #endif
 	free(item, DtoM(type));
 }
 
 /*
  * Initialize the worklist header of an already-allocated work item and
  * account for it.
  *
  * Sets the item's type and mount and clears its state, bumps the global
  * current/total counters (updating the high-use mark) under the global
  * lock, then bumps the per-mount counters under the per-filesystem lock.
  * Both locks are acquired and released here.  Under INVARIANTS the item
  * is also linked onto the mount's per-type softdep_alldeps list.
  */
 static void
 workitem_alloc(struct worklist *item, int type, struct mount *mp)
 {
 	struct ufsmount *ump;
 
 	item->wk_type = type;
 	item->wk_mp = mp;
 	item->wk_state = 0;
 
 	ump = VFSTOUFS(mp);
 	ACQUIRE_GBLLOCK(&lk);
 	dep_current[type]++;
 	if (dep_current[type] > dep_highuse[type])
 		dep_highuse[type] = dep_current[type];
 	dep_total[type]++;
 	FREE_GBLLOCK(&lk);
 	ACQUIRE_LOCK(ump);
 	ump->softdep_curdeps[type] += 1;
 	ump->softdep_deps++;
 	ump->softdep_accdeps++;
 #ifdef INVARIANTS
 	LIST_INSERT_HEAD(&ump->softdep_alldeps[type], item, wk_all);
 #endif
 	FREE_LOCK(ump);
 }
 
 /*
  * Change the type of an existing work item to newtype and move its
  * accounting from the old type to the new one.
  *
  * The per-filesystem lock of the item's mount must be held (asserted).
  * The per-mount counts are moved directly; the global counts are moved
  * under the global lock, which also bumps dep_total[newtype] and updates
  * the high-use mark for newtype.
  */
 static void
 workitem_reassign(struct worklist *item, int newtype)
 {
 	struct ufsmount *ump;
 
 	ump = VFSTOUFS(item->wk_mp);
 	LOCK_OWNED(ump);
 	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
 	    ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
 	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
 	ump->softdep_curdeps[item->wk_type] -= 1;
 	ump->softdep_curdeps[newtype] += 1;
 	KASSERT(dep_current[item->wk_type] > 0,
 	    ("workitem_reassign: %s: dep_current[%s] going negative",
 	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
 	ACQUIRE_GBLLOCK(&lk);
 	dep_current[newtype]++;
 	dep_current[item->wk_type]--;
 	if (dep_current[newtype] > dep_highuse[newtype])
 		dep_highuse[newtype] = dep_current[newtype];
 	dep_total[newtype]++;
 	FREE_GBLLOCK(&lk);
 	/* Only now switch the recorded type; it indexed the old counters. */
 	item->wk_type = newtype;
 }
 
 /*
  * Workitem queue management
  */
 static int max_softdeps;	/* maximum number of structs before slowdown */
 static int tickdelay = 2;	/* number of ticks to pause during slowdown */
 static int proc_waiting;	/* tracks whether we have a timeout posted */
 static int *stat_countp;	/* statistic to count in proc_waiting timeout */
 static struct callout softdep_callout;
 static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
 static int req_clear_remove;	/* syncer process flush some freeblks */
 static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
 
 /*
  * runtime statistics
  *
  * Each counter below is exported through a SYSCTL_INT() entry under
  * the _debug_softdep node by the declarations that follow this list.
  */
 static int stat_flush_threads;	/* number of softdep flushing threads */
 static int stat_worklist_push;	/* number of worklist cleanups */
 static int stat_blk_limit_push;	/* number of times block limit neared */
 static int stat_ino_limit_push;	/* number of times inode limit neared */
 static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
 static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
 static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
 static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
 static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
 static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
 static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
 static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
 static int stat_journal_min;	/* Times hit journal min threshold */
 static int stat_journal_low;	/* Times hit journal low threshold */
 static int stat_journal_wait;	/* Times blocked in jwait(). */
 static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
 static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
 static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
 static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
 static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
 static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
 static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
 static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
 static int stat_cleanup_failures; /* Number of cleanup requests that failed */
 static int stat_emptyjblocks; /* Number of potentially empty journal blocks */
 
 /*
  * Export the tunables and statistics under the _debug_softdep sysctl
  * node.  max_softdeps, tickdelay and flushcache are read-write tunables,
  * flush_threads and emptyjblocks are read-only, and every other entry
  * is a read-write counter tagged CTLFLAG_STATS.  All description
  * strings are currently empty.
  */
 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
     &max_softdeps, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
     &tickdelay, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
     &stat_flush_threads, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_worklist_push, 0,"");
 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_push, 0,"");
 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_push, 0,"");
 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_hit, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_hit, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_sync_limit_hit, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_indir_blk_ptrs, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_inode_bitmap, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_direct_blk_ptrs, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_dir_entry, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_jaddref, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_jnewblk, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_low, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_min, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_wait, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_filepage, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_freeblks, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_inode, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_newblk, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_blkrequests, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_inorequests, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_high_delay, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_retries, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures,
     CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_failures, 0, "");
 
 SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
     &softdep_flushcache, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
     &stat_emptyjblocks, 0, "");
 
 /* Reference the _vfs_ffs sysctl node so entries can be attached to it. */
 SYSCTL_DECL(_vfs_ffs);
 
 /* Whether to recompute the summary at mount time */
 static int compute_summary_at_mount = 0;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
 	   &compute_summary_at_mount, 0, "Recompute summary at mount");
 /* When non-zero, softdep_flush() prints flusher thread start/stop messages */
 static int print_threads = 0;
 SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
     &print_threads, 0, "Notify flusher thread start/stop");
 
 /* List of all filesystems mounted with soft updates */
 static TAILQ_HEAD(, mount_softdeps) softdepmounts;
 
 /*
  * This function cleans the worklist for a filesystem.
  * Each filesystem running with soft dependencies gets its own
  * thread to run in this function. The thread is started up in
  * softdep_mount and shutdown in softdep_unmount. They show up
  * as part of the kernel "bufdaemon" process whose process
  * entry is available in bufdaemonproc.
  *
  * addr is the struct mount pointer of the filesystem to service.
  * The function never returns to its caller: it loops until FLUSH_EXIT
  * is seen in ump->softdep_flags and then calls kthread_exit().
  */
 /* Incremented by softdep_speedup() when no other filesystem qualifies. */
 static int searchfailed;
 extern struct proc *bufdaemonproc;
 static void
 softdep_flush(addr)
 	void *addr;
 {
 	struct mount *mp;
 	struct thread *td;
 	struct ufsmount *ump;
 
 	td = curthread;
 	td->td_pflags |= TDP_NORUNNINGBUF;
 	mp = (struct mount *)addr;
 	ump = VFSTOUFS(mp);
 	atomic_add_int(&stat_flush_threads, 1);
 	/*
 	 * Announce that startup is complete: clear FLUSH_STARTING and
 	 * wake anyone sleeping on softdep_flushtd.
 	 */
 	ACQUIRE_LOCK(ump);
 	ump->softdep_flags &= ~FLUSH_STARTING;
 	wakeup(&ump->softdep_flushtd);
 	FREE_LOCK(ump);
 	if (print_threads) {
 		if (stat_flush_threads == 1)
 			printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
 			    bufdaemonproc->p_pid);
 		printf("Start thread %s\n", td->td_name);
 	}
 	for (;;) {	
 		/*
 		 * Keep processing while work was done or, on a journaled
 		 * mount, while the journal is marked suspended; check for
 		 * a kthread suspension request between passes.
 		 */
 		while (softdep_process_worklist(mp, 0) > 0 ||
 		    (MOUNTEDSUJ(mp) &&
 		    VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
 			kthread_suspend_check();
 		ACQUIRE_LOCK(ump);
 		/*
 		 * Sleep for up to hz / 2 ticks unless a cleanup or exit
 		 * request is already pending.
 		 */
 		if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
 			msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
 			    "sdflush", hz / 2);
 		ump->softdep_flags &= ~FLUSH_CLEANUP;
 		/*
 		 * Check to see if we are done and need to exit.
 		 */
 		if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
 			FREE_LOCK(ump);
 			continue;
 		}
 		/*
 		 * Acknowledge the exit request: clear the flag and wake
 		 * whoever sleeps on softdep_flags before terminating.
 		 */
 		ump->softdep_flags &= ~FLUSH_EXIT;
 		FREE_LOCK(ump);
 		wakeup(&ump->softdep_flags);
 		if (print_threads)
 			printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups);
 		atomic_subtract_int(&stat_flush_threads, 1);
 		kthread_exit();
 		panic("kthread_exit failed\n");
 	}
 }
 
 /*
  * Ask the flusher thread of mp to run now.  Unless a cleanup or exit
  * request is already recorded in softdep_flags, FLUSH_CLEANUP is set;
  * in every case sleepers on softdep_flushtd are woken.  LOCK_OWNED()
  * is invoked on the mount's softdep lock before the flags are touched.
  */
 static void
 worklist_speedup(struct mount *mp)
 {
 	struct ufsmount *ump = VFSTOUFS(mp);
 	const int pending = FLUSH_CLEANUP | FLUSH_EXIT;
 
 	LOCK_OWNED(ump);
 	if (!(ump->softdep_flags & pending))
 		ump->softdep_flags |= FLUSH_CLEANUP;
 	wakeup(&ump->softdep_flushtd);
 }
 
 /*
  * Send a BIO_SPEEDUP request to the buffer object of ump.
  *
  * Nothing is done unless UM_CANSPEEDUP is set in ump->um_flags.
  * Otherwise a zeroed buf is allocated, loaded with shortage as its
  * byte count and flags as its I/O flags, handed to g_vfs_strategy(),
  * waited on with bufwait() and then freed.
  */
 static void
 softdep_send_speedup(struct ufsmount *ump, size_t shortage, u_int flags)
 {
 	struct buf *speedbp;
 
 	if ((ump->um_flags & UM_CANSPEEDUP) != 0) {
 		speedbp = malloc(sizeof(*speedbp), M_TRIM, M_WAITOK | M_ZERO);
 		speedbp->b_bcount = shortage;
 		speedbp->b_ioflags = flags;
 		speedbp->b_iocmd = BIO_SPEEDUP;
 		g_vfs_strategy(ump->um_bo, speedbp);
 		bufwait(speedbp);
 		free(speedbp, M_TRIM);
 	}
 }
 
 /*
  * Speed up cleanup of dependencies for ump: wake its own flusher via
  * worklist_speedup(), call bd_speedup(), and, when a global shortage is
  * flagged, wake the flusher of one other filesystem as well.
  * LOCK_OWNED() is invoked on ump's softdep lock at entry.
  * Returns the value of speedup_syncer().
  */
 static int
 softdep_speedup(ump)
 	struct ufsmount *ump;
 {
 	struct ufsmount *altump;
 	struct mount_softdeps *sdp;
 
 	LOCK_OWNED(ump);
 	worklist_speedup(ump->um_mountp);
 	bd_speedup();
 	/*
 	 * If we have global shortages, then we need other
 	 * filesystems to help with the cleanup. Here we wakeup a
 	 * flusher thread for a filesystem that is over its fair
 	 * share of resources.
 	 */
 	if (req_clear_inodedeps || req_clear_remove) {
 		ACQUIRE_GBLLOCK(&lk);
 		/*
 		 * Pick the first other mount whose inodedep (or dirrem)
 		 * count exceeds its per-flusher-thread share and whose
 		 * lock can be taken without blocking.  On a match the
 		 * loop exits with altump locked.
 		 */
 		TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
 			if ((altump = sdp->sd_ump) == ump)
 				continue;
 			if (((req_clear_inodedeps &&
 			    altump->softdep_curdeps[D_INODEDEP] >
 			    max_softdeps / stat_flush_threads) ||
 			    (req_clear_remove &&
 			    altump->softdep_curdeps[D_DIRREM] >
 			    (max_softdeps / 2) / stat_flush_threads)) &&
 			    TRY_ACQUIRE_LOCK(altump))
 				break;
 		}
 		if (sdp == NULL) {
 			/* No candidate found; just record the miss. */
 			searchfailed++;
 			FREE_GBLLOCK(&lk);
 		} else {
 			/*
 			 * Move to the end of the list so we pick a
 			 * different one on our next try.
 			 */
 			TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
 			TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
 			FREE_GBLLOCK(&lk);
 			/* Same request/wakeup as worklist_speedup(), on altump. */
 			if ((altump->softdep_flags &
 			    (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
 				altump->softdep_flags |= FLUSH_CLEANUP;
 			altump->um_softdep->sd_cleanups++;
 			wakeup(&altump->softdep_flushtd);
 			FREE_LOCK(altump);
 		}
 	}
 	return (speedup_syncer());
 }
 
 /*
  * Add an item to the end of the work queue.
  * This routine requires that the lock be held.
  * This is the only routine that adds items to the list.
  * The following routine is the only one that removes items
  * and does so in order from first to last.
  *
  * flags is a mask of WK_HEAD (insert at the head instead of the tail)
  * and WK_NODELAY (also call worklist_speedup() for the mount).
  * Panics if the item already has ONWORKLIST set.
  */
 
 #define	WK_HEAD		0x0001	/* Add to HEAD. */
 #define	WK_NODELAY	0x0002	/* Process immediately. */
 
 static void
 add_to_worklist(wk, flags)
 	struct worklist *wk;
 	int flags;
 {
 	struct ufsmount *ump;
 
 	ump = VFSTOUFS(wk->wk_mp);
 	LOCK_OWNED(ump);
 	if (wk->wk_state & ONWORKLIST)
 		panic("add_to_worklist: %s(0x%X) already on list",
 		    TYPENAME(wk->wk_type), wk->wk_state);
 	wk->wk_state |= ONWORKLIST;
 	if (ump->softdep_on_worklist == 0) {
 		/* Empty list: the new item is both head and tail. */
 		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
 		ump->softdep_worklist_tail = wk;
 	} else if (flags & WK_HEAD) {
 		/* Head insertion leaves the tail pointer untouched. */
 		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
 	} else {
 		/* Append after the recorded tail and advance the tail. */
 		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
 		ump->softdep_worklist_tail = wk;
 	}
 	ump->softdep_on_worklist += 1;
 	if (flags & WK_NODELAY)
 		worklist_speedup(wk->wk_mp);
 }
 
 /*
  * Take wk off its mount's pending work queue and decrement the count
  * of queued items.  When wk is the recorded tail, the tail pointer is
  * first moved back to wk's le_prev link.
  * NOTE(review): the cast of le_prev to a struct worklist pointer
  * presumably relies on wk_list being the first member of
  * struct worklist -- confirm against the structure definition.
  */
 static void
 remove_from_worklist(struct worklist *wk)
 {
 	struct ufsmount *ump = VFSTOUFS(wk->wk_mp);
 
 	if (wk == ump->softdep_worklist_tail) {
 		ump->softdep_worklist_tail =
 		    (struct worklist *)wk->wk_list.le_prev;
 	}
 	WORKLIST_REMOVE(wk);
 	ump->softdep_on_worklist--;
 }
 
 /*
  * If IOWAITING is set on wk, clear it and wake the threads sleeping
  * on wk; otherwise do nothing.
  */
 static void
 wake_worklist(struct worklist *wk)
 {
 	if ((wk->wk_state & IOWAITING) == 0)
 		return;
 	wk->wk_state &= ~IOWAITING;
 	wakeup(wk);
 }
 
 /*
  * Mark wk as IOWAITING and sleep on it, passing the softdep lock of
  * wk's mount to msleep().  wmesg is the wait message; the timeout is 0.
  */
 static void
 wait_worklist(struct worklist *wk, char *wmesg)
 {
 	struct ufsmount *owner = VFSTOUFS(wk->wk_mp);
 
 	wk->wk_state |= IOWAITING;
 	msleep(wk, LOCK_PTR(owner), PVM, wmesg, 0);
 }
 
 /*
  * Process that runs once per second to handle items in the background queue.
  *
  * Note that we ensure that everything is done in the order in which they
  * appear in the queue. The code below depends on this property to ensure
  * that blocks of a file are freed before the inode itself is freed. This
  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
  * until all the old ones have been purged from the dependency lists.
  *
  * mp is the mount to service.  A non-zero full requests a complete pass:
  * the journal is processed with MNT_WAIT and the one-second cap is not
  * applied.  Returns the accumulated count reported by
  * process_worklist_item(), or 0 if mp is not mounted with soft updates.
  */
 static int 
 softdep_process_worklist(mp, full)
 	struct mount *mp;
 	int full;
 {
 	int cnt, matchcnt;
 	struct ufsmount *ump;
 	long starttime;
 
 	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
 	if (MOUNTEDSOFTDEP(mp) == 0)
 		return (0);
 	matchcnt = 0;
 	ump = VFSTOUFS(mp);
 	ACQUIRE_LOCK(ump);
 	starttime = time_second;
 	softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
 	check_clear_deps(mp);
 	while (ump->softdep_on_worklist > 0) {
 		/*
 		 * Handle up to 10 items per call.  A return of 0 ends the
 		 * loop; note that a negative return is added to matchcnt
 		 * like any other non-zero value.
 		 */
 		if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
 			break;
 		else
 			matchcnt += cnt;
 		check_clear_deps(mp);
 		/*
 		 * We do not generally want to stop for buffer space, but if
 		 * we are really being a buffer hog, we will stop and wait.
 		 */
 		if (should_yield()) {
 			FREE_LOCK(ump);
 			kern_yield(PRI_USER);
 			bwillwrite();
 			ACQUIRE_LOCK(ump);
 		}
 		/*
 		 * Never allow processing to run for more than one
 		 * second. This gives the syncer thread the opportunity
 		 * to pause if appropriate.
 		 */
 		if (!full && starttime != time_second)
 			break;
 	}
 	/* Only the non-full (background) pass calls journal_unsuspend(). */
 	if (full == 0)
 		journal_unsuspend(ump);
 	FREE_LOCK(ump);
 	return (matchcnt);
 }
 
 /*
  * Process all removes associated with a vnode if we are running out of
  * journal space.  Any other process which attempts to flush these will
  * be unable as we have the vnodes locked.
  *
  * LOCK_OWNED() is invoked on the mount's softdep lock at entry; the
  * lock is dropped around each handle_workitem_remove() call and is
  * held again on return.
  */
 static void
 process_removes(vp)
 	struct vnode *vp;
 {
 	struct inodedep *inodedep;
 	struct dirrem *dirrem;
 	struct ufsmount *ump;
 	struct mount *mp;
 	ino_t inum;
 
 	mp = vp->v_mount;
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
 	inum = VTOI(vp)->i_number;
 	for (;;) {
 top:
 		/* Look the inodedep up again on every pass; done if none. */
 		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
 			return;
 		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
 			/*
 			 * If another thread is trying to lock this vnode
 			 * it will fail but we must wait for it to do so
 			 * before we can proceed.
 			 */
 			if (dirrem->dm_state & INPROGRESS) {
 				wait_worklist(&dirrem->dm_list, "pwrwait");
 				goto top;
 			}
 			/* Stop at the first dirrem that is complete and queued. */
 			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == 
 			    (COMPLETE | ONWORKLIST))
 				break;
 		}
 		/* The whole list was scanned without finding one to process. */
 		if (dirrem == NULL)
 			return;
 		remove_from_worklist(&dirrem->dm_list);
 		FREE_LOCK(ump);
 		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
 			panic("process_removes: suspended filesystem");
 		handle_workitem_remove(dirrem, 0);
 		vn_finished_secondary_write(mp);
 		ACQUIRE_LOCK(ump);
 	}
 }
 
 /*
  * Process all truncations associated with a vnode if we are running out
  * of journal space.  This is called when the vnode lock is already held
  * and no other process can clear the truncation.  This function does
  * not return a value.
  *
  * LOCK_OWNED() is invoked on the mount's softdep lock at entry.  The
  * lock is released around ffs_update(), handle_workitem_freeblocks(),
  * sync_cgs() and ffs_sync_snap(), and is held again on return.
  */
 static void
 process_truncates(vp)
 	struct vnode *vp;
 {
 	struct inodedep *inodedep;
 	struct freeblks *freeblks;
 	struct ufsmount *ump;
 	struct mount *mp;
 	ino_t inum;
 	int cgwait;
 
 	mp = vp->v_mount;
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
 	inum = VTOI(vp)->i_number;
 	for (;;) {
 		/* Look the inodedep up again on every pass; done if none. */
 		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
 			return;
 		cgwait = 0;
 		/*
 		 * Act on the first freeblks that needs something done and
 		 * break out; leaving the scan with freeblks != NULL makes
 		 * the outer loop start over with a fresh lookup.
 		 */
 		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
 			/* Journal entries not yet written.  */
 			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
 				jwait(&LIST_FIRST(
 				    &freeblks->fb_jblkdephd)->jb_list,
 				    MNT_WAIT);
 				break;
 			}
 			/* Another thread is executing this item. */
 			if (freeblks->fb_state & INPROGRESS) {
 				wait_worklist(&freeblks->fb_list, "ptrwait");
 				break;
 			}
 			/* Freeblks is waiting on a inode write. */
 			if ((freeblks->fb_state & COMPLETE) == 0) {
 				FREE_LOCK(ump);
 				ffs_update(vp, 1);
 				ACQUIRE_LOCK(ump);
 				break;
 			}
 			/* Fully complete and still queued: run it ourselves. */
 			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
 			    (ALLCOMPLETE | ONWORKLIST)) {
 				remove_from_worklist(&freeblks->fb_list);
 				freeblks->fb_state |= INPROGRESS;
 				FREE_LOCK(ump);
 				if (vn_start_secondary_write(NULL, &mp,
 				    V_NOWAIT))
 					panic("process_truncates: "
 					    "suspended filesystem");
 				handle_workitem_freeblocks(freeblks, 0);
 				vn_finished_secondary_write(mp);
 				ACQUIRE_LOCK(ump);
 				break;
 			}
 			/* Count entries with a non-zero fb_cgwait. */
 			if (freeblks->fb_cgwait)
 				cgwait++;
 		}
 		/*
 		 * Some entries had fb_cgwait set: sync the cylinder groups
 		 * and snapshots with the lock dropped, then rescan.
 		 */
 		if (cgwait) {
 			FREE_LOCK(ump);
 			sync_cgs(mp, MNT_WAIT);
 			ffs_sync_snap(mp, MNT_WAIT);
 			ACQUIRE_LOCK(ump);
 			continue;
 		}
 		/* The scan reached the end of the list: nothing left to do. */
 		if (freeblks == NULL)
 			break;
 	}
 	return;
 }
 
 /*
  * Process items on the worklist of mp until target items have been
  * handled successfully or the end of the list is reached.
  *
  * flags is passed through to handle_workitem_remove() and
  * handle_workitem_freeblocks().  Returns the number of items handled
  * without error, or -1 without doing anything when the current thread
  * has TDP_COWINPROGRESS set.  LOCK_OWNED() is invoked on the mount's
  * softdep lock; the lock is dropped while each item is handled and is
  * held again on return.
  */
 static int
 process_worklist_item(mp, target, flags)
 	struct mount *mp;
 	int target;
 	int flags;
 {
 	struct worklist sentinel;
 	struct worklist *wk;
 	struct ufsmount *ump;
 	int matchcnt;
 	int error;
 
 	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
 	/*
 	 * If we are being called because of a process doing a
 	 * copy-on-write, then it is not safe to write as we may
 	 * recurse into the copy-on-write routine.
 	 */
 	if (curthread->td_pflags & TDP_COWINPROGRESS)
 		return (-1);
 	PHOLD(curproc);	/* Don't let the stack go away. */
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
 	matchcnt = 0;
 	/*
 	 * A stack-allocated sentinel marks our position in the list so
 	 * the scan can resume from it after the lock has been dropped.
 	 * The item examined is always the one right after the sentinel.
 	 */
 	sentinel.wk_mp = NULL;
 	sentinel.wk_type = D_SENTINEL;
 	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
 	for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
 	    wk = LIST_NEXT(&sentinel, wk_list)) {
 		/*
 		 * The next entry is itself a sentinel (presumably placed
 		 * by another thread running this function): step over it.
 		 */
 		if (wk->wk_type == D_SENTINEL) {
 			LIST_REMOVE(&sentinel, wk_list);
 			LIST_INSERT_AFTER(wk, &sentinel, wk_list);
 			continue;
 		}
 		if (wk->wk_state & INPROGRESS)
 			panic("process_worklist_item: %p already in progress.",
 			    wk);
 		wk->wk_state |= INPROGRESS;
 		remove_from_worklist(wk);
 		FREE_LOCK(ump);
 		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
 			panic("process_worklist_item: suspended filesystem");
 		switch (wk->wk_type) {
 		case D_DIRREM:
 			/* removal of a directory entry */
 			error = handle_workitem_remove(WK_DIRREM(wk), flags);
 			break;
 
 		case D_FREEBLKS:
 			/* releasing blocks and/or fragments from a file */
 			error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
 			    flags);
 			break;
 
 		case D_FREEFRAG:
 			/* releasing a fragment when replaced as a file grows */
 			handle_workitem_freefrag(WK_FREEFRAG(wk));
 			error = 0;
 			break;
 
 		case D_FREEFILE:
 			/* releasing an inode when its link count drops to 0 */
 			handle_workitem_freefile(WK_FREEFILE(wk));
 			error = 0;
 			break;
 
 		default:
 			panic("%s_process_worklist: Unknown type %s",
 			    "softdep", TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 		vn_finished_secondary_write(mp);
 		ACQUIRE_LOCK(ump);
 		/* Success: count it and stop once target has been reached. */
 		if (error == 0) {
 			if (++matchcnt == target)
 				break;
 			continue;
 		}
 		/*
 		 * We have to retry the worklist item later.  Wake up any
 		 * waiters who may be able to complete it immediately and
 		 * add the item back to the head so we don't try to execute
 		 * it again.
 		 */
 		wk->wk_state &= ~INPROGRESS;
 		wake_worklist(wk);
 		add_to_worklist(wk, WK_HEAD);
 	}
 	/* Sentinel could've become the tail from remove_from_worklist. */
 	if (ump->softdep_worklist_tail == &sentinel)
 		ump->softdep_worklist_tail =
 		    (struct worklist *)sentinel.wk_list.le_prev;
 	LIST_REMOVE(&sentinel, wk_list);
 	PRELE(curproc);
 	return (matchcnt);
 }
 
 /*
  * Transfer every dependency attached to oldbp onto newbp, keeping the
  * items in their original order.
  *
  * Returns 0 at once when oldbp has no dependencies.  Otherwise returns
  * 1 if bmsafemap_backgroundwrite() reported non-zero for any moved
  * D_BMSAFEMAP item, else 0.  The softdep lock of the first item's
  * mount is held while the lists are modified.
  */
 int
 softdep_move_dependencies(struct buf *oldbp, struct buf *newbp)
 {
 	struct worklist *item, *last;
 	struct ufsmount *ump;
 	int redirty;
 
 	item = LIST_FIRST(&oldbp->b_dep);
 	if (item == NULL)
 		return (0);
 	KASSERT(MOUNTEDSOFTDEP(item->wk_mp) != 0,
 	    ("softdep_move_dependencies called on non-softdep filesystem"));
 	redirty = 0;
 	last = NULL;
 	ump = VFSTOUFS(item->wk_mp);
 	ACQUIRE_LOCK(ump);
 	for (item = LIST_FIRST(&oldbp->b_dep); item != NULL;
 	    item = LIST_FIRST(&oldbp->b_dep)) {
 		LIST_REMOVE(item, wk_list);
 		if (item->wk_type == D_BMSAFEMAP &&
 		    bmsafemap_backgroundwrite(WK_BMSAFEMAP(item), newbp))
 			redirty = 1;
 		/* Append behind the previously moved item, if any. */
 		if (last != NULL)
 			LIST_INSERT_AFTER(last, item, wk_list);
 		else
 			LIST_INSERT_HEAD(&newbp->b_dep, item, wk_list);
 		last = item;
 	}
 	FREE_LOCK(ump);
 
 	return (redirty);
 }
 
 /*
  * Drain the work list of a mount point.
  *
  * Each round runs a full softdep_process_worklist() pass and, when it
  * handled anything, fsyncs the mount's device vnode with MNT_WAIT so
  * that dependencies created by the flush can be picked up by the next
  * round.  The loop ends when a pass handles nothing or an fsync fails.
  *
  * *countp receives the total number of items processed.  Returns 0 or
  * the error from VOP_FSYNC().
  */
 int
 softdep_flushworklist(struct mount *oldmnt, int *countp, struct thread *td)
 {
 	struct ufsmount *ump;
 	struct vnode *devvp;
 	int error, processed;
 
 	*countp = 0;
 	error = 0;
 	ump = VFSTOUFS(oldmnt);
 	devvp = ump->um_devvp;
 	for (;;) {
 		processed = softdep_process_worklist(oldmnt, 1);
 		if (processed <= 0)
 			break;
 		*countp += processed;
 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_FSYNC(devvp, MNT_WAIT, td);
 		VOP_UNLOCK(devvp);
 		if (error != 0)
 			break;
 	}
 	return (error);
 }
 
 /* Maximum number of sleep-and-fsync rounds in softdep_waitidle(). */
 #define	SU_WAITIDLE_RETRIES	20
 /*
  * Wait for ump->softdep_deps to drop to zero, fsyncing the device
  * vnode after each sleep.
  *
  * flags is only examined by the KASSERT below.  Returns 0 on success,
  * the VOP_FSYNC() error if a sync fails, or EBUSY if dependencies
  * remain after SU_WAITIDLE_RETRIES rounds.
  */
 static int
 softdep_waitidle(struct mount *mp, int flags __unused)
 {
 	struct ufsmount *ump;
 	struct vnode *devvp;
 	struct thread *td;
 	int error, i;
 
 	ump = VFSTOUFS(mp);
 	devvp = ump->um_devvp;
 	td = curthread;
 	error = 0;
 	ACQUIRE_LOCK(ump);
 	for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
 		/* softdep_req is set while waiting and cleared after the loop. */
 		ump->softdep_req = 1;
 		KASSERT((flags & FORCECLOSE) == 0 ||
 		    ump->softdep_on_worklist == 0,
 		    ("softdep_waitidle: work added after flush"));
 		/*
 		 * PDROP: msleep() returns with the lock released (see
 		 * msleep(9)); it is retaken explicitly after the fsync.
 		 */
 		msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
 		    "softdeps", 10 * hz);
 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_FSYNC(devvp, MNT_WAIT, td);
 		VOP_UNLOCK(devvp);
 		ACQUIRE_LOCK(ump);
 		if (error != 0)
 			break;
 	}
 	ump->softdep_req = 0;
 	/* Retries exhausted with dependencies still outstanding. */
 	if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
 		error = EBUSY;
 		printf("softdep_waitidle: Failed to flush worklist for %p\n",
 		    mp);
 	}
 	FREE_LOCK(ump);
 	return (error);
 }
 
 /*
  * Flush all vnodes and worklist items associated with a specified mount point.
  *
  * flags is passed to ffs_flushfiles() (with EARLYFLUSH possibly or'ed
  * in) and to softdep_waitidle().  Returns 0 on success, the first error
  * from the flush or wait routines, or EBUSY when the flush loop runs
  * out of iterations on a non-unmount request or when work remains
  * after the unmount retries are used up.  Panics if the flush loop
  * runs out of iterations during an unmount.
  */
 int
 softdep_flushfiles(oldmnt, flags, td)
 	struct mount *oldmnt;
 	int flags;
 	struct thread *td;
 {
 #ifdef QUOTA
 	struct ufsmount *ump;
 	int i;
 #endif
 	int error, early, depcount, loopcnt, retry_flush_count, retry;
 	int morework;
 
 	KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
 	    ("softdep_flushfiles called on non-softdep filesystem"));
 	loopcnt = 10;
 	retry_flush_count = 3;
 retry_flush:
 	error = 0;
 
 	/*
 	 * Alternately flush the vnodes associated with the mount
 	 * point and process any dependencies that the flushing
 	 * creates. In theory, this loop can happen at most twice,
 	 * but we give it a few extra just to be sure.
 	 */
 	for (; loopcnt > 0; loopcnt--) {
 		/*
 		 * Do another flush in case any vnodes were brought in
 		 * as part of the cleanup operations.
 		 */
 		/*
 		 * EARLYFLUSH is requested only while unmounting and only
 		 * until the last retry (retry_flush_count == 1).
 		 */
 		early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
 		    MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
 		if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
 			break;
 		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
 		    depcount == 0)
 			break;
 	}
 	/*
 	 * If we are unmounting then it is an error to fail. If we
 	 * are simply trying to downgrade to read-only, then filesystem
 	 * activity can keep us busy forever, so we just fail with EBUSY.
 	 */
 	if (loopcnt == 0) {
 		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
 			panic("softdep_flushfiles: looping");
 		error = EBUSY;
 	}
 	if (!error)
 		error = softdep_waitidle(oldmnt, flags);
 	if (!error) {
 		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
 			retry = 0;
 			MNT_ILOCK(oldmnt);
 			/*
 			 * Work remains if the mount still has vnodes or,
 			 * with QUOTA, any quota vnode is still set.
 			 */
 			morework = oldmnt->mnt_nvnodelistsize > 0;
 #ifdef QUOTA
 			ump = VFSTOUFS(oldmnt);
 			UFS_LOCK(ump);
 			for (i = 0; i < MAXQUOTAS; i++) {
 				if (ump->um_quotas[i] != NULLVP)
 					morework = 1;
 			}
 			UFS_UNLOCK(ump);
 #endif
 			if (morework) {
 				/* Retry with a fresh loop budget of 3. */
 				if (--retry_flush_count > 0) {
 					retry = 1;
 					loopcnt = 3;
 				} else
 					error = EBUSY;
 			}
 			MNT_IUNLOCK(oldmnt);
 			if (retry)
 				goto retry_flush;
 		}
 	}
 	return (error);
 }
 
 /*
  * Structure hashing.
  * 
  * There are four types of structures that can be looked up:
  *	1) pagedep structures identified by mount point, inode number,
  *	   and logical block.
  *	2) inodedep structures identified by mount point and inode number.
  *	3) newblk structures identified by mount point and
  *	   physical block number.
  *	4) bmsafemap structures identified by mount point and
  *	   cylinder group number.
  *
  * The "pagedep" and "inodedep" dependency structures are hashed
  * separately from the file blocks and inodes to which they correspond.
  * This separation helps when the in-memory copy of an inode or
  * file block must be replaced. It also obviates the need to access
  * an inode or file page when simply updating (or de-allocating)
  * dependency structures. Lookup of newblk structures is needed to
  * find newly allocated blocks when trying to associate them with
  * their allocdirect or allocindir structure.
  *
  * The lookup routines optionally create and hash a new instance when
  * an existing entry is not found. The bmsafemap lookup routine always
  * allocates a new structure if an existing one is not found.
  */
 #define DEPALLOC	0x0001	/* allocate structure if lookup fails */
 
 /*
  * Structures and routines associated with pagedep caching.
  */
 #define	PAGEDEP_HASH(ump, inum, lbn) \
 	(&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
 
 /*
  * Search one pagedep hash chain for the entry matching ino and lbn.
  * *pagedeppp is set to the entry, or to NULL when none matches.
  * Returns 1 if an entry was found, 0 otherwise.
  */
 static int
 pagedep_find(struct pagedep_hashhead *pagedephd, ino_t ino, ufs_lbn_t lbn,
     struct pagedep **pagedeppp)
 {
 	struct pagedep *cur;
 
 	LIST_FOREACH(cur, pagedephd, pd_hash)
 		if (cur->pd_ino == ino && cur->pd_lbn == lbn)
 			break;
 	/* cur is NULL here when the chain was exhausted. */
 	*pagedeppp = cur;
 	return (cur != NULL);
 }
 /*
  * Look up a pagedep. Return 1 if found, 0 otherwise.
  * If not found, allocate if DEPALLOC flag is passed.
  * Found or allocated entry is returned in pagedeppp.
  *
  * When bp is non-NULL its dependency list is searched first and a
  * pagedep found through the hash that lacks ONWORKLIST is attached to
  * it.  LOCK_OWNED() is invoked on the mount's softdep lock at entry;
  * the lock is dropped around the allocation and held again on return.
  * NOTE(review): the allocation path dereferences bp unconditionally,
  * so DEPALLOC callers presumably must pass a non-NULL bp -- confirm.
  */
 static int
 pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
 	struct mount *mp;
 	struct buf *bp;
 	ino_t ino;
 	ufs_lbn_t lbn;
 	int flags;
 	struct pagedep **pagedeppp;
 {
 	struct pagedep *pagedep;
 	struct pagedep_hashhead *pagedephd;
 	struct worklist *wk;
 	struct ufsmount *ump;
 	int ret;
 	int i;
 
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
 	/* Fast path: a pagedep already hanging off the buffer. */
 	if (bp) {
 		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 			if (wk->wk_type == D_PAGEDEP) {
 				*pagedeppp = WK_PAGEDEP(wk);
 				return (1);
 			}
 		}
 	}
 	pagedephd = PAGEDEP_HASH(ump, ino, lbn);
 	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
 	if (ret) {
 		if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
 			WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
 		return (1);
 	}
 	if ((flags & DEPALLOC) == 0)
 		return (0);
 	/* Allocate with the lock dropped, then search the chain again. */
 	FREE_LOCK(ump);
 	pagedep = malloc(sizeof(struct pagedep),
 	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
 	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
 	ACQUIRE_LOCK(ump);
 	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
 	if (*pagedeppp) {
 		/*
 		 * This should never happen since we only create pagedeps
 		 * with the vnode lock held.  Could be an assert.
 		 */
 		WORKITEM_FREE(pagedep, D_PAGEDEP);
 		return (ret);
 	}
 	pagedep->pd_ino = ino;
 	pagedep->pd_lbn = lbn;
 	LIST_INIT(&pagedep->pd_dirremhd);
 	LIST_INIT(&pagedep->pd_pendinghd);
 	for (i = 0; i < DAHASHSZ; i++)
 		LIST_INIT(&pagedep->pd_diraddhd[i]);
 	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
 	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 	*pagedeppp = pagedep;
 	/* A newly allocated entry is reported as "not found". */
 	return (0);
 }
 
 /*
  * Structures and routines associated with inodedep caching.
  */
 #define	INODEDEP_HASH(ump, inum) \
       (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
 
 /*
  * Search one inodedep hash chain for the entry whose id_ino equals
  * inum.  *inodedeppp is set to the entry, or to NULL when none matches.
  * Returns 1 if an entry was found, 0 otherwise.
  */
 static int
 inodedep_find(struct inodedep_hashhead *inodedephd, ino_t inum,
     struct inodedep **inodedeppp)
 {
 	struct inodedep *cur;
 
 	LIST_FOREACH(cur, inodedephd, id_hash) {
 		if (cur->id_ino == inum) {
 			*inodedeppp = cur;
 			return (1);
 		}
 	}
 	*inodedeppp = NULL;
 	return (0);
 }
 /*
  * Look up an inodedep. Return 1 if found, 0 if not found.
  * If not found, allocate if DEPALLOC flag is passed.
  * Found or allocated entry is returned in inodedeppp.
  *
  * LOCK_OWNED() is invoked on the mount's softdep lock at entry.  On
  * the allocation path the lock is not held during malloc() and is held
  * again on return.  A newly allocated entry is reported with a return
  * value of 0.
  */
 static int
 inodedep_lookup(mp, inum, flags, inodedeppp)
 	struct mount *mp;
 	ino_t inum;
 	int flags;
 	struct inodedep **inodedeppp;
 {
 	struct inodedep *inodedep;
 	struct inodedep_hashhead *inodedephd;
 	struct ufsmount *ump;
 	struct fs *fs;
 
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
 	fs = ump->um_fs;
 	inodedephd = INODEDEP_HASH(ump, inum);
 
 	if (inodedep_find(inodedephd, inum, inodedeppp))
 		return (1);
 	if ((flags & DEPALLOC) == 0)
 		return (0);
 	/*
 	 * If the system is over its limit and our filesystem is
 	 * responsible for more than our share of that usage and
 	 * we are not in a rush, request some inodedep cleanup.
 	 */
 	/*
 	 * NOTE(review): schedule_cleanup() presumably releases the lock,
 	 * since the other branch frees it and both paths re-acquire it
 	 * below -- confirm.
 	 */
 	if (softdep_excess_items(ump, D_INODEDEP))
 		schedule_cleanup(mp);
 	else
 		FREE_LOCK(ump);
 	/* No M_ZERO here: the fields are assigned explicitly below. */
 	inodedep = malloc(sizeof(struct inodedep),
 		M_INODEDEP, M_SOFTDEP_FLAGS);
 	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
 	ACQUIRE_LOCK(ump);
 	/* Search again: an entry may have been added while unlocked. */
 	if (inodedep_find(inodedephd, inum, inodedeppp)) {
 		WORKITEM_FREE(inodedep, D_INODEDEP);
 		return (1);
 	}
 	inodedep->id_fs = fs;
 	inodedep->id_ino = inum;
 	inodedep->id_state = ALLCOMPLETE;
 	inodedep->id_nlinkdelta = 0;
 	inodedep->id_nlinkwrote = -1;
 	inodedep->id_savedino1 = NULL;
 	inodedep->id_savedsize = -1;
 	inodedep->id_savedextsize = -1;
 	inodedep->id_savednlink = -1;
 	inodedep->id_bmsafemap = NULL;
 	inodedep->id_mkdiradd = NULL;
 	LIST_INIT(&inodedep->id_dirremhd);
 	LIST_INIT(&inodedep->id_pendinghd);
 	LIST_INIT(&inodedep->id_inowait);
 	LIST_INIT(&inodedep->id_bufwait);
 	TAILQ_INIT(&inodedep->id_inoreflst);
 	TAILQ_INIT(&inodedep->id_inoupdt);
 	TAILQ_INIT(&inodedep->id_newinoupdt);
 	TAILQ_INIT(&inodedep->id_extupdt);
 	TAILQ_INIT(&inodedep->id_newextupdt);
 	TAILQ_INIT(&inodedep->id_freeblklst);
 	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
 	*inodedeppp = inodedep;
 	return (0);
 }
 
 /*
  * Structures and routines associated with newblk caching.
  */
 #define	NEWBLK_HASH(ump, inum) \
 	(&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
 
 /*
  * Search one newblk hash chain for an entry with block number
  * newblkno.  *newblkpp is set to the entry, or to NULL when none
  * matches.  Returns 1 if an entry was found, 0 otherwise.
  */
 static int
 newblk_find(struct newblk_hashhead *newblkhd, ufs2_daddr_t newblkno,
     int flags, struct newblk **newblkpp)
 {
 	struct newblk *cur;
 
 	LIST_FOREACH(cur, newblkhd, nb_hash) {
 		if (cur->nb_newblkno != newblkno)
 			continue;
 		/*
 		 * When creating a new dependency (DEPALLOC), entries that
 		 * have already been converted to allocdirects are not
 		 * accepted.  This is for a frag extend.
 		 */
 		if ((flags & DEPALLOC) == 0 ||
 		    cur->nb_list.wk_type == D_NEWBLK) {
 			*newblkpp = cur;
 			return (1);
 		}
 	}
 	*newblkpp = NULL;
 	return (0);
 }
 
 /*
  * Look up a newblk. Return 1 if found, 0 if not found.
  * If not found, allocate if DEPALLOC flag is passed.
  * Found or allocated entry is returned in newblkpp.
  *
  * LOCK_OWNED() is invoked on the mount's softdep lock at entry.  On
  * the allocation path the lock is not held during malloc() and is held
  * again on return.  A newly allocated entry is reported with a return
  * value of 0.
  */
 static int
 newblk_lookup(mp, newblkno, flags, newblkpp)
 	struct mount *mp;
 	ufs2_daddr_t newblkno;
 	int flags;
 	struct newblk **newblkpp;
 {
 	struct newblk *newblk;
 	struct newblk_hashhead *newblkhd;
 	struct ufsmount *ump;
 
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
 	newblkhd = NEWBLK_HASH(ump, newblkno);
 	if (newblk_find(newblkhd, newblkno, flags, newblkpp))
 		return (1);
 	if ((flags & DEPALLOC) == 0)
 		return (0);
 	/*
 	 * Request a cleanup when any of the three related item types is
 	 * in excess.  NOTE(review): schedule_cleanup() presumably
 	 * releases the lock, matching the FREE_LOCK() in the other
 	 * branch -- confirm.
 	 */
 	if (softdep_excess_items(ump, D_NEWBLK) ||
 	    softdep_excess_items(ump, D_ALLOCDIRECT) ||
 	    softdep_excess_items(ump, D_ALLOCINDIR))
 		schedule_cleanup(mp);
 	else
 		FREE_LOCK(ump);
 	/* Sized as union allblk rather than struct newblk. */
 	newblk = malloc(sizeof(union allblk), M_NEWBLK,
 	    M_SOFTDEP_FLAGS | M_ZERO);
 	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
 	ACQUIRE_LOCK(ump);
 	/* Search again: an entry may have been added while unlocked. */
 	if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
 		WORKITEM_FREE(newblk, D_NEWBLK);
 		return (1);
 	}
 	newblk->nb_freefrag = NULL;
 	LIST_INIT(&newblk->nb_indirdeps);
 	LIST_INIT(&newblk->nb_newdirblk);
 	LIST_INIT(&newblk->nb_jwork);
 	newblk->nb_state = ATTACHED;
 	newblk->nb_newblkno = newblkno;
 	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
 	*newblkpp = newblk;
 	return (0);
 }
 
 /*
  * Structures and routines associated with freed indirect block caching.
  */
 #define	INDIR_HASH(ump, blkno) \
 	(&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
 
 /*
  * Lookup an indirect block in the indir hash table.  The freework is
  * removed and potentially freed.  The caller must do a blocking journal
  * write before writing to the blkno.
  *
  * Returns 1 if a freework for blkno was found (and removed), 0 if not.
  */
 static int
 indirblk_lookup(struct mount *mp, ufs2_daddr_t blkno)
 {
 	struct ufsmount *ump;
 	struct indir_hashhead *bucket;
 	struct freework *fw;
 
 	ump = VFSTOUFS(mp);
 	bucket = INDIR_HASH(ump, blkno);
 	TAILQ_FOREACH(fw, bucket, fw_next) {
 		if (fw->fw_blkno == blkno) {
 			indirblk_remove(fw);
 			return (1);
 		}
 	}
 	return (0);
 }
 
 /*
  * Record the indirect block described by freework in the indirblk
  * hash table so that the block is not re-used before the journal has
  * been written.  The freework is also linked onto the most recent
  * journal segment; if there is no segment at all nothing is done.
  */
 static void
 indirblk_insert(freework)
 	struct freework *freework;
 {
 	struct ufsmount *ump;
 	struct jseg *lastseg;
 
 	ump = VFSTOUFS(freework->fw_list.wk_mp);
 	lastseg = TAILQ_LAST(&ump->softdep_jblocks->jb_segs, jseglst);
 	if (lastseg == NULL)
 		return;
 
 	LIST_INSERT_HEAD(&lastseg->js_indirs, freework, fw_segs);
 	TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
 	    fw_next);
 	/* DEPCOMPLETE is set again by indirblk_remove(). */
 	freework->fw_state &= ~DEPCOMPLETE;
 }
 
 /*
  * Undo indirblk_insert(): unlink freework from its journal segment and
  * from the indir hash table, mark it DEPCOMPLETE and free it once every
  * ALLCOMPLETE bit is set.
  */
 static void
 indirblk_remove(freework)
 	struct freework *freework;
 {
 	struct indir_hashhead *bucket;
 	struct ufsmount *ump;
 
 	ump = VFSTOUFS(freework->fw_list.wk_mp);
 	bucket = INDIR_HASH(ump, freework->fw_blkno);
 	LIST_REMOVE(freework, fw_segs);
 	TAILQ_REMOVE(bucket, freework, fw_next);
 	freework->fw_state |= DEPCOMPLETE;
 	if ((freework->fw_state & ALLCOMPLETE) != ALLCOMPLETE)
 		return;
 	WORKITEM_FREE(freework, D_FREEWORK);
 }
 
 /*
  * One-time setup of the global soft updates state.  Executed during
  * filesystem system initialization, before any filesystem is mounted.
  */
 void
 softdep_initialize()
 {
 
 	/* Start with an empty list of softdep mounts. */
 	TAILQ_INIT(&softdepmounts);
 
 	/* The dependency limit scales with desiredvnodes. */
 #ifdef __LP64__
 	max_softdeps = desiredvnodes * 4;
 #else
 	max_softdeps = desiredvnodes * 2;
 #endif
 
 	/* Install the softdep handlers (the "bioops hack"). */
 	softdep_ast_cleanup = softdep_ast_cleanup_proc;
 	bioops.io_countdeps = softdep_count_dependencies;
 	bioops.io_deallocate = softdep_deallocate_dependencies;
 	bioops.io_complete = softdep_disk_write_complete;
 	bioops.io_start = softdep_disk_io_initiation;
 
 	/* Tie the softdep callout to the global lock lk. */
 	callout_init_mtx(&softdep_callout, &lk, 0);
 }
 
 /*
  * Tear down what softdep_initialize() installed.  Executed during
  * filesystem module unload, after all filesystems have been unmounted.
  */
 void
 softdep_uninitialize()
 {
 
 	/* Unhook the softdep handlers (the "bioops hack"). */
 	softdep_ast_cleanup = NULL;
 	bioops.io_countdeps = NULL;
 	bioops.io_deallocate = NULL;
 	bioops.io_complete = NULL;
 	bioops.io_start = NULL;
 
 	/* Stop the callout last. */
 	callout_drain(&softdep_callout);
 }
 
 /*
  * Called at mount time to notify the dependency code that a
  * filesystem wishes to use it.
  *
  * Allocates the per-mount softdep state, marks the mount as using
  * soft updates, initializes the per-mount work lists and hash tables,
  * registers the mount on softdepmounts, mounts the journal when the
  * superblock has FS_SUJ set, starts the flush thread and, optionally,
  * recomputes the superblock summary totals from the cylinder groups.
  *
  * devvp is the device vnode used to read cylinder groups, mp the mount,
  * fs its superblock and cred the credentials passed to journal_mount()
  * and bread().
  *
  * Returns 0 on success.  If journal_mount() or a cylinder group read
  * fails, softdep_unmount() is called and that error is returned.
  */
 int
 softdep_mount(devvp, mp, fs, cred)
 	struct vnode *devvp;
 	struct mount *mp;
 	struct fs *fs;
 	struct ucred *cred;
 {
 	struct csum_total cstotal;
 	struct mount_softdeps *sdp;
 	struct ufsmount *ump;
 	struct cg *cgp;
 	struct buf *bp;
 	u_int cyl, i;
 	int error;
 
 	sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
 	    M_WAITOK | M_ZERO);
 	/*
 	 * Under the mount interlock: replace MNT_ASYNC with MNT_SOFTDEP
 	 * and, unless already done, set the matching kernel flags.
 	 */
 	MNT_ILOCK(mp);
 	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
 	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
 		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 
 			MNTK_SOFTDEP | MNTK_NOASYNC;
 	}
 	ump = VFSTOUFS(mp);
 	ump->um_softdep = sdp;
 	MNT_IUNLOCK(mp);
 	rw_init(LOCK_PTR(ump), "per-fs softdep");
 	sdp->sd_ump = ump;
 	/* Empty work lists and counters. */
 	LIST_INIT(&ump->softdep_workitem_pending);
 	LIST_INIT(&ump->softdep_journal_pending);
 	TAILQ_INIT(&ump->softdep_unlinked);
 	LIST_INIT(&ump->softdep_dirtycg);
 	ump->softdep_worklist_tail = NULL;
 	ump->softdep_on_worklist = 0;
 	ump->softdep_deps = 0;
 	LIST_INIT(&ump->softdep_mkdirlisthd);
 	/* Per-mount hash tables, sized from desiredvnodes / max_softdeps. */
 	ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
 	    &ump->pagedep_hash_size);
 	ump->pagedep_nextclean = 0;
 	ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
 	    &ump->inodedep_hash_size);
 	ump->inodedep_nextclean = 0;
 	ump->newblk_hashtbl = hashinit(max_softdeps / 2,  M_NEWBLK,
 	    &ump->newblk_hash_size);
 	ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
 	    &ump->bmsafemap_hash_size);
 	/*
 	 * The indir table is allocated by hand with a power-of-two number
 	 * of buckets; indir_hash_size holds the mask (count - 1) that
 	 * INDIR_HASH() applies with '&'.
 	 */
 	i = 1 << (ffs(desiredvnodes / 10) - 1);
 	ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
 	    M_FREEWORK, M_WAITOK);
 	ump->indir_hash_size = i - 1;
 	for (i = 0; i <= ump->indir_hash_size; i++)
 		TAILQ_INIT(&ump->indir_hashtbl[i]);
 #ifdef INVARIANTS
 	for (i = 0; i <= D_LAST; i++)
 		LIST_INIT(&ump->softdep_alldeps[i]);
 #endif
 	/* Publish this mount on the global list under the global lock. */
 	ACQUIRE_GBLLOCK(&lk);
 	TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
 	FREE_GBLLOCK(&lk);
 	/* Journaled soft updates: the journal must mount successfully. */
 	if ((fs->fs_flags & FS_SUJ) &&
 	    (error = journal_mount(mp, fs, cred)) != 0) {
 		printf("Failed to start journal: %d\n", error);
 		softdep_unmount(mp);
 		return (error);
 	}
 	/*
 	 * Start our flushing thread in the bufdaemon process.
 	 */
 	ACQUIRE_LOCK(ump);
 	ump->softdep_flags |= FLUSH_STARTING;
 	FREE_LOCK(ump);
 	kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
 	    &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
 	    mp->mnt_stat.f_mntonname);
 	/*
 	 * Sleep until FLUSH_STARTING is no longer set, re-checking at
 	 * least every hz / 2 ticks.
 	 * NOTE(review): the flag is presumably cleared by the new
 	 * softdep_flush thread -- confirm.
 	 */
 	ACQUIRE_LOCK(ump);
 	while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
 		msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
 		    hz / 2);
 	}
 	FREE_LOCK(ump);
 	/*
 	 * When doing soft updates, the counters in the
 	 * superblock may have gotten out of sync. Recomputation
 	 * can take a long time and can be deferred for background
 	 * fsck.  However, the old behavior of scanning the cylinder
 	 * groups and recalculating them at mount time is available
 	 * by setting vfs.ffs.compute_summary_at_mount to one.
 	 */
 	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
 		return (0);
 	/* Sum the per-cylinder-group summaries into cstotal. */
 	bzero(&cstotal, sizeof cstotal);
 	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
 		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
 		    fs->fs_cgsize, cred, &bp)) != 0) {
 			brelse(bp);
 			softdep_unmount(mp);
 			return (error);
 		}
 		cgp = (struct cg *)bp->b_data;
 		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
 		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
 		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
 		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
 		fs->fs_cs(fs, cyl) = cgp->cg_cs;
 		brelse(bp);
 	}
 #ifdef INVARIANTS
 	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
 		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
 #endif
 	/* Install the recomputed totals in the superblock. */
 	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
 	return (0);
 }
 
 /*
  * Release the soft updates state attached to mp by softdep_mount().
  *
  * Clears the softdep (and, if set, SUJ) mount flags, tears down the
  * journal, asks the flush thread to exit and waits for it, removes the
  * mount from softdepmounts and frees the per-mount lock, hash tables
  * and mount_softdeps structure.  Also used by softdep_mount() to back
  * out of a failed mount.
  */
 void
 softdep_unmount(mp)
 	struct mount *mp;
 {
 	struct ufsmount *ump;
 #ifdef INVARIANTS
 	int i;
 #endif
 
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_unmount called on non-softdep filesystem"));
 	ump = VFSTOUFS(mp);
 	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_SOFTDEP;
 	if (MOUNTEDSUJ(mp) == 0) {
 		MNT_IUNLOCK(mp);
 	} else {
 		/* Journaled mount: drop MNT_SUJ and free the journal map. */
 		mp->mnt_flag &= ~MNT_SUJ;
 		MNT_IUNLOCK(mp);
 		journal_unmount(ump);
 	}
 	/*
 	 * Shut down our flushing thread.  The NULL check covers the case
 	 * where softdep_mount errors out before the thread has been
 	 * created.
 	 */
 	if (ump->softdep_flushtd != NULL) {
 		ACQUIRE_LOCK(ump);
 		ump->softdep_flags |= FLUSH_EXIT;
 		wakeup(&ump->softdep_flushtd);
 		/* PDROP: the lock is not held again when msleep returns. */
 		msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
 		    "sdwait", 0);
 		KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
 		    ("Thread shutdown failed"));
 	}
 	/*
 	 * Free up our resources.
 	 */
 	ACQUIRE_GBLLOCK(&lk);
 	TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
 	FREE_GBLLOCK(&lk);
 	rw_destroy(LOCK_PTR(ump));
 	hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
 	hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
 	hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
 	hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
 	    ump->bmsafemap_hash_size);
 	free(ump->indir_hashtbl, M_FREEWORK);
 #ifdef INVARIANTS
 	/* Every dependency type must be fully drained by now. */
 	for (i = 0; i <= D_LAST; i++) {
 		KASSERT(ump->softdep_curdeps[i] == 0,
 		    ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
 		    TYPENAME(i), ump->softdep_curdeps[i]));
 		KASSERT(LIST_EMPTY(&ump->softdep_alldeps[i]),
 		    ("Unmount %s: Dep type %s not empty (%p)", ump->um_fs->fs_fsmnt,
 		    TYPENAME(i), LIST_FIRST(&ump->softdep_alldeps[i])));
 	}
 #endif
 	free(ump->um_softdep, M_MOUNTDATA);
 }
 
 /*
  * Allocate an empty, zeroed journal block map with room for ten
  * extents.  jblocks_add() doubles the extent array when it fills up.
  */
 static struct jblocks *
 jblocks_create(void)
 {
 	struct jblocks *jb;
 
 	jb = malloc(sizeof(*jb), M_JBLOCKS, M_WAITOK | M_ZERO);
 	jb->jb_avail = 10;
 	jb->jb_extent = malloc(sizeof(struct jextent) * jb->jb_avail,
 	    M_JBLOCKS, M_WAITOK | M_ZERO);
 	TAILQ_INIT(&jb->jb_segs);
 
 	return (jb);
 }
 
 /*
  * Carve up to bytes worth of device blocks out of the extent at
  * jb_head, moving on to the next extent (wrapping to the first) when
  * the current one is used up.  The allocation never spans extents, so
  * the size actually granted is returned in *actual (in bytes).
  * Returns the starting device block address.
  */
 static ufs2_daddr_t
 jblocks_alloc(jblocks, bytes, actual)
 	struct jblocks *jblocks;
 	int bytes;
 	int *actual;
 {
 	struct jextent *ext;
 	ufs2_daddr_t start;
 	int want;
 	int room;
 
 	want = bytes / DEV_BSIZE;
 	ext = &jblocks->jb_extent[jblocks->jb_head];
 	room = ext->je_blocks - jblocks->jb_off;
 	if (room == 0) {
 		/* Current extent exhausted; step to the next one. */
 		jblocks->jb_head++;
 		if (jblocks->jb_head > jblocks->jb_used)
 			jblocks->jb_head = 0;
 		jblocks->jb_off = 0;
 		ext = &jblocks->jb_extent[jblocks->jb_head];
 		room = ext->je_blocks;
 	}
 	if (want < room)
 		room = want;
 	start = ext->je_daddr + jblocks->jb_off;
 	*actual = room * DEV_BSIZE;
 	jblocks->jb_free -= room;
 	jblocks->jb_off += room;
 
 	return (start);
 }
 
 /*
  * Give bytes worth of journal space back to jblocks.  If the journal
  * had suspended the filesystem the worklist is sped up, and anyone
  * sleeping on jblocks is woken.  The per-filesystem lock of mp is
  * asserted.
  */
 static void
 jblocks_free(jblocks, mp, bytes)
 	struct jblocks *jblocks;
 	struct mount *mp;
 	int bytes;
 {
 
 	LOCK_OWNED(VFSTOUFS(mp));
 	jblocks->jb_free += bytes / DEV_BSIZE;
 	if (jblocks->jb_suspended) {
 		worklist_speedup(mp);
 	}
 	wakeup(jblocks);
 }
 
 static void
 jblocks_destroy(jblocks)
 	struct jblocks *jblocks;
 {
 
 	if (jblocks->jb_extent)
 		free(jblocks->jb_extent, M_JBLOCKS);
 	free(jblocks, M_JBLOCKS);
 }
 
 /*
  * Account for blocks device blocks starting at daddr in the journal
  * block map.  The run either fills the still-unused current extent,
  * extends the current extent when it is contiguous with it, or opens
  * a new extent, doubling the extent array when it is full.
  */
 static void
 jblocks_add(jblocks, daddr, blocks)
 	struct jblocks *jblocks;
 	ufs2_daddr_t daddr;
 	int blocks;
 {
 	struct jextent *cur;
 	struct jextent *grown;
 
 	jblocks->jb_free += blocks;
 	jblocks->jb_blocks += blocks;
 	cur = &jblocks->jb_extent[jblocks->jb_used];
 	if (cur->je_daddr == 0) {
 		/* The extent is still unused: this is its first run. */
 		cur->je_blocks = blocks;
 		cur->je_daddr = daddr;
 	} else if (cur->je_daddr + cur->je_blocks == daddr) {
 		/* Physically contiguous: just lengthen the extent. */
 		cur->je_blocks += blocks;
 	} else {
 		/* Discontiguous: start a new extent, growing if needed. */
 		jblocks->jb_used++;
 		if (jblocks->jb_used == jblocks->jb_avail) {
 			jblocks->jb_avail *= 2;
 			grown = malloc(
 			    sizeof(struct jextent) * jblocks->jb_avail,
 			    M_JBLOCKS, M_WAITOK | M_ZERO);
 			memcpy(grown, jblocks->jb_extent,
 			    sizeof(struct jextent) * jblocks->jb_used);
 			free(jblocks->jb_extent, M_JBLOCKS);
 			jblocks->jb_extent = grown;
 		}
 		cur = &jblocks->jb_extent[jblocks->jb_used];
 		cur->je_blocks = blocks;
 		cur->je_daddr = daddr;
 	}
 }
 
 int
 softdep_journal_lookup(mp, vpp)
 	struct mount *mp;
 	struct vnode **vpp;
 {
 	struct componentname cnp;
 	struct vnode *dvp;
 	ino_t sujournal;
 	int error;
 
 	error = VFS_VGET(mp, UFS_ROOTINO, LK_EXCLUSIVE, &dvp);
 	if (error)
 		return (error);
 	bzero(&cnp, sizeof(cnp));
 	cnp.cn_nameiop = LOOKUP;
 	cnp.cn_flags = ISLASTCN;
 	cnp.cn_thread = curthread;
 	cnp.cn_cred = curthread->td_ucred;
 	cnp.cn_pnbuf = SUJ_FILE;
 	cnp.cn_nameptr = SUJ_FILE;
 	cnp.cn_namelen = strlen(SUJ_FILE);
 	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
 	vput(dvp);
 	if (error != 0)
 		return (error);
 	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
 	return (error);
 }
 
 /*
  * Open and verify the journal file.
  *
  * Resets the per-mount journal state, looks up the journal vnode and
  * builds the jblocks extent map from the journal file's block mapping.
  * On success MNT_SUJ is set and MNT_SOFTDEP cleared in mnt_flag, and
  * ump->softdep_jblocks points at the new map.
  *
  * Returns 0 on success, the softdep_journal_lookup() error if the
  * journal cannot be found, ENOSPC if the file is smaller than SUJ_MIN,
  * or the ufs_bmaparray() error if a block cannot be mapped.
  */
 static int
 journal_mount(mp, fs, cred)
 	struct mount *mp;
 	struct fs *fs;
 	struct ucred *cred;
 {
 	struct jblocks *jblocks;
 	struct ufsmount *ump;
 	struct vnode *vp;
 	struct inode *ip;
 	ufs2_daddr_t blkno;
 	int bcount;
 	int error;
 	int i;
 
 	ump = VFSTOUFS(mp);
 	ump->softdep_journal_tail = NULL;
 	ump->softdep_on_journal = 0;
 	ump->softdep_accdeps = 0;
 	ump->softdep_req = 0;
 	ump->softdep_jblocks = NULL;
 	error = softdep_journal_lookup(mp, &vp);
 	if (error != 0) {
 		printf("Failed to find journal.  Use tunefs to create one\n");
 		return (error);
 	}
 	/* From here on vp must be released via the vput() at out:. */
 	ip = VTOI(vp);
 	if (ip->i_size < SUJ_MIN) {
 		error = ENOSPC;
 		goto out;
 	}
 	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
 	jblocks = jblocks_create();
 	/*
 	 * Translate each logical block of the journal file and record it
 	 * in the extent map; each entry covers one filesystem block,
 	 * expressed in device blocks.
 	 */
 	for (i = 0; i < bcount; i++) {
 		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
 		if (error)
 			break;
 		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
 	}
 	if (error) {
 		jblocks_destroy(jblocks);
 		goto out;
 	}
 	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
 	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
 	ump->softdep_jblocks = jblocks;
 out:
 	if (error == 0) {
 		/* Switch the mount flags from plain softdep to SUJ. */
 		MNT_ILOCK(mp);
 		mp->mnt_flag |= MNT_SUJ;
 		mp->mnt_flag &= ~MNT_SOFTDEP;
 		MNT_IUNLOCK(mp);
 		/*
 		 * Only validate the journal contents if the
 		 * filesystem is clean, otherwise we write the logs
 		 * but they'll never be used.  If the filesystem was
 		 * still dirty when we mounted it the journal is
 		 * invalid and a new journal can only be valid if it
 		 * starts from a clean mount.
 		 */
 		if (fs->fs_clean) {
 			/* Stamp the journal inode with the fs mtime. */
 			DIP_SET(ip, i_modrev, fs->fs_mtime);
 			ip->i_flags |= IN_MODIFIED;
 			ffs_update(vp, 1);
 		}
 	}
 	vput(vp);
 	return (error);
 }
 
 /*
  * Dispose of the journal block map of ump, if there is one, and
  * leave softdep_jblocks NULL.
  */
 static void
 journal_unmount(ump)
 	struct ufsmount *ump;
 {
 
 	if (ump->softdep_jblocks != NULL) {
 		jblocks_destroy(ump->softdep_jblocks);
 	}
 	ump->softdep_jblocks = NULL;
 }
 
 /*
  * Queue a journal record that is ready to be written.  The item is
  * appended to the mount's pending journal list; space is allocated and
  * the journal entry is created later, when the journal is flushed to
  * stable store.  Panics if the item is already on a worklist.  The
  * per-filesystem lock is asserted.
  */
 static void
 add_to_journal(wk)
 	struct worklist *wk;
 {
 	struct ufsmount *ump;
 
 	ump = VFSTOUFS(wk->wk_mp);
 	LOCK_OWNED(ump);
 	if ((wk->wk_state & ONWORKLIST) != 0)
 		panic("add_to_journal: %s(0x%X) already on list",
 		    TYPENAME(wk->wk_type), wk->wk_state);
 	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
 	if (!LIST_EMPTY(&ump->softdep_journal_pending)) {
 		/* Append behind the remembered tail. */
 		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
 	} else {
 		/* First pending record: note the time in jb_age. */
 		ump->softdep_jblocks->jb_age = ticks;
 		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
 	}
 	ump->softdep_journal_tail = wk;
 	ump->softdep_on_journal++;
 }
 
 /*
  * Remove an arbitrary item from the journal worklist, maintaining the
  * tail pointer.  This happens when a new operation obviates the need to
  * journal an old operation, and when softdep_process_journal() moves
  * an item into a journal segment.  The per-filesystem lock is asserted.
  */
 static void
 remove_from_journal(wk)
 	struct worklist *wk;
 {
 	struct ufsmount *ump;
 
 	ump = VFSTOUFS(wk->wk_mp);
 	LOCK_OWNED(ump);
 #ifdef INVARIANTS
 	{
 		struct worklist *wkn;
 
 		/* Verify that wk really is on the pending journal list. */
 		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
 			if (wkn == wk)
 				break;
 		if (wkn == NULL)
 			panic("remove_from_journal: %p is not in journal", wk);
 	}
 #endif
 	/*
 	 * We emulate a TAILQ to save space in most structures which do not
 	 * require TAILQ semantics.  Here we must update the tail position
 	 * when removing the tail which is not the final entry. This works
 	 * only if the worklist linkage is at the beginning of the structure.
 	 */
 	/* le_prev must be read before WORKLIST_REMOVE() unlinks wk. */
 	if (ump->softdep_journal_tail == wk)
 		ump->softdep_journal_tail =
 		    (struct worklist *)wk->wk_list.le_prev;
 	WORKLIST_REMOVE(wk);
 	ump->softdep_on_journal -= 1;
 }
 
 /*
  * Report whether there is enough journal space, taking dependency
  * limits into account as well so the prelink code can throttle both
  * journaled and non-journaled filesystems.
  *
  * thresh selects the threshold: 0 compares against jb_low, any other
  * value against jb_min.  Returns non-zero when space is sufficient;
  * a mount without a journal always has sufficient space.
  */
 static int
 journal_space(ump, thresh)
 	struct ufsmount *ump;
 	int thresh;
 {
 	struct jblocks *jb;
 	int cap, pending, avail;
 
 	jb = ump->softdep_jblocks;
 	if (jb == NULL)
 		return (1);
 	/*
 	 * We use a tighter restriction here to prevent request_cleanup()
 	 * running in threads from running into locks we currently hold.
 	 * We have to be over the limit and our filesystem has to be
 	 * responsible for more than our share of that usage.
 	 */
 	cap = (max_softdeps / 10) * 9;
 	if (dep_current[D_INODEDEP] > cap &&
 	    ump->softdep_curdeps[D_INODEDEP] > cap / stat_flush_threads)
 		return (0);
 	thresh = (thresh != 0) ? jb->jb_min : jb->jb_low;
 	/* Space that records still waiting to be written will consume. */
 	pending = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
 	avail = jb->jb_free - pending;
 
 	return (avail > thresh);
 }
 
 /*
  * Suspend writes to the filesystem of ump because the journal is
  * running out of space.  Does nothing if MNTK_SUSPEND is already set.
  * Otherwise marks the mount suspended with the flush thread as the
  * suspension owner and records the fact in jb_suspended, which
  * jblocks_free() and journal_unsuspend() test.
  */
 static void
 journal_suspend(ump)
 	struct ufsmount *ump;
 {
 	struct jblocks *jblocks;
 	struct mount *mp;
 	bool set;
 
 	mp = UFSTOVFS(ump);
 	/* Cheap unlocked check; repeated below under the interlock. */
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0)
 		return;
 
 	jblocks = ump->softdep_jblocks;
 	vfs_op_enter(mp);
 	set = false;
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
 		stat_journal_min++;
 		mp->mnt_kern_flag |= MNTK_SUSPEND;
 		mp->mnt_susp_owner = ump->softdep_flushtd;
 		set = true;
 	}
 	jblocks->jb_suspended = 1;
 	MNT_IUNLOCK(mp);
 	/*
 	 * If someone else suspended first, undo our vfs_op_enter();
 	 * otherwise it stays in effect while the mount is suspended.
 	 */
 	if (!set)
 		vfs_op_exit(mp);
 }
 
 /*
  * Resume a filesystem that journal_suspend() suspended, provided the
  * journal once again has more than its minimum amount of free space.
  *
  * Returns 1 if the filesystem was resumed, in which case the
  * per-filesystem lock was dropped and reacquired around
  * vfs_write_resume(); returns 0, with the lock untouched, if there is
  * no journal, the journal did not suspend the filesystem, or space is
  * still short.
  */
 static int
 journal_unsuspend(struct ufsmount *ump)
 {
 	struct jblocks *jblocks;
 	struct mount *mp;
 
 	mp = UFSTOVFS(ump);
 	jblocks = ump->softdep_jblocks;
 
 	/*
 	 * journal_space() takes a selector, not a block count: 0 means
 	 * "compare against jb_low" and non-zero "compare against jb_min".
 	 * Pass 1 explicitly; passing jb_min itself would pick the wrong
 	 * threshold whenever jb_min happens to be 0.
 	 */
 	if (jblocks != NULL && jblocks->jb_suspended &&
 	    journal_space(ump, 1)) {
 		jblocks->jb_suspended = 0;
 		FREE_LOCK(ump);
 		/* Take over the suspension so that we may lift it. */
 		mp->mnt_susp_owner = curthread;
 		vfs_write_resume(mp, 0);
 		ACQUIRE_LOCK(ump);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Called before any allocation function to be certain that there is
  * sufficient space in the journal prior to creating any new records.
  * Since in the case of block allocation we may have multiple locked
  * buffers at the time of the actual allocation we can not block
  * when the journal records are created.  Doing so would create a deadlock
  * if any of these buffers needed to be flushed to reclaim space.  Instead
  * we require a sufficiently large amount of available space such that
  * each thread in the system could have passed this allocation check and
  * still have sufficient free space.  With 20% of a minimum journal size
  * of 1MB we have 6553 records available.
  *
  * vp is the vnode about to allocate; waitok is MNT_NOWAIT if the
  * caller cannot wait and is otherwise passed on to ffs_syncvnode().
  * Returns ENOSPC when space is low and waitok is MNT_NOWAIT, and 0 in
  * every other case.
  */
 int
 softdep_prealloc(vp, waitok)
 	struct vnode *vp;
 	int waitok;
 {
 	struct ufsmount *ump;
 
 	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
 	    ("softdep_prealloc called on non-softdep filesystem"));
 	/*
 	 * Nothing to do if we are not running journaled soft updates.
 	 * If we currently hold the snapshot lock, we must avoid
 	 * handling other resources that could cause deadlock.  Do not
 	 * touch quotas vnode since it is typically recursed with
 	 * other vnode locks held.
 	 */
 	if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) ||
 	    (vp->v_vflag & VV_SYSTEM) != 0)
 		return (0);
 	ump = VFSTOUFS(vp->v_mount);
 	ACQUIRE_LOCK(ump);
 	/* Fast path: above the low-water mark. */
 	if (journal_space(ump, 0)) {
 		FREE_LOCK(ump);
 		return (0);
 	}
 	stat_journal_low++;
 	FREE_LOCK(ump);
 	if (waitok == MNT_NOWAIT)
 		return (ENOSPC);
 	/*
 	 * Attempt to sync this vnode once to flush any journal
 	 * work attached to it.
 	 */
 	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
 		ffs_syncvnode(vp, waitok, 0);
 	ACQUIRE_LOCK(ump);
 	process_removes(vp);
 	process_truncates(vp);
 	/*
 	 * Still low: speed up the worklist, and suspend the filesystem
 	 * if we are even below the minimum threshold.
 	 */
 	if (journal_space(ump, 0) == 0) {
 		softdep_speedup(ump);
 		if (journal_space(ump, 1) == 0)
 			journal_suspend(ump);
 	}
 	FREE_LOCK(ump);
 
 	return (0);
 }
 
 /*
  * Before adjusting a link count on a vnode verify that we have sufficient
  * journal space.  If not, process operations that depend on the currently
  * locked pair of vnodes to try to flush space as the syncer, buf daemon,
  * and softdep flush threads can not acquire these locks to reclaim space.
  *
  * dvp is the directory vnode and vp the (possibly NULL) target vnode.
  * The per-filesystem lock is asserted on entry and is held again on
  * return, but it is dropped while the vnodes are being synced.
  */
 static void
 softdep_prelink(dvp, vp)
 	struct vnode *dvp;
 	struct vnode *vp;
 {
 	struct ufsmount *ump;
 
 	ump = VFSTOUFS(dvp->v_mount);
 	LOCK_OWNED(ump);
 	/*
 	 * Nothing to do if we have sufficient journal space.
 	 * If we currently hold the snapshot lock, we must avoid
 	 * handling other resources that could cause deadlock.
 	 */
 	if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
 		return;
 	stat_journal_low++;
 	/* Sync both vnodes with the softdep lock released. */
 	FREE_LOCK(ump);
 	if (vp)
 		ffs_syncvnode(vp, MNT_NOWAIT, 0);
 	ffs_syncvnode(dvp, MNT_WAIT, 0);
 	ACQUIRE_LOCK(ump);
 	/* Process vp before dvp as it may create .. removes. */
 	if (vp) {
 		process_removes(vp);
 		process_truncates(vp);
 	}
 	process_removes(dvp);
 	process_truncates(dvp);
 	softdep_speedup(ump);
 	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
 	/*
 	 * Still low: speed up again, and suspend the filesystem if we
 	 * are even below the minimum threshold.
 	 */
 	if (journal_space(ump, 0) == 0) {
 		softdep_speedup(ump);
 		if (journal_space(ump, 1) == 0)
 			journal_suspend(ump);
 	}
 }
 
 /*
  * Fill in the segment header record at data from jseg.  The block
  * count is expressed in device blocks of ump's device vnode and the
  * crc field is written as zero.
  */
 static void
 jseg_write(ump, jseg, data)
 	struct ufsmount *ump;
 	struct jseg *jseg;
 	uint8_t *data;
 {
 	struct jsegrec *hdr = (struct jsegrec *)data;
 
 	hdr->jsr_seq = jseg->js_seq;
 	hdr->jsr_oldest = jseg->js_oldseq;
 	hdr->jsr_cnt = jseg->js_cnt;
 	hdr->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
 	hdr->jsr_time = ump->um_fs->fs_mtime;
 	hdr->jsr_crc = 0;
 }
 
 /*
  * Common part of the add/remove reference records: copy the inode
  * reference fields into rec and point the reference's jsegdep at the
  * segment it is being written in.
  */
 static inline void
 inoref_write(inoref, jseg, rec)
 	struct inoref *inoref;
 	struct jseg *jseg;
 	struct jrefrec *rec;
 {
 
 	rec->jr_ino = inoref->if_ino;
 	rec->jr_parent = inoref->if_parent;
 	rec->jr_diroff = inoref->if_diroff;
 	rec->jr_nlink = inoref->if_nlink;
 	rec->jr_mode = inoref->if_mode;
 	inoref->if_jsegdep->jd_seg = jseg;
 }
 
 /*
  * Emit a JOP_ADDREF record for jaddref at data.
  */
 static void
 jaddref_write(jaddref, jseg, data)
 	struct jaddref *jaddref;
 	struct jseg *jseg;
 	uint8_t *data;
 {
 	struct jrefrec *ref = (struct jrefrec *)data;
 
 	ref->jr_op = JOP_ADDREF;
 	inoref_write(&jaddref->ja_ref, jseg, ref);
 }
 
 /*
  * Emit a JOP_REMREF record for jremref at data.
  */
 static void
 jremref_write(jremref, jseg, data)
 	struct jremref *jremref;
 	struct jseg *jseg;
 	uint8_t *data;
 {
 	struct jrefrec *ref = (struct jrefrec *)data;
 
 	ref->jr_op = JOP_REMREF;
 	inoref_write(&jremref->jr_ref, jseg, ref);
 }
 
 /*
  * Emit a JOP_MVREF record for jmvref at data.  The jseg argument is
  * not used by this record type.
  */
 static void
 jmvref_write(jmvref, jseg, data)
 	struct jmvref *jmvref;
 	struct jseg *jseg;
 	uint8_t *data;
 {
 	struct jmvrec *mv = (struct jmvrec *)data;
 
 	mv->jm_op = JOP_MVREF;
 	mv->jm_parent = jmvref->jm_parent;
 	mv->jm_ino = jmvref->jm_ino;
 	mv->jm_newoff = jmvref->jm_newoff;
 	mv->jm_oldoff = jmvref->jm_oldoff;
 }
 
 /*
  * Emit a JOP_NEWBLK record for jnewblk at data and point its jsegdep
  * at the segment it is being written in.
  */
 static void
 jnewblk_write(jnewblk, jseg, data)
 	struct jnewblk *jnewblk;
 	struct jseg *jseg;
 	uint8_t *data;
 {
 	struct jblkrec *blk = (struct jblkrec *)data;
 
 	blk->jb_op = JOP_NEWBLK;
 	blk->jb_ino = jnewblk->jn_ino;
 	blk->jb_lbn = jnewblk->jn_lbn;
 	blk->jb_blkno = jnewblk->jn_blkno;
 	blk->jb_oldfrags = jnewblk->jn_oldfrags;
 	blk->jb_frags = jnewblk->jn_frags;
 	jnewblk->jn_jsegdep->jd_seg = jseg;
 }
 
 /*
  * Emit a JOP_FREEBLK record for jfreeblk at data (oldfrags is always
  * written as 0) and point its jsegdep at the segment it is being
  * written in.
  */
 static void
 jfreeblk_write(jfreeblk, jseg, data)
 	struct jfreeblk *jfreeblk;
 	struct jseg *jseg;
 	uint8_t *data;
 {
 	struct jblkrec *blk = (struct jblkrec *)data;
 
 	blk->jb_op = JOP_FREEBLK;
 	blk->jb_ino = jfreeblk->jf_ino;
 	blk->jb_lbn = jfreeblk->jf_lbn;
 	blk->jb_blkno = jfreeblk->jf_blkno;
 	blk->jb_oldfrags = 0;
 	blk->jb_frags = jfreeblk->jf_frags;
 	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
 }
 
 /*
  * Emit a record for jfreefrag at data.  A freed fragment is journaled
  * with the same JOP_FREEBLK opcode as a freed block, with oldfrags 0.
  * The jsegdep is pointed at the segment it is being written in.
  */
 static void
 jfreefrag_write(jfreefrag, jseg, data)
 	struct jfreefrag *jfreefrag;
 	struct jseg *jseg;
 	uint8_t *data;
 {
 	struct jblkrec *blk = (struct jblkrec *)data;
 
 	blk->jb_op = JOP_FREEBLK;
 	blk->jb_ino = jfreefrag->fr_ino;
 	blk->jb_lbn = jfreefrag->fr_lbn;
 	blk->jb_blkno = jfreefrag->fr_blkno;
 	blk->jb_oldfrags = 0;
 	blk->jb_frags = jfreefrag->fr_frags;
 	jfreefrag->fr_jsegdep->jd_seg = jseg;
 }
 
 /*
  * Emit a JOP_TRUNC record for jtrunc at data and point its jsegdep at
  * the segment it is being written in.
  */
 static void
 jtrunc_write(jtrunc, jseg, data)
 	struct jtrunc *jtrunc;
 	struct jseg *jseg;
 	uint8_t *data;
 {
 	struct jtrncrec *trnc = (struct jtrncrec *)data;
 
 	trnc->jt_op = JOP_TRUNC;
 	trnc->jt_ino = jtrunc->jt_ino;
 	trnc->jt_extsize = jtrunc->jt_extsize;
 	trnc->jt_size = jtrunc->jt_size;
 	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
 }
 
 /*
  * Emit a JOP_SYNC record for jfsync at data.  It shares the jtrncrec
  * layout with truncation records; the jseg argument is not used.
  */
 static void
 jfsync_write(jfsync, jseg, data)
 	struct jfsync *jfsync;
 	struct jseg *jseg;
 	uint8_t *data;
 {
 	struct jtrncrec *trnc = (struct jtrncrec *)data;
 
 	trnc->jt_op = JOP_SYNC;
 	trnc->jt_ino = jfsync->jfs_ino;
 	trnc->jt_extsize = jfsync->jfs_extsize;
 	trnc->jt_size = jfsync->jfs_size;
 }
 
 /*
  * Push out every pending journal record of mp.  Keeps requesting a
  * segment and calling softdep_process_journal() with MNT_WAIT until
  * no records remain queued.  Does nothing on a mount without SUJ.
  */
 static void
 softdep_flushjournal(mp)
 	struct mount *mp;
 {
 	struct ufsmount *ump;
 	struct jblocks *jb;
 
 	if (MOUNTEDSUJ(mp) == 0)
 		return;
 
 	ump = VFSTOUFS(mp);
 	jb = ump->softdep_jblocks;
 	ACQUIRE_LOCK(ump);
 	while (ump->softdep_on_journal != 0) {
 		/* Force a segment write even for a partial block. */
 		jb->jb_needseg = 1;
 		softdep_process_journal(mp, NULL, MNT_WAIT);
 	}
 	FREE_LOCK(ump);
 }
 
 /* Forward declarations for the BIO_FLUSH issue and completion routines. */
 static void softdep_synchronize_completed(struct bio *);
 static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
 
 /*
  * Completion handler for the flush request issued by
  * softdep_synchronize().  bio_caller1 carries the last segment written
  * before the synchronize cache was issued; that segment and every
  * earlier one not yet marked COMPLETE are marked so, and deferred
  * journal processing is restarted from the oldest of them.  The bio
  * is destroyed in all cases.
  */
 static void
 softdep_synchronize_completed(bp)
 	struct bio *bp;
 {
 	struct ufsmount *ump;
 	struct jseg *cur;
 	struct jseg *first;
 
 	cur = bp->bio_caller1;
 	if (cur != NULL) {
 		ump = VFSTOUFS(cur->js_list.wk_mp);
 		ACQUIRE_LOCK(ump);
 		/*
 		 * Walk backwards, releasing every journal entry that was
 		 * waiting on the synchronize cache.
 		 */
 		first = NULL;
 		for (; cur != NULL && (cur->js_state & COMPLETE) == 0;
 		    cur = TAILQ_PREV(cur, jseglst, js_next)) {
 			cur->js_state |= COMPLETE;
 			first = cur;
 		}
 		/* Resume processing at the oldest segment just completed. */
 		if (first != NULL)
 			complete_jsegs(first);
 		FREE_LOCK(ump);
 	}
 	g_destroy_bio(bp);
 }
 
 /*
  * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
  * barriers.  The journal must be written prior to any blocks that depend
  * on it and the journal can not be released until the blocks have been
  * written.  This code handles both barriers simultaneously.
  *
  * bp is the bio to send, ump the target filesystem and caller1 the
  * value handed to softdep_synchronize_completed() through bio_caller1.
  */
 static void
 softdep_synchronize(bp, ump, caller1)
 	struct bio *bp;
 	struct ufsmount *ump;
 	void *caller1;
 {
 
 	/* An ordered, data-less flush positioned at the end of the media. */
 	bp->bio_cmd = BIO_FLUSH;
 	bp->bio_flags |= BIO_ORDERED;
 	bp->bio_offset = ump->um_cp->provider->mediasize;
 	bp->bio_length = 0;
 	bp->bio_data = NULL;
 
 	/* Completion hook and its argument. */
 	bp->bio_caller1 = caller1;
 	bp->bio_done = softdep_synchronize_completed;
 
 	g_io_request(bp, ump->um_cp);
 }
 
 /*
  * Flush some journal records to disk.
  *
  * Pending records are packed into journal segments (jsegs), each
  * backed by one buffer of at most fs_bsize bytes, and written to space
  * obtained from jblocks_alloc().  Every device block of a segment
  * starts with a segment header record.
  *
  * mp is the mount, needwk an optional work item the caller wants to
  * see written (NULL if none) and flags is MNT_WAIT when the caller is
  * prepared to block.  The per-filesystem lock is asserted on entry and
  * held on return, but is dropped and reacquired repeatedly in between.
  * Does nothing on a mount without SUJ.
  */
 static void
 softdep_process_journal(mp, needwk, flags)
 	struct mount *mp;
 	struct worklist *needwk;
 	int flags;
 {
 	struct jblocks *jblocks;
 	struct ufsmount *ump;
 	struct worklist *wk;
 	struct jseg *jseg;
 	struct buf *bp;
 	struct bio *bio;
 	uint8_t *data;
 	struct fs *fs;
 	int shouldflush;
 	int segwritten;
 	int jrecmin;	/* Minimum records per block. */
 	int jrecmax;	/* Maximum records per block. */
 	int size;
 	int cnt;
 	int off;
 	int devbsize;
 
 	if (MOUNTEDSUJ(mp) == 0)
 		return;
 	/* Sample the flush setting once for the whole call. */
 	shouldflush = softdep_flushcache;
 	bio = NULL;
 	jseg = NULL;
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
 	fs = ump->um_fs;
 	jblocks = ump->softdep_jblocks;
 	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
 	/*
 	 * We write anywhere between a disk block and fs block.  The upper
 	 * bound is picked to prevent buffer cache fragmentation and limit
 	 * processing time per I/O.
 	 */
 	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
 	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
 	segwritten = 0;
 	for (;;) {
 		cnt = ump->softdep_on_journal;
 		/*
 		 * Criteria for writing a segment:
 		 * 1) We have a full block.
 		 * 2) We're called from jwait() and haven't found the
 		 *    journal item yet.
 		 * 3) Always write if needseg is set.
 		 * 4) If we are called from process_worklist and have
 		 *    not yet written anything we write a partial block
 		 *    to enforce a 1 second maximum latency on journal
 		 *    entries.
 		 */
 		if (cnt < (jrecmax - 1) && needwk == NULL &&
 		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
 			break;
 		cnt++;
 		/*
 		 * Verify some free journal space.  softdep_prealloc() should
 		 * guarantee that we don't run out so this is indicative of
 		 * a problem with the flow control.  Try to recover
 		 * gracefully in any event.
 		 */
 		while (jblocks->jb_free == 0) {
 			if (flags != MNT_WAIT)
 				break;
 			printf("softdep: Out of journal space!\n");
 			softdep_speedup(ump);
 			msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
 		}
 		/* Allocate the segment and its buffer without the lock. */
 		FREE_LOCK(ump);
 		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
 		workitem_alloc(&jseg->js_list, D_JSEG, mp);
 		LIST_INIT(&jseg->js_entries);
 		LIST_INIT(&jseg->js_indirs);
 		jseg->js_state = ATTACHED;
 		/*
 		 * Without cache flushing the segment counts as COMPLETE
 		 * right away; with it, COMPLETE is set later by
 		 * softdep_synchronize_completed(), using the bio
 		 * allocated here (once per call).
 		 */
 		if (shouldflush == 0)
 			jseg->js_state |= COMPLETE;
 		else if (bio == NULL)
 			bio = g_alloc_bio();
 		jseg->js_jblocks = jblocks;
 		bp = geteblk(fs->fs_bsize, 0);
 		ACQUIRE_LOCK(ump);
 		/*
 		 * If there was a race while we were allocating the block
 		 * and jseg the entry we care about was likely written.
 		 * We bail out in both the WAIT and NOWAIT case and assume
 		 * the caller will loop if the entry it cares about is
 		 * not written.
 		 */
 		cnt = ump->softdep_on_journal;
 		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
 			bp->b_flags |= B_INVAL | B_NOCACHE;
 			WORKITEM_FREE(jseg, D_JSEG);
 			FREE_LOCK(ump);
 			brelse(bp);
 			ACQUIRE_LOCK(ump);
 			break;
 		}
 		/*
 		 * Calculate the disk block size required for the available
 		 * records rounded to the min size.
 		 */
 		if (cnt == 0)
 			size = devbsize;
 		else if (cnt < jrecmax)
 			size = howmany(cnt, jrecmin) * devbsize;
 		else
 			size = fs->fs_bsize;
 		/*
 		 * Allocate a disk block for this journal data and account
 		 * for truncation of the requested size if enough contiguous
 		 * space was not available.
 		 */
 		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
 		bp->b_lblkno = bp->b_blkno;
 		bp->b_offset = bp->b_blkno * DEV_BSIZE;
 		bp->b_bcount = size;
 		bp->b_flags &= ~B_INVAL;
 		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
 		/*
 		 * Initialize our jseg with cnt records.  Assign the next
 		 * sequence number to it and link it in-order.
 		 */
 		cnt = MIN(cnt, (size / devbsize) * jrecmin);
 		jseg->js_buf = bp;
 		jseg->js_cnt = cnt;
 		jseg->js_refs = cnt + 1;	/* Self ref. */
 		jseg->js_size = size;
 		jseg->js_seq = jblocks->jb_nextseq++;
 		if (jblocks->jb_oldestseg == NULL)
 			jblocks->jb_oldestseg = jseg;
 		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
 		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
 		if (jblocks->jb_writeseg == NULL)
 			jblocks->jb_writeseg = jseg;
 		/*
 		 * Start filling in records from the pending list.
 		 */
 		data = bp->b_data;
 		off = 0;
 
 		/*
 		 * Always put a header on the first block.
 		 * XXX As with below, there might not be a chance to get
 		 * into the loop.  Ensure that something valid is written.
 		 */
 		jseg_write(ump, jseg, data);
 		off += JREC_SIZE;
 		data = bp->b_data + off;
 
 		/*
 		 * XXX Something is wrong here.  There's no work to do,
 		 * but we need to perform an I/O and allow it to complete
 		 * anyways.
 		 */
 		if (LIST_EMPTY(&ump->softdep_journal_pending))
 			stat_emptyjblocks++;
 
 		/* Move up to cnt pending records into this segment. */
 		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
 		    != NULL) {
 			if (cnt == 0)
 				break;
 			/* Place a segment header on every device block. */
 			if ((off % devbsize) == 0) {
 				jseg_write(ump, jseg, data);
 				off += JREC_SIZE;
 				data = bp->b_data + off;
 			}
 			/* The item the caller is waiting for is going out. */
 			if (wk == needwk)
 				needwk = NULL;
 			remove_from_journal(wk);
 			wk->wk_state |= INPROGRESS;
 			WORKLIST_INSERT(&jseg->js_entries, wk);
 			/* Serialize the record according to its type. */
 			switch (wk->wk_type) {
 			case D_JADDREF:
 				jaddref_write(WK_JADDREF(wk), jseg, data);
 				break;
 			case D_JREMREF:
 				jremref_write(WK_JREMREF(wk), jseg, data);
 				break;
 			case D_JMVREF:
 				jmvref_write(WK_JMVREF(wk), jseg, data);
 				break;
 			case D_JNEWBLK:
 				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
 				break;
 			case D_JFREEBLK:
 				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
 				break;
 			case D_JFREEFRAG:
 				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
 				break;
 			case D_JTRUNC:
 				jtrunc_write(WK_JTRUNC(wk), jseg, data);
 				break;
 			case D_JFSYNC:
 				jfsync_write(WK_JFSYNC(wk), jseg, data);
 				break;
 			default:
 				panic("process_journal: Unknown type %s",
 				    TYPENAME(wk->wk_type));
 				/* NOTREACHED */
 			}
 			off += JREC_SIZE;
 			data = bp->b_data + off;
 			cnt--;
 		}
 
 		/* Clear any remaining space so we don't leak kernel data */
 		if (size > off)
 			bzero(data, size - off);
 
 		/*
 		 * Write this one buffer and continue.
 		 */
 		segwritten = 1;
 		jblocks->jb_needseg = 0;
 		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
 		FREE_LOCK(ump);
 		bp->b_xflags |= BX_CVTENXIO;
 		pbgetvp(ump->um_devvp, bp);
 		/*
 		 * We only do the blocking wait once we find the journal
 		 * entry we're looking for.
 		 */
 		if (needwk == NULL && flags == MNT_WAIT)
 			bwrite(bp);
 		else
 			bawrite(bp);
 		ACQUIRE_LOCK(ump);
 	}
 	/*
 	 * If we wrote a segment issue a synchronize cache so the journal
 	 * is reflected on disk before the data is written.  Since reclaiming
 	 * journal space also requires writing a journal record this
 	 * process also enforces a barrier before reclamation.
 	 */
 	if (segwritten && shouldflush) {
 		softdep_synchronize(bio, ump, 
 		    TAILQ_LAST(&jblocks->jb_segs, jseglst));
 	} else if (bio)
 		g_destroy_bio(bio);
 	/*
 	 * If we've suspended the filesystem because we ran out of journal
 	 * space either try to sync it here to make some progress or
 	 * unsuspend it if we already have.
 	 */
 	if (flags == 0 && jblocks->jb_suspended) {
 		if (journal_unsuspend(ump))
 			return;
 		FREE_LOCK(ump);
 		VFS_SYNC(mp, MNT_NOWAIT);
 		ffs_sbupdate(ump, MNT_WAIT, 0);
 		ACQUIRE_LOCK(ump);
 	}
 }
 
 /*
  * Complete a jseg, allowing all dependencies awaiting journal writes
  * to proceed.  Each journal dependency also attaches a jsegdep to dependent
  * structures so that the journal segment can be freed to reclaim space.
  *
  * Every work item on jseg->js_entries is unlinked, has INPROGRESS cleared
  * and COMPLETE set, and is then handed to the completion handler for its
  * type.  After the list is drained the jseg's self reference is released.
  */
 static void
 complete_jseg(jseg)
 	struct jseg *jseg;
 {
 	struct worklist *wk;
 	struct jmvref *jmvref;
 #ifdef INVARIANTS
 	/* Number of entries handled so far; only consumed by the KASSERT. */
 	int i = 0;
 #endif
 
 	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		wk->wk_state &= ~INPROGRESS;
 		wk->wk_state |= COMPLETE;
 		/*
 		 * Sanity check: the list may not hold more entries than
 		 * js_cnt.  NOTE(review): the messages in this function still
 		 * name handle_written_jseg rather than complete_jseg.
 		 */
 		KASSERT(i++ < jseg->js_cnt,
 		    ("handle_written_jseg: overflow %d >= %d",
 		    i - 1, jseg->js_cnt));
 		switch (wk->wk_type) {
 		case D_JADDREF:
 			handle_written_jaddref(WK_JADDREF(wk));
 			break;
 		case D_JREMREF:
 			handle_written_jremref(WK_JREMREF(wk));
 			break;
 		case D_JMVREF:
 			/*
 			 * A jmvref has no jsegdep, so the segment reference
 			 * is dropped here.  The jmvref is unlinked from its
 			 * jm_deps list, free_pagedep() is tried when the
 			 * pagedep is not on a worklist, and the item is freed.
 			 */
 			rele_jseg(jseg);	/* No jsegdep. */
 			jmvref = WK_JMVREF(wk);
 			LIST_REMOVE(jmvref, jm_deps);
 			if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
 				free_pagedep(jmvref->jm_pagedep);
 			WORKITEM_FREE(jmvref, D_JMVREF);
 			break;
 		case D_JNEWBLK:
 			handle_written_jnewblk(WK_JNEWBLK(wk));
 			break;
 		case D_JFREEBLK:
 			/* jfreeblk and jtrunc share the jblkdep handler. */
 			handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
 			break;
 		case D_JTRUNC:
 			handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
 			break;
 		case D_JFSYNC:
 			/* Like jmvref: release the segment and free the item. */
 			rele_jseg(jseg);	/* No jsegdep. */
 			WORKITEM_FREE(wk, D_JFSYNC);
 			break;
 		case D_JFREEFRAG:
 			handle_written_jfreefrag(WK_JFREEFRAG(wk));
 			break;
 		default:
 			panic("handle_written_jseg: Unknown type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 	/* Release the self reference so the structure may be freed. */
 	rele_jseg(jseg);
 }
 
 /*
  * Determine which jsegs are ready for completion processing.  Waits for
  * synchronize cache to complete as well as forcing in-order completion
  * of journal entries.
  */
 static void
 complete_jsegs(jseg)
 	struct jseg *jseg;
 {
 	struct jblocks *jblocks;
 	struct jseg *jsegn;
 
 	jblocks = jseg->js_jblocks;
 	/*
 	 * Don't allow out of order completions.  If this isn't the first
 	 * block wait for it to write before we're done.
 	 */
 	if (jseg != jblocks->jb_writeseg)
 		return;
 	/* Iterate through available jsegs processing their entries. */
 	while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
 		jblocks->jb_oldestwrseq = jseg->js_oldseq;
 		jsegn = TAILQ_NEXT(jseg, js_next);
 		complete_jseg(jseg);
 		jseg = jsegn;
 	}
 	jblocks->jb_writeseg = jseg;
 	/*
 	 * Attempt to free jsegs now that oldestwrseq may have advanced. 
 	 */
 	free_jsegs(jblocks);
 }
 
 /*
  * Called when the buffer holding a jseg has been written.  The jseg is
  * marked DEPCOMPLETE, the buffer is flagged to be thrown away, and the
  * final in-order completions are attempted.
  */
 static void
 handle_written_jseg(struct jseg *jseg, struct buf *bp)
 {
 
 	if (jseg->js_refs == 0) {
 		panic("handle_written_jseg: No self-reference on %p", jseg);
 	}
 	/*
 	 * The buffer is never needed again; flag it so that it is
 	 * discarded and detach it from its vnode.
 	 */
 	bp->b_flags |= B_INVAL | B_NOCACHE;
 	jseg->js_state |= DEPCOMPLETE;
 	pbrelvp(bp);
 	complete_jsegs(jseg);
 }
 
 /*
  * Detach and return the jsegdep held by an inoref.  The inoref's
  * if_jsegdep pointer is cleared, so the caller takes over the jsegdep.
  */
 static inline struct jsegdep *
 inoref_jseg(struct inoref *inoref)
 {
 	struct jsegdep *detached;
 
 	detached = inoref->if_jsegdep;
 	inoref->if_jsegdep = NULL;
 	return (detached);
 }
 
 /*
  * Called once a jremref has made it to stable store.  The jremref is
  * unlinked from its inodedep and dirrem, its jsegdep is handed to the
  * dirrem's jwork list, and the jremref is freed.  The dirrem is queued on
  * the worklist once it is COMPLETE and has no jremrefs left.  Any pagedep
  * writes sleeping on the jremref are expected to be awoken by
  * free_jremref.
  */
 static void
 handle_written_jremref(struct jremref *jremref)
 {
 	struct inodedep *inodedep;
 	struct jsegdep *jsegdep;
 	struct dirrem *dirrem;
 	int found;
 
 	/* Take over the jsegdep before the jremref goes away. */
 	jsegdep = inoref_jseg(&jremref->jr_ref);
 	/* Unlink the jremref from the inode's reference list. */
 	found = inodedep_lookup(jremref->jr_list.wk_mp,
 	    jremref->jr_ref.if_ino, 0, &inodedep);
 	if (found == 0)
 		panic("handle_written_jremref: Lost inodedep");
 	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
 	/* Detach from the dirrem and pass it the journal dependency. */
 	dirrem = jremref->jr_dirrem;
 	jremref->jr_dirrem = NULL;
 	LIST_REMOVE(jremref, jr_deps);
 	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
 	jwork_insert(&dirrem->dm_jwork, jsegdep);
 	if (LIST_EMPTY(&dirrem->dm_jremrefhd)) {
 		if ((dirrem->dm_state & COMPLETE) != 0)
 			add_to_worklist(&dirrem->dm_list, 0);
 	}
 	free_jremref(jremref);
 }
 
 /*
  * Called once a jaddref has made it to stable store.  The dependency is
  * marked complete and any dependent structures are added to the inode
  * bufwait list to be completed as soon as it is written.  If a bitmap write
  * depends on this entry we move the inode into the inodedephd of the
  * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
  *
  * Panics if the inodedep cannot be found, if no dependency is attached, or
  * if the attached dependency is neither a diradd nor a mkdir.
  */
 static void
 handle_written_jaddref(jaddref)
 	struct jaddref *jaddref;
 {
 	struct jsegdep *jsegdep;
 	struct inodedep *inodedep;
 	struct diradd *diradd;
 	struct mkdir *mkdir;
 
 	/* Grab the jsegdep. */
 	jsegdep = inoref_jseg(&jaddref->ja_ref);
 	mkdir = NULL;
 	diradd = NULL;
 	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
 	    0, &inodedep) == 0)
 		panic("handle_written_jaddref: Lost inodedep.");
 	if (jaddref->ja_diradd == NULL)
 		panic("handle_written_jaddref: No dependency");
 	/*
 	 * Work out what the jaddref is attached to.  A diradd or a
 	 * MKDIR_PARENT mkdir is queued on the inode's bufwait list; a
 	 * MKDIR_BODY mkdir is only remembered for the completion below.
 	 */
 	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
 		diradd = jaddref->ja_diradd;
 		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
 	} else if (jaddref->ja_state & MKDIR_PARENT) {
 		mkdir = jaddref->ja_mkdir;
 		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
 	} else if (jaddref->ja_state & MKDIR_BODY)
 		mkdir = jaddref->ja_mkdir;
 	else
 		panic("handle_written_jaddref: Unknown dependency %p",
 		    jaddref->ja_diradd);
 	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
 	/*
 	 * Remove us from the inode list.
 	 */
 	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
 	/*
 	 * The mkdir may be waiting on the jaddref to clear before freeing.
 	 */
 	if (mkdir) {
 		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
 		    ("handle_written_jaddref: Incorrect type for mkdir %s",
 		    TYPENAME(mkdir->md_list.wk_type)));
 		mkdir->md_jaddref = NULL;
 		/* In the mkdir cases the diradd comes from the mkdir. */
 		diradd = mkdir->md_diradd;
 		mkdir->md_state |= DEPCOMPLETE;
 		complete_mkdir(mkdir);
 	}
 	/* Hand the journal dependency over to the diradd. */
 	jwork_insert(&diradd->da_jwork, jsegdep);
 	/* For a NEWBLOCK jaddref the inodedep joins the bmsafemap's list. */
 	if (jaddref->ja_state & NEWBLOCK) {
 		inodedep->id_state |= ONDEPLIST;
 		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
 		    inodedep, id_deps);
 	}
 	/* free_jaddref() only frees once the jaddref is ALLCOMPLETE. */
 	free_jaddref(jaddref);
 }
 
 /*
  * Called once a jnewblk journal is written.  The allocdirect or allocindir
  * is placed in the bmsafemap to await notification of a written bitmap.  If
  * the operation was canceled we add the segdep to the appropriate
  * dependency to free the journal space once the canceling operation
  * completes.
  */
 static void
 handle_written_jnewblk(jnewblk)
 	struct jnewblk *jnewblk;
 {
 	struct bmsafemap *bmsafemap;
 	struct freefrag *freefrag;
 	struct freework *freework;
 	struct jsegdep *jsegdep;
 	struct newblk *newblk;
 
 	/* Grab the jsegdep. */
 	jsegdep = jnewblk->jn_jsegdep;
 	jnewblk->jn_jsegdep = NULL;
 	if (jnewblk->jn_dep == NULL) 
 		panic("handle_written_jnewblk: No dependency for the segdep.");
 	switch (jnewblk->jn_dep->wk_type) {
 	case D_NEWBLK:
 	case D_ALLOCDIRECT:
 	case D_ALLOCINDIR:
 		/*
 		 * Add the written block to the bmsafemap so it can
 		 * be notified when the bitmap is on disk.
 		 */
 		newblk = WK_NEWBLK(jnewblk->jn_dep);
 		newblk->nb_jnewblk = NULL;
 		if ((newblk->nb_state & GOINGAWAY) == 0) {
 			bmsafemap = newblk->nb_bmsafemap;
 			newblk->nb_state |= ONDEPLIST;
 			LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
 			    nb_deps);
 		}
 		jwork_insert(&newblk->nb_jwork, jsegdep);
 		break;
 	case D_FREEFRAG:
 		/*
 		 * A newblock being removed by a freefrag when replaced by
 		 * frag extension.
 		 */
 		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
 		freefrag->ff_jdep = NULL;
 		jwork_insert(&freefrag->ff_jwork, jsegdep);
 		break;
 	case D_FREEWORK:
 		/*
 		 * A direct block was removed by truncate.
 		 */
 		freework = WK_FREEWORK(jnewblk->jn_dep);
 		freework->fw_jnewblk = NULL;
 		jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
 		break;
 	default:
 		panic("handle_written_jnewblk: Unknown type %d.",
 		    jnewblk->jn_dep->wk_type);
 	}
 	jnewblk->jn_dep = NULL;
 	free_jnewblk(jnewblk);
 }
 
 /*
  * Cancel a jfreefrag that won't be needed, probably because it collided
  * with an in-flight allocation that has not yet been committed.  The
  * jfreefrag is divorced from its freefrag and freed, and the freefrag is
  * marked DEPCOMPLETE so that it may be added to the worklist.
  */
 static void
 cancel_jfreefrag(struct jfreefrag *jfreefrag)
 {
 	struct freefrag *owner;
 
 	/* Drop the journal segment dependency, if one is attached. */
 	if (jfreefrag->fr_jsegdep != NULL) {
 		free_jsegdep(jfreefrag->fr_jsegdep);
 		jfreefrag->fr_jsegdep = NULL;
 	}
 	/* free_jfreefrag() insists on the freefrag link being cleared. */
 	owner = jfreefrag->fr_freefrag;
 	jfreefrag->fr_freefrag = NULL;
 	free_jfreefrag(jfreefrag);
 	owner->ff_state |= DEPCOMPLETE;
 	CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", owner->ff_blkno);
 }
 
 /*
  * Free a jfreefrag when the parent freefrag is rendered obsolete.  The
  * jfreefrag is first taken off whichever list it is on and must already
  * be detached from its freefrag.
  */
 static void
 free_jfreefrag(struct jfreefrag *jfreefrag)
 {
 
 	if ((jfreefrag->fr_state & INPROGRESS) != 0) {
 		/* Being written: it sits on a jseg's entry list. */
 		WORKLIST_REMOVE(&jfreefrag->fr_list);
 	} else if ((jfreefrag->fr_state & ONWORKLIST) != 0) {
 		/* Not yet written: it is still queued for the journal. */
 		remove_from_journal(&jfreefrag->fr_list);
 	}
 	if (jfreefrag->fr_freefrag != NULL) {
 		panic("free_jfreefrag:  Still attached to a freefrag.");
 	}
 	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
 }
 
 /*
  * Called when the journal write for a jfreefrag completes.  The jsegdep
  * moves to the parent freefrag, which is marked DEPCOMPLETE and added to
  * the worklist if that completes its dependencies.  The jfreefrag itself
  * is then freed.
  */
 static void
 handle_written_jfreefrag(struct jfreefrag *jfreefrag)
 {
 	struct freefrag *owner;
 	struct jsegdep *jsegdep;
 
 	/* Take over the jsegdep. */
 	jsegdep = jfreefrag->fr_jsegdep;
 	jfreefrag->fr_jsegdep = NULL;
 	owner = jfreefrag->fr_freefrag;
 	if (owner == NULL) {
 		panic("handle_written_jfreefrag: No freefrag.");
 	}
 	owner->ff_jdep = NULL;
 	owner->ff_state |= DEPCOMPLETE;
 	jwork_insert(&owner->ff_jwork, jsegdep);
 	if ((owner->ff_state & ALLCOMPLETE) == ALLCOMPLETE) {
 		add_to_worklist(&owner->ff_list, 0);
 	}
 	/* Detach before freeing; free_jfreefrag() panics otherwise. */
 	jfreefrag->fr_freefrag = NULL;
 	free_jfreefrag(jfreefrag);
 }
 
 /*
  * Called when the journal write for a jblkdep (a jfreeblk or a jtrunc)
  * completes.  The jblkdep is removed from the freeblks list of pending
  * journal writes and its jsegdep is moved to the freeblks jwork list, to
  * be completed when all blocks have been reclaimed.
  */
 static void
 handle_written_jblkdep(struct jblkdep *jblkdep)
 {
 	struct freeblks *freeblks;
 	struct jsegdep *jsegdep;
 
 	freeblks = jblkdep->jb_freeblks;
 	/* Take over the jsegdep. */
 	jsegdep = jblkdep->jb_jsegdep;
 	jblkdep->jb_jsegdep = NULL;
 	LIST_REMOVE(jblkdep, jb_deps);
 	jwork_insert(&freeblks->fb_jwork, jsegdep);
 	/*
 	 * Once nothing is left to journal and the freeblks is otherwise
 	 * complete, it can go on the worklist.
 	 */
 	if (LIST_EMPTY(&freeblks->fb_jblkdephd)) {
 		if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
 			add_to_worklist(&freeblks->fb_list, WK_NODELAY);
 	}
 
 	free_jblkdep(jblkdep);
 }
 
 /*
  * Allocate a jsegdep on the same mount as the given work item.  The new
  * jsegdep is not yet tied to any journal segment (jd_seg is NULL).
  */
 static struct jsegdep *
 newjsegdep(struct worklist *wk)
 {
 	struct jsegdep *segdep;
 
 	segdep = malloc(sizeof(*segdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
 	workitem_alloc(&segdep->jd_list, D_JSEGDEP, wk->wk_mp);
 	segdep->jd_seg = NULL;
 	return (segdep);
 }
 
 /*
  * Allocate a jmvref recording that the entry for ino in directory dp
  * moves from offset oldoff to offset newoff.  The new item starts out
  * ATTACHED | DEPCOMPLETE.
  */
 static struct jmvref *
 newjmvref(struct inode *dp, ino_t ino, off_t oldoff, off_t newoff)
 {
 	struct jmvref *mvref;
 
 	mvref = malloc(sizeof(*mvref), M_JMVREF, M_SOFTDEP_FLAGS);
 	workitem_alloc(&mvref->jm_list, D_JMVREF, ITOVFS(dp));
 	mvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
 	mvref->jm_ino = ino;
 	mvref->jm_parent = dp->i_number;
 	mvref->jm_newoff = newoff;
 	mvref->jm_oldoff = oldoff;
 	return (mvref);
 }
 
 /*
  * Allocate a new jremref that tracks the removal of ip from dp with the
  * directory entry offset of diroff.  The entry is created in the ATTACHED
  * state and is tied to the supplied dirrem.  The caller is responsible
  * for linking the jremref into the pagedep and adding it to the journal
  * to write.
  *
  * NOTE(review): MKDIR_PARENT is not set here; callers doing a DOTDOT
  * reference presumably set it so that handle_workitem_remove() can assign
  * the jsegdep properly.
  */
 static struct jremref *
 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
     off_t diroff, nlink_t nlink)
 {
 	struct jremref *remref;
 
 	remref = malloc(sizeof(*remref), M_JREMREF, M_SOFTDEP_FLAGS);
 	workitem_alloc(&remref->jr_list, D_JREMREF, ITOVFS(dp));
 	remref->jr_state = ATTACHED;
 	newinoref(&remref->jr_ref, ip->i_number, dp->i_number, diroff,
 	    nlink, ip->i_mode);
 	remref->jr_dirrem = dirrem;
 	return (remref);
 }
 
 /*
  * Fill in the common inode reference part of a jaddref or jremref and
  * give it a fresh jsegdep.  The embedded work item (if_list) must already
  * have been set up, since newjsegdep() reads its mount point.
  */
 static inline void
 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
     nlink_t nlink, uint16_t mode)
 {
 
 	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
 	inoref->if_ino = ino;
 	inoref->if_parent = parent;
 	inoref->if_diroff = diroff;
 	inoref->if_mode = mode;
 	inoref->if_nlink = nlink;
 }
 
 /*
  * Allocate a new jaddref to track the addition of ino to dp at diroff.
  * The directory offset may not be known until later; the caller is
  * responsible for adding the entry to the journal once it is.  nlink
  * should be the link count prior to the addition, and mode is only
  * required to have the correct FMT.  The entry starts out ATTACHED with
  * no mkdir associated.
  */
 static struct jaddref *
 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
     uint16_t mode)
 {
 	struct jaddref *addref;
 
 	addref = malloc(sizeof(*addref), M_JADDREF, M_SOFTDEP_FLAGS);
 	workitem_alloc(&addref->ja_list, D_JADDREF, ITOVFS(dp));
 	addref->ja_state = ATTACHED;
 	addref->ja_mkdir = NULL;
 	newinoref(&addref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
 	return (addref);
 }
 
 /*
  * Create a new free dependency pointing at a freework.  Reference counts
  * are not touched here; the caller adjusts them once it holds the lock.
  * The freedep tracks an outstanding bitmap write that will ultimately
  * clear the freework to continue.
  */
 static struct freedep *
 newfreedep(struct freework *freework)
 {
 	struct freedep *dep;
 
 	dep = malloc(sizeof(*dep), M_FREEDEP, M_SOFTDEP_FLAGS);
 	workitem_alloc(&dep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
 	dep->fd_freework = freework;
 	return (dep);
 }
 
 /*
  * Free a freedep structure once the buffer it is linked to is written.
  * The freeblks cgwait count and the freework reference count are both
  * decremented; if this was the last reference to the freework it is
  * scheduled for completion.
  */
 static void
 free_freedep(struct freedep *freedep)
 {
 	struct freework *owner;
 
 	owner = freedep->fd_freework;
 	owner->fw_freeblks->fb_cgwait--;
 	owner->fw_ref--;
 	if (owner->fw_ref == 0) {
 		freework_enqueue(owner);
 	}
 	WORKITEM_FREE(freedep, D_FREEDEP);
 }
 
 /*
  * Allocate a new freework structure.  With a non-NULL parent it is a
  * level in an indirect; with a NULL parent it is a top level block, in
  * which case it is also linked onto the freeblks (taking a reference)
  * under the per-filesystem lock.  The top level freework structures are
  * allocated without that lock held and before the freeblks is visible
  * outside of softdep_setup_freeblocks().  When journal is non-zero a
  * jfreeblk is created for the block as well.
  */
 static struct freework *
 newfreework(struct ufsmount *ump, struct freeblks *freeblks,
     struct freework *parent, ufs_lbn_t lbn, ufs2_daddr_t nb, int frags,
     int off, int journal)
 {
 	struct freework *freework;
 
 	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
 	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
 	freework->fw_state = ATTACHED;
 	freework->fw_jnewblk = NULL;
 	freework->fw_indir = NULL;
 	freework->fw_freeblks = freeblks;
 	freework->fw_parent = parent;
 	freework->fw_lbn = lbn;
 	freework->fw_blkno = nb;
 	freework->fw_frags = frags;
 	/*
 	 * No references are preallocated without SUJ or when lbn is at or
 	 * above -UFS_NXADDR; otherwise reserve NINDIR + 1 of them.
 	 */
 	if (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -UFS_NXADDR)
 		freework->fw_ref = 0;
 	else
 		freework->fw_ref = NINDIR(ump->um_fs) + 1;
 	freework->fw_off = off;
 	freework->fw_start = freework->fw_off;
 	if (journal != 0)
 		newjfreeblk(freeblks, lbn, nb, frags);
 	if (parent == NULL) {
 		ACQUIRE_LOCK(ump);
 		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
 		freeblks->fb_ref++;
 		FREE_LOCK(ump);
 	}
 
 	return (freework);
 }
 
 /*
  * Eliminate a jfreeblk for a block that does not need journaling.  The
  * freeblks list of pending journal dependencies is searched for a
  * jfreeblk covering blkno; when found, its jsegdep is released and the
  * jfreeblk is unlinked and freed.  Nothing is done if there is no match.
  */
 static void
 cancel_jfreeblk(struct freeblks *freeblks, ufs2_daddr_t blkno)
 {
 	struct jfreeblk *jfreeblk;
 	struct jblkdep *jblkdep;
 
 	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
 		/* The list also carries jtruncs; only jfreeblks qualify. */
 		if (jblkdep->jb_list.wk_type == D_JFREEBLK) {
 			jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
 			if (jfreeblk->jf_blkno == blkno)
 				break;
 		}
 	}
 	/* The iterator is NULL when the search ran off the end. */
 	if (jblkdep == NULL)
 		return;
 	CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
 	free_jsegdep(jblkdep->jb_jsegdep);
 	LIST_REMOVE(jblkdep, jb_deps);
 	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
 }
 
 /*
  * Allocate a new jfreeblk to journal a top level block pointer when
  * truncating a file.  The jfreeblk receives its own jsegdep and is put at
  * the head of the freeblks list of journal dependencies.  The caller must
  * add it to the worklist when the per-filesystem lock is held.
  */
 static struct jfreeblk *
 newjfreeblk(struct freeblks *freeblks, ufs_lbn_t lbn, ufs2_daddr_t blkno,
     int frags)
 {
 	struct jfreeblk *freeblk;
 
 	freeblk = malloc(sizeof(*freeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
 	workitem_alloc(&freeblk->jf_dep.jb_list, D_JFREEBLK,
 	    freeblks->fb_list.wk_mp);
 	freeblk->jf_dep.jb_jsegdep = newjsegdep(&freeblk->jf_dep.jb_list);
 	freeblk->jf_dep.jb_freeblks = freeblks;
 	freeblk->jf_ino = freeblks->fb_inum;
 	freeblk->jf_blkno = blkno;
 	freeblk->jf_lbn = lbn;
 	freeblk->jf_frags = frags;
 	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &freeblk->jf_dep, jb_deps);
 	return (freeblk);
 }
 
 /*
  * The journal is only prepared to handle full-size block numbers, so we
  * have to adjust the record to reflect the change to a full-size block.
  * For example, suppose we have a block made up of fragments 8-15 and
  * want to free its last two fragments. We are given a request that says:
  *     FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
  * where frags are the number of fragments to free and oldfrags are the
  * number of fragments to keep. To block align it, we have to change it to
  * have a valid full-size blkno, so it becomes:
  *     FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
  */
 static void
 adjust_newfreework(freeblks, frag_offset)
 	struct freeblks *freeblks;
 	int frag_offset;
 {
 	struct jfreeblk *jfreeblk;
 
 	KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL &&
 	    LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
 	    ("adjust_newfreework: Missing freeblks dependency"));
 
 	jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
 	jfreeblk->jf_blkno -= frag_offset;
 	jfreeblk->jf_frags += frag_offset;
 }
 
 /*
  * Allocate a new jtrunc to track a partial truncation of the freeblks
  * inode to the given size and extsize.  The jtrunc receives its own
  * jsegdep and is put at the head of the freeblks list of journal
  * dependencies.
  */
 static struct jtrunc *
 newjtrunc(struct freeblks *freeblks, off_t size, int extsize)
 {
 	struct jtrunc *trunc;
 
 	trunc = malloc(sizeof(*trunc), M_JTRUNC, M_SOFTDEP_FLAGS);
 	workitem_alloc(&trunc->jt_dep.jb_list, D_JTRUNC,
 	    freeblks->fb_list.wk_mp);
 	trunc->jt_dep.jb_jsegdep = newjsegdep(&trunc->jt_dep.jb_list);
 	trunc->jt_dep.jb_freeblks = freeblks;
 	trunc->jt_ino = freeblks->fb_inum;
 	trunc->jt_extsize = extsize;
 	trunc->jt_size = size;
 	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &trunc->jt_dep, jb_deps);
 	return (trunc);
 }
 
 /*
  * If we're canceling a new bitmap we have to search for another ref
  * to move into the bmsafemap dep.  This might be better expressed
  * with another structure.
  *
  * The search covers the inorefs that follow jaddref on its list and
  * takes the first one that is itself a jaddref.  If none is found the
  * function returns without changing anything.
  */
 static void
 move_newblock_dep(jaddref, inodedep)
 	struct jaddref *jaddref;
 	struct inodedep *inodedep;
 {
 	struct inoref *inoref;
 	struct jaddref *jaddrefn;
 
 	jaddrefn = NULL;
 	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
 	    inoref = TAILQ_NEXT(inoref, if_deps)) {
 		/* Only a NEWBLOCK jaddref may pick a successor. */
 		if ((jaddref->ja_state & NEWBLOCK) &&
 		    inoref->if_list.wk_type == D_JADDREF) {
 			jaddrefn = (struct jaddref *)inoref;
 			break;
 		}
 	}
 	if (jaddrefn == NULL)
 		return;
 	/*
 	 * Transfer ATTACHED, UNDONE and NEWBLOCK from the canceled jaddref
 	 * to its successor.  The canceled jaddref is then left with none
 	 * of the three set except ATTACHED.
 	 */
 	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
 	jaddrefn->ja_state |= jaddref->ja_state &
 	    (ATTACHED | UNDONE | NEWBLOCK);
 	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
 	jaddref->ja_state |= ATTACHED;
 	/* Swap the two jaddrefs on the bmsafemap's jaddref list. */
 	LIST_REMOVE(jaddref, ja_bmdeps);
 	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
 	    ja_bmdeps);
 }
 
 /*
  * Cancel a jaddref either before it has been written or while it is being
  * written.  This happens when a link is removed before the add reaches
  * the disk.  The jaddref dependency is kept linked into the bmsafemap
  * and inode to prevent the link count or bitmap from reaching the disk
  * until handle_workitem_remove() re-adjusts the counts and bitmaps as
  * required.
  *
  * inodedep may be passed as NULL, in which case it is looked up from the
  * jaddref.  wkhd is the work list that receives the jsegdep (when the
  * write is in progress) and the jaddref itself (when it is NEWBLOCK).
  *
  * Returns 1 if the canceled addref requires journaling of the remove and
  * 0 otherwise.
  */
 static int
 cancel_jaddref(jaddref, inodedep, wkhd)
 	struct jaddref *jaddref;
 	struct inodedep *inodedep;
 	struct workhead *wkhd;
 {
 	struct inoref *inoref;
 	struct jsegdep *jsegdep;
 	int needsj;
 
 	KASSERT((jaddref->ja_state & COMPLETE) == 0,
 	    ("cancel_jaddref: Canceling complete jaddref"));
 	/* The remove must be journaled once the add has started to write. */
 	if (jaddref->ja_state & (INPROGRESS | COMPLETE))
 		needsj = 1;
 	else
 		needsj = 0;
 	if (inodedep == NULL)
 		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
 		    0, &inodedep) == 0)
 			panic("cancel_jaddref: Lost inodedep");
 	/*
 	 * We must adjust the nlink of any reference operation that follows
 	 * us so that it is consistent with the in-memory reference.  This
 	 * ensures that inode nlink rollbacks always have the correct link.
 	 */
 	if (needsj == 0) {
 		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
 		    inoref = TAILQ_NEXT(inoref, if_deps)) {
 			/* Stop at the first reference already going away. */
 			if (inoref->if_state & GOINGAWAY)
 				break;
 			inoref->if_nlink--;
 		}
 	}
 	jsegdep = inoref_jseg(&jaddref->ja_ref);
 	if (jaddref->ja_state & NEWBLOCK)
 		move_newblock_dep(jaddref, inodedep);
 	wake_worklist(&jaddref->ja_list);
 	jaddref->ja_mkdir = NULL;
 	/*
 	 * A write in progress keeps its jsegdep, which moves to wkhd.
 	 * Otherwise the jsegdep is released and a DEPCOMPLETE jaddref is
 	 * taken off the journal's pending list.
 	 */
 	if (jaddref->ja_state & INPROGRESS) {
 		jaddref->ja_state &= ~INPROGRESS;
 		WORKLIST_REMOVE(&jaddref->ja_list);
 		jwork_insert(wkhd, jsegdep);
 	} else {
 		free_jsegdep(jsegdep);
 		if (jaddref->ja_state & DEPCOMPLETE)
 			remove_from_journal(&jaddref->ja_list);
 	}
 	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
 	/*
 	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
 	 * can arrange for them to be freed with the bitmap.  Otherwise we
 	 * no longer need this addref attached to the inoreflst and it
 	 * will incorrectly adjust nlink if we leave it.
 	 */
 	if ((jaddref->ja_state & NEWBLOCK) == 0) {
 		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
 		    if_deps);
 		jaddref->ja_state |= COMPLETE;
 		free_jaddref(jaddref);
 		return (needsj);
 	}
 	/*
 	 * Leave the head of the list for jsegdeps for fast merging.
 	 */
 	if (LIST_FIRST(wkhd) != NULL) {
 		/* Insert by hand behind the head, so set ONWORKLIST here. */
 		jaddref->ja_state |= ONWORKLIST;
 		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
 	} else
 		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
 
 	return (needsj);
 }
 
 /*
  * Attempt to free a jaddref structure when some work completes.  Nothing
  * happens until the jaddref is ALLCOMPLETE, which should only be the case
  * once the entry is written and all dependencies have been notified.  At
  * that point it must no longer hold a jsegdep, be on a work list, or have
  * a mkdir attached.
  */
 static void
 free_jaddref(struct jaddref *jaddref)
 {
 
 	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
 		return;
 	if (jaddref->ja_ref.if_jsegdep != NULL) {
 		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
 		    jaddref, jaddref->ja_state);
 	}
 	/* A NEWBLOCK jaddref is still linked on a bmsafemap list. */
 	if ((jaddref->ja_state & NEWBLOCK) != 0) {
 		LIST_REMOVE(jaddref, ja_bmdeps);
 	}
 	if ((jaddref->ja_state & (INPROGRESS | ONWORKLIST)) != 0) {
 		panic("free_jaddref: Bad state %p(0x%X)",
 		    jaddref, jaddref->ja_state);
 	}
 	if (jaddref->ja_mkdir != NULL) {
 		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
 	}
 	WORKITEM_FREE(jaddref, D_JADDREF);
 }
 
 /*
  * Free a jremref structure once it has been written or discarded.
  */
 static void
 free_jremref(jremref)
 	struct jremref *jremref;
 {
 
 	if (jremref->jr_ref.if_jsegdep)
 		free_jsegdep(jremref->jr_ref.if_jsegdep);
 	if (jremref->jr_state & INPROGRESS)
 		panic("free_jremref: IO still pending");
 	WORKITEM_FREE(jremref, D_JREMREF);
 }
 
 /*
  * Free a jnewblk structure.  Nothing is done until the jnewblk is
  * ALLCOMPLETE.  It is then unlinked from its jn_deps list and must no
  * longer have a dependency attached.
  */
 static void
 free_jnewblk(struct jnewblk *jnewblk)
 {
 
 	if ((jnewblk->jn_state & ALLCOMPLETE) == ALLCOMPLETE) {
 		LIST_REMOVE(jnewblk, jn_deps);
 		if (jnewblk->jn_dep != NULL)
 			panic("free_jnewblk: Dependency still attached.");
 		WORKITEM_FREE(jnewblk, D_JNEWBLK);
 	}
 }
 
 /*
  * Cancel a jnewblk which has been made redundant by frag extension.  The
  * jnewblk is detached from its dependency, marked GOINGAWAY and moved
  * onto wkhd.  If its journal write is in progress the jsegdep goes onto
  * wkhd as well; otherwise the jsegdep is released and the jnewblk is
  * pulled off the journal's pending list.
  */
 static void
 cancel_jnewblk(struct jnewblk *jnewblk, struct workhead *wkhd)
 {
 	struct jsegdep *jsegdep;
 
 	CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
 	jsegdep = jnewblk->jn_jsegdep;
 	if (jsegdep == NULL || jnewblk->jn_dep == NULL)
 		panic("cancel_jnewblk: Invalid state");
 	jnewblk->jn_dep = NULL;
 	jnewblk->jn_jsegdep = NULL;
 	jnewblk->jn_state |= GOINGAWAY;
 	if ((jnewblk->jn_state & INPROGRESS) == 0) {
 		free_jsegdep(jsegdep);
 		remove_from_journal(&jnewblk->jn_list);
 	} else {
 		jnewblk->jn_state &= ~INPROGRESS;
 		WORKLIST_REMOVE(&jnewblk->jn_list);
 		jwork_insert(wkhd, jsegdep);
 	}
 	wake_worklist(&jnewblk->jn_list);
 	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
 }
 
 /*
  * Free a jblkdep using the work item type it was allocated with.  Only
  * jfreeblk and jtrunc items are expected; any other type panics.
  */
 static void
 free_jblkdep(struct jblkdep *jblkdep)
 {
 
 	switch (jblkdep->jb_list.wk_type) {
 	case D_JFREEBLK:
 		WORKITEM_FREE(jblkdep, D_JFREEBLK);
 		break;
 	case D_JTRUNC:
 		WORKITEM_FREE(jblkdep, D_JTRUNC);
 		break;
 	default:
 		panic("free_jblkdep: Unexpected type %s",
 		    TYPENAME(jblkdep->jb_list.wk_type));
 	}
 }
 
 /*
  * Free a single jseg once it is no longer referenced in memory or on
  * disk.  The journal blocks it occupied are reclaimed, along with the
  * dependencies that were waiting for the segment to disappear.
  */
 static void
 free_jseg(struct jseg *jseg, struct jblocks *jblocks)
 {
 	struct freework *freework;
 
 	/*
 	 * Release the freework structures that were lingering to indicate
 	 * freed indirect blocks that forced journal write ordering on
 	 * reallocate.
 	 */
 	for (;;) {
 		freework = LIST_FIRST(&jseg->js_indirs);
 		if (freework == NULL)
 			break;
 		indirblk_remove(freework);
 	}
 	/* Keep jb_oldestseg valid if it points at the dying segment. */
 	if (jblocks->jb_oldestseg == jseg) {
 		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
 	}
 	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
 	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
 	KASSERT(LIST_EMPTY(&jseg->js_entries),
 	    ("free_jseg: Freed jseg has valid entries."));
 	WORKITEM_FREE(jseg, D_JSEG);
 }
 
 /*
  * Free all jsegs that meet the criteria for being reclaimed and update
  * oldestseg.
  *
  * Segments are only ever freed from the front of jb_segs.  On return
  * jb_oldestseg points at the oldest segment that is still referenced, or
  * is NULL if there is none.  jb_needseg is set when no segment is
  * referenced but some are still queued.
  */
 static void
 free_jsegs(jblocks)
 	struct jblocks *jblocks;
 {
 	struct jseg *jseg;
 
 	/*
 	 * Free only those jsegs which have none allocated before them to
 	 * preserve the journal space ordering.
 	 */
 	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
 		/*
 		 * Only reclaim space when nothing depends on this journal
 		 * set and another set has written that it is no longer
 		 * valid.
 		 */
 		if (jseg->js_refs != 0) {
 			/*
 			 * The head segment is still referenced, so it is
 			 * the oldest valid one.  Returning here skips the
 			 * search and the jb_needseg check below.
 			 */
 			jblocks->jb_oldestseg = jseg;
 			return;
 		}
 		/* Unreferenced, but not fully complete yet. */
 		if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
 			break;
 		/* Newer than the oldest sequence known to be written. */
 		if (jseg->js_seq > jblocks->jb_oldestwrseq)
 			break;
 		/*
 		 * We can free jsegs that didn't write entries when
 		 * oldestwrseq == js_seq.
 		 */
 		if (jseg->js_seq == jblocks->jb_oldestwrseq &&
 		    jseg->js_cnt != 0)
 			break;
 		free_jseg(jseg, jblocks);
 	}
 	/*
 	 * If we exited the loop above we still must discover the
 	 * oldest valid segment.
 	 */
 	if (jseg)
 		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
 		     jseg = TAILQ_NEXT(jseg, js_next))
 			if (jseg->js_refs != 0)
 				break;
 	/* jseg is NULL here when no referenced segment was found. */
 	jblocks->jb_oldestseg = jseg;
 	/*
 	 * The journal has no valid records but some jsegs may still be
 	 * waiting on oldestwrseq to advance.  We force a small record
 	 * out to permit these lingering records to be reclaimed.
 	 */
 	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
 		jblocks->jb_needseg = 1;
 }
 
 /*
  * Release one reference to a jseg.  When the count reaches zero,
  * free_jsegs() is run on the owning jblocks so that this segment, and any
  * others that have become reclaimable, can be freed.  This should
  * eventually reclaim journal space as well.
  *
  * The assertion message names this function, so that a failure points at
  * the reference drop rather than at free_jseg().
  */
 static void
 rele_jseg(struct jseg *jseg)
 {
 
 	KASSERT(jseg->js_refs > 0,
 	    ("rele_jseg: Invalid refcnt %d", jseg->js_refs));
 	if (--jseg->js_refs != 0)
 		return;
 	free_jsegs(jseg->js_jblocks);
 }
 
 /*
  * Release a jsegdep and decrement the jseg count.
  */
 static void
 free_jsegdep(jsegdep)
 	struct jsegdep *jsegdep;
 {
 
 	if (jsegdep->jd_seg)
 		rele_jseg(jsegdep->jd_seg);
 	WORKITEM_FREE(jsegdep, D_JSEGDEP);
 }
 
 /*
  * Wait for a journal item to make it to disk, initiating journal
  * processing if required.  The per-filesystem lock must be held.
  *
  * Returns 0 when called with MNT_WAIT and EBUSY for any other waitfor
  * value.
  */
 static int
 jwait(struct worklist *wk, int waitfor)
 {
 	int blocking;
 
 	LOCK_OWNED(VFSTOUFS(wk->wk_mp));
 	blocking = (waitfor == MNT_WAIT);
 	/*
 	 * Blocking journal waits cause slow synchronous behavior, so keep
 	 * statistics on how often each kind of item forces one.
 	 */
 	if (blocking) {
 		stat_journal_wait++;
 		switch (wk->wk_type) {
 		case D_JREMREF:
 		case D_JMVREF:
 			stat_jwait_filepage++;
 			break;
 		case D_JTRUNC:
 		case D_JFREEBLK:
 			stat_jwait_freeblks++;
 			break;
 		case D_JNEWBLK:
 			stat_jwait_newblk++;
 			break;
 		case D_JADDREF:
 			stat_jwait_inode++;
 			break;
 		default:
 			break;
 		}
 	}
 	if ((wk->wk_state & INPROGRESS) == 0) {
 		/*
 		 * IO has not started, so process the journal.  The item
 		 * cannot be marked IOWAITING because the lock is dropped
 		 * while the journal is processed and the item may be freed
 		 * after this point; wk must not be touched again.  The
 		 * caller may call back in and re-issue the request.
 		 */
 		softdep_process_journal(wk->wk_mp, wk, waitfor);
 	} else if (blocking) {
 		/* The write is already under way; sleep until it is done. */
 		wait_worklist(wk, "jwait");
 	}
 	return (blocking ? 0 : EBUSY);
 }
 
 /*
  * Look up (allocating if necessary) the inodedep for an inode and set
  * its nlinkdelta to the difference between i_nlink and i_effnlink.  This
  * is a convenience function to reduce duplicate code in the setup and
  * revert functions below.
  */
 static struct inodedep *
 inodedep_lookup_ip(struct inode *ip)
 {
 	struct inodedep *dep;
 
 	KASSERT(ip->i_nlink >= ip->i_effnlink,
 	    ("inodedep_lookup_ip: bad delta"));
 	/* The return value is not needed when DEPALLOC is requested. */
 	(void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC, &dep);
 	dep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 	KASSERT((dep->id_state & UNLINKED) == 0, ("inode unlinked"));
 	return (dep);
 }
 
 /*
  * Called before a newly created inode is linked into a directory.  The
  * inodedep lookup refreshes nlinkdelta.  On a journaled mount the
  * jaddref is expected to have been queued already (by
  * softdep_setup_inomapdep); it is fetched from the tail of the inode's
  * reference list only to assert that it exists and names dp as parent.
  */
 void
 softdep_setup_create(dp, ip)
 	struct inode *dp;
 	struct inode *ip;
 {
 	struct inodedep *idp;
 	struct jaddref *lastref;
 	struct ufsmount *ump;
 	struct vnode *dvp;
 
 	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_setup_create called on non-softdep filesystem"));
 	KASSERT(ip->i_nlink == 1,
 	    ("softdep_setup_create: Invalid link count."));
 	ump = ITOUMP(dp);
 	dvp = ITOV(dp);
 
 	ACQUIRE_LOCK(ump);
 	idp = inodedep_lookup_ip(ip);
 	if (DOINGSUJ(dvp)) {
 		lastref = (struct jaddref *)TAILQ_LAST(&idp->id_inoreflst,
 		    inoreflst);
 		KASSERT(lastref != NULL && lastref->ja_parent == dp->i_number,
 		    ("softdep_setup_create: No addref structure present."));
 	}
 	softdep_prelink(dvp, NULL);
 	FREE_LOCK(ump);
 }
 
 /*
  * Set up tracking for the new DOTDOT link created when an inode is
  * reparented by a rename.  On a journaled mount a jaddref is allocated
  * and appended to the parent's reference list, where
  * softdep_setup_directory_change will find it.  The inodedep lookup
  * refreshes nlinkdelta for non-journaling softdep.
  */
 void
 softdep_setup_dotdot_link(dp, ip)
 	struct inode *dp;
 	struct inode *ip;
 {
 	struct inodedep *idp;
 	struct jaddref *newref;
 	struct ufsmount *ump;
 	struct vnode *dvp;
 
 	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_setup_dotdot_link called on non-softdep filesystem"));
 	ump = ITOUMP(dp);
 	dvp = ITOV(dp);
 	/*
 	 * MKDIR_PARENT is deliberately not set: this link is not part of
 	 * a mkdir and is treated like any ordinary link.
 	 */
 	newref = DOINGSUJ(dvp) ?
 	    newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
 	    dp->i_effnlink - 1, dp->i_mode) : NULL;
 
 	ACQUIRE_LOCK(ump);
 	idp = inodedep_lookup_ip(dp);
 	if (newref != NULL)
 		TAILQ_INSERT_TAIL(&idp->id_inoreflst, &newref->ja_ref,
 		    if_deps);
 	softdep_prelink(dvp, ITOV(ip));
 	FREE_LOCK(ump);
 }
 
 /*
  * Set up tracking for a new link to an inode.  On a journaled mount a
  * jaddref is allocated with offset 0, because the directory offset is
  * not known until softdep_setup_directory_add or
  * softdep_setup_directory_change, and is appended to the inode's
  * reference list.  The inodedep lookup refreshes nlinkdelta for
  * non-journaling softdep.
  */
 void
 softdep_setup_link(dp, ip)
 	struct inode *dp;
 	struct inode *ip;
 {
 	struct inodedep *idp;
 	struct jaddref *newref;
 	struct ufsmount *ump;
 	struct vnode *dvp;
 
 	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_setup_link called on non-softdep filesystem"));
 	ump = ITOUMP(dp);
 	dvp = ITOV(dp);
 	newref = DOINGSUJ(dvp) ?
 	    newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
 	    ip->i_mode) : NULL;
 
 	ACQUIRE_LOCK(ump);
 	idp = inodedep_lookup_ip(ip);
 	if (newref != NULL)
 		TAILQ_INSERT_TAIL(&idp->id_inoreflst, &newref->ja_ref,
 		    if_deps);
 	softdep_prelink(dvp, ITOV(ip));
 	FREE_LOCK(ump);
 }
 
 /*
  * Called to create the jaddref structures to track . and .. references as
  * well as lookup and further initialize the incomplete jaddref created
  * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
  * nlinkdelta for non-journaling softdep.
  */
 void
 softdep_setup_mkdir(dp, ip)
 	struct inode *dp;
 	struct inode *ip;
 {
 	struct inodedep *inodedep;
 	struct jaddref *dotdotaddref;
 	struct jaddref *dotaddref;
 	struct jaddref *jaddref;
 	struct vnode *dvp;
 
 	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_setup_mkdir called on non-softdep filesystem"));
 	dvp = ITOV(dp);
 	dotaddref = dotdotaddref = NULL;
 	if (DOINGSUJ(dvp)) {
 		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
 		    ip->i_mode);
 		dotaddref->ja_state |= MKDIR_BODY;
 		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
 		    dp->i_effnlink - 1, dp->i_mode);
 		dotdotaddref->ja_state |= MKDIR_PARENT;
 	}
 	ACQUIRE_LOCK(ITOUMP(dp));
 	inodedep = inodedep_lookup_ip(ip);
 	if (DOINGSUJ(dvp)) {
 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 		    inoreflst);
 		KASSERT(jaddref != NULL,
 		    ("softdep_setup_mkdir: No addref structure present."));
 		KASSERT(jaddref->ja_parent == dp->i_number, 
 		    ("softdep_setup_mkdir: bad parent %ju",
 		    (uintmax_t)jaddref->ja_parent));
 		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
 		    if_deps);
 	}
 	inodedep = inodedep_lookup_ip(dp);
 	if (DOINGSUJ(dvp))
 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
 		    &dotdotaddref->ja_ref, if_deps);
 	softdep_prelink(ITOV(dp), NULL);
 	FREE_LOCK(ITOUMP(dp));
 }
 
 /*
  * Refresh nlinkdelta for both the directory being removed and its
  * parent before the unlink takes place, then run softdep_prelink.
  */
 void
 softdep_setup_rmdir(dp, ip)
 	struct inode *dp;
 	struct inode *ip;
 {
 	struct ufsmount *ump;
 
 	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_setup_rmdir called on non-softdep filesystem"));
 	ump = ITOUMP(dp);
 
 	ACQUIRE_LOCK(ump);
 	(void) inodedep_lookup_ip(ip);
 	(void) inodedep_lookup_ip(dp);
 	softdep_prelink(ITOV(dp), ITOV(ip));
 	FREE_LOCK(ump);
 }
 
 /*
  * Refresh nlinkdelta for both the inode being unlinked and its parent
  * directory before the unlink takes place, then run softdep_prelink.
  */
 void
 softdep_setup_unlink(dp, ip)
 	struct inode *dp;
 	struct inode *ip;
 {
 	struct ufsmount *ump;
 
 	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_setup_unlink called on non-softdep filesystem"));
 	ump = ITOUMP(dp);
 
 	ACQUIRE_LOCK(ump);
 	(void) inodedep_lookup_ip(ip);
 	(void) inodedep_lookup_ip(dp);
 	softdep_prelink(ITOV(dp), ITOV(ip));
 	FREE_LOCK(ump);
 }
 
 /*
  * Undo the journal state left behind by a failed non-directory create.
  * On a journaled mount the jaddref at the tail of the inode's reference
  * list is cancelled.  The inodedep lookup refreshes nlinkdelta for
  * non-journaling softdep.
  */
 void
 softdep_revert_create(dp, ip)
 	struct inode *dp;
 	struct inode *ip;
 {
 	struct inodedep *idp;
 	struct jaddref *lastref;
 	struct ufsmount *ump;
 	struct vnode *dvp;
 
 	KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0,
 	    ("softdep_revert_create called on non-softdep filesystem"));
 	ump = ITOUMP(dp);
 	dvp = ITOV(dp);
 
 	ACQUIRE_LOCK(ump);
 	idp = inodedep_lookup_ip(ip);
 	if (DOINGSUJ(dvp)) {
 		lastref = (struct jaddref *)TAILQ_LAST(&idp->id_inoreflst,
 		    inoreflst);
 		KASSERT(lastref->ja_parent == dp->i_number,
 		    ("softdep_revert_create: addref parent mismatch"));
 		cancel_jaddref(lastref, idp, &idp->id_inowait);
 	}
 	FREE_LOCK(ump);
 }
 
 /*
  * Undo the journal state left behind by a failed link addition.  On a
  * journaled mount the jaddref at the tail of the inode's reference list
  * is cancelled.  The inodedep lookup refreshes nlinkdelta for
  * non-journaling softdep.
  */
 void
 softdep_revert_link(dp, ip)
 	struct inode *dp;
 	struct inode *ip;
 {
 	struct inodedep *idp;
 	struct jaddref *lastref;
 	struct ufsmount *ump;
 	struct vnode *dvp;
 
 	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_revert_link called on non-softdep filesystem"));
 	ump = ITOUMP(dp);
 	dvp = ITOV(dp);
 
 	ACQUIRE_LOCK(ump);
 	idp = inodedep_lookup_ip(ip);
 	if (DOINGSUJ(dvp)) {
 		lastref = (struct jaddref *)TAILQ_LAST(&idp->id_inoreflst,
 		    inoreflst);
 		KASSERT(lastref->ja_parent == dp->i_number,
 		    ("softdep_revert_link: addref parent mismatch"));
 		cancel_jaddref(lastref, idp, &idp->id_inowait);
 	}
 	FREE_LOCK(ump);
 }
 
 /*
  * Undo the journal state left behind by a failed mkdir.  On a journaled
  * mount three jaddrefs are cancelled: the ".." record at the tail of
  * the parent's reference list, then the last record on the new
  * directory's list and the "." record immediately preceding it.  The
  * inodedep lookups refresh nlinkdelta for non-journaling softdep.
  */
 void
 softdep_revert_mkdir(dp, ip)
 	struct inode *dp;
 	struct inode *ip;
 {
 	struct inodedep *idp;
 	struct jaddref *lastref;
 	struct jaddref *dotref;
 	struct ufsmount *ump;
 	struct vnode *dvp;
 
 	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_revert_mkdir called on non-softdep filesystem"));
 	ump = ITOUMP(dp);
 	dvp = ITOV(dp);
 
 	ACQUIRE_LOCK(ump);
 	/* Parent directory: drop the ".." reference. */
 	idp = inodedep_lookup_ip(dp);
 	if (DOINGSUJ(dvp)) {
 		lastref = (struct jaddref *)TAILQ_LAST(&idp->id_inoreflst,
 		    inoreflst);
 		KASSERT(lastref->ja_parent == ip->i_number,
 		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
 		cancel_jaddref(lastref, idp, &idp->id_inowait);
 	}
 	/* New directory: drop its last reference and the "." before it. */
 	idp = inodedep_lookup_ip(ip);
 	if (DOINGSUJ(dvp)) {
 		lastref = (struct jaddref *)TAILQ_LAST(&idp->id_inoreflst,
 		    inoreflst);
 		KASSERT(lastref->ja_parent == dp->i_number,
 		    ("softdep_revert_mkdir: addref parent mismatch"));
 		/* Fetch the predecessor before lastref is cancelled. */
 		dotref = (struct jaddref *)TAILQ_PREV(&lastref->ja_ref,
 		    inoreflst, if_deps);
 		cancel_jaddref(lastref, idp, &idp->id_inowait);
 		KASSERT(dotref->ja_parent == ip->i_number,
 		    ("softdep_revert_mkdir: dot addref parent mismatch"));
 		cancel_jaddref(dotref, idp, &idp->id_inowait);
 	}
 	FREE_LOCK(ump);
 }
 
 /*
  * Bring nlinkdelta back in line for both the directory and its parent
  * after an rmdir has failed.
  */
 void
 softdep_revert_rmdir(dp, ip)
 	struct inode *dp;
 	struct inode *ip;
 {
 	struct ufsmount *ump;
 
 	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_revert_rmdir called on non-softdep filesystem"));
 	ump = ITOUMP(dp);
 
 	ACQUIRE_LOCK(ump);
 	(void) inodedep_lookup_ip(ip);
 	(void) inodedep_lookup_ip(dp);
 	FREE_LOCK(ump);
 }
 
 /*
  * Protecting the freemaps (or bitmaps).
  * 
  * To eliminate the need to execute fsck before mounting a filesystem
  * after a power failure, one must (conservatively) guarantee that the
  * on-disk copy of the bitmaps never indicates that a live inode or block is
  * free.  So, when a block or inode is allocated, the bitmap should be
  * updated (on disk) before any new pointers.  When a block or inode is
  * freed, the bitmap should not be updated until all pointers have been
  * reset.  The latter dependency is handled by the delayed de-allocation
  * approach described below for block and inode de-allocation.  The former
  * dependency is handled by calling the following procedure when a block or
  * inode is allocated. When an inode is allocated an "inodedep" is created
  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
  * Each "inodedep" is also inserted into the hash indexing structure so
  * that any additional link additions can be made dependent on the inode
  * allocation.
  * 
  * The ufs filesystem maintains a number of free block counts (e.g., per
  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
  * in addition to the bitmaps.  These counts are used to improve efficiency
  * during allocation and therefore must be consistent with the bitmaps.
  * There is no convenient way to guarantee post-crash consistency of these
  * counts with simple update ordering, for two main reasons: (1) The counts
  * and bitmaps for a single cylinder group block are not in the same disk
  * sector.  If a disk write is interrupted (e.g., by power failure), one may
  * be written and the other not.  (2) Some of the counts are located in the
  * superblock rather than the cylinder group block. So, we focus our soft
  * updates implementation on protecting the bitmaps. When mounting a
  * filesystem, we recompute the auxiliary counts from the bitmaps.
  */
 
 /*
  * Called just after updating the cylinder group block to allocate an inode.
  *
  * Creates the inodedep for the new inode number and attaches it to the
  * bmsafemap of the cylinder group buffer.  On a journaled (SUJ) mount a
  * jaddref is allocated first and is linked onto both the bmsafemap and
  * the inodedep's reference list; otherwise the inodedep itself is put
  * on the bmsafemap's inodedep list and marked ONDEPLIST.  In either
  * case DEPCOMPLETE is cleared on the inodedep before returning.
  */
 void
 softdep_setup_inomapdep(bp, ip, newinum, mode)
 	struct buf *bp;		/* buffer for cylgroup block with inode map */
 	struct inode *ip;	/* inode related to allocation */
 	ino_t newinum;		/* new inode number being allocated */
 	int mode;		/* mode handed to newjaddref when journaling */
 {
 	struct inodedep *inodedep;
 	struct bmsafemap *bmsafemap;
 	struct jaddref *jaddref;
 	struct mount *mp;
 	struct fs *fs;
 
 	mp = ITOVFS(ip);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_inomapdep called on non-softdep filesystem"));
 	fs = VFSTOUFS(mp)->um_fs;
 	jaddref = NULL;
 
 	/*
 	 * Allocate the journal reference add structure so that the bitmap
 	 * can be dependent on it.
 	 */
 	if (MOUNTEDSUJ(mp)) {
 		jaddref = newjaddref(ip, newinum, 0, 0, mode);
 		jaddref->ja_state |= NEWBLOCK;
 	}
 
 	/*
 	 * Create a dependency for the newly allocated inode.
 	 * Panic if it already exists as something is seriously wrong.
 	 * Otherwise add it to the dependency list for the buffer holding
 	 * the cylinder group map from which it was allocated.
 	 *
 	 * We have to preallocate a bmsafemap entry in case it is needed
 	 * in bmsafemap_lookup since once we allocate the inodedep, we
 	 * have to finish initializing it before we can FREE_LOCK().
 	 * By preallocating, we avoid FREE_LOCK() while doing a malloc
 	 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
 	 * creating the inodedep as it can be freed during the time
 	 * that we FREE_LOCK() while allocating the inodedep. We must
 	 * call workitem_alloc() before entering the locked section as
 	 * it also acquires the lock and we must avoid trying doing so
 	 * recursively.
 	 */
 	bmsafemap = malloc(sizeof(struct bmsafemap),
 	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
 	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
 	ACQUIRE_LOCK(ITOUMP(ip));
 	/*
 	 * The two string literals below are concatenated by the compiler,
 	 * so the first one must end in a space to keep the words apart.
 	 */
 	if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep)))
 		panic("softdep_setup_inomapdep: dependency %p for new "
 		    "inode already exists", inodedep);
 	/* Returns the bmsafemap to use; the preallocated one may be freed. */
 	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
 	if (jaddref) {
 		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
 		    if_deps);
 	} else {
 		inodedep->id_state |= ONDEPLIST;
 		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
 	}
 	inodedep->id_bmsafemap = bmsafemap;
 	inodedep->id_state &= ~DEPCOMPLETE;
 	FREE_LOCK(ITOUMP(ip));
 }
 
 /*
  * Called just after updating the cylinder group block to
  * allocate block or fragment.
  *
  * Creates the newblk dependency for newblkno and attaches it to the
  * bmsafemap of the cylinder group buffer.  On a journaled (SUJ) mount a
  * jnewblk record is allocated first and is what gets linked onto the
  * bmsafemap; otherwise the newblk itself is linked and marked ONDEPLIST.
  */
 void
 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
 	struct buf *bp;		/* buffer for cylgroup block with block map */
 	struct mount *mp;	/* filesystem doing allocation */
 	ufs2_daddr_t newblkno;	/* number of newly allocated block */
 	int frags;		/* Number of fragments. */
 	int oldfrags;		/* Previous number of fragments for extend. */
 {
 	struct newblk *newblk;
 	struct bmsafemap *bmsafemap;
 	struct jnewblk *jnewblk;
 	struct ufsmount *ump;
 	struct fs *fs;
 
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_blkmapdep called on non-softdep filesystem"));
 	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;
 	jnewblk = NULL;
 	/*
 	 * Create a dependency for the newly allocated block.
 	 * Add it to the dependency list for the buffer holding
 	 * the cylinder group map from which it was allocated.
 	 */
 	if (MOUNTEDSUJ(mp)) {
 		/* The journal record is built before the lock is taken. */
 		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
 		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
 		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
 		jnewblk->jn_state = ATTACHED;
 		jnewblk->jn_blkno = newblkno;
 		jnewblk->jn_frags = frags;
 		jnewblk->jn_oldfrags = oldfrags;
 #ifdef INVARIANTS
 		{
 			struct cg *cgp;
 			uint8_t *blksfree;
 			long bno;
 			int i;
 	
 			/*
 			 * Sanity check: none of the fragments in the range
 			 * [jn_oldfrags, jn_frags) may still be marked free
 			 * in the cylinder group's free-block map.
 			 */
 			cgp = (struct cg *)bp->b_data;
 			blksfree = cg_blksfree(cgp);
 			bno = dtogd(fs, jnewblk->jn_blkno);
 			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
 			    i++) {
 				if (isset(blksfree, bno + i))
 					panic("softdep_setup_blkmapdep: "
 					    "free fragment %d from %d-%d "
 					    "state 0x%X dep %p", i,
 					    jnewblk->jn_oldfrags,
 					    jnewblk->jn_frags,
 					    jnewblk->jn_state,
 					    jnewblk->jn_dep);
 			}
 		}
 #endif
 	}
 
 	CTR3(KTR_SUJ,
 	    "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
 	    newblkno, frags, oldfrags);
 	ACQUIRE_LOCK(ump);
 	/* A newblk for this block number must not exist yet. */
 	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
 		panic("softdep_setup_blkmapdep: found block");
 	/*
 	 * No preallocated bmsafemap is passed (NULL), so bmsafemap_lookup
 	 * allocates one itself if it has to.
 	 */
 	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
 	    dtog(fs, newblkno), NULL);
 	if (jnewblk) {
 		/* Journaled: the jnewblk stands in for the newblk. */
 		jnewblk->jn_dep = (struct worklist *)newblk;
 		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
 	} else {
 		newblk->nb_state |= ONDEPLIST;
 		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
 	}
 	/* NOTE(review): nb_bmsafemap was already stored above; this repeats it. */
 	newblk->nb_bmsafemap = bmsafemap;
 	newblk->nb_jnewblk = jnewblk;
 	FREE_LOCK(ump);
 }
 
 /*
  * Select the bmsafemap hash chain for cylinder group cg.  The mount's
  * bmsafemap_hash_size is applied to cg as a bit mask to form the index.
  */
 #define	BMSAFEMAP_HASH(ump, cg) \
       (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size])
 
 static int
 bmsafemap_find(bmsafemaphd, cg, bmsafemapp)
 	struct bmsafemap_hashhead *bmsafemaphd;
 	int cg;
 	struct bmsafemap **bmsafemapp;
 {
 	struct bmsafemap *bmsafemap;
 
 	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
 		if (bmsafemap->sm_cg == cg)
 			break;
 	if (bmsafemap) {
 		*bmsafemapp = bmsafemap;
 		return (1);
 	}
 	*bmsafemapp = NULL;
 
 	return (0);
 }
 
 /*
  * Find the bmsafemap associated with a cylinder group buffer.
  * If none exists, create one. The buffer must be locked when
  * this routine is called and this routine must be called with
  * the softdep lock held. To avoid giving up the lock while
  * allocating a new bmsafemap, a preallocated bmsafemap may be
  * provided. If it is provided but not needed, it is freed.
  *
  * mp is the mount, bp the cylinder group buffer, cg the cylinder
  * group number and newbmsafemap the optional preallocated entry
  * (may be NULL).  The bmsafemap to use is returned.
  */
 static struct bmsafemap *
 bmsafemap_lookup(mp, bp, cg, newbmsafemap)
 	struct mount *mp;
 	struct buf *bp;
 	int cg;
 	struct bmsafemap *newbmsafemap;
 {
 	struct bmsafemap_hashhead *bmsafemaphd;
 	struct bmsafemap *bmsafemap, *collision;
 	struct worklist *wk;
 	struct ufsmount *ump;
 
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
 	KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
 	/* First choice: a bmsafemap already hanging off the buffer. */
 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 		if (wk->wk_type == D_BMSAFEMAP) {
 			if (newbmsafemap)
 				WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
 			return (WK_BMSAFEMAP(wk));
 		}
 	}
 	/* Second choice: an entry for this cg in the per-mount hash. */
 	bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
 	if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
 		if (newbmsafemap)
 			WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
 		return (bmsafemap);
 	}
 	if (newbmsafemap) {
 		bmsafemap = newbmsafemap;
 	} else {
 		/*
 		 * Nothing was preallocated, so the lock is released
 		 * around the allocation and retaken afterwards.
 		 */
 		FREE_LOCK(ump);
 		bmsafemap = malloc(sizeof(struct bmsafemap),
 			M_BMSAFEMAP, M_SOFTDEP_FLAGS);
 		workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
 		ACQUIRE_LOCK(ump);
 	}
 	bmsafemap->sm_buf = bp;
 	LIST_INIT(&bmsafemap->sm_inodedephd);
 	LIST_INIT(&bmsafemap->sm_inodedepwr);
 	LIST_INIT(&bmsafemap->sm_newblkhd);
 	LIST_INIT(&bmsafemap->sm_newblkwr);
 	LIST_INIT(&bmsafemap->sm_jaddrefhd);
 	LIST_INIT(&bmsafemap->sm_jnewblkhd);
 	LIST_INIT(&bmsafemap->sm_freehd);
 	LIST_INIT(&bmsafemap->sm_freewr);
 	/*
 	 * Search the hash chain once more before inserting.  If an entry
 	 * for this cg is present now, the new one is discarded and the
 	 * existing entry is returned instead.
 	 */
 	if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) {
 		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
 		return (collision);
 	}
 	/* Publish the new entry: hash chain, dirty-cg list, buffer deps. */
 	bmsafemap->sm_cg = cg;
 	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
 	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
 	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
 	return (bmsafemap);
 }
 
 /*
  * Direct block allocation dependencies.
  * 
  * When a new block is allocated, the corresponding disk locations must be
  * initialized (with zeros or new data) before the on-disk inode points to
  * them.  Also, the freemap from which the block was allocated must be
  * updated (on disk) before the inode's pointer. These two dependencies are
  * independent of each other and are needed for all file blocks and indirect
  * blocks that are pointed to directly by the inode.  Just before the
  * "in-core" version of the inode is updated with a newly allocated block
  * number, a procedure (below) is called to setup allocation dependency
  * structures.  These structures are removed when the corresponding
  * dependencies are satisfied or when the block allocation becomes obsolete
  * (i.e., the file is deleted, the block is de-allocated, or the block is a
  * fragment that gets upgraded).  All of these cases are handled in
  * procedures described later.
  * 
  * When a file extension causes a fragment to be upgraded, either to a larger
  * fragment or to a full block, the on-disk location may change (if the
  * previous fragment could not simply be extended). In this case, the old
  * fragment must be de-allocated, but not until after the inode's pointer has
  * been updated. In most cases, this is handled by later procedures, which
  * will construct a "freefrag" structure to be added to the workitem queue
  * when the inode update is complete (or obsolete).  The main exception to
  * this is when an allocation occurs while a pending allocation dependency
  * (for the same block pointer) remains.  This case is handled in the main
  * allocation dependency setup procedure by immediately freeing the
  * unreferenced fragments.
  */ 
 void
 softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
 	struct inode *ip;	/* inode to which block is being added */
 	ufs_lbn_t off;		/* block pointer within inode */
 	ufs2_daddr_t newblkno;	/* disk block number being added */
 	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
 	long newsize;		/* size of new block */
 	long oldsize;		/* size of old block */
 	struct buf *bp;		/* bp for allocated block */
 {
 	struct allocdirect *adp, *oldadp;
 	struct allocdirectlst *adphead;
 	struct freefrag *freefrag;
 	struct inodedep *inodedep;
 	struct pagedep *pagedep;
 	struct jnewblk *jnewblk;
 	struct newblk *newblk;
 	struct mount *mp;
 	ufs_lbn_t lbn;
 
 	lbn = bp->b_lblkno;
 	mp = ITOVFS(ip);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_allocdirect called on non-softdep filesystem"));
 	if (oldblkno && oldblkno != newblkno)
 		/*
 		 * The usual case is that a smaller fragment that
 		 * was just allocated has been replaced with a bigger
 		 * fragment or a full-size block. If it is marked as
 		 * B_DELWRI, the current contents have not been written
 		 * to disk. It is possible that the block was written
 		 * earlier, but very uncommon. If the block has never
 		 * been written, there is no need to send a BIO_DELETE
 		 * for it when it is freed. The gain from avoiding the
 		 * TRIMs for the common case of unwritten blocks far
 		 * exceeds the cost of the write amplification for the
 		 * uncommon case of failing to send a TRIM for a block
 		 * that had been written.
 		 */
 		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
 		    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
 	else
 		freefrag = NULL;
 
 	/* oldsize is a long, so it is traced with %ld just like newsize. */
 	CTR6(KTR_SUJ,
 	    "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
 	    "off %jd newsize %ld oldsize %ld",
 	    ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
 	ACQUIRE_LOCK(ITOUMP(ip));
 	if (off >= UFS_NDADDR) {
 		if (lbn > 0)
 			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
 			    lbn, off);
 		/* allocating an indirect block */
 		if (oldblkno != 0)
 			panic("softdep_setup_allocdirect: non-zero indir");
 	} else {
 		if (off != lbn)
 			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
 			    lbn, off);
 		/*
 		 * Allocating a direct block.
 		 *
 		 * If we are allocating a directory block, then we must
 		 * allocate an associated pagedep to track additions and
 		 * deletions.
 		 */
 		if ((ip->i_mode & IFMT) == IFDIR)
 			pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
 			    &pagedep);
 	}
 	/* The newblk for this block must already exist. */
 	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
 		panic("softdep_setup_allocdirect: lost block");
 	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
 	    ("softdep_setup_allocdirect: newblk already initialized"));
 	/*
 	 * Convert the newblk to an allocdirect.
 	 */
 	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
 	adp = (struct allocdirect *)newblk;
 	newblk->nb_freefrag = freefrag;
 	adp->ad_offset = off;
 	adp->ad_oldblkno = oldblkno;
 	adp->ad_newsize = newsize;
 	adp->ad_oldsize = oldsize;
 
 	/*
 	 * Finish initializing the journal.
 	 */
 	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
 		jnewblk->jn_ino = ip->i_number;
 		jnewblk->jn_lbn = lbn;
 		add_to_journal(&jnewblk->jn_list);
 	}
 	if (freefrag && freefrag->ff_jdep != NULL &&
 	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
 		add_to_journal(freefrag->ff_jdep);
 	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 	adp->ad_inodedep = inodedep;
 
 	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
 	/*
 	 * The list of allocdirects must be kept in sorted and ascending
 	 * order so that the rollback routines can quickly determine the
 	 * first uncommitted block (the size of the file stored on disk
 	 * ends at the end of the lowest committed fragment, or if there
 	 * are no fragments, at the end of the highest committed block).
 	 * Since files generally grow, the typical case is that the new
 	 * block is to be added at the end of the list. We speed this
 	 * special case by checking against the last allocdirect in the
 	 * list before laboriously traversing the list looking for the
 	 * insertion point.
 	 */
 	adphead = &inodedep->id_newinoupdt;
 	oldadp = TAILQ_LAST(adphead, allocdirectlst);
 	if (oldadp == NULL || oldadp->ad_offset <= off) {
 		/* insert at end of list */
 		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
 		/* An existing entry for the same offset is merged away. */
 		if (oldadp != NULL && oldadp->ad_offset == off)
 			allocdirect_merge(adphead, adp, oldadp);
 		FREE_LOCK(ITOUMP(ip));
 		return;
 	}
 	/* Find the first entry whose offset is not below the new one. */
 	TAILQ_FOREACH(oldadp, adphead, ad_next) {
 		if (oldadp->ad_offset >= off)
 			break;
 	}
 	if (oldadp == NULL)
 		panic("softdep_setup_allocdirect: lost entry");
 	/* insert in middle of list */
 	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
 	if (oldadp->ad_offset == off)
 		allocdirect_merge(adphead, adp, oldadp);
 
 	FREE_LOCK(ITOUMP(ip));
 }
 
 /*
  * Merge a newer and older journal record to be stored either in a
  * newblock or freefrag.  This handles aggregating journal records for
  * fragment allocation into a second record as well as replacing a
  * journal free with an aborted journal allocation.  A segment for the
  * oldest record will be placed on wkhd if it has been written.  If not
  * the segment for the newer record will suffice.
  *
  * Returns the record that survives: the other argument when one of them
  * is NULL, old when new is a jfreefrag (which is cancelled), and new
  * when two jnewblk records are merged (old is cancelled and freed).
  */
 static struct worklist *
 jnewblk_merge(new, old, wkhd)
 	struct worklist *new;
 	struct worklist *old;
 	struct workhead *wkhd;
 {
 	struct jnewblk *njnewblk;
 	struct jnewblk *jnewblk;
 
 	/* Handle NULLs to simplify callers. */
 	if (new == NULL)
 		return (old);
 	if (old == NULL)
 		return (new);
 	/* Replace a jfreefrag with a jnewblk. */
 	if (new->wk_type == D_JFREEFRAG) {
 		if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
 			panic("jnewblk_merge: blkno mismatch: %p, %p",
 			    old, new);
 		cancel_jfreefrag(WK_JFREEFRAG(new));
 		return (old);
 	}
 	/* From here on both records must be jnewblks. */
 	if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
 		panic("jnewblk_merge: Bad type: old %d new %d\n",
 		    old->wk_type, new->wk_type);
 	/*
 	 * Handle merging of two jnewblk records that describe
 	 * different sets of fragments in the same block.
 	 */
 	jnewblk = WK_JNEWBLK(old);
 	njnewblk = WK_JNEWBLK(new);
 	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
 		panic("jnewblk_merge: Merging disparate blocks.");
 	/*
 	 * The record may be rolled back in the cg.
 	 */
 	if (jnewblk->jn_state & UNDONE) {
 		/* Move the UNDONE marking from the old record to the new. */
 		jnewblk->jn_state &= ~UNDONE;
 		njnewblk->jn_state |= UNDONE;
 		njnewblk->jn_state &= ~ATTACHED;
 	}
 	/*
 	 * We modify the newer addref and free the older so that if neither
 	 * has been written the most up-to-date copy will be on disk.  If
 	 * both have been written but rolled back we only temporarily need
 	 * one of them to fix the bits when the cg write completes.
 	 */
 	jnewblk->jn_state |= ATTACHED | COMPLETE;
 	/* The surviving record takes over the old record's starting frag. */
 	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
 	cancel_jnewblk(jnewblk, wkhd);
 	WORKLIST_REMOVE(&jnewblk->jn_list);
 	free_jnewblk(jnewblk);
 	return (new);
 }
 
 /*
  * Replace an old allocdirect dependency with a newer one.
  *
  * The new allocdirect inherits the old one's "previous block" fields,
  * any newdirblk work item and the relevant journal dependencies; the
  * old allocdirect is then removed from the list and released with
  * free_newblk.  Must be called with the softdep lock held.
  */
 static void
 allocdirect_merge(adphead, newadp, oldadp)
 	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
 	struct allocdirect *newadp;	/* allocdirect being added */
 	struct allocdirect *oldadp;	/* existing allocdirect being checked */
 {
 	struct worklist *wk;
 	struct freefrag *freefrag;
 
 	freefrag = NULL;
 	LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp));
 	/*
 	 * The new dependency must pick up exactly where the old one left
 	 * off, and the merge is only valid for direct block offsets.
 	 */
 	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
 	    newadp->ad_oldsize != oldadp->ad_newsize ||
 	    newadp->ad_offset >= UFS_NDADDR)
 		panic("%s %jd != new %jd || old size %ld != new %ld",
 		    "allocdirect_merge: old blkno",
 		    (intmax_t)newadp->ad_oldblkno,
 		    (intmax_t)oldadp->ad_newblkno,
 		    newadp->ad_oldsize, oldadp->ad_newsize);
 	newadp->ad_oldblkno = oldadp->ad_oldblkno;
 	newadp->ad_oldsize = oldadp->ad_oldsize;
 	/*
 	 * If the old dependency had a fragment to free or had never
 	 * previously had a block allocated, then the new dependency
 	 * can immediately post its freefrag and adopt the old freefrag.
 	 * This action is done by swapping the freefrag dependencies.
 	 * The new dependency gains the old one's freefrag, and the
 	 * old one gets the new one and then immediately puts it on
 	 * the worklist when it is freed by free_newblk. It is
 	 * not possible to do this swap when the old dependency had a
 	 * non-zero size but no previous fragment to free. This condition
 	 * arises when the new block is an extension of the old block.
 	 * Here, the first part of the fragment allocated to the new
 	 * dependency is part of the block currently claimed on disk by
 	 * the old dependency, so cannot legitimately be freed until the
 	 * conditions for the new dependency are fulfilled.
 	 */
 	freefrag = newadp->ad_freefrag;
 	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
 		newadp->ad_freefrag = oldadp->ad_freefrag;
 		oldadp->ad_freefrag = freefrag;
 	}
 	/*
 	 * If we are tracking a new directory-block allocation,
 	 * move it from the old allocdirect to the new allocdirect.
 	 */
 	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		/* At most one newdirblk item is expected on the list. */
 		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
 			panic("allocdirect_merge: extra newdirblk");
 		WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
 	}
 	TAILQ_REMOVE(adphead, oldadp, ad_next);
 	/*
 	 * We need to move any journal dependencies over to the freefrag
 	 * that releases this block if it exists.  Otherwise we are
 	 * extending an existing block and we'll wait until that is
 	 * complete to release the journal space and extend the
 	 * new journal to cover this old space as well.
 	 */
 	if (freefrag == NULL) {
 		/* Extension in place: both must describe the same block. */
 		if (oldadp->ad_newblkno != newadp->ad_newblkno)
 			panic("allocdirect_merge: %jd != %jd",
 			    oldadp->ad_newblkno, newadp->ad_newblkno);
 		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
 		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list, 
 		    &oldadp->ad_block.nb_jnewblk->jn_list,
 		    &newadp->ad_block.nb_jwork);
 		oldadp->ad_block.nb_jnewblk = NULL;
 		cancel_newblk(&oldadp->ad_block, NULL,
 		    &newadp->ad_block.nb_jwork);
 	} else {
 		/* The freefrag takes over the old block's journal work. */
 		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
 		    &freefrag->ff_list, &freefrag->ff_jwork);
 		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
 		    &freefrag->ff_jwork);
 	}
 	free_newblk(&oldadp->ad_block);
 }
 
 /*
  * Build the journal record (jfreefrag) describing the free of a single
  * block or fragment run.  The record is created ATTACHED | DEPCOMPLETE
  * and points back at the freefrag it belongs to.
  */
 static struct jfreefrag *
 newjfreefrag(freefrag, ip, blkno, size, lbn)
 	struct freefrag *freefrag;
 	struct inode *ip;
 	ufs2_daddr_t blkno;
 	long size;
 	ufs_lbn_t lbn;
 {
 	struct jfreefrag *jff;
 
 	jff = malloc(sizeof(*jff), M_JFREEFRAG, M_SOFTDEP_FLAGS);
 	workitem_alloc(&jff->fr_list, D_JFREEFRAG, ITOVFS(ip));
 	jff->fr_jsegdep = newjsegdep(&jff->fr_list);
 	jff->fr_state = ATTACHED | DEPCOMPLETE;
 	jff->fr_freefrag = freefrag;
 	jff->fr_ino = ip->i_number;
 	jff->fr_blkno = blkno;
 	jff->fr_lbn = lbn;
 	jff->fr_frags = numfrags(ITOFS(ip), size);
 
 	return (jff);
 }
 
 /*
  * Allocate and fill in a freefrag describing a fragment run that is to
  * be released later.  Panics if the run would extend past the end of
  * its block.  On a journaled mount a jfreefrag is attached as ff_jdep;
  * otherwise the freefrag is marked DEPCOMPLETE and has no journal
  * dependency.
  */
 static struct freefrag *
 newfreefrag(ip, blkno, size, lbn, key)
 	struct inode *ip;
 	ufs2_daddr_t blkno;
 	long size;
 	ufs_lbn_t lbn;
 	u_long key;
 {
 	struct freefrag *ff;
 	struct ufsmount *ump;
 	struct mount *mp;
 	struct fs *fs;
 
 	CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
 	    ip->i_number, blkno, size, lbn);
 	ump = ITOUMP(ip);
 	mp = UFSTOVFS(ump);
 	fs = ump->um_fs;
 	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
 		panic("newfreefrag: frag size");
 
 	ff = malloc(sizeof(*ff), M_FREEFRAG, M_SOFTDEP_FLAGS);
 	workitem_alloc(&ff->ff_list, D_FREEFRAG, mp);
 	ff->ff_state = ATTACHED;
 	LIST_INIT(&ff->ff_jwork);
 	ff->ff_inum = ip->i_number;
 	ff->ff_vtype = ITOV(ip)->v_type;
 	ff->ff_blkno = blkno;
 	ff->ff_fragsize = size;
 	ff->ff_key = key;
 
 	if (!MOUNTEDSUJ(mp)) {
 		/* No journal: nothing further to wait for. */
 		ff->ff_state |= DEPCOMPLETE;
 		ff->ff_jdep = NULL;
 		return (ff);
 	}
 	ff->ff_jdep = (struct worklist *)
 	    newjfreefrag(ff, ip, blkno, size, lbn);
 
 	return (ff);
 }
 
 /*
  * This workitem de-allocates fragments that were replaced during
  * file block allocation.
  *
  * The freefrag's pending journal work (ff_jwork) is moved onto a local
  * list that is passed to ffs_blkfree() along with the block to release.
  * The freefrag itself is freed after ffs_blkfree() returns.
  */
 static void 
 handle_workitem_freefrag(freefrag)
 	struct freefrag *freefrag;
 {
 	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
 	struct workhead wkhd;
 
 	CTR3(KTR_SUJ,
 	    "handle_workitem_freefrag: ino %d blkno %jd size %ld",
 	    freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
 	/*
 	 * It would be illegal to add new completion items to the
 	 * freefrag after it was scheduled to be done so it must be
 	 * safe to modify the list head here.
 	 */
 	LIST_INIT(&wkhd);
 	ACQUIRE_LOCK(ump);
 	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
 	/*
 	 * If the journal has not been written we must cancel it here.
 	 * The only dependency type expected at this point is a jnewblk;
 	 * anything else is treated as fatal.
 	 */
 	if (freefrag->ff_jdep) {
 		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
 			panic("handle_workitem_freefrag: Unexpected type %d\n",
 			    freefrag->ff_jdep->wk_type);
 		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
 	}
 	/* The lock is not held across the call to ffs_blkfree(). */
 	FREE_LOCK(ump);
 	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
 	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype,
 	   &wkhd, freefrag->ff_key);
 	/* Re-take the lock only to release the workitem. */
 	ACQUIRE_LOCK(ump);
 	WORKITEM_FREE(freefrag, D_FREEFRAG);
 	FREE_LOCK(ump);
 }
 
 /*
  * Set up a dependency structure for an external attributes data block.
  * This routine follows much of the structure of softdep_setup_allocdirect.
  * See the description of softdep_setup_allocdirect above for details.
  *
  * "off" is the index of the extended-attribute block pointer and must be
  * below UFS_NXADDR.  "newblkno"/"newsize" describe the block just
  * allocated; "oldblkno"/"oldsize" describe the block it replaces (if any).
  * "bp" is the buffer holding the new block.  The newblk previously
  * recorded for "newblkno" is converted into an allocdirect and inserted
  * into the inodedep's id_newextupdt list, which is kept sorted by offset.
  */
 void 
 softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
 	struct inode *ip;
 	ufs_lbn_t off;
 	ufs2_daddr_t newblkno;
 	ufs2_daddr_t oldblkno;
 	long newsize;
 	long oldsize;
 	struct buf *bp;
 {
 	struct allocdirect *adp, *oldadp;
 	struct allocdirectlst *adphead;
 	struct freefrag *freefrag;
 	struct inodedep *inodedep;
 	struct jnewblk *jnewblk;
 	struct newblk *newblk;
 	struct mount *mp;
 	struct ufsmount *ump;
 	ufs_lbn_t lbn;
 
 	mp = ITOVFS(ip);
 	ump = VFSTOUFS(mp);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_allocext called on non-softdep filesystem"));
 	/* The value checked and printed is the pointer index, not an lbn. */
 	KASSERT(off < UFS_NXADDR,
 	    ("softdep_setup_allocext: off %lld >= UFS_NXADDR",
 	    (long long)off));
 
 	lbn = bp->b_lblkno;
 	if (oldblkno && oldblkno != newblkno)
 		/*
 		 * The usual case is that a smaller fragment that
 		 * was just allocated has been replaced with a bigger
 		 * fragment or a full-size block. If it is marked as
 		 * B_DELWRI, the current contents have not been written
 		 * to disk. It is possible that the block was written
 		 * earlier, but very uncommon. If the block has never
 		 * been written, there is no need to send a BIO_DELETE
 		 * for it when it is freed. The gain from avoiding the
 		 * TRIMs for the common case of unwritten blocks far
 		 * exceeds the cost of the write amplification for the
 		 * uncommon case of failing to send a TRIM for a block
 		 * that had been written.
 		 */
 		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
 		    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
 	else
 		freefrag = NULL;
 
 	ACQUIRE_LOCK(ump);
 	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
 		panic("softdep_setup_allocext: lost block");
 	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
 	    ("softdep_setup_allocext: newblk already initialized"));
 	/*
 	 * Convert the newblk to an allocdirect.
 	 */
 	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
 	adp = (struct allocdirect *)newblk;
 	newblk->nb_freefrag = freefrag;
 	adp->ad_offset = off;
 	adp->ad_oldblkno = oldblkno;
 	adp->ad_newsize = newsize;
 	adp->ad_oldsize = oldsize;
 	adp->ad_state |= EXTDATA;
 
 	/*
 	 * Finish initializing the journal.
 	 */
 	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
 		jnewblk->jn_ino = ip->i_number;
 		jnewblk->jn_lbn = lbn;
 		add_to_journal(&jnewblk->jn_list);
 	}
 	if (freefrag && freefrag->ff_jdep != NULL &&
 	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
 		add_to_journal(freefrag->ff_jdep);
 	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 	adp->ad_inodedep = inodedep;
 
 	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
 	/*
 	 * The list of allocdirects must be kept in sorted and ascending
 	 * order so that the rollback routines can quickly determine the
 	 * first uncommitted block (the size of the file stored on disk
 	 * ends at the end of the lowest committed fragment, or if there
 	 * are no fragments, at the end of the highest committed block).
 	 * Since files generally grow, the typical case is that the new
 	 * block is to be added at the end of the list. We speed this
 	 * special case by checking against the last allocdirect in the
 	 * list before laboriously traversing the list looking for the
 	 * insertion point.
 	 */
 	adphead = &inodedep->id_newextupdt;
 	oldadp = TAILQ_LAST(adphead, allocdirectlst);
 	if (oldadp == NULL || oldadp->ad_offset <= off) {
 		/* insert at end of list */
 		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
 		/* An existing entry for the same offset is folded in. */
 		if (oldadp != NULL && oldadp->ad_offset == off)
 			allocdirect_merge(adphead, adp, oldadp);
 		FREE_LOCK(ump);
 		return;
 	}
 	/* Find the first entry whose offset is not below ours. */
 	TAILQ_FOREACH(oldadp, adphead, ad_next) {
 		if (oldadp->ad_offset >= off)
 			break;
 	}
 	if (oldadp == NULL)
 		panic("softdep_setup_allocext: lost entry");
 	/* insert in middle of list */
 	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
 	if (oldadp->ad_offset == off)
 		allocdirect_merge(adphead, adp, oldadp);
 	FREE_LOCK(ump);
 }
 
 /*
  * Indirect block allocation dependencies.
  * 
  * The same dependencies that exist for a direct block also exist when
  * a new block is allocated and pointed to by an entry in a block of
  * indirect pointers. The undo/redo states described above are also
  * used here. Because an indirect block contains many pointers that
  * may have dependencies, a second copy of the entire in-memory indirect
  * block is kept. The buffer cache copy is always completely up-to-date.
  * The second copy, which is used only as a source for disk writes,
  * contains only the safe pointers (i.e., those that have no remaining
  * update dependencies). The second copy is freed when all pointers
  * are safe. The cache is not allowed to replace indirect blocks with
  * pending update dependencies. If a buffer containing an indirect
  * block with dependencies is written, these routines will mark it
  * dirty again. It can only be successfully written once all the
  * dependencies are removed. The ffs_fsync routine in conjunction with
  * softdep_sync_metadata work together to get all the dependencies
  * removed so that a file can be successfully written to disk. Three
  * procedures are used when setting up indirect block pointer
  * dependencies. The division is necessary because of the organization
  * of the "balloc" routine and because of the distinction between file
  * pages and file metadata blocks.
  */
 
 /*
  * Allocate a new allocindir structure.
  *
  * The newblk previously recorded for "newblkno" is looked up and
  * converted in place into an allocindir.  If "oldblkno" is non-zero a
  * freefrag covering a full filesystem block at that address is created
  * and attached.
  *
  * The per-mount softdep lock is acquired here and is still held on
  * return; callers release it once they have finished linking the
  * allocindir.
  */
 static struct allocindir *
 newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
 	struct inode *ip;	/* inode for file being extended */
 	int ptrno;		/* offset of pointer in indirect block */
 	ufs2_daddr_t newblkno;	/* disk block number being added */
 	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
 	ufs_lbn_t lbn;
 {
 	struct newblk *newblk;
 	struct allocindir *aip;
 	struct freefrag *freefrag;
 	struct jnewblk *jnewblk;
 
 	if (oldblkno)
 		freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn,
 		    SINGLETON_KEY);
 	else
 		freefrag = NULL;
 	ACQUIRE_LOCK(ITOUMP(ip));
 	if (newblk_lookup(ITOVFS(ip), newblkno, 0, &newblk) == 0)
 		panic("newallocindir: lost block");
 	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
 	    ("newallocindir: newblk already initialized"));
 	/* Convert the newblk to an allocindir. */
 	WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
 	newblk->nb_freefrag = freefrag;
 	aip = (struct allocindir *)newblk;
 	aip->ai_offset = ptrno;
 	aip->ai_oldblkno = oldblkno;
 	aip->ai_lbn = lbn;
 	/* Finish initializing the journal records, if there are any. */
 	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
 		jnewblk->jn_ino = ip->i_number;
 		jnewblk->jn_lbn = lbn;
 		add_to_journal(&jnewblk->jn_list);
 	}
 	if (freefrag && freefrag->ff_jdep != NULL &&
 	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
 		add_to_journal(freefrag->ff_jdep);
 	return (aip);
 }
 
 /*
  * Called just before setting an indirect block pointer
  * to a newly allocated file page.
  *
  * An allocindir is created for the new block, attached to the buffer
  * holding the new page (nbp) and linked to the indirdep of the indirect
  * block (bp).  If linking it supersedes an earlier allocindir for the
  * same pointer, the freefrag returned for it is processed here after
  * the softdep lock has been dropped.
  */
 void
 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
 	struct inode *ip;	/* inode for file being extended */
 	ufs_lbn_t lbn;		/* allocated block number within file */
 	struct buf *bp;		/* buffer with indirect blk referencing page */
 	int ptrno;		/* offset of pointer in indirect block */
 	ufs2_daddr_t newblkno;	/* disk block number being added */
 	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
 	struct buf *nbp;	/* buffer holding allocated page */
 {
 	struct inodedep *inodedep;
 	struct freefrag *freefrag;
 	struct allocindir *aip;
 	struct pagedep *pagedep;
 	struct mount *mp;
 	struct ufsmount *ump;
 
 	mp = ITOVFS(ip);
 	ump = VFSTOUFS(mp);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_allocindir_page called on non-softdep filesystem"));
 	/* Report the buffer that was actually compared (nbp, not bp). */
 	KASSERT(lbn == nbp->b_lblkno,
 	    ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
 	    lbn, nbp->b_lblkno));
 	CTR4(KTR_SUJ,
 	    "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
 	    "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
 	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
 	/* newallocindir() returns with the softdep lock held. */
 	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
 	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 	/*
 	 * If we are allocating a directory page, then we must
 	 * allocate an associated pagedep to track additions and
 	 * deletions.
 	 */
 	if ((ip->i_mode & IFMT) == IFDIR)
 		pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
 	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
 	freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
 	FREE_LOCK(ump);
 	/* handle_workitem_freefrag() takes the lock itself. */
 	if (freefrag)
 		handle_workitem_freefrag(freefrag);
 }
 
 /*
  * Called just before setting an indirect block pointer to a
  * newly allocated indirect block.
  */
 void
 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
 	struct buf *nbp;	/* newly allocated indirect block */
 	struct inode *ip;	/* inode for file being extended */
 	struct buf *bp;		/* indirect block referencing allocated block */
 	int ptrno;		/* offset of pointer in indirect block */
 	ufs2_daddr_t newblkno;	/* disk block number being added */
 {
 	struct inodedep *inodedep;
 	struct allocindir *aip;
 	struct ufsmount *ump;
 	ufs_lbn_t lbn;
 
 	ump = ITOUMP(ip);
 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 	    ("softdep_setup_allocindir_meta called on non-softdep filesystem"));
 	CTR3(KTR_SUJ,
 	    "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
 	    ip->i_number, newblkno, ptrno);
 	lbn = nbp->b_lblkno;
 	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
 	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
 	inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
 	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
 	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
 		panic("softdep_setup_allocindir_meta: Block already existed");
 	FREE_LOCK(ump);
 }
 
 /*
  * Mark an indirdep DEPCOMPLETE: unlink it from its ir_next list, release
  * every allocindir parked on its completehd list, and free the indirdep
  * itself if it is no longer on a worklist.
  */
 static void
 indirdep_complete(indirdep)
 	struct indirdep *indirdep;
 {
 	struct allocindir *done;
 
 	LIST_REMOVE(indirdep, ir_next);
 	indirdep->ir_state |= DEPCOMPLETE;
 
 	/* Drain completehd from the front until it is empty. */
 	for (;;) {
 		done = LIST_FIRST(&indirdep->ir_completehd);
 		if (done == NULL)
 			break;
 		LIST_REMOVE(done, ai_next);
 		free_newblk(&done->ai_block);
 	}
 
 	/*
 	 * An indirdep that is still on a worklist stays alive.  Otherwise
 	 * it was only waiting on completion to clear completehd and can
 	 * go now; free_indirdep() asserts that nothing is dangling.
 	 */
 	if ((indirdep->ir_state & ONWORKLIST) != 0)
 		return;
 	free_indirdep(indirdep);
 }
 
 /*
  * Return the indirdep attached to the dependency list of indirect block
  * buffer "bp", creating and attaching a new one if none exists yet.
  *
  * Must be called with the softdep lock held.  The lock is dropped while
  * a new structure is allocated and initialized, then re-taken before
  * the buffer's list is scanned again.
  */
 static struct indirdep *
 indirdep_lookup(mp, ip, bp)
 	struct mount *mp;
 	struct inode *ip;
 	struct buf *bp;
 {
 	struct indirdep *indirdep, *newindirdep;
 	struct newblk *newblk;
 	struct ufsmount *ump;
 	struct worklist *wk;
 	struct fs *fs;
 	ufs2_daddr_t blkno;
 
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
 	indirdep = NULL;
 	newindirdep = NULL;
 	fs = ump->um_fs;
 	/*
 	 * At most two passes: the first either finds an existing indirdep
 	 * or allocates one with the lock dropped; the second rescans the
 	 * list to confirm nothing appeared in the meantime.
 	 */
 	for (;;) {
 		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 			if (wk->wk_type != D_INDIRDEP)
 				continue;
 			indirdep = WK_INDIRDEP(wk);
 			break;
 		}
 		/* Found on the buffer worklist, no new structure to free. */
 		if (indirdep != NULL && newindirdep == NULL)
 			return (indirdep);
 		if (indirdep != NULL && newindirdep != NULL)
 			panic("indirdep_lookup: simultaneous create");
 		/* None found on the buffer and a new structure is ready. */
 		if (indirdep == NULL && newindirdep != NULL)
 			break;
 		/* None found and no new structure available. */
 		FREE_LOCK(ump);
 		newindirdep = malloc(sizeof(struct indirdep),
 		    M_INDIRDEP, M_SOFTDEP_FLAGS);
 		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
 		newindirdep->ir_state = ATTACHED;
 		if (I_IS_UFS1(ip))
 			newindirdep->ir_state |= UFS1FMT;
 		TAILQ_INIT(&newindirdep->ir_trunc);
 		newindirdep->ir_saveddata = NULL;
 		LIST_INIT(&newindirdep->ir_deplisthd);
 		LIST_INIT(&newindirdep->ir_donehd);
 		LIST_INIT(&newindirdep->ir_writehd);
 		LIST_INIT(&newindirdep->ir_completehd);
 		/*
 		 * NOTE(review): b_blkno == b_lblkno presumably means the
 		 * buffer has not been mapped to a disk address yet, so it
 		 * is resolved here -- confirm against ufs_bmaparray().
 		 */
 		if (bp->b_blkno == bp->b_lblkno) {
 			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
 			    NULL, NULL);
 			bp->b_blkno = blkno;
 		}
 		newindirdep->ir_freeblks = NULL;
 		/*
 		 * ir_savebp is a second buffer, obtained on the device
 		 * vnode at the same disk address, that starts out as a
 		 * copy of bp's current contents.
 		 */
 		newindirdep->ir_savebp =
 		    getblk(ump->um_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
 		newindirdep->ir_bp = bp;
 		BUF_KERNPROC(newindirdep->ir_savebp);
 		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
 		ACQUIRE_LOCK(ump);
 	}
 	indirdep = newindirdep;
 	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
 	/*
 	 * If the block is not yet allocated we don't set DEPCOMPLETE so
 	 * that we don't free dependencies until the pointers are valid.
 	 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
 	 * than using the hash.
 	 */
 	if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
 		LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
 	else
 		indirdep->ir_state |= DEPCOMPLETE;
 	return (indirdep);
 }
 
 /*
  * Called to finish the allocation of the "aip" allocated
  * by one of the two routines above.
  *
  * Must be called with the softdep lock held.  Links "aip" onto the
  * indirdep of the indirect block "bp".  Returns the freefrag produced by
  * merging with an earlier allocindir for the same pointer offset, or
  * NULL when no merge took place.
  */
 static struct freefrag *
 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
 	struct buf *bp;		/* in-memory copy of the indirect block */
 	struct inode *ip;	/* inode for file being extended */
 	struct inodedep *inodedep; /* Inodedep for ip */
 	struct allocindir *aip;	/* allocindir allocated by the above routines */
 	ufs_lbn_t lbn;		/* Logical block number for this block. */
 {
 	struct fs *fs;
 	struct indirdep *indirdep;
 	struct allocindir *oldaip;
 	struct freefrag *freefrag;
 	struct mount *mp;
 	struct ufsmount *ump;
 
 	mp = ITOVFS(ip);
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
 	fs = ump->um_fs;
 	/* A non-negative b_lblkno is rejected as not an indirect block. */
 	if (bp->b_lblkno >= 0)
 		panic("setup_allocindir_phase2: not indir blk");
 	KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
 	    ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
 	indirdep = indirdep_lookup(mp, ip, bp);
 	KASSERT(indirdep->ir_savebp != NULL,
 	    ("setup_allocindir_phase2 NULL ir_savebp"));
 	aip->ai_indirdep = indirdep;
 	/*
 	 * Check for an unwritten dependency for this indirect offset.  If
 	 * there is, merge the old dependency into the new one.  This happens
 	 * as a result of reallocblk only.
 	 */
 	freefrag = NULL;
 	if (aip->ai_oldblkno != 0) {
 		/* Only the pending and done lists are searched. */
 		LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
 			if (oldaip->ai_offset == aip->ai_offset) {
 				freefrag = allocindir_merge(aip, oldaip);
 				goto done;
 			}
 		}
 		LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
 			if (oldaip->ai_offset == aip->ai_offset) {
 				freefrag = allocindir_merge(aip, oldaip);
 				goto done;
 			}
 		}
 	}
 done:
 	/* The new allocindir always goes on the pending list. */
 	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
 	return (freefrag);
 }
 
 /*
  * Merge two allocindirs which refer to the same block.  Move newblock
  * dependencies and setup the freefrags appropriately.
  *
  * "aip" is the new allocindir and "oldaip" the one it supersedes; the
  * block "oldaip" added must be the one "aip" replaces.  "aip" inherits
  * the old block number and freefrag of "oldaip", "oldaip" is removed
  * from its list and released, and the freefrag that "aip" originally
  * carried is returned to the caller.
  */
 static struct freefrag *
 allocindir_merge(aip, oldaip)
 	struct allocindir *aip;
 	struct allocindir *oldaip;
 {
 	struct freefrag *freefrag;
 	struct worklist *wk;
 
 	if (oldaip->ai_newblkno != aip->ai_oldblkno)
 		panic("allocindir_merge: blkno");
 	aip->ai_oldblkno = oldaip->ai_oldblkno;
 	/* Swap freefrags: aip takes oldaip's, we keep aip's original. */
 	freefrag = aip->ai_freefrag;
 	aip->ai_freefrag = oldaip->ai_freefrag;
 	oldaip->ai_freefrag = NULL;
 	KASSERT(freefrag != NULL, ("allocindir_merge: No freefrag"));
 	/*
 	 * If we are tracking a new directory-block allocation,
 	 * move it from the old allocindir to the new allocindir.
 	 */
 	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
 			panic("allocindir_merge: extra newdirblk");
 		WORKLIST_INSERT(&aip->ai_newdirblk, wk);
 	}
 	/*
 	 * We can skip journaling for this freefrag and just complete
 	 * any pending journal work for the allocindir that is being
 	 * removed after the freefrag completes.
 	 */
 	if (freefrag->ff_jdep)
 		cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
 	LIST_REMOVE(oldaip, ai_next);
 	/* Whatever cancel_newblk() hands back becomes the new ff_jdep. */
 	freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
 	    &freefrag->ff_list, &freefrag->ff_jwork);
 	free_newblk(&oldaip->ai_block);
 
 	return (freefrag);
 }
 
 /*
  * Clear direct block pointer "i" of the inode and, if it was set, queue
  * a freework for the block it referenced.
  */
 static inline void
 setup_freedirect(freeblks, ip, i, needj)
 	struct freeblks *freeblks;
 	struct inode *ip;
 	int i;
 	int needj;
 {
 	struct ufsmount *ump;
 	ufs2_daddr_t blkno;
 	int nbytes, frags;
 
 	blkno = DIP(ip, i_db[i]);
 	if (blkno != 0) {
 		DIP_SET(ip, i_db[i], 0);
 		ump = ITOUMP(ip);
 		/* Size of block i given the file size, then in frags. */
 		nbytes = sblksize(ump->um_fs, ip->i_size, i);
 		frags = numfrags(ump->um_fs, nbytes);
 		newfreework(ump, freeblks, NULL, i, blkno, frags, 0, needj);
 	}
 }
 
 /*
  * Clear extended-attribute block pointer "i" of the inode and, if it was
  * set, queue a freework for the block it referenced.  The freework is
  * recorded under logical block number -1 - i.
  */
 static inline void
 setup_freeext(freeblks, ip, i, needj)
 	struct freeblks *freeblks;
 	struct inode *ip;
 	int i;
 	int needj;
 {
 	struct ufsmount *ump;
 	ufs2_daddr_t blkno;
 	int nbytes, frags;
 
 	blkno = ip->i_din2->di_extb[i];
 	if (blkno != 0) {
 		ip->i_din2->di_extb[i] = 0;
 		ump = ITOUMP(ip);
 		/* Size of block i given the ext area size, then in frags. */
 		nbytes = sblksize(ump->um_fs, ip->i_din2->di_extsize, i);
 		frags = numfrags(ump->um_fs, nbytes);
 		newfreework(ump, freeblks, NULL, -1 - i, blkno, frags, 0,
 		    needj);
 	}
 }
 
 /*
  * Clear indirect block pointer "i" of the inode and, if it was set, queue
  * a freework under "lbn" for the full block it referenced.
  */
 static inline void
 setup_freeindir(freeblks, ip, i, lbn, needj)
 	struct freeblks *freeblks;
 	struct inode *ip;
 	int i;
 	ufs_lbn_t lbn;
 	int needj;
 {
 	struct ufsmount *ump;
 	ufs2_daddr_t blkno;
 
 	blkno = DIP(ip, i_ib[i]);
 	if (blkno != 0) {
 		DIP_SET(ip, i_ib[i], 0);
 		ump = ITOUMP(ip);
 		/* Indirect blocks are always fs_frag fragments long. */
 		newfreework(ump, freeblks, NULL, lbn, blkno,
 		    ump->um_fs->fs_frag, 0, needj);
 	}
 }
 
 /*
  * Allocate and initialize a zeroed freeblks workitem for inode "ip" on
  * mount "mp", capturing the inode's identity (uid, number, vnode type,
  * modrev, device vnode) at the time of the call.
  */
 static inline struct freeblks *
 newfreeblks(mp, ip)
 	struct mount *mp;
 	struct inode *ip;
 {
 	struct freeblks *fb;
 
 	fb = malloc(sizeof(*fb), M_FREEBLKS, M_SOFTDEP_FLAGS | M_ZERO);
 	workitem_alloc(&fb->fb_list, D_FREEBLKS, mp);
 	LIST_INIT(&fb->fb_jblkdephd);
 	LIST_INIT(&fb->fb_jwork);
 
 	/* Bookkeeping starts out empty. */
 	fb->fb_state = ATTACHED;
 	fb->fb_ref = 0;
 	fb->fb_cgwait = 0;
 	fb->fb_chkcnt = 0;
 	fb->fb_len = 0;
 
 	/* Snapshot of the inode this truncation belongs to. */
 	fb->fb_inum = ip->i_number;
 	fb->fb_uid = ip->i_uid;
 	fb->fb_vtype = ITOV(ip)->v_type;
 	fb->fb_modrev = DIP(ip, i_modrev);
 	fb->fb_devvp = ITODEVVP(ip);
 
 	return (fb);
 }
 
 static void
 trunc_indirdep(indirdep, freeblks, bp, off)
 	struct indirdep *indirdep;
 	struct freeblks *freeblks;
 	struct buf *bp;
 	int off;
 {
 	struct allocindir *aip, *aipn;
 
 	/*
 	 * The first set of allocindirs won't be in savedbp.
 	 */
 	LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
 		if (aip->ai_offset > off)
 			cancel_allocindir(aip, bp, freeblks, 1);
 	LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
 		if (aip->ai_offset > off)
 			cancel_allocindir(aip, bp, freeblks, 1);
 	/*
 	 * These will exist in savedbp.
 	 */
 	LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
 		if (aip->ai_offset > off)
 			cancel_allocindir(aip, NULL, freeblks, 0);
 	LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
 		if (aip->ai_offset > off)
 			cancel_allocindir(aip, NULL, freeblks, 0);
 }
 
 /*
  * Follow the chain of indirects down to lastlbn creating a freework
  * structure for each.  This will be used to start indir_trunc() at
  * the right offset and create the journal records for the partial
  * truncation.  A second step will handle the truncated dependencies.
  *
  * "lbn" is the (negative) logical block number of the indirect block at
  * disk address "blkno"; a blkno of 0 means there is nothing to do.
  * Returns 0 on success or the error from reading the indirect block.
  */
 static int
 setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
 	struct freeblks *freeblks;
 	struct inode *ip;
 	ufs_lbn_t lbn;
 	ufs_lbn_t lastlbn;
 	ufs2_daddr_t blkno;
 {
 	struct indirdep *indirdep;
 	struct indirdep *indirn;
 	struct freework *freework;
 	struct newblk *newblk;
 	struct mount *mp;
 	struct ufsmount *ump;
 	struct buf *bp;
 	uint8_t *start;
 	uint8_t *end;
 	ufs_lbn_t lbnadd;
 	int level;
 	int error;
 	int off;
 
 
 	freework = NULL;
 	if (blkno == 0)
 		return (0);
 	mp = freeblks->fb_list.wk_mp;
 	ump = VFSTOUFS(mp);
 	/*
 	 * Here, calls to VOP_BMAP() will fail.  However, we already have
 	 * the on-disk address, so we just pass it to bread() instead of
 	 * having bread() attempt to calculate it using VOP_BMAP().
 	 */
 	error = ffs_breadz(ump, ITOV(ip), lbn, blkptrtodb(ump, blkno),
 	    (int)mp->mnt_stat.f_iosize, NULL, NULL, 0, NOCRED, 0, NULL, &bp);
 	if (error)
 		return (error);
 	level = lbn_level(lbn);
 	lbnadd = lbn_offset(ump->um_fs, level);
 	/*
 	 * Compute the offset of the last block we want to keep.  Store
 	 * in the freework the first block we want to completely free.
 	 */
 	off = (lastlbn - -(lbn + level)) / lbnadd;
 	/* The last pointer is kept, so nothing in this block is freed. */
 	if (off + 1 == NINDIR(ump->um_fs))
 		goto nowork;
 	freework = newfreework(ump, freeblks, NULL, lbn, blkno, 0, off + 1, 0);
 	/*
 	 * Link the freework into the indirdep.  This will prevent any new
 	 * allocations from proceeding until we are finished with the
 	 * truncate and the block is written.
 	 */
 	ACQUIRE_LOCK(ump);
 	indirdep = indirdep_lookup(mp, ip, bp);
 	if (indirdep->ir_freeblks)
 		panic("setup_trunc_indir: indirdep already truncated.");
 	TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
 	freework->fw_indir = indirdep;
 	/*
 	 * Cancel any allocindirs that will not make it to disk.
 	 * We have to do this for all copies of the indirdep that
 	 * live on this newblk.
 	 */
 	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
 		if (newblk_lookup(mp, dbtofsb(ump->um_fs, bp->b_blkno), 0,
 		    &newblk) == 0)
 			panic("setup_trunc_indir: lost block");
 		LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
 			trunc_indirdep(indirn, freeblks, bp, off);
 	} else
 		trunc_indirdep(indirdep, freeblks, bp, off);
 	FREE_LOCK(ump);
 	/*
 	 * Creation is protected by the buf lock. The saveddata is only
 	 * needed if a full truncation follows a partial truncation but it
 	 * is difficult to allocate in that case so we fetch it anyway.
 	 */
 	if (indirdep->ir_saveddata == NULL)
 		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
 		    M_SOFTDEP_FLAGS);
 nowork:
 	/* Fetch the blkno of the child and the zero start offset. */
 	if (I_IS_UFS1(ip)) {
 		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
 		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
 	} else {
 		blkno = ((ufs2_daddr_t *)bp->b_data)[off];
 		start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
 	}
 	if (freework) {
 		/* Zero the truncated pointers. */
 		end = bp->b_data + bp->b_bcount;
 		bzero(start, end - start);
 		bdwrite(bp);
 	} else
 		bqrelse(bp);	/* Block was not modified; just release it. */
 	/* Level 0 is where the descent stops. */
 	if (level == 0)
 		return (0);
 	/* Compute the lbn of the child indirect and descend into it. */
 	lbn++; /* adjust level */
 	lbn -= (off * lbnadd);
 	return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
 }
 
 /*
  * Complete the partial truncation of an indirect block setup by
  * setup_trunc_indir().  This zeros the truncated pointers in the saved
  * copy and writes them to disk before the freeblks is allowed to complete.
  *
  * Must be called with the softdep lock held.  The lock may be dropped and
  * re-acquired while waiting for the buffer lock and while releasing the
  * saved buffer.
  */
 static void
 complete_trunc_indir(freework)
 	struct freework *freework;
 {
 	struct freework *fwn;
 	struct indirdep *indirdep;
 	struct ufsmount *ump;
 	struct buf *bp;
 	uintptr_t start;
 	int count;
 
 	ump = VFSTOUFS(freework->fw_list.wk_mp);
 	LOCK_OWNED(ump);
 	indirdep = freework->fw_indir;
 	/*
 	 * Loop until either the buffer is gone or we hold its lock.
 	 * ir_bp is re-read on every pass.
 	 */
 	for (;;) {
 		bp = indirdep->ir_bp;
 		/* See if the block was discarded. */
 		if (bp == NULL)
 			break;
 		/* Inline part of getdirtybuf().  We don't want bremfree. */
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
 			break;
 		/*
 		 * NOTE(review): LK_INTERLOCK presumably releases the
 		 * softdep lock while sleeping, which is why it is
 		 * re-acquired below -- confirm against BUF_LOCK().
 		 */
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 		    LOCK_PTR(ump)) == 0)
 			BUF_UNLOCK(bp);
 		ACQUIRE_LOCK(ump);
 	}
 	freework->fw_state |= DEPCOMPLETE;
 	TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
 	/*
 	 * Zero the pointers in the saved copy.
 	 * The byte offset is fw_start times the pointer size for the
 	 * inode format recorded in ir_state.
 	 */
 	if (indirdep->ir_state & UFS1FMT)
 		start = sizeof(ufs1_daddr_t);
 	else
 		start = sizeof(ufs2_daddr_t);
 	start *= freework->fw_start;
 	count = indirdep->ir_savebp->b_bcount - start;
 	start += (uintptr_t)indirdep->ir_savebp->b_data;
 	bzero((char *)start, count);
 	/*
 	 * We need to start the next truncation in the list if it has not
 	 * been started yet.
 	 */
 	fwn = TAILQ_FIRST(&indirdep->ir_trunc);
 	if (fwn != NULL) {
 		if (fwn->fw_freeblks == indirdep->ir_freeblks)
 			TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
 		if ((fwn->fw_state & ONWORKLIST) == 0)
 			freework_enqueue(fwn);
 	}
 	/*
 	 * If bp is NULL the block was fully truncated, restore
 	 * the saved block list otherwise free it if it is no
 	 * longer needed.
 	 */
 	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
 		if (bp == NULL)
 			bcopy(indirdep->ir_saveddata,
 			    indirdep->ir_savebp->b_data,
 			    indirdep->ir_savebp->b_bcount);
 		free(indirdep->ir_saveddata, M_INDIRDEP);
 		indirdep->ir_saveddata = NULL;
 	}
 	/*
 	 * When bp is NULL there is a full truncation pending.  We
 	 * must wait for this full truncation to be journaled before
 	 * we can release this freework because the disk pointers will
 	 * never be written as zero.
 	 */
 	if (bp == NULL)  {
 		if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
 			handle_written_freework(freework);
 		else
 			WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
 			   &freework->fw_list);
 		if (fwn == NULL) {
 			/*
 			 * No truncation remains queued: tear down the
 			 * indirdep.  fw_indir is overwritten with a
 			 * recognizable bogus pointer first, and the saved
 			 * buffer is released with the softdep lock dropped.
 			 */
 			freework->fw_indir = (void *)0x0000deadbeef0000;
 			bp = indirdep->ir_savebp;
 			indirdep->ir_savebp = NULL;
 			free_indirdep(indirdep);
 			FREE_LOCK(ump);
 			brelse(bp);
 			ACQUIRE_LOCK(ump);
 		}
 	} else {
 		/* Complete when the real copy is written. */
 		WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
 		BUF_UNLOCK(bp);
 	}
 }
 
 /*
  * Calculate the number of blocks we are going to release where datablocks
  * is the current total and length is the new file size.
  *
  * The blocks still needed for "length" bytes (data plus the indirect
  * blocks required to map them) are tallied in fragments, converted with
  * fsbtodb(), and subtracted from "datablocks".
  */
 static ufs2_daddr_t
 blkcount(fs, datablocks, length)
 	struct fs *fs;
 	ufs2_daddr_t datablocks;
 	off_t length;
 {
 	off_t kept, remaining;
 
 	remaining = howmany(length, fs->fs_bsize);
 	if (remaining <= UFS_NDADDR) {
 		/* Direct blocks only: count the fragments directly. */
 		kept = howmany(length, fs->fs_fsize);
 	} else {
 		kept = blkstofrags(fs, remaining);
 		remaining -= UFS_NDADDR;
 		/*
 		 * Count all single, then double, then triple indirects
 		 * required.  Subtracting one indirect's worth of blocks on
 		 * each pass accounts for the one of each level that the
 		 * inode points to directly.
 		 */
 		do {
 			kept += blkstofrags(fs,
 			    howmany(remaining, NINDIR(fs)));
 			remaining -= NINDIR(fs);
 			if (remaining > 0)
 				remaining = howmany(remaining, NINDIR(fs));
 		} while (remaining > 0);
 	}
 	kept = fsbtodb(fs, kept);
 	/*
 	 * Handle sparse files.  We can't reclaim more blocks than the inode
 	 * references.  We will correct it later in handle_complete_freeblks()
 	 * when we know the real count.
 	 */
 	if (kept > datablocks)
 		return (0);
 	return (datablocks - kept);
 }
 
 /*
  * Handle freeblocks for journaled softupdate filesystems.
  *
  * Contrary to normal softupdates, we must preserve the block pointers in
  * indirects until their subordinates are free.  This is to avoid journaling
  * every block that is freed which may consume more space than the journal
  * itself.  The recovery program will see the free block journals at the
  * base of the truncated area and traverse them to reclaim space.  The
  * pointers in the inode may be cleared immediately after the journal
  * records are written because each direct and indirect pointer in the
  * inode is recorded in a journal.  This permits full truncation to proceed
  * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
  *
  * The algorithm is as follows:
  * 1) Traverse the in-memory state and create journal entries to release
  *    the relevant blocks and full indirect trees.
  * 2) Traverse the indirect block chain adding partial truncation freework
  *    records to indirects in the path to lastlbn.  The freework will
  *    prevent new allocation dependencies from being satisfied in this
  *    indirect until the truncation completes.
  * 3) Read and lock the inode block, performing an update with the new size
  *    and pointers.  This prevents truncated data from becoming valid on
  *    disk through step 4.
  * 4) Reap unsatisfied dependencies that are beyond the truncated area,
  *    eliminate journal work for those records that do not require it.
  * 5) Schedule the journal records to be written followed by the inode block.
  * 6) Allocate any necessary frags for the end of file.
  * 7) Zero any partially truncated blocks.
  *
  * From this truncation proceeds asynchronously using the freework and
  * indir_trunc machinery.  The file will not be extended again into a
  * partially truncated indirect block until all work is completed but
  * the normal dependency mechanism ensures that it is rolled back/forward
  * as appropriate.  Further truncation may occur without delay and is
  * serialized in indir_trunc().
  */
 void
 softdep_journal_freeblocks(ip, cred, length, flags)
 	struct inode *ip;	/* The inode whose length is to be reduced */
 	struct ucred *cred;
 	off_t length;		/* The new length for the file */
 	int flags;		/* IO_EXT and/or IO_NORMAL */
 {
 	struct freeblks *freeblks, *fbn;
 	struct worklist *wk, *wkn;
 	struct inodedep *inodedep;
 	struct jblkdep *jblkdep;
 	struct allocdirect *adp, *adpn;
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct buf *bp;
 	struct vnode *vp;
 	struct mount *mp;
 	daddr_t dbn;
 	ufs2_daddr_t extblocks, datablocks;
 	ufs_lbn_t tmpval, lbn, lastlbn;
 	int frags, lastoff, iboff, allocblock, needj, error, i;
 
 	ump = ITOUMP(ip);
 	mp = UFSTOVFS(ump);
 	fs = ump->um_fs;
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_journal_freeblocks called on non-softdep filesystem"));
 	vp = ITOV(ip);
 	needj = 1;
 	iboff = -1;
 	allocblock = 0;
 	extblocks = 0;
 	datablocks = 0;
 	frags = 0;
 	freeblks = newfreeblks(mp, ip);
 	ACQUIRE_LOCK(ump);
 	/*
 	 * If we're truncating a removed file that will never be written
 	 * we don't need to journal the block frees.  The canceled journals
 	 * for the allocations will suffice.
 	 */
 	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
 	    length == 0)
 		needj = 0;
 	CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
 	    ip->i_number, length, needj);
 	FREE_LOCK(ump);
 	/*
 	 * Calculate the lbn that we are truncating to.  This results in -1
 	 * if we're truncating the 0 bytes.  So it is the last lbn we want
 	 * to keep, not the first lbn we want to truncate.
 	 */
 	lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
 	lastoff = blkoff(fs, length);
 	/*
 	 * Compute frags we are keeping in lastlbn.  0 means all.
 	 */
 	if (lastlbn >= 0 && lastlbn < UFS_NDADDR) {
 		frags = fragroundup(fs, lastoff);
 		/* adp offset of last valid allocdirect. */
 		iboff = lastlbn;
 	} else if (lastlbn > 0)
 		iboff = UFS_NDADDR;
 	if (fs->fs_magic == FS_UFS2_MAGIC)
 		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
 	/*
 	 * Handle normal data blocks and indirects.  This section saves
 	 * values used after the inode update to complete frag and indirect
 	 * truncation.
 	 */
 	if ((flags & IO_NORMAL) != 0) {
 		/*
 		 * Handle truncation of whole direct and indirect blocks.
 		 */
 		for (i = iboff + 1; i < UFS_NDADDR; i++)
 			setup_freedirect(freeblks, ip, i, needj);
 		for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
 		    i < UFS_NIADDR;
 		    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
 			/* Release a whole indirect tree. */
 			if (lbn > lastlbn) {
 				setup_freeindir(freeblks, ip, i, -lbn -i,
 				    needj);
 				continue;
 			}
 			iboff = i + UFS_NDADDR;
 			/*
 			 * Traverse partially truncated indirect tree.
 			 */
 			if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
 				setup_trunc_indir(freeblks, ip, -lbn - i,
 				    lastlbn, DIP(ip, i_ib[i]));
 		}
 		/*
 		 * Handle partial truncation to a frag boundary.
 		 */
 		if (frags) {
 			ufs2_daddr_t blkno;
 			long oldfrags;
 
 			oldfrags = blksize(fs, ip, lastlbn);
 			blkno = DIP(ip, i_db[lastlbn]);
 			if (blkno && oldfrags != frags) {
 				oldfrags -= frags;
 				oldfrags = numfrags(fs, oldfrags);
 				blkno += numfrags(fs, frags);
 				newfreework(ump, freeblks, NULL, lastlbn,
 				    blkno, oldfrags, 0, needj);
 				if (needj)
 					adjust_newfreework(freeblks,
 					    numfrags(fs, frags));
 			} else if (blkno == 0)
 				allocblock = 1;
 		}
 		/*
 		 * Add a journal record for partial truncate if we are
 		 * handling indirect blocks.  Non-indirects need no extra
 		 * journaling.
 		 */
 		if (length != 0 && lastlbn >= UFS_NDADDR) {
 			UFS_INODE_SET_FLAG(ip, IN_TRUNCATED);
 			newjtrunc(freeblks, length, 0);
 		}
 		ip->i_size = length;
 		DIP_SET(ip, i_size, ip->i_size);
 		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
 		datablocks = DIP(ip, i_blocks) - extblocks;
 		if (length != 0)
 			datablocks = blkcount(fs, datablocks, length);
 		freeblks->fb_len = length;
 	}
 	if ((flags & IO_EXT) != 0) {
 		for (i = 0; i < UFS_NXADDR; i++)
 			setup_freeext(freeblks, ip, i, needj);
 		ip->i_din2->di_extsize = 0;
 		datablocks += extblocks;
 		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
 	}
 #ifdef QUOTA
 	/* Reference the quotas in case the block count is wrong in the end. */
 	quotaref(vp, freeblks->fb_quota);
 	(void) chkdq(ip, -datablocks, NOCRED, FORCE);
 #endif
 	freeblks->fb_chkcnt = -datablocks;
 	UFS_LOCK(ump);
 	fs->fs_pendingblocks += datablocks;
 	UFS_UNLOCK(ump);
 	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
 	/*
 	 * Handle truncation of incomplete alloc direct dependencies.  We
 	 * hold the inode block locked to prevent incomplete dependencies
 	 * from reaching the disk while we are eliminating those that
 	 * have been truncated.  This is a partially inlined ffs_update().
 	 */
 	ufs_itimes(vp);
 	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
 	dbn = fsbtodb(fs, ino_to_fsba(fs, ip->i_number));
 	error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize,
 	    NULL, NULL, 0, cred, 0, NULL, &bp);
 	if (error) {
 		softdep_error("softdep_journal_freeblocks", error);
 		return;
 	}
 	if (bp->b_bufsize == fs->fs_bsize)
 		bp->b_flags |= B_CLUSTEROK;
 	softdep_update_inodeblock(ip, bp, 0);
 	if (ump->um_fstype == UFS1) {
 		*((struct ufs1_dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
 	} else {
 		ffs_update_dinode_ckhash(fs, ip->i_din2);
 		*((struct ufs2_dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
 	}
 	ACQUIRE_LOCK(ump);
 	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 	if ((inodedep->id_state & IOSTARTED) != 0)
 		panic("softdep_setup_freeblocks: inode busy");
 	/*
 	 * Add the freeblks structure to the list of operations that
 	 * must await the zero'ed inode being written to disk. If we
 	 * still have a bitmap dependency (needj), then the inode
 	 * has never been written to disk, so we can process the
 	 * freeblks below once we have deleted the dependencies.
 	 */
 	if (needj)
 		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
 	else
 		freeblks->fb_state |= COMPLETE;
 	if ((flags & IO_NORMAL) != 0) {
 		TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
 			if (adp->ad_offset > iboff)
 				cancel_allocdirect(&inodedep->id_inoupdt, adp,
 				    freeblks);
 			/*
 			 * Truncate the allocdirect.  We could eliminate
 			 * or modify journal records as well.
 			 */
 			else if (adp->ad_offset == iboff && frags)
 				adp->ad_newsize = frags;
 		}
 	}
 	if ((flags & IO_EXT) != 0)
 		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
 			cancel_allocdirect(&inodedep->id_extupdt, adp,
 			    freeblks);
 	/*
 	 * Scan the bufwait list for newblock dependencies that will never
 	 * make it to disk.
 	 */
 	LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
 		if (wk->wk_type != D_ALLOCDIRECT)
 			continue;
 		adp = WK_ALLOCDIRECT(wk);
 		if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
 		    ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
 			cancel_jfreeblk(freeblks, adp->ad_newblkno);
 			cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
 			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
 		}
 	}
 	/*
 	 * Add journal work.
 	 */
 	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
 		add_to_journal(&jblkdep->jb_list);
 	FREE_LOCK(ump);
 	bdwrite(bp);
 	/*
 	 * Truncate dependency structures beyond length.
 	 */
 	trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
 	/*
 	 * This is only set when we need to allocate a fragment because
 	 * none existed at the end of a frag-sized file.  It handles only
 	 * allocating a new, zero filled block.
 	 */
 	if (allocblock) {
 		ip->i_size = length - lastoff;
 		DIP_SET(ip, i_size, ip->i_size);
 		error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
 		if (error != 0) {
 			softdep_error("softdep_journal_freeblks", error);
 			return;
 		}
 		ip->i_size = length;
 		DIP_SET(ip, i_size, length);
 		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE);
 		allocbuf(bp, frags);
 		ffs_update(vp, 0);
 		bawrite(bp);
 	} else if (lastoff != 0 && vp->v_type != VDIR) {
 		int size;
 
 		/*
 		 * Zero the end of a truncated frag or block.
 		 */
 		size = sblksize(fs, length, lastlbn);
 		error = bread(vp, lastlbn, size, cred, &bp);
 		if (error == 0) {
 			bzero((char *)bp->b_data + lastoff, size - lastoff);
 			bawrite(bp);
 		} else if (!ffs_fsfail_cleanup(ump, error)) {
 			softdep_error("softdep_journal_freeblks", error);
 			return;
 		}
 	}
 	ACQUIRE_LOCK(ump);
 	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 	TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
 	freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
 	/*
 	 * We zero earlier truncations so they don't erroneously
 	 * update i_blocks.
 	 */
 	if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
 		TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
 			fbn->fb_len = 0;
 	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
 	    LIST_EMPTY(&freeblks->fb_jblkdephd))
 		freeblks->fb_state |= INPROGRESS;
 	else
 		freeblks = NULL;
 	FREE_LOCK(ump);
 	if (freeblks)
 		handle_workitem_freeblocks(freeblks, 0);
 	trunc_pages(ip, length, extblocks, flags);
 
 }
 
 /*
  * Flush a JOP_SYNC to the journal.
  *
  * Nothing is done unless IN_TRUNCATED is set on the inode.  When it is,
  * the flag is cleared, a jfsync record holding the inode number and its
  * current size is added to the journal, and jwait() is called on that
  * record with MNT_WAIT before the lock is released.
  */
 void
 softdep_journal_fsync(ip)
 	struct inode *ip;
 {
 	struct jfsync *jfsync;
 	struct ufsmount *ump;
 
 	ump = ITOUMP(ip);
 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 	    ("softdep_journal_fsync called on non-softdep filesystem"));
 	if ((ip->i_flag & IN_TRUNCATED) != 0) {
 		/* Consume the pending-truncation marker. */
 		ip->i_flag &= ~IN_TRUNCATED;
 		jfsync = malloc(sizeof(*jfsync), M_JFSYNC,
 		    M_SOFTDEP_FLAGS | M_ZERO);
 		workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ump));
 		jfsync->jfs_ino = ip->i_number;
 		jfsync->jfs_size = ip->i_size;
 		/* Queue the record and wait on it under the softdep lock. */
 		ACQUIRE_LOCK(ump);
 		add_to_journal(&jfsync->jfs_list);
 		jwait(&jfsync->jfs_list, MNT_WAIT);
 		FREE_LOCK(ump);
 	}
 }
 
 /*
  * Block de-allocation dependencies.
  *
  * When blocks are de-allocated, the on-disk pointers must be nullified before
  * the blocks are made available for use by other files.  (The true
  * requirement is that old pointers must be nullified before new on-disk
  * pointers are set.  We chose this slightly more stringent requirement to
  * reduce complexity.) Our implementation handles this dependency by updating
  * the inode (or indirect block) appropriately but delaying the actual block
  * de-allocation (i.e., freemap and free space count manipulation) until
  * after the updated versions reach stable storage.  After the disk is
  * updated, the blocks can be safely de-allocated whenever it is convenient.
  * This implementation handles only the common case of reducing a file's
  * length to zero (asserted below). Other cases are handled by the
  * conventional synchronous write approach.
  *
  * The ffs implementation with which we worked double-checks
  * the state of the block pointers and file size as it reduces
  * a file's length.  Some of this code is replicated here in our
  * soft updates implementation.  The freeblks->fb_chkcnt field is
  * used to transfer a part of this information to the procedure
  * that eventually de-allocates the blocks.
  *
  * This routine should be called from the routine that shortens
  * a file's length, before the inode's size or block pointers
  * are modified. It will save the block pointer information for
  * later release and zero the inode so that the calling routine
  * can release it.
  *
  * If the inode block cannot be read the routine returns without doing
  * any of the above.
  */
 void
 softdep_setup_freeblocks(ip, length, flags)
 	struct inode *ip;	/* The inode whose length is to be reduced */
 	off_t length;		/* The new length for the file */
 	int flags;		/* IO_EXT and/or IO_NORMAL */
 {
 	struct ufs1_dinode *dp1;
 	struct ufs2_dinode *dp2;
 	struct freeblks *freeblks;
 	struct inodedep *inodedep;
 	struct allocdirect *adp;
 	struct ufsmount *ump;
 	struct buf *bp;
 	struct fs *fs;
 	ufs2_daddr_t extblocks, datablocks;
 	struct mount *mp;
 	int i, delay, error;
 	ufs_lbn_t tmpval;
 	ufs_lbn_t lbn;
 
 	ump = ITOUMP(ip);
 	mp = UFSTOVFS(ump);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_freeblocks called on non-softdep filesystem"));
 	CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
 	    ip->i_number, length);
 	KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
 	fs = ump->um_fs;
 	/* Read the filesystem block that holds this inode's dinode. */
 	if ((error = bread(ump->um_devvp,
 	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
 		if (!ffs_fsfail_cleanup(ump, error))
 			softdep_error("softdep_setup_freeblocks", error);
 		return;
 	}
 	freeblks = newfreeblks(mp, ip);
 	extblocks = 0;
 	datablocks = 0;
 	if (fs->fs_magic == FS_UFS2_MAGIC)
 		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
 	if ((flags & IO_NORMAL) != 0) {
 		/* Every direct block, then every indirect level. */
 		for (i = 0; i < UFS_NDADDR; i++)
 			setup_freedirect(freeblks, ip, i, 0);
 		for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
 		    i < UFS_NIADDR;
 		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
 			setup_freeindir(freeblks, ip, i, -lbn -i, 0);
 		ip->i_size = 0;
 		DIP_SET(ip, i_size, 0);
 		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
 		datablocks = DIP(ip, i_blocks) - extblocks;
 	}
 	if ((flags & IO_EXT) != 0) {
 		for (i = 0; i < UFS_NXADDR; i++)
 			setup_freeext(freeblks, ip, i, 0);
 		ip->i_din2->di_extsize = 0;
 		datablocks += extblocks;
 		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
 	}
 #ifdef QUOTA
 	/* Reference the quotas in case the block count is wrong in the end. */
 	quotaref(ITOV(ip), freeblks->fb_quota);
 	(void) chkdq(ip, -datablocks, NOCRED, FORCE);
 #endif
 	/*
 	 * Record the number of blocks being released in the freeblks,
 	 * in the filesystem's pending count and in the inode itself.
 	 */
 	freeblks->fb_chkcnt = -datablocks;
 	UFS_LOCK(ump);
 	fs->fs_pendingblocks += datablocks;
 	UFS_UNLOCK(ump);
 	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
 	/*
 	 * Push the zero'ed inode to its disk buffer so that we are free
 	 * to delete its dependencies below. Once the dependencies are gone
 	 * the buffer can be safely released.  The di_freelink value found
 	 * in the buffer is carried over into the in-core dinode before the
 	 * copy so that it is not overwritten.
 	 */
 	if (ump->um_fstype == UFS1) {
 		dp1 = ((struct ufs1_dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number));
 		ip->i_din1->di_freelink = dp1->di_freelink;
 		*dp1 = *ip->i_din1;
 	} else {
 		dp2 = ((struct ufs2_dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number));
 		ip->i_din2->di_freelink = dp2->di_freelink;
 		ffs_update_dinode_ckhash(fs, ip->i_din2);
 		*dp2 = *ip->i_din2;
 	}
 	/*
 	 * Find and eliminate any inode dependencies.
 	 */
 	ACQUIRE_LOCK(ump);
 	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 	if ((inodedep->id_state & IOSTARTED) != 0)
 		panic("softdep_setup_freeblocks: inode busy");
 	/*
 	 * Add the freeblks structure to the list of operations that
 	 * must await the zero'ed inode being written to disk. If we
 	 * still have a bitmap dependency (delay == 0), then the inode
 	 * has never been written to disk, so we can process the
 	 * freeblks below once we have deleted the dependencies.
 	 */
 	delay = (inodedep->id_state & DEPCOMPLETE);
 	if (delay)
 		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
 	else
 		freeblks->fb_state |= COMPLETE;
 	/*
 	 * Because the file length has been truncated to zero, any
 	 * pending block allocation dependency structures associated
 	 * with this inode are obsolete and can simply be de-allocated.
 	 * We must first merge the two dependency lists to get rid of
 	 * any duplicate freefrag structures, then purge the merged list.
 	 * If we still have a bitmap dependency, then the inode has never
 	 * been written to disk, so we can free any fragments without delay.
 	 */
 	if (flags & IO_NORMAL) {
 		merge_inode_lists(&inodedep->id_newinoupdt,
 		    &inodedep->id_inoupdt);
 		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
 			cancel_allocdirect(&inodedep->id_inoupdt, adp,
 			    freeblks);
 	}
 	if (flags & IO_EXT) {
 		merge_inode_lists(&inodedep->id_newextupdt,
 		    &inodedep->id_extupdt);
 		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
 			cancel_allocdirect(&inodedep->id_extupdt, adp,
 			    freeblks);
 	}
 	FREE_LOCK(ump);
 	bdwrite(bp);
 	/* lastlbn of -1 requests a full truncation of the dependencies. */
 	trunc_dependencies(ip, freeblks, -1, 0, flags);
 	ACQUIRE_LOCK(ump);
 	/* The lock was dropped above, so look the inodedep up again. */
 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
 		(void) free_inodedep(inodedep);
 	freeblks->fb_state |= DEPCOMPLETE;
 	/*
 	 * If the inode with zeroed block pointers is now on disk
 	 * we can start freeing blocks.
 	 */
 	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
 		freeblks->fb_state |= INPROGRESS;
 	else
 		freeblks = NULL;
 	FREE_LOCK(ump);
 	if (freeblks)
 		handle_workitem_freeblocks(freeblks, 0);
 	trunc_pages(ip, length, extblocks, flags);
 }
 
 /*
  * Drop the page cache pages that back the truncated parts of this inode
  * and tell the vnode pager the new size, so that no stale data lingers
  * in the page cache.
  *
  * With IO_EXT the range starting at the page index derived from
  * -extblocks is removed.  With IO_NORMAL pending output is drained, the
  * pager size is set to length, and the pages backing indirect blocks are
  * removed.
  */
 static void
 trunc_pages(ip, length, extblocks, flags)
 	struct inode *ip;
 	off_t length;
 	ufs2_daddr_t extblocks;
 	int flags;
 {
 	struct vnode *vp;
 	struct fs *fs;
 	ufs_lbn_t lbn;
 	off_t end, extend;
 
 	vp = ITOV(ip);
 	fs = ITOFS(ip);
 	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
 	if ((flags & IO_EXT) != 0)
 		vn_pages_remove(vp, extend, 0);
 	if ((flags & IO_NORMAL) != 0) {
 		BO_LOCK(&vp->v_bufobj);
 		drain_output(vp);
 		BO_UNLOCK(&vp->v_bufobj);
 		/*
 		 * The vnode pager takes care of the file pages; the pages
 		 * for indirect blocks are removed below.
 		 */
 		vnode_pager_setsize(vp, length);
 		/*
 		 * Work out where to stop, based on the last indirect to be
 		 * kept.  When the new end of file reaches into indirects
 		 * the negative of its lbn can be used.  Doubles and triples
 		 * live at lower numbers, so those must not be removed if
 		 * they exist.  Their lbns do not overlap with others, so
 		 * there is no need to check how many levels are in use.
 		 * Otherwise stop where the external attribute range begins.
 		 */
 		end = extend;
 		lbn = lblkno(fs, length);
 		if (lbn >= UFS_NDADDR) {
 			/* Calculate the virtual lbn of the triple indirect. */
 			lbn = -lbn - (UFS_NIADDR - 1);
 			end = OFF_TO_IDX(lblktosize(fs, lbn));
 		}
 		vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
 	}
 }
 
 /*
  * Decide whether the buf bp lies in the range removed by a truncation.
  *
  * Returns 1 when the buffer is affected and 0 when it is not.  *blkoffp
  * is always written: it is lastoff when the buffer is the last kept
  * block and that block is only partly truncated, and 0 in every other
  * case.
  */
 static int
 trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
 	struct buf *bp;
 	int *blkoffp;
 	ufs_lbn_t lastlbn;
 	int lastoff;
 	int flags;
 {
 	ufs_lbn_t lbn;
 
 	*blkoffp = 0;
 	/*
 	 * ALTDATA buffers only match when IO_EXT was requested, and are
 	 * then always a full truncation.
 	 */
 	if ((bp->b_xflags & BX_ALTDATA) != 0)
 		return ((flags & IO_EXT) != 0 ? 1 : 0);
 	/* Ordinary buffers only match when IO_NORMAL was requested. */
 	if ((flags & IO_NORMAL) == 0)
 		return (0);
 	/* A lastlbn of -1 means everything goes. */
 	if (lastlbn == -1)
 		return (1);
 	/*
 	 * For a partial truncate only the blocks and indirect blocks
 	 * covering the removed range are wanted.  Negative (indirect)
 	 * block numbers are converted before comparing.
 	 */
 	lbn = bp->b_lblkno;
 	if (lbn < 0)
 		lbn = -(lbn + lbn_level(lbn));
 	if (lbn < lastlbn)
 		return (0);
 	if (lbn > lastlbn)
 		return (1);
 	/* The last kept block is touched only when it is cut part way. */
 	if (lastoff == 0)
 		return (0);
 	*blkoffp = lastoff;
 	return (1);
 }
 
 /*
  * Eliminate any dependencies that exist in memory beyond lblkno:off
  *
  * Both the dirty and the clean buffer lists of the vnode are scanned.
  * trunc_check_buf() selects the buffers that fall in the truncated
  * range; BV_SCANNED marks buffers that have already been looked at so
  * that each scan can be restarted from the head of its list whenever
  * the bufobj lock has been given up.
  */
 static void
 trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
 	struct inode *ip;
 	struct freeblks *freeblks;
 	ufs_lbn_t lastlbn;
 	int lastoff;
 	int flags;
 {
 	struct bufobj *bo;
 	struct vnode *vp;
 	struct buf *bp;
 	int blkoff;
 
 	/*
 	 * We must wait for any I/O in progress to finish so that
 	 * all potential buffers on the dirty list will be visible.
 	 * Once they are all there, walk the list and get rid of
 	 * any dependencies.
 	 */
 	vp = ITOV(ip);
 	bo = &vp->v_bufobj;
 	BO_LOCK(bo);
 	drain_output(vp);
 	/* Start with every dirty buffer unmarked. */
 	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
 		bp->b_vflags &= ~BV_SCANNED;
 restart:
 	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
 		if (bp->b_vflags & BV_SCANNED)
 			continue;
 		/* Buffers outside the truncated range are just marked. */
 		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
 			bp->b_vflags |= BV_SCANNED;
 			continue;
 		}
 		KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer"));
 		/* No buffer obtained: rescan from the head of the list. */
 		if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL)
 			goto restart;
 		BO_UNLOCK(bo);
 		/*
 		 * A non-zero return means the buffer is to be kept
 		 * (bqrelse); zero means it may be thrown away (brelse).
 		 */
 		if (deallocate_dependencies(bp, freeblks, blkoff))
 			bqrelse(bp);
 		else
 			brelse(bp);
 		/* The lock was dropped, so the list must be rescanned. */
 		BO_LOCK(bo);
 		goto restart;
 	}
 	/*
 	 * Now do the work of vtruncbuf while also matching indirect blocks.
 	 */
 	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
 		bp->b_vflags &= ~BV_SCANNED;
 cleanrestart:
 	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
 		if (bp->b_vflags & BV_SCANNED)
 			continue;
 		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
 			bp->b_vflags |= BV_SCANNED;
 			continue;
 		}
 		/*
 		 * On ENOLCK the buffer was not locked; retake the bufobj
 		 * lock and rescan.
 		 * NOTE(review): presumably LK_INTERLOCK releases the bufobj
 		 * lock inside BUF_LOCK() on both outcomes -- confirm.
 		 */
 		if (BUF_LOCK(bp,
 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 		    BO_LOCKPTR(bo)) == ENOLCK) {
 			BO_LOCK(bo);
 			goto cleanrestart;
 		}
 		bp->b_vflags |= BV_SCANNED;
 		bremfree(bp);
 		if (blkoff != 0) {
 			/* Partly truncated block: shrink it and keep it. */
 			allocbuf(bp, blkoff);
 			bqrelse(bp);
 		} else {
 			/* Fully truncated block: invalidate and discard. */
 			bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
 			brelse(bp);
 		}
 		BO_LOCK(bo);
 		goto cleanrestart;
 	}
 	drain_output(vp);
 	BO_UNLOCK(bo);
 }
 
 static int
 cancel_pagedep(pagedep, freeblks, blkoff)
 	struct pagedep *pagedep;
 	struct freeblks *freeblks;
 	int blkoff;
 {
 	struct jremref *jremref;
 	struct jmvref *jmvref;
 	struct dirrem *dirrem, *tmp;
 	int i;
 
 	/*
 	 * Copy any directory remove dependencies to the list
 	 * to be processed after the freeblks proceeds.  If
 	 * directory entry never made it to disk they
 	 * can be dumped directly onto the work list.
 	 */
 	LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
 		/* Skip this directory removal if it is intended to remain. */
 		if (dirrem->dm_offset < blkoff)
 			continue;
 		/*
 		 * If there are any dirrems we wait for the journal write
 		 * to complete and then restart the buf scan as the lock
 		 * has been dropped.
 		 */
 		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
 			jwait(&jremref->jr_list, MNT_WAIT);
 			return (ERESTART);
 		}
 		LIST_REMOVE(dirrem, dm_next);
 		dirrem->dm_dirinum = pagedep->pd_ino;
 		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
 	}
 	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
 		jwait(&jmvref->jm_list, MNT_WAIT);
 		return (ERESTART);
 	}
 	/*
 	 * When we're partially truncating a pagedep we just want to flush
 	 * journal entries and return.  There can not be any adds in the
 	 * truncated portion of the directory and newblk must remain if
 	 * part of the block remains.
 	 */
 	if (blkoff != 0) {
 		struct diradd *dap;
 
 		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
 			if (dap->da_offset > blkoff)
 				panic("cancel_pagedep: diradd %p off %d > %d",
 				    dap, dap->da_offset, blkoff);
 		for (i = 0; i < DAHASHSZ; i++)
 			LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
 				if (dap->da_offset > blkoff)
 					panic("cancel_pagedep: diradd %p off %d > %d",
 					    dap, dap->da_offset, blkoff);
 		return (0);
 	}
 	/*
 	 * There should be no directory add dependencies present
 	 * as the directory could not be truncated until all
 	 * children were removed.
 	 */
 	KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
 	    ("deallocate_dependencies: pendinghd != NULL"));
 	for (i = 0; i < DAHASHSZ; i++)
 		KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
 		    ("deallocate_dependencies: diraddhd != NULL"));
 	if ((pagedep->pd_state & NEWBLOCK) != 0)
 		free_newdirblk(pagedep->pd_newdirblk);
 	if (free_pagedep(pagedep) == 0)
 		panic("Failed to free pagedep %p", pagedep);
 	return (0);
 }
 
 /*
  * Strip the dependency structures from a buffer that is being truncated.
  * off is the offset within the block at which the truncation starts;
  * 0 means the whole buffer is going away.  The per-filesystem softdep
  * lock is taken while the dependency list is walked.
  *
  * Returns ERESTART when cancel_pagedep() asks for a restart, EBUSY when
  * the buffer was only shrunk to off bytes and must be kept, and 0 when
  * the buffer has been marked B_INVAL | B_NOCACHE.
  */
 static int
 deallocate_dependencies(bp, freeblks, off)
 	struct buf *bp;
 	struct freeblks *freeblks;
 	int off;
 {
 	struct indirdep *indirdep;
 	struct worklist *wk, *wkn;
 	struct ufsmount *ump;
 
 	ump = softdep_bp_to_mp(bp);
 	if (ump == NULL)
 		goto done;
 	ACQUIRE_LOCK(ump);
 	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
 		switch (wk->wk_type) {
 		case D_INDIRDEP:
 			indirdep = WK_INDIRDEP(wk);
 			if (bp->b_lblkno >= 0 ||
 			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
 				panic("deallocate_dependencies: not indir");
 			cancel_indirdep(indirdep, bp, freeblks);
 			break;
 
 		case D_PAGEDEP:
 			if (cancel_pagedep(WK_PAGEDEP(wk), freeblks,
 			    off) != 0) {
 				FREE_LOCK(ump);
 				return (ERESTART);
 			}
 			break;
 
 		case D_ALLOCINDIR:
 			/*
 			 * Just unhook the allocindir.  It will be found
 			 * again through the indirdep, where pointers can
 			 * be cleared if necessary.
 			 */
 			WORKLIST_REMOVE(wk);
 			break;
 
 		case D_FREEWORK:
 			/*
 			 * An earlier truncation is waiting for the zero'd
 			 * pointers to be written.  It can be freed once
 			 * this freeblks is journaled.
 			 */
 			WORKLIST_REMOVE(wk);
 			wk->wk_state |= ONDEPLIST;
 			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
 			break;
 
 		case D_ALLOCDIRECT:
 			/* Tolerated only on a partial truncation. */
 			if (off != 0)
 				break;
 			/* FALLTHROUGH */
 		default:
 			panic("deallocate_dependencies: Unexpected type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 	FREE_LOCK(ump);
 done:
 	if (off == 0) {
 		bp->b_flags |= B_INVAL | B_NOCACHE;
 		return (0);
 	}
 	/*
 	 * This was a partial truncation, so the buf must not be thrown
 	 * away; some deps may always remain.
 	 */
 	allocbuf(bp, off);
 	bp->b_vflags |= BV_SCANNED;
 	return (EBUSY);
 }
 
 /*
  * An allocdirect is being canceled due to a truncate.  We must make sure
  * the journal entry is released in concert with the blkfree that releases
  * the storage.  Completed journal entries must not be released until the
  * space is no longer pointed to by the inode or in the bitmap.
  *
  * adphead is the list the allocdirect is currently on, adp the
  * allocdirect to cancel and freeblks the truncation it is canceled for.
  * freeblks must already carry a freework for the allocdirect's block;
  * the routine panics if it does not.
  */
 static void
 cancel_allocdirect(adphead, adp, freeblks)
 	struct allocdirectlst *adphead;
 	struct allocdirect *adp;
 	struct freeblks *freeblks;
 {
 	struct freework *freework;
 	struct newblk *newblk;
 	struct worklist *wk;
 
 	TAILQ_REMOVE(adphead, adp, ad_next);
 	newblk = (struct newblk *)adp;
 	freework = NULL;
 	/*
 	 * Find the correct freework structure.
 	 */
 	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
 		if (wk->wk_type != D_FREEWORK)
 			continue;
 		freework = WK_FREEWORK(wk);
 		if (freework->fw_blkno == newblk->nb_newblkno)
 			break;
 	}
 	/*
 	 * The cursor, not freework, tells whether the search succeeded:
 	 * freework is overwritten by every D_FREEWORK entry visited and
 	 * would still point at the last non-matching one after the loop
 	 * has run off the end of the list.
 	 */
 	if (wk == NULL)
 		panic("cancel_allocdirect: Freework not found");
 	/*
 	 * If a newblk exists at all we still have the journal entry that
 	 * initiated the allocation so we do not need to journal the free.
 	 */
 	cancel_jfreeblk(freeblks, freework->fw_blkno);
 	/*
 	 * If the journal hasn't been written the jnewblk must be passed
 	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
 	 * this by linking the journal dependency into the freework to be
 	 * freed when freework_freeblock() is called.  If the journal has
 	 * been written we can simply reclaim the journal space when the
 	 * freeblks work is complete.
 	 */
 	freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
 	    &freeblks->fb_jwork);
 	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
 }
 
 
 /*
  * Cancel a new block allocation, which may be for an indirect or a
  * direct block.  The newblk is taken off the lists it is on and its
  * journal record, if any, is returned for the caller to resolve.
  *
  * When both a journal record and wk are present, the record is detached
  * from the newblk and made to depend on wk.  Journal work queued on the
  * newblk is moved to wkhd.
  *
  * Indirects that were never pointed at on disk get special treatment,
  * as they could never be found again once this block is released.
  */
 static struct jnewblk *
 cancel_newblk(newblk, wk, wkhd)
 	struct newblk *newblk;
 	struct worklist *wk;
 	struct workhead *wkhd;
 {
 	struct jnewblk *jnewblk;
 
 	CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
 
 	newblk->nb_state |= GOINGAWAY;
 	/*
 	 * The completedhd of each indirdep attached to this newblk used
 	 * to be walked here to cancel them and collect journal work.
 	 * Only the oldest journal segment is needed, and the lowest point
 	 * of the tree always holds it, so the segments of subordinates
 	 * may be released and the indirdep list can be left for
 	 * indirdep_complete() to deal with when this newblk is freed.
 	 */
 	if ((newblk->nb_state & ONDEPLIST) != 0) {
 		newblk->nb_state &= ~ONDEPLIST;
 		LIST_REMOVE(newblk, nb_deps);
 	}
 	if ((newblk->nb_state & ONWORKLIST) != 0)
 		WORKLIST_REMOVE(&newblk->nb_list);
 	/*
 	 * While the journal entry is unwritten, remember the dependency
 	 * that frees it until either the write happens or the superseding
 	 * operation completes.
 	 */
 	jnewblk = newblk->nb_jnewblk;
 	if (wk != NULL && jnewblk != NULL) {
 		jnewblk->jn_dep = wk;
 		newblk->nb_jnewblk = NULL;
 	}
 	if (!LIST_EMPTY(&newblk->nb_jwork))
 		jwork_move(wkhd, &newblk->nb_jwork);
 	/*
 	 * On truncation the newdirblk has to be freed early so that the
 	 * pagedep leaves the hash before we return.  At most one may be
 	 * attached.
 	 */
 	wk = LIST_FIRST(&newblk->nb_newdirblk);
 	if (wk != NULL)
 		free_newdirblk(WK_NEWDIRBLK(wk));
 	if (!LIST_EMPTY(&newblk->nb_newdirblk))
 		panic("cancel_newblk: extra newdirblk");
 
 	return (jnewblk);
 }
 
 /*
  * Detach the freefrag from a newblk, if it has one, and mark it
  * COMPLETE.  It is put on the worklist once all of its completion bits
  * are set, i.e. once the pointers are written and the previous block is
  * no longer needed.
  */
 static void
 newblk_freefrag(newblk)
 	struct newblk *newblk;
 {
 	struct freefrag *freefrag;
 
 	freefrag = newblk->nb_freefrag;
 	if (freefrag != NULL) {
 		newblk->nb_freefrag = NULL;
 		freefrag->ff_state |= COMPLETE;
 		if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
 			add_to_worklist(&freefrag->ff_list, 0);
 	}
 }
 
 /*
  * Free a newblk, generating a new freefrag work request if appropriate.
  * Call this only after the inode pointer and any direct block pointers
  * are valid, or have been removed entirely by a truncate or a frag
  * extension.  The newblk must no longer have a jnewblk attached and must
  * have been claimed (its type is no longer D_NEWBLK).
  */
 static void
 free_newblk(newblk)
 	struct newblk *newblk;
 {
 	struct indirdep *indirdep;
 	struct worklist *wk;
 
 	KASSERT(newblk->nb_jnewblk == NULL,
 	    ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
 	KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
 	    ("free_newblk: unclaimed newblk"));
 	LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp));
 	newblk_freefrag(newblk);
 	/* Unhook the newblk from every list it may still be on. */
 	if ((newblk->nb_state & ONDEPLIST) != 0)
 		LIST_REMOVE(newblk, nb_deps);
 	if ((newblk->nb_state & ONWORKLIST) != 0)
 		WORKLIST_REMOVE(&newblk->nb_list);
 	LIST_REMOVE(newblk, nb_hash);
 	/* At most one newdirblk may be attached. */
 	wk = LIST_FIRST(&newblk->nb_newdirblk);
 	if (wk != NULL)
 		free_newdirblk(WK_NEWDIRBLK(wk));
 	if (!LIST_EMPTY(&newblk->nb_newdirblk))
 		panic("free_newblk: extra newdirblk");
 	/* Complete the attached indirdeps until none is left. */
 	for (indirdep = LIST_FIRST(&newblk->nb_indirdeps); indirdep != NULL;
 	    indirdep = LIST_FIRST(&newblk->nb_indirdeps))
 		indirdep_complete(indirdep);
 	handle_jwork(&newblk->nb_jwork);
 	WORKITEM_FREE(newblk, D_NEWBLK);
 }
 
 /*
  * Free a newdirblk and clear the NEWBLOCK flag on the pagedep it
  * belongs to.  Any mkdir work items waiting on the newdirblk are passed
  * to handle_written_mkdir() with MKDIR_BODY.
  */
 static void
 free_newdirblk(newdirblk)
 	struct newdirblk *newdirblk;
 {
 	struct pagedep *pagedep;
 	struct diradd *dap;
 	struct worklist *wk;
 
 	LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp));
 	WORKLIST_REMOVE(&newdirblk->db_list);
 	/*
 	 * A pagedep that is still linked onto the directory buffer's
 	 * dependency chain may have pd_pendinghd entries that are not yet
 	 * committed to disk.  For those we only clear NEWBLOCK and leave
 	 * the pd_pendinghd list to be processed the next time the pagedep
 	 * is written.  A pagedep that has left the chain has all of its
 	 * pending entries on disk, so they can be freed right here.
 	 */
 	pagedep = newdirblk->db_pagedep;
 	pagedep->pd_state &= ~NEWBLOCK;
 	if ((pagedep->pd_state & ONWORKLIST) == 0) {
 		for (dap = LIST_FIRST(&pagedep->pd_pendinghd); dap != NULL;
 		    dap = LIST_FIRST(&pagedep->pd_pendinghd))
 			free_diradd(dap, NULL);
 		/*
 		 * The pagedep itself is freed if nothing depends on it
 		 * any more.
 		 */
 		free_pagedep(pagedep);
 	}
 	/* There should never be more than one item on this list. */
 	for (wk = LIST_FIRST(&newdirblk->db_mkdir); wk != NULL;
 	    wk = LIST_FIRST(&newdirblk->db_mkdir)) {
 		WORKLIST_REMOVE(wk);
 		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
 	}
 	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
 }
 
 /*
  * Prepare an inode to be freed. The actual free operation is not
  * done until the zero'ed inode has been written to disk.
  *
  * pvp supplies the in-core inode (via VTOI) used to locate the
  * ufsmount, and its v_mount is used for the dependency lookups; ino
  * is the number of the inode being released and mode is recorded in
  * the freefile as fx_mode.
  *
  * The freefile is processed at once through handle_workitem_freefile()
  * when no inodedep exists or check_inode_unwritten() succeeds.
  * Otherwise it is queued on the inodedep's id_inowait list.
  */
 void
 softdep_freefile(pvp, ino, mode)
 	struct vnode *pvp;
 	ino_t ino;
 	int mode;
 {
 	struct inode *ip = VTOI(pvp);
 	struct inodedep *inodedep;
 	struct freefile *freefile;
 	struct freeblks *freeblks;
 	struct ufsmount *ump;
 
 	ump = ITOUMP(ip);
 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 	    ("softdep_freefile called on non-softdep filesystem"));
 	/*
 	 * This sets up the inode de-allocation dependency.
 	 */
 	freefile = malloc(sizeof(struct freefile),
 		M_FREEFILE, M_SOFTDEP_FLAGS);
 	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
 	freefile->fx_mode = mode;
 	freefile->fx_oldinum = ino;
 	freefile->fx_devvp = ump->um_devvp;
 	LIST_INIT(&freefile->fx_jwork);
 	/* Count one more inode whose release is pending. */
 	UFS_LOCK(ump);
 	ump->um_fs->fs_pendinginodes += 1;
 	UFS_UNLOCK(ump);
 
 	/*
 	 * If the inodedep does not exist, then the zero'ed inode has
 	 * been written to disk. If the allocated inode has never been
 	 * written to disk, then the on-disk inode is zero'ed. In either
 	 * case we can free the file immediately.  If the journal was
 	 * canceled before being written the inode will never make it to
 	 * disk and we must send the canceled journal entries to
 	 * ffs_freefile() to be cleared in conjunction with the bitmap.
 	 * Any blocks waiting on the inode to write can be safely freed
 	 * here as it will never be written.
 	 */
 	ACQUIRE_LOCK(ump);
 	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
 	if (inodedep) {
 		/*
 		 * Clear out freeblks that no longer need to reference
 		 * this inode.
 		 */
 		while ((freeblks =
 		    TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
 			TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
 			    fb_next);
 			freeblks->fb_state &= ~ONDEPLIST;
 		}
 		/*
 		 * Remove this inode from the unlinked list.
 		 */
 		if (inodedep->id_state & UNLINKED) {
 			/*
 			 * Save the journal work to be freed with the bitmap
 			 * before we clear UNLINKED.  Otherwise it can be lost
 			 * if the inode block is written.
 			 */
 			handle_bufwait(inodedep, &freefile->fx_jwork);
 			clear_unlinked_inodedep(inodedep);
 			/*
 			 * Re-acquire inodedep as we've dropped the
 			 * per-filesystem lock in clear_unlinked_inodedep().
 			 */
 			inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
 		}
 	}
 	/*
 	 * Nothing left to wait for: drop the lock and process the
 	 * freefile right away.
 	 */
 	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
 		FREE_LOCK(ump);
 		handle_workitem_freefile(freefile);
 		return;
 	}
 	/*
 	 * The free has to wait on the inodedep.  An inodedep that is not
 	 * yet DEPCOMPLETE is flagged GOINGAWAY, and the freefile is parked
 	 * on its id_inowait list.
 	 */
 	if ((inodedep->id_state & DEPCOMPLETE) == 0)
 		inodedep->id_state |= GOINGAWAY;
 	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
 	FREE_LOCK(ump);
 	/* Only when pvp's own inode is the one being freed. */
 	if (ip->i_number == ino)
 		UFS_INODE_SET_FLAG(ip, IN_MODIFIED);
 }
 
 /*
  * Check to see if an inode has never been written to disk. If
  * so free the inodedep and return success, otherwise return failure.
  *
  * If we still have a bitmap dependency, then the inode has never
  * been written to disk. Drop the dependency as it is no longer
  * necessary since the inode is being deallocated. We set the
  * ALLCOMPLETE flags since the bitmap now properly shows that the
  * inode is not allocated. Even if the inode is actively being
  * written, it has been rolled back to its zero'ed state, so we
  * are ensured that a zero inode is what is on the disk. For short
  * lived files, this change will usually result in removing all the
  * dependencies from the inode so that it can be freed immediately.
  *
  * Returns 1 when the inodedep was freed and 0 when it was left
  * untouched.  Must be called with the per-filesystem softdep lock
  * held (asserted via LOCK_OWNED).
  */
 static int
 check_inode_unwritten(inodedep)
 	struct inodedep *inodedep;
 {
 
 	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
 
 	/*
 	 * Give up if the bitmap dependency has already completed, the
 	 * inode is on the unlinked list, or any dependency list or
 	 * counter still references this inodedep.
 	 */
 	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
 	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
 	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
 	    !LIST_EMPTY(&inodedep->id_bufwait) ||
 	    !LIST_EMPTY(&inodedep->id_inowait) ||
 	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
 	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
 	    inodedep->id_mkdiradd != NULL || 
 	    inodedep->id_nlinkdelta != 0)
 		return (0);
 	/*
 	 * Another process might be in initiate_write_inodeblock_ufs[12]
 	 * trying to allocate memory without holding "Softdep Lock".
 	 */
 	if ((inodedep->id_state & IOSTARTED) != 0 &&
 	    inodedep->id_savedino1 == NULL)
 		return (0);
 
 	/*
 	 * Detach the inodedep from the bitmap dependency list and from
 	 * any worklist, mark it fully complete and discard the saved
 	 * inode copy so that free_inodedep() is able to release it.
 	 */
 	if (inodedep->id_state & ONDEPLIST)
 		LIST_REMOVE(inodedep, id_deps);
 	inodedep->id_state &= ~ONDEPLIST;
 	inodedep->id_state |= ALLCOMPLETE;
 	inodedep->id_bmsafemap = NULL;
 	if (inodedep->id_state & ONWORKLIST)
 		WORKLIST_REMOVE(&inodedep->id_list);
 	if (inodedep->id_savedino1 != NULL) {
 		free(inodedep->id_savedino1, M_SAVEDINO);
 		inodedep->id_savedino1 = NULL;
 	}
 	/* After the teardown above a refusal to free is a fatal error. */
 	if (free_inodedep(inodedep) == 0)
 		panic("check_inode_unwritten: busy inode");
 	return (1);
 }
 
 /*
  * Report whether an inodedep has no outstanding dependencies.
  * Returns 1 when every ALLCOMPLETE flag is set, all of its dependency
  * lists are empty and no mkdir, link-count delta or saved inode copy
  * remains; returns 0 otherwise.  Nothing is modified.
  */
 static int
 check_inodedep_free(inodedep)
 	struct inodedep *inodedep;
 {
 
 	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
 	/* Every completion flag must be set. */
 	if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE)
 		return (0);
 	/* Directory and buffer wait lists must be drained. */
 	if (!LIST_EMPTY(&inodedep->id_dirremhd) ||
 	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
 	    !LIST_EMPTY(&inodedep->id_bufwait) ||
 	    !LIST_EMPTY(&inodedep->id_inowait))
 		return (0);
 	/* So must the journal reference, update and freeblks queues. */
 	if (!TAILQ_EMPTY(&inodedep->id_inoreflst) ||
 	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_freeblklst))
 		return (0);
 	/* No pending mkdir, link-count adjustment or saved inode. */
 	if (inodedep->id_mkdiradd != NULL)
 		return (0);
 	if (inodedep->id_nlinkdelta != 0)
 		return (0);
 	if (inodedep->id_savedino1 != NULL)
 		return (0);
 	return (1);
 }
 
 /*
  * Release an inodedep once nothing depends on it any longer.
  * Returns 1 if the structure was freed.  Returns 0, leaving it alone,
  * when it is on a worklist, is marked UNLINKED, or fails
  * check_inodedep_free().
  */
 static int
 free_inodedep(inodedep)
 	struct inodedep *inodedep;
 {
 
 	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
 	if (inodedep->id_state & (ONWORKLIST | UNLINKED))
 		return (0);
 	if (check_inodedep_free(inodedep) == 0)
 		return (0);
 	/* Unhook it from the dependency list, if linked, and the hash. */
 	if ((inodedep->id_state & ONDEPLIST) != 0)
 		LIST_REMOVE(inodedep, id_deps);
 	LIST_REMOVE(inodedep, id_hash);
 	WORKITEM_FREE(inodedep, D_INODEDEP);
 	return (1);
 }
 
 /*
  * Free the block referenced by a freework structure.  The parent freeblks
  * structure is released and completed when the final cg bitmap reaches
  * the disk.  This routine may be freeing a jnewblk which never made it to
  * disk in which case we do not have to wait as the operation is undone
  * in memory immediately.
  *
  * freework describes the block to release; key is handed through to
  * ffs_blkfree().  The per-filesystem softdep lock must be held on entry
  * (asserted via LOCK_OWNED).  It is dropped around the calls to
  * freeblks_free() and ffs_blkfree() and held again on return.
  */
 static void
 freework_freeblock(freework, key)
 	struct freework *freework;
 	u_long key;
 {
 	struct freeblks *freeblks;
 	struct jnewblk *jnewblk;
 	struct ufsmount *ump;
 	struct workhead wkhd;
 	struct fs *fs;
 	int bsize;
 	int needj;
 
 	ump = VFSTOUFS(freework->fw_list.wk_mp);
 	LOCK_OWNED(ump);
 	/*
 	 * Handle partial truncate separately.
 	 */
 	if (freework->fw_indir) {
 		complete_trunc_indir(freework);
 		return;
 	}
 	freeblks = freework->fw_freeblks;
 	fs = ump->um_fs;
 	/* needj: the freed block must wait on the journal/cg write. */
 	needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
 	bsize = lfragtosize(fs, freework->fw_frags);
 	/* wkhd collects the work items passed along to ffs_blkfree(). */
 	LIST_INIT(&wkhd);
 	/*
 	 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
 	 * on the indirblk hashtable and prevents premature freeing.
 	 */
 	freework->fw_state |= DEPCOMPLETE;
 	/*
 	 * SUJ needs to wait for the segment referencing freed indirect
 	 * blocks to expire so that we know the checker will not confuse
 	 * a re-allocated indirect block with its old contents.
 	 */
 	if (needj && freework->fw_lbn <= -UFS_NDADDR)
 		indirblk_insert(freework);
 	/*
 	 * If we are canceling an existing jnewblk pass it to the free
 	 * routine, otherwise pass the freeblk which will ultimately
 	 * release the freeblks.  If we're not journaling, we can just
 	 * free the freeblks immediately.
 	 */
 	jnewblk = freework->fw_jnewblk;
 	if (jnewblk != NULL) {
 		cancel_jnewblk(jnewblk, &wkhd);
 		needj = 0;
 	} else if (needj) {
 		freework->fw_state |= DELAYEDFREE;
 		freeblks->fb_cgwait++;
 		WORKLIST_INSERT(&wkhd, &freework->fw_list);
 	}
 	/* The block release itself runs without the softdep lock. */
 	FREE_LOCK(ump);
 	freeblks_free(ump, freeblks, btodb(bsize));
 	CTR4(KTR_SUJ,
 	    "freework_freeblock: ino %jd blkno %jd lbn %jd size %d",
 	    freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
 	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
 	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key);
 	ACQUIRE_LOCK(ump);
 	/*
 	 * The jnewblk will be discarded and the bits in the map never
 	 * made it to disk.  We can immediately free the freeblk.
 	 */
 	if (needj == 0)
 		handle_written_freework(freework);
 }
 
 /*
  * Put a freework that needs more processing back on its freeblks, and
  * schedule the freeblks itself when it is ready to run.  Keeping the
  * pending freeworks on the freeblks makes it easier to find all work
  * required to flush a truncation in process_truncates().
  *
  * A freework marked INPROGRESS is not re-inserted.  The freeblks is
  * added to the worklist only when it is ALLCOMPLETE, is neither
  * ONWORKLIST nor INPROGRESS, and has an empty fb_jblkdephd list.
  */
 static void
 freework_enqueue(freework)
 	struct freework *freework;
 {
 	struct freeblks *owner;
 
 	owner = freework->fw_freeblks;
 	if (!(freework->fw_state & INPROGRESS))
 		WORKLIST_INSERT(&owner->fb_freeworkhd, &freework->fw_list);
 	if ((owner->fb_state & (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) !=
 	    ALLCOMPLETE)
 		return;
 	if (!LIST_EMPTY(&owner->fb_jblkdephd))
 		return;
 	add_to_worklist(&owner->fb_list, WK_NODELAY);
 }
 
 /*
  * Drive the freeing of an indirect block tree one step further.  The
  * operation can be suspended at any point; fw_off holds the offset to
  * resume from, which gives some flow control for large truncates that
  * would otherwise fan out into a huge number of dependencies.
  *
  * Three cases are distinguished:
  *  - DEPCOMPLETE is set: finish through handle_written_freework().
  *  - fw_off has reached NINDIR(fs): free the indirect block itself.
  *  - otherwise: mark the freework INPROGRESS and walk the block with
  *    indir_trunc(), dropping the per-filesystem lock around the call.
  */
 static void
 handle_workitem_indirblk(freework)
 	struct freework *freework;
 {
 	struct ufsmount *ump;
 	struct fs *fs;
 
 	ump = VFSTOUFS(freework->fw_freeblks->fb_list.wk_mp);
 	fs = ump->um_fs;
 	if ((freework->fw_state & DEPCOMPLETE) != 0) {
 		handle_written_freework(freework);
 	} else if (freework->fw_off == NINDIR(fs)) {
 		freework_freeblock(freework, SINGLETON_KEY);
 	} else {
 		freework->fw_state |= INPROGRESS;
 		FREE_LOCK(ump);
 		indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
 		    freework->fw_lbn);
 		ACQUIRE_LOCK(ump);
 	}
 }
 
 /*
  * Called when a freework structure attached to a cg buf is written.  The
  * ref on either the parent or the freeblks structure is released and
  * the freeblks is added back to the worklist if there is more work to do.
  */
 static void
 handle_written_freework(freework)
 	struct freework *freework;
 {
 	struct freeblks *freeblks;
 	struct freework *parent;
 
 	freeblks = freework->fw_freeblks;
 	parent = freework->fw_parent;
 	if (freework->fw_state & DELAYEDFREE)
 		freeblks->fb_cgwait--;
 	freework->fw_state |= COMPLETE;
 	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
 		WORKITEM_FREE(freework, D_FREEWORK);
 	if (parent) {
 		if (--parent->fw_ref == 0)
 			freework_enqueue(parent);
 		return;
 	}
 	if (--freeblks->fb_ref != 0)
 		return;
 	if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
 	    ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) 
 		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
 }
 
 /*
  * This workitem routine performs the block de-allocation.
  * The workitem is added to the pending list after the updated
  * inode block has been written to disk.  As mentioned above,
  * checks regarding the number of blocks de-allocated (compared
  * to the number of blocks allocated for the file) are also
  * performed in this function.
  *
  * Every item queued on fb_freeworkhd is removed and dispatched by
  * type.  flags is only passed through to handle_complete_freeblocks().
  *
  * Returns 0 when references to the freeblks remain after the queue is
  * drained; otherwise returns whatever handle_complete_freeblocks()
  * returns.
  */
 static int
 handle_workitem_freeblocks(freeblks, flags)
 	struct freeblks *freeblks;
 	int flags;
 {
 	struct freework *freework;
 	struct newblk *newblk;
 	struct allocindir *aip;
 	struct ufsmount *ump;
 	struct worklist *wk;
 	u_long key;
 
 	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
 	    ("handle_workitem_freeblocks: Journal entries not written."));
 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 	/* key is paired with ffs_blkrelease_finish() at the end. */
 	key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
 	ACQUIRE_LOCK(ump);
 	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		switch (wk->wk_type) {
 		case D_DIRREM:
 			/* Mark complete and hand back to the worklist. */
 			wk->wk_state |= COMPLETE;
 			add_to_worklist(wk, 0);
 			continue;
 
 		case D_ALLOCDIRECT:
 			free_newblk(WK_NEWBLK(wk));
 			continue;
 
 		case D_ALLOCINDIR:
 			aip = WK_ALLOCINDIR(wk);
 			freework = NULL;
 			/*
 			 * An allocindir marked DELAYEDFREE gets its own
 			 * freework.  The lock is dropped around the
 			 * newfreework() call.
 			 */
 			if (aip->ai_state & DELAYEDFREE) {
 				FREE_LOCK(ump);
 				freework = newfreework(ump, freeblks, NULL,
 				    aip->ai_lbn, aip->ai_newblkno,
 				    ump->um_fs->fs_frag, 0, 0);
 				ACQUIRE_LOCK(ump);
 			}
 			newblk = WK_NEWBLK(wk);
 			/*
 			 * Move a still-attached jnewblk over to the freework.
 			 * NOTE(review): freework is NULL here unless
 			 * DELAYEDFREE was set above; this presumably relies
 			 * on nb_jnewblk implying DELAYEDFREE -- confirm
 			 * against cancel_allocindir() and its callers.
 			 */
 			if (newblk->nb_jnewblk) {
 				freework->fw_jnewblk = newblk->nb_jnewblk;
 				newblk->nb_jnewblk->jn_dep = &freework->fw_list;
 				newblk->nb_jnewblk = NULL;
 			}
 			free_newblk(newblk);
 			continue;
 
 		case D_FREEWORK:
 			freework = WK_FREEWORK(wk);
 			/* Indirect blocks have lbn <= -UFS_NDADDR. */
 			if (freework->fw_lbn <= -UFS_NDADDR)
 				handle_workitem_indirblk(freework);
 			else
 				freework_freeblock(freework, key);
 			continue;
 		default:
 			panic("handle_workitem_freeblocks: Unknown type %s",
 			    TYPENAME(wk->wk_type));
 		}
 	}
 	/*
 	 * References remain, so the freeblks cannot be completed yet.
 	 * Clear INPROGRESS, wake anyone waiting on it and skip the
 	 * completion step below.
 	 */
 	if (freeblks->fb_ref != 0) {
 		freeblks->fb_state &= ~INPROGRESS;
 		wake_worklist(&freeblks->fb_list);
 		freeblks = NULL;
 	}
 	FREE_LOCK(ump);
 	ffs_blkrelease_finish(ump, key);
 	if (freeblks)
 		return handle_complete_freeblocks(freeblks, flags);
 	return (0);
 }
 
 /*
  * Account for blocks released by a truncate as they are freed, so that
  * fs_pendingblocks follows the real free block count more closely than
  * a single adjustment at the end would.  The block count recorded at
  * free time may have been wrong, so the amount taken off
  * fs_pendingblocks is limited to the negated fb_chkcnt as it stood
  * before this release was added to it.
  *
  * blocks is the number of blocks being released now; it is always
  * added to fb_chkcnt.  The update is done under the UFS lock.
  */
 static void
 freeblks_free(ump, freeblks, blocks)
 	struct ufsmount *ump;
 	struct freeblks *freeblks;
 	int blocks;
 {
 	ufs2_daddr_t outstanding;
 	int credit;
 
 	UFS_LOCK(ump);
 	outstanding = -freeblks->fb_chkcnt;
 	freeblks->fb_chkcnt += blocks;
 	if (outstanding > 0) {
 		/* Never credit more than was still outstanding. */
 		credit = blocks;
 		if (outstanding < credit)
 			credit = outstanding;
 		ump->um_fs->fs_pendingblocks -= credit;
 	}
 	UFS_UNLOCK(ump);
 }
 
 /*
  * Once all of the freework workitems are complete we can retire the
  * freeblocks dependency and any journal work awaiting completion.  This
  * can not be called until all other dependencies are stable on disk.
  *
  * flags is combined with LK_EXCLUSIVE and used as the lock flags for
  * ffs_vgetf() when the inode's block count has to be corrected.
  *
  * Returns EBUSY, without retiring the freeblks, when ffs_vgetf() fails;
  * returns 0 once the freeblks has been freed.
  */
 static int
 handle_complete_freeblocks(freeblks, flags)
 	struct freeblks *freeblks;
 	int flags;
 {
 	struct inodedep *inodedep;
 	struct inode *ip;
 	struct vnode *vp;
 	struct fs *fs;
 	struct ufsmount *ump;
 	ufs2_daddr_t spare;
 
 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 	fs = ump->um_fs;
 	flags = LK_EXCLUSIVE | flags;
 	/* Non-zero when the released count differs from the expected one. */
 	spare = freeblks->fb_chkcnt;
 
 	/*
 	 * If we did not release the expected number of blocks we may have
 	 * to adjust the inode block count here.  Only do so if it wasn't
 	 * a truncation to zero and the modrev still matches.
 	 */
 	if (spare && freeblks->fb_len != 0) {
 		if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
 		    flags, &vp, FFSV_FORCEINSMQ) != 0)
 			return (EBUSY);
 		ip = VTOI(vp);
 		if (ip->i_mode == 0) {
 			/* A zero mode means there is no inode to adjust. */
 			vgone(vp);
 		} else if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
 			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
 			UFS_INODE_SET_FLAG(ip, IN_CHANGE);
 			/*
 			 * We must wait so this happens before the
 			 * journal is reclaimed.
 			 */
 			ffs_update(vp, 1);
 		}
 		vput(vp);
 	}
 	/* A negative spare is added to (i.e. lowers) fs_pendingblocks. */
 	if (spare < 0) {
 		UFS_LOCK(ump);
 		fs->fs_pendingblocks += spare;
 		UFS_UNLOCK(ump);
 	}
 #ifdef QUOTA
 	/* Handle spare. */
 	if (spare)
 		quotaadj(freeblks->fb_quota, ump, -spare);
 	quotarele(freeblks->fb_quota);
 #endif
 	ACQUIRE_LOCK(ump);
 	/*
 	 * Unlink from the inodedep's freeblks list if still linked there,
 	 * and try to free the inodedep when that list becomes empty.
 	 */
 	if (freeblks->fb_state & ONDEPLIST) {
 		inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
 		    0, &inodedep);
 		TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
 		freeblks->fb_state &= ~ONDEPLIST;
 		if (TAILQ_EMPTY(&inodedep->id_freeblklst))
 			free_inodedep(inodedep);
 	}
 	/*
 	 * All of the freeblock deps must be complete prior to this call
 	 * so it's now safe to complete earlier outstanding journal entries.
 	 */
 	handle_jwork(&freeblks->fb_jwork);
 	WORKITEM_FREE(freeblks, D_FREEBLKS);
 	FREE_LOCK(ump);
 	return (0);
 }
 
 /*
  * Release blocks associated with the freeblks and stored in the indirect
  * block dbn. If level is greater than SINGLE, the block is an indirect block
  * and recursive calls to indir_trunc() must be used to cleanse other
  * indirect blocks.
  *
  * This handles partial and complete truncation of blocks.  Partial is noted
  * with goingaway == 0.  In this case the freework is completed after the
  * zero'd indirects are written to disk.  For full truncation the freework
  * is completed after the block is freed.
  *
  * freework tracks the indirect block being processed and fw_off gives
  * the index of the first pointer to examine.  dbn is the indirect's
  * block number as used for buffer lookup on the device vnode.  lbn is
  * its logical block number, from which the indirection level is derived.
  *
  * Called without the per-filesystem softdep lock, which is taken and
  * released internally.
  */
 static void
 indir_trunc(freework, dbn, lbn)
 	struct freework *freework;
 	ufs2_daddr_t dbn;
 	ufs_lbn_t lbn;
 {
 	struct freework *nfreework;
 	struct workhead wkhd;
 	struct freeblks *freeblks;
 	struct buf *bp;
 	struct fs *fs;
 	struct indirdep *indirdep;
 	struct mount *mp;
 	struct ufsmount *ump;
 	ufs1_daddr_t *bap1;
 	ufs2_daddr_t nb, nnb, *bap2;
 	ufs_lbn_t lbnadd, nlbn;
 	u_long key;
 	int nblocks, ufs1fmt, freedblocks;
 	int goingaway, freedeps, needj, level, cnt, i, error;
 
 	freeblks = freework->fw_freeblks;
 	mp = freeblks->fb_list.wk_mp;
 	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;
 	/*
 	 * Get buffer of block pointers to be freed.  There are three cases:
 	 *
 	 * 1) Partial truncate caches the indirdep pointer in the freework
 	 *    which provides us a back copy to the save bp which holds the
 	 *    pointers we want to clear.  When this completes the zero
 	 *    pointers are written to the real copy.
 	 * 2) The indirect is being completely truncated, cancel_indirdep()
 	 *    eliminated the real copy and placed the indirdep on the saved
 	 *    copy.  The indirdep and buf are discarded when this completes.
 	 * 3) The indirect was not in memory, we read a copy off of the disk
 	 *    using the devvp and drop and invalidate the buffer when we're
 	 *    done.
 	 */
 	goingaway = 1;
 	indirdep = NULL;
 	if (freework->fw_indir != NULL) {
 		goingaway = 0;
 		indirdep = freework->fw_indir;
 		bp = indirdep->ir_savebp;
 		if (bp == NULL || bp->b_blkno != dbn)
 			panic("indir_trunc: Bad saved buf %p blkno %jd",
 			    bp, (intmax_t)dbn);
 	} else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
 		/*
 		 * The lock prevents the buf dep list from changing and
 		 * indirects on devvp should only ever have one dependency.
 		 */
 		indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
 		if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
 			panic("indir_trunc: Bad indirdep %p from buf %p",
 			    indirdep, bp);
 	} else {
 		error = ffs_breadz(ump, freeblks->fb_devvp, dbn, dbn,
 		    (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, &bp);
 		/*
 		 * NOTE(review): on a read error we return before any block
 		 * below this indirect is freed and before INPROGRESS is
 		 * cleared or the freework completed -- confirm this is the
 		 * intended failure handling.
 		 */
 		if (error)
 			return;
 	}
 	ACQUIRE_LOCK(ump);
 	/* Protects against a race with complete_trunc_indir(). */
 	freework->fw_state &= ~INPROGRESS;
 	/*
 	 * If we have an indirdep we need to enforce the truncation order
 	 * and discard it when it is complete.
 	 */
 	if (indirdep) {
 		if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
 		    !TAILQ_EMPTY(&indirdep->ir_trunc)) {
 			/*
 			 * Add the complete truncate to the list on the
 			 * indirdep to enforce in-order processing.
 			 */
 			if (freework->fw_indir == NULL)
 				TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
 				    freework, fw_next);
 			FREE_LOCK(ump);
 			return;
 		}
 		/*
 		 * If we're goingaway, free the indirdep.  Otherwise it will
 		 * linger until the write completes.
 		 */
 		if (goingaway) {
 			KASSERT(indirdep->ir_savebp == bp,
 			    ("indir_trunc: losing ir_savebp %p",
 			    indirdep->ir_savebp));
 			indirdep->ir_savebp = NULL;
 			free_indirdep(indirdep);
 		}
 	}
 	FREE_LOCK(ump);
 	/*
 	 * Set up the pointer array matching the filesystem format (UFS1
 	 * or UFS2) and fetch the first pointer to examine.
 	 */
 	if (ump->um_fstype == UFS1) {
 		bap1 = (ufs1_daddr_t *)bp->b_data;
 		nb = bap1[freework->fw_off];
 		ufs1fmt = 1;
 		bap2 = NULL;
 	} else {
 		bap2 = (ufs2_daddr_t *)bp->b_data;
 		nb = bap2[freework->fw_off];
 		ufs1fmt = 0;
 		bap1 = NULL;
 	}
 	level = lbn_level(lbn);
 	needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
 	lbnadd = lbn_offset(fs, level);
 	nblocks = btodb(fs->fs_bsize);
 	nfreework = freework;
 	/* freedeps: dependencies created here; cnt: non-zero pointers. */
 	freedeps = 0;
 	cnt = 0;
 	/*
 	 * Reclaim blocks.  Traverses into nested indirect levels and
 	 * arranges for the current level to be freed when subordinates
 	 * are free when journaling.
 	 */
 	key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
 	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
 		/* A pointer that fails the sanity check is treated as zero. */
 		if (UFS_CHECK_BLKNO(mp, freeblks->fb_inum, nb,
 		    fs->fs_bsize) != 0)
 			nb = 0;
 		/* Look one pointer ahead; nnb is 0 after the last slot. */
 		if (i != NINDIR(fs) - 1) {
 			if (ufs1fmt)
 				nnb = bap1[i+1];
 			else
 				nnb = bap2[i+1];
 		} else
 			nnb = 0;
 		if (nb == 0)
 			continue;
 		cnt++;
 		if (level != 0) {
 			/* nb is itself an indirect block: recurse into it. */
 			nlbn = (lbn + 1) - (i * lbnadd);
 			if (needj != 0) {
 				nfreework = newfreework(ump, freeblks, freework,
 				    nlbn, nb, fs->fs_frag, 0, 0);
 				freedeps++;
 			}
 			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
 		} else {
 			struct freedep *freedep;
 
 			/*
 			 * Attempt to aggregate freedep dependencies for
 			 * all blocks being released to the same CG.
 			 */
 			LIST_INIT(&wkhd);
 			if (needj != 0 &&
 			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
 				freedep = newfreedep(freework);
 				WORKLIST_INSERT_UNLOCKED(&wkhd,
 				    &freedep->fd_list);
 				freedeps++;
 			}
 			CTR3(KTR_SUJ,
 			    "indir_trunc: ino %jd blkno %jd size %d",
 			    freeblks->fb_inum, nb, fs->fs_bsize);
 			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
 			    fs->fs_bsize, freeblks->fb_inum,
 			    freeblks->fb_vtype, &wkhd, key);
 		}
 	}
 	ffs_blkrelease_finish(ump, key);
 	/* In the full-truncation cases the buffer is discarded. */
 	if (goingaway) {
 		bp->b_flags |= B_INVAL | B_NOCACHE;
 		brelse(bp);
 	}
 	/*
 	 * Tally what was released at this level: the data blocks when this
 	 * is a level 0 indirect, plus the indirect block itself when it is
 	 * freed below because we are not journaling.
 	 */
 	freedblocks = 0;
 	if (level == 0)
 		freedblocks = (nblocks * cnt);
 	if (needj == 0)
 		freedblocks += nblocks;
 	freeblks_free(ump, freeblks, freedblocks);
 	/*
 	 * If we are journaling set up the ref counts and offset so this
 	 * indirect can be completed when its children are free.
 	 */
 	if (needj) {
 		ACQUIRE_LOCK(ump);
 		freework->fw_off = i;
 		freework->fw_ref += freedeps;
 		freework->fw_ref -= NINDIR(fs) + 1;
 		if (level == 0)
 			freeblks->fb_cgwait += freedeps;
 		if (freework->fw_ref == 0)
 			freework_freeblock(freework, SINGLETON_KEY);
 		FREE_LOCK(ump);
 		return;
 	}
 	/*
 	 * If we're not journaling we can free the indirect now.
 	 */
 	dbn = dbtofsb(fs, dbn);
 	CTR3(KTR_SUJ,
 	    "indir_trunc 2: ino %jd blkno %jd size %d",
 	    freeblks->fb_inum, dbn, fs->fs_bsize);
 	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
 	    freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON_KEY);
 	/* Non SUJ softdep does single-threaded truncations. */
 	if (freework->fw_blkno == dbn) {
 		/* Back at the freework's own block: complete it. */
 		freework->fw_state |= ALLCOMPLETE;
 		ACQUIRE_LOCK(ump);
 		handle_written_freework(freework);
 		FREE_LOCK(ump);
 	}
 	return;
 }
 
 /*
  * Cancel an allocindir when it is removed via truncation.  When bp is not
  * NULL the indirect never appeared on disk and is scheduled to be freed
  * independently of the indir so we can more easily track journal work.
  *
  * aip is unlinked from its ai_next list and, at the end, queued on
  * freeblks->fb_freeworkhd.  bp, when non-NULL, is the buffer whose
  * pointer slot at ai_offset is zeroed.  trunc is non-zero when called
  * for a truncation, in which case any freefrag on the newblk is dropped.
  */
 static void
 cancel_allocindir(aip, bp, freeblks, trunc)
 	struct allocindir *aip;
 	struct buf *bp;
 	struct freeblks *freeblks;
 	int trunc;
 {
 	struct indirdep *indirdep;
 	struct freefrag *freefrag;
 	struct newblk *newblk;
 
 	/* The allocindir is handled through its newblk view below. */
 	newblk = (struct newblk *)aip;
 	LIST_REMOVE(aip, ai_next);
 	/*
 	 * We must eliminate the pointer in bp if it must be freed on its
 	 * own due to partial truncate or pending journal work.
 	 */
 	if (bp && (trunc || newblk->nb_jnewblk)) {
 		/*
 		 * Clear the pointer and mark the aip to be freed
 		 * directly if it never existed on disk.
 		 */
 		aip->ai_state |= DELAYEDFREE;
 		indirdep = aip->ai_indirdep;
 		/* Slot width depends on the on-disk pointer format. */
 		if (indirdep->ir_state & UFS1FMT)
 			((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
 		else
 			((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
 	}
 	/*
 	 * When truncating the previous pointer will be freed via
 	 * savedbp.  Eliminate the freefrag which would dup free.
 	 */
 	if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
 		newblk->nb_freefrag = NULL;
 		if (freefrag->ff_jdep)
 			cancel_jfreefrag(
 			    WK_JFREEFRAG(freefrag->ff_jdep));
 		/* Keep the freefrag's journal work on the freeblks. */
 		jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
 		WORKITEM_FREE(freefrag, D_FREEFRAG);
 	}
 	/*
 	 * If the journal hasn't been written the jnewblk must be passed
 	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
 	 * this by leaving the journal dependency on the newblk to be freed
 	 * when a freework is created in handle_workitem_freeblocks().
 	 */
 	cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
 	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
 }
 
 /*
  * Create the mkdir dependencies for . and .. in a new directory.  Link them
  * in to a newdirblk so any subsequent additions are tracked properly.  The
  * caller is responsible for adding the mkdir1 dependency to the journal
  * and updating id_mkdiradd.  This function returns with the per-filesystem
  * lock held.
  *
  * dap is the diradd for the new directory's entry, newinum the inode
  * number of the new directory, dinum the inode number of its parent and
  * newdirbp the buffer holding the new directory's first block.
  *
  * Returns the MKDIR_BODY dependency (mkdir1).  The MKDIR_PARENT
  * dependency (mkdir2) is stored through mkdirp, or NULL when it turned
  * out to be unnecessary and was freed.
  */
 static struct mkdir *
 setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
 	struct diradd *dap;
 	ino_t newinum;
 	ino_t dinum;
 	struct buf *newdirbp;
 	struct mkdir **mkdirp;
 {
 	struct newblk *newblk;
 	struct pagedep *pagedep;
 	struct inodedep *inodedep;
 	struct newdirblk *newdirblk;
 	struct mkdir *mkdir1, *mkdir2;
 	struct worklist *wk;
 	struct jaddref *jaddref;
 	struct ufsmount *ump;
 	struct mount *mp;
 
 	mp = dap->da_list.wk_mp;
 	ump = VFSTOUFS(mp);
 	/* All three work items are allocated before the lock is taken. */
 	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
 	    M_SOFTDEP_FLAGS);
 	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
 	LIST_INIT(&newdirblk->db_mkdir);
 	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
 	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
 	mkdir1->md_state = ATTACHED | MKDIR_BODY;
 	mkdir1->md_diradd = dap;
 	mkdir1->md_jaddref = NULL;
 	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
 	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
 	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
 	mkdir2->md_diradd = dap;
 	mkdir2->md_jaddref = NULL;
 	/* Without SUJ there is no journal dependency to wait for. */
 	if (MOUNTEDSUJ(mp) == 0) {
 		mkdir1->md_state |= DEPCOMPLETE;
 		mkdir2->md_state |= DEPCOMPLETE;
 	}
 	/*
 	 * Dependency on "." and ".." being written to disk.
 	 */
 	mkdir1->md_buf = newdirbp;
 	ACQUIRE_LOCK(VFSTOUFS(mp));
 	LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs);
 	/*
 	 * We must link the pagedep, allocdirect, and newdirblk for
 	 * the initial file page so the pointer to the new directory
 	 * is not written until the directory contents are live and
 	 * any subsequent additions are not marked live until the
 	 * block is reachable via the inode.
 	 */
 	if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
 		panic("setup_newdir: lost pagedep");
 	/* Find the allocdirect attached to the new directory buffer. */
 	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
 		if (wk->wk_type == D_ALLOCDIRECT)
 			break;
 	if (wk == NULL)
 		panic("setup_newdir: lost allocdirect");
 	if (pagedep->pd_state & NEWBLOCK)
 		panic("setup_newdir: NEWBLOCK already set");
 	newblk = WK_NEWBLK(wk);
 	pagedep->pd_state |= NEWBLOCK;
 	pagedep->pd_newdirblk = newdirblk;
 	newdirblk->db_pagedep = pagedep;
 	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
 	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
 	/*
 	 * Look up the inodedep for the parent directory so that we
 	 * can link mkdir2 into the pending dotdot jaddref or
 	 * the inode write if there is none.  If the inode is
 	 * ALLCOMPLETE and no jaddref is present all dependencies have
 	 * been satisfied and mkdir2 can be freed.
 	 */
 	inodedep_lookup(mp, dinum, 0, &inodedep);
 	if (MOUNTEDSUJ(mp)) {
 		if (inodedep == NULL)
 			panic("setup_newdir: Lost parent.");
 		/* The newest reference on the parent must be our dotdot. */
 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 		    inoreflst);
 		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
 		    (jaddref->ja_state & MKDIR_PARENT),
 		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
 		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
 		mkdir2->md_jaddref = jaddref;
 		jaddref->ja_mkdir = mkdir2;
 	} else if (inodedep == NULL ||
 	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
 		/* Parent already stable: no MKDIR_PARENT wait is needed. */
 		dap->da_state &= ~MKDIR_PARENT;
 		WORKITEM_FREE(mkdir2, D_MKDIR);
 		mkdir2 = NULL;
 	} else {
 		/* Wait for the parent inode to be written. */
 		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
 		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
 	}
 	*mkdirp = mkdir2;
 
 	return (mkdir1);
 }
 
 /*
  * Directory entry addition dependencies.
  * 
  * When adding a new directory entry, the inode (with its incremented link
  * count) must be written to disk before the directory entry's pointer to it.
  * Also, if the inode is newly allocated, the corresponding freemap must be
  * updated (on disk) before the directory entry's pointer. These requirements
  * are met via undo/redo on the directory entry's pointer, which consists
  * simply of the inode number.
  * 
  * As directory entries are added and deleted, the free space within a
  * directory block can become fragmented.  The ufs filesystem will compact
  * a fragmented directory block to make space for a new entry. When this
  * occurs, the offsets of previously added entries change. Any "diradd"
  * dependency structures corresponding to these entries must be updated with
  * the new offsets.
  */
 
 /*
  * This routine is called after the in-memory inode's link
  * count has been incremented, but before the directory entry's
  * pointer to the inode has been set.
  */
 int
 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
 	struct buf *bp;		/* buffer containing directory block */
 	struct inode *dp;	/* inode for directory */
 	off_t diroffset;	/* offset of new entry in directory */
 	ino_t newinum;		/* inode referenced by new directory entry */
 	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
 	int isnewblk;		/* entry is in a newly allocated block */
 {
 	int offset;		/* offset of new entry within directory block */
 	ufs_lbn_t lbn;		/* block in directory containing new entry */
 	struct fs *fs;
 	struct diradd *dap;
 	struct newblk *newblk;
 	struct pagedep *pagedep;
 	struct inodedep *inodedep;
 	struct newdirblk *newdirblk;
 	struct mkdir *mkdir1, *mkdir2;
 	struct jaddref *jaddref;
 	struct ufsmount *ump;
 	struct mount *mp;
 	int isindir;
 
 	mp = ITOVFS(dp);
 	ump = VFSTOUFS(mp);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_directory_add called on non-softdep filesystem"));
 	/*
 	 * Whiteouts have no dependencies.
 	 */
 	if (newinum == UFS_WINO) {
 		if (newdirbp != NULL)
 			bdwrite(newdirbp);
 		return (0);
 	}
 	jaddref = NULL;
 	mkdir1 = mkdir2 = NULL;
 	fs = ump->um_fs;
 	lbn = lblkno(fs, diroffset);
 	offset = blkoff(fs, diroffset);
 	dap = malloc(sizeof(struct diradd), M_DIRADD,
 		M_SOFTDEP_FLAGS|M_ZERO);
 	workitem_alloc(&dap->da_list, D_DIRADD, mp);
 	dap->da_offset = offset;
 	dap->da_newinum = newinum;
 	dap->da_state = ATTACHED;
 	LIST_INIT(&dap->da_jwork);
 	isindir = bp->b_lblkno >= UFS_NDADDR;
 	newdirblk = NULL;
 	if (isnewblk &&
 	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
 		newdirblk = malloc(sizeof(struct newdirblk),
 		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
 		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
 		LIST_INIT(&newdirblk->db_mkdir);
 	}
 	/*
 	 * If we're creating a new directory setup the dependencies and set
 	 * the dap state to wait for them.  Otherwise it's COMPLETE and
 	 * we can move on.
 	 */
 	if (newdirbp == NULL) {
 		dap->da_state |= DEPCOMPLETE;
 		ACQUIRE_LOCK(ump);
 	} else {
 		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
 		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
 		    &mkdir2);
 	}
 	/*
 	 * Link into parent directory pagedep to await its being written.
 	 */
 	pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
 #ifdef INVARIANTS
 	if (diradd_lookup(pagedep, offset) != NULL)
 		panic("softdep_setup_directory_add: %p already at off %d\n",
 		    diradd_lookup(pagedep, offset), offset);
 #endif
 	dap->da_pagedep = pagedep;
 	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
 	    da_pdlist);
 	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
 	/*
 	 * If we're journaling, link the diradd into the jaddref so it
 	 * may be completed after the journal entry is written.  Otherwise,
 	 * link the diradd into its inodedep.  If the inode is not yet
 	 * written place it on the bufwait list, otherwise do the post-inode
 	 * write processing to put it on the id_pendinghd list.
 	 */
 	if (MOUNTEDSUJ(mp)) {
 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 		    inoreflst);
 		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
 		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
 		jaddref->ja_diroff = diroffset;
 		jaddref->ja_diradd = dap;
 		add_to_journal(&jaddref->ja_list);
 	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
 		diradd_inode_written(dap, inodedep);
 	else
 		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
 	/*
 	 * Add the journal entries for . and .. links now that the primary
 	 * link is written.
 	 */
 	if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
 		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
 		    inoreflst, if_deps);
 		KASSERT(jaddref != NULL &&
 		    jaddref->ja_ino == jaddref->ja_parent &&
 		    (jaddref->ja_state & MKDIR_BODY),
 		    ("softdep_setup_directory_add: bad dot jaddref %p",
 		    jaddref));
 		mkdir1->md_jaddref = jaddref;
 		jaddref->ja_mkdir = mkdir1;
 		/*
 		 * It is important that the dotdot journal entry
 		 * is added prior to the dot entry since dot writes
 		 * both the dot and dotdot links.  These both must
 		 * be added after the primary link for the journal
 		 * to remain consistent.
 		 */
 		add_to_journal(&mkdir2->md_jaddref->ja_list);
 		add_to_journal(&jaddref->ja_list);
 	}
 	/*
 	 * If we are adding a new directory remember this diradd so that if
 	 * we rename it we can keep the dot and dotdot dependencies.  If
 	 * we are adding a new name for an inode that has a mkdiradd we
 	 * must be in rename and we have to move the dot and dotdot
 	 * dependencies to this new name.  The old name is being orphaned
 	 * soon.
 	 */
 	if (mkdir1 != NULL) {
 		if (inodedep->id_mkdiradd != NULL)
 			panic("softdep_setup_directory_add: Existing mkdir");
 		inodedep->id_mkdiradd = dap;
 	} else if (inodedep->id_mkdiradd)
 		merge_diradd(inodedep, dap);
 	if (newdirblk != NULL) {
 		/*
 		 * There is nothing to do if we are already tracking
 		 * this block.
 		 */
 		if ((pagedep->pd_state & NEWBLOCK) != 0) {
 			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
 			FREE_LOCK(ump);
 			return (0);
 		}
 		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
 		    == 0)
 			panic("softdep_setup_directory_add: lost entry");
 		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
 		pagedep->pd_state |= NEWBLOCK;
 		pagedep->pd_newdirblk = newdirblk;
 		newdirblk->db_pagedep = pagedep;
 		FREE_LOCK(ump);
 		/*
 		 * If we extended into an indirect signal direnter to sync.
 		 */
 		if (isindir)
 			return (1);
 		return (0);
 	}
 	FREE_LOCK(ump);
 	return (0);
 }
 
 /*
  * This procedure is called to change the offset of a directory
  * entry when compacting a directory block which must be owned
  * exclusively by the caller. Note that the actual entry movement
  * must be done in this procedure to ensure that no I/O completions
  * occur while the move is in progress.
  *
  * Any diradd tracked at the entry's old offset is retargeted to the new
  * offset.  When MOUNTEDSUJ(mp) is true a jmvref describing the move is
  * also attached to the pagedep and added to the journal.  The entry itself
  * is copied with bcopy() while the softdep lock is still held.
  */
 void 
 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
 	struct buf *bp;		/* Buffer holding directory block. */
 	struct inode *dp;	/* inode for directory */
 	caddr_t base;		/* address of dp->i_offset */
 	caddr_t oldloc;		/* address of old directory location */
 	caddr_t newloc;		/* address of new directory location */
 	int entrysize;		/* size of directory entry */
 {
 	int offset, oldoffset, newoffset;
 	struct pagedep *pagedep;
 	struct jmvref *jmvref;
 	struct diradd *dap;
 	struct direct *de;
 	struct mount *mp;
 	struct ufsmount *ump;
 	ufs_lbn_t lbn;
 	int flags;
 
 	mp = ITOVFS(dp);
 	ump = VFSTOUFS(mp);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_change_directoryentry_offset called on "
 	     "non-softdep filesystem"));
 	de = (struct direct *)oldloc;
 	jmvref = NULL;
 	flags = 0;
 	/*
 	 * Moves are always journaled as it would be too complex to
 	 * determine if any affected adds or removes are present in the
 	 * journal.
 	 */
 	if (MOUNTEDSUJ(mp)) {
 		/* A pagedep is needed to hang the jmvref on, so force one. */
 		flags = DEPALLOC;
 		jmvref = newjmvref(dp, de->d_ino,
 		    dp->i_offset + (oldloc - base),
 		    dp->i_offset + (newloc - base));
 	}
 	/* Offsets below are relative to the start of the directory block. */
 	lbn = lblkno(ump->um_fs, dp->i_offset);
 	offset = blkoff(ump->um_fs, dp->i_offset);
 	oldoffset = offset + (oldloc - base);
 	newoffset = offset + (newloc - base);
 	ACQUIRE_LOCK(ump);
 	/* A zero return means there is no existing diradd to retarget. */
 	if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
 		goto done;
 	dap = diradd_lookup(pagedep, oldoffset);
 	if (dap) {
 		dap->da_offset = newoffset;
 		/* From here on old/newoffset hold hash bucket indices. */
 		newoffset = DIRADDHASH(newoffset);
 		oldoffset = DIRADDHASH(oldoffset);
 		/*
 		 * Only a diradd that is not ALLCOMPLETE is moved between
 		 * hash buckets; complete_diradd() puts ALLCOMPLETE entries
 		 * on pd_pendinghd instead, which is not hashed by offset.
 		 */
 		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
 		    newoffset != oldoffset) {
 			LIST_REMOVE(dap, da_pdlist);
 			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
 			    dap, da_pdlist);
 		}
 	}
 done:
 	/*
 	 * jmvref is non-NULL only when flags was set to DEPALLOC above.
 	 * NOTE(review): this relies on pagedep_lookup() leaving a valid
 	 * pagedep in that case even when it returned 0 -- confirm.
 	 */
 	if (jmvref) {
 		jmvref->jm_pagedep = pagedep;
 		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
 		add_to_journal(&jmvref->jm_list);
 	}
 	/* Move the entry while still holding the lock (see header comment). */
 	bcopy(oldloc, newloc, entrysize);
 	FREE_LOCK(ump);
 }
 
 /*
  * Move the mkdir dependencies and journal work from one diradd to another
  * when renaming a directory.  The new name must depend on the mkdir deps
  * completing as the old name did.  Directories can only have one valid link
  * at a time so one must be canonical.
  */
 static void
 merge_diradd(inodedep, newdap)
 	struct inodedep *inodedep;
 	struct diradd *newdap;
 {
 	struct diradd *olddap;
 	struct mkdir *mkdir, *nextmd;
 	struct ufsmount *ump;
 	short state;
 
 	olddap = inodedep->id_mkdiradd;
 	inodedep->id_mkdiradd = newdap;
 	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
 		newdap->da_state &= ~DEPCOMPLETE;
 		ump = VFSTOUFS(inodedep->id_list.wk_mp);
 		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
 		     mkdir = nextmd) {
 			nextmd = LIST_NEXT(mkdir, md_mkdirs);
 			if (mkdir->md_diradd != olddap)
 				continue;
 			mkdir->md_diradd = newdap;
 			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
 			newdap->da_state |= state;
 			olddap->da_state &= ~state;
 			if ((olddap->da_state &
 			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
 				break;
 		}
 		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
 			panic("merge_diradd: unfound ref");
 	}
 	/*
 	 * Any mkdir related journal items are not safe to be freed until
 	 * the new name is stable.
 	 */
 	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
 	olddap->da_state |= DEPCOMPLETE;
 	complete_diradd(olddap);
 }
 
 /*
  * Once every dependency bit in ALLCOMPLETE is set on a diradd, take it off
  * the list it is currently linked on and put it on the pd_pendinghd list of
  * its pagedep.  A diradd that is not yet ALLCOMPLETE is left untouched.
  */
 static void
 complete_diradd(dap)
 	struct diradd *dap;
 {
 	struct pagedep *owner;
 
 	if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE)
 		return;
 	/* A DIRCHG diradd reaches its pagedep through the paired dirrem. */
 	owner = (dap->da_state & DIRCHG) ?
 	    dap->da_previous->dm_pagedep : dap->da_pagedep;
 	LIST_REMOVE(dap, da_pdlist);
 	LIST_INSERT_HEAD(&owner->pd_pendinghd, dap, da_pdlist);
 }
 
 /*
  * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
  * add entries and conditionally journal the remove.
  *
  * jremref, dotremref and dotdotremref are the remove references prepared
  * for the primary name, the "." entry and the ".." entry respectively.
  * Each one is freed when cancel_jaddref() returns 0 for the matching add
  * reference; whichever remain afterwards are journaled through
  * journal_jremref().  A NULL jremref means no remove references were
  * allocated, in which case the diradd is simply freed.
  */
 static void
 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
 	struct diradd *dap;
 	struct dirrem *dirrem;
 	struct jremref *jremref;
 	struct jremref *dotremref;
 	struct jremref *dotdotremref;
 {
 	struct inodedep *inodedep;
 	struct jaddref *jaddref;
 	struct inoref *inoref;
 	struct ufsmount *ump;
 	struct mkdir *mkdir;
 
 	/*
 	 * If no remove references were allocated we're on a non-journaled
 	 * filesystem and can skip the cancel step.
 	 */
 	if (jremref == NULL) {
 		free_diradd(dap, NULL);
 		return;
 	}
 	/*
 	 * Cancel the primary name and free it if it does not require
 	 * journaling.
 	 */
 	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
 	    0, &inodedep) != 0) {
 		/* Abort the addref that references this diradd.  */
 		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
 			if (inoref->if_list.wk_type != D_JADDREF)
 				continue;
 			jaddref = (struct jaddref *)inoref;
 			if (jaddref->ja_diradd != dap)
 				continue;
 			/* A zero return means the remove need not be journaled. */
 			if (cancel_jaddref(jaddref, inodedep,
 			    &dirrem->dm_jwork) == 0) {
 				free_jremref(jremref);
 				jremref = NULL;
 			}
 			/* Only the first matching jaddref is handled. */
 			break;
 		}
 	}
 	/*
 	 * Cancel subordinate names and free them if they do not require
 	 * journaling.
 	 */
 	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
 		ump = VFSTOUFS(dap->da_list.wk_mp);
 		LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) {
 			if (mkdir->md_diradd != dap)
 				continue;
 			if ((jaddref = mkdir->md_jaddref) == NULL)
 				continue;
 			/* Detach the jaddref from the mkdir before cancelling it. */
 			mkdir->md_jaddref = NULL;
 			if (mkdir->md_state & MKDIR_PARENT) {
 				/* ".." reference: pairs with dotdotremref. */
 				if (cancel_jaddref(jaddref, NULL,
 				    &dirrem->dm_jwork) == 0) {
 					free_jremref(dotdotremref);
 					dotdotremref = NULL;
 				}
 			} else {
 				/* "." reference: pairs with dotremref. */
 				if (cancel_jaddref(jaddref, inodedep,
 				    &dirrem->dm_jwork) == 0) {
 					free_jremref(dotremref);
 					dotremref = NULL;
 				}
 			}
 		}
 	}
 
 	/* Journal whichever remove references were not freed above. */
 	if (jremref)
 		journal_jremref(dirrem, jremref, inodedep);
 	if (dotremref)
 		journal_jremref(dirrem, dotremref, inodedep);
 	if (dotdotremref)
 		journal_jremref(dirrem, dotdotremref, NULL);
 	/* The diradd's pending journal work now belongs to the dirrem. */
 	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
 	free_diradd(dap, &dirrem->dm_jwork);
 }
 
 /*
  * Free a diradd dependency structure.
  *
  * The diradd is unlinked from its pagedep list and from any worklist, the
  * mkdir structures that still reference it are freed, and its remaining
  * journal work is passed to handle_jwork().  For a DIRCHG diradd the paired
  * dirrem (da_previous) is marked COMPLETE and, if it has no jremrefs
  * outstanding, queued on the worklist.
  * NOTE(review): the wkhd argument is not referenced in this function.
  */
 static void
 free_diradd(dap, wkhd)
 	struct diradd *dap;
 	struct workhead *wkhd;
 {
 	struct dirrem *dirrem;
 	struct pagedep *pagedep;
 	struct inodedep *inodedep;
 	struct mkdir *mkdir, *nextmd;
 	struct ufsmount *ump;
 
 	ump = VFSTOUFS(dap->da_list.wk_mp);
 	LOCK_OWNED(ump);
 	LIST_REMOVE(dap, da_pdlist);
 	if (dap->da_state & ONWORKLIST)
 		WORKLIST_REMOVE(&dap->da_list);
 	if ((dap->da_state & DIRCHG) == 0) {
 		pagedep = dap->da_pagedep;
 	} else {
 		/* A DIRCHG diradd finds its pagedep through the dirrem. */
 		dirrem = dap->da_previous;
 		pagedep = dirrem->dm_pagedep;
 		dirrem->dm_dirinum = pagedep->pd_ino;
 		dirrem->dm_state |= COMPLETE;
 		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
 			add_to_worklist(&dirrem->dm_list, 0);
 	}
 	/* Drop the inodedep's back pointer if it refers to this diradd. */
 	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
 	    0, &inodedep) != 0)
 		if (inodedep->id_mkdiradd == dap)
 			inodedep->id_mkdiradd = NULL;
 	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
 		/*
 		 * nextmd is fetched before the body runs because the
 		 * current mkdir may be removed from the list and freed.
 		 */
 		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
 		     mkdir = nextmd) {
 			nextmd = LIST_NEXT(mkdir, md_mkdirs);
 			if (mkdir->md_diradd != dap)
 				continue;
 			dap->da_state &=
 			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
 			LIST_REMOVE(mkdir, md_mkdirs);
 			if (mkdir->md_state & ONWORKLIST)
 				WORKLIST_REMOVE(&mkdir->md_list);
 			if (mkdir->md_jaddref != NULL)
 				panic("free_diradd: Unexpected jaddref");
 			WORKITEM_FREE(mkdir, D_MKDIR);
 			/* Stop once every mkdir bit has been accounted for. */
 			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
 				break;
 		}
 		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
 			panic("free_diradd: unfound ref");
 	}
 	/*
 	 * NOTE(review): this test assumes inodedep_lookup() stores NULL in
 	 * inodedep when the lookup fails -- confirm.
 	 */
 	if (inodedep)
 		free_inodedep(inodedep);
 	/*
 	 * Free any journal segments waiting for the directory write.
 	 */
 	handle_jwork(&dap->da_jwork);
 	WORKITEM_FREE(dap, D_DIRADD);
 }
 
 /*
  * Directory entry removal dependencies.
  * 
  * When removing a directory entry, the entry's inode pointer must be
  * zero'ed on disk before the corresponding inode's link count is decremented
  * (possibly freeing the inode for re-use). This dependency is handled by
  * updating the directory entry but delaying the inode count reduction until
  * after the directory block has been written to disk. After this point, the
  * inode count can be decremented whenever it is convenient.
  */
 
 /*
  * This routine should be called immediately after removing
  * a directory entry.  The inode's link count should not be
  * decremented by the calling procedure -- the soft updates
  * code will do this task when it is safe.
  *
  * The softdep lock is taken inside newdirrem() and released here on
  * every path before returning.
  */
 void 
 softdep_setup_remove(bp, dp, ip, isrmdir)
 	struct buf *bp;		/* buffer containing directory block */
 	struct inode *dp;	/* inode for the directory being modified */
 	struct inode *ip;	/* inode for directory entry being removed */
 	int isrmdir;		/* indicates if doing RMDIR */
 {
 	struct dirrem *dirrem, *prevdirrem;
 	struct inodedep *inodedep;
 	struct ufsmount *ump;
 	int direct;
 
 	ump = ITOUMP(ip);
 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 	    ("softdep_setup_remove called on non-softdep filesystem"));
 	/*
 	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
 	 * newdirrem() to setup the full directory remove which requires
 	 * isrmdir > 1.
 	 */
 	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
 	/*
 	 * Add the dirrem to the inodedep's pending remove list for quick
 	 * discovery later.
 	 */
 	if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0)
 		panic("softdep_setup_remove: Lost inodedep.");
 	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
 	dirrem->dm_state |= ONDEPLIST;
 	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
 
 	/*
 	 * If the COMPLETE flag is clear, then there were no active
 	 * entries and we want to roll back to a zeroed entry until
 	 * the new inode is committed to disk. If the COMPLETE flag is
 	 * set then we have deleted an entry that never made it to
 	 * disk. If the entry we deleted resulted from a name change,
 	 * then the old name still resides on disk. We cannot delete
 	 * its inode (returned to us in prevdirrem) until the zeroed
 	 * directory entry gets to disk. The new inode has never been
 	 * referenced on the disk, so can be deleted immediately.
 	 */
 	if ((dirrem->dm_state & COMPLETE) == 0) {
 		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
 		    dm_next);
 		FREE_LOCK(ump);
 	} else {
 		if (prevdirrem != NULL)
 			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
 			    prevdirrem, dm_next);
 		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
 		/*
 		 * Sample the jremref list while still locked; the remove
 		 * is processed right away only when nothing is waiting
 		 * on the journal.
 		 */
 		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
 		FREE_LOCK(ump);
 		if (direct)
 			handle_workitem_remove(dirrem, 0);
 	}
 }
 
 /*
  * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
  * pd_pendinghd list of a pagedep.
  */
 static struct diradd *
 diradd_lookup(pagedep, offset)
 	struct pagedep *pagedep;
 	int offset;
 {
 	struct diradd *dap;
 
 	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
 		if (dap->da_offset == offset)
 			return (dap);
 	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
 		if (dap->da_offset == offset)
 			return (dap);
 	return (NULL);
 }
 
 /*
  * Search for a .. diradd dependency in a directory that is being removed.
  * If the directory was renamed to a new parent we have a diradd rather
  * than a mkdir for the .. entry.  We need to cancel it now before
  * it is found in truncate().
  */
 static struct jremref *
 cancel_diradd_dotdot(ip, dirrem, jremref)
 	struct inode *ip;
 	struct dirrem *dirrem;
 	struct jremref *jremref;
 {
 	struct pagedep *pagedep;
 	struct diradd *dap;
 	struct worklist *wk;
 
 	if (pagedep_lookup(ITOVFS(ip), NULL, ip->i_number, 0, 0, &pagedep) == 0)
 		return (jremref);
 	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
 	if (dap == NULL)
 		return (jremref);
 	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
 	/*
 	 * Mark any journal work as belonging to the parent so it is freed
 	 * with the .. reference.
 	 */
 	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
 		wk->wk_state |= MKDIR_PARENT;
 	return (NULL);
 }
 
 /*
  * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
  * replace it with a dirrem/diradd pair as a result of re-parenting a
  * directory.  This ensures that we don't simultaneously have a mkdir and
  * a diradd for the same .. entry.
  *
  * Returns 'jremref' when the caller still owns it, or NULL once it has been
  * journaled here through journal_jremref().
  */
 static struct jremref *
 cancel_mkdir_dotdot(ip, dirrem, jremref)
 	struct inode *ip;
 	struct dirrem *dirrem;
 	struct jremref *jremref;
 {
 	struct inodedep *inodedep;
 	struct jaddref *jaddref;
 	struct ufsmount *ump;
 	struct mkdir *mkdir;
 	struct diradd *dap;
 	struct mount *mp;
 
 	mp = ITOVFS(ip);
 	/* Nothing to cancel without an inodedep or a pending MKDIR_PARENT. */
 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
 		return (jremref);
 	dap = inodedep->id_mkdiradd;
 	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
 		return (jremref);
 	ump = VFSTOUFS(inodedep->id_list.wk_mp);
 	/* Locate the MKDIR_PARENT mkdir that belongs to this diradd. */
 	for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
 	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
 		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
 			break;
 	if (mkdir == NULL)
 		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
 	if ((jaddref = mkdir->md_jaddref) != NULL) {
 		mkdir->md_jaddref = NULL;
 		jaddref->ja_state &= ~MKDIR_PARENT;
 		/* inodedep is reused here for the inode named by the jaddref. */
 		if (inodedep_lookup(mp, jaddref->ja_ino, 0, &inodedep) == 0)
 			panic("cancel_mkdir_dotdot: Lost parent inodedep");
 		/* A non-zero return means the remove must be journaled. */
 		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
 			journal_jremref(dirrem, jremref, inodedep);
 			jremref = NULL;
 		}
 	}
 	/* Force the mkdir to ALLCOMPLETE and let complete_mkdir() retire it. */
 	if (mkdir->md_state & ONWORKLIST)
 		WORKLIST_REMOVE(&mkdir->md_list);
 	mkdir->md_state |= ALLCOMPLETE;
 	complete_mkdir(mkdir);
 	return (jremref);
 }
 
 /*
  * Attach a jremref to its dirrem and to the inodedep's inode reference
  * list, then add it to the journal.  When 'inodedep' is NULL the inodedep
  * is looked up from the mount point and inode number stored in the jremref;
  * a failed lookup panics.
  */
 static void
 journal_jremref(dirrem, jremref, inodedep)
 	struct dirrem *dirrem;
 	struct jremref *jremref;
 	struct inodedep *inodedep;
 {
 
 	if (inodedep == NULL &&
 	    inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
 	    0, &inodedep) == 0) {
 		panic("journal_jremref: Lost inodedep");
 	}
 	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
 	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
 	add_to_journal(&jremref->jr_list);
 }
 
 /*
  * Journal the remove references that belong to a dirrem.  'jremref' is
  * required; 'dotremref' and 'dotdotremref' are journaled only when
  * non-NULL.  The primary and dot references share the inodedep found from
  * 'jremref', while the dotdot reference has journal_jremref() look up its
  * own.
  */
 static void
 dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
 	struct dirrem *dirrem;
 	struct jremref *jremref;
 	struct jremref *dotremref;
 	struct jremref *dotdotremref;
 {
 	struct inodedep *target;
 
 	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
 	    0, &target) == 0) {
 		panic("dirrem_journal: Lost inodedep");
 	}
 	journal_jremref(dirrem, jremref, target);
 	if (dotremref != NULL)
 		journal_jremref(dirrem, dotremref, target);
 	if (dotdotremref != NULL)
 		journal_jremref(dirrem, dotdotremref, NULL);
 }
 
 /*
  * Allocate a new dirrem if appropriate and return it along with
  * its associated pagedep. Called without a lock, returns with lock.
  *
  * The pagedep is returned through dirrem->dm_pagedep.  *prevdirremp is set
  * to NULL, or to the dirrem of the previously referenced inode when the
  * entry being removed was a name change (DIRCHG diradd) that never reached
  * the disk.  In the latter "never reached disk" case the returned dirrem
  * has COMPLETE set.
  */
 static struct dirrem *
 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
 	struct buf *bp;		/* buffer containing directory block */
 	struct inode *dp;	/* inode for the directory being modified */
 	struct inode *ip;	/* inode for directory entry being removed */
 	int isrmdir;		/* indicates if doing RMDIR */
 	struct dirrem **prevdirremp; /* previously referenced inode, if any */
 {
 	int offset;
 	ufs_lbn_t lbn;
 	struct diradd *dap;
 	struct dirrem *dirrem;
 	struct pagedep *pagedep;
 	struct jremref *jremref;
 	struct jremref *dotremref;
 	struct jremref *dotdotremref;
 	struct vnode *dvp;
 	struct ufsmount *ump;
 
 	/*
 	 * Whiteouts have no deletion dependencies.
 	 */
 	if (ip == NULL)
 		panic("newdirrem: whiteout");
 	dvp = ITOV(dp);
 	ump = ITOUMP(dp);
 
 	/*
 	 * If the system is over its limit and our filesystem is
 	 * responsible for more than our share of that usage and
 	 * we are not a snapshot, request some inodedep cleanup.
 	 * Limiting the number of dirrem structures will also limit
 	 * the number of freefile and freeblks structures.
 	 */
 	ACQUIRE_LOCK(ump);
 	/*
 	 * NOTE(review): schedule_cleanup() presumably releases the lock
 	 * itself, since only the else branch calls FREE_LOCK -- confirm.
 	 */
 	if (!IS_SNAPSHOT(ip) && softdep_excess_items(ump, D_DIRREM))
 		schedule_cleanup(UFSTOVFS(ump));
 	else
 		FREE_LOCK(ump);
 	/* The allocations below are done before the lock is retaken. */
 	dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS |
 	    M_ZERO);
 	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
 	LIST_INIT(&dirrem->dm_jremrefhd);
 	LIST_INIT(&dirrem->dm_jwork);
 	dirrem->dm_state = isrmdir ? RMDIR : 0;
 	dirrem->dm_oldinum = ip->i_number;
 	*prevdirremp = NULL;
 	/*
 	 * Allocate remove reference structures to track journal write
 	 * dependencies.  We will always have one for the link and
 	 * when doing directories we will always have one more for dot.
 	 * When renaming a directory we skip the dotdot link change so
 	 * this is not needed.
 	 */
 	jremref = dotremref = dotdotremref = NULL;
 	if (DOINGSUJ(dvp)) {
 		if (isrmdir) {
 			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
 			    ip->i_effnlink + 2);
 			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
 			    ip->i_effnlink + 1);
 			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
 			    dp->i_effnlink + 1);
 			dotdotremref->jr_state |= MKDIR_PARENT;
 		} else
 			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
 			    ip->i_effnlink + 1);
 	}
 	/* From here on the lock is held until the caller releases it. */
 	ACQUIRE_LOCK(ump);
 	lbn = lblkno(ump->um_fs, dp->i_offset);
 	offset = blkoff(ump->um_fs, dp->i_offset);
 	pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC,
 	    &pagedep);
 	dirrem->dm_pagedep = pagedep;
 	dirrem->dm_offset = offset;
 	/*
 	 * If we're renaming a .. link to a new directory, cancel any
 	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
 	 * the jremref is preserved for any potential diradd in this
 	 * location.  This can not coincide with a rmdir.
 	 */
 	if (dp->i_offset == DOTDOT_OFFSET) {
 		if (isrmdir)
 			panic("newdirrem: .. directory change during remove?");
 		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
 	}
 	/*
 	 * If we're removing a directory search for the .. dependency now and
 	 * cancel it.  Any pending journal work will be added to the dirrem
 	 * to be completed when the workitem remove completes.
 	 */
 	if (isrmdir)
 		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
 	/*
 	 * Check for a diradd dependency for the same directory entry.
 	 * If present, then both dependencies become obsolete and can
 	 * be de-allocated.
 	 */
 	dap = diradd_lookup(pagedep, offset);
 	if (dap == NULL) {
 		/*
 		 * Link the jremref structures into the dirrem so they are
 		 * written prior to the pagedep.
 		 */
 		if (jremref)
 			dirrem_journal(dirrem, jremref, dotremref,
 			    dotdotremref);
 		return (dirrem);
 	}
 	/*
 	 * Must be ATTACHED at this point.
 	 */
 	if ((dap->da_state & ATTACHED) == 0)
 		panic("newdirrem: not ATTACHED");
 	if (dap->da_newinum != ip->i_number)
 		panic("newdirrem: inum %ju should be %ju",
 		    (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
 	/*
 	 * If we are deleting a changed name that never made it to disk,
 	 * then return the dirrem describing the previous inode (which
 	 * represents the inode currently referenced from this entry on disk).
 	 */
 	if ((dap->da_state & DIRCHG) != 0) {
 		*prevdirremp = dap->da_previous;
 		/* With DIRCHG cleared the diradd points at the pagedep itself. */
 		dap->da_state &= ~DIRCHG;
 		dap->da_pagedep = pagedep;
 	}
 	/*
 	 * We are deleting an entry that never made it to disk.
 	 * Mark it COMPLETE so we can delete its inode immediately.
 	 */
 	dirrem->dm_state |= COMPLETE;
 	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
 #ifdef INVARIANTS
 	/*
 	 * For a non-rmdir remove, no journal work moved onto the dirrem
 	 * may carry a MKDIR_BODY or MKDIR_PARENT flag.
 	 */
 	if (isrmdir == 0) {
 		struct worklist *wk;
 
 		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
 			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
 				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
 	}
 #endif
 
 	return (dirrem);
 }
 
 /*
  * Directory entry change dependencies.
  * 
  * Changing an existing directory entry requires that an add operation
  * be completed first followed by a deletion. The semantics for the addition
  * are identical to the description of adding a new entry above except
  * that the rollback is to the old inode number rather than zero. Once
  * the addition dependency is completed, the removal is done as described
  * in the removal routine above.
  */
 
 /*
  * This routine should be called immediately after changing
  * a directory entry.  The inode's link count should not be
  * decremented by the calling procedure -- the soft updates
  * code will perform this task when it is safe.
  *
  * A dirrem is created for the inode being replaced (ip) and, unless the
  * new inode number is the whiteout inode (UFS_WINO), a diradd is created
  * for the new inode number (newinum).  The softdep lock is taken inside
  * newdirrem() and released before returning.
  */
 void 
 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
 	struct buf *bp;		/* buffer containing directory block */
 	struct inode *dp;	/* inode for the directory being modified */
 	struct inode *ip;	/* inode for directory entry being removed */
 	ino_t newinum;		/* new inode number for changed entry */
 	int isrmdir;		/* indicates if doing RMDIR */
 {
 	int offset;
 	struct diradd *dap = NULL;
 	struct dirrem *dirrem, *prevdirrem;
 	struct pagedep *pagedep;
 	struct inodedep *inodedep;
 	struct jaddref *jaddref;
 	struct mount *mp;
 	struct ufsmount *ump;
 
 	mp = ITOVFS(dp);
 	ump = VFSTOUFS(mp);
 	offset = blkoff(ump->um_fs, dp->i_offset);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	   ("softdep_setup_directory_change called on non-softdep filesystem"));
 
 	/*
 	 * Whiteouts do not need diradd dependencies.
 	 */
 	if (newinum != UFS_WINO) {
 		/* Allocated here, before newdirrem() takes the lock. */
 		dap = malloc(sizeof(struct diradd),
 		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
 		workitem_alloc(&dap->da_list, D_DIRADD, mp);
 		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
 		dap->da_offset = offset;
 		dap->da_newinum = newinum;
 		LIST_INIT(&dap->da_jwork);
 	}
 
 	/*
 	 * Allocate a new dirrem and ACQUIRE_LOCK.
 	 */
 	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
 	pagedep = dirrem->dm_pagedep;
 	/*
 	 * The possible values for isrmdir:
 	 *	0 - non-directory file rename
 	 *	1 - directory rename within same directory
 	 *   inum - directory rename to new directory of given inode number
 	 * When renaming to a new directory, we are both deleting and
 	 * creating a new directory entry, so the link count on the new
 	 * directory should not change. Thus we do not need the followup
 	 * dirrem which is usually done in handle_workitem_remove. We set
 	 * the DIRCHG flag to tell handle_workitem_remove to skip the 
 	 * followup dirrem.
 	 */
 	if (isrmdir > 1)
 		dirrem->dm_state |= DIRCHG;
 
 	/*
 	 * Whiteouts have no additional dependencies,
 	 * so just put the dirrem on the correct list.
 	 */
 	if (newinum == UFS_WINO) {
 		/* dap is NULL on this path and is not touched. */
 		if ((dirrem->dm_state & COMPLETE) == 0) {
 			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
 			    dm_next);
 		} else {
 			dirrem->dm_dirinum = pagedep->pd_ino;
 			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
 				add_to_worklist(&dirrem->dm_list, 0);
 		}
 		FREE_LOCK(ump);
 		return;
 	}
 	/*
 	 * Add the dirrem to the inodedep's pending remove list for quick
 	 * discovery later.  A valid nlinkdelta ensures that this lookup
 	 * will not fail.
 	 */
 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
 		panic("softdep_setup_directory_change: Lost inodedep.");
 	dirrem->dm_state |= ONDEPLIST;
 	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
 
 	/*
 	 * If the COMPLETE flag is clear, then there were no active
 	 * entries and we want to roll back to the previous inode until
 	 * the new inode is committed to disk. If the COMPLETE flag is
 	 * set, then we have deleted an entry that never made it to disk.
 	 * If the entry we deleted resulted from a name change, then the old
 	 * inode reference still resides on disk. Any rollback that we do
 	 * needs to be to that old inode (returned to us in prevdirrem). If
 	 * the entry we deleted resulted from a create, then there is
 	 * no entry on the disk, so we want to roll back to zero rather
 	 * than the uncommitted inode. In either of the COMPLETE cases we
 	 * want to immediately free the unwritten and unreferenced inode.
 	 */
 	if ((dirrem->dm_state & COMPLETE) == 0) {
 		dap->da_previous = dirrem;
 	} else {
 		if (prevdirrem != NULL) {
 			dap->da_previous = prevdirrem;
 		} else {
 			/* No previous inode: this becomes a plain add. */
 			dap->da_state &= ~DIRCHG;
 			dap->da_pagedep = pagedep;
 		}
 		dirrem->dm_dirinum = pagedep->pd_ino;
 		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
 			add_to_worklist(&dirrem->dm_list, 0);
 	}
 	/*
 	 * Lookup the jaddref for this journal entry.  We must finish
 	 * initializing it and make the diradd write dependent on it.
 	 * If we're not journaling, put it on the id_bufwait list if the
 	 * inode is not yet written. If it is written, do the post-inode
 	 * write processing to put it on the id_pendinghd list.
 	 */
 	/* From here on inodedep refers to the new inode (newinum). */
 	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
 	if (MOUNTEDSUJ(mp)) {
 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 		    inoreflst);
 		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
 		    ("softdep_setup_directory_change: bad jaddref %p",
 		    jaddref));
 		jaddref->ja_diroff = dp->i_offset;
 		jaddref->ja_diradd = dap;
 		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
 		    dap, da_pdlist);
 		add_to_journal(&jaddref->ja_list);
 	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
 		dap->da_state |= COMPLETE;
 		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
 		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
 	} else {
 		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
 		    dap, da_pdlist);
 		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
 	}
 	/*
 	 * If we're making a new name for a directory that has not been
 	 * committed then we need to move the dot and dotdot references to
 	 * this new name.
 	 */
 	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
 		merge_diradd(inodedep, dap);
 	FREE_LOCK(ump);
 }
 
 /*
  * Called whenever an inode's link count changes.  Ensures the inode has
  * an inodedep (allocating one if necessary) and records in it the
  * difference between i_nlink and i_effnlink, so that new references to
  * the inode are not committed to disk before the updated inode is
  * written.  Panics if i_nlink is smaller than i_effnlink.
  */
 void
 softdep_change_linkcnt(ip)
 	struct inode *ip;	/* the inode with the increased link count */
 {
 	struct inodedep *dep;
 	struct ufsmount *ump;
 	struct mount *mp;
 
 	ump = ITOUMP(ip);
 	mp = UFSTOVFS(ump);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_change_linkcnt called on non-softdep filesystem"));
 	ACQUIRE_LOCK(ump);
 	inodedep_lookup(mp, ip->i_number, DEPALLOC, &dep);
 	if (ip->i_effnlink > ip->i_nlink)
 		panic("softdep_change_linkcnt: bad delta");
 	dep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 	FREE_LOCK(ump);
 }
 
 /*
  * Attach a sbdep dependency to the superblock buf so that we can keep
  * track of the head of the linked list of referenced but unlinked inodes.
  */
 void
 softdep_setup_sbupdate(ump, fs, bp)
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct buf *bp;
 {
 	struct sbdep *sbdep;
 	struct worklist *wk;
 
 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 	    ("softdep_setup_sbupdate called on non-softdep filesystem"));
 	LIST_FOREACH(wk, &bp->b_dep, wk_list)
 		if (wk->wk_type == D_SBDEP)
 			break;
 	if (wk != NULL)
 		return;
 	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
 	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
 	sbdep->sb_fs = fs;
 	sbdep->sb_ump = ump;
 	ACQUIRE_LOCK(ump);
 	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
 	FREE_LOCK(ump);
 }
 
 /*
  * Find the first unlinked inodedep that is fit to head the unlinked list:
  * it and every inodedep after it must have a valid next pointer
  * (UNLINKNEXT).  The softdep_unlinked queue is walked from its tail towards
  * its head.  Returns NULL if the queue is empty or if an inodedep without
  * UNLINKNEXT is reached.
  */
 static struct inodedep *
 first_unlinked_inodedep(ump)
 	struct ufsmount *ump;
 {
 	struct inodedep *cur;
 	struct inodedep *prev;
 
 	LOCK_OWNED(ump);
 	cur = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
 	while (cur != NULL) {
 		if ((cur->id_state & UNLINKNEXT) == 0)
 			return (NULL);
 		prev = TAILQ_PREV(cur, inodedeplst, id_unlinked);
 		/*
 		 * Stop here when there is no usable predecessor or when
 		 * the current entry lacks UNLINKPREV.
 		 */
 		if (prev == NULL || (prev->id_state & UNLINKNEXT) == 0 ||
 		    (cur->id_state & UNLINKPREV) == 0)
 			break;
 		cur = prev;
 	}
 	return (cur);
 }
 
 /*
  * Set the sujfree unlinked head pointer prior to writing a superblock.
  */
 static void
 initiate_write_sbdep(sbdep)
 	struct sbdep *sbdep;
 {
 	struct inodedep *inodedep;
 	struct fs *bpfs;
 	struct fs *fs;
 
 	bpfs = sbdep->sb_fs;
 	fs = sbdep->sb_ump->um_fs;
 	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
 	if (inodedep) {
 		fs->fs_sujfree = inodedep->id_ino;
 		inodedep->id_state |= UNLINKPREV;
 	} else
 		fs->fs_sujfree = 0;
 	bpfs->fs_sujfree = fs->fs_sujfree;
 	/*
 	 * Because we have made changes to the superblock, we need to
 	 * recompute its check-hash.
 	 */
 	bpfs->fs_ckhash = ffs_calc_sbhash(bpfs);
 }
 
 /*
  * After a superblock is written determine whether it must be written again
  * due to a changing unlinked list head.
  *
  * Returns 1 (after re-dirtying bp) when the written fs_sujfree no longer
  * matches the in-memory unlinked list head; the sbdep is kept in that case.
  * Otherwise the sbdep is freed and 0 is returned.
  */
 static int
 handle_written_sbdep(sbdep, bp)
 	struct sbdep *sbdep;
 	struct buf *bp;
 {
 	struct inodedep *inodedep;
 	struct fs *fs;
 
 	LOCK_OWNED(sbdep->sb_ump);
 	/* fs is read out now; sbdep is freed before fs is last used below. */
 	fs = sbdep->sb_fs;
 	/*
 	 * If the superblock doesn't match the in-memory list start over.
 	 */
 	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
 	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
 	    (inodedep == NULL && fs->fs_sujfree != 0)) {
 		bdirty(bp);
 		return (1);
 	}
 	WORKITEM_FREE(sbdep, D_SBDEP);
 	if (fs->fs_sujfree == 0)
 		return (0);
 	/*
 	 * Now that we have a record of this inode in stable store allow it
 	 * to be written to free up pending work.  Inodes may see a lot of
 	 * write activity after they are unlinked which we must not hold up.
 	 */
 	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
 		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
 			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
 			    inodedep, inodedep->id_state);
 		/* Stop at the first entry already marked UNLINKONLIST. */
 		if (inodedep->id_state & UNLINKONLIST)
 			break;
 		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
 	}
 
 	return (0);
 }
 
 /*
  * Flag an inodedep as UNLINKED and put it at the front of the mount's
  * in-memory unlinked list.
  *
  * Nothing is done when MOUNTEDSUJ(mp) is 0.  Otherwise the superblock is
  * marked modified (fs_fmod) and the inodedep is inserted; it is a panic
  * for the inodedep to be UNLINKED already.  The caller must hold the
  * per-mount softdep lock.
  */
 static void
 unlinked_inodedep(mp, inodedep)
 	struct mount *mp;
 	struct inodedep *inodedep;
 {
 	struct ufsmount *um = VFSTOUFS(mp);
 
 	LOCK_OWNED(um);
 	if (MOUNTEDSUJ(mp) == 0)
 		return;
 	um->um_fs->fs_fmod = 1;
 	if ((inodedep->id_state & UNLINKED) != 0)
 		panic("unlinked_inodedep: %p already unlinked\n", inodedep);
 	inodedep->id_state |= UNLINKED;
 	TAILQ_INSERT_HEAD(&um->softdep_unlinked, inodedep, id_unlinked);
 }
 
 /*
  * Remove an inodedep from the unlinked inodedep list.  This may require
  * disk writes if the inode has made it that far.
  *
  * The per-mount softdep lock is held on entry (asserted by LOCK_OWNED on
  * every loop pass) and is held again on every return.  It is released
  * while buffers are obtained and written, so the entry's position in the
  * list is re-checked after each such window and the loop restarts when
  * it has changed.
  */
 static void
 clear_unlinked_inodedep(inodedep)
 	struct inodedep *inodedep;
 {
 	struct ufs2_dinode *dip;
 	struct ufsmount *ump;
 	struct inodedep *idp;	/* predecessor on the unlinked list */
 	struct inodedep *idn;	/* successor on the unlinked list */
 	struct fs *fs, *bpfs;
 	struct buf *bp;
 	daddr_t dbn;
 	ino_t ino;		/* inode number of the entry being removed */
 	ino_t nino;		/* re-sampled predecessor, then successor */
 	ino_t pino;		/* predecessor; 0 selects the superblock */
 	int error;
 
 	ump = VFSTOUFS(inodedep->id_list.wk_mp);
 	fs = ump->um_fs;
 	ino = inodedep->id_ino;
 	/* Only ffs_breadz() below assigns error; getblk() leaves it at 0. */
 	error = 0;
 	for (;;) {
 		LOCK_OWNED(ump);
 		KASSERT((inodedep->id_state & UNLINKED) != 0,
 		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
 		    inodedep));
 		/*
 		 * If nothing has yet been written simply remove us from
 		 * the in memory list and return.  This is the most common
 		 * case where handle_workitem_remove() loses the final
 		 * reference.
 		 */
 		if ((inodedep->id_state & UNLINKLINKS) == 0)
 			break;
 		/*
 		 * If we have a NEXT pointer and no PREV pointer we can simply
 		 * clear NEXT's PREV and remove ourselves from the list.  Be
 		 * careful not to clear PREV if the superblock points at
 		 * next as well.
 		 */
 		idn = TAILQ_NEXT(inodedep, id_unlinked);
 		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
 			if (idn && fs->fs_sujfree != idn->id_ino)
 				idn->id_state &= ~UNLINKPREV;
 			break;
 		}
 		/*
 		 * Here we have an inodedep which is actually linked into
 		 * the list.  We must remove it by forcing a write to the
 		 * link before us, whether it be the superblock or an inode.
 		 * Unfortunately the list may change while we're waiting
 		 * on the buf lock for either resource so we must loop until
 		 * we lock the right one.  If both the superblock and an
 		 * inode point to this inode we must clear the inode first
 		 * followed by the superblock.
 		 */
 		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
 		pino = 0;
 		if (idp && (idp->id_state & UNLINKNEXT))
 			pino = idp->id_ino;
 		/* Buffers are obtained without the softdep lock held. */
 		FREE_LOCK(ump);
 		if (pino == 0) {
 			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
 			    (int)fs->fs_sbsize, 0, 0, 0);
 		} else {
 			dbn = fsbtodb(fs, ino_to_fsba(fs, pino));
 			error = ffs_breadz(ump, ump->um_devvp, dbn, dbn,
 			    (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL,
 			    &bp);
 		}
 		ACQUIRE_LOCK(ump);
 		/*
 		 * On a read failure leave the loop; the code after the loop
 		 * then removes the entry from the in-memory list only.
 		 */
 		if (error)
 			break;
 		/* If the list has changed restart the loop. */
 		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
 		nino = 0;
 		if (idp && (idp->id_state & UNLINKNEXT))
 			nino = idp->id_ino;
 		if (nino != pino ||
 		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
 			/* The buffer is released with the lock dropped. */
 			FREE_LOCK(ump);
 			brelse(bp);
 			ACQUIRE_LOCK(ump);
 			continue;
 		}
 		/* From here on nino is the successor's inode number (or 0). */
 		nino = 0;
 		idn = TAILQ_NEXT(inodedep, id_unlinked);
 		if (idn)
 			nino = idn->id_ino;
 		/*
 		 * Remove us from the in memory list.  After this we cannot
 		 * access the inodedep.
 		 */
 		KASSERT((inodedep->id_state & UNLINKED) != 0,
 		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
 		    inodedep));
 		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
 		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
 		FREE_LOCK(ump);
 		/*
 		 * The predecessor's next pointer is manually updated here
 		 * so that the NEXT flag is never cleared for an element
 		 * that is in the list.
 		 */
 		if (pino == 0) {
 			/*
 			 * Superblock predecessor: refresh the buffer from the
 			 * in-memory superblock and attach an sbdep to it.
 			 */
 			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
 			bpfs = (struct fs *)bp->b_data;
 			ffs_oldfscompat_write(bpfs, ump);
 			softdep_setup_sbupdate(ump, bpfs, bp);
 			/*
 			 * Because we may have made changes to the superblock,
 			 * we need to recompute its check-hash.
 			 */
 			bpfs->fs_ckhash = ffs_calc_sbhash(bpfs);
 		} else if (fs->fs_magic == FS_UFS1_MAGIC) {
 			/* UFS1 predecessor: point its di_freelink past us. */
 			((struct ufs1_dinode *)bp->b_data +
 			    ino_to_fsbo(fs, pino))->di_freelink = nino;
 		} else {
 			/* Otherwise a UFS2 dinode, which also has a hash. */
 			dip = (struct ufs2_dinode *)bp->b_data +
 			    ino_to_fsbo(fs, pino);
 			dip->di_freelink = nino;
 			ffs_update_dinode_ckhash(fs, dip);
 		}
 		/*
 		 * If the bwrite fails we have no recourse to recover.  The
 		 * filesystem is corrupted already.
 		 */
 		bwrite(bp);
 		ACQUIRE_LOCK(ump);
 		/*
 		 * If the superblock pointer still needs to be cleared force
 		 * a write here.
 		 */
 		if (fs->fs_sujfree == ino) {
 			FREE_LOCK(ump);
 			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
 			    (int)fs->fs_sbsize, 0, 0, 0);
 			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
 			bpfs = (struct fs *)bp->b_data;
 			ffs_oldfscompat_write(bpfs, ump);
 			softdep_setup_sbupdate(ump, bpfs, bp);
 			/*
 			 * Because we may have made changes to the superblock,
 			 * we need to recompute its check-hash.
 			 */
 			bpfs->fs_ckhash = ffs_calc_sbhash(bpfs);
 			bwrite(bp);
 			ACQUIRE_LOCK(ump);
 		}
 
 		/* The entry is already off the list; return with lock held. */
 		if (fs->fs_sujfree != ino)
 			return;
 		panic("clear_unlinked_inodedep: Failed to clear free head");
 	}
 	/*
 	 * Reached only through the break statements above: drop the entry
 	 * from the in-memory list without writing anything.
 	 */
 	if (inodedep->id_ino == fs->fs_sujfree)
 		panic("clear_unlinked_inodedep: Freeing head of free list");
 	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
 	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
 	return;
 }
 
 /*
  * This workitem decrements the inode's link count.
  * If the link count reaches zero, the file is removed.
  *
  * dirrem identifies the inode through dm_oldinum.  flags is handed to
  * ffs_vgetf() with LK_EXCLUSIVE added.  Returns EBUSY when ffs_vgetf()
  * fails; otherwise returns 0, or the result of the recursive call made
  * at the end of the directory case.  The dirrem must not be on a
  * worklist when this is called.
  */
 static int
 handle_workitem_remove(dirrem, flags)
 	struct dirrem *dirrem;
 	int flags;
 {
 	struct inodedep *inodedep;
 	struct workhead dotdotwk;	/* items held back for the ".." ref */
 	struct worklist *wk;
 	struct ufsmount *ump;
 	struct mount *mp;
 	struct vnode *vp;
 	struct inode *ip;
 	ino_t oldinum;
 
 	if (dirrem->dm_state & ONWORKLIST)
 		panic("handle_workitem_remove: dirrem %p still on worklist",
 		    dirrem);
 	/*
 	 * Keep a local copy of the inode number: dm_oldinum is overwritten
 	 * further down in the directory case.
 	 */
 	oldinum = dirrem->dm_oldinum;
 	mp = dirrem->dm_list.wk_mp;
 	ump = VFSTOUFS(mp);
 	flags |= LK_EXCLUSIVE;
 	if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
 		return (EBUSY);
 	ip = VTOI(vp);
 	MPASS(ip->i_mode != 0);
 	ACQUIRE_LOCK(ump);
 	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
 		panic("handle_workitem_remove: lost inodedep");
 	if (dirrem->dm_state & ONDEPLIST)
 		LIST_REMOVE(dirrem, dm_inonext);
 	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
 	    ("handle_workitem_remove:  Journal entries not written."));
 
 	/*
 	 * Move all dependencies waiting on the remove to complete
 	 * from the dirrem to the inode inowait list to be completed
 	 * after the inode has been updated and written to disk.
 	 *
 	 * Any marked MKDIR_PARENT are saved to be completed when the 
 	 * dotdot ref is removed unless DIRCHG is specified.  For
 	 * directory change operations there will be no further
 	 * directory writes and the jsegdeps need to be moved along
 	 * with the rest to be completed when the inode is free or
 	 * stable in the inode free list.
 	 */
 	LIST_INIT(&dotdotwk);
 	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		if ((dirrem->dm_state & DIRCHG) == 0 &&
 		    wk->wk_state & MKDIR_PARENT) {
 			wk->wk_state &= ~MKDIR_PARENT;
 			WORKLIST_INSERT(&dotdotwk, wk);
 			continue;
 		}
 		WORKLIST_INSERT(&inodedep->id_inowait, wk);
 	}
 	/* dm_jwork was drained above; it now receives the held-back items. */
 	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
 	/*
 	 * Normal file deletion.
 	 */
 	if ((dirrem->dm_state & RMDIR) == 0) {
 		ip->i_nlink--;
 		KASSERT(ip->i_nlink >= 0, ("handle_workitem_remove: file ino "
 		    "%ju negative i_nlink %d", (intmax_t)ip->i_number,
 		    ip->i_nlink));
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		UFS_INODE_SET_FLAG(ip, IN_CHANGE);
 		if (ip->i_nlink < ip->i_effnlink)
 			panic("handle_workitem_remove: bad file delta");
 		if (ip->i_nlink == 0) 
 			unlinked_inodedep(mp, inodedep);
 		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
 		    ("handle_workitem_remove: worklist not empty. %s",
 		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
 		WORKITEM_FREE(dirrem, D_DIRREM);
 		FREE_LOCK(ump);
 		goto out;
 	}
 	/*
 	 * Directory deletion. Decrement reference count for both the
 	 * just deleted parent directory entry and the reference for ".".
 	 * Arrange to have the reference count on the parent decremented
 	 * to account for the loss of "..".
 	 */
 	ip->i_nlink -= 2;
 	KASSERT(ip->i_nlink >= 0, ("handle_workitem_remove: directory ino "
 	    "%ju negative i_nlink %d", (intmax_t)ip->i_number, ip->i_nlink));
 	DIP_SET(ip, i_nlink, ip->i_nlink);
 	UFS_INODE_SET_FLAG(ip, IN_CHANGE);
 	if (ip->i_nlink < ip->i_effnlink)
 		panic("handle_workitem_remove: bad dir delta");
 	if (ip->i_nlink == 0)
 		unlinked_inodedep(mp, inodedep);
 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 	/*
 	 * Rename a directory to a new parent. Since, we are both deleting
 	 * and creating a new directory entry, the link count on the new
 	 * directory should not change. Thus we skip the followup dirrem.
 	 */
 	if (dirrem->dm_state & DIRCHG) {
 		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
 		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
 		WORKITEM_FREE(dirrem, D_DIRREM);
 		FREE_LOCK(ump);
 		goto out;
 	}
 	/*
 	 * Reuse the dirrem for the parent directory: its state is reset to
 	 * ONDEPLIST alone and its target becomes dm_dirinum.
 	 */
 	dirrem->dm_state = ONDEPLIST;
 	dirrem->dm_oldinum = dirrem->dm_dirinum;
 	/*
 	 * Place the dirrem on the parent's diremhd list.
 	 */
 	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
 		panic("handle_workitem_remove: lost dir inodedep");
 	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
 	/*
 	 * If the allocated inode has never been written to disk, then
 	 * the on-disk inode is zero'ed and we can remove the file
 	 * immediately.  When journaling if the inode has been marked
 	 * unlinked and not DEPCOMPLETE we know it can never be written.
 	 */
 	/* Look up the removed directory again; oldinum still names it. */
 	inodedep_lookup(mp, oldinum, 0, &inodedep);
 	if (inodedep == NULL ||
 	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
 	    check_inode_unwritten(inodedep)) {
 		FREE_LOCK(ump);
 		vput(vp);
 		/* The recursive call operates on the new dm_oldinum. */
 		return handle_workitem_remove(dirrem, flags);
 	}
 	/* Otherwise defer the dirrem until this inode has been written. */
 	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
 	FREE_LOCK(ump);
 	UFS_INODE_SET_FLAG(ip, IN_CHANGE);
 out:
 	/* Reached with the softdep lock released and the vnode still held. */
 	ffs_update(vp, 0);
 	vput(vp);
 	return (0);
 }
 
 /*
  * Inode de-allocation dependencies.
  * 
  * When an inode's link count is reduced to zero, it can be de-allocated. We
  * found it convenient to postpone de-allocation until after the inode is
  * written to disk with its new link count (zero).  At this point, all of the
  * on-disk inode's block pointers are nullified and, with careful dependency
  * list ordering, all dependencies related to the inode will be satisfied and
  * the corresponding dependency structures de-allocated.  So, if/when the
  * inode is reused, there will be no mixing of old dependencies with new
  * ones.  This artificial dependency is set up by the block de-allocation
  * procedure above (softdep_setup_freeblocks) and completed by the
  * following procedure.
  */
 static void 
 handle_workitem_freefile(freefile)
 	struct freefile *freefile;
 {
 	struct workhead wkhd;
 	struct fs *fs;
 	struct ufsmount *ump;
 	int error;
 #ifdef INVARIANTS
 	struct inodedep *idp;
 #endif
 
 	ump = VFSTOUFS(freefile->fx_list.wk_mp);
 	fs = ump->um_fs;
 #ifdef INVARIANTS
 	ACQUIRE_LOCK(ump);
 	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
 	FREE_LOCK(ump);
 	if (error)
 		panic("handle_workitem_freefile: inodedep %p survived", idp);
 #endif
 	UFS_LOCK(ump);
 	fs->fs_pendinginodes -= 1;
 	UFS_UNLOCK(ump);
 	LIST_INIT(&wkhd);
 	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
 	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
 	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
 		softdep_error("handle_workitem_freefile", error);
 	ACQUIRE_LOCK(ump);
 	WORKITEM_FREE(freefile, D_FREEFILE);
 	FREE_LOCK(ump);
 }
 
 
 /*
  * Take the marker element off the work list it is on and hand back the
  * element that followed it (NULL when the marker was last).
  */
 static __inline struct worklist *
 markernext(struct worklist *marker)
 {
 	struct worklist *successor = LIST_NEXT(marker, wk_list);
 
 	LIST_REMOVE(marker, wk_list);
 	return (successor);
 }
 
 /*
  * Disk writes.
  * 
  * The dependency structures constructed above are most actively used when file
  * system blocks are written to disk.  No constraints are placed on when a
  * block can be written, but unsatisfied update dependencies are made safe by
  * modifying (or replacing) the source memory for the duration of the disk
  * write.  When the disk write completes, the memory block is again brought
  * up-to-date.
  *
  * In-core inode structure reclamation.
  * 
  * Because there are a finite number of "in-core" inode structures, they are
  * reused regularly.  By transferring all inode-related dependencies to the
  * in-memory inode block and indexing them separately (via "inodedep"s), we
  * can allow "in-core" inode structures to be reused at any time and avoid
  * any increase in contention.
  *
  * Called just before entering the device driver to initiate a new disk I/O.
  * The buffer must be locked, thus, no I/O completion operations can occur
  * while we are manipulating its associated dependencies.
  *
  * Each work item on bp->b_dep is dispatched by type to its pre-write
  * handler.  A stack-allocated marker item keeps the iteration position,
  * because some handlers may sleep in jwait().  It is a panic to call this
  * for anything but a BIO_WRITE buffer, or for a buffer with a background
  * write in progress.
  */
 static void 
 softdep_disk_io_initiation(bp)
 	struct buf *bp;		/* structure describing disk write to occur */
 {
 	struct worklist *wk;
 	struct worklist marker;
 	struct inodedep *inodedep;
 	struct freeblks *freeblks;
 	struct jblkdep *jblkdep;
 	struct newblk *newblk;
 	struct ufsmount *ump;
 
 	/*
 	 * We only care about write operations. There should never
 	 * be dependencies for reads.
 	 */
 	if (bp->b_iocmd != BIO_WRITE)
 		panic("softdep_disk_io_initiation: not write");
 
 	if (bp->b_vflags & BV_BKGRDINPROG)
 		panic("softdep_disk_io_initiation: Writing buffer with "
 		    "background write in progress: %p", bp);
 
 	/* Without a ufsmount for this buffer nothing is processed. */
 	ump = softdep_bp_to_mp(bp);
 	if (ump == NULL)
 		return;
 
 	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
 	PHOLD(curproc);			/* Don't swap out kernel stack */
 	ACQUIRE_LOCK(ump);
 	/*
 	 * Do any necessary pre-I/O processing.
 	 *
 	 * On every pass the marker is linked in right after the current
 	 * item; markernext() then unlinks it and yields whatever follows
 	 * it at that time.
 	 */
 	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
 	     wk = markernext(&marker)) {
 		LIST_INSERT_AFTER(wk, &marker, wk_list);
 		switch (wk->wk_type) {
 
 		case D_PAGEDEP:
 			initiate_write_filepage(WK_PAGEDEP(wk), bp);
 			continue;
 
 		case D_INODEDEP:
 			/* Pick the handler matching the on-disk inode format. */
 			inodedep = WK_INODEDEP(wk);
 			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
 				initiate_write_inodeblock_ufs1(inodedep, bp);
 			else
 				initiate_write_inodeblock_ufs2(inodedep, bp);
 			continue;
 
 		case D_INDIRDEP:
 			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
 			continue;
 
 		case D_BMSAFEMAP:
 			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
 			continue;
 
 		case D_JSEG:
 			WK_JSEG(wk)->js_buf = NULL;
 			continue;
 
 		case D_FREEBLKS:
 			freeblks = WK_FREEBLKS(wk);
 			jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
 			/*
 			 * We have to wait for the freeblks to be journaled
 			 * before we can write an inodeblock with updated
 			 * pointers.  Be careful to arrange the marker so
 			 * we revisit the freeblks if it's not removed by
 			 * the first jwait().
 			 */
 			if (jblkdep != NULL) {
 				/* Marker goes in front of wk so wk is seen again. */
 				LIST_REMOVE(&marker, wk_list);
 				LIST_INSERT_BEFORE(wk, &marker, wk_list);
 				jwait(&jblkdep->jb_list, MNT_WAIT);
 			}
 			continue;
 		case D_ALLOCDIRECT:
 		case D_ALLOCINDIR:
 			/*
 			 * We have to wait for the jnewblk to be journaled
 			 * before we can write to a block if the contents
 			 * may be confused with an earlier file's indirect
 			 * at recovery time.  Handle the marker as described
 			 * above.
 			 */
 			newblk = WK_NEWBLK(wk);
 			if (newblk->nb_jnewblk != NULL &&
 			    indirblk_lookup(newblk->nb_list.wk_mp,
 			    newblk->nb_newblkno)) {
 				LIST_REMOVE(&marker, wk_list);
 				LIST_INSERT_BEFORE(wk, &marker, wk_list);
 				jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
 			}
 			continue;
 
 		case D_SBDEP:
 			initiate_write_sbdep(WK_SBDEP(wk));
 			continue;
 
 		/* These types need no work before the write is issued. */
 		case D_MKDIR:
 		case D_FREEWORK:
 		case D_FREEDEP:
 		case D_JSEGDEP:
 			continue;
 
 		default:
 			/*
 			 * NOTE(review): the message names
 			 * "handle_disk_io_initiation", not this function;
 			 * left as is in case it is searched for.
 			 */
 			panic("handle_disk_io_initiation: Unexpected type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 	FREE_LOCK(ump);
 	PRELE(curproc);			/* Allow swapout of kernel stack */
 }
 
 /*
  * Called from within the procedure above to deal with unsatisfied
  * allocation dependencies in a directory. The buffer must be locked,
  * thus, no I/O completion operations can occur while we are
  * manipulating its associated dependencies.
  *
  * pagedep describes the directory block held in bp.  After waiting for
  * the journal records attached to the pagedep, every pending directory
  * add is rolled back in the buffer and flagged UNDONE.
  */
 static void
 initiate_write_filepage(pagedep, bp)
 	struct pagedep *pagedep;
 	struct buf *bp;
 {
 	struct jremref *jremref;
 	struct jmvref *jmvref;
 	struct dirrem *dirrem;
 	struct diradd *dap;
 	struct direct *ep;
 	int i;
 
 	if (pagedep->pd_state & IOSTARTED) {
 		/*
 		 * This can only happen if there is a driver that does not
 		 * understand chaining. Here biodone will reissue the call
 		 * to strategy for the incomplete buffers.
 		 */
 		printf("initiate_write_filepage: already started\n");
 		return;
 	}
 	pagedep->pd_state |= IOSTARTED;
 	/*
 	 * Wait for all journal remove dependencies to hit the disk.
 	 * We can not allow any potentially conflicting directory adds
 	 * to be visible before removes and rollback is too difficult.
 	 * The per-filesystem lock may be dropped and re-acquired, however 
 	 * we hold the buf locked so the dependency can not go away.
 	 */
 	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
 		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
 			jwait(&jremref->jr_list, MNT_WAIT);
 	/* Likewise wait for every jmvref queued on this pagedep. */
 	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
 		jwait(&jmvref->jm_list, MNT_WAIT);
 	/*
 	 * Roll back each pending add: the entry at da_offset in the buffer
 	 * must currently hold the new inode number; it is replaced by the
 	 * previous entry's inode number for a DIRCHG, or by 0 otherwise.
 	 */
 	for (i = 0; i < DAHASHSZ; i++) {
 		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
 			ep = (struct direct *)
 			    ((char *)bp->b_data + dap->da_offset);
 			if (ep->d_ino != dap->da_newinum)
 				panic("%s: dir inum %ju != new %ju",
 				    "initiate_write_filepage",
 				    (uintmax_t)ep->d_ino,
 				    (uintmax_t)dap->da_newinum);
 			if (dap->da_state & DIRCHG)
 				ep->d_ino = dap->da_previous->dm_oldinum;
 			else
 				ep->d_ino = 0;
 			dap->da_state &= ~ATTACHED;
 			dap->da_state |= UNDONE;
 		}
 	}
 }
 
 /*
  * Version of initiate_write_inodeblock that handles UFS1 dinodes.
  * Note that any bug fixes made to this routine must be done in the
  * version found below.
  *
  * Called from within the procedure above to deal with unsatisfied
  * allocation dependencies in an inodeblock. The buffer must be
  * locked, thus, no I/O completion operations can occur while we
  * are manipulating its associated dependencies.
  *
  * The dinode for inodedep inside bp is edited in place so that the
  * image written contains no pointer or size whose dependency is still
  * outstanding.  The per-mount softdep lock is held on entry and on
  * return; it is dropped only around the malloc() below.
  */
 static void 
 initiate_write_inodeblock_ufs1(inodedep, bp)
 	struct inodedep *inodedep;
 	struct buf *bp;			/* The inode block */
 {
 	struct allocdirect *adp, *lastadp;
 	struct ufs1_dinode *dp;
 	struct ufs1_dinode *sip;
 	struct inoref *inoref;
 	struct ufsmount *ump;
 	struct fs *fs;
 	ufs_lbn_t i;
 #ifdef INVARIANTS
 	ufs_lbn_t prevlbn = 0;
 #endif
 	int deplist;
 
 	if (inodedep->id_state & IOSTARTED)
 		panic("initiate_write_inodeblock_ufs1: already started");
 	inodedep->id_state |= IOSTARTED;
 	fs = inodedep->id_fs;
 	ump = VFSTOUFS(inodedep->id_list.wk_mp);
 	LOCK_OWNED(ump);
 	/* Locate this inode's dinode within the inode block buffer. */
 	dp = (struct ufs1_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, inodedep->id_ino);
 
 	/*
 	 * If we're on the unlinked list but have not yet written our
 	 * next pointer initialize it here.
 	 */
 	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
 		struct inodedep *inon;
 
 		inon = TAILQ_NEXT(inodedep, id_unlinked);
 		dp->di_freelink = inon ? inon->id_ino : 0;
 	}
 	/*
 	 * If the bitmap is not yet written, then the allocated
 	 * inode cannot be written to disk.
 	 */
 	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 		if (inodedep->id_savedino1 != NULL)
 			panic("initiate_write_inodeblock_ufs1: I/O underway");
 		/* The lock is released for the duration of the allocation. */
 		FREE_LOCK(ump);
 		sip = malloc(sizeof(struct ufs1_dinode),
 		    M_SAVEDINO, M_SOFTDEP_FLAGS);
 		ACQUIRE_LOCK(ump);
 		/*
 		 * Keep the real dinode in id_savedino1 and write a zeroed
 		 * one that retains only di_gen and di_freelink.
 		 */
 		inodedep->id_savedino1 = sip;
 		*inodedep->id_savedino1 = *dp;
 		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
 		dp->di_gen = inodedep->id_savedino1->di_gen;
 		dp->di_freelink = inodedep->id_savedino1->di_freelink;
 		return;
 	}
 	/*
 	 * If no dependencies, then there is nothing to roll back.
 	 */
 	/* Record the current size and link count; ext size is stored as 0. */
 	inodedep->id_savedsize = dp->di_size;
 	inodedep->id_savedextsize = 0;
 	inodedep->id_savednlink = dp->di_nlink;
 	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
 	    TAILQ_EMPTY(&inodedep->id_inoreflst))
 		return;
 	/*
 	 * Revert the link count to that of the first unwritten journal entry.
 	 */
 	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
 	if (inoref)
 		dp->di_nlink = inoref->if_nlink;
 	/*
 	 * Set the dependencies to busy.
 	 *
 	 * deplist is a bitmask indexed by ad_offset; it is only filled in,
 	 * and only consulted, when INVARIANTS is defined.
 	 */
 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 	     adp = TAILQ_NEXT(adp, ad_next)) {
 #ifdef INVARIANTS
 		if (deplist != 0 && prevlbn >= adp->ad_offset)
 			panic("softdep_write_inodeblock: lbn order");
 		prevlbn = adp->ad_offset;
 		if (adp->ad_offset < UFS_NDADDR &&
 		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
 			panic("initiate_write_inodeblock_ufs1: "
 			    "direct pointer #%jd mismatch %d != %jd",
 			    (intmax_t)adp->ad_offset,
 			    dp->di_db[adp->ad_offset],
 			    (intmax_t)adp->ad_newblkno);
 		if (adp->ad_offset >= UFS_NDADDR &&
 		    dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno)
 			panic("initiate_write_inodeblock_ufs1: "
 			    "indirect pointer #%jd mismatch %d != %jd",
 			    (intmax_t)adp->ad_offset - UFS_NDADDR,
 			    dp->di_ib[adp->ad_offset - UFS_NDADDR],
 			    (intmax_t)adp->ad_newblkno);
 		deplist |= 1 << adp->ad_offset;
 		if ((adp->ad_state & ATTACHED) == 0)
 			panic("initiate_write_inodeblock_ufs1: "
 			    "Unknown state 0x%x", adp->ad_state);
 #endif /* INVARIANTS */
 		adp->ad_state &= ~ATTACHED;
 		adp->ad_state |= UNDONE;
 	}
 	/*
 	 * The on-disk inode cannot claim to be any larger than the last
 	 * fragment that has been written. Otherwise, the on-disk inode
 	 * might have fragments that were not the last block in the file
 	 * which would corrupt the filesystem.
 	 *
 	 * Each direct pointer is restored to its old block.  At the first
 	 * entry whose old size was a fragment, di_size is cut back to end
 	 * at that fragment, every later direct pointer and every indirect
 	 * pointer is cleared, and the routine returns.
 	 */
 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
 		if (adp->ad_offset >= UFS_NDADDR)
 			break;
 		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
 		/* keep going until hitting a rollback to a frag */
 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
 			continue;
 		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
 		for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) {
 #ifdef INVARIANTS
 			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
 				panic("initiate_write_inodeblock_ufs1: "
 				    "lost dep1");
 #endif /* INVARIANTS */
 			dp->di_db[i] = 0;
 		}
 		for (i = 0; i < UFS_NIADDR; i++) {
 #ifdef INVARIANTS
 			if (dp->di_ib[i] != 0 &&
 			    (deplist & ((1 << UFS_NDADDR) << i)) == 0)
 				panic("initiate_write_inodeblock_ufs1: "
 				    "lost dep2");
 #endif /* INVARIANTS */
 			dp->di_ib[i] = 0;
 		}
 		return;
 	}
 	/*
 	 * If we have zero'ed out the last allocated block of the file,
 	 * roll back the size to the last currently allocated block.
 	 * We know that this last allocated block is full-sized, as
 	 * we already checked for fragments in the loop above.
 	 */
 	if (lastadp != NULL &&
 	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
 		/* Search downward for the highest direct pointer still set. */
 		for (i = lastadp->ad_offset; i >= 0; i--)
 			if (dp->di_db[i] != 0)
 				break;
 		dp->di_size = (i + 1) * fs->fs_bsize;
 	}
 	/*
 	 * The only dependencies are for indirect blocks.
 	 *
 	 * The file size for indirect block additions is not guaranteed.
 	 * Such a guarantee would be non-trivial to achieve. The conventional
 	 * synchronous write implementation also does not make this guarantee.
 	 * Fsck should catch and fix discrepancies. Arguably, the file size
 	 * can be over-estimated without destroying integrity when the file
 	 * moves into the indirect blocks (i.e., is large). If we want to
 	 * postpone fsck, we are stuck with this argument.
 	 *
 	 * adp was left by the loop above at the first entry with an offset
 	 * of UFS_NDADDR or more, or at NULL when there is none.
 	 */
 	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
 		dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0;
 }
 		
 /*
  * Version of initiate_write_inodeblock that handles UFS2 dinodes.
  * Note that any bug fixes made to this routine must be done in the
  * version found above.
  *
  * Called from within the procedure above to deal with unsatisfied
  * allocation dependencies in an inodeblock. The buffer must be
  * locked, thus, no I/O completion operations can occur while we
  * are manipulating its associated dependencies.
  */
 static void 
 initiate_write_inodeblock_ufs2(inodedep, bp)
 	struct inodedep *inodedep;
 	struct buf *bp;			/* The inode block */
 {
 	struct allocdirect *adp, *lastadp;
 	struct ufs2_dinode *dp;
 	struct ufs2_dinode *sip;
 	struct inoref *inoref;
 	struct ufsmount *ump;
 	struct fs *fs;
 	ufs_lbn_t i;
 #ifdef INVARIANTS
 	ufs_lbn_t prevlbn = 0;
 #endif
 	int deplist;
 
 	if (inodedep->id_state & IOSTARTED)
 		panic("initiate_write_inodeblock_ufs2: already started");
 	inodedep->id_state |= IOSTARTED;
 	fs = inodedep->id_fs;
 	ump = VFSTOUFS(inodedep->id_list.wk_mp);
 	LOCK_OWNED(ump);
 	dp = (struct ufs2_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, inodedep->id_ino);
 
 	/*
 	 * If we're on the unlinked list but have not yet written our
 	 * next pointer initialize it here.
 	 */
 	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
 		struct inodedep *inon;
 
 		inon = TAILQ_NEXT(inodedep, id_unlinked);
 		dp->di_freelink = inon ? inon->id_ino : 0;
 		ffs_update_dinode_ckhash(fs, dp);
 	}
 	/*
 	 * If the bitmap is not yet written, then the allocated
 	 * inode cannot be written to disk.
 	 */
 	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 		if (inodedep->id_savedino2 != NULL)
 			panic("initiate_write_inodeblock_ufs2: I/O underway");
 		FREE_LOCK(ump);
 		sip = malloc(sizeof(struct ufs2_dinode),
 		    M_SAVEDINO, M_SOFTDEP_FLAGS);
 		ACQUIRE_LOCK(ump);
 		inodedep->id_savedino2 = sip;
 		*inodedep->id_savedino2 = *dp;
 		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
 		dp->di_gen = inodedep->id_savedino2->di_gen;
 		dp->di_freelink = inodedep->id_savedino2->di_freelink;
 		return;
 	}
 	/*
 	 * If no dependencies, then there is nothing to roll back.
 	 */
 	inodedep->id_savedsize = dp->di_size;
 	inodedep->id_savedextsize = dp->di_extsize;
 	inodedep->id_savednlink = dp->di_nlink;
 	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
 	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
 	    TAILQ_EMPTY(&inodedep->id_inoreflst))
 		return;
 	/*
 	 * Revert the link count to that of the first unwritten journal entry.
 	 */
 	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
 	if (inoref)
 		dp->di_nlink = inoref->if_nlink;
 
 	/*
 	 * Set the ext data dependencies to busy.
 	 */
 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
 	     adp = TAILQ_NEXT(adp, ad_next)) {
 #ifdef INVARIANTS
 		if (deplist != 0 && prevlbn >= adp->ad_offset)
 			panic("initiate_write_inodeblock_ufs2: lbn order");
 		prevlbn = adp->ad_offset;
 		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
 			panic("initiate_write_inodeblock_ufs2: "
 			    "ext pointer #%jd mismatch %jd != %jd",
 			    (intmax_t)adp->ad_offset,
 			    (intmax_t)dp->di_extb[adp->ad_offset],
 			    (intmax_t)adp->ad_newblkno);
 		deplist |= 1 << adp->ad_offset;
 		if ((adp->ad_state & ATTACHED) == 0)
 			panic("initiate_write_inodeblock_ufs2: Unknown "
 			    "state 0x%x", adp->ad_state);
 #endif /* INVARIANTS */
 		adp->ad_state &= ~ATTACHED;
 		adp->ad_state |= UNDONE;
 	}
 	/*
 	 * The on-disk inode cannot claim to be any larger than the last
 	 * fragment that has been written. Otherwise, the on-disk inode
 	 * might have fragments that were not the last block in the ext
 	 * data which would corrupt the filesystem.
 	 */
 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
 		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
 		/* keep going until hitting a rollback to a frag */
 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
 			continue;
 		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
 		for (i = adp->ad_offset + 1; i < UFS_NXADDR; i++) {
 #ifdef INVARIANTS
 			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
 				panic("initiate_write_inodeblock_ufs2: "
 				    "lost dep1");
 #endif /* INVARIANTS */
 			dp->di_extb[i] = 0;
 		}
 		lastadp = NULL;
 		break;
 	}
 	/*
 	 * If we have zero'ed out the last allocated block of the ext
 	 * data, roll back the size to the last currently allocated block.
 	 * We know that this last allocated block is a full-sized as
 	 * we already checked for fragments in the loop above.
 	 */
 	if (lastadp != NULL &&
 	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
 		for (i = lastadp->ad_offset; i >= 0; i--)
 			if (dp->di_extb[i] != 0)
 				break;
 		dp->di_extsize = (i + 1) * fs->fs_bsize;
 	}
 	/*
 	 * Set the file data dependencies to busy.
 	 */
 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 	     adp = TAILQ_NEXT(adp, ad_next)) {
 #ifdef INVARIANTS
 		if (deplist != 0 && prevlbn >= adp->ad_offset)
 			panic("softdep_write_inodeblock: lbn order");
 		if ((adp->ad_state & ATTACHED) == 0)
 			panic("inodedep %p and adp %p not attached", inodedep, adp);
 		prevlbn = adp->ad_offset;
 		if (!ffs_fsfail_cleanup(ump, 0) &&
 		    adp->ad_offset < UFS_NDADDR &&
 		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
 			panic("initiate_write_inodeblock_ufs2: "
 			    "direct pointer #%jd mismatch %jd != %jd",
 			    (intmax_t)adp->ad_offset,
 			    (intmax_t)dp->di_db[adp->ad_offset],
 			    (intmax_t)adp->ad_newblkno);
 		if (!ffs_fsfail_cleanup(ump, 0) &&
 		    adp->ad_offset >= UFS_NDADDR &&
 		    dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno)
 			panic("initiate_write_inodeblock_ufs2: "
 			    "indirect pointer #%jd mismatch %jd != %jd",
 			    (intmax_t)adp->ad_offset - UFS_NDADDR,
 			    (intmax_t)dp->di_ib[adp->ad_offset - UFS_NDADDR],
 			    (intmax_t)adp->ad_newblkno);
 		deplist |= 1 << adp->ad_offset;
 		if ((adp->ad_state & ATTACHED) == 0)
 			panic("initiate_write_inodeblock_ufs2: Unknown "
 			     "state 0x%x", adp->ad_state);
 #endif /* INVARIANTS */
 		adp->ad_state &= ~ATTACHED;
 		adp->ad_state |= UNDONE;
 	}
 	/*
 	 * The on-disk inode cannot claim to be any larger than the last
 	 * fragment that has been written. Otherwise, the on-disk inode
 	 * might have fragments that were not the last block in the file
 	 * which would corrupt the filesystem.
 	 */
 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
 		if (adp->ad_offset >= UFS_NDADDR)
 			break;
 		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
 		/* keep going until hitting a rollback to a frag */
 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
 			continue;
 		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
 		for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) {
 #ifdef INVARIANTS
 			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
 				panic("initiate_write_inodeblock_ufs2: "
 				    "lost dep2");
 #endif /* INVARIANTS */
 			dp->di_db[i] = 0;
 		}
 		for (i = 0; i < UFS_NIADDR; i++) {
 #ifdef INVARIANTS
 			if (dp->di_ib[i] != 0 &&
 			    (deplist & ((1 << UFS_NDADDR) << i)) == 0)
 				panic("initiate_write_inodeblock_ufs2: "
 				    "lost dep3");
 #endif /* INVARIANTS */
 			dp->di_ib[i] = 0;
 		}
 		ffs_update_dinode_ckhash(fs, dp);
 		return;
 	}
 	/*
 	 * If we have zero'ed out the last allocated block of the file,
 	 * roll back the size to the last currently allocated block.
 	 * We know that this last allocated block is full-sized as
 	 * we already checked for fragments in the loop above.
 	 */
 	if (lastadp != NULL &&
 	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
 		for (i = lastadp->ad_offset; i >= 0; i--)
 			if (dp->di_db[i] != 0)
 				break;
 		dp->di_size = (i + 1) * fs->fs_bsize;
 	}
 	/*
 	 * The only dependencies are for indirect blocks.
 	 *
 	 * The file size for indirect block additions is not guaranteed.
 	 * Such a guarantee would be non-trivial to achieve. The conventional
 	 * synchronous write implementation also does not make this guarantee.
 	 * Fsck should catch and fix discrepancies. Arguably, the file size
 	 * can be over-estimated without destroying integrity when the file
 	 * moves into the indirect blocks (i.e., is large). If we want to
 	 * postpone fsck, we are stuck with this argument.
 	 */
 	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
 		dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0;
 	ffs_update_dinode_ckhash(fs, dp);
 }
 
 /*
  * Cancel an indirdep as a result of truncation.  Release all of the
  * children allocindirs and place their journal work on the appropriate
  * list.
  *
  * indirdep is the dependency being canceled; it must not already be
  * marked GOINGAWAY (we panic if it is).  bp is the buffer whose current
  * contents are copied into the indirdep's save area.  freeblks is passed
  * to cancel_allocindir() for every child and is recorded in ir_freeblks.
  */
 static void
 cancel_indirdep(indirdep, bp, freeblks)
 	struct indirdep *indirdep;
 	struct buf *bp;
 	struct freeblks *freeblks;
 {
 	struct allocindir *aip;
 
 	/*
 	 * None of the indirect pointers will ever be visible,
 	 * so they can simply be tossed. GOINGAWAY ensures
 	 * that allocated pointers will be saved in the buffer
 	 * cache until they are freed. Note that they will
 	 * only be able to be found by their physical address
 	 * since the inode mapping the logical address will
 	 * be gone. The save buffer used for the safe copy
 	 * was allocated in setup_allocindir_phase2 using
 	 * the physical address so it could be used for this
 	 * purpose. Hence we swap the safe copy with the real
 	 * copy, allowing the safe copy to be freed and holding
 	 * on to the real copy for later use in indir_trunc.
 	 */
 	if (indirdep->ir_state & GOINGAWAY)
 		panic("cancel_indirdep: already gone");
 	/*
 	 * An indirdep that is not yet DEPCOMPLETE is still linked through
 	 * ir_next; mark it complete and unlink it.
 	 */
 	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
 		indirdep->ir_state |= DEPCOMPLETE;
 		LIST_REMOVE(indirdep, ir_next);
 	}
 	indirdep->ir_state |= GOINGAWAY;
 	/*
 	 * Pass in bp for blocks that still have journal writes
 	 * pending so we can cancel them on their own.  Only the
 	 * entries on ir_deplisthd get bp; the entries on the done,
 	 * write and complete lists are canceled with a NULL buffer.
 	 */
 	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL)
 		cancel_allocindir(aip, bp, freeblks, 0);
 	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL)
 		cancel_allocindir(aip, NULL, freeblks, 0);
 	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL)
 		cancel_allocindir(aip, NULL, freeblks, 0);
 	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL)
 		cancel_allocindir(aip, NULL, freeblks, 0);
 	/*
 	 * If there are pending partial truncations we need to keep the
 	 * old block copy around until they complete.  This is because
 	 * the current b_data is not a perfect superset of the available
 	 * blocks.  In that case the current contents go to ir_saveddata
 	 * and ir_savebp is left untouched; otherwise they overwrite
 	 * ir_savebp directly.
 	 */
 	if (TAILQ_EMPTY(&indirdep->ir_trunc))
 		bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
 	else
 		bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
 	/*
 	 * Re-home the indirdep on the save buffer's dependency list,
 	 * drop its reference to the original buffer and remember the
 	 * freeblks that canceled it.
 	 */
 	WORKLIST_REMOVE(&indirdep->ir_list);
 	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
 	indirdep->ir_bp = NULL;
 	indirdep->ir_freeblks = freeblks;
 }
 
 /*
  * Release an indirdep that no longer has any new pointers to track.
  * Every list hanging off the indirdep must already be empty, it must be
  * DEPCOMPLETE, and both its saved data and its save buffer must already
  * have been given up; each condition is asserted before the item is freed.
  */
 static void
 free_indirdep(indirdep)
 	struct indirdep *indirdep;
 {
 
 	/* No truncation or allocindir may still be queued on any list. */
 	KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
 	    ("free_indirdep: Indir trunc list not empty."));
 	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
 	    ("free_indirdep: Complete head not empty."));
 	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
 	    ("free_indirdep: write head not empty."));
 	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
 	    ("free_indirdep: done head not empty."));
 	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
 	    ("free_indirdep: deplist head not empty."));
 
 	/* The dependency must be complete and hold no saved copies. */
 	KASSERT((indirdep->ir_state & DEPCOMPLETE) != 0,
 	    ("free_indirdep: %p still on newblk list.", indirdep));
 	KASSERT(indirdep->ir_saveddata == NULL,
 	    ("free_indirdep: %p still has saved data.", indirdep));
 	KASSERT(indirdep->ir_savebp == NULL,
 	    ("free_indirdep: %p still has savebp buffer.", indirdep));
 
 	/* Unhook from whatever worklist still holds it, then free it. */
 	if ((indirdep->ir_state & ONWORKLIST) != 0) {
 		WORKLIST_REMOVE(&indirdep->ir_list);
 	}
 	WORKITEM_FREE(indirdep, D_INDIRDEP);
 }
 
 /*
  * Called before a write to an indirdep.  This routine is responsible for
  * rolling back pointers to a safe state which includes only those
  * allocindirs which have been completed.
  *
  * indirdep is the dependency attached to the indirect block and bp is the
  * buffer about to be written.  When a rollback is needed, the up-to-date
  * contents of bp are stashed in ir_saveddata and bp is overwritten with
  * the safe copy held in ir_savebp.
  */
 static void
 initiate_write_indirdep(indirdep, bp)
 	struct indirdep *indirdep;
 	struct buf *bp;
 {
 	struct ufsmount *ump;
 
 	indirdep->ir_state |= IOSTARTED;
 	if (indirdep->ir_state & GOINGAWAY)
 		panic("disk_io_initiation: indirdep gone");
 	/*
 	 * If there are no remaining dependencies, this will be writing
 	 * the real pointers.
 	 */
 	if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
 	    TAILQ_EMPTY(&indirdep->ir_trunc))
 		return;
 	/*
 	 * Replace up-to-date version with safe version.
 	 */
 	if (indirdep->ir_saveddata == NULL) {
 		/*
 		 * The per-mount lock is dropped around the allocation and
 		 * retaken afterwards (presumably because malloc may sleep
 		 * with M_SOFTDEP_FLAGS -- confirm against its definition).
 		 */
 		ump = VFSTOUFS(indirdep->ir_list.wk_mp);
 		LOCK_OWNED(ump);
 		FREE_LOCK(ump);
 		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
 		    M_SOFTDEP_FLAGS);
 		ACQUIRE_LOCK(ump);
 	}
 	indirdep->ir_state &= ~ATTACHED;
 	indirdep->ir_state |= UNDONE;
 	/* Save the current contents, then write out the safe copy instead. */
 	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
 	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
 	    bp->b_bcount);
 }
 
 /*
  * Called when an inode has been cleared in a cg bitmap.  This finally
  * eliminates any canceled jaddrefs.
  *
  * mp is the mount, bp the buffer holding the cylinder group (its b_data
  * is interpreted as a struct cg), ino the inode number that was freed and
  * wkhd an optional list of work items (may be NULL).
  *
  * Panics if the inode is still marked in use in the cg's inode map
  * (checked only while ffs_fsfail_cleanup() returns zero) or if an
  * inodedep still exists for the inode.
  */
 void
 softdep_setup_inofree(mp, bp, ino, wkhd)
 	struct mount *mp;
 	struct buf *bp;
 	ino_t ino;
 	struct workhead *wkhd;
 {
 	struct worklist *wk, *wkn;
 	struct inodedep *inodedep;
 	struct ufsmount *ump;
 	uint8_t *inosused;
 	struct cg *cgp;
 	struct fs *fs;
 
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_inofree called on non-softdep filesystem"));
 	ump = VFSTOUFS(mp);
 	ACQUIRE_LOCK(ump);
 	/* Verify that the bitmap really shows the inode as free. */
 	if (!ffs_fsfail_cleanup(ump, 0)) {
 		fs = ump->um_fs;
 		cgp = (struct cg *)bp->b_data;
 		inosused = cg_inosused(cgp);
 		if (isset(inosused, ino % fs->fs_ipg))
 			panic("softdep_setup_inofree: inode %ju not freed.",
 			    (uintmax_t)ino);
 	}
 	if (inodedep_lookup(mp, ino, 0, &inodedep))
 		panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
 		    (uintmax_t)ino, inodedep);
 	if (wkhd) {
 		/*
 		 * The _SAFE iterator is required because matching entries
 		 * are removed from the list while walking it.
 		 */
 		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
 			if (wk->wk_type != D_JADDREF)
 				continue;
 			WORKLIST_REMOVE(wk);
 			/*
 			 * We can free immediately even if the jaddref
 			 * isn't attached in a background write as now
 			 * the bitmaps are reconciled.
 			 */
 			wk->wk_state |= COMPLETE | ATTACHED;
 			free_jaddref(WK_JADDREF(wk));
 		}
 		/*
 		 * Whatever is left on wkhd is handed to jwork_move() with
 		 * the buffer's dependency list as the destination.
 		 */
 		jwork_move(&bp->b_dep, wkhd);
 	}
 	FREE_LOCK(ump);
 }
 
 /*
  * Called via ffs_blkfree() after a set of frags has been cleared from a cg
  * map.  Any dependencies waiting for the write to clear are added to the
  * buf's list and any jnewblks that are being canceled are discarded
  * immediately.
  *
  * mp is the mount, bp the buffer holding the cg block, blkno and frags
  * describe the range of fragments [blkno, blkno + frags) that was just
  * freed, and wkhd is an optional (possibly NULL) list of work items
  * associated with the freed range.
  *
  * Under INVARIANTS this also asserts that every canceled jnewblk is
  * really free in the bitmap and that the freed range does not overlap
  * any outstanding block allocation dependency.
  */
 void
 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
 	struct mount *mp;
 	struct buf *bp;
 	ufs2_daddr_t blkno;
 	int frags;
 	struct workhead *wkhd;
 {
 	struct bmsafemap *bmsafemap;
 	struct jnewblk *jnewblk;
 	struct ufsmount *ump;
 	struct worklist *wk;
 	struct fs *fs;
 #ifdef INVARIANTS
 	uint8_t *blksfree;
 	struct cg *cgp;
 	ufs2_daddr_t jstart;
 	ufs2_daddr_t jend;
 	ufs2_daddr_t end;
 	long bno;
 	int i;
 #endif
 
 	CTR3(KTR_SUJ,
 	    "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
 	    blkno, frags, wkhd);
 
 	ump = VFSTOUFS(mp);
 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 	    ("softdep_setup_blkfree called on non-softdep filesystem"));
 	ACQUIRE_LOCK(ump);
 	/* Lookup the bmsafemap so we track when it is dirty. */
 	fs = ump->um_fs;
 	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
 	/*
 	 * Detach any jnewblks which have been canceled.  They must linger
 	 * until the bitmap is cleared again by ffs_blkfree() to prevent
 	 * an unjournaled allocation from hitting the disk.
 	 */
 	if (wkhd) {
 		while ((wk = LIST_FIRST(wkhd)) != NULL) {
 			CTR2(KTR_SUJ,
 			    "softdep_setup_blkfree: blkno %jd wk type %d",
 			    blkno, wk->wk_type);
 			WORKLIST_REMOVE(wk);
 			/*
 			 * Anything that is not a jnewblk simply waits on
 			 * the bmsafemap's free list.
 			 */
 			if (wk->wk_type != D_JNEWBLK) {
 				WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
 				continue;
 			}
 			jnewblk = WK_JNEWBLK(wk);
 			KASSERT(jnewblk->jn_state & GOINGAWAY,
 			    ("softdep_setup_blkfree: jnewblk not canceled."));
 #ifdef INVARIANTS
 			/*
 			 * Assert that this block is free in the bitmap
 			 * before we discard the jnewblk.
 			 */
 			cgp = (struct cg *)bp->b_data;
 			blksfree = cg_blksfree(cgp);
 			bno = dtogd(fs, jnewblk->jn_blkno);
 			for (i = jnewblk->jn_oldfrags;
 			    i < jnewblk->jn_frags; i++) {
 				if (isset(blksfree, bno + i))
 					continue;
 				panic("softdep_setup_blkfree: not free");
 			}
 #endif
 			/*
 			 * Even if it's not attached we can free immediately
 			 * as the new bitmap is correct.
 			 */
 			wk->wk_state |= COMPLETE | ATTACHED;
 			free_jnewblk(jnewblk);
 		}
 	}
 
 #ifdef INVARIANTS
 	/*
 	 * Assert that we are not freeing a block which has an outstanding
 	 * allocation dependency.
 	 */
 	fs = VFSTOUFS(mp)->um_fs;
 	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
 	end = blkno + frags;
 	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
 		/*
 		 * Don't match against blocks that will be freed when the
 		 * background write is done.
 		 */
 		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
 		    (COMPLETE | DEPCOMPLETE))
 			continue;
 		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
 		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
 		/*
 		 * The freed range [blkno, end) conflicts with the
 		 * allocation [jstart, jend) when it starts inside it, ends
 		 * inside it, or strictly contains it.  The last case has
 		 * neither endpoint inside the allocation, so it must be
 		 * tested explicitly.
 		 */
 		if ((blkno >= jstart && blkno < jend) ||
 		    (end > jstart && end <= jend) ||
 		    (blkno < jstart && end > jend)) {
 			printf("state 0x%X %jd - %d %d dep %p\n",
 			    jnewblk->jn_state, jnewblk->jn_blkno,
 			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
 			    jnewblk->jn_dep);
 			panic("softdep_setup_blkfree: "
 			    "%jd-%jd(%d) overlaps with %jd-%jd",
 			    blkno, end, frags, jstart, jend);
 		}
 	}
 #endif
 	FREE_LOCK(ump);
 }
 
 /*
  * Revert a block allocation when the journal record that describes it
  * is not yet written.
  *
  * jnewblk describes the allocation, fs is the filesystem, cgp the
  * cylinder group being written and blksfree its free-fragment bitmap.
  * Fragments jn_oldfrags..jn_frags-1 of the block that are currently
  * marked allocated are marked free again and the cg summary counts are
  * adjusted to match.
  *
  * Returns the number of fragments that were rolled back, or 0 if none
  * of them were marked allocated (in which case nothing is modified and
  * the jnewblk state is left alone).
  */
 static int
 jnewblk_rollback(jnewblk, fs, cgp, blksfree)
 	struct jnewblk *jnewblk;
 	struct fs *fs;
 	struct cg *cgp;
 	uint8_t *blksfree;
 {
 	ufs1_daddr_t fragno;
 	long cgbno, bbase;
 	int frags, blk;
 	int i;
 
 	frags = 0;
 	cgbno = dtogd(fs, jnewblk->jn_blkno);
 	/*
 	 * We have to test which frags need to be rolled back.  We may
 	 * be operating on a stale copy when doing background writes.
 	 */
 	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
 		if (isclr(blksfree, cgbno + i))
 			frags++;
 	if (frags == 0)
 		return (0);
 	/*
 	 * This is mostly ffs_blkfree() sans some validation and
 	 * superblock updates.
 	 */
 	if (frags == fs->fs_frag) {
 		/* A whole block is being returned. */
 		fragno = fragstoblks(fs, cgbno);
 		ffs_setblock(fs, blksfree, fragno);
 		ffs_clusteracct(fs, cgp, fragno, 1);
 		cgp->cg_cs.cs_nbfree++;
 	} else {
 		/*
 		 * Partial block: start at the first newly allocated frag
 		 * and locate the base of the block that contains it.
 		 */
 		cgbno += jnewblk->jn_oldfrags;
 		bbase = cgbno - fragnum(fs, cgbno);
 		/* Decrement the old frags.  */
 		blk = blkmap(fs, blksfree, bbase);
 		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
 		/* Deallocate the fragment */
 		for (i = 0; i < frags; i++)
 			setbit(blksfree, cgbno + i);
 		cgp->cg_cs.cs_nffree += frags;
 		/* Add back in counts associated with the new frags */
 		blk = blkmap(fs, blksfree, bbase);
 		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
 		/* If a complete block has been reassembled, account for it. */
 		fragno = fragstoblks(fs, bbase);
 		if (ffs_isblock(fs, blksfree, fragno)) {
 			cgp->cg_cs.cs_nffree -= fs->fs_frag;
 			ffs_clusteracct(fs, cgp, fragno, 1);
 			cgp->cg_cs.cs_nbfree++;
 		}
 	}
 	stat_jnewblk++;
 	/* Record that this allocation is currently rolled back. */
 	jnewblk->jn_state &= ~ATTACHED;
 	jnewblk->jn_state |= UNDONE;
 
 	return (frags);
 }
 
 /*
  * Prepare a cylinder group block for writing.  Inode and block
  * allocations that are still waiting on journal writes (the entries on
  * sm_jaddrefhd and sm_jnewblkhd) are rolled back in the cg held in bp,
  * and the pending allocation lists are swapped onto the "written" lists.
  * Nothing is done if IOSTARTED is already set on the bmsafemap.
  *
  * Panics if a pending jaddref's inode is not marked in use in the inode
  * map, or if jnewblk_rollback() finds nothing to roll back for a pending
  * jnewblk.
  */
 static void
 initiate_write_bmsafemap(bmsafemap, bp)
 	struct bmsafemap *bmsafemap;
 	struct buf *bp;			/* The cg block. */
 {
 	struct jaddref *jaddref;
 	struct jnewblk *jnewblk;
 	uint8_t *inosused;
 	uint8_t *blksfree;
 	struct cg *cgp;
 	struct fs *fs;
 	ino_t ino;
 
 	/*
 	 * If this is a background write, we did this at the time that
 	 * the copy was made, so do not need to do it again.
 	 */
 	if (bmsafemap->sm_state & IOSTARTED)
 		return;
 	bmsafemap->sm_state |= IOSTARTED;
 	/*
 	 * Clear any inode allocations which are pending journal writes.
 	 */
 	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
 		cgp = (struct cg *)bp->b_data;
 		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
 		inosused = cg_inosused(cgp);
 		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
 			/* Index of the inode within this cylinder group. */
 			ino = jaddref->ja_ino % fs->fs_ipg;
 			if (isset(inosused, ino)) {
 				/* Undo the summary counts, then the bit. */
 				if ((jaddref->ja_mode & IFMT) == IFDIR)
 					cgp->cg_cs.cs_ndir--;
 				cgp->cg_cs.cs_nifree++;
 				clrbit(inosused, ino);
 				jaddref->ja_state &= ~ATTACHED;
 				jaddref->ja_state |= UNDONE;
 				stat_jaddref++;
 			} else
 				panic("initiate_write_bmsafemap: inode %ju "
 				    "marked free", (uintmax_t)jaddref->ja_ino);
 		}
 	}
 	/*
 	 * Clear any block allocations which are pending journal writes.
 	 */
 	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
 		cgp = (struct cg *)bp->b_data;
 		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
 		blksfree = cg_blksfree(cgp);
 		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
 			/* A zero return means nothing was rolled back. */
 			if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
 				continue;
 			panic("initiate_write_bmsafemap: block %jd "
 			    "marked free", jnewblk->jn_blkno);
 		}
 	}
 	/*
 	 * Move allocation lists to the written lists so they can be
 	 * cleared once the block write is complete.
 	 */
 	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
 	    inodedep, id_deps);
 	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
 	    newblk, nb_deps);
 	LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
 	    wk_list);
 }
 
 /*
  * Decide what to do with a buffer whose write reported an error.  If the
  * buffer cannot be mapped to a softdep mount, or ffs_fsfail_cleanup()
  * declines to handle the error, the buffer is left exactly as it was.
  */
 void
 softdep_handle_error(struct buf *bp)
 {
 	struct ufsmount *mnt;
 
 	mnt = softdep_bp_to_mp(bp);
 	if (mnt == NULL || !ffs_fsfail_cleanup(mnt, bp->b_error))
 		return;
 
 	/*
 	 * No future writes will succeed, so the on-disk image is safe.
 	 * Pretend that this write succeeded so that the softdep state
 	 * will be cleaned up naturally.
 	 */
 	bp->b_error = 0;
 	bp->b_ioflags &= ~BIO_ERROR;
 }
 
 /*
  * This routine is called during the completion interrupt
  * service routine for a disk write (from the procedure called
  * by the device driver to inform the filesystem caches of
  * a request completion).  It should be called early in this
  * procedure, before the block is made available to other
  * processes or other routines are called.
  *
  * Every dependency attached to bp is dispatched by type.  Handlers that
  * return non-zero have their work item collected on a local list and
  * re-attached to bp once the whole list has been processed.  If the
  * write failed (and the buffer is not B_INVAL) only the roll-forward
  * handlers are run, with flags of 0, and the dependencies stay on bp.
  */
 static void 
 softdep_disk_write_complete(bp)
 	struct buf *bp;		/* describes the completed disk write */
 {
 	struct worklist *wk;
 	struct worklist *owk;
 	struct ufsmount *ump;
 	struct workhead reattach;
 	struct freeblks *freeblks;
 	struct buf *sbp;
 
 	ump = softdep_bp_to_mp(bp);
 	KASSERT(LIST_EMPTY(&bp->b_dep) || ump != NULL,
 	    ("softdep_disk_write_complete: softdep_bp_to_mp returned NULL "
 	     "with outstanding dependencies for buffer %p", bp));
 	if (ump == NULL)
 		return;
 	/* Give the error handler a chance to clear BIO_ERROR first. */
 	if ((bp->b_ioflags & BIO_ERROR) != 0)
 		softdep_handle_error(bp);
 	/*
 	 * If an error occurred while doing the write, then the data
 	 * has not hit the disk and the dependencies cannot be processed.
 	 * But we do have to go through and roll forward any dependencies
 	 * that were rolled back before the disk write.
 	 */
 	sbp = NULL;
 	ACQUIRE_LOCK(ump);
 	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) {
 		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 			switch (wk->wk_type) {
 
 			case D_PAGEDEP:
 				handle_written_filepage(WK_PAGEDEP(wk), bp, 0);
 				continue;
 
 			case D_INODEDEP:
 				handle_written_inodeblock(WK_INODEDEP(wk),
 				    bp, 0);
 				continue;
 
 			case D_BMSAFEMAP:
 				handle_written_bmsafemap(WK_BMSAFEMAP(wk),
 				    bp, 0);
 				continue;
 
 			case D_INDIRDEP:
 				handle_written_indirdep(WK_INDIRDEP(wk),
 				    bp, &sbp, 0);
 				continue;
 			default:
 				/* nothing to roll forward */
 				continue;
 			}
 		}
 		FREE_LOCK(ump);
 		/* sbp may have been set by handle_written_indirdep(). */
 		if (sbp)
 			brelse(sbp);
 		return;
 	}
 	LIST_INIT(&reattach);
 
 	/*
 	 * Ump SU lock must not be released anywhere in this code segment.
 	 */
 	owk = NULL;
 	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		atomic_add_long(&dep_write[wk->wk_type], 1);
 		/*
 		 * Seeing the same item at the head twice in a row means the
 		 * removal above did not take effect.
 		 */
 		if (wk == owk)
 			panic("duplicate worklist: %p\n", wk);
 		owk = wk;
 		switch (wk->wk_type) {
 
 		case D_PAGEDEP:
 			if (handle_written_filepage(WK_PAGEDEP(wk), bp,
 			    WRITESUCCEEDED))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
 		case D_INODEDEP:
 			if (handle_written_inodeblock(WK_INODEDEP(wk), bp,
 			    WRITESUCCEEDED))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
 		case D_BMSAFEMAP:
 			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp,
 			    WRITESUCCEEDED))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
 		case D_MKDIR:
 			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
 			continue;
 
 		case D_ALLOCDIRECT:
 			wk->wk_state |= COMPLETE;
 			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
 			continue;
 
 		case D_ALLOCINDIR:
 			wk->wk_state |= COMPLETE;
 			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
 			continue;
 
 		case D_INDIRDEP:
 			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp,
 			    WRITESUCCEEDED))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
 		case D_FREEBLKS:
 			wk->wk_state |= COMPLETE;
 			freeblks = WK_FREEBLKS(wk);
 			/*
 			 * Queue the freeblks only once it is fully complete
 			 * and has no journal block dependencies left.
 			 */
 			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
 			    LIST_EMPTY(&freeblks->fb_jblkdephd))
 				add_to_worklist(wk, WK_NODELAY);
 			continue;
 
 		case D_FREEWORK:
 			handle_written_freework(WK_FREEWORK(wk));
 			/*
 			 * break only leaves the switch; nothing follows it
 			 * in the loop body, so this acts like the continue
 			 * used by the other cases.
 			 */
 			break;
 
 		case D_JSEGDEP:
 			free_jsegdep(WK_JSEGDEP(wk));
 			continue;
 
 		case D_JSEG:
 			handle_written_jseg(WK_JSEG(wk), bp);
 			continue;
 
 		case D_SBDEP:
 			if (handle_written_sbdep(WK_SBDEP(wk), bp))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
 		case D_FREEDEP:
 			free_freedep(WK_FREEDEP(wk));
 			continue;
 
 		default:
 			panic("handle_disk_write_complete: Unknown type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 	/*
 	 * Reattach any requests that must be redone.
 	 */
 	while ((wk = LIST_FIRST(&reattach)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		WORKLIST_INSERT(&bp->b_dep, wk);
 	}
 	FREE_LOCK(ump);
 	/* Release the buffer handed back through sbp only after unlocking. */
 	if (sbp)
 		brelse(sbp);
 }
 
 /*
  * Called from within softdep_disk_write_complete above.
  *
  * If adp is ALLCOMPLETE and no earlier entry on its active list still
  * rolls back to a fragment, move adp and every ALLCOMPLETE entry that
  * directly follows it from the active list to wkhd.  When wkhd is NULL
  * the inodedep's id_bufwait list is used instead.  The per-mount softdep
  * lock must be held by the caller (asserted via LOCK_OWNED).
  */
 static void 
 handle_allocdirect_partdone(adp, wkhd)
 	struct allocdirect *adp;	/* the completed allocdirect */
 	struct workhead *wkhd;		/* Work to do when inode is written. */
 {
 	struct allocdirectlst *listhead;
 	struct allocdirect *listadp;
 	struct inodedep *inodedep;
 	long bsize;
 
 	LOCK_OWNED(VFSTOUFS(adp->ad_block.nb_list.wk_mp));
 	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
 		return;
 	/*
 	 * The on-disk inode cannot claim to be any larger than the last
 	 * fragment that has been written. Otherwise, the on-disk inode
 	 * might have fragments that were not the last block in the file
 	 * which would corrupt the filesystem. Thus, we cannot free any
 	 * allocdirects after one whose ad_oldblkno claims a fragment as
 	 * these blocks must be rolled back to zero before writing the inode.
 	 * We check the currently active set of allocdirects in id_inoupdt
 	 * or id_extupdt as appropriate.
 	 */
 	inodedep = adp->ad_inodedep;
 	bsize = inodedep->id_fs->fs_bsize;
 	if (adp->ad_state & EXTDATA)
 		listhead = &inodedep->id_extupdt;
 	else
 		listhead = &inodedep->id_inoupdt;
 	TAILQ_FOREACH(listadp, listhead, ad_next) {
 		/* found our block */
 		if (listadp == adp)
 			break;
 		/* continue if ad_oldlbn is not a fragment */
 		if (listadp->ad_oldsize == 0 ||
 		    listadp->ad_oldsize == bsize)
 			continue;
 		/* hit a fragment */
 		return;
 	}
 	/*
 	 * If we have reached the end of the current list without
 	 * finding the just finished dependency, then it must be
 	 * on the future dependency list. Future dependencies cannot
 	 * be freed until they are moved to the current list.
 	 */
 	if (listadp == NULL) {
 #ifdef INVARIANTS
 		/* Verify that adp really is on the matching "new" list. */
 		if (adp->ad_state & EXTDATA)
 			listhead = &inodedep->id_newextupdt;
 		else
 			listhead = &inodedep->id_newinoupdt;
 		TAILQ_FOREACH(listadp, listhead, ad_next)
 			/* found our block */
 			if (listadp == adp)
 				break;
 		if (listadp == NULL)
 			panic("handle_allocdirect_partdone: lost dep");
 #endif /* INVARIANTS */
 		return;
 	}
 	/*
 	 * If we have found the just finished dependency, then queue
 	 * it along with anything that follows it that is complete.
 	 * Since the pointer has not yet been written in the inode
 	 * as the dependency prevents it, place the allocdirect on the
 	 * bufwait list where it will be freed once the pointer is
 	 * valid.
 	 */
 	if (wkhd == NULL)
 		wkhd = &inodedep->id_bufwait;
 	for (; adp; adp = listadp) {
 		/* Fetch the successor first; adp is unlinked below. */
 		listadp = TAILQ_NEXT(adp, ad_next);
 		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
 			return;
 		TAILQ_REMOVE(listhead, adp, ad_next);
 		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
 	}
 }
 
 /*
  * Called from within softdep_disk_write_complete above.  This routine
  * completes successfully written allocindirs.
  */
 static void
 handle_allocindir_partdone(aip)
 	struct allocindir *aip;		/* the completed allocindir */
 {
 	struct indirdep *indirdep;
 
 	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
 		return;
 	indirdep = aip->ai_indirdep;
 	LIST_REMOVE(aip, ai_next);
 	/*
 	 * Don't set a pointer while the buffer is undergoing IO or while
 	 * we have active truncations.
 	 */
 	if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
 		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
 		return;
 	}
 	if (indirdep->ir_state & UFS1FMT)
 		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
 		    aip->ai_newblkno;
 	else
 		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
 		    aip->ai_newblkno;
 	/*
 	 * Await the pointer write before freeing the allocindir.
 	 */
 	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
 }
 
 /*
  * Drain a jwork list, releasing the segments held by each entry.
  * Items are removed from the head one at a time and disposed of
  * according to their type; an unexpected type is fatal.
  */
 static void
 handle_jwork(wkhd)
 	struct workhead *wkhd;
 {
 	struct worklist *work;
 
 	for (;;) {
 		work = LIST_FIRST(wkhd);
 		if (work == NULL)
 			break;
 		WORKLIST_REMOVE(work);
 		switch (work->wk_type) {
 		case D_JSEGDEP:
 			free_jsegdep(WK_JSEGDEP(work));
 			break;
 		case D_FREEDEP:
 			free_freedep(WK_FREEDEP(work));
 			break;
 		case D_FREEFRAG:
 			/* Drop the segment reference, then the item itself. */
 			rele_jseg(WK_JSEG(WK_FREEFRAG(work)->ff_jdep));
 			WORKITEM_FREE(work, D_FREEFRAG);
 			break;
 		case D_FREEWORK:
 			handle_written_freework(WK_FREEWORK(work));
 			break;
 		default:
 			panic("handle_jwork: Unknown type %s\n",
 			    TYPENAME(work->wk_type));
 		}
 	}
 }
 
 /*
  * Handle the bufwait list on an inode when it is safe to release items
  * held there.  This normally happens after an inode block is written but
  * may be delayed and handled later if there are pending journal items that
  * are not yet safe to be released.
  *
  * inodedep is the inode dependency whose id_bufwait list is drained.
  * refhd, when not NULL, receives the D_JSEGDEP and D_JADDREF items
  * instead of having them freed here.
  *
  * Returns the D_FREEFILE item found on the list, or NULL if there was
  * none; the caller is responsible for queueing it.  Panics if more than
  * one freefile is present or if an unknown item type is encountered.
  */
 static struct freefile *
 handle_bufwait(inodedep, refhd)
 	struct inodedep *inodedep;
 	struct workhead *refhd;
 {
 	struct jaddref *jaddref;
 	struct freefile *freefile;
 	struct worklist *wk;
 
 	freefile = NULL;
 	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		switch (wk->wk_type) {
 		case D_FREEFILE:
 			/*
 			 * We defer adding freefile to the worklist
 			 * until all other additions have been made to
 			 * ensure that it will be done after all the
 			 * old blocks have been freed.
 			 */
 			if (freefile != NULL)
 				panic("handle_bufwait: freefile");
 			freefile = WK_FREEFILE(wk);
 			continue;
 
 		case D_MKDIR:
 			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
 			continue;
 
 		case D_DIRADD:
 			diradd_inode_written(WK_DIRADD(wk), inodedep);
 			continue;
 
 		case D_FREEFRAG:
 			/* Queue the freefrag only once it is fully complete. */
 			wk->wk_state |= COMPLETE;
 			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
 				add_to_worklist(wk, 0);
 			continue;
 
 		case D_DIRREM:
 			wk->wk_state |= COMPLETE;
 			add_to_worklist(wk, 0);
 			continue;
 
 		case D_ALLOCDIRECT:
 		case D_ALLOCINDIR:
 			free_newblk(WK_NEWBLK(wk));
 			continue;
 
 		case D_JNEWBLK:
 			wk->wk_state |= COMPLETE;
 			free_jnewblk(WK_JNEWBLK(wk));
 			continue;
 
 		/*
 		 * Save freed journal segments and add references on
 		 * the supplied list which will delay their release
 		 * until the cg bitmap is cleared on disk.
 		 */
 		case D_JSEGDEP:
 			if (refhd == NULL)
 				free_jsegdep(WK_JSEGDEP(wk));
 			else
 				WORKLIST_INSERT(refhd, wk);
 			continue;
 
 		case D_JADDREF:
 			jaddref = WK_JADDREF(wk);
 			/* The jaddref always leaves the inode's reference list. */
 			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
 			    if_deps);
 			/*
 			 * Transfer any jaddrefs to the list to be freed with
 			 * the bitmap if we're handling a removed file.
 			 */
 			if (refhd == NULL) {
 				wk->wk_state |= COMPLETE;
 				free_jaddref(jaddref);
 			} else
 				WORKLIST_INSERT(refhd, wk);
 			continue;
 
 		default:
 			panic("handle_bufwait: Unknown type %p(%s)",
 			    wk, TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 	return (freefile);
 }
 /*
  * Called from within softdep_disk_write_complete above to restore
  * in-memory inode block contents to their most up-to-date state. Note
  * that this routine is always called from interrupt level with further
  * interrupts from this device blocked.
  *
  * If the write did not succeed, we will do all the roll-forward
  * operations, but we will not take the actions that will allow its
  * dependencies to be processed.
  */
 static int 
 handle_written_inodeblock(inodedep, bp, flags)
 	struct inodedep *inodedep;
 	struct buf *bp;		/* buffer containing the inode block */
 	int flags;
 {
 	struct freefile *freefile;
 	struct allocdirect *adp, *nextadp;
 	struct ufs1_dinode *dp1 = NULL;
 	struct ufs2_dinode *dp2 = NULL;
 	struct workhead wkhd;
 	int hadchanges, fstype;
 	ino_t freelink;
 
 	LIST_INIT(&wkhd);
 	hadchanges = 0;
 	freefile = NULL;
 	if ((inodedep->id_state & IOSTARTED) == 0)
 		panic("handle_written_inodeblock: not started");
 	inodedep->id_state &= ~IOSTARTED;
 	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
 		fstype = UFS1;
 		dp1 = (struct ufs1_dinode *)bp->b_data +
 		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
 		freelink = dp1->di_freelink;
 	} else {
 		fstype = UFS2;
 		dp2 = (struct ufs2_dinode *)bp->b_data +
 		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
 		freelink = dp2->di_freelink;
 	}
 	/*
 	 * Leave this inodeblock dirty until it's in the list.
 	 */
 	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED &&
 	    (flags & WRITESUCCEEDED)) {
 		struct inodedep *inon;
 
 		inon = TAILQ_NEXT(inodedep, id_unlinked);
 		if ((inon == NULL && freelink == 0) ||
 		    (inon && inon->id_ino == freelink)) {
 			if (inon)
 				inon->id_state |= UNLINKPREV;
 			inodedep->id_state |= UNLINKNEXT;
 		}
 		hadchanges = 1;
 	}
 	/*
 	 * If we had to rollback the inode allocation because of
 	 * bitmaps being incomplete, then simply restore it.
 	 * Keep the block dirty so that it will not be reclaimed until
 	 * all associated dependencies have been cleared and the
 	 * corresponding updates written to disk.
 	 */
 	if (inodedep->id_savedino1 != NULL) {
 		hadchanges = 1;
 		if (fstype == UFS1)
 			*dp1 = *inodedep->id_savedino1;
 		else
 			*dp2 = *inodedep->id_savedino2;
 		free(inodedep->id_savedino1, M_SAVEDINO);
 		inodedep->id_savedino1 = NULL;
 		if ((bp->b_flags & B_DELWRI) == 0)
 			stat_inode_bitmap++;
 		bdirty(bp);
 		/*
 		 * If the inode is clear here and GOINGAWAY it will never
 		 * be written.  Process the bufwait and clear any pending
 		 * work which may include the freefile.
 		 */
 		if (inodedep->id_state & GOINGAWAY)
 			goto bufwait;
 		return (1);
 	}
 	if (flags & WRITESUCCEEDED)
 		inodedep->id_state |= COMPLETE;
 	/*
 	 * Roll forward anything that had to be rolled back before 
 	 * the inode could be updated.
 	 */
 	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
 		nextadp = TAILQ_NEXT(adp, ad_next);
 		if (adp->ad_state & ATTACHED)
 			panic("handle_written_inodeblock: new entry");
 		if (fstype == UFS1) {
 			if (adp->ad_offset < UFS_NDADDR) {
 				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
 					panic("%s %s #%jd mismatch %d != %jd",
 					    "handle_written_inodeblock:",
 					    "direct pointer",
 					    (intmax_t)adp->ad_offset,
 					    dp1->di_db[adp->ad_offset],
 					    (intmax_t)adp->ad_oldblkno);
 				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
 			} else {
 				if (dp1->di_ib[adp->ad_offset - UFS_NDADDR] !=
 				    0)
 					panic("%s: %s #%jd allocated as %d",
 					    "handle_written_inodeblock",
 					    "indirect pointer",
 					    (intmax_t)adp->ad_offset -
 					    UFS_NDADDR,
 					    dp1->di_ib[adp->ad_offset -
 					    UFS_NDADDR]);
 				dp1->di_ib[adp->ad_offset - UFS_NDADDR] =
 				    adp->ad_newblkno;
 			}
 		} else {
 			if (adp->ad_offset < UFS_NDADDR) {
 				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
 					panic("%s: %s #%jd %s %jd != %jd",
 					    "handle_written_inodeblock",
 					    "direct pointer",
 					    (intmax_t)adp->ad_offset, "mismatch",
 					    (intmax_t)dp2->di_db[adp->ad_offset],
 					    (intmax_t)adp->ad_oldblkno);
 				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
 			} else {
 				if (dp2->di_ib[adp->ad_offset - UFS_NDADDR] !=
 				    0)
 					panic("%s: %s #%jd allocated as %jd",
 					    "handle_written_inodeblock",
 					    "indirect pointer",
 					    (intmax_t)adp->ad_offset -
 					    UFS_NDADDR,
 					    (intmax_t)
 					    dp2->di_ib[adp->ad_offset -
 					    UFS_NDADDR]);
 				dp2->di_ib[adp->ad_offset - UFS_NDADDR] =
 				    adp->ad_newblkno;
 			}
 		}
 		adp->ad_state &= ~UNDONE;
 		adp->ad_state |= ATTACHED;
 		hadchanges = 1;
 	}
 	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
 		nextadp = TAILQ_NEXT(adp, ad_next);
 		if (adp->ad_state & ATTACHED)
 			panic("handle_written_inodeblock: new entry");
 		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
 			panic("%s: direct pointers #%jd %s %jd != %jd",
 			    "handle_written_inodeblock",
 			    (intmax_t)adp->ad_offset, "mismatch",
 			    (intmax_t)dp2->di_extb[adp->ad_offset],
 			    (intmax_t)adp->ad_oldblkno);
 		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
 		adp->ad_state &= ~UNDONE;
 		adp->ad_state |= ATTACHED;
 		hadchanges = 1;
 	}
 	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
 		stat_direct_blk_ptrs++;
 	/*
 	 * Reset the file size to its most up-to-date value.
 	 */
 	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
 		panic("handle_written_inodeblock: bad size");
 	if (inodedep->id_savednlink > UFS_LINK_MAX)
 		panic("handle_written_inodeblock: Invalid link count "
 		    "%jd for inodedep %p", (uintmax_t)inodedep->id_savednlink,
 		    inodedep);
 	if (fstype == UFS1) {
 		if (dp1->di_nlink != inodedep->id_savednlink) { 
 			dp1->di_nlink = inodedep->id_savednlink;
 			hadchanges = 1;
 		}
 		if (dp1->di_size != inodedep->id_savedsize) {
 			dp1->di_size = inodedep->id_savedsize;
 			hadchanges = 1;
 		}
 	} else {
 		if (dp2->di_nlink != inodedep->id_savednlink) { 
 			dp2->di_nlink = inodedep->id_savednlink;
 			hadchanges = 1;
 		}
 		if (dp2->di_size != inodedep->id_savedsize) {
 			dp2->di_size = inodedep->id_savedsize;
 			hadchanges = 1;
 		}
 		if (dp2->di_extsize != inodedep->id_savedextsize) {
 			dp2->di_extsize = inodedep->id_savedextsize;
 			hadchanges = 1;
 		}
 	}
 	inodedep->id_savedsize = -1;
 	inodedep->id_savedextsize = -1;
 	inodedep->id_savednlink = -1;
 	/*
 	 * If there were any rollbacks in the inode block, then it must be
 	 * marked dirty so that its will eventually get written back in
 	 * its correct form.
 	 */
 	if (hadchanges) {
 		if (fstype == UFS2)
 			ffs_update_dinode_ckhash(inodedep->id_fs, dp2);
 		bdirty(bp);
 	}
 bufwait:
 	/*
 	 * If the write did not succeed, we have done all the roll-forward
 	 * operations, but we cannot take the actions that will allow its
 	 * dependencies to be processed.
 	 */
 	if ((flags & WRITESUCCEEDED) == 0)
 		return (hadchanges);
 	/*
 	 * Process any allocdirects that completed during the update.
 	 */
 	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
 		handle_allocdirect_partdone(adp, &wkhd);
 	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
 		handle_allocdirect_partdone(adp, &wkhd);
 	/*
 	 * Process deallocations that were held pending until the
 	 * inode had been written to disk. Freeing of the inode
 	 * is delayed until after all blocks have been freed to
 	 * avoid creation of new <vfsid, inum, lbn> triples
 	 * before the old ones have been deleted.  Completely
 	 * unlinked inodes are not processed until the unlinked
 	 * inode list is written or the last reference is removed.
 	 */
 	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
 		freefile = handle_bufwait(inodedep, NULL);
 		if (freefile && !LIST_EMPTY(&wkhd)) {
 			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
 			freefile = NULL;
 		}
 	}
 	/*
 	 * Move rolled forward dependency completions to the bufwait list
 	 * now that those that were already written have been processed.
 	 */
 	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
 		panic("handle_written_inodeblock: bufwait but no changes");
 	jwork_move(&inodedep->id_bufwait, &wkhd);
 
 	if (freefile != NULL) {
 		/*
 		 * If the inode is goingaway it was never written.  Fake up
 		 * the state here so free_inodedep() can succeed.
 		 */
 		if (inodedep->id_state & GOINGAWAY)
 			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
 		if (free_inodedep(inodedep) == 0)
 			panic("handle_written_inodeblock: live inodedep %p",
 			    inodedep);
 		add_to_worklist(&freefile->fx_list, 0);
 		return (0);
 	}
 
 	/*
 	 * If no outstanding dependencies, free it.
 	 */
 	if (free_inodedep(inodedep) ||
 	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
 	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
 	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
 	     LIST_FIRST(&inodedep->id_bufwait) == 0))
 		return (0);
 	return (hadchanges);
 }
 
 /*
  * Finish processing an indirdep after a write of its buffer has been
  * attempted.  Perform needed roll-forwards and kick off any dependencies
  * that can now be processed.
  *
  * If the write did not succeed, we will do all the roll-forward
  * operations, but we will not take the actions that will allow its
  * dependencies to be processed.
  *
  * Returns 1 if bp was re-dirtied (rollbacks were reverted, allocindirs
  * were rolled forward, or the write failed).  Returns 0 if there was
  * nothing left to write; in that case the indirdep is detached from its
  * buffers and the saved buffer, marked B_INVAL | B_NOCACHE, is handed
  * back through *bpp, which must be NULL on entry.
  */
 static int
 handle_written_indirdep(indirdep, bp, bpp, flags)
 	struct indirdep *indirdep;	/* dependency attached to bp */
 	struct buf *bp;			/* buffer whose write was attempted */
 	struct buf **bpp;		/* out: receives ir_savebp on detach */
 	int flags;			/* WRITESUCCEEDED if the write worked */
 {
 	struct allocindir *aip;
 	struct buf *sbp;
 	int chgs;
 
 	if (indirdep->ir_state & GOINGAWAY)
 		panic("handle_written_indirdep: indirdep gone");
 	if ((indirdep->ir_state & IOSTARTED) == 0)
 		panic("handle_written_indirdep: IO not started");
 	chgs = 0;
 	/*
 	 * If there were rollbacks revert them here.  The saved copy is
 	 * kept while truncations are still pending on ir_trunc.
 	 */
 	if (indirdep->ir_saveddata) {
 		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
 		if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
 			free(indirdep->ir_saveddata, M_INDIRDEP);
 			indirdep->ir_saveddata = NULL;
 		}
 		chgs = 1;
 	}
 	indirdep->ir_state &= ~(UNDONE | IOSTARTED);
 	indirdep->ir_state |= ATTACHED;
 	/*
 	 * If the write did not succeed, we have done all the roll-forward
 	 * operations, but we cannot take the actions that will allow its
 	 * dependencies to be processed.
 	 */
 	if ((flags & WRITESUCCEEDED) == 0) {
 		stat_indir_blk_ptrs++;
 		bdirty(bp);
 		return (1);
 	}
 	/*
 	 * Move allocindirs with written pointers to the completehd if
 	 * the indirdep's pointer is not yet written.  Otherwise
 	 * free them here.
 	 */
 	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) {
 		LIST_REMOVE(aip, ai_next);
 		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
 			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
 			    ai_next);
 			newblk_freefrag(&aip->ai_block);
 			continue;
 		}
 		free_newblk(&aip->ai_block);
 	}
 	/*
 	 * Move allocindirs that have finished dependency processing from
 	 * the done list to the write list after updating the pointers.
 	 */
 	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
 		while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) {
 			handle_allocindir_partdone(aip);
 			/*
 			 * The call above must have taken aip off the done
 			 * list, otherwise this loop would never terminate.
 			 */
 			if (aip == LIST_FIRST(&indirdep->ir_donehd))
 				panic("handle_written_indirdep: not gone");
 			chgs = 1;
 		}
 	}
 	/*
 	 * Preserve the indirdep if there were any changes or if it is not
 	 * yet valid on disk.
 	 */
 	if (chgs) {
 		stat_indir_blk_ptrs++;
 		bdirty(bp);
 		return (1);
 	}
 	/*
 	 * If there were no changes we can discard the savedbp and detach
 	 * ourselves from the buf.  We are only carrying completed pointers
 	 * in this case.
 	 */
 	sbp = indirdep->ir_savebp;
 	sbp->b_flags |= B_INVAL | B_NOCACHE;
 	indirdep->ir_savebp = NULL;
 	indirdep->ir_bp = NULL;
 	if (*bpp != NULL)
 		panic("handle_written_indirdep: bp already exists.");
 	*bpp = sbp;
 	/*
 	 * The indirdep may not be freed until its parent points at it.
 	 */
 	if (indirdep->ir_state & DEPCOMPLETE)
 		free_indirdep(indirdep);
 
 	return (0);
 }
 
 /*
  * Process a diradd entry after its dependent inode has been written.
  */
 static void
 diradd_inode_written(dap, inodedep)
 	struct diradd *dap;
 	struct inodedep *inodedep;
 {
 
 	LOCK_OWNED(VFSTOUFS(dap->da_list.wk_mp));
 	dap->da_state |= COMPLETE;
 	complete_diradd(dap);
 	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
 }
 
 /*
  * Report whether writing this bmsafemap will involve rollbacks, i.e.
  * whether it still has jaddrefs or jnewblks queued.  Must only be
  * called with the per-filesystem lock and the buf lock on the cg held.
  */
 static int
 bmsafemap_backgroundwrite(bmsafemap, bp)
 	struct bmsafemap *bmsafemap;
 	struct buf *bp;
 {
 	int has_rollbacks;
 
 	LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp));
 	/* Sample both journal lists before any write is initiated below. */
 	if (LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
 	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd))
 		has_rollbacks = 0;
 	else
 		has_rollbacks = 1;
 	/*
 	 * When a background write is being started the rollbacks have to
 	 * be applied as they stand right now rather than as they will be
 	 * when the IO is issued.  Nobody else inspects the contents of
 	 * the shadow buf, so doing it here is safe.
 	 */
 	if ((bp->b_xflags & BX_BKGRDMARKER) != 0)
 		initiate_write_bmsafemap(bmsafemap, bp);
 
 	return (has_rollbacks);
 }
 
 /*
  * Re-apply an allocation when a cg write is complete.
  *
  * Marks fragments jn_oldfrags .. jn_frags - 1 of the jnewblk's block as
  * allocated again in the blksfree map and adjusts the cylinder group
  * summary counts to match.  Panics if any of those fragments is already
  * clear (allocated) in the map.
  *
  * Returns the number of fragments that were rolled forward.
  */
 static int
 jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
 	struct jnewblk *jnewblk;
 	struct fs *fs;
 	struct cg *cgp;
 	uint8_t *blksfree;
 {
 	ufs1_daddr_t fragno;
 	ufs2_daddr_t blkno;
 	long cgbno, bbase;
 	int frags, blk;
 	int i;
 
 	frags = 0;
 	cgbno = dtogd(fs, jnewblk->jn_blkno);
 	/* Count the fragments to restore, checking each is still free. */
 	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
 		if (isclr(blksfree, cgbno + i))
 			panic("jnewblk_rollforward: re-allocated fragment");
 		frags++;
 	}
 	if (frags == fs->fs_frag) {
 		/* A whole block: take it out of the free-block accounting. */
 		blkno = fragstoblks(fs, cgbno);
 		ffs_clrblock(fs, blksfree, (long)blkno);
 		ffs_clusteracct(fs, cgp, blkno, -1);
 		cgp->cg_cs.cs_nbfree--;
 	} else {
 		/* bbase is the first fragment of the containing block. */
 		bbase = cgbno - fragnum(fs, cgbno);
 		cgbno += jnewblk->jn_oldfrags;
 		/* If a complete block had been reassembled, account for it. */
 		fragno = fragstoblks(fs, bbase);
 		if (ffs_isblock(fs, blksfree, fragno)) {
 			cgp->cg_cs.cs_nffree += fs->fs_frag;
 			ffs_clusteracct(fs, cgp, fragno, -1);
 			cgp->cg_cs.cs_nbfree--;
 		}
 		/* Decrement the old frags.  */
 		blk = blkmap(fs, blksfree, bbase);
 		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
 		/* Allocate the fragment */
 		for (i = 0; i < frags; i++)
 			clrbit(blksfree, cgbno + i);
 		cgp->cg_cs.cs_nffree -= frags;
 		/* Add back in counts associated with the new frags */
 		blk = blkmap(fs, blksfree, bbase);
 		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
 	}
 	return (frags);
 }
 
 /*
  * Complete a write to a bmsafemap structure.  Roll forward any bitmap
  * changes if it's not a background write.  Set all written dependencies
  * to DEPCOMPLETE and free the structure if possible.
  *
  * If the write did not succeed, we will do all the roll-forward
  * operations, but we will not take the actions that will allow its
  * dependencies to be processed.
  *
  * Returns 0 if the bmsafemap was freed because nothing remains to be
  * tracked.  Returns 1 if it was retained; in that case bp is re-dirtied
  * unless it is a background-write copy (BX_BKGRDMARKER).
  */
 static int
 handle_written_bmsafemap(bmsafemap, bp, flags)
 	struct bmsafemap *bmsafemap;
 	struct buf *bp;
 	int flags;
 {
 	struct newblk *newblk;
 	struct inodedep *inodedep;
 	struct jaddref *jaddref, *jatmp;
 	struct jnewblk *jnewblk, *jntmp;
 	struct ufsmount *ump;
 	uint8_t *inosused;
 	uint8_t *blksfree;
 	struct cg *cgp;
 	struct fs *fs;
 	ino_t ino;
 	int foreground;
 	int chgs;
 
 	if ((bmsafemap->sm_state & IOSTARTED) == 0)
 		panic("handle_written_bmsafemap: Not started\n");
 	ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
 	chgs = 0;
 	bmsafemap->sm_state &= ~IOSTARTED;
 	/* Only a buffer that is not a background-write copy is modified. */
 	foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
 	/*
 	 * If write was successful, release journal work that was waiting
 	 * on the write. Otherwise move the work back.
 	 */
 	if (flags & WRITESUCCEEDED)
 		handle_jwork(&bmsafemap->sm_freewr);
 	else
 		LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
 		    worklist, wk_list);
 
 	/*
 	 * Restore unwritten inode allocation pending jaddref writes.
 	 */
 	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
 		cgp = (struct cg *)bp->b_data;
 		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
 		inosused = cg_inosused(cgp);
 		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
 		    ja_bmdeps, jatmp) {
 			/* Only entries that were rolled back need work. */
 			if ((jaddref->ja_state & UNDONE) == 0)
 				continue;
 			/* Index of the inode within its cylinder group. */
 			ino = jaddref->ja_ino % fs->fs_ipg;
 			if (isset(inosused, ino))
 				panic("handle_written_bmsafemap: "
 				    "re-allocated inode");
 			/* Do the roll-forward only if it's a real copy. */
 			if (foreground) {
 				if ((jaddref->ja_mode & IFMT) == IFDIR)
 					cgp->cg_cs.cs_ndir++;
 				cgp->cg_cs.cs_nifree--;
 				setbit(inosused, ino);
 				chgs = 1;
 			}
 			jaddref->ja_state &= ~UNDONE;
 			jaddref->ja_state |= ATTACHED;
 			free_jaddref(jaddref);
 		}
 	}
 	/*
 	 * Restore any block allocations which are pending journal writes.
 	 */
 	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
 		cgp = (struct cg *)bp->b_data;
 		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
 		blksfree = cg_blksfree(cgp);
 		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
 		    jntmp) {
 			/* Only entries that were rolled back need work. */
 			if ((jnewblk->jn_state & UNDONE) == 0)
 				continue;
 			/* Do the roll-forward only if it's a real copy. */
 			if (foreground &&
 			    jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
 				chgs = 1;
 			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
 			jnewblk->jn_state |= ATTACHED;
 			free_jnewblk(jnewblk);
 		}
 	}
 	/*
 	 * If the write did not succeed, we have done all the roll-forward
 	 * operations, but we cannot take the actions that will allow its
 	 * dependencies to be processed.
 	 *
 	 * NOTE(review): sm_freewr was already spliced back onto sm_freehd
 	 * near the top of this function for failed writes, so the second
 	 * splice below looks like a no-op unless the journal processing
 	 * above can repopulate sm_freewr -- confirm.
 	 */
 	if ((flags & WRITESUCCEEDED) == 0) {
 		LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
 		    newblk, nb_deps);
 		LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
 		    worklist, wk_list);
 		if (foreground)
 			bdirty(bp);
 		return (1);
 	}
 	/*
 	 * The bitmap is on disk: every newblk that was waiting on this
 	 * write is now DEPCOMPLETE and is detached from the bmsafemap.
 	 */
 	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
 		newblk->nb_state |= DEPCOMPLETE;
 		newblk->nb_state &= ~ONDEPLIST;
 		newblk->nb_bmsafemap = NULL;
 		LIST_REMOVE(newblk, nb_deps);
 		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
 			handle_allocdirect_partdone(
 			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
 		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
 			handle_allocindir_partdone(
 			    WK_ALLOCINDIR(&newblk->nb_list));
 		else if (newblk->nb_list.wk_type != D_NEWBLK)
 			panic("handle_written_bmsafemap: Unexpected type: %s",
 			    TYPENAME(newblk->nb_list.wk_type));
 	}
 	/* Likewise for the inodedeps that were waiting on this write. */
 	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
 		inodedep->id_state |= DEPCOMPLETE;
 		inodedep->id_state &= ~ONDEPLIST;
 		LIST_REMOVE(inodedep, id_deps);
 		inodedep->id_bmsafemap = NULL;
 	}
 	/*
 	 * Take the bmsafemap off the sm_next list.  If nothing was rolled
 	 * forward and every remaining list is empty it can be freed;
 	 * otherwise it is put on the dirty cg list below.
 	 */
 	LIST_REMOVE(bmsafemap, sm_next);
 	if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
 	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
 	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
 	    LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
 	    LIST_EMPTY(&bmsafemap->sm_freehd)) {
 		LIST_REMOVE(bmsafemap, sm_hash);
 		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
 		return (0);
 	}
 	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
 	if (foreground)
 		bdirty(bp);
 	return (1);
 }
 
 /*
  * Try to free a mkdir dependency.  Nothing happens until the mkdir is
  * ALLCOMPLETE.  Once it is, its MKDIR_PARENT/MKDIR_BODY bit is cleared
  * from the owning diradd, which becomes DEPCOMPLETE when neither bit is
  * left, and the mkdir itself is released.
  */
 static void
 complete_mkdir(mkdir)
 	struct mkdir *mkdir;
 {
 	struct diradd *dap;
 	int donebits;
 
 	if ((mkdir->md_state & ALLCOMPLETE) == ALLCOMPLETE) {
 		LIST_REMOVE(mkdir, md_mkdirs);
 		dap = mkdir->md_diradd;
 		/* Which of the two mkdir halves does this item stand for? */
 		donebits = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
 		dap->da_state &= ~donebits;
 		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
 			dap->da_state |= DEPCOMPLETE;
 			complete_diradd(dap);
 		}
 		WORKITEM_FREE(mkdir, D_MKDIR);
 	}
 }
 
 /*
  * Handle the completion of a mkdir dependency.  The caller states which
  * kind (MKDIR_PARENT or MKDIR_BODY) it believes it is completing; a
  * mismatch with the mkdir's own state is fatal.
  */
 static void
 handle_written_mkdir(mkdir, type)
 	struct mkdir *mkdir;
 	int type;
 {
 	int kind;
 
 	kind = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
 	if (kind != type)
 		panic("handle_written_mkdir: bad type");
 	mkdir->md_state |= COMPLETE;
 	complete_mkdir(mkdir);
 }
 
 /*
  * Free a pagedep if nothing depends on it any more.  The pagedep is
  * kept while it is marked NEWBLOCK or while any of its dirrem, diradd,
  * pending or jmvref lists is non-empty.
  *
  * Returns 1 if the pagedep was freed, 0 if it is still in use.
  */
 static int
 free_pagedep(pagedep)
 	struct pagedep *pagedep;
 {
 	int bucket;
 
 	if ((pagedep->pd_state & NEWBLOCK) != 0 ||
 	    !LIST_EMPTY(&pagedep->pd_dirremhd))
 		return (0);
 	for (bucket = 0; bucket < DAHASHSZ; bucket++) {
 		if (!LIST_EMPTY(&pagedep->pd_diraddhd[bucket]))
 			return (0);
 	}
 	if (!LIST_EMPTY(&pagedep->pd_pendinghd) ||
 	    !LIST_EMPTY(&pagedep->pd_jmvrefhd))
 		return (0);
 	/* Unused: unhook it from the worklist and the hash, then free. */
 	if ((pagedep->pd_state & ONWORKLIST) != 0)
 		WORKLIST_REMOVE(&pagedep->pd_list);
 	LIST_REMOVE(pagedep, pd_hash);
 	WORKITEM_FREE(pagedep, D_PAGEDEP);
 
 	return (1);
 }
 
 /*
  * Called from within softdep_disk_write_complete above.
  * A write operation was just completed. Removed inodes can
  * now be freed and associated block pointers may be committed.
  * Note that this routine is always called from interrupt level
  * with further interrupts from this device blocked.
  *
  * If the write did not succeed, we will do all the roll-forward
  * operations, but we will not take the actions that will allow its
  * dependencies to be processed.
  *
  * Returns 1 if the buffer was re-dirtied (entries were rolled forward
  * or the write failed), otherwise 0 after trying to free the pagedep.
  */
 static int 
 handle_written_filepage(pagedep, bp, flags)
 	struct pagedep *pagedep;
 	struct buf *bp;		/* buffer containing the written page */
 	int flags;
 {
 	struct dirrem *dirrem;
 	struct diradd *dap, *nextdap;
 	struct direct *ep;
 	int i, chgs;
 
 	if ((pagedep->pd_state & IOSTARTED) == 0)
 		panic("handle_written_filepage: not started");
 	pagedep->pd_state &= ~IOSTARTED;
 	/* A failed write commits nothing; only undo the rollbacks. */
 	if ((flags & WRITESUCCEEDED) == 0)
 		goto rollforward;
 	/*
 	 * Process any directory removals that have been committed.
 	 */
 	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
 		LIST_REMOVE(dirrem, dm_next);
 		dirrem->dm_state |= COMPLETE;
 		dirrem->dm_dirinum = pagedep->pd_ino;
 		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
 		    ("handle_written_filepage: Journal entries not written."));
 		add_to_worklist(&dirrem->dm_list, 0);
 	}
 	/*
 	 * Free any directory additions that have been committed.
 	 * If it is a newly allocated block, we have to wait until
 	 * the on-disk directory inode claims the new block.
 	 */
 	if ((pagedep->pd_state & NEWBLOCK) == 0)
 		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
 			free_diradd(dap, NULL);
 rollforward:
 	/*
 	 * Uncommitted directory entries must be restored.
 	 */
 	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
 		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
 		     dap = nextdap) {
 			/* Fetch the successor first; dap may be moved below. */
 			nextdap = LIST_NEXT(dap, da_pdlist);
 			if (dap->da_state & ATTACHED)
 				panic("handle_written_filepage: attached");
 			/* Put the real inode number back into the entry. */
 			ep = (struct direct *)
 			    ((char *)bp->b_data + dap->da_offset);
 			ep->d_ino = dap->da_newinum;
 			dap->da_state &= ~UNDONE;
 			dap->da_state |= ATTACHED;
 			chgs = 1;
 			/*
 			 * If the inode referenced by the directory has
 			 * been written out, then the dependency can be
 			 * moved to the pending list.
 			 */
 			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
 				LIST_REMOVE(dap, da_pdlist);
 				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
 				    da_pdlist);
 			}
 		}
 	}
 	/*
 	 * If there were any rollbacks in the directory, then it must be
 	 * marked dirty so that it will eventually get written back in
 	 * its correct form.  The same applies when the write failed.
 	 */
 	if (chgs || (flags & WRITESUCCEEDED) == 0) {
 		if ((bp->b_flags & B_DELWRI) == 0)
 			stat_dir_entry++;
 		bdirty(bp);
 		return (1);
 	}
 	/*
 	 * If we are not waiting for a new directory block to be
 	 * claimed by its inode, then the pagedep will be freed.
 	 * Otherwise it will remain to track any new entries on
 	 * the page in case they are fsync'ed.
 	 */
 	free_pagedep(pagedep);
 	return (0);
 }
 
 /*
  * Writing back in-core inode structures.
  * 
  * The filesystem only accesses an inode's contents when it occupies an
  * "in-core" inode structure.  These "in-core" structures are separate from
  * the page frames used to cache inode blocks.  Only the latter are
  * transferred to/from the disk.  So, when the updated contents of the
  * "in-core" inode structure are copied to the corresponding in-memory inode
  * block, the dependencies are also transferred.  The following procedure is
  * called when copying a dirty "in-core" inode to a cached inode block.
  */
 
 /*
  * Called when an inode is loaded from disk.  The effective link count
  * starts out equal to the on-disk link count; if an inodedep exists for
  * the inode, the link count last written (when recorded) and the
  * outstanding link count delta are applied so that the correct
  * effective link count is put back.
  */
 void
 softdep_load_inodeblock(ip)
 	struct inode *ip;	/* the "in_core" copy of the inode */
 {
 	struct inodedep *inodedep;
 	struct ufsmount *ump;
 
 	ump = ITOUMP(ip);
 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 	    ("softdep_load_inodeblock called on non-softdep filesystem"));
 	/* Without an inodedep the two link counts simply agree. */
 	ip->i_effnlink = ip->i_nlink;
 	ACQUIRE_LOCK(ump);
 	if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) != 0) {
 		/*
 		 * Prefer the link count we last wrote over the one that
 		 * was just read, when one has been recorded.
 		 */
 		if (inodedep->id_nlinkwrote != -1 &&
 		    inodedep->id_nlinkwrote != ip->i_nlink) {
 			KASSERT(ip->i_nlink == 0 &&
 			    (ump->um_flags & UM_FSFAIL_CLEANUP) != 0,
 			    ("read bad i_nlink value"));
 			ip->i_nlink = inodedep->id_nlinkwrote;
 			ip->i_effnlink = ip->i_nlink;
 		}
 		ip->i_effnlink -= inodedep->id_nlinkdelta;
 		KASSERT(ip->i_effnlink >= 0,
 		    ("softdep_load_inodeblock: negative i_effnlink"));
 	}
 	FREE_LOCK(ump);
 }
 
 /*
  * This routine is called just before the "in-core" inode
  * information is to be copied to the in-memory inode block.
  * Recall that an inode block contains several inodes. If
  * the force flag (waitfor) is set, then the dependencies will be
  * cleared so that the update can always be made. Note that
  * the buffer is locked when this routine is called, so we
  * will never be in the middle of writing the inode block
  * to disk.
  *
  * The routine returns nothing; a failure to write the bitmap buffer
  * in the forced case is reported through softdep_error().
  */
 void 
 softdep_update_inodeblock(ip, bp, waitfor)
 	struct inode *ip;	/* the "in_core" copy of the inode */
 	struct buf *bp;		/* the buffer containing the inode block */
 	int waitfor;		/* nonzero => update must be allowed */
 {
 	struct inodedep *inodedep;
 	struct inoref *inoref;
 	struct ufsmount *ump;
 	struct worklist *wk;
 	struct mount *mp;
 	struct buf *ibp;
 	struct fs *fs;
 	int error;
 
 	ump = ITOUMP(ip);
 	mp = UFSTOVFS(ump);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_update_inodeblock called on non-softdep filesystem"));
 	fs = ump->um_fs;
 	/*
 	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
 	 * does not have access to the in-core ip so must write directly into
 	 * the inode block buffer when setting freelink.
 	 */
 	if (fs->fs_magic == FS_UFS1_MAGIC)
 		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
 	else
 		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
 	/*
 	 * If the effective link count is not equal to the actual link
 	 * count, then we must track the difference in an inodedep while
 	 * the inode is (potentially) tossed out of the cache. Otherwise,
 	 * if there is no existing inodedep, then there are no dependencies
 	 * to track.
 	 */
 	ACQUIRE_LOCK(ump);
 again:
 	/* The inodedep is looked up afresh after every jwait() below. */
 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
 		FREE_LOCK(ump);
 		if (ip->i_effnlink != ip->i_nlink)
 			panic("softdep_update_inodeblock: bad link count");
 		return;
 	}
 	KASSERT(ip->i_nlink >= inodedep->id_nlinkdelta,
 	    ("softdep_update_inodeblock inconsistent ip %p i_nlink %d "
 	    "inodedep %p id_nlinkdelta %jd",
 	    ip, ip->i_nlink, inodedep, (intmax_t)inodedep->id_nlinkdelta));
 	/* Remember the link count that is about to go into the buffer. */
 	inodedep->id_nlinkwrote = ip->i_nlink;
 	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
 		panic("softdep_update_inodeblock: bad delta");
 	/*
 	 * If we're flushing all dependencies we must also move any waiting
 	 * for journal writes onto the bufwait list prior to I/O.
 	 */
 	if (waitfor) {
 		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
 			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
 			    == DEPCOMPLETE) {
 				jwait(&inoref->if_list, MNT_WAIT);
 				goto again;
 			}
 		}
 	}
 	/*
 	 * Changes have been initiated. Anything depending on these
 	 * changes cannot occur until this inode has been written.
 	 */
 	inodedep->id_state &= ~COMPLETE;
 	if ((inodedep->id_state & ONWORKLIST) == 0)
 		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
 	/*
 	 * Any new dependencies associated with the incore inode must 
 	 * now be moved to the list associated with the buffer holding
 	 * the in-memory copy of the inode. Once merged process any
 	 * allocdirects that are completed by the merger.
 	 */
 	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
 	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
 		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
 		    NULL);
 	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
 	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
 		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
 		    NULL);
 	/*
 	 * Now that the inode has been pushed into the buffer, the
 	 * operations dependent on the inode being written to disk
 	 * can be moved to the id_bufwait so that they will be
 	 * processed when the buffer I/O completes.
 	 */
 	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
 	}
 	/*
 	 * Newly allocated inodes cannot be written until the bitmap
 	 * that allocates them have been written (indicated by
 	 * DEPCOMPLETE being set in id_state). If we are doing a
 	 * forced sync (e.g., an fsync on a file), we force the bitmap
 	 * to be written so that the update can be done.
 	 */
 	if (waitfor == 0) {
 		FREE_LOCK(ump);
 		return;
 	}
 retry:
 	/* Nothing to force if the bitmap is written or the inode is dying. */
 	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
 		FREE_LOCK(ump);
 		return;
 	}
 	ibp = inodedep->id_bmsafemap->sm_buf;
 	ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT);
 	if (ibp == NULL) {
 		/*
 		 * If ibp came back as NULL, the dependency could have been
 		 * freed while we slept.  Look it up again, and check to see
 		 * that it has completed.
 		 */
 		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
 			goto retry;
 		FREE_LOCK(ump);
 		return;
 	}
 	/* Write the bitmap buffer synchronously, without the lock held. */
 	FREE_LOCK(ump);
 	if ((error = bwrite(ibp)) != 0)
 		softdep_error("softdep_update_inodeblock: bwrite", error);
 }
 
 /*
  * Merge a new inode dependency list (such as id_newinoupdt) into an
  * old inode dependency list (such as id_inoupdt).  Both lists are
  * walked in ad_offset order; an entry from the new list is placed in
  * front of the first old entry whose offset is not smaller, and when
  * the two offsets are equal the pair is combined by
  * allocdirect_merge().  The new list is empty on return.
  */
 static void
 merge_inode_lists(newlisthead, oldlisthead)
 	struct allocdirectlst *newlisthead;
 	struct allocdirectlst *oldlisthead;
 {
 	struct allocdirect *oldent, *newent;
 
 	newent = TAILQ_FIRST(newlisthead);
 	if (newent != NULL)
 		LOCK_OWNED(VFSTOUFS(newent->ad_block.nb_list.wk_mp));
 	oldent = TAILQ_FIRST(oldlisthead);
 	while (oldent != NULL && newent != NULL) {
 		/* Skip old entries that sort before the new one. */
 		if (oldent->ad_offset < newent->ad_offset) {
 			oldent = TAILQ_NEXT(oldent, ad_next);
 			continue;
 		}
 		TAILQ_REMOVE(newlisthead, newent, ad_next);
 		TAILQ_INSERT_BEFORE(oldent, newent, ad_next);
 		if (oldent->ad_offset == newent->ad_offset) {
 			allocdirect_merge(oldlisthead, newent,
 			    oldent);
 			/* Carry on from the entry that was just inserted. */
 			oldent = newent;
 		}
 		newent = TAILQ_FIRST(newlisthead);
 	}
 	/* Whatever is left on the new list goes at the tail of the old. */
 	for (newent = TAILQ_FIRST(newlisthead); newent != NULL;
 	    newent = TAILQ_FIRST(newlisthead)) {
 		TAILQ_REMOVE(newlisthead, newent, ad_next);
 		TAILQ_INSERT_TAIL(oldlisthead, newent, ad_next);
 	}
 }
 
 /*
  * If we are doing an fsync, then we must ensure that any directory
  * entries for the inode have been written after the inode gets to disk.
  *
  * For each diradd found at the head of the inodedep's id_pendinghd
  * list, the parent directory is locked, optionally flushed
  * (ffs_update(), then ffs_syncvnode() if the pagedep is still marked
  * NEWBLOCK) and the directory block holding the entry is written
  * synchronously.
  *
  * Returns 0 if there is nothing (more) to do or the filesystem is not
  * using soft updates, ENOENT if vp was doomed while it was unlocked in
  * order to lock the parent, or the error from ffs_vgetf(),
  * ffs_update(), ffs_syncvnode(), bread() or bwrite().
  */
 int
 softdep_fsync(vp)
 	struct vnode *vp;	/* the "in_core" copy of the inode */
 {
 	struct inodedep *inodedep;
 	struct pagedep *pagedep;
 	struct inoref *inoref;
 	struct ufsmount *ump;
 	struct worklist *wk;
 	struct diradd *dap;
 	struct mount *mp;
 	struct vnode *pvp;
 	struct inode *ip;
 	struct buf *bp;
 	struct fs *fs;
 	struct thread *td = curthread;
 	int error, flushparent, pagedep_new_block;
 	ino_t parentino;
 	ufs_lbn_t lbn;
 
 	ip = VTOI(vp);
 	mp = vp->v_mount;
 	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;
 	if (MOUNTEDSOFTDEP(mp) == 0)
 		return (0);
 	ACQUIRE_LOCK(ump);
 restart:
 	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
 		FREE_LOCK(ump);
 		return (0);
 	}
 	/*
 	 * Wait for journal writes of inorefs that are DEPCOMPLETE and not
 	 * GOINGAWAY, looking the inodedep up again after each wait.
 	 */
 	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
 		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
 		    == DEPCOMPLETE) {
 			jwait(&inoref->if_list, MNT_WAIT);
 			goto restart;
 		}
 	}
 	if (!LIST_EMPTY(&inodedep->id_inowait) ||
 	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
 	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
 		panic("softdep_fsync: pending ops %p", inodedep);
 	for (error = 0, flushparent = 0; ; ) {
 		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
 			break;
 		if (wk->wk_type != D_DIRADD)
 			panic("softdep_fsync: Unexpected type %s",
 			    TYPENAME(wk->wk_type));
 		dap = WK_DIRADD(wk);
 		/*
 		 * Flush our parent if this directory entry has a MKDIR_PARENT
 		 * dependency or is contained in a newly allocated block.
 		 */
 		if (dap->da_state & DIRCHG)
 			pagedep = dap->da_previous->dm_pagedep;
 		else
 			pagedep = dap->da_pagedep;
 		/* Copy what is needed before the softdep lock is dropped. */
 		parentino = pagedep->pd_ino;
 		lbn = pagedep->pd_lbn;
 		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
 			panic("softdep_fsync: dirty");
 		if ((dap->da_state & MKDIR_PARENT) ||
 		    (pagedep->pd_state & NEWBLOCK))
 			flushparent = 1;
 		else
 			flushparent = 0;
 		/*
 		 * If we are being fsync'ed as part of vgone'ing this vnode,
 		 * then we will not be able to release and recover the
 		 * vnode below, so we just have to give up on writing its
 		 * directory entry out. It will eventually be written, just
 		 * not now, but then the user was not asking to have it
 		 * written, so we are not breaking any promises.
 		 */
 		if (VN_IS_DOOMED(vp))
 			break;
 		/*
 		 * We prevent deadlock by always fetching inodes from the
 		 * root, moving down the directory tree. Thus, when fetching
 		 * our parent directory, we first try to get the lock. If
 		 * that fails, we must unlock ourselves before requesting
 		 * the lock on our parent. See the comment in ufs_lookup
 		 * for details on possible races.
 		 */
 		FREE_LOCK(ump);
 		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
 		    FFSV_FORCEINSMQ)) {
 			/*
 			 * Unmount cannot proceed after unlock because
 			 * caller must have called vn_start_write().
 			 */
 			VOP_UNLOCK(vp);
 			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
 			    &pvp, FFSV_FORCEINSMQ);
 			/*
 			 * pvp is only usable when the lookup succeeded (the
 			 * vput() below is guarded the same way), so it must
 			 * not be dereferenced on the error path.
 			 */
 			MPASS(error != 0 || VTOI(pvp)->i_mode != 0);
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			if (VN_IS_DOOMED(vp)) {
 				if (error == 0)
 					vput(pvp);
 				error = ENOENT;
 			}
 			if (error != 0)
 				return (error);
 		}
 		/*
 		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
 		 * that are contained in direct blocks will be resolved by 
 		 * doing a ffs_update. Pagedeps contained in indirect blocks
 		 * may require a complete sync'ing of the directory. So, we
 		 * try the cheap and fast ffs_update first, and if that fails,
 		 * then we do the slower ffs_syncvnode of the directory.
 		 */
 		if (flushparent) {
 			int locked;
 
 			if ((error = ffs_update(pvp, 1)) != 0) {
 				vput(pvp);
 				return (error);
 			}
 			ACQUIRE_LOCK(ump);
 			locked = 1;
 			/*
 			 * The dependencies may have changed while the lock
 			 * was dropped, so the pagedep is found again rather
 			 * than reusing the pointer obtained above.
 			 */
 			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
 				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
 					if (wk->wk_type != D_DIRADD)
 						panic("softdep_fsync: Unexpected type %s",
 						      TYPENAME(wk->wk_type));
 					dap = WK_DIRADD(wk);
 					if (dap->da_state & DIRCHG)
 						pagedep = dap->da_previous->dm_pagedep;
 					else
 						pagedep = dap->da_pagedep;
 					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
 					FREE_LOCK(ump);
 					locked = 0;
 					if (pagedep_new_block && (error =
 					    ffs_syncvnode(pvp, MNT_WAIT, 0))) {
 						vput(pvp);
 						return (error);
 					}
 				}
 			}
 			if (locked)
 				FREE_LOCK(ump);
 		}
 		/*
 		 * Flush directory page containing the inode's name.
 		 *
 		 * NOTE(review): confirm that bread() leaves bp valid when it
 		 * fails, as the brelse() below assumes.
 		 */
 		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
 		    &bp);
 		if (error == 0)
 			error = bwrite(bp);
 		else
 			brelse(bp);
 		vput(pvp);
 		/*
 		 * NOTE(review): the loop only continues when
 		 * ffs_fsfail_cleanup() returns true; confirm what it returns
 		 * for error == 0, since a false result ends the function
 		 * after the first directory entry.
 		 */
 		if (!ffs_fsfail_cleanup(ump, error))
 			return (error);
 		ACQUIRE_LOCK(ump);
 		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
 			break;
 	}
 	FREE_LOCK(ump);
 	return (0);
 }
 
 /*
  * Flush all the dirty bitmaps associated with the block device
  * before flushing the rest of the dirty blocks so as to reduce
  * the number of dependencies that will have to be rolled back.
  * The dirty list is rescanned from the start after every write that
  * is issued, because the bufobj lock is dropped to issue it.
  *
  * XXX Unused?
  */
 void
 softdep_fsync_mountdev(vp)
 	struct vnode *vp;
 {
 	struct buf *bp, *nbp;
 	struct worklist *wk;
 	struct bufobj *bo;
 
 	if (!vn_isdisk(vp, NULL))
 		panic("softdep_fsync_mountdev: vnode not a disk");
 	bo = &vp->v_bufobj;
 restart:
 	BO_LOCK(bo);
 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 		/* A buffer we cannot lock right away is already scheduled. */
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 			continue;
 
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("softdep_fsync_mountdev: not dirty");
 		/*
 		 * Only bitmaps that carry outstanding dependencies and have
 		 * no background write in progress are pushed out here.
 		 */
 		wk = LIST_FIRST(&bp->b_dep);
 		if (wk != NULL && wk->wk_type == D_BMSAFEMAP &&
 		    (bp->b_vflags & BV_BKGRDINPROG) == 0) {
 			BO_UNLOCK(bo);
 			bremfree(bp);
 			(void) bawrite(bp);
 			goto restart;
 		}
 		BUF_UNLOCK(bp);
 	}
 	drain_output(vp);
 	BO_UNLOCK(bo);
 }
 
 /*
  * Sync all cylinder groups that were dirty at the time this function is
  * called.  Newly dirtied cgs will be inserted before the sentinel.  This
  * is used to flush freedep activity that may be holding up writes to an
  * indirect block.
  *
  * With waitfor == MNT_NOWAIT each buffer is started with bawrite() and
  * its result is not collected; otherwise bwrite() is used and the first
  * failure ends the scan.  Returns that error, or 0.
  */
 static int
 sync_cgs(mp, waitfor)
 	struct mount *mp;
 	int waitfor;
 {
 	struct bmsafemap *bmsafemap;
 	struct bmsafemap *sentinel;
 	struct ufsmount *ump;
 	struct buf *bp;
 	int error;
 
 	/*
 	 * The sentinel is a zeroed dummy bmsafemap recognised by
 	 * sm_cg == -1.  It marks our position in the dirty cg list so the
 	 * walk can resume after the softdep lock has been dropped.
 	 */
 	sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
 	sentinel->sm_cg = -1;
 	ump = VFSTOUFS(mp);
 	error = 0;
 	ACQUIRE_LOCK(ump);
 	LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
 	/*
 	 * Always examine the entry directly after the sentinel; moving the
 	 * sentinel past an entry is what advances the iteration.
 	 */
 	for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
 	    bmsafemap = LIST_NEXT(sentinel, sm_next)) {
 		/* Skip sentinels and cgs with no work to release. */
 		if (bmsafemap->sm_cg == -1 ||
 		    (LIST_EMPTY(&bmsafemap->sm_freehd) &&
 		    LIST_EMPTY(&bmsafemap->sm_freewr))) {
 			LIST_REMOVE(sentinel, sm_next);
 			LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
 			continue;
 		}
 		/*
 		 * If we don't get the lock and we're waiting try again, if
 		 * not move on to the next buf and try to sync it.
 		 */
 		bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor);
 		if (bp == NULL && waitfor == MNT_WAIT)
 			continue;	/* sentinel not moved: same entry is retried */
 		LIST_REMOVE(sentinel, sm_next);
 		LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
 		if (bp == NULL)
 			continue;
 		/* The write is issued without the softdep lock held. */
 		FREE_LOCK(ump);
 		if (waitfor == MNT_NOWAIT)
 			bawrite(bp);
 		else
 			error = bwrite(bp);
 		ACQUIRE_LOCK(ump);
 		if (error)
 			break;
 	}
 	/* Unlink and release the sentinel on every exit path. */
 	LIST_REMOVE(sentinel, sm_next);
 	FREE_LOCK(ump);
 	free(sentinel, M_BMSAFEMAP);
 	return (error);
 }
 
 /*
  * This routine is called when we are trying to synchronously flush a
  * file. This routine must eliminate any filesystem metadata dependencies
  * so that the syncing routine can succeed.
  *
  * Returns the result of flush_inodedep_deps() for vp's inode.  The
  * per-mount softdep lock is taken here and released before returning.
  */
 int
 softdep_sync_metadata(struct vnode *vp)
 {
 	struct inode *ip;
 	int error;
 
 	ip = VTOI(vp);
 	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
 	    ("softdep_sync_metadata called on non-softdep filesystem"));
 	/*
 	 * Ensure that any direct block dependencies have been cleared,
 	 * truncations are started, and inode references are journaled.
 	 */
 	ACQUIRE_LOCK(VFSTOUFS(vp->v_mount));
 	/*
 	 * Write all journal records to prevent rollbacks on devvp.
 	 */
 	if (vp->v_type == VCHR)
 		softdep_flushjournal(vp->v_mount);
 	error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number);
 	/*
 	 * Ensure that all truncates are written so we won't find deps on
 	 * indirect blocks.  This runs even when the flush above failed;
 	 * the error is still what gets returned.
 	 */
 	process_truncates(vp);
 	FREE_LOCK(VFSTOUFS(vp->v_mount));
 
 	return (error);
 }
 
 /*
  * This routine is called when we are attempting to sync a buf with
  * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
  * other IO it can but returns EBUSY if the buffer is not yet able to
  * be written.  Dependencies which will not cause rollbacks will always
  * return 0.
  *
  * vp is the vnode owning bp; bp is held locked by the caller (see the
  * comment at the lock acquisition below).  Besides 0 and EBUSY, the
  * return value can be an error from bwrite() or flush_pagedep_deps().
  *
  * There are two exit labels: "out_unlock" is used while the softdep
  * lock is held, "out" once it has already been released.
  */
 int
 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
 {
 	struct indirdep *indirdep;
 	struct pagedep *pagedep;
 	struct allocindir *aip;
 	struct newblk *newblk;
 	struct ufsmount *ump;
 	struct buf *nbp;
 	struct worklist *wk;
 	int i, error;
 
 	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
 	    ("softdep_sync_buf called on non-softdep filesystem"));
 	/*
 	 * For VCHR we just don't want to force flush any dependencies that
 	 * will cause rollbacks.
 	 */
 	if (vp->v_type == VCHR) {
 		if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
 			return (EBUSY);
 		return (0);
 	}
 	ump = VFSTOUFS(vp->v_mount);
 	ACQUIRE_LOCK(ump);
 	/*
 	 * As we hold the buffer locked, none of its dependencies
 	 * will disappear.
 	 */
 	error = 0;
 top:
 	/* The whole list is rescanned from here after any wait. */
 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 		switch (wk->wk_type) {
 
 		case D_ALLOCDIRECT:
 		case D_ALLOCINDIR:
 			newblk = WK_NEWBLK(wk);
 			/* A pending journal record must be written first. */
 			if (newblk->nb_jnewblk != NULL) {
 				if (waitfor == MNT_NOWAIT) {
 					error = EBUSY;
 					goto out_unlock;
 				}
 				jwait(&newblk->nb_jnewblk->jn_list, waitfor);
 				goto top;
 			}
 			/*
 			 * Nothing more to do if the bitmap dependency is
 			 * complete, or if we are not allowed to wait for it.
 			 */
 			if (newblk->nb_state & DEPCOMPLETE ||
 			    waitfor == MNT_NOWAIT)
 				continue;
 			/* Push the bitmap buffer synchronously. */
 			nbp = newblk->nb_bmsafemap->sm_buf;
 			nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
 			if (nbp == NULL)
 				goto top;
 			FREE_LOCK(ump);
 			if ((error = bwrite(nbp)) != 0)
 				goto out;	/* lock already dropped */
 			ACQUIRE_LOCK(ump);
 			continue;
 
 		case D_INDIRDEP:
 			indirdep = WK_INDIRDEP(wk);
 			/*
 			 * In a non-waiting pass, any pending truncation or
 			 * allocindir dependency makes the buffer busy.
 			 */
 			if (waitfor == MNT_NOWAIT) {
 				if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
 				    !LIST_EMPTY(&indirdep->ir_deplisthd)) {
 					error = EBUSY;
 					goto out_unlock;
 				}
 			}
 			if (!TAILQ_EMPTY(&indirdep->ir_trunc))
 				panic("softdep_sync_buf: truncation pending.");
 		restart:
 			/*
 			 * Resolve each allocindir in turn, rescanning the
 			 * list after every wait or write.
 			 */
 			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
 				newblk = (struct newblk *)aip;
 				if (newblk->nb_jnewblk != NULL) {
 					jwait(&newblk->nb_jnewblk->jn_list,
 					    waitfor);
 					goto restart;
 				}
 				if (newblk->nb_state & DEPCOMPLETE)
 					continue;
 				nbp = newblk->nb_bmsafemap->sm_buf;
 				nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
 				if (nbp == NULL)
 					goto restart;
 				FREE_LOCK(ump);
 				if ((error = bwrite(nbp)) != 0)
 					goto out;	/* lock already dropped */
 				ACQUIRE_LOCK(ump);
 				goto restart;
 			}
 			continue;
 
 		case D_PAGEDEP:
 			/*
 			 * Only flush directory entries in synchronous passes.
 			 */
 			if (waitfor != MNT_WAIT) {
 				error = EBUSY;
 				goto out_unlock;
 			}
 			/*
 			 * While syncing snapshots, we must allow recursive
 			 * lookups.
 			 */
 			BUF_AREC(bp);
 			/*
 			 * We are trying to sync a directory that may
 			 * have dependencies on both its own metadata
 			 * and/or dependencies on the inodes of any
 			 * recently allocated files. We walk its diradd
 			 * lists pushing out the associated inode.
 			 */
 			pagedep = WK_PAGEDEP(wk);
 			for (i = 0; i < DAHASHSZ; i++) {
 				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
 					continue;
 				if ((error = flush_pagedep_deps(vp, wk->wk_mp,
 				    &pagedep->pd_diraddhd[i]))) {
 					/* Undo BUF_AREC before bailing out. */
 					BUF_NOREC(bp);
 					goto out_unlock;
 				}
 			}
 			BUF_NOREC(bp);
 			continue;
 
 		/* These types require no action here. */
 		case D_FREEWORK:
 		case D_FREEDEP:
 		case D_JSEGDEP:
 		case D_JNEWBLK:
 			continue;
 
 		default:
 			panic("softdep_sync_buf: Unknown type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 out_unlock:
 	FREE_LOCK(ump);
 out:
 	return (error);
 }
 
 /*
  * Flush the dependencies associated with an inodedep.
  *
  * mp and ino identify the inodedep to look up.  vp is not referenced
  * in the body shown here.  The caller must hold the softdep lock
  * (asserted via LOCK_OWNED); it is dropped and retaken briefly at the
  * top of every pass, and no return path here releases it.
  *
  * Returns 0, or the error that flush_deplist() stored from a failed
  * bwrite().
  */
 static int
 flush_inodedep_deps(vp, mp, ino)
 	struct vnode *vp;
 	struct mount *mp;
 	ino_t ino;
 {
 	struct inodedep *inodedep;
 	struct inoref *inoref;
 	struct ufsmount *ump;
 	int error, waitfor;
 
 	/*
 	 * This work is done in two passes. The first pass grabs most
 	 * of the buffers and begins asynchronously writing them. The
 	 * only way to wait for these asynchronous writes is to sleep
 	 * on the filesystem vnode which may stay busy for a long time
 	 * if the filesystem is active. So, instead, we make a second
 	 * pass over the dependencies blocking on each write. In the
 	 * usual case we will be blocking against a write that we
 	 * initiated, so when it is done the dependency will have been
 	 * resolved. Thus the second pass is expected to end quickly.
 	 * We give a brief window at the top of the loop to allow
 	 * any pending I/O to complete.
 	 */
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
 	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
 		/* An error recorded by the previous iteration ends the flush. */
 		if (error)
 			return (error);
 		/* The "brief window" mentioned above. */
 		FREE_LOCK(ump);
 		ACQUIRE_LOCK(ump);
 restart:
 		/* A zero return is treated as nothing left to flush. */
 		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
 			return (0);
 		/*
 		 * Wait on the journal for any inoref whose state has
 		 * DEPCOMPLETE set and GOINGAWAY clear, then look the
 		 * inodedep up again.
 		 */
 		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
 			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
 			    == DEPCOMPLETE) {
 				jwait(&inoref->if_list, MNT_WAIT);
 				goto restart;
 			}
 		}
 		/*
 		 * A non-zero return from flush_deplist() means it waited
 		 * or issued a write, so start the iteration over.
 		 */
 		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
 		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
 		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
 		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
 			continue;
 		/*
 		 * If pass2, we are done, otherwise do pass 2.
 		 */
 		if (waitfor == MNT_WAIT)
 			break;
 		waitfor = MNT_WAIT;
 	}
 	/*
 	 * Try freeing inodedep in case all dependencies have been removed.
 	 */
 	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
 		(void) free_inodedep(inodedep);
 	return (0);
 }
 
 /*
  * Flush an inode dependency list.
  *
  * Handles at most one entry of listhead per call.  Returns 1 when it
  * waited on the journal, failed to get a buffer while waiting, or
  * issued a write, so the caller should rescan.  Returns 0 when the
  * list is empty or nothing on it needed action.
  *
  * *errorp is written only with the result of bwrite(), which is used
  * when waitfor is not MNT_NOWAIT.  The softdep lock is held on entry
  * and dropped only around the write.
  */
 static int
 flush_deplist(listhead, waitfor, errorp)
 	struct allocdirectlst *listhead;
 	int waitfor;
 	int *errorp;
 {
 	struct allocdirect *adp;
 	struct newblk *newblk;
 	struct ufsmount *ump;
 	struct buf *bp;
 
 	/* The first entry is needed just to locate the mount. */
 	if ((adp = TAILQ_FIRST(listhead)) == NULL)
 		return (0);
 	ump = VFSTOUFS(adp->ad_list.wk_mp);
 	LOCK_OWNED(ump);
 	TAILQ_FOREACH(adp, listhead, ad_next) {
 		newblk = (struct newblk *)adp;
 		/* A pending journal record is waited for regardless of waitfor. */
 		if (newblk->nb_jnewblk != NULL) {
 			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
 			return (1);
 		}
 		if (newblk->nb_state & DEPCOMPLETE)
 			continue;
 		bp = newblk->nb_bmsafemap->sm_buf;
 		bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor);
 		if (bp == NULL) {
 			/* Non-waiting pass: just try the next entry. */
 			if (waitfor == MNT_NOWAIT)
 				continue;
 			return (1);
 		}
 		FREE_LOCK(ump);
 		if (waitfor == MNT_NOWAIT)
 			bawrite(bp);
 		else 
 			*errorp = bwrite(bp);
 		ACQUIRE_LOCK(ump);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Flush dependencies associated with an allocdirect block.
  *
  * lbn selects one of vp's direct blocks (it indexes i_db) and is also
  * used to find the block's buffer.  Panics if that block pointer is
  * zero or if the newblk found for it is not a D_ALLOCDIRECT.
  *
  * The softdep lock is acquired here and has been released again on
  * every path out of the loop.  Returns 0 once newblk_lookup() returns
  * 0 for the block, or the first error from BUF_LOCK(), bwrite() or
  * ffs_update().
  */
 static int
 flush_newblk_dep(vp, mp, lbn)
 	struct vnode *vp;
 	struct mount *mp;
 	ufs_lbn_t lbn;
 {
 	struct newblk *newblk;
 	struct ufsmount *ump;
 	struct bufobj *bo;
 	struct inode *ip;
 	struct buf *bp;
 	ufs2_daddr_t blkno;
 	int error;
 
 	error = 0;
 	bo = &vp->v_bufobj;
 	ip = VTOI(vp);
 	blkno = DIP(ip, i_db[lbn]);
 	if (blkno == 0)
 		panic("flush_newblk_dep: Missing block");
 	ump = VFSTOUFS(mp);
 	ACQUIRE_LOCK(ump);
 	/*
 	 * Loop until all dependencies related to this block are satisfied.
 	 * We must be careful to restart after each sleep in case a write
 	 * completes some part of this process for us.
 	 */
 	for (;;) {
 		/* Normal exit: the lookup no longer yields a newblk. */
 		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
 			FREE_LOCK(ump);
 			break;
 		}
 		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
 			panic("flush_newblk_dep: Bad newblk %p", newblk);
 		/*
 		 * Flush the journal.
 		 */
 		if (newblk->nb_jnewblk != NULL) {
 			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
 			continue;
 		}
 		/*
 		 * Write the bitmap dependency.
 		 */
 		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
 			bp = newblk->nb_bmsafemap->sm_buf;
 			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
 			if (bp == NULL)
 				continue;
 			FREE_LOCK(ump);
 			error = bwrite(bp);
 			if (error)
 				break;	/* lock already dropped */
 			ACQUIRE_LOCK(ump);
 			continue;
 		}
 		/*
 		 * Write the buffer.
 		 */
 		FREE_LOCK(ump);
 		BO_LOCK(bo);
 		bp = gbincore(bo, lbn);
 		if (bp != NULL) {
 			/*
 			 * LK_INTERLOCK hands the bufobj lock to BUF_LOCK();
 			 * with LK_SLEEPFAIL a sleep shows up as ENOLCK.
 			 */
 			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
 			    LK_INTERLOCK, BO_LOCKPTR(bo));
 			if (error == ENOLCK) {
 				ACQUIRE_LOCK(ump);
 				error = 0;
 				continue; /* Slept, retry */
 			}
 			if (error != 0)
 				break;	/* Failed */
 			if (bp->b_flags & B_DELWRI) {
 				bremfree(bp);
 				error = bwrite(bp);
 				if (error)
 					break;
 			} else
 				BUF_UNLOCK(bp);
 		} else
 			BO_UNLOCK(bo);
 		/*
 		 * We have to wait for the direct pointers to
 		 * point at the newdirblk before the dependency
 		 * will go away.
 		 */
 		error = ffs_update(vp, 1);
 		if (error)
 			break;
 		/* Retake the lock for the next lookup at the loop top. */
 		ACQUIRE_LOCK(ump);
 	}
 	return (error);
 }
 
 /*
  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
  *
  * pvp is the vnode that gets ffs_update()d for MKDIR_PARENT entries,
  * mp is its mount, and diraddhdp is the diradd list to drain.
  *
  * The softdep lock must be held on entry (LOCK_OWNED) and is held
  * again on return, including after an error.  Returns 0, or the first
  * error from ffs_update(), ffs_vgetf(), flush_newblk_dep() or bwrite().
  * Panics if an entry is still first on the list after every flush step
  * has been tried.
  */
 static int
 flush_pagedep_deps(pvp, mp, diraddhdp)
 	struct vnode *pvp;
 	struct mount *mp;
 	struct diraddhd *diraddhdp;
 {
 	struct inodedep *inodedep;
 	struct inoref *inoref;
 	struct ufsmount *ump;
 	struct diradd *dap;
 	struct vnode *vp;
 	int error = 0;
 	struct buf *bp;
 	ino_t inum;
 	struct diraddhd unfinished;
 
 	/* Entries deferred during this call; put back before returning. */
 	LIST_INIT(&unfinished);
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
 restart:
 	/*
 	 * Progress is detected throughout by checking whether dap is
 	 * still the first entry on the list after each flush step.
 	 */
 	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
 		/*
 		 * Flush ourselves if this directory entry
 		 * has a MKDIR_PARENT dependency.
 		 */
 		if (dap->da_state & MKDIR_PARENT) {
 			FREE_LOCK(ump);
 			if ((error = ffs_update(pvp, 1)) != 0)
 				break;
 			ACQUIRE_LOCK(ump);
 			/*
 			 * If that cleared dependencies, go on to next.
 			 */
 			if (dap != LIST_FIRST(diraddhdp))
 				continue;
 			/*
 			 * All MKDIR_PARENT dependencies and all the
 			 * NEWBLOCK pagedeps that are contained in direct
 			 * blocks were resolved by doing above ffs_update.
 			 * Pagedeps contained in indirect blocks may
 			 * require a complete sync'ing of the directory.
 			 * We are in the midst of doing a complete sync,
 			 * so if they are not resolved in this pass we
 			 * defer them for now as they will be sync'ed by
 			 * our caller shortly.
 			 */
 			LIST_REMOVE(dap, da_pdlist);
 			LIST_INSERT_HEAD(&unfinished, dap, da_pdlist);
 			continue;
 		}
 		/*
 		 * A newly allocated directory must have its "." and
 		 * ".." entries written out before its name can be
 		 * committed in its parent.
 		 */
 		inum = dap->da_newinum;
 		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
 			panic("flush_pagedep_deps: lost inode1");
 		/*
 		 * Wait for any pending journal adds to complete so we don't
 		 * cause rollbacks while syncing.
 		 */
 		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
 			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
 			    == DEPCOMPLETE) {
 				jwait(&inoref->if_list, MNT_WAIT);
 				goto restart;
 			}
 		}
 		if (dap->da_state & MKDIR_BODY) {
 			FREE_LOCK(ump);
 			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
 			    FFSV_FORCEINSMQ)))
 				break;
 			MPASS(VTOI(vp)->i_mode != 0);
 			/* Block 0 of the new directory. */
 			error = flush_newblk_dep(vp, mp, 0);
 			/*
 			 * If we still have the dependency we might need to
 			 * update the vnode to sync the new link count to
 			 * disk.
 			 */
 			if (error == 0 && dap == LIST_FIRST(diraddhdp))
 				error = ffs_update(vp, 1);
 			vput(vp);
 			if (error != 0)
 				break;
 			ACQUIRE_LOCK(ump);
 			/*
 			 * If that cleared dependencies, go on to next.
 			 */
 			if (dap != LIST_FIRST(diraddhdp))
 				continue;
 			if (dap->da_state & MKDIR_BODY) {
 				/* Looked up again only to report it in the panic. */
 				inodedep_lookup(UFSTOVFS(ump), inum, 0,
 				    &inodedep);
 				panic("flush_pagedep_deps: MKDIR_BODY "
 				    "inodedep %p dap %p vp %p",
 				    inodedep, dap, vp);
 			}
 		}
 		/*
 		 * Flush the inode on which the directory entry depends.
 		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
 		 * the only remaining dependency is that the updated inode
 		 * count must get pushed to disk. The inode has already
 		 * been pushed into its inode buffer (via VOP_UPDATE) at
 		 * the time of the reference count change. So we need only
 		 * locate that buffer, ensure that there will be no rollback
 		 * caused by a bitmap dependency, then write the inode buffer.
 		 */
 retry:
 		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
 			panic("flush_pagedep_deps: lost inode");
 		/*
 		 * If the inode still has bitmap dependencies,
 		 * push them to disk.
 		 */
 		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
 			bp = inodedep->id_bmsafemap->sm_buf;
 			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
 			if (bp == NULL)
 				goto retry;
 			FREE_LOCK(ump);
 			if ((error = bwrite(bp)) != 0)
 				break;
 			ACQUIRE_LOCK(ump);
 			if (dap != LIST_FIRST(diraddhdp))
 				continue;
 		}
 		/*
 		 * If the inode is still sitting in a buffer waiting
 		 * to be written or waiting for the link count to be
 		 * adjusted update it here to flush it to disk.
 		 */
 		if (dap == LIST_FIRST(diraddhdp)) {
 			FREE_LOCK(ump);
 			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
 			    FFSV_FORCEINSMQ)))
 				break;
 			MPASS(VTOI(vp)->i_mode != 0);
 			error = ffs_update(vp, 1);
 			vput(vp);
 			if (error)
 				break;
 			ACQUIRE_LOCK(ump);
 		}
 		/*
 		 * If we have failed to get rid of all the dependencies
 		 * then something is seriously wrong.
 		 */
 		if (dap == LIST_FIRST(diraddhdp)) {
 			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
 			panic("flush_pagedep_deps: failed to flush " 
 			    "inodedep %p ino %ju dap %p",
 			    inodedep, (uintmax_t)inum, dap);
 		}
 	}
 	/*
 	 * Every error "break" above happens after FREE_LOCK(), so retake
 	 * the lock here; the list manipulation below and the caller both
 	 * expect it to be held.
 	 */
 	if (error)
 		ACQUIRE_LOCK(ump);
 	/* Return the deferred MKDIR_PARENT entries to the caller's list. */
 	while ((dap = LIST_FIRST(&unfinished)) != NULL) {
 		LIST_REMOVE(dap, da_pdlist);
 		LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
 	}
 	return (error);
 }
 
 /*
  * A large burst of file addition or deletion activity can drive the
  * memory load excessively high. First attempt to slow things down
  * using the techniques below. If that fails, this routine requests
  * the offending operations to fall back to running synchronously
  * until the memory load returns to a reasonable level.
  *
  * Returns 1 only when a limit is exceeded and the filesystem is not
  * using journaling (DOINGSUJ is false); otherwise returns 0.
  */
 int
 softdep_slowdown(vp)
 	struct vnode *vp;
 {
 	struct ufsmount *ump;
 	int jlow;
 	int max_softdeps_hard;
 
 	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
 	    ("softdep_slowdown called on non-softdep filesystem"));
 	ump = VFSTOUFS(vp->v_mount);
 	ACQUIRE_LOCK(ump);
 	jlow = 0;
 	/*
 	 * Check for journal space if needed.
 	 */
 	if (DOINGSUJ(vp)) {
 		if (journal_space(ump, 0) == 0)
 			jlow = 1;
 	}
 	/*
 	 * If the system is under its limits and our filesystem is
 	 * not responsible for more than our share of the usage and
 	 * we are not low on journal space, then no need to slow down.
 	 *
 	 * The hard limit is 110% of max_softdeps.  The global counters
 	 * (dep_current[]) are compared against it directly, the per-mount
 	 * counters against that value divided by stat_flush_threads.
 	 */
 	max_softdeps_hard = max_softdeps * 11 / 10;
 	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
 	    dep_current[D_INODEDEP] < max_softdeps_hard &&
 	    dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 &&
 	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 &&
 	    ump->softdep_curdeps[D_DIRREM] <
 	    (max_softdeps_hard / 2) / stat_flush_threads &&
 	    ump->softdep_curdeps[D_INODEDEP] <
 	    max_softdeps_hard / stat_flush_threads &&
 	    ump->softdep_curdeps[D_INDIRDEP] <
 	    (max_softdeps_hard / 1000) / stat_flush_threads &&
 	    ump->softdep_curdeps[D_FREEBLKS] <
 	    max_softdeps_hard / stat_flush_threads) {
 		FREE_LOCK(ump);
   		return (0);
 	}
 	/*
 	 * If the journal is low or our filesystem is over its limit
 	 * then speedup the cleanup.
 	 *
 	 * NOTE(review): as written, the test below is true when this
 	 * mount's D_INDIRDEP count is *below* its share, which does not
 	 * obviously match "over its limit" -- confirm the intent.
 	 */
 	if (ump->softdep_curdeps[D_INDIRDEP] <
 	    (max_softdeps_hard / 1000) / stat_flush_threads || jlow)
 		softdep_speedup(ump);
 	stat_sync_limit_hit += 1;
 	FREE_LOCK(ump);
 	/*
 	 * We only slow down the rate at which new dependencies are
 	 * generated if we are not using journaling. With journaling,
 	 * the cleanup should always be sufficient to keep things
 	 * under control.
 	 */
 	if (DOINGSUJ(vp))
 		return (0);
 	return (1);
 }
 
 /*
  * Called by the allocation routines when they are about to fail
  * in the hope that we can free up the requested resource (inodes
  * or disk space).
  *
  * First check to see if the work list has anything on it. If it has,
  * clean up entries until we successfully free the requested resource.
  * Because this process holds inodes locked, we cannot handle any remove
  * requests that might block on a locked inode as that could lead to
  * deadlock. If the worklist yields none of the requested resource,
  * start syncing out vnodes to free up the needed space.
  *
  * resource must be FLUSH_INODES_WAIT or FLUSH_BLOCKS_WAIT; any other
  * value is logged and 0 is returned.  The UFS mutex is held on entry
  * (mtx_assert below) and is held again on every return.
  *
  * Returns 0 when no cleanup was attempted (copy-on-write in progress,
  * snapshot vnode, ffs_update() failure, not a softdep mount, unknown
  * resource) or when another thread was already doing the vnode flush;
  * returns 1 otherwise.
  */
 int
 softdep_request_cleanup(fs, vp, cred, resource)
 	struct fs *fs;
 	struct vnode *vp;
 	struct ucred *cred;
 	int resource;
 {
 	struct ufsmount *ump;
 	struct mount *mp;
 	long starttime;
 	ufs2_daddr_t needed;
 	int error, failed_vnode;
 
 	/*
 	 * If we are being called because of a process doing a
 	 * copy-on-write, then it is not safe to process any
 	 * worklist items as we will recurse into the copyonwrite
 	 * routine.  This will result in an incoherent snapshot.
 	 * If the vnode that we hold is a snapshot, we must avoid
 	 * handling other resources that could cause deadlock.
 	 */
 	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
 		return (0);
 
 	if (resource == FLUSH_BLOCKS_WAIT)
 		stat_cleanup_blkrequests += 1;
 	else
 		stat_cleanup_inorequests += 1;
 
 	mp = vp->v_mount;
 	ump = VFSTOUFS(mp);
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 	/* The UFS mutex is dropped for the whole of the work below. */
 	UFS_UNLOCK(ump);
 	error = ffs_update(vp, 1);
 	if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) {
 		UFS_LOCK(ump);
 		return (0);
 	}
 	/*
 	 * If we are in need of resources, start by cleaning up
 	 * any block removals associated with our inode.
 	 */
 	ACQUIRE_LOCK(ump);
 	process_removes(vp);
 	process_truncates(vp);
 	FREE_LOCK(ump);
 	/*
 	 * Now clean up at least as many resources as we will need.
 	 *
 	 * When requested to clean up inodes, the number that are needed
 	 * is set by the number of simultaneous writers (mnt_writeopcount)
 	 * plus a bit of slop (2) in case some more writers show up while
 	 * we are cleaning.
 	 *
 	 * When requested to free up space, the amount of space that
 	 * we need is enough blocks to allocate a full-sized segment
 	 * (fs_contigsumsize). The number of such segments that will
 	 * be needed is set by the number of simultaneous writers
 	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
 	 * writers show up while we are cleaning.
 	 *
 	 * Additionally, if we are unprivileged and allocating space,
 	 * we need to ensure that we clean up enough blocks to get the
 	 * needed number of blocks over the threshold of the minimum
 	 * number of blocks required to be kept free by the filesystem
 	 * (fs_minfree).
 	 */
 	if (resource == FLUSH_INODES_WAIT) {
 		needed = vfs_mount_fetch_counter(vp->v_mount,
 		    MNT_COUNT_WRITEOPCOUNT) + 2;
 	} else if (resource == FLUSH_BLOCKS_WAIT) {
 		needed = (vfs_mount_fetch_counter(vp->v_mount,
 		    MNT_COUNT_WRITEOPCOUNT) + 2) * fs->fs_contigsumsize;
 		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE))
 			needed += fragstoblks(fs,
 			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
 			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
 	} else {
 		printf("softdep_request_cleanup: Unknown resource type %d\n",
 		    resource);
 		UFS_LOCK(ump);
 		return (0);
 	}
 	starttime = time_second;
 retry:
 	/* When short of blocks, ask for delayed TRIMs to be sped up. */
 	if (resource == FLUSH_BLOCKS_WAIT &&
 	    fs->fs_cstotal.cs_nbfree <= needed)
 		softdep_send_speedup(ump, needed * fs->fs_bsize,
 		    BIO_SPEEDUP_TRIM);
 	/* First try to reap the resource from the existing worklist. */
 	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
 	    fs->fs_cstotal.cs_nbfree <= needed) ||
 	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
 	    fs->fs_cstotal.cs_nifree <= needed)) {
 		ACQUIRE_LOCK(ump);
 		if (ump->softdep_on_worklist > 0 &&
 		    process_worklist_item(UFSTOVFS(ump),
 		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
 			stat_worklist_push += 1;
 		FREE_LOCK(ump);
 	}
 	/*
 	 * If we still need resources and there are no more worklist
 	 * entries to process to obtain them, we have to start flushing
 	 * the dirty vnodes to force the release of additional requests
 	 * to the worklist that we can then process to reap additional
 	 * resources. We walk the vnodes associated with the mount point
 	 * until we get the needed worklist requests that we can reap.
 	 *
 	 * If there are several threads all needing to clean the same
 	 * mount point, only one is allowed to walk the mount list.
 	 * When several threads all try to walk the same mount list,
 	 * they end up competing with each other and often end up in
 	 * livelock. This approach ensures that forward progress is
 	 * made at the cost of occasional ENOSPC errors being returned
 	 * that might otherwise have been avoided.
 	 */
 	error = 1;
 	if ((resource == FLUSH_BLOCKS_WAIT && 
 	     fs->fs_cstotal.cs_nbfree <= needed) ||
 	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
 	     fs->fs_cstotal.cs_nifree <= needed)) {
 		ACQUIRE_LOCK(ump);
 		/* FLUSH_RC_ACTIVE marks the single thread walking the mount. */
 		if ((ump->um_softdep->sd_flags & FLUSH_RC_ACTIVE) == 0) {
 			ump->um_softdep->sd_flags |= FLUSH_RC_ACTIVE;
 			FREE_LOCK(ump);
 			failed_vnode = softdep_request_cleanup_flush(mp, ump);
 			ACQUIRE_LOCK(ump);
 			ump->um_softdep->sd_flags &= ~FLUSH_RC_ACTIVE;
 			FREE_LOCK(ump);
 			/*
 			 * The flush queued new work; go around again unless
 			 * a vnode had to be skipped.
 			 */
 			if (ump->softdep_on_worklist > 0) {
 				stat_cleanup_retries += 1;
 				if (!failed_vnode)
 					goto retry;
 			}
 		} else {
 			/* Somebody else is already flushing this mount. */
 			FREE_LOCK(ump);
 			error = 0;
 		}
 		stat_cleanup_failures += 1;
 	}
 	/* Track the longest time spent in here, in seconds. */
 	if (time_second - starttime > stat_cleanup_high_delay)
 		stat_cleanup_high_delay = time_second - starttime;
 	UFS_LOCK(ump);
 	return (error);
 }
 
 /*
  * Scan the vnodes for the specified mount point flushing out any
  * vnodes that can be locked without waiting. Finally, try to flush
  * the device associated with the mount point if it can be locked
  * without waiting.
  *
  * We return 0 if we were able to lock every vnode in our scan.
  * If we had to skip one or more vnodes, we return 1.
  *
  * Results of the individual flushes are not examined; only the
  * "had to skip a vnode" condition is reported.
  */
 static int
 softdep_request_cleanup_flush(mp, ump)
 	struct mount *mp;
 	struct ufsmount *ump;
 {
 	struct thread *td;
 	struct vnode *lvp, *mvp;
 	int failed_vnode;
 
 	failed_vnode = 0;
 	td = curthread;
 	MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
 		/* Nothing dirty on this vnode: release it and move on. */
 		if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
 			VI_UNLOCK(lvp);
 			continue;
 		}
-		if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
-		    td) != 0) {
+		if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT) != 0) {
 			failed_vnode = 1;
 			continue;
 		}
 		if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
 			vput(lvp);
 			continue;
 		}
 		(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
 		vput(lvp);
 	}
 	/* Best-effort, non-blocking flush of the underlying device vnode. */
 	lvp = ump->um_devvp;
 	if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
 		VOP_FSYNC(lvp, MNT_NOWAIT, td);
 		VOP_UNLOCK(lvp);
 	}
 	return (failed_vnode);
 }
 
 /*
  * Report whether dependency type `item' is over its limits.  That is
  * the case only when the global count exceeds max_softdeps and this
  * mount's count also exceeds max_softdeps / stat_flush_threads.
  */
 static bool
 softdep_excess_items(struct ufsmount *ump, int item)
 {
 
 	KASSERT(item >= 0 && item < D_LAST, ("item %d", item));
 	/* Globally within bounds: nothing is in excess. */
 	if (!(dep_current[item] > max_softdeps))
 		return (false);
 	/* Over the global limit; now check this mount's own share. */
 	if (ump->softdep_curdeps[item] > max_softdeps / stat_flush_threads)
 		return (true);
 	return (false);
 }
 
 /*
  * Arrange for the current thread to run softdep cleanup for mp from
  * its AST handler.
  *
  * Called with the softdep lock held; the lock is released here and is
  * NOT held on return.  A reference on mp is taken and stored in
  * td_su, replacing and releasing any mount recorded there earlier.
  */
 static void
 schedule_cleanup(struct mount *mp)
 {
 	struct ufsmount *ump;
 	struct thread *td;
 
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
 	FREE_LOCK(ump);
 	td = curthread;
 	if ((td->td_pflags & TDP_KTHREAD) != 0 &&
 	    (td->td_proc->p_flag2 & P2_AST_SU) == 0) {
 		/*
 		 * No ast is delivered to kernel threads, so nobody
 		 * would deref the mp.  Some kernel threads
 		 * explicitly check for AST, e.g. NFS daemon does
 		 * this in the serving loop.
 		 */
 		return;
 	}
 	/* Only one mount is remembered per thread; drop the old one. */
 	if (td->td_su != NULL)
 		vfs_rel(td->td_su);
 	vfs_ref(mp);
 	td->td_su = mp;
 	/* Flag the thread so the AST is processed. */
 	thread_lock(td);
 	td->td_flags |= TDF_ASTPENDING;
 	thread_unlock(td);
 }
 
 /*
  * Perform the cleanup recorded in td->td_su (set by schedule_cleanup()).
  *
  * For each recorded mount: busy it without waiting, drop the reference
  * that td_su held, and, if it is one of our softdep mounts, keep
  * requesting cleanup while any tracked dependency type is in excess.
  * Kernel threads make only one pass of that inner loop.
  */
 static void
 softdep_ast_cleanup_proc(struct thread *td)
 {
 	struct mount *mp;
 	struct ufsmount *ump;
 	int error;
 	bool req;
 
 	while ((mp = td->td_su) != NULL) {
 		td->td_su = NULL;
 		error = vfs_busy(mp, MBF_NOWAIT);
 		/* The td_su reference is released whether or not busy worked. */
 		vfs_rel(mp);
 		if (error != 0)
 			return;
 		if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) {
 			ump = VFSTOUFS(mp);
 			for (;;) {
 				/* req records whether this pass did any work. */
 				req = false;
 				ACQUIRE_LOCK(ump);
 				if (softdep_excess_items(ump, D_INODEDEP)) {
 					req = true;
 					request_cleanup(mp, FLUSH_INODES);
 				}
 				if (softdep_excess_items(ump, D_DIRREM)) {
 					req = true;
 					request_cleanup(mp, FLUSH_BLOCKS);
 				}
 				FREE_LOCK(ump);
 				/*
 				 * Too many block allocation dependencies:
 				 * do a full synchronous sync of the mount.
 				 */
 				if (softdep_excess_items(ump, D_NEWBLK) ||
 				    softdep_excess_items(ump, D_ALLOCDIRECT) ||
 				    softdep_excess_items(ump, D_ALLOCINDIR)) {
 					error = vn_start_write(NULL, &mp,
 					    V_WAIT);
 					if (error == 0) {
 						req = true;
 						VFS_SYNC(mp, MNT_WAIT);
 						vn_finished_write(mp);
 					}
 				}
 				/* Stop after one pass for kthreads, or when idle. */
 				if ((td->td_pflags & TDP_KTHREAD) != 0 || !req)
 					break;
 			}
 		}
 		vfs_unbusy(mp);
 	}
 	/*
 	 * NOTE(review): the loop above only falls through once td_su is
 	 * NULL, so this looks like a defensive re-check -- confirm.
 	 */
 	if ((mp = td->td_su) != NULL) {
 		td->td_su = NULL;
 		vfs_rel(mp);
 	}
 }
 
 /*
  * If memory utilization has gotten too high, deliberately slow things
  * down and speed up the I/O processing.
  *
  * resource is one of FLUSH_INODES, FLUSH_INODES_WAIT, FLUSH_BLOCKS or
  * FLUSH_BLOCKS_WAIT; any other value that reaches the switch panics.
  * The per-mount softdep lock must be held on entry (LOCK_OWNED).  It
  * is released while sleeping and retaken before returning.
  *
  * Returns 0 when nothing was done (syncer/buf daemon thread, or the
  * speedup was enough for a non-_WAIT request).  Returns 1 after
  * processing worklist items or after pausing.
  */
 static int
 request_cleanup(mp, resource)
 	struct mount *mp;
 	int resource;
 {
 	struct thread *td = curthread;
 	struct ufsmount *ump;
 
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
 	/*
 	 * We never hold up the filesystem syncer or buf daemon.
 	 */
 	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
 		return (0);
 	/*
 	 * First check to see if the work list has gotten backlogged.
 	 * If it has, co-opt this process to help clean up two entries.
 	 * Because this process may hold inodes locked, we cannot
 	 * handle any remove requests that might block on a locked
 	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
 	 * to avoid recursively processing the worklist.
 	 */
 	if (ump->softdep_on_worklist > max_softdeps / 10) {
 		td->td_pflags |= TDP_SOFTDEP;
 		process_worklist_item(mp, 2, LK_NOWAIT);
 		td->td_pflags &= ~TDP_SOFTDEP;
 		stat_worklist_push += 2;
 		return(1);
 	}
 	/*
 	 * Next, we attempt to speed up the syncer process. If that
 	 * is successful, then we allow the process to continue.
 	 * The _WAIT variants always go on to pause below.
 	 */
 	if (softdep_speedup(ump) &&
 	    resource != FLUSH_BLOCKS_WAIT &&
 	    resource != FLUSH_INODES_WAIT)
 		return(0);
 	/*
 	 * If we are resource constrained on inode dependencies, try
 	 * flushing some dirty inodes. Otherwise, we are constrained
 	 * by file deletions, so try accelerating flushes of directories
 	 * with removal dependencies. We would like to do the cleanup
 	 * here, but we probably hold an inode locked at this point and
 	 * that might deadlock against one that we try to clean. So,
 	 * the best that we can do is request the syncer daemon to do
 	 * the cleanup for us.
 	 */
 	switch (resource) {
 
 	case FLUSH_INODES:
 	case FLUSH_INODES_WAIT:
 		ACQUIRE_GBLLOCK(&lk);
 		stat_ino_limit_push += 1;
 		req_clear_inodedeps += 1;
 		FREE_GBLLOCK(&lk);
 		/* Counter that pause_timer() will charge the wait to. */
 		stat_countp = &stat_ino_limit_hit;
 		break;
 
 	case FLUSH_BLOCKS:
 	case FLUSH_BLOCKS_WAIT:
 		ACQUIRE_GBLLOCK(&lk);
 		stat_blk_limit_push += 1;
 		req_clear_remove += 1;
 		FREE_GBLLOCK(&lk);
 		/* Counter that pause_timer() will charge the wait to. */
 		stat_countp = &stat_blk_limit_hit;
 		break;
 
 	default:
 		panic("request_cleanup: unknown type");
 	}
 	/*
 	 * Hopefully the syncer daemon will catch up and awaken us.
 	 * We wait at most tickdelay before proceeding in any case.
 	 */
 	ACQUIRE_GBLLOCK(&lk);
 	FREE_LOCK(ump);
 	proc_waiting += 1;
 	/* Arm the wakeup timer (at least 2 ticks) unless one is pending. */
 	if (callout_pending(&softdep_callout) == FALSE)
 		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
 		    pause_timer, 0);
 
 	/* Kernel threads are counted as waiting but do not sleep. */
 	if ((td->td_pflags & TDP_KTHREAD) == 0)
 		msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
 	proc_waiting -= 1;
 	FREE_GBLLOCK(&lk);
 	ACQUIRE_LOCK(ump);
 	return (1);
 }
 
 /*
  * Awaken processes pausing in request_cleanup.  The current number of
  * waiters (proc_waiting) is added to the statistic selected through
  * stat_countp; proc_waiting itself is not modified here.  Pause_timer
  * will be called with the global softdep mutex (&lk) locked.
  *
  * arg is the callout argument and is unused.
  */
 static void
 pause_timer(arg)
 	void *arg;
 {
 
 	GBLLOCK_OWNED(&lk);
 	/*
 	 * The callout_ API has acquired mtx and will hold it around this
 	 * function call.
 	 */
 	*stat_countp += proc_waiting;
 	wakeup(&proc_waiting);
 }
 
 /*
  * If requested, try removing inode or removal dependencies.
  *
  * Consumes at most one pending req_clear_inodedeps request and one
  * pending req_clear_remove request per call, waking any threads
  * sleeping on proc_waiting after each.
  *
  * NOTE(review): the FREE_LOCK/ACQUIRE_LOCK pair below implies the
  * caller holds the per-mount softdep lock -- confirm against callers.
  */
 static void
 check_clear_deps(mp)
 	struct mount *mp;
 {
 	struct ufsmount *ump;
 	bool suj_susp;
 
 	/*
 	 * Tell the lower layers that any TRIM or WRITE transactions that have
 	 * been delayed for performance reasons should proceed to help alleviate
 	 * the shortage faster. The race between checking req_* and the softdep
 	 * mutex (lk) is fine since this is an advisory operation that at most
 	 * causes deferred work to be done sooner.
 	 */
 	ump = VFSTOUFS(mp);
 	suj_susp = MOUNTEDSUJ(mp) && ump->softdep_jblocks->jb_suspended;
 	if (req_clear_remove || req_clear_inodedeps || suj_susp) {
 		/* The speedup request is sent without the softdep lock. */
 		FREE_LOCK(ump);
 		softdep_send_speedup(ump, 0, BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE);
 		ACQUIRE_LOCK(ump);
 	}
 
 	/*
 	 * If we are suspended, it may be because of our using
 	 * too many inodedeps, so help clear them out.
 	 */
 	if (suj_susp)
 		clear_inodedeps(mp);
 
 	/*
 	 * General requests for cleanup of backed up dependencies
 	 *
 	 * The request counters are changed under the global lock, which
 	 * is dropped while the actual clearing routine runs.
 	 */
 	ACQUIRE_GBLLOCK(&lk);
 	if (req_clear_inodedeps) {
 		req_clear_inodedeps -= 1;
 		FREE_GBLLOCK(&lk);
 		clear_inodedeps(mp);
 		ACQUIRE_GBLLOCK(&lk);
 		wakeup(&proc_waiting);
 	}
 	if (req_clear_remove) {
 		req_clear_remove -= 1;
 		FREE_GBLLOCK(&lk);
 		clear_remove(mp);
 		ACQUIRE_GBLLOCK(&lk);
 		wakeup(&proc_waiting);
 	}
 	FREE_GBLLOCK(&lk);
 }
 
 /*
  * Flush out a directory with at least one removal dependency in an effort to
  * reduce the number of dirrem, freefile, and freeblks dependency structures.
  *
  * The pagedep hash table is scanned starting at the per-mount cursor
  * pagedep_nextclean, visiting at most pagedep_hash_size + 1 buckets.  The
  * function returns after attempting to flush the first directory for which
  * a write could be started, whether or not that attempt succeeded.
  *
  * Must be called with the per-mount softdep lock held; the lock is dropped
  * while the vnode is acquired and synced and is held again on return.
  */
 static void
 clear_remove(mp)
 	struct mount *mp;
 {
 	struct pagedep_hashhead *pagedephd;
 	struct pagedep *pagedep;
 	struct ufsmount *ump;
 	struct vnode *vp;
 	struct bufobj *bo;
 	int error, cnt;
 	ino_t ino;
 
 	ump = VFSTOUFS(mp);
 	LOCK_OWNED(ump);
 
 	for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) {
 		/* Advance the cursor, wrapping past the last bucket. */
 		pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++];
 		if (ump->pagedep_nextclean > ump->pagedep_hash_size)
 			ump->pagedep_nextclean = 0;
 		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
 			if (LIST_EMPTY(&pagedep->pd_dirremhd))
 				continue;
 			/* Save the inode number before the lock is dropped. */
 			ino = pagedep->pd_ino;
 			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
 				continue;
 			FREE_LOCK(ump);
 
 			/*
 			 * Let unmount clear deps
 			 */
 			error = vfs_busy(mp, MBF_NOWAIT);
 			if (error != 0)
 				goto finish_write;
 			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
 			     FFSV_FORCEINSMQ);
 			vfs_unbusy(mp);
 			if (error != 0) {
 				softdep_error("clear_remove: vget", error);
 				goto finish_write;
 			}
 			MPASS(VTOI(vp)->i_mode != 0);
 			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
 				softdep_error("clear_remove: fsync", error);
 			bo = &vp->v_bufobj;
 			BO_LOCK(bo);
 			drain_output(vp);
 			BO_UNLOCK(bo);
 			vput(vp);
 		finish_write:
 			/* Common exit: end the write and retake the lock. */
 			vn_finished_write(mp);
 			ACQUIRE_LOCK(ump);
 			return;
 		}
 	}
 }
 
 /*
  * Clear out a block of dirty inodes in an effort to reduce
  * the number of inodedep dependency structures.
  *
  * Must be called with the per-mount softdep lock held.  The lock is
  * dropped around each vnode acquisition and sync and is held again on
  * every return path.
  */
 static void
 clear_inodedeps(mp)
 	struct mount *mp;
 {
 	struct inodedep_hashhead *inodedephd;
 	struct inodedep *inodedep;
 	struct ufsmount *ump;
 	struct vnode *vp;
 	struct fs *fs;
 	int error, cnt;
 	ino_t firstino, lastino, ino;
 
 	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;
 	LOCK_OWNED(ump);
 	/*
 	 * Pick a random inode dependency to be cleared.
 	 * We will then gather up all the inodes in its block
 	 * that have dependencies and flush them out.
 	 * The scan starts at the inodedep_nextclean cursor and visits
 	 * at most inodedep_hash_size + 1 buckets.
 	 */
 	for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
 		inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++];
 		if (ump->inodedep_nextclean > ump->inodedep_hash_size)
 			ump->inodedep_nextclean = 0;
 		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
 			break;
 	}
 	/* Every bucket was empty: nothing to clear. */
 	if (inodedep == NULL)
 		return;
 	/*
 	 * Find the last inode in the block with dependencies.
 	 */
 	firstino = rounddown2(inodedep->id_ino, INOPB(fs));
 	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
 		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
 			break;
 	/*
 	 * Asynchronously push all but the last inode with dependencies.
 	 * Synchronously push the last inode with dependencies to ensure
 	 * that the inode block gets written to free up the inodedeps.
 	 */
 	for (ino = firstino; ino <= lastino; ino++) {
 		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
 			continue;
 		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
 			continue;
 		FREE_LOCK(ump);
 		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
 		if (error != 0) {
 			vn_finished_write(mp);
 			ACQUIRE_LOCK(ump);
 			return;
 		}
 		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
 		    FFSV_FORCEINSMQ)) != 0) {
 			softdep_error("clear_inodedeps: vget", error);
 			vfs_unbusy(mp);
 			vn_finished_write(mp);
 			ACQUIRE_LOCK(ump);
 			return;
 		}
 		vfs_unbusy(mp);
 		if (VTOI(vp)->i_mode == 0) {
 			/* Inode has a zero mode: discard the vnode instead. */
 			vgone(vp);
 		} else if (ino == lastino) {
 			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
 				softdep_error("clear_inodedeps: fsync1", error);
 		} else {
 			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
 				softdep_error("clear_inodedeps: fsync2", error);
 			BO_LOCK(&vp->v_bufobj);
 			drain_output(vp);
 			BO_UNLOCK(&vp->v_bufobj);
 		}
 		vput(vp);
 		vn_finished_write(mp);
 		ACQUIRE_LOCK(ump);
 	}
 }
 
 /*
  * Transfer every work item queued on wkhd onto the dependency list of
  * bp.  An empty wkhd is a no-op.  The per-mount softdep lock, located
  * through the first item's mount, is held for the duration of the move.
  */
 void
 softdep_buf_append(bp, wkhd)
 	struct buf *bp;
 	struct workhead *wkhd;
 {
 	struct worklist *item;
 	struct ufsmount *ump;
 
 	item = LIST_FIRST(wkhd);
 	if (item == NULL)
 		return;
 	KASSERT(MOUNTEDSOFTDEP(item->wk_mp) != 0,
 	    ("softdep_buf_append called on non-softdep filesystem"));
 	ump = VFSTOUFS(item->wk_mp);
 	ACQUIRE_LOCK(ump);
 	/* Always take the current head; removal advances the list. */
 	for (item = LIST_FIRST(wkhd); item != NULL; item = LIST_FIRST(wkhd)) {
 		WORKLIST_REMOVE(item);
 		WORKLIST_INSERT(&bp->b_dep, item);
 	}
 	FREE_LOCK(ump);
 }
 
 void
 softdep_inode_append(ip, cred, wkhd)
 	struct inode *ip;
 	struct ucred *cred;
 	struct workhead *wkhd;
 {
 	struct buf *bp;
 	struct fs *fs;
 	struct ufsmount *ump;
 	int error;
 
 	ump = ITOUMP(ip);
 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 	    ("softdep_inode_append called on non-softdep filesystem"));
 	fs = ump->um_fs;
 	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 	    (int)fs->fs_bsize, cred, &bp);
 	if (error) {
 		bqrelse(bp);
 		softdep_freework(wkhd);
 		return;
 	}
 	softdep_buf_append(bp, wkhd);
 	bqrelse(bp);
 }
 
 /*
  * Dispose of the work items on wkhd by handing the list to
  * handle_jwork() under the per-mount softdep lock.  An empty list is
  * a no-op; the mount is found through the first item on the list.
  */
 void
 softdep_freework(wkhd)
 	struct workhead *wkhd;
 {
 	struct worklist *first;
 	struct ufsmount *ump;
 
 	first = LIST_FIRST(wkhd);
 	if (first == NULL)
 		return;
 	KASSERT(MOUNTEDSOFTDEP(first->wk_mp) != 0,
 	    ("softdep_freework called on non-softdep filesystem"));
 	ump = VFSTOUFS(first->wk_mp);
 	ACQUIRE_LOCK(ump);
 	handle_jwork(wkhd);
 	FREE_LOCK(ump);
 }
 
 /*
  * Map a buffer to the ufsmount of the filesystem it belongs to, using the
  * type of the buffer's vnode.  Returns NULL immediately when the buffer
  * carries no dependencies.
  *
  * NOTE(review): several cases below set mp to NULL and then evaluate
  * VFSTOUFS(mp); whether that yields a NULL ufsmount or dereferences mp
  * depends on the VFSTOUFS definition, which is not visible here -- confirm.
  */
 static struct ufsmount *
 softdep_bp_to_mp(bp)
 	struct buf *bp;
 {
 	struct mount *mp;
 	struct vnode *vp;
 
 	if (LIST_EMPTY(&bp->b_dep))
 		return (NULL);
 	vp = bp->b_vp;
 	KASSERT(vp != NULL,
 	    ("%s, buffer with dependencies lacks vnode", __func__));
 
 	/*
 	 * The ump mount point is stable after we get a correct
 	 * pointer, since bp is locked and this prevents unmount from
 	 * proceeding.  But to get to it, we cannot dereference bp->b_dep
 	 * head wk_mp, because we do not yet own SU ump lock and
 	 * workitem might be freed while dereferenced.
 	 */
 retry:
 	switch (vp->v_type) {
 	case VCHR:
 		/*
 		 * Re-check the type under the vnode interlock; if it is no
 		 * longer VCHR, or no mount is recorded, start over.
 		 */
 		VI_LOCK(vp);
 		mp = vp->v_type == VCHR ? vp->v_rdev->si_mountpt : NULL;
 		VI_UNLOCK(vp);
 		if (mp == NULL)
 			goto retry;
 		break;
 	case VREG:
 	case VDIR:
 	case VLNK:
 	case VFIFO:
 	case VSOCK:
 		mp = vp->v_mount;
 		break;
 	case VBLK:
 		vn_printf(vp, "softdep_bp_to_mp: unexpected block device\n");
 		/* FALLTHROUGH */
 	case VNON:
 	case VBAD:
 	case VMARKER:
 		mp = NULL;
 		break;
 	default:
 		vn_printf(vp, "unknown vnode type");
 		mp = NULL;
 		break;
 	}
 	return (VFSTOUFS(mp));
 }
 
 /*
  * Function to determine if the buffer has outstanding dependencies
  * that will cause a roll-back if the buffer is written. If wantcount
  * is set, return number of dependencies, otherwise just yes or no.
  *
  * Returns 0 when softdep_bp_to_mp() finds no mount for the buffer.
  * Otherwise the buffer's work items are walked under the per-mount
  * softdep lock; with wantcount clear the walk stops at the first
  * dependency found, so the result is then 0 or 1.
  */
 static int
 softdep_count_dependencies(bp, wantcount)
 	struct buf *bp;
 	int wantcount;
 {
 	struct worklist *wk;
 	struct ufsmount *ump;
 	struct bmsafemap *bmsafemap;
 	struct freework *freework;
 	struct inodedep *inodedep;
 	struct indirdep *indirdep;
 	struct freeblks *freeblks;
 	struct allocindir *aip;
 	struct pagedep *pagedep;
 	struct dirrem *dirrem;
 	struct newblk *newblk;
 	struct mkdir *mkdir;
 	struct diradd *dap;
 	int i, retval;
 
 	ump = softdep_bp_to_mp(bp);
 	if (ump == NULL)
 		return (0);
 	retval = 0;
 	ACQUIRE_LOCK(ump);
 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 		switch (wk->wk_type) {
 
 		case D_INODEDEP:
 			inodedep = WK_INODEDEP(wk);
 			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 				/* bitmap allocation dependency */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
 				/* direct block pointer dependency */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
 				/* direct block pointer dependency (id_extupdt list) */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
 				/* Add reference dependency. */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			continue;
 
 		case D_INDIRDEP:
 			indirdep = WK_INDIRDEP(wk);
 
 			/* Each list entry counts as one dependency. */
 			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
 				/* indirect truncation dependency */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 
 			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
 				/* indirect block pointer dependency */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			continue;
 
 		case D_PAGEDEP:
 			pagedep = WK_PAGEDEP(wk);
 			/* Only dirrems with journal remove refs are counted. */
 			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
 				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
 					/* Journal remove ref dependency. */
 					retval += 1;
 					if (!wantcount)
 						goto out;
 				}
 			}
 			for (i = 0; i < DAHASHSZ; i++) {
 
 				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
 					/* directory entry dependency */
 					retval += 1;
 					if (!wantcount)
 						goto out;
 				}
 			}
 			continue;
 
 		case D_BMSAFEMAP:
 			bmsafemap = WK_BMSAFEMAP(wk);
 			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
 				/* Add reference dependency. */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
 				/* Allocate block dependency. */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			continue;
 
 		case D_FREEBLKS:
 			freeblks = WK_FREEBLKS(wk);
 			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
 				/* Freeblk journal dependency. */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			continue;
 
 		case D_ALLOCDIRECT:
 		case D_ALLOCINDIR:
 			newblk = WK_NEWBLK(wk);
 			if (newblk->nb_jnewblk) {
 				/* Journal allocate dependency. */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			continue;
 
 		case D_MKDIR:
 			mkdir = WK_MKDIR(wk);
 			if (mkdir->md_jaddref) {
 				/* Journal reference dependency. */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			continue;
 
 		case D_FREEWORK:
 		case D_FREEDEP:
 		case D_JSEGDEP:
 		case D_JSEG:
 		case D_SBDEP:
 			/* never a dependency on these blocks */
 			continue;
 
 		default:
 			panic("softdep_count_dependencies: Unexpected type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 out:
 	/* Reached on early exit and on normal loop completion alike. */
 	FREE_LOCK(ump);
 	return (retval);
 }
 
 /*
  * Acquire exclusive access to a buffer.
  * Must be called with a locked mtx parameter.
  * Return acquired buffer or NULL on failure.
  *
  * bp      - buffer to lock.
  * lock    - rwlock held (write-locked) by the caller; it may be dropped
  *           and re-acquired here but is held again on every return.
  * waitfor - MNT_WAIT permits sleeping for the buffer; any other value
  *           makes every contended case return NULL without sleeping.
  *
  * On success the buffer is locked, has B_DELWRI set, and has been passed
  * to bremfree().  Whenever this function had to sleep it returns NULL,
  * so the caller must re-evaluate its state after a NULL return.
  */
 static struct buf *
 getdirtybuf(bp, lock, waitfor)
 	struct buf *bp;
 	struct rwlock *lock;
 	int waitfor;
 {
 	int error;
 
 	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
 		if (waitfor != MNT_WAIT)
 			return (NULL);
 		error = BUF_LOCK(bp,
 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
 		/*
 		 * Even if we successfully acquire bp here, we have dropped
 		 * lock, which may violate our guarantee.
 		 */
 		if (error == 0)
 			BUF_UNLOCK(bp);
 		else if (error != ENOLCK)
 			panic("getdirtybuf: inconsistent lock: %d", error);
 		rw_wlock(lock);
 		return (NULL);
 	}
 	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
 		/*
 		 * The flag is re-tested under the buffer object's own lock
 		 * when the caller's lock is a different one.
 		 */
 		if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) {
 			rw_wunlock(lock);
 			BO_LOCK(bp->b_bufobj);
 			BUF_UNLOCK(bp);
 			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
 				bp->b_vflags |= BV_BKGRDWAIT;
 				msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
 				       PRIBIO | PDROP, "getbuf", 0);
 			} else
 				BO_UNLOCK(bp->b_bufobj);
 			rw_wlock(lock);
 			return (NULL);
 		}
 		BUF_UNLOCK(bp);
 		if (waitfor != MNT_WAIT)
 			return (NULL);
 #ifdef DEBUG_VFS_LOCKS
 		if (bp->b_vp->v_type != VCHR)
 			ASSERT_BO_WLOCKED(bp->b_bufobj);
 #endif
 		bp->b_vflags |= BV_BKGRDWAIT;
 		rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
 		return (NULL);
 	}
 	/* Only delayed-write buffers are of interest to the caller. */
 	if ((bp->b_flags & B_DELWRI) == 0) {
 		BUF_UNLOCK(bp);
 		return (NULL);
 	}
 	bremfree(bp);
 	return (bp);
 }
 
 
 /*
  * Check if it is safe to suspend the file system now.  On entry,
  * the vnode interlock for devvp should be held.  Return 0 with
  * the mount interlock held if the file system can be suspended now,
  * otherwise return EAGAIN with the mount interlock held.
  *
  * The code asserts that devvp's buffer-object lock is write-held on
  * entry and releases it before returning on both paths.
  *
  * softdep_depcnt, softdep_accdepcnt - softdep counters sampled earlier by
  *     the caller; compared against the current per-mount values.
  * secondary_writes, secondary_accwrites - secondary-write counters sampled
  *     earlier by the caller; compared against the current mount values.
  */
 int
 softdep_check_suspend(struct mount *mp,
 		      struct vnode *devvp,
 		      int softdep_depcnt,
 		      int softdep_accdepcnt,
 		      int secondary_writes,
 		      int secondary_accwrites)
 {
 	struct bufobj *bo;
 	struct ufsmount *ump;
 	struct inodedep *inodedep;
 	int error, unlinked;
 
 	bo = &devvp->v_bufobj;
 	ASSERT_BO_WLOCKED(bo);
 
 	/*
 	 * If we are not running with soft updates, then we need only
 	 * deal with secondary writes as we try to suspend.
 	 */
 	if (MOUNTEDSOFTDEP(mp) == 0) {
 		MNT_ILOCK(mp);
 		while (mp->mnt_secondary_writes != 0) {
 			/* msleep() drops the mount interlock (PDROP). */
 			BO_UNLOCK(bo);
 			msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
 			    (PUSER - 1) | PDROP, "secwr", 0);
 			BO_LOCK(bo);
 			MNT_ILOCK(mp);
 		}
 
 		/*
 		 * Reasons for needing more work before suspend:
 		 * - Dirty buffers on devvp.
 		 * - Secondary writes occurred after start of vnode sync loop
 		 */
 		error = 0;
 		if (bo->bo_numoutput > 0 ||
 		    bo->bo_dirty.bv_cnt > 0 ||
 		    secondary_writes != 0 ||
 		    mp->mnt_secondary_writes != 0 ||
 		    secondary_accwrites != mp->mnt_secondary_accwrites)
 			error = EAGAIN;
 		BO_UNLOCK(bo);
 		return (error);
 	}
 
 	/*
 	 * If we are running with soft updates, then we need to coordinate
 	 * with them as we try to suspend.
 	 */
 	ump = VFSTOUFS(mp);
 	for (;;) {
 		/*
 		 * The softdep lock is only try-acquired while the bufobj
 		 * lock is held; on failure, drop the bufobj lock, wait for
 		 * the softdep lock to become available, and start over.
 		 */
 		if (!TRY_ACQUIRE_LOCK(ump)) {
 			BO_UNLOCK(bo);
 			ACQUIRE_LOCK(ump);
 			FREE_LOCK(ump);
 			BO_LOCK(bo);
 			continue;
 		}
 		MNT_ILOCK(mp);
 		if (mp->mnt_secondary_writes != 0) {
 			FREE_LOCK(ump);
 			BO_UNLOCK(bo);
 			msleep(&mp->mnt_secondary_writes,
 			       MNT_MTX(mp),
 			       (PUSER - 1) | PDROP, "secwr", 0);
 			BO_LOCK(bo);
 			continue;
 		}
 		break;
 	}
 
 	/*
 	 * Count the entries on the unlinked list that have all three
 	 * UNLINK* state bits set and pass check_inodedep_free(); this
 	 * many dependencies are tolerated by the comparison below.
 	 */
 	unlinked = 0;
 	if (MOUNTEDSUJ(mp)) {
 		for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked);
 		    inodedep != NULL;
 		    inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
 			if ((inodedep->id_state & (UNLINKED | UNLINKLINKS |
 			    UNLINKONLIST)) != (UNLINKED | UNLINKLINKS |
 			    UNLINKONLIST) ||
 			    !check_inodedep_free(inodedep))
 				continue;
 			unlinked++;
 		}
 	}
 
 	/*
 	 * Reasons for needing more work before suspend:
 	 * - Dirty buffers on devvp.
 	 * - Softdep activity occurred after start of vnode sync loop
 	 * - Secondary writes occurred after start of vnode sync loop
 	 */
 	error = 0;
 	if (bo->bo_numoutput > 0 ||
 	    bo->bo_dirty.bv_cnt > 0 ||
 	    softdep_depcnt != unlinked ||
 	    ump->softdep_deps != unlinked ||
 	    softdep_accdepcnt != ump->softdep_accdeps ||
 	    secondary_writes != 0 ||
 	    mp->mnt_secondary_writes != 0 ||
 	    secondary_accwrites != mp->mnt_secondary_accwrites)
 		error = EAGAIN;
 	FREE_LOCK(ump);
 	BO_UNLOCK(bo);
 	return (error);
 }
 
 
 /*
  * Report the softdep dependency counters of a mount: the current count
  * through softdep_depsp and the accumulated count through
  * softdep_accdepsp.  Both are reported as zero when the filesystem is
  * not mounted with soft updates.  The counters are sampled together
  * under the per-mount softdep lock.
  */
 void
 softdep_get_depcounts(struct mount *mp,
 		      int *softdep_depsp,
 		      int *softdep_accdepsp)
 {
 	struct ufsmount *ump;
 	int curdeps, accdeps;
 
 	curdeps = 0;
 	accdeps = 0;
 	if (MOUNTEDSOFTDEP(mp) != 0) {
 		ump = VFSTOUFS(mp);
 		ACQUIRE_LOCK(ump);
 		curdeps = ump->softdep_deps;
 		accdeps = ump->softdep_accdeps;
 		FREE_LOCK(ump);
 	}
 	*softdep_depsp = curdeps;
 	*softdep_accdepsp = accdeps;
 }
 
 /*
  * Block until the writes in progress on vp's buffer object have
  * finished.  The vnode must be locked; the result of the wait is
  * deliberately ignored.
  */
 static void
 drain_output(vp)
 	struct vnode *vp;
 {
 	struct bufobj *bo;
 
 	ASSERT_VOP_LOCKED(vp, "drain_output");
 	bo = &vp->v_bufobj;
 	(void)bufobj_wwait(bo, 0, 0);
 }
 
 /*
  * Called whenever a buffer that is being invalidated or reallocated
  * contains dependencies. This should only happen if an I/O error has
  * occurred. The routine is called with the buffer locked.
  */ 
 static void
 softdep_deallocate_dependencies(bp)
 	struct buf *bp;
 {
 
 	if ((bp->b_ioflags & BIO_ERROR) == 0)
 		panic("softdep_deallocate_dependencies: dangling deps");
 	if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
 		softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
 	else
 		printf("softdep_deallocate_dependencies: "
 		    "got error %d while accessing filesystem\n", bp->b_error);
 	if (bp->b_error != ENXIO)
 		panic("softdep_deallocate_dependencies: unrecovered I/O error");
 }
 
 /*
  * Function to handle asynchronous write errors in the filesystem.
  *
  * func  - text printed as the message prefix; callers in this file pass
  *         either a "function: operation" tag or a mount point name.
  * error - error number to report.
  *
  * The error is only logged with printf(); no recovery is attempted.
  */
 static void
 softdep_error(func, error)
 	char *func;
 	int error;
 {
 
 	/* XXX should do something better! */
 	printf("%s: got error %d while accessing filesystem\n", func, error);
 }
 
 #ifdef DDB
 
 /* exported to ffs_vfsops.c */
 extern void db_print_ffs(struct ufsmount *ump);
 void
 db_print_ffs(struct ufsmount *ump)
 {
 	db_printf("mp %p (%s) devvp %p\n", ump->um_mountp,
 	    ump->um_mountp->mnt_stat.f_mntonname, ump->um_devvp);
 	db_printf("    fs %p su_wl %d su_deps %d su_req %d\n",
 	    ump->um_fs, ump->softdep_on_worklist,
 	    ump->softdep_deps, ump->softdep_req);
 }
 
 /*
  * Debugger helper: print one worklist item.  The terse form is a single
  * line with type, address and state; the verbose form also shows the
  * next list entry and a summary of the owning mount.
  */
 static void
 worklist_print(struct worklist *wk, int verbose)
 {
 
 	if (verbose) {
 		db_printf("worklist: %p type %s state 0x%b next %p\n    ",
 		    wk, TYPENAME(wk->wk_type), (u_int)wk->wk_state,
 		    PRINT_SOFTDEP_FLAGS, LIST_NEXT(wk, wk_list));
 		db_print_ffs(VFSTOUFS(wk->wk_mp));
 	} else {
 		db_printf("%s: %p state 0x%b\n",
 		    TYPENAME(wk->wk_type), wk,
 		    (u_int)wk->wk_state, PRINT_SOFTDEP_FLAGS);
 	}
 }
 
 /*
  * Debugger helper: print an inodedep.  The worklist header and the
  * identifying fields are always shown; the dependency list heads and
  * saved values are added when verbose is non-zero.
  */
 static void
 inodedep_print(struct inodedep *inodedep, int verbose)
 {
 
 	worklist_print(&inodedep->id_list, 0);
 	db_printf("    fs %p ino %jd inoblk %jd delta %jd nlink %jd\n",
 	    inodedep->id_fs, (intmax_t)inodedep->id_ino,
 	    (intmax_t)fsbtodb(inodedep->id_fs,
 	        ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
 	    (intmax_t)inodedep->id_nlinkdelta,
 	    (intmax_t)inodedep->id_savednlink);
 
 	if (verbose != 0) {
 		db_printf("    bmsafemap %p, mkdiradd %p, inoreflst %p\n",
 		    inodedep->id_bmsafemap, inodedep->id_mkdiradd,
 		    TAILQ_FIRST(&inodedep->id_inoreflst));
 		db_printf("    dirremhd %p, pendinghd %p, bufwait %p\n",
 		    LIST_FIRST(&inodedep->id_dirremhd),
 		    LIST_FIRST(&inodedep->id_pendinghd),
 		    LIST_FIRST(&inodedep->id_bufwait));
 		db_printf("    inowait %p, inoupdt %p, newinoupdt %p\n",
 		    LIST_FIRST(&inodedep->id_inowait),
 		    TAILQ_FIRST(&inodedep->id_inoupdt),
 		    TAILQ_FIRST(&inodedep->id_newinoupdt));
 		db_printf("    extupdt %p, newextupdt %p, freeblklst %p\n",
 		    TAILQ_FIRST(&inodedep->id_extupdt),
 		    TAILQ_FIRST(&inodedep->id_newextupdt),
 		    TAILQ_FIRST(&inodedep->id_freeblklst));
 		db_printf("    saveino %p, savedsize %jd, savedextsize %jd\n",
 		    inodedep->id_savedino1,
 		    (intmax_t)inodedep->id_savedsize,
 		    (intmax_t)inodedep->id_savedextsize);
 	}
 }
 
 /*
  * Debugger helper: print a newblk -- its worklist header, the new block
  * number, the structures it points at, and the heads of its lists.
  */
 static void
 newblk_print(struct newblk *nbp)
 {
 
 	worklist_print(&nbp->nb_list, 0);
 	db_printf("    newblkno %jd\n", (intmax_t)nbp->nb_newblkno);
 	/*
 	 * Print the pointers themselves, not the addresses of the fields
 	 * holding them, so the values can be fed to other show commands.
 	 */
 	db_printf("    jnewblk %p, bmsafemap %p, freefrag %p\n",
 	    nbp->nb_jnewblk,
 	    nbp->nb_bmsafemap,
 	    nbp->nb_freefrag);
 	db_printf("    indirdeps %p, newdirblk %p, jwork %p\n",
 	    LIST_FIRST(&nbp->nb_indirdeps),
 	    LIST_FIRST(&nbp->nb_newdirblk),
 	    LIST_FIRST(&nbp->nb_jwork));
 }
 
 static void
 allocdirect_print(struct allocdirect *adp)
 {
 
 	newblk_print(&adp->ad_block);
 	db_printf("    oldblkno %jd, oldsize %ld, newsize %ld\n",
 	    adp->ad_oldblkno, adp->ad_oldsize, adp->ad_newsize);
 	db_printf("    offset %d, inodedep %p\n",
 	    adp->ad_offset, adp->ad_inodedep);
 }
 
 /*
  * Debugger helper: print an allocindir -- the embedded newblk followed
  * by the old block number, logical block number, offset and indirdep.
  */
 static void
 allocindir_print(struct allocindir *aip)
 {
 
 	newblk_print(&aip->ai_block);
 	db_printf("    oldblkno %jd, lbn %jd\n",
 	    (intmax_t)aip->ai_oldblkno,
 	    (intmax_t)aip->ai_lbn);
 	db_printf("    offset %d, indirdep %p\n",
 	    aip->ai_offset,
 	    aip->ai_indirdep);
 }
 
 /*
  * Debugger helper: print a mkdir item -- its worklist header and the
  * diradd, jaddref and buffer it references.
  */
 static void
 mkdir_print(struct mkdir *mkdir)
 {
 
 	worklist_print(&mkdir->md_list, 0);
 	db_printf("    diradd %p, jaddref %p, buf %p\n",
 	    mkdir->md_diradd,
 	    mkdir->md_jaddref,
 	    mkdir->md_buf);
 }
 
 /* "show sd_inodedep <addr>": verbose dump of the inodedep at addr. */
 DB_SHOW_COMMAND(sd_inodedep, db_show_sd_inodedep)
 {
 
 	if (have_addr != 0)
 		inodedep_print((struct inodedep*)addr, 1);
 	else
 		db_printf("inodedep address required\n");
 }
 
 /*
  * "show sd_allinodedeps <addr>": terse dump of every inodedep hashed on
  * the ufsmount at addr.
  */
 DB_SHOW_COMMAND(sd_allinodedeps, db_show_sd_allinodedeps)
 {
 	struct inodedep_hashhead *inodedephd;
 	struct inodedep *inodedep;
 	struct ufsmount *ump;
 	int cnt;
 
 	if (have_addr == 0) {
 		db_printf("ufsmount address required\n");
 		return;
 	}
 	ump = (struct ufsmount *)addr;
 	/*
 	 * Bucket indices run from 0 through inodedep_hash_size inclusive
 	 * (clear_inodedeps() wraps its cursor only after it exceeds
 	 * inodedep_hash_size), so the last bucket must be visited too.
 	 */
 	for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
 		inodedephd = &ump->inodedep_hashtbl[cnt];
 		LIST_FOREACH(inodedep, inodedephd, id_hash) {
 			inodedep_print(inodedep, 0);
 		}
 	}
 }
 
 /* "show sd_worklist <addr>": verbose dump of the worklist item at addr. */
 DB_SHOW_COMMAND(sd_worklist, db_show_sd_worklist)
 {
 
 	if (have_addr != 0)
 		worklist_print((struct worklist *)addr, 1);
 	else
 		db_printf("worklist address required\n");
 }
 
 /*
  * "show sd_workhead <addr>": dump every item on the work list that the
  * item at addr belongs to, using the type-specific printer where one
  * exists and the generic worklist printer otherwise.
  */
 DB_SHOW_COMMAND(sd_workhead, db_show_sd_workhead)
 {
 	struct worklist *wk;
 	struct workhead *wkhd;
 
 	if (have_addr == 0) {
 		db_printf("worklist address required "
 		    "(for example value in bp->b_dep)\n");
 		return;
 	}
 	/*
 	 * The caller usually has a pointer to the first entry (the
 	 * contents of bp->b_dep) rather than the address of the list
 	 * head (&bp->b_dep).  The first entry's back pointer refers to
 	 * the head, so it is used in place of the head address.  Per
 	 * the original note, starting from an entry in the middle of
 	 * the list is expected to give the same result.
 	 */
 	wk = (struct worklist *)addr;
 	if (wk == NULL)
 		return;
 	wkhd = (struct workhead *)wk->wk_list.le_prev;
 	LIST_FOREACH(wk, wkhd, wk_list) {
 		switch(wk->wk_type) {
 		case D_INODEDEP:
 			inodedep_print(WK_INODEDEP(wk), 0);
 			break;
 		case D_ALLOCDIRECT:
 			allocdirect_print(WK_ALLOCDIRECT(wk));
 			break;
 		case D_ALLOCINDIR:
 			allocindir_print(WK_ALLOCINDIR(wk));
 			break;
 		case D_MKDIR:
 			mkdir_print(WK_MKDIR(wk));
 			break;
 		default:
 			worklist_print(wk, 0);
 			break;
 		}
 	}
 }
 
 /* "show sd_mkdir <addr>": dump the mkdir item at addr. */
 DB_SHOW_COMMAND(sd_mkdir, db_show_sd_mkdir)
 {
 	if (have_addr != 0)
 		mkdir_print((struct mkdir *)addr);
 	else
 		db_printf("mkdir address required\n");
 }
 
 /*
  * "show sd_mkdir_list <addr>": dump every mkdir item on the list whose
  * head is at addr, followed by the worklist headers of the diradd and
  * jaddref each one references (when present).
  */
 DB_SHOW_COMMAND(sd_mkdir_list, db_show_sd_mkdir_list)
 {
 	struct mkdirlist *head;
 	struct mkdir *md;
 
 	if (have_addr == 0) {
 		db_printf("mkdir listhead address required\n");
 		return;
 	}
 	head = (struct mkdirlist *)addr;
 	LIST_FOREACH(md, head, md_mkdirs) {
 		mkdir_print(md);
 		if (md->md_diradd != NULL) {
 			db_printf("    ");
 			worklist_print(&md->md_diradd->da_list, 0);
 		}
 		if (md->md_jaddref != NULL) {
 			db_printf("    ");
 			worklist_print(&md->md_jaddref->ja_list, 0);
 		}
 	}
 }
 
 /* "show sd_allocdirect <addr>": dump the allocdirect at addr. */
 DB_SHOW_COMMAND(sd_allocdirect, db_show_sd_allocdirect)
 {
 	if (have_addr != 0)
 		allocdirect_print((struct allocdirect *)addr);
 	else
 		db_printf("allocdirect address required\n");
 }
 
 /* "show sd_allocindir <addr>": dump the allocindir at addr. */
 DB_SHOW_COMMAND(sd_allocindir, db_show_sd_allocindir)
 {
 	if (have_addr != 0)
 		allocindir_print((struct allocindir *)addr);
 	else
 		db_printf("allocindir address required\n");
 }
 
 #endif /* DDB */
 
 #endif /* SOFTUPDATES */
Index: projects/clang1100-import/sys/ufs/ffs/ffs_vfsops.c
===================================================================
--- projects/clang1100-import/sys/ufs/ffs/ffs_vfsops.c	(revision 364278)
+++ projects/clang1100-import/sys/ufs/ffs/ffs_vfsops.c	(revision 364279)
@@ -1,2684 +1,2683 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1991, 1993, 1994
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_vfsops.c	8.31 (Berkeley) 5/20/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_quota.h"
 #include "opt_ufs.h"
 #include "opt_ffs.h"
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/gsb_crc32.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/taskqueue.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/ioccom.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <ufs/ufs/dir.h>
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/gjournal.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufs_extern.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
 
 #include <vm/vm.h>
 #include <vm/uma.h>
 #include <vm/vm_page.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
 
 #include <ddb/ddb.h>
 
 /*
  * File-local UMA zones.
  * NOTE(review): judging by the names these presumably back in-core inodes
  * and UFS1/UFS2 dinode copies -- confirm against ffs_init() and ffs_vgetf().
  */
 static uma_zone_t uma_inode, uma_ufs1, uma_ufs2;
 VFS_SMR_DECLARE;
 
 /* Forward declarations of file-local helpers. */
 static int	ffs_mountfs(struct vnode *, struct mount *, struct thread *);
 static void	ffs_oldfscompat_read(struct fs *, struct ufsmount *,
 		    ufs2_daddr_t);
 static void	ffs_ifree(struct ufsmount *ump, struct inode *ip);
 static int	ffs_sync_lazy(struct mount *mp);
 static int	ffs_use_bread(void *devfd, off_t loc, void **bufp, int size);
 static int	ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size);
 
 /* VFS entry points, typed through the vfs_*_t function typedefs. */
 static vfs_init_t ffs_init;
 static vfs_uninit_t ffs_uninit;
 static vfs_extattrctl_t ffs_extattrctl;
 static vfs_cmount_t ffs_cmount;
 static vfs_unmount_t ffs_unmount;
 static vfs_mount_t ffs_mount;
 static vfs_statfs_t ffs_statfs;
 static vfs_fhtovp_t ffs_fhtovp;
 static vfs_sync_t ffs_sync;
 
 /*
  * VFS operations vector for this filesystem, registered under the name
  * "ufs" by VFS_SET() below.  Note that .vfs_root is the generic
  * vfs_cache_root while the filesystem's own ufs_root is installed as
  * .vfs_cachedroot.
  */
 static struct vfsops ufs_vfsops = {
 	.vfs_extattrctl =	ffs_extattrctl,
 	.vfs_fhtovp =		ffs_fhtovp,
 	.vfs_init =		ffs_init,
 	.vfs_mount =		ffs_mount,
 	.vfs_cmount =		ffs_cmount,
 	.vfs_quotactl =		ufs_quotactl,
 	.vfs_root =		vfs_cache_root,
 	.vfs_cachedroot =	ufs_root,
 	.vfs_statfs =		ffs_statfs,
 	.vfs_sync =		ffs_sync,
 	.vfs_uninit =		ffs_uninit,
 	.vfs_unmount =		ffs_unmount,
 	.vfs_vget =		ffs_vget,
 	.vfs_susp_clean =	process_deferred_inactive,
 };
 
 VFS_SET(ufs_vfsops, ufs, 0);
 MODULE_VERSION(ufs, 1);
 
 static b_strategy_t ffs_geom_strategy;
 static b_write_t ffs_bufwrite;
 
 /*
  * Buffer operations vector named "FFS".  The bdflush hook falls back to
  * the generic bufbdflush when the kernel is built with NO_FFS_SNAPSHOT;
  * otherwise the FFS-specific ffs_bdflush is used.
  */
 static struct buf_ops ffs_ops = {
 	.bop_name =	"FFS",
 	.bop_write =	ffs_bufwrite,
 	.bop_strategy =	ffs_geom_strategy,
 	.bop_sync =	bufsync,
 #ifdef NO_FFS_SNAPSHOT
 	.bop_bdflush =	bufbdflush,
 #else
 	.bop_bdflush =	ffs_bdflush,
 #endif
 };
 
 /*
  * NULL-terminated list of mount option names.
  * NOTE(review): presumably the set of options accepted at mount time --
  * confirm against the consumer in ffs_mount().
  *
  * Note that userquota and groupquota options are not currently used
  * by UFS/FFS code and generally mount(8) does not pass those options
  * from userland, but they can be passed by loader(8) via
  * vfs.root.mountfrom.options.
  */
 static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr",
     "noclusterw", "noexec", "export", "force", "from", "groupquota",
     "multilabel", "nfsv4acls", "fsckpid", "snapshot", "nosuid", "suiddir",
     "nosymfollow", "sync", "union", "userquota", "untrusted", NULL };
 
 /*
  * Tunable vfs.ffs.enxio_enable (default 1, read-write, settable as a
  * loader tunable): enables mapping of other disk I/O errors to ENXIO.
  */
 static int ffs_enxio_enable = 1;
 SYSCTL_DECL(_vfs_ffs);
 SYSCTL_INT(_vfs_ffs, OID_AUTO, enxio_enable, CTLFLAG_RWTUN,
     &ffs_enxio_enable, 0,
     "enable mapping of other disk I/O errors to ENXIO");
 
 /*
  * Read the block of vnode "vp" that contains byte "offset" and hand the
  * buffer back through "bpp".  When "res" is non-NULL it is set to the
  * address of "offset" within the buffer's data.  On a read error the
  * error is returned and "*bpp" is left NULL.
  */
 static int
 ffs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp)
 {
 	struct inode *ip = VTOI(vp);
 	struct fs *fs = ITOFS(ip);
 	struct buf *bp;
 	ufs_lbn_t lbn;
 	int bsize, error;
 
 	*bpp = NULL;
 	lbn = lblkno(fs, offset);
 	bsize = blksize(fs, ip, lbn);
 	error = bread(vp, lbn, bsize, NOCRED, &bp);
 	if (error != 0)
 		return (error);
 	if (res != NULL)
 		*res = (char *)bp->b_data + blkoff(fs, offset);
 	*bpp = bp;
 	return (0);
 }
 
 /*
  * Copy the on-disk inode "ino" out of inode block buffer "bp" into the
  * dinode attached to "ip", then mirror the frequently used fields
  * (mode, link counts, size, flags, generation, uid, gid) into the
  * in-core inode.  For UFS2 the dinode check-hash is verified first; a
  * failure is reported and returned unless ffs_fsfail_cleanup() says a
  * cleanup is in progress.  Returns 0 on success.
  */
 static int
 ffs_load_inode(struct buf *bp, struct inode *ip, struct fs *fs, ino_t ino)
 {
 	struct ufs1_dinode *dip1;
 	struct ufs2_dinode *dip2;
 	int error;
 
 	if (!I_IS_UFS1(ip)) {
 		dip2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino);
 		error = ffs_verify_dinode_ckhash(fs, dip2);
 		if (error != 0 && !ffs_fsfail_cleanup(ITOUMP(ip), error)) {
 			printf("%s: inode %jd: check-hash failed\n",
 			    fs->fs_fsmnt, (intmax_t)ino);
 			return (error);
 		}
 		/* Work from the private copy from here on. */
 		*ip->i_din2 = *dip2;
 		dip2 = ip->i_din2;
 		ip->i_mode = dip2->di_mode;
 		ip->i_nlink = dip2->di_nlink;
 		ip->i_effnlink = dip2->di_nlink;
 		ip->i_size = dip2->di_size;
 		ip->i_flags = dip2->di_flags;
 		ip->i_gen = dip2->di_gen;
 		ip->i_uid = dip2->di_uid;
 		ip->i_gid = dip2->di_gid;
 		return (0);
 	}
 
 	dip1 = ip->i_din1;
 	*dip1 = *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino));
 	ip->i_mode = dip1->di_mode;
 	ip->i_nlink = dip1->di_nlink;
 	ip->i_effnlink = dip1->di_nlink;
 	ip->i_size = dip1->di_size;
 	ip->i_flags = dip1->di_flags;
 	ip->i_gen = dip1->di_gen;
 	ip->i_uid = dip1->di_uid;
 	ip->i_gid = dip1->di_gid;
 	return (0);
 }
 
 /*
  * Verify that a filesystem block number is a valid data block.
  * This routine is only called on untrusted filesystems.
  *
  * mp      - mount being checked; must have MNT_UNTRUSTED set.
  * inum    - inode owning the block pointer (used only in the message).
  * daddr   - filesystem block address to validate.
  * blksize - size in bytes of the block starting at daddr.
  *
  * Returns 0 if the address is acceptable, EINTEGRITY otherwise.  A
  * rate-limited message is printed on failure.  The UFS mutex may or may
  * not be held on entry; its state on return matches its state on entry.
  */
 static int
 ffs_check_blkno(struct mount *mp, ino_t inum, ufs2_daddr_t daddr, int blksize)
 {
 	struct fs *fs;
 	struct ufsmount *ump;
 	ufs2_daddr_t end_daddr;
 	int cg, havemtx;
 
 	KASSERT((mp->mnt_flag & MNT_UNTRUSTED) != 0,
 	    ("ffs_check_blkno called on a trusted file system"));
 	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;
 	cg = dtog(fs, daddr);
 	end_daddr = daddr + numfrags(fs, blksize);
 	/*
 	 * Verify that the block number is a valid data block. Also check
 	 * that it does not point to an inode block or a superblock. Accept
 	 * blocks that are unallocated (0) or part of snapshot metadata
 	 * (BLK_NOCOPY or BLK_SNAP).
 	 *
 	 * Thus, the block must be in a valid range for the filesystem and
 	 * either in the space before a backup superblock (except the first
 	 * cylinder group where that space is used by the bootstrap code) or
 	 * after the inode blocks and before the end of the cylinder group.
 	 */
 	if ((uint64_t)daddr <= BLK_SNAP ||
 	    ((uint64_t)end_daddr <= fs->fs_size &&
 	    ((cg > 0 && end_daddr <= cgsblock(fs, cg)) ||
 	    (daddr >= cgdmin(fs, cg) &&
 	    end_daddr <= cgbase(fs, cg) + fs->fs_fpg))))
 		return (0);
 	/* Take the UFS mutex for the rate check unless already held. */
 	if ((havemtx = mtx_owned(UFS_MTX(ump))) == 0)
 		UFS_LOCK(ump);
 	if (ppsratecheck(&ump->um_last_integritymsg,
 	    &ump->um_secs_integritymsg, 1)) {
 		/* Print with the mutex dropped, then restore its state. */
 		UFS_UNLOCK(ump);
 		uprintf("\n%s: inode %jd, out-of-range indirect block "
 		    "number %jd\n", mp->mnt_stat.f_mntonname,
 		    (intmax_t)inum, (intmax_t)daddr);
 		if (havemtx)
 			UFS_LOCK(ump);
 	} else if (!havemtx)
 		UFS_UNLOCK(ump);
 	return (EINTEGRITY);
 }
 
 /*
  * Task handler that starts a forced unmount.
  * Used to unmount filesystems whose underlying media has gone away.
  *
  * "v" is the fsfail_task that was queued; it names the filesystem by
  * fsid and is freed here whether or not the mount is still found.
  * "pending" is not used.
  */
 static void
 ffs_fsfail_unmount(void *v, int pending)
 {
 	struct fsfail_task *failtask = v;
 	struct mount *failed_mp;
 
 	/*
 	 * Look the mount up by fsid; if it still exists, force it off.
 	 */
 	failed_mp = vfs_getvfs(&failtask->fsid);
 	if (failed_mp != NULL)
 		dounmount(failed_mp, MNT_FORCE, curthread);
 	free(failtask, M_UFSMNT);
 }
 
 /*
  * Unlocked front end for ffs_fsfail_cleanup_locked(): takes the UFS
  * mutex around the call.  On the first ENXIO error this starts a task
  * that forcibly unmounts the filesystem.
  *
  * Returns non-zero if a cleanup is in progress.
  */
 int
 ffs_fsfail_cleanup(struct ufsmount *ump, int error)
 {
 	int in_cleanup;
 
 	UFS_LOCK(ump);
 	in_cleanup = ffs_fsfail_cleanup_locked(ump, error);
 	UFS_UNLOCK(ump);
 
 	return (in_cleanup);
 }
 
 int
 ffs_fsfail_cleanup_locked(struct ufsmount *ump, int error)
 {
 	struct fsfail_task *etp;
 	struct task *tp;
 
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 	if (error == ENXIO && (ump->um_flags & UM_FSFAIL_CLEANUP) == 0) {
 		ump->um_flags |= UM_FSFAIL_CLEANUP;
 		/*
 		 * Queue an async forced unmount.
 		 */
 		etp = ump->um_fsfail_task;
 		ump->um_fsfail_task = NULL;
 		if (etp != NULL) {
 			tp = &etp->task;
 			TASK_INIT(tp, 0, ffs_fsfail_unmount, etp);
 			taskqueue_enqueue(taskqueue_thread, tp);
 			printf("UFS: forcibly unmounting %s from %s\n",
 			    ump->um_mountp->mnt_stat.f_mntfromname,
 			    ump->um_mountp->mnt_stat.f_mntonname);
 		}
 	}
 	return ((ump->um_flags & UM_FSFAIL_CLEANUP) != 0);
 }
 
 /*
  * breadn_flags() wrapper for use during ENXIO cleanup.  When the read
  * fails and a cleanup is in progress, hand back a zero-filled buffer
  * instead, so that the soft updates code can use it to unwind its
  * dependencies.
  *
  * GB_CVTENXIO is always added to "flags".  Returns the read error if
  * no cleanup is in progress, otherwise the result of getblkx().
  */
 int
 ffs_breadz(struct ufsmount *ump, struct vnode *vp, daddr_t lblkno,
     daddr_t dblkno, int size, daddr_t *rablkno, int *rabsize, int cnt,
     struct ucred *cred, int flags, void (*ckhashfunc)(struct buf *),
     struct buf **bpp)
 {
 	int error;
 
 	flags |= GB_CVTENXIO;
 	error = breadn_flags(vp, lblkno, dblkno, size, rablkno, rabsize, cnt,
 	    cred, flags, ckhashfunc, bpp);
 	if (error == 0)
 		return (error);
 	if (!ffs_fsfail_cleanup(ump, error))
 		return (error);
 
 	/* The real block is unreadable; substitute an empty one. */
 	error = getblkx(vp, lblkno, dblkno, size, 0, 0, flags, bpp);
 	KASSERT(error == 0, ("getblkx failed"));
 	vfs_bio_bzero_buf(*bpp, 0, size);
 	return (error);
 }
 
 /*
  * Mount an FFS filesystem, or update an existing mount (MNT_UPDATE).
  *
  * Options are taken from mp->mnt_optnew.  For an update this may
  * downgrade to read-only, reload the in-core data (MNT_RELOAD),
  * upgrade to read/write, start or stop a restricted fsck "checker"
  * pid, or take a snapshot.  In both cases the device named by the
  * "from" option is then looked up and checked; for a new mount it is
  * handed to ffs_mountfs().
  *
  * Returns 0 on success or an errno value.
  */
 static int
 ffs_mount(struct mount *mp)
 {
 	struct vnode *devvp, *odevvp;
 	struct thread *td;
 	struct ufsmount *ump = NULL;
 	struct fs *fs;
 	pid_t fsckpid = 0;
 	int error, error1, flags;
 	uint64_t mntorflags, saved_mnt_flag;
 	accmode_t accmode;
 	struct nameidata ndp;
 	char *fspec;
 
 	td = curthread;
 	if (vfs_filteropt(mp->mnt_optnew, ffs_opts))
 		return (EINVAL);
 	/* Create the inode and dinode UMA zones on first use. */
 	if (uma_inode == NULL) {
 		uma_inode = uma_zcreate("FFS inode",
 		    sizeof(struct inode), NULL, NULL, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 		uma_ufs1 = uma_zcreate("FFS1 dinode",
 		    sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 		uma_ufs2 = uma_zcreate("FFS2 dinode",
 		    sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 		VFS_SMR_ZONE_SET(uma_inode);
 	}
 
 	vfs_deleteopt(mp->mnt_optnew, "groupquota");
 	vfs_deleteopt(mp->mnt_optnew, "userquota");
 
 	fspec = vfs_getopts(mp->mnt_optnew, "from", &error);
 	if (error)
 		return (error);
 
 	/* Collect the mount flags implied by options; applied below. */
 	mntorflags = 0;
 	if (vfs_getopt(mp->mnt_optnew, "untrusted", NULL, NULL) == 0)
 		mntorflags |= MNT_UNTRUSTED;
 
 	if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0)
 		mntorflags |= MNT_ACLS;
 
 	if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) {
 		mntorflags |= MNT_SNAPSHOT;
 		/*
 		 * Once we have set the MNT_SNAPSHOT flag, do not
 		 * persist "snapshot" in the options list.
 		 */
 		vfs_deleteopt(mp->mnt_optnew, "snapshot");
 		vfs_deleteopt(mp->mnt_opt, "snapshot");
 	}
 
 	if (vfs_getopt(mp->mnt_optnew, "fsckpid", NULL, NULL) == 0 &&
 	    vfs_scanopt(mp->mnt_optnew, "fsckpid", "%d", &fsckpid) == 1) {
 		/*
 		 * Once we have set the restricted PID, do not
 		 * persist "fsckpid" in the options list.
 		 */
 		vfs_deleteopt(mp->mnt_optnew, "fsckpid");
 		vfs_deleteopt(mp->mnt_opt, "fsckpid");
 		/* A checker is only accepted on a read-only filesystem. */
 		if (mp->mnt_flag & MNT_UPDATE) {
 			if (VFSTOUFS(mp)->um_fs->fs_ronly == 0 &&
 			     vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) {
 				vfs_mount_error(mp,
 				    "Checker enable: Must be read-only");
 				return (EINVAL);
 			}
 		} else if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) {
 			vfs_mount_error(mp,
 			    "Checker enable: Must be read-only");
 			return (EINVAL);
 		}
 		/* Set to -1 if we are done */
 		if (fsckpid == 0)
 			fsckpid = -1;
 	}
 
 	if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) {
 		if (mntorflags & MNT_ACLS) {
 			vfs_mount_error(mp,
 			    "\"acls\" and \"nfsv4acls\" options "
 			    "are mutually exclusive");
 			return (EINVAL);
 		}
 		mntorflags |= MNT_NFS4ACLS;
 	}
 
 	/*
 	 * MNTK_FPLOOKUP is cleared here and set again at the end of
 	 * the function if the final mount flags allow it.
 	 */
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag &= ~MNTK_FPLOOKUP;
 	mp->mnt_flag |= mntorflags;
 	MNT_IUNLOCK(mp);
 	/*
 	 * If updating, check whether changing from read-only to
 	 * read/write; if there is no device name, that's all we do.
 	 */
 	if (mp->mnt_flag & MNT_UPDATE) {
 		ump = VFSTOUFS(mp);
 		fs = ump->um_fs;
 		odevvp = ump->um_odevvp;
 		devvp = ump->um_devvp;
 		/* fsckpid == -1 means the checker has finished. */
 		if (fsckpid == -1 && ump->um_fsckpid > 0) {
 			if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 ||
 			    (error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0)
 				return (error);
 			g_topology_lock();
 			/*
 			 * Return to normal read-only mode.
 			 */
 			/*
 			 * NOTE(review): the g_access() result stored in
 			 * "error" is not examined before it is reused below.
 			 */
 			error = g_access(ump->um_cp, 0, -1, 0);
 			g_topology_unlock();
 			ump->um_fsckpid = 0;
 		}
 		/* Downgrade from read/write to read-only. */
 		if (fs->fs_ronly == 0 &&
 		    vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
 			/*
 			 * Flush any dirty data and suspend filesystem.
 			 */
 			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
 				return (error);
 			error = vfs_write_suspend_umnt(mp);
 			if (error != 0)
 				return (error);
 			/*
 			 * Check for and optionally get rid of files open
 			 * for writing.
 			 */
 			flags = WRITECLOSE;
 			if (mp->mnt_flag & MNT_FORCE)
 				flags |= FORCECLOSE;
 			if (MOUNTEDSOFTDEP(mp)) {
 				error = softdep_flushfiles(mp, flags, td);
 			} else {
 				error = ffs_flushfiles(mp, flags, td);
 			}
 			if (error) {
 				vfs_write_resume(mp, 0);
 				return (error);
 			}
 			if (fs->fs_pendingblocks != 0 ||
 			    fs->fs_pendinginodes != 0) {
 				printf("WARNING: %s Update error: blocks %jd "
 				    "files %d\n", fs->fs_fsmnt, 
 				    (intmax_t)fs->fs_pendingblocks,
 				    fs->fs_pendinginodes);
 				fs->fs_pendingblocks = 0;
 				fs->fs_pendinginodes = 0;
 			}
 			if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
 				fs->fs_clean = 1;
 			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
 				/* Superblock write failed: stay read/write. */
 				fs->fs_ronly = 0;
 				fs->fs_clean = 0;
 				vfs_write_resume(mp, 0);
 				return (error);
 			}
 			if (MOUNTEDSOFTDEP(mp))
 				softdep_unmount(mp);
 			g_topology_lock();
 			/*
 			 * Drop our write and exclusive access.
 			 */
 			g_access(ump->um_cp, 0, -1, -1);
 			g_topology_unlock();
 			fs->fs_ronly = 1;
 			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_RDONLY;
 			MNT_IUNLOCK(mp);
 			/*
 			 * Allow the writers to note that filesystem
 			 * is ro now.
 			 */
 			vfs_write_resume(mp, 0);
 		}
 		if ((mp->mnt_flag & MNT_RELOAD) &&
 		    (error = ffs_reload(mp, td, 0)) != 0)
 			return (error);
 		/* Upgrade from read-only to read/write. */
 		if (fs->fs_ronly &&
 		    !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
 			/*
 			 * If we are running a checker, do not allow upgrade.
 			 */
 			if (ump->um_fsckpid > 0) {
 				vfs_mount_error(mp,
 				    "Active checker, cannot upgrade to write");
 				return (EINVAL);
 			}
 			/*
 			 * If upgrade to read-write by non-root, then verify
 			 * that user has necessary permissions on the device.
 			 */
 			vn_lock(odevvp, LK_EXCLUSIVE | LK_RETRY);
 			error = VOP_ACCESS(odevvp, VREAD | VWRITE,
 			    td->td_ucred, td);
 			if (error)
 				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 			VOP_UNLOCK(odevvp);
 			if (error) {
 				return (error);
 			}
 			/*
 			 * An unclean filesystem is accepted only with
 			 * MNT_FORCE, or when it uses soft updates and has
 			 * neither FS_SUJ nor FS_NEEDSFSCK set.
 			 */
 			fs->fs_flags &= ~FS_UNCLEAN;
 			if (fs->fs_clean == 0) {
 				fs->fs_flags |= FS_UNCLEAN;
 				if ((mp->mnt_flag & MNT_FORCE) ||
 				    ((fs->fs_flags &
 				     (FS_SUJ | FS_NEEDSFSCK)) == 0 &&
 				     (fs->fs_flags & FS_DOSOFTDEP))) {
 					printf("WARNING: %s was not properly "
 					   "dismounted\n", fs->fs_fsmnt);
 				} else {
 					vfs_mount_error(mp,
 					   "R/W mount of %s denied. %s.%s",
 					   fs->fs_fsmnt,
 					   "Filesystem is not clean - run fsck",
 					   (fs->fs_flags & FS_SUJ) == 0 ? "" :
 					   " Forced mount will invalidate"
 					   " journal contents");
 					return (EPERM);
 				}
 			}
 			g_topology_lock();
 			/*
 			 * Request exclusive write access.
 			 */
 			error = g_access(ump->um_cp, 0, 1, 1);
 			g_topology_unlock();
 			if (error)
 				return (error);
 			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
 				return (error);
 			error = vfs_write_suspend_umnt(mp);
 			if (error != 0)
 				return (error);
 			fs->fs_ronly = 0;
 			/*
 			 * Remember which flags are cleared so that they
 			 * can be restored if the upgrade fails below.
 			 */
 			MNT_ILOCK(mp);
 			saved_mnt_flag = MNT_RDONLY;
 			if (MOUNTEDSOFTDEP(mp) && (mp->mnt_flag &
 			    MNT_ASYNC) != 0)
 				saved_mnt_flag |= MNT_ASYNC;
 			mp->mnt_flag &= ~saved_mnt_flag;
 			MNT_IUNLOCK(mp);
 			fs->fs_mtime = time_second;
 			/* check to see if we need to start softdep */
 			if ((fs->fs_flags & FS_DOSOFTDEP) &&
 			    (error = softdep_mount(devvp, mp, fs, td->td_ucred))){
 				fs->fs_ronly = 1;
 				MNT_ILOCK(mp);
 				mp->mnt_flag |= saved_mnt_flag;
 				MNT_IUNLOCK(mp);
 				vfs_write_resume(mp, 0);
 				return (error);
 			}
 			fs->fs_clean = 0;
 			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
 				fs->fs_ronly = 1;
 				MNT_ILOCK(mp);
 				mp->mnt_flag |= saved_mnt_flag;
 				MNT_IUNLOCK(mp);
 				vfs_write_resume(mp, 0);
 				return (error);
 			}
 			if (fs->fs_snapinum[0] != 0)
 				ffs_snapshot_mount(mp);
 			vfs_write_resume(mp, 0);
 		}
 		/*
 		 * Soft updates is incompatible with "async",
 		 * so if we are doing softupdates stop the user
 		 * from setting the async flag in an update.
 		 * Softdep_mount() clears it in an initial mount
 		 * or ro->rw remount.
 		 */
 		if (MOUNTEDSOFTDEP(mp)) {
 			/* XXX: Reset too late ? */
 			MNT_ILOCK(mp);
 			mp->mnt_flag &= ~MNT_ASYNC;
 			MNT_IUNLOCK(mp);
 		}
 		/*
 		 * Keep MNT_ACLS flag if it is stored in superblock.
 		 */
 		if ((fs->fs_flags & FS_ACLS) != 0) {
 			/* XXX: Set too late ? */
 			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_ACLS;
 			MNT_IUNLOCK(mp);
 		}
 
 		if ((fs->fs_flags & FS_NFS4ACLS) != 0) {
 			/* XXX: Set too late ? */
 			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_NFS4ACLS;
 			MNT_IUNLOCK(mp);
 		}
 		/*
 		 * If this is a request from fsck to clean up the filesystem,
 		 * then allow the specified pid to proceed.
 		 */
 		if (fsckpid > 0) {
 			if (ump->um_fsckpid != 0) {
 				vfs_mount_error(mp,
 				    "Active checker already running on %s",
 				    fs->fs_fsmnt);
 				return (EINVAL);
 			}
 			KASSERT(MOUNTEDSOFTDEP(mp) == 0,
 			    ("soft updates enabled on read-only file system"));
 			g_topology_lock();
 			/*
 			 * Request write access.
 			 */
 			error = g_access(ump->um_cp, 0, 1, 0);
 			g_topology_unlock();
 			if (error) {
 				vfs_mount_error(mp,
 				    "Checker activation failed on %s",
 				    fs->fs_fsmnt);
 				return (error);
 			}
 			ump->um_fsckpid = fsckpid;
 			if (fs->fs_snapinum[0] != 0)
 				ffs_snapshot_mount(mp);
 			fs->fs_mtime = time_second;
 			fs->fs_fmod = 1;
 			fs->fs_clean = 0;
 			(void) ffs_sbupdate(ump, MNT_WAIT, 0);
 		}
 
 		/*
 		 * If this is a snapshot request, take the snapshot.
 		 */
 		if (mp->mnt_flag & MNT_SNAPSHOT)
 			return (ffs_snapshot(mp, fspec));
 
 		/*
 		 * Must not call namei() while owning busy ref.
 		 */
 		vfs_unbusy(mp);
 	}
 
 	/*
 	 * Not an update, or updating the name: look up the name
 	 * and verify that it refers to a sensible disk device.
 	 */
 	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
 	error = namei(&ndp);
 	if ((mp->mnt_flag & MNT_UPDATE) != 0) {
 		/*
 		 * Unmount does not start if MNT_UPDATE is set.  Mount
 		 * update busies mp before setting MNT_UPDATE.  We
 		 * must be able to retain our busy ref succesfully,
 		 * without sleep.
 		 */
 		error1 = vfs_busy(mp, MBF_NOWAIT);
 		MPASS(error1 == 0);
 	}
 	/* The namei() result is checked only after re-busying mp. */
 	if (error != 0)
 		return (error);
 	NDFREE(&ndp, NDF_ONLY_PNBUF);
 	devvp = ndp.ni_vp;
 	if (!vn_isdisk(devvp, &error)) {
 		vput(devvp);
 		return (error);
 	}
 
 	/*
 	 * If mount by non-root, then verify that user has necessary
 	 * permissions on the device.
 	 */
 	accmode = VREAD;
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		accmode |= VWRITE;
 	error = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
 	if (error)
 		error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 	if (error) {
 		vput(devvp);
 		return (error);
 	}
 
 	if (mp->mnt_flag & MNT_UPDATE) {
 		/*
 		 * Update only
 		 *
 		 * If it's not the same vnode, or at least the same device
 		 * then it's not correct.
 		 */
 
 		if (devvp->v_rdev != ump->um_devvp->v_rdev)
 			error = EINVAL;	/* needs translation */
 		vput(devvp);
 		if (error)
 			return (error);
 	} else {
 		/*
 		 * New mount
 		 *
 		 * We need the name for the mount point (also used for
 		 * "last mounted on") copied in. If an error occurs,
 		 * the mount point is discarded by the upper level code.
 		 * Note that vfs_mount_alloc() populates f_mntonname for us.
 		 */
 		if ((error = ffs_mountfs(devvp, mp, td)) != 0) {
 			vrele(devvp);
 			return (error);
 		}
 		/*
 		 * A checker was requested with the initial mount.  Failure
 		 * to get write access is only warned about; the mount
 		 * itself still succeeds.
 		 */
 		if (fsckpid > 0) {
 			KASSERT(MOUNTEDSOFTDEP(mp) == 0,
 			    ("soft updates enabled on read-only file system"));
 			ump = VFSTOUFS(mp);
 			fs = ump->um_fs;
 			g_topology_lock();
 			/*
 			 * Request write access.
 			 */
 			error = g_access(ump->um_cp, 0, 1, 0);
 			g_topology_unlock();
 			if (error) {
 				printf("WARNING: %s: Checker activation "
 				    "failed\n", fs->fs_fsmnt);
 			} else { 
 				ump->um_fsckpid = fsckpid;
 				if (fs->fs_snapinum[0] != 0)
 					ffs_snapshot_mount(mp);
 				fs->fs_mtime = time_second;
 				fs->fs_clean = 0;
 				(void) ffs_sbupdate(ump, MNT_WAIT, 0);
 			}
 		}
 	}
 
 	MNT_ILOCK(mp);
 	/*
 	 * This is racy versus lookup, see ufs_fplookup_vexec for details.
 	 */
 	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) != 0)
 		panic("MNTK_FPLOOKUP set on mount %p when it should not be", mp);
 	/* MNTK_FPLOOKUP is set only without ACLs and union mounts. */
 	if ((mp->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS | MNT_UNION)) == 0)
 		mp->mnt_kern_flag |= MNTK_FPLOOKUP;
 	MNT_IUNLOCK(mp);
 
 	vfs_mountedfrom(mp, fspec);
 	return (0);
 }
 
 /*
  * Compatibility with old mount system call.
  *
  * Copies a struct ufs_args in from "data", converts it into "from"
  * and "export" mount arguments and passes them to kernel_mount().
  * Returns EINVAL if "data" is NULL, the copyin() error if the copy
  * fails, and otherwise the result of kernel_mount().
  */
 
 static int
 ffs_cmount(struct mntarg *ma, void *data, uint64_t flags)
 {
 	struct ufs_args args;
 	int error;
 
 	if (data == NULL)
 		return (EINVAL);
 	if ((error = copyin(data, &args, sizeof args)) != 0)
 		return (error);
 
 	ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
 	ma = mount_arg(ma, "export", &args.export, sizeof(args.export));
 	return (kernel_mount(ma, flags));
 }
 
 /*
  * Reload all incore data for a filesystem (used after running fsck on
  * the root filesystem and finding things to fix). If the 'force' flag
  * is 0, the filesystem must be mounted read-only.
  *
  * Things to do to update the mount:
  *	1) invalidate all cached meta-data.
  *	2) re-read superblock from disk.
  *	3) re-read summary information from disk.
  *	4) invalidate all inactive vnodes.
  *	5) clear MNTK_SUSPEND2 and MNTK_SUSPENDED flags, allowing secondary
  *	   writers, if requested.
  *	6) invalidate all cached file data.
  *	7) re-read inode data for all active vnodes.
  *
  * The 'force' flag above is the FFSR_FORCE bit of "flags"; step 5 is
  * requested with the FFSR_UNSUSPEND bit.  No separate action for
  * step 4 is visible in the body below.
  *
  * Returns 0 on success, EINVAL if the filesystem is read/write and
  * FFSR_FORCE is not set, EIO if the superblock read back from disk
  * fails the sanity checks, or the error from a failed read or inode
  * load.
  */
 int
 ffs_reload(struct mount *mp, struct thread *td, int flags)
 {
 	struct vnode *vp, *mvp, *devvp;
 	struct inode *ip;
 	void *space;
 	struct buf *bp;
 	struct fs *fs, *newfs;
 	struct ufsmount *ump;
 	ufs2_daddr_t sblockloc;
 	int i, blks, error;
 	u_long size;
 	int32_t *lp;
 
 	ump = VFSTOUFS(mp);
 
 	MNT_ILOCK(mp);
 	if ((mp->mnt_flag & MNT_RDONLY) == 0 && (flags & FFSR_FORCE) == 0) {
 		MNT_IUNLOCK(mp);
 		return (EINVAL);
 	}
 	MNT_IUNLOCK(mp);
 	
 	/*
 	 * Step 1: invalidate all cached meta-data.
 	 */
 	devvp = VFSTOUFS(mp)->um_devvp;
 	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 	if (vinvalbuf(devvp, 0, 0, 0) != 0)
 		panic("ffs_reload: dirty1");
 	VOP_UNLOCK(devvp);
 
 	/*
 	 * Step 2: re-read superblock from disk.
 	 */
 	fs = VFSTOUFS(mp)->um_fs;
 	if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize,
 	    NOCRED, &bp)) != 0)
 		return (error);
 	newfs = (struct fs *)bp->b_data;
 	/* Reject a superblock with a bad magic number or block size. */
 	if ((newfs->fs_magic != FS_UFS1_MAGIC &&
 	     newfs->fs_magic != FS_UFS2_MAGIC) ||
 	    newfs->fs_bsize > MAXBSIZE ||
 	    newfs->fs_bsize < sizeof(struct fs)) {
 			brelse(bp);
 			return (EIO);		/* XXX needs translation */
 	}
 	/*
 	 * Preserve the summary information, read-only status, and
 	 * superblock location by copying these fields into our new
 	 * superblock before using it to update the existing superblock.
 	 */
 	newfs->fs_si = fs->fs_si;
 	newfs->fs_ronly = fs->fs_ronly;
 	sblockloc = fs->fs_sblockloc;
 	bcopy(newfs, fs, (u_int)fs->fs_sbsize);
 	brelse(bp);
 	mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
 	ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc);
 	UFS_LOCK(ump);
 	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
 		printf("WARNING: %s: reload pending error: blocks %jd "
 		    "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
 		    fs->fs_pendinginodes);
 		fs->fs_pendingblocks = 0;
 		fs->fs_pendinginodes = 0;
 	}
 	UFS_UNLOCK(ump);
 
 	/*
 	 * Step 3: re-read summary information from disk.
 	 */
 	/*
 	 * The replacement area holds the cylinder group summaries,
 	 * followed by the per-group cluster counts (only when
 	 * fs_contigsumsize > 0) and the per-group contigdirs bytes.
 	 */
 	size = fs->fs_cssize;
 	blks = howmany(size, fs->fs_fsize);
 	if (fs->fs_contigsumsize > 0)
 		size += fs->fs_ncg * sizeof(int32_t);
 	size += fs->fs_ncg * sizeof(u_int8_t);
 	free(fs->fs_csp, M_UFSMNT);
 	space = malloc(size, M_UFSMNT, M_WAITOK);
 	fs->fs_csp = space;
 	/* Read one block at a time; the last read may be shorter. */
 	for (i = 0; i < blks; i += fs->fs_frag) {
 		size = fs->fs_bsize;
 		if (i + fs->fs_frag > blks)
 			size = (blks - i) * fs->fs_fsize;
 		error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size,
 		    NOCRED, &bp);
 		if (error)
 			return (error);
 		bcopy(bp->b_data, space, (u_int)size);
 		space = (char *)space + size;
 		brelse(bp);
 	}
 	/*
 	 * We no longer know anything about clusters per cylinder group.
 	 */
 	if (fs->fs_contigsumsize > 0) {
 		fs->fs_maxcluster = lp = space;
 		for (i = 0; i < fs->fs_ncg; i++)
 			*lp++ = fs->fs_contigsumsize;
 		space = lp;
 	}
 	size = fs->fs_ncg * sizeof(u_int8_t);
 	fs->fs_contigdirs = (u_int8_t *)space;
 	bzero(fs->fs_contigdirs, size);
 	/*
 	 * Step 5: clear the suspension flags and wake up waiters,
 	 * if requested.
 	 */
 	if ((flags & FFSR_UNSUSPEND) != 0) {
 		MNT_ILOCK(mp);
 		mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
 		wakeup(&mp->mnt_flag);
 		MNT_IUNLOCK(mp);
 	}
 
 loop:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		/*
 		 * Skip syncer vnode.
 		 */
 		if (vp->v_type == VNON) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		/*
 		 * Step 6: invalidate all cached file data.
 		 */
 		/* If vget() fails, restart the scan from the beginning. */
-		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
+		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) {
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			goto loop;
 		}
 		if (vinvalbuf(vp, 0, 0, 0))
 			panic("ffs_reload: dirty2");
 		/*
 		 * Step 7: re-read inode data for all active vnodes.
 		 */
 		ip = VTOI(vp);
 		error =
 		    bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 		    (int)fs->fs_bsize, NOCRED, &bp);
 		if (error) {
 			vput(vp);
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			return (error);
 		}
 		if ((error = ffs_load_inode(bp, ip, fs, ip->i_number)) != 0) {
 			brelse(bp);
 			vput(vp);
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			return (error);
 		}
 		ip->i_effnlink = ip->i_nlink;
 		brelse(bp);
 		vput(vp);
 	}
 	return (0);
 }
 
 /*
  * Common code for mount and mountroot
  */
 static int
 ffs_mountfs(odevvp, mp, td)
 	struct vnode *odevvp;
 	struct mount *mp;
 	struct thread *td;
 {
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct cdev *dev;
 	int error, i, len, ronly;
 	struct ucred *cred;
 	struct g_consumer *cp;
 	struct mount *nmp;
 	struct vnode *devvp;
 	struct fsfail_task *etp;
 	int candelete, canspeedup;
 	off_t loc;
 
 	fs = NULL;
 	ump = NULL;
 	cred = td ? td->td_ucred : NOCRED;
 	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 
 	devvp = mntfs_allocvp(mp, odevvp);
 	VOP_UNLOCK(odevvp);
 	KASSERT(devvp->v_type == VCHR, ("reclaimed devvp"));
 	dev = devvp->v_rdev;
 	if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0,
 	    (uintptr_t)mp) == 0) {
 		mntfs_freevp(devvp);
 		return (EBUSY);
 	}
 	g_topology_lock();
 	error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
 	g_topology_unlock();
 	if (error != 0) {
 		atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
 		mntfs_freevp(devvp);
 		return (error);
 	}
 	dev_ref(dev);
 	devvp->v_bufobj.bo_ops = &ffs_ops;
 	BO_LOCK(&odevvp->v_bufobj);
 	odevvp->v_bufobj.bo_flag |= BO_NOBUFS;
 	BO_UNLOCK(&odevvp->v_bufobj);
 	if (dev->si_iosize_max != 0)
 		mp->mnt_iosize_max = dev->si_iosize_max;
 	if (mp->mnt_iosize_max > MAXPHYS)
 		mp->mnt_iosize_max = MAXPHYS;
 	if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) {
 		error = EINVAL;
 		vfs_mount_error(mp,
 		    "Invalid sectorsize %d for superblock size %d",
 		    cp->provider->sectorsize, SBLOCKSIZE);
 		goto out;
 	}
 	/* fetch the superblock and summary information */
 	loc = STDSB;
 	if ((mp->mnt_flag & MNT_ROOTFS) != 0)
 		loc = STDSB_NOHASHFAIL;
 	if ((error = ffs_sbget(devvp, &fs, loc, M_UFSMNT, ffs_use_bread)) != 0)
 		goto out;
 	/* none of these types of check-hashes are maintained by this kernel */
 	fs->fs_metackhash &= ~(CK_INDIR | CK_DIR);
 	/* no support for any undefined flags */
 	fs->fs_flags &= FS_SUPPORTED;
 	fs->fs_flags &= ~FS_UNCLEAN;
 	if (fs->fs_clean == 0) {
 		fs->fs_flags |= FS_UNCLEAN;
 		if (ronly || (mp->mnt_flag & MNT_FORCE) ||
 		    ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 &&
 		     (fs->fs_flags & FS_DOSOFTDEP))) {
 			printf("WARNING: %s was not properly dismounted\n",
 			    fs->fs_fsmnt);
 		} else {
 			vfs_mount_error(mp, "R/W mount of %s denied. %s%s",
 			    fs->fs_fsmnt, "Filesystem is not clean - run fsck.",
 			    (fs->fs_flags & FS_SUJ) == 0 ? "" :
 			    " Forced mount will invalidate journal contents");
 			error = EPERM;
 			goto out;
 		}
 		if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) &&
 		    (mp->mnt_flag & MNT_FORCE)) {
 			printf("WARNING: %s: lost blocks %jd files %d\n",
 			    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
 			    fs->fs_pendinginodes);
 			fs->fs_pendingblocks = 0;
 			fs->fs_pendinginodes = 0;
 		}
 	}
 	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
 		printf("WARNING: %s: mount pending error: blocks %jd "
 		    "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
 		    fs->fs_pendinginodes);
 		fs->fs_pendingblocks = 0;
 		fs->fs_pendinginodes = 0;
 	}
 	if ((fs->fs_flags & FS_GJOURNAL) != 0) {
 #ifdef UFS_GJOURNAL
 		/*
 		 * Get journal provider name.
 		 */
 		len = 1024;
 		mp->mnt_gjprovider = malloc((u_long)len, M_UFSMNT, M_WAITOK);
 		if (g_io_getattr("GJOURNAL::provider", cp, &len,
 		    mp->mnt_gjprovider) == 0) {
 			mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, len,
 			    M_UFSMNT, M_WAITOK);
 			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_GJOURNAL;
 			MNT_IUNLOCK(mp);
 		} else {
 			printf("WARNING: %s: GJOURNAL flag on fs "
 			    "but no gjournal provider below\n",
 			    mp->mnt_stat.f_mntonname);
 			free(mp->mnt_gjprovider, M_UFSMNT);
 			mp->mnt_gjprovider = NULL;
 		}
 #else
 		printf("WARNING: %s: GJOURNAL flag on fs but no "
 		    "UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname);
 #endif
 	} else {
 		mp->mnt_gjprovider = NULL;
 	}
 	ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO);
 	ump->um_cp = cp;
 	ump->um_bo = &devvp->v_bufobj;
 	ump->um_fs = fs;
 	if (fs->fs_magic == FS_UFS1_MAGIC) {
 		ump->um_fstype = UFS1;
 		ump->um_balloc = ffs_balloc_ufs1;
 	} else {
 		ump->um_fstype = UFS2;
 		ump->um_balloc = ffs_balloc_ufs2;
 	}
 	ump->um_blkatoff = ffs_blkatoff;
 	ump->um_truncate = ffs_truncate;
 	ump->um_update = ffs_update;
 	ump->um_valloc = ffs_valloc;
 	ump->um_vfree = ffs_vfree;
 	ump->um_ifree = ffs_ifree;
 	ump->um_rdonly = ffs_rdonly;
 	ump->um_snapgone = ffs_snapgone;
 	if ((mp->mnt_flag & MNT_UNTRUSTED) != 0)
 		ump->um_check_blkno = ffs_check_blkno;
 	else
 		ump->um_check_blkno = NULL;
 	mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF);
 	ffs_oldfscompat_read(fs, ump, fs->fs_sblockloc);
 	fs->fs_ronly = ronly;
 	fs->fs_active = NULL;
 	mp->mnt_data = ump;
 	mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0];
 	mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1];
 	nmp = NULL;
 	if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 ||
 	    (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) {
 		if (nmp)
 			vfs_rel(nmp);
 		vfs_getnewfsid(mp);
 	}
 	mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	if ((fs->fs_flags & FS_MULTILABEL) != 0) {
 #ifdef MAC
 		MNT_ILOCK(mp);
 		mp->mnt_flag |= MNT_MULTILABEL;
 		MNT_IUNLOCK(mp);
 #else
 		printf("WARNING: %s: multilabel flag on fs but "
 		    "no MAC support\n", mp->mnt_stat.f_mntonname);
 #endif
 	}
 	if ((fs->fs_flags & FS_ACLS) != 0) {
 #ifdef UFS_ACL
 		MNT_ILOCK(mp);
 
 		if (mp->mnt_flag & MNT_NFS4ACLS)
 			printf("WARNING: %s: ACLs flag on fs conflicts with "
 			    "\"nfsv4acls\" mount option; option ignored\n",
 			    mp->mnt_stat.f_mntonname);
 		mp->mnt_flag &= ~MNT_NFS4ACLS;
 		mp->mnt_flag |= MNT_ACLS;
 
 		MNT_IUNLOCK(mp);
 #else
 		printf("WARNING: %s: ACLs flag on fs but no ACLs support\n",
 		    mp->mnt_stat.f_mntonname);
 #endif
 	}
 	if ((fs->fs_flags & FS_NFS4ACLS) != 0) {
 #ifdef UFS_ACL
 		MNT_ILOCK(mp);
 
 		if (mp->mnt_flag & MNT_ACLS)
 			printf("WARNING: %s: NFSv4 ACLs flag on fs conflicts "
 			    "with \"acls\" mount option; option ignored\n",
 			    mp->mnt_stat.f_mntonname);
 		mp->mnt_flag &= ~MNT_ACLS;
 		mp->mnt_flag |= MNT_NFS4ACLS;
 
 		MNT_IUNLOCK(mp);
 #else
 		printf("WARNING: %s: NFSv4 ACLs flag on fs but no "
 		    "ACLs support\n", mp->mnt_stat.f_mntonname);
 #endif
 	}
 	if ((fs->fs_flags & FS_TRIM) != 0) {
 		len = sizeof(int);
 		if (g_io_getattr("GEOM::candelete", cp, &len,
 		    &candelete) == 0) {
 			if (candelete)
 				ump->um_flags |= UM_CANDELETE;
 			else
 				printf("WARNING: %s: TRIM flag on fs but disk "
 				    "does not support TRIM\n",
 				    mp->mnt_stat.f_mntonname);
 		} else {
 			printf("WARNING: %s: TRIM flag on fs but disk does "
 			    "not confirm that it supports TRIM\n",
 			    mp->mnt_stat.f_mntonname);
 		}
 		if (((ump->um_flags) & UM_CANDELETE) != 0) {
 			ump->um_trim_tq = taskqueue_create("trim", M_WAITOK,
 			    taskqueue_thread_enqueue, &ump->um_trim_tq);
 			taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS,
 			    "%s trim", mp->mnt_stat.f_mntonname);
 			ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM,
 			    &ump->um_trimlisthashsize);
 		}
 	}
 
 	len = sizeof(int);
 	if (g_io_getattr("GEOM::canspeedup", cp, &len, &canspeedup) == 0) {
 		if (canspeedup)
 			ump->um_flags |= UM_CANSPEEDUP;
 	}
 
 	ump->um_mountp = mp;
 	ump->um_dev = dev;
 	ump->um_devvp = devvp;
 	ump->um_odevvp = odevvp;
 	ump->um_nindir = fs->fs_nindir;
 	ump->um_bptrtodb = fs->fs_fsbtodb;
 	ump->um_seqinc = fs->fs_frag;
 	for (i = 0; i < MAXQUOTAS; i++)
 		ump->um_quotas[i] = NULLVP;
 #ifdef UFS_EXTATTR
 	ufs_extattr_uepm_init(&ump->um_extattr);
 #endif
 	/*
 	 * Set FS local "last mounted on" information (NULL pad)
 	 */
 	bzero(fs->fs_fsmnt, MAXMNTLEN);
 	strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN);
 	mp->mnt_stat.f_iosize = fs->fs_bsize;
 
 	if (mp->mnt_flag & MNT_ROOTFS) {
 		/*
 		 * Root mount; update timestamp in mount structure.
 		 * this will be used by the common root mount code
 		 * to update the system clock.
 		 */
 		mp->mnt_time = fs->fs_time;
 	}
 
 	if (ronly == 0) {
 		fs->fs_mtime = time_second;
 		if ((fs->fs_flags & FS_DOSOFTDEP) &&
 		    (error = softdep_mount(devvp, mp, fs, cred)) != 0) {
 			ffs_flushfiles(mp, FORCECLOSE, td);
 			goto out;
 		}
 		if (fs->fs_snapinum[0] != 0)
 			ffs_snapshot_mount(mp);
 		fs->fs_fmod = 1;
 		fs->fs_clean = 0;
 		(void) ffs_sbupdate(ump, MNT_WAIT, 0);
 	}
 	/*
 	 * Initialize filesystem state information in mount struct.
 	 */
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED |
 	    MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS | MNTK_USES_BCACHE;
 	MNT_IUNLOCK(mp);
 #ifdef UFS_EXTATTR
 #ifdef UFS_EXTATTR_AUTOSTART
 	/*
 	 *
 	 * Auto-starting does the following:
 	 *	- check for /.attribute in the fs, and extattr_start if so
 	 *	- for each file in .attribute, enable that file with
 	 * 	  an attribute of the same name.
 	 * Not clear how to report errors -- probably eat them.
 	 * This would all happen while the filesystem was busy/not
 	 * available, so would effectively be "atomic".
 	 */
 	(void) ufs_extattr_autostart(mp, td);
 #endif /* !UFS_EXTATTR_AUTOSTART */
 #endif /* !UFS_EXTATTR */
 	etp = malloc(sizeof *ump->um_fsfail_task, M_UFSMNT, M_WAITOK | M_ZERO);
 	etp->fsid = mp->mnt_stat.f_fsid;
 	ump->um_fsfail_task = etp;
 	return (0);
 out:
 	if (fs != NULL) {
 		free(fs->fs_csp, M_UFSMNT);
 		free(fs->fs_si, M_UFSMNT);
 		free(fs, M_UFSMNT);
 	}
 	if (cp != NULL) {
 		g_topology_lock();
 		g_vfs_close(cp);
 		g_topology_unlock();
 	}
 	if (ump) {
 		mtx_destroy(UFS_MTX(ump));
 		if (mp->mnt_gjprovider != NULL) {
 			free(mp->mnt_gjprovider, M_UFSMNT);
 			mp->mnt_gjprovider = NULL;
 		}
 		free(ump, M_UFSMNT);
 		mp->mnt_data = NULL;
 	}
 	BO_LOCK(&odevvp->v_bufobj);
 	odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS;
 	BO_UNLOCK(&odevvp->v_bufobj);
 	atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
 	mntfs_freevp(devvp);
 	dev_rel(dev);
 	return (error);
 }
 
 /*
  * A read function for use by filesystem-layer routines.
  *
  * Allocates a "size"-byte buffer, stores it in *bufp (which must be
  * NULL on entry) and fills it from the device vnode "devfd" at byte
  * offset "loc".  The buffer-cache block used for the read is marked
  * B_INVAL | B_NOCACHE before it is released.
  *
  * Returns 0 on success or the bread() error.  On error *bufp still
  * points at the allocated buffer; this function does not free it.
  */
 static int
 ffs_use_bread(void *devfd, off_t loc, void **bufp, int size)
 {
 	struct vnode *devvp;
 	struct buf *bp;
 	int error;
 
 	KASSERT(*bufp == NULL, ("ffs_use_bread: non-NULL *bufp %p\n", *bufp));
 	devvp = (struct vnode *)devfd;
 	*bufp = malloc(size, M_UFSMNT, M_WAITOK);
 	error = bread(devvp, btodb(loc), size, NOCRED, &bp);
 	if (error != 0)
 		return (error);
 	bcopy(bp->b_data, *bufp, size);
 	bp->b_flags |= B_INVAL | B_NOCACHE;
 	brelse(bp);
 	return (0);
 }
 
 /*
  * Read/write sysctl knob under the "debug" tree.  When non-zero,
  * ffs_oldfscompat_read() saves fs_cgsize in fs_save_cgsize and sets
  * fs_cgsize to fs_bsize.
  */
 static int bigcgs = 0;
 SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, "");
 
 /*
  * Sanity checks for loading old filesystem superblocks.
  * See ffs_oldfscompat_write below for unwound actions.
  *
  * fs        - in-core superblock to adjust in place.
  * ump       - mount structure; receives the saved UFS1 fs_maxfilesize.
  * sblockloc - superblock location, stored in fs_sblockloc when the
  *             flags have not yet been updated.
  *
  * XXX - Parts get retired eventually.
  * Unfortunately new bits get added.
  */
 static void
 ffs_oldfscompat_read(struct fs *fs, struct ufsmount *ump,
     ufs2_daddr_t sblockloc)
 {
 	off_t maxfilesize;
 
 	/*
 	 * If not yet done, update fs_flags location and value of fs_sblockloc.
 	 */
 	if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
 		fs->fs_flags = fs->fs_old_flags;
 		fs->fs_old_flags |= FS_FLAGS_UPDATED;
 		fs->fs_sblockloc = sblockloc;
 	}
 	/*
 	 * If not yet done, update UFS1 superblock with new wider fields.
 	 */
 	if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) {
 		fs->fs_maxbsize = fs->fs_bsize;
 		fs->fs_time = fs->fs_old_time;
 		fs->fs_size = fs->fs_old_size;
 		fs->fs_dsize = fs->fs_old_dsize;
 		fs->fs_csaddr = fs->fs_old_csaddr;
 		fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir;
 		fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree;
 		fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree;
 		fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree;
 	}
 	/* Inode formats older than FS_44INODEFMT get fixed limits. */
 	if (fs->fs_magic == FS_UFS1_MAGIC &&
 	    fs->fs_old_inodefmt < FS_44INODEFMT) {
 		fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1;
 		fs->fs_qbmask = ~fs->fs_bmask;
 		fs->fs_qfmask = ~fs->fs_fmask;
 	}
 	/*
 	 * For UFS1, remember the on-disk maximum file size so that
 	 * ffs_oldfscompat_write() can put it back, then clamp the
 	 * in-core value to 0x80000000 blocks minus one byte.
 	 */
 	if (fs->fs_magic == FS_UFS1_MAGIC) {
 		ump->um_savedmaxfilesize = fs->fs_maxfilesize;
 		maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1;
 		if (fs->fs_maxfilesize > maxfilesize)
 			fs->fs_maxfilesize = maxfilesize;
 	}
 	/* Compatibility for old filesystems */
 	if (fs->fs_avgfilesize <= 0)
 		fs->fs_avgfilesize = AVFILESIZ;
 	if (fs->fs_avgfpdir <= 0)
 		fs->fs_avgfpdir = AFPDIR;
 	/* With the bigcgs knob set, use fs_bsize as the cg size. */
 	if (bigcgs) {
 		fs->fs_save_cgsize = fs->fs_cgsize;
 		fs->fs_cgsize = fs->fs_bsize;
 	}
 }
 
 /*
  * Undo, on the superblock fs, the in-memory adjustments made for old
  * filesystems.  See ffs_oldfscompat_read above for details.
  *
  * ump supplies the fs_maxfilesize value that was saved for UFS1.
  *
  * XXX - Parts get retired eventually.
  * Unfortunately new bits get added.
  */
 void
 ffs_oldfscompat_write(fs, ump)
 	struct fs *fs;
 	struct ufsmount *ump;
 {
 
 	/* Put back the cylinder group size replaced under bigcgs. */
 	if (bigcgs) {
 		fs->fs_cgsize = fs->fs_save_cgsize;
 		fs->fs_save_cgsize = 0;
 	}
 	if (fs->fs_magic != FS_UFS1_MAGIC)
 		return;
 	/*
 	 * Copy back UFS2 updated fields that UFS1 inspects.
 	 */
 	fs->fs_old_time = fs->fs_time;
 	fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir;
 	fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree;
 	fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree;
 	fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree;
 	fs->fs_maxfilesize = ump->um_savedmaxfilesize;
 }
 
 /*
  * unmount system call
  *
  * Tears down the UFS mount attached to mp.  MNT_FORCE in mntflags is
  * translated to FORCECLOSE for the flush routines.
  *
  * On the failure paths (labels fail/fail1) the mount is left in place:
  * writes are resumed if they were suspended here and extended attributes
  * are restarted if they had been stopped, and the error is returned.
  * On the teardown path the value returned is whatever is left in
  * error; note that the g_access() result is stored there when
  * um_fsckpid is set.
  */
 static int
 ffs_unmount(mp, mntflags)
 	struct mount *mp;
 	int mntflags;
 {
 	struct thread *td;
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct fs *fs;
 	int error, flags, susp;
 #ifdef UFS_EXTATTR
 	int e_restart;
 #endif
 
 	flags = 0;
 	td = curthread;
 	fs = ump->um_fs;
 	if (mntflags & MNT_FORCE)
 		flags |= FORCECLOSE;
 	/* Writes are suspended only when the filesystem is not read-only. */
 	susp = fs->fs_ronly == 0;
 #ifdef UFS_EXTATTR
 	/*
 	 * Stop extended attributes.  e_restart records whether they were
 	 * actually stopped, so that a failed unmount can start them again.
 	 */
 	if ((error = ufs_extattr_stop(mp, td))) {
 		if (error != EOPNOTSUPP)
 			printf("WARNING: unmount %s: ufs_extattr_stop "
 			    "returned errno %d\n", mp->mnt_stat.f_mntonname,
 			    error);
 		e_restart = 0;
 	} else {
 		ufs_extattr_uepm_destroy(&ump->um_extattr);
 		e_restart = 1;
 	}
 #endif
 	if (susp) {
 		error = vfs_write_suspend_umnt(mp);
 		if (error != 0)
 			goto fail1;
 	}
 	/* Flush every file, via soft updates when MOUNTEDSOFTDEP(mp). */
 	if (MOUNTEDSOFTDEP(mp))
 		error = softdep_flushfiles(mp, flags, td);
 	else
 		error = ffs_flushfiles(mp, flags, td);
 	/* A flush error aborts unless ffs_fsfail_cleanup() accepts it. */
 	if (error != 0 && !ffs_fsfail_cleanup(ump, error))
 		goto fail;
 
 	/* Pending counts should be zero by now; warn and clear otherwise. */
 	UFS_LOCK(ump);
 	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
 		printf("WARNING: unmount %s: pending error: blocks %jd "
 		    "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
 		    fs->fs_pendinginodes);
 		fs->fs_pendingblocks = 0;
 		fs->fs_pendinginodes = 0;
 	}
 	UFS_UNLOCK(ump);
 	if (MOUNTEDSOFTDEP(mp))
 		softdep_unmount(mp);
 	/*
 	 * Set the clean flag (cleared when FS_UNCLEAN or FS_NEEDSFSCK is
 	 * set) and write the superblock synchronously.
 	 */
 	if (fs->fs_ronly == 0 || ump->um_fsckpid > 0) {
 		fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1;
 		error = ffs_sbupdate(ump, MNT_WAIT, 0);
 		if (ffs_fsfail_cleanup(ump, error))
 			error = 0;
 		if (error != 0 && !ffs_fsfail_cleanup(ump, error)) {
 			fs->fs_clean = 0;
 			goto fail;
 		}
 	}
 	if (susp)
 		vfs_write_resume(mp, VR_START_WRITE);
 	/* Wait out in-flight trims, then drain and free the trim queue. */
 	if (ump->um_trim_tq != NULL) {
 		while (ump->um_trim_inflight != 0)
 			pause("ufsutr", hz);
 		taskqueue_drain_all(ump->um_trim_tq);
 		taskqueue_free(ump->um_trim_tq);
 		free (ump->um_trimhash, M_TRIM);
 	}
 	g_topology_lock();
 	if (ump->um_fsckpid > 0) {
 		/*
 		 * Return to normal read-only mode.
 		 */
 		error = g_access(ump->um_cp, 0, -1, 0);
 		ump->um_fsckpid = 0;
 	}
 	g_vfs_close(ump->um_cp);
 	g_topology_unlock();
 	/* Clear BO_NOBUFS on the original device vnode's bufobj. */
 	BO_LOCK(&ump->um_odevvp->v_bufobj);
 	ump->um_odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS;
 	BO_UNLOCK(&ump->um_odevvp->v_bufobj);
 	/* Detach the device from this mount and drop the references. */
 	atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0);
 	mntfs_freevp(ump->um_devvp);
 	vrele(ump->um_odevvp);
 	dev_rel(ump->um_dev);
 	mtx_destroy(UFS_MTX(ump));
 	if (mp->mnt_gjprovider != NULL) {
 		free(mp->mnt_gjprovider, M_UFSMNT);
 		mp->mnt_gjprovider = NULL;
 	}
 	/* Free the in-memory superblock pieces and the ufsmount itself. */
 	free(fs->fs_csp, M_UFSMNT);
 	free(fs->fs_si, M_UFSMNT);
 	free(fs, M_UFSMNT);
 	if (ump->um_fsfail_task != NULL)
 		free(ump->um_fsfail_task, M_UFSMNT);
 	free(ump, M_UFSMNT);
 	mp->mnt_data = NULL;
 	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	/* Drop the reference held through td_su if it names this mount. */
 	if (td->td_su == mp) {
 		td->td_su = NULL;
 		vfs_rel(mp);
 	}
 	return (error);
 
 	/* fail: writes may have been suspended above; resume them first. */
 fail:
 	if (susp)
 		vfs_write_resume(mp, VR_START_WRITE);
 	/* fail1: nothing was suspended; only extattrs need restarting. */
 fail1:
 #ifdef UFS_EXTATTR
 	if (e_restart) {
 		ufs_extattr_uepm_init(&ump->um_extattr);
 #ifdef UFS_EXTATTR_AUTOSTART
 		(void) ufs_extattr_autostart(mp, td);
 #endif
 	}
 #endif
 
 	return (error);
 }
 
 /*
  * Flush out all the files in a filesystem.
  *
  * flags are passed to vflush() (FORCECLOSE is added here once
  * snapshots have been torn down); td is the calling thread.
  *
  * Returns 0 or the first error that stops the flush.  A quotaoff()
  * failure while EARLYFLUSH is set is not returned; it only suppresses
  * the final full vflush(), and the result of the device VOP_FSYNC() is
  * returned instead.
  */
 int
 ffs_flushfiles(mp, flags, td)
 	struct mount *mp;
 	int flags;
 	struct thread *td;
 {
 	struct ufsmount *ump;
 	int qerror, error;
 
 	ump = VFSTOUFS(mp);
 	qerror = 0;
 #ifdef QUOTA
 	if (mp->mnt_flag & MNT_QUOTA) {
 		int i;
 		/* Flush everything but system vnodes, then turn quotas off. */
 		error = vflush(mp, 0, SKIPSYSTEM|flags, td);
 		if (error)
 			return (error);
 		for (i = 0; i < MAXQUOTAS; i++) {
 			error = quotaoff(td, mp, i);
 			if (error != 0) {
 				if ((flags & EARLYFLUSH) == 0)
 					return (error);
 				else
 					qerror = error;
 			}
 		}
 
 		/*
 		 * Here we fall through to vflush again to ensure that
 		 * we have gotten rid of all the system vnodes, unless
 		 * quotas must not be closed.
 		 */
 	}
 #endif
 	ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles");
 	/* VV_COPYONWRITE on the device vnode: snapshots must be unmounted. */
 	if (ump->um_devvp->v_vflag & VV_COPYONWRITE) {
 		if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0)
 			return (error);
 		ffs_snapshot_unmount(mp);
 		flags |= FORCECLOSE;
 		/*
 		 * Here we fall through to vflush again to ensure
 		 * that we have gotten rid of all the system vnodes.
 		 */
 	}
 
 	/*
 	 * Do not close system files if quotas were not closed, to be
 	 * able to sync the remaining dquots.  The freeblks softupdate
 	 * workitems might hold a reference on a dquot, preventing
 	 * quotaoff() from completing.  Next round of
 	 * softdep_flushworklist() iteration should process the
 	 * blockers, allowing the next run of quotaoff() to finally
 	 * flush held dquots.
 	 *
 	 * Otherwise, flush all the files.
 	 */
 	if (qerror == 0 && (error = vflush(mp, 0, flags, td)) != 0)
 		return (error);
 
 	/*
 	 * Flush filesystem metadata.
 	 */
 	vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td);
 	VOP_UNLOCK(ump->um_devvp);
 	return (error);
 }
 
 /*
  * Fill in sbp with the statistics of the filesystem mounted on mp.
  *
  * Always returns 0.  Panics if the superblock magic is neither
  * FS_UFS1_MAGIC nor FS_UFS2_MAGIC.
  */
 static int
 ffs_statfs(mp, sbp)
 	struct mount *mp;
 	struct statfs *sbp;
 {
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct fs *fs = ump->um_fs;
 
 	if (!(fs->fs_magic == FS_UFS1_MAGIC ||
 	    fs->fs_magic == FS_UFS2_MAGIC))
 		panic("ffs_statfs");
 
 	/* Values filled in without taking the UFS mount lock. */
 	sbp->f_version = STATFS_VERSION;
 	sbp->f_namemax = UFS_MAXNAMLEN;
 	sbp->f_bsize = fs->fs_fsize;
 	sbp->f_iosize = fs->fs_bsize;
 	sbp->f_blocks = fs->fs_dsize;
 
 	/* Free block and inode counts are sampled under the lock. */
 	UFS_LOCK(ump);
 	sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag +
 	    fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks);
 	sbp->f_bavail = freespace(fs, fs->fs_minfree) +
 	    dbtofsb(fs, fs->fs_pendingblocks);
 	sbp->f_files =  fs->fs_ncg * fs->fs_ipg - UFS_ROOTINO;
 	sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes;
 	UFS_UNLOCK(ump);
 	return (0);
 }
 
 /*
  * Report whether ip has any of the IN_ACCESS, IN_CHANGE, IN_MODIFIED or
  * IN_UPDATE flags set.
  */
 static bool
 sync_doupdate(struct inode *ip)
 {
 
 	if ((ip->i_flag &
 	    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0)
 		return (false);
 	return (true);
 }
 
 /*
  * Filter passed to MNT_VNODE_FOREACH_LAZY by ffs_sync_lazy().  Returns
  * non-zero for a vnode that is not VNON and either has update flags
  * pending (sync_doupdate()) or has VI_OWEINACT set.  arg is unused.
  */
 static int
 ffs_sync_lazy_filter(struct vnode *vp, void *arg __unused)
 {
 	struct inode *ip;
 
 	if (vp->v_type == VNON)
 		return (false);
 	/*
 	 * Flags are safe to access because ->v_data invalidation
 	 * is held off by listmtx.
 	 */
 	ip = VTOI(vp);
 	return (sync_doupdate(ip) || (vp->v_iflag & VI_OWEINACT) != 0);
 }
 
 /*
  * For a lazy sync, we only care about access times, quotas and the
  * superblock.  Other filesystem changes are already converted to
  * cylinder group blocks or inode blocks updates and are written to
  * disk by syncer.
  *
  * Returns 0, or the most recent non-zero error from ffs_update() or
  * ffs_sbupdate().  A vnode that cannot be locked without waiting is
  * skipped and does not contribute an error.
  */
 static int
 ffs_sync_lazy(mp)
      struct mount *mp;
 {
 	struct vnode *mvp, *vp;
 	struct inode *ip;
 	struct thread *td;
 	int allerror, error;
 
 	allerror = 0;
 	td = curthread;
 	/*
 	 * With MNT_NOATIME set the per-vnode scan is skipped; only quotas
 	 * (when QUOTA is configured) and the superblock are handled.
 	 */
 	if ((mp->mnt_flag & MNT_NOATIME) != 0) {
 #ifdef QUOTA
 		qsync(mp);
 #endif
 		goto sbupdate;
 	}
 	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, ffs_sync_lazy_filter, NULL) {
 		if (vp->v_type == VNON) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		ip = VTOI(vp);
 
 		/*
 		 * The IN_ACCESS flag is converted to IN_MODIFIED by
 		 * ufs_close() and ufs_getattr() by the calls to
 		 * ufs_itimes_locked(), without subsequent UFS_UPDATE().
 		 * Test also all the other timestamp flags too, to pick up
 		 * any other cases that could be missed.
 		 */
 		if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) {
 			VI_UNLOCK(vp);
 			continue;
 		}
-		if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
-		    td)) != 0)
+		if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK)) != 0)
 			continue;
 #ifdef QUOTA
 		qsyncvp(vp);
 #endif
 		/* Re-test now that the vnode is locked; error is 0 here. */
 		if (sync_doupdate(ip))
 			error = ffs_update(vp, 0);
 		if (error != 0)
 			allerror = error;
 		vput(vp);
 	}
 sbupdate:
 	/* Write the superblock only if it is marked modified. */
 	if (VFSTOUFS(mp)->um_fs->fs_fmod != 0 &&
 	    (error = ffs_sbupdate(VFSTOUFS(mp), MNT_LAZY, 0)) != 0)
 		allerror = error;
 	return (allerror);
 }
 
 /*
  * Go through the disk queues to initiate sandbagged IO;
  * go through the inodes to write those that have been modified;
  * initiate the writing of the super block if it has been modified.
  *
  * Note: we are always called with the filesystem marked busy using
  * vfs_busy().
  *
  * waitfor is one of MNT_LAZY, MNT_NOWAIT, MNT_WAIT or MNT_SUSPEND.
  * MNT_LAZY is handed to ffs_sync_lazy() unless the system is rebooting,
  * in which case it is treated as MNT_NOWAIT.  MNT_SUSPEND is treated as
  * MNT_WAIT plus an attempt to mark the mount suspended.
  *
  * Returns 0 or the most recent error seen, except that errors accepted
  * by ffs_fsfail_cleanup() are reported as 0.
  */
 static int
 ffs_sync(mp, waitfor)
 	struct mount *mp;
 	int waitfor;
 {
 	struct vnode *mvp, *vp, *devvp;
 	struct thread *td;
 	struct inode *ip;
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct fs *fs;
 	int error, count, lockreq, allerror = 0;
 	int suspend;
 	int suspended;
 	int secondary_writes;
 	int secondary_accwrites;
 	int softdep_deps;
 	int softdep_accdeps;
 	struct bufobj *bo;
 
 	suspend = 0;
 	suspended = 0;
 	td = curthread;
 	fs = ump->um_fs;
 	if (fs->fs_fmod != 0 && fs->fs_ronly != 0 && ump->um_fsckpid == 0)
 		panic("%s: ffs_sync: modification on read-only filesystem",
 		    fs->fs_fsmnt);
 	if (waitfor == MNT_LAZY) {
 		if (!rebooting)
 			return (ffs_sync_lazy(mp));
 		waitfor = MNT_NOWAIT;
 	}
 
 	/*
 	 * Write back each (modified) inode.
 	 */
 	/* Vnode locks are taken without waiting unless waitfor is MNT_WAIT. */
 	lockreq = LK_EXCLUSIVE | LK_NOWAIT;
 	if (waitfor == MNT_SUSPEND) {
 		suspend = 1;
 		waitfor = MNT_WAIT;
 	}
 	if (waitfor == MNT_WAIT)
 		lockreq = LK_EXCLUSIVE;
 	lockreq |= LK_INTERLOCK | LK_SLEEPFAIL;
 loop:
 	/* Grab snapshot of secondary write counts */
 	MNT_ILOCK(mp);
 	secondary_writes = mp->mnt_secondary_writes;
 	secondary_accwrites = mp->mnt_secondary_accwrites;
 	MNT_IUNLOCK(mp);
 
 	/* Grab snapshot of softdep dependency counts */
 	softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps);
 
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		/*
 		 * Depend on the vnode interlock to keep things stable enough
 		 * for a quick test.  Since there might be hundreds of
 		 * thousands of vnodes, we cannot afford even a subroutine
 		 * call unless there's a good chance that we have work to do.
 		 */
 		if (vp->v_type == VNON) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		ip = VTOI(vp);
 		if ((ip->i_flag &
 		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
 		    vp->v_bufobj.bo_dirty.bv_cnt == 0) {
 			VI_UNLOCK(vp);
 			continue;
 		}
-		if ((error = vget(vp, lockreq, td)) != 0) {
+		if ((error = vget(vp, lockreq)) != 0) {
 			/* ENOENT or ENOLCK: abort this scan and start over. */
 			if (error == ENOENT || error == ENOLCK) {
 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 				goto loop;
 			}
 			continue;
 		}
 #ifdef QUOTA
 		qsyncvp(vp);
 #endif
 		if ((error = ffs_syncvnode(vp, waitfor, 0)) != 0)
 			allerror = error;
 		vput(vp);
 	}
 	/*
 	 * Force stale filesystem control information to be flushed.
 	 */
 	if (waitfor == MNT_WAIT || rebooting) {
 		if ((error = softdep_flushworklist(ump->um_mountp, &count, td)))
 			allerror = error;
 		if (ffs_fsfail_cleanup(ump, allerror))
 			allerror = 0;
 		/* Flushed work items may create new vnodes to clean */
 		if (allerror == 0 && count)
 			goto loop;
 	}
 
 	/*
 	 * Deal with the device vnode.  If it has writes in progress or
 	 * dirty buffers, sync it (and rescan when waiting); otherwise,
 	 * for a suspend request, try to mark the mount suspended.
 	 */
 	devvp = ump->um_devvp;
 	bo = &devvp->v_bufobj;
 	BO_LOCK(bo);
 	if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) {
 		BO_UNLOCK(bo);
 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_FSYNC(devvp, waitfor, td);
 		VOP_UNLOCK(devvp);
 		if (MOUNTEDSOFTDEP(mp) && (error == 0 || error == EAGAIN))
 			error = ffs_sbupdate(ump, waitfor, 0);
 		if (error != 0)
 			allerror = error;
 		if (ffs_fsfail_cleanup(ump, allerror))
 			allerror = 0;
 		if (allerror == 0 && waitfor == MNT_WAIT)
 			goto loop;
 	} else if (suspend != 0) {
 		/*
 		 * NOTE(review): the bufobj lock is not released explicitly
 		 * on this branch and the mount interlock is held on return
 		 * from softdep_check_suspend() (see the MNT_IUNLOCK() calls
 		 * and the mtx_assert() below) -- presumably the callee
 		 * performs that lock hand-off; confirm against its
 		 * definition.
 		 */
 		if (softdep_check_suspend(mp,
 					  devvp,
 					  softdep_deps,
 					  softdep_accdeps,
 					  secondary_writes,
 					  secondary_accwrites) != 0) {
 			MNT_IUNLOCK(mp);
 			goto loop;	/* More work needed */
 		}
 		mtx_assert(MNT_MTX(mp), MA_OWNED);
 		mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED;
 		MNT_IUNLOCK(mp);
 		suspended = 1;
 	} else
 		BO_UNLOCK(bo);
 	/*
 	 * Write back modified superblock.
 	 */
 	if (fs->fs_fmod != 0 &&
 	    (error = ffs_sbupdate(ump, waitfor, suspended)) != 0)
 		allerror = error;
 	if (ffs_fsfail_cleanup(ump, allerror))
 		allerror = 0;
 	return (allerror);
 }
 
 int
 ffs_vget(mp, ino, flags, vpp)
 	struct mount *mp;
 	ino_t ino;
 	int flags;
 	struct vnode **vpp;
 {
 	return (ffs_vgetf(mp, ino, flags, vpp, 0));
 }
 
 /*
  * Look up, or create and load, the vnode for inode ino on mount mp.
  *
  * flags are the lock flags handed to vfs_hash_get() and
  * vfs_hash_insert(); a shared lock request is promoted to exclusive
  * before a new vnode is built.  ffs_flags may contain FFSV_REPLACE
  * (a vnode already in the hash is destroyed with vgone() and a fresh
  * one is built; LK_EXCLUSIVE is asserted in that case) and
  * FFSV_FORCEINSMQ (VV_FORCEINSMQ is set around insmntque()).
  *
  * Returns 0 with the vnode stored in *vpp, or an error.
  * NOTE(review): *vpp is explicitly set to NULL on most error paths
  * here; the early returns after vfs_hash_get() and vfs_hash_insert()
  * leave *vpp as those helpers set it -- confirm their contract.
  */
 int
 ffs_vgetf(mp, ino, flags, vpp, ffs_flags)
 	struct mount *mp;
 	ino_t ino;
 	int flags;
 	struct vnode **vpp;
 	int ffs_flags;
 {
 	struct fs *fs;
 	struct inode *ip;
 	struct ufsmount *ump;
 	struct buf *bp;
 	struct vnode *vp;
 	daddr_t dbn;
 	int error;
 
 	MPASS((ffs_flags & FFSV_REPLACE) == 0 || (flags & LK_EXCLUSIVE) != 0);
 
 	/* Fast path: the vnode may already be in the hash. */
 	error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL);
 	if (error != 0)
 		return (error);
 	if (*vpp != NULL) {
 		if ((ffs_flags & FFSV_REPLACE) == 0)
 			return (0);
 		/* FFSV_REPLACE: throw the existing vnode away. */
 		vgone(*vpp);
 		vput(*vpp);
 	}
 
 	/*
 	 * We must promote to an exclusive lock for vnode creation.  This
 	 * can happen if lookup is passed LOCKSHARED.
 	 */
 	if ((flags & LK_TYPE_MASK) == LK_SHARED) {
 		flags &= ~LK_TYPE_MASK;
 		flags |= LK_EXCLUSIVE;
 	}
 
 	/*
 	 * We do not lock vnode creation as it is believed to be too
 	 * expensive for such rare case as simultaneous creation of vnode
 	 * for same ino by different processes. We just allow them to race
 	 * and check later to decide who wins. Let the race begin!
 	 */
 
 	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;
 	ip = uma_zalloc_smr(uma_inode, M_WAITOK | M_ZERO);
 
 	/* Allocate a new vnode/inode. */
 	error = getnewvnode("ufs", mp, fs->fs_magic == FS_UFS1_MAGIC ?
 	    &ffs_vnodeops1 : &ffs_vnodeops2, &vp);
 	if (error) {
 		*vpp = NULL;
 		uma_zfree_smr(uma_inode, ip);
 		return (error);
 	}
 	/*
 	 * FFS supports recursive locking.
 	 */
 	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
 	VN_LOCK_AREC(vp);
 	/* Tie the vnode and the zeroed in-core inode together. */
 	vp->v_data = ip;
 	vp->v_bufobj.bo_bsize = fs->fs_bsize;
 	ip->i_vnode = vp;
 	ip->i_ump = ump;
 	ip->i_number = ino;
 	ip->i_ea_refs = 0;
 	ip->i_nextclustercg = -1;
 	ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2;
 	ip->i_mode = 0; /* ensure error cases below throw away vnode */
 #ifdef QUOTA
 	{
 		int i;
 		for (i = 0; i < MAXQUOTAS; i++)
 			ip->i_dquot[i] = NODQUOT;
 	}
 #endif
 
 	if (ffs_flags & FFSV_FORCEINSMQ)
 		vp->v_vflag |= VV_FORCEINSMQ;
 	error = insmntque(vp, mp);
 	if (error != 0) {
 		/*
 		 * NOTE(review): only the inode is freed here; presumably
 		 * insmntque() disposes of the vnode on failure -- confirm.
 		 */
 		uma_zfree_smr(uma_inode, ip);
 		*vpp = NULL;
 		return (error);
 	}
 	vp->v_vflag &= ~VV_FORCEINSMQ;
 	/* Publish the vnode; a non-NULL *vpp means another one was found. */
 	error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL);
 	if (error != 0)
 		return (error);
 	if (*vpp != NULL) {
 		/*
 		 * Calls from ffs_valloc() (i.e. FFSV_REPLACE set)
 		 * operate on empty inode, which must not be found by
 		 * other threads until fully filled.  Vnode for empty
 		 * inode must be not re-inserted on the hash by other
 		 * thread, after removal by us at the beginning.
 		 */
 		MPASS((ffs_flags & FFSV_REPLACE) == 0);
 		return (0);
 	}
 
 	/* Read in the disk contents for the inode, copy into the inode. */
 	dbn = fsbtodb(fs, ino_to_fsba(fs, ino));
 	error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize,
 	    NULL, NULL, 0, NOCRED, 0, NULL, &bp);
 	if (error != 0) {
 		/*
 		 * The inode does not contain anything useful, so it would
 		 * be misleading to leave it on its hash chain. With mode
 		 * still zero, it will be unlinked and returned to the free
 		 * list by vput().
 		 */
 		vgone(vp);
 		vput(vp);
 		*vpp = NULL;
 		return (error);
 	}
 	/* Allocate the dinode copy matching the filesystem format. */
 	if (I_IS_UFS1(ip))
 		ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK);
 	else
 		ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK);
 	if ((error = ffs_load_inode(bp, ip, fs, ino)) != 0) {
 		bqrelse(bp);
 		vgone(vp);
 		vput(vp);
 		*vpp = NULL;
 		return (error);
 	}
 	if (DOINGSOFTDEP(vp))
 		softdep_load_inodeblock(ip);
 	else
 		ip->i_effnlink = ip->i_nlink;
 	bqrelse(bp);
 
 	/*
 	 * Initialize the vnode from the inode, check for aliases.
 	 * Note that the underlying vnode may have changed.
 	 */
 	error = ufs_vinit(mp, I_IS_UFS1(ip) ? &ffs_fifoops1 : &ffs_fifoops2,
 	    &vp);
 	if (error) {
 		vgone(vp);
 		vput(vp);
 		*vpp = NULL;
 		return (error);
 	}
 
 	/*
 	 * Finish inode initialization.
 	 */
 	if (vp->v_type != VFIFO) {
 		/* FFS supports shared locking for all files except fifos. */
 		VN_LOCK_ASHARE(vp);
 	}
 
 	/*
 	 * Set up a generation number for this inode if it does not
 	 * already have one. This should only happen on old filesystems.
 	 */
 	if (ip->i_gen == 0) {
 		while (ip->i_gen == 0)
 			ip->i_gen = arc4random();
 		/* Only mark the inode modified on a writable mount. */
 		if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
 			UFS_INODE_SET_FLAG(ip, IN_MODIFIED);
 			DIP_SET(ip, i_gen, ip->i_gen);
 		}
 	}
 #ifdef MAC
 	if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) {
 		/*
 		 * If this vnode is already allocated, and we're running
 		 * multi-label, attempt to perform a label association
 		 * from the extended attributes on the inode.
 		 */
 		error = mac_vnode_associate_extattr(mp, vp);
 		if (error) {
 			/* ufs_inactive will release ip->i_devvp ref. */
 			vgone(vp);
 			vput(vp);
 			*vpp = NULL;
 			return (error);
 		}
 	}
 #endif
 
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * File handle to vnode
  *
  * Have to be really careful about stale file handles:
  * - check that the inode number is valid
  * - for UFS2 check that the inode number is initialized
  * - call ffs_vget() to get the locked inode
  * - check for an unallocated inode (i_mode == 0)
  * - check that the given client host has export rights and return
  *   those rights via. exflagsp and credanonp
  */
 static int
 ffs_fhtovp(mp, fhp, flags, vpp)
 	struct mount *mp;
 	struct fid *fhp;
 	int flags;
 	struct vnode **vpp;
 {
 	struct ufid *ufhp;
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp;
 	ino_t ino;
 	u_int cg;
 	int error;
 
 	ufhp = (struct ufid *)fhp;
 	ino = ufhp->ufid_ino;
 	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;
 	if (ino < UFS_ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg)
 		return (ESTALE);
 	/*
 	 * Need to check if inode is initialized because UFS2 does lazy
 	 * initialization and nfs_fhtovp can offer arbitrary inode numbers.
 	 */
 	if (fs->fs_magic != FS_UFS2_MAGIC)
 		return (ufs_fhtovp(mp, ufhp, flags, vpp));
 	cg = ino_to_cg(fs, ino);
 	if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0)
 		return (error);
 	if (ino >= cg * fs->fs_ipg + cgp->cg_initediblk) {
 		brelse(bp);
 		return (ESTALE);
 	}
 	brelse(bp);
 	return (ufs_fhtovp(mp, ufhp, flags, vpp));
 }
 
 /*
  * Initialize the filesystem: call ffs_susp_initialize() and
  * softdep_initialize(), then return the result of ufs_init().
  */
 static int
 ffs_init(vfsp)
 	struct vfsconf *vfsp;
 {
 	int error;
 
 	ffs_susp_initialize();
 	softdep_initialize();
 	error = ufs_init(vfsp);
 	return (error);
 }
 
 /*
  * Undo the work of ffs_init().
  */
 static int
 ffs_uninit(vfsp)
 	struct vfsconf *vfsp;
 {
 	int ret;
 
 	ret = ufs_uninit(vfsp);
 	softdep_uninitialize();
 	ffs_susp_uninitialize();
 	taskqueue_drain_all(taskqueue_thread);
 	return (ret);
 }
 
 /*
  * Structure used to pass information from ffs_sbupdate to its
  * helper routine ffs_use_bwrite.
  */
 struct devfd {
 	struct ufsmount	*ump;		/* mount whose superblock is written */
 	struct buf	*sbbp;		/* superblock buf from getblk() */
 	int		 waitfor;	/* MNT_WAIT: bwrite(), else bawrite() */
 	int		 suspended;	/* non-zero: set B_VALIDSUSPWRT */
 	int		 error;		/* error from a failed bwrite() */
 };
 
 /*
  * Write a superblock and associated information back to disk.
  *
  * waitfor and suspended are passed through to ffs_use_bwrite() in a
  * struct devfd.  Returns the result of ffs_sbput().
  *
  * Panics if the filesystem is read-only, unless both MNT_RDONLY and
  * MNT_UPDATE are set on the mount or um_fsckpid is non-zero.
  */
 int
 ffs_sbupdate(ump, waitfor, suspended)
 	struct ufsmount *ump;
 	int waitfor;
 	int suspended;
 {
 	struct devfd devfd;
 	struct fs *fs;
 
 	fs = ump->um_fs;
 	if (fs->fs_ronly == 1 && ump->um_fsckpid == 0 &&
 	    (ump->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) !=
 	    (MNT_RDONLY | MNT_UPDATE))
 		panic("ffs_sbupdate: write read-only filesystem");
 	/*
 	 * We use the superblock's buf to serialize calls to ffs_sbupdate().
 	 * The remaining fields are the info needed by the write function.
 	 */
 	devfd.sbbp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
 	    (int)fs->fs_sbsize, 0, 0, 0);
 	devfd.ump = ump;
 	devfd.waitfor = waitfor;
 	devfd.suspended = suspended;
 	devfd.error = 0;
 	return (ffs_sbput(&devfd, fs, fs->fs_sblockloc, ffs_use_bwrite));
 }
 
 /*
  * Write function for use by filesystem-layer routines.
  *
  * devfd points to the struct devfd prepared by ffs_sbupdate(); loc is
  * the device location being written, buf the data and size its length.
  *
  * A write to any location other than fs_sblockloc is treated as
  * superblock summary information: the data is copied into a buffer for
  * that location and written.  A synchronous write failure there is only
  * recorded in devfdp->error and 0 is returned.
  *
  * The write at fs_sblockloc uses the buffer obtained by ffs_sbupdate().
  * If an earlier error is still pending (and not accepted by
  * ffs_fsfail_cleanup()) the buffer is released unwritten and that error
  * is returned; otherwise the superblock is copied in, unwound with
  * ffs_oldfscompat_write(), re-hashed and written, and devfdp->error is
  * returned.
  */
 static int
 ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size)
 {
 	struct devfd *devfdp;
 	struct ufsmount *ump;
 	struct buf *bp;
 	struct fs *fs;
 	int error;
 
 	devfdp = devfd;
 	ump = devfdp->ump;
 	fs = ump->um_fs;
 	/*
 	 * Writing the superblock summary information.
 	 */
 	if (loc != fs->fs_sblockloc) {
 		bp = getblk(ump->um_devvp, btodb(loc), size, 0, 0, 0);
 		bcopy(buf, bp->b_data, (u_int)size);
 		if (devfdp->suspended)
 			bp->b_flags |= B_VALIDSUSPWRT;
 		if (devfdp->waitfor != MNT_WAIT)
 			bawrite(bp);
 		else if ((error = bwrite(bp)) != 0)
 			devfdp->error = error;
 		return (0);
 	}
 	/*
 	 * Writing the superblock itself. We need to do special checks for it.
 	 */
 	bp = devfdp->sbbp;
 	if (ffs_fsfail_cleanup(ump, devfdp->error))
 		devfdp->error = 0;
 	if (devfdp->error != 0) {
 		brelse(bp);
 		return (devfdp->error);
 	}
 	/*
 	 * The %jd conversion takes an intmax_t argument, so fs_sblockloc
 	 * is cast explicitly rather than relying on its type matching.
 	 */
 	if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 &&
 	    (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
 		printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n",
 		    fs->fs_fsmnt, (intmax_t)fs->fs_sblockloc, SBLOCK_UFS1);
 		fs->fs_sblockloc = SBLOCK_UFS1;
 	}
 	if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 &&
 	    (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
 		printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n",
 		    fs->fs_fsmnt, (intmax_t)fs->fs_sblockloc, SBLOCK_UFS2);
 		fs->fs_sblockloc = SBLOCK_UFS2;
 	}
 	if (MOUNTEDSOFTDEP(ump->um_mountp))
 		softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp);
 	bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
 	/* From here on fs refers to the copy inside the buffer. */
 	fs = (struct fs *)bp->b_data;
 	ffs_oldfscompat_write(fs, ump);
 	fs->fs_si = NULL;
 	/* Recalculate the superblock hash */
 	fs->fs_ckhash = ffs_calc_sbhash(fs);
 	if (devfdp->suspended)
 		bp->b_flags |= B_VALIDSUSPWRT;
 	if (devfdp->waitfor != MNT_WAIT)
 		bawrite(bp);
 	else if ((error = bwrite(bp)) != 0)
 		devfdp->error = error;
 	return (devfdp->error);
 }
 
 /*
  * Extended attribute control entry point.  Forwards every argument to
  * ufs_extattrctl() when UFS_EXTATTR is configured and to
  * vfs_stdextattrctl() otherwise, returning that routine's result.
  */
 static int
 ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
 	int attrnamespace, const char *attrname)
 {
 	int error;
 
 #ifdef UFS_EXTATTR
 	error = ufs_extattrctl(mp, cmd, filename_vp, attrnamespace,
 	    attrname);
 #else
 	error = vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace,
 	    attrname);
 #endif
 	return (error);
 }
 
 /*
  * Free the in-core inode ip: first its dinode copy, if one is attached
  * (to uma_ufs1 for a UFS1 mount with i_din1 set, otherwise to uma_ufs2
  * when i_din2 is set), then the inode itself.
  */
 static void
 ffs_ifree(struct ufsmount *ump, struct inode *ip)
 {
 
 	if (ump->um_fstype != UFS1 || ip->i_din1 == NULL) {
 		if (ip->i_din2 != NULL)
 			uma_zfree(uma_ufs2, ip->i_din2);
 	} else {
 		uma_zfree(uma_ufs1, ip->i_din1);
 	}
 	uma_zfree_smr(uma_inode, ip);
 }
 
 /*
  * Read-write knob under the _debug sysctl node, enabled by default.
  * ffs_bufwrite() only takes its background-write path when this is
  * non-zero.
  */
 static int dobkgrdwrite = 1;
 SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
     "Do background writes (honoring the BV_BKGRDWRITE flag)?");
 
 /*
  * Complete a background write started from bwrite.
  *
  * Installed as the b_iodone handler of the buffer copy created by
  * ffs_bufwrite().  bp is that copy; the original buffer is looked up in
  * the same bufobj by logical block number.  On a write error the
  * original is flagged BV_BKGRDERR.  In all cases BV_BKGRDINPROG is
  * cleared on the original and a thread waiting with BV_BKGRDWAIT is
  * woken.
  */
 static void
 ffs_backgroundwritedone(struct buf *bp)
 {
 	struct bufobj *bufobj;
 	struct buf *origbp;
 
 #ifdef SOFTUPDATES
 	if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) != 0)
 		softdep_handle_error(bp);
 #endif
 
 	/*
 	 * Find the original buffer that we are writing.
 	 */
 	bufobj = bp->b_bufobj;
 	BO_LOCK(bufobj);
 	if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL)
 		panic("backgroundwritedone: lost buffer");
 
 	/*
 	 * We should mark the cylinder group buffer origbp as
 	 * dirty, to not lose the failed write.
 	 */
 	if ((bp->b_ioflags & BIO_ERROR) != 0)
 		origbp->b_vflags |= BV_BKGRDERR;
 	BO_UNLOCK(bufobj);
 	/*
 	 * Process dependencies then return any unfinished ones.
 	 */
 	if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) == 0)
 		buf_complete(bp);
 #ifdef SOFTUPDATES
 	if (!LIST_EMPTY(&bp->b_dep))
 		softdep_move_dependencies(bp, origbp);
 #endif
 	/*
 	 * This buffer is marked B_NOCACHE so when it is released
 	 * by biodone it will be tossed.
 	 */
 	bp->b_flags |= B_NOCACHE;
 	bp->b_flags &= ~B_CACHE;
 	pbrelvp(bp);
 
 	/*
 	 * Prevent brelse() from trying to keep and re-dirtying bp on
 	 * errors. It causes b_bufobj dereference in
 	 * bdirty()/reassignbuf(), and b_bufobj was cleared in
 	 * pbrelvp() above.
 	 */
 	if ((bp->b_ioflags & BIO_ERROR) != 0)
 		bp->b_flags |= B_INVAL;
 	bufdone(bp);
 	/* bufobj was saved above because pbrelvp() cleared bp->b_bufobj. */
 	BO_LOCK(bufobj);
 	/*
 	 * Clear the BV_BKGRDINPROG flag in the original buffer
 	 * and awaken it if it is waiting for the write to complete.
 	 * If BV_BKGRDINPROG is not set in the original buffer it must
 	 * have been released and re-instantiated - which is not legal.
 	 */
 	KASSERT((origbp->b_vflags & BV_BKGRDINPROG),
 	    ("backgroundwritedone: lost buffer2"));
 	origbp->b_vflags &= ~BV_BKGRDINPROG;
 	if (origbp->b_vflags & BV_BKGRDWAIT) {
 		origbp->b_vflags &= ~BV_BKGRDWAIT;
 		wakeup(&origbp->b_xflags);
 	}
 	BO_UNLOCK(bufobj);
 }
 
 
 /*
  * Write, release buffer on completion.  (Done by iodone
  * if async).  Do not bother writing anything if the buffer
  * is invalid.
  *
  * Note that we set B_CACHE here, indicating that buffer is
  * fully valid and thus cacheable.  This is true even of NFS
  * now so we set it generally.  This could be set either here
  * or in biodone() since the I/O is synchronous.  We put it
  * here.
  *
  * Returns 0 when nothing is written here (the buffer was invalid, or
  * an asynchronous write was turned into a delayed write because a
  * background write is still in progress); otherwise returns the result
  * of bufwrite().
  */
 static int
 ffs_bufwrite(struct buf *bp)
 {
 	struct buf *newbp;
 	struct cg *cgp;
 
 	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return (0);
 	}
 
 	if (!BUF_ISLOCKED(bp))
 		panic("bufwrite: buffer is not busy???");
 	/*
 	 * If a background write is already in progress, delay
 	 * writing this block if it is asynchronous. Otherwise
 	 * wait for the background write to complete.
 	 */
 	BO_LOCK(bp->b_bufobj);
 	if (bp->b_vflags & BV_BKGRDINPROG) {
 		if (bp->b_flags & B_ASYNC) {
 			BO_UNLOCK(bp->b_bufobj);
 			bdwrite(bp);
 			return (0);
 		}
 		/* Woken by ffs_backgroundwritedone() on the same channel. */
 		bp->b_vflags |= BV_BKGRDWAIT;
 		msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO,
 		    "bwrbg", 0);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("bufwrite: still writing");
 	}
 	bp->b_vflags &= ~BV_BKGRDERR;
 	BO_UNLOCK(bp->b_bufobj);
 
 	/*
 	 * If this buffer is marked for background writing and we
 	 * do not have to wait for it, make a copy and write the
 	 * copy so as to leave this buffer ready for further use.
 	 *
 	 * This optimization eats a lot of memory.  If we have a page
 	 * or buffer shortfall we can't do it.
 	 */
 	if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) &&
 	    (bp->b_flags & B_ASYNC) &&
 	    !vm_page_count_severe() &&
 	    !buf_dirty_count_severe()) {
 		KASSERT(bp->b_iodone == NULL,
 		    ("bufwrite: needs chained iodone (%p)", bp->b_iodone));
 
 		/* get a new block */
 		newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD);
 		/* No spare buffer: write the original (still dirty) as is. */
 		if (newbp == NULL)
 			goto normal_write;
 
 		KASSERT(buf_mapped(bp), ("Unmapped cg"));
 		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
 		BO_LOCK(bp->b_bufobj);
 		bp->b_vflags |= BV_BKGRDINPROG;
 		BO_UNLOCK(bp->b_bufobj);
 		/* Make the copy address the same block as the original. */
 		newbp->b_xflags |=
 		    (bp->b_xflags & BX_FSPRIV) | BX_BKGRDMARKER;
 		newbp->b_lblkno = bp->b_lblkno;
 		newbp->b_blkno = bp->b_blkno;
 		newbp->b_offset = bp->b_offset;
 		newbp->b_iodone = ffs_backgroundwritedone;
 		newbp->b_flags |= B_ASYNC;
 		newbp->b_flags &= ~B_INVAL;
 		pbgetvp(bp->b_vp, newbp);
 
 #ifdef SOFTUPDATES
 		/*
 		 * Move over the dependencies.  If there are rollbacks,
 		 * leave the parent buffer dirtied as it will need to
 		 * be written again.
 		 */
 		if (LIST_EMPTY(&bp->b_dep) ||
 		    softdep_move_dependencies(bp, newbp) == 0)
 			bundirty(bp);
 #else
 		bundirty(bp);
 #endif
 
 		/*
 		 * Initiate write on the copy, release the original.  The
 		 * BKGRDINPROG flag prevents it from going away until
 		 * the background write completes. We have to recalculate
 		 * its check hash in case the buffer gets freed and then
 		 * reconstituted from the buffer cache during a later read.
 		 */
 		if ((bp->b_xflags & BX_CYLGRP) != 0) {
 			cgp = (struct cg *)bp->b_data;
 			/* The hash is computed with cg_ckhash zeroed. */
 			cgp->cg_ckhash = 0;
 			cgp->cg_ckhash =
 			    calculate_crc32c(~0L, bp->b_data, bp->b_bcount);
 		}
 		bqrelse(bp);
 		bp = newbp;
 	} else
 		/* Mark the buffer clean */
 		bundirty(bp);
 
 
 	/* Let the normal bufwrite do the rest for us */
 normal_write:
 	/*
 	 * If we are writing a cylinder group, update its time.
 	 */
 	if ((bp->b_xflags & BX_CYLGRP) != 0) {
 		cgp = (struct cg *)bp->b_data;
 		cgp->cg_old_time = cgp->cg_time = time_second;
 	}
 	return (bufwrite(bp));
 }
 
 
 /*
  * Bufobj strategy routine for the device vnode used by FFS.
  *
  * For writes it panics on a write to a suspended mount that is not
  * flagged B_VALIDSUSPWRT, runs ffs_copyonwrite() on buffers of a device
  * that has snapshot data (unless B_NOCOPY was set), starts soft updates
  * processing on buffers with dependencies, and refreshes the cylinder
  * group check-hash.  The request is then passed to g_vfs_strategy().
  * If ffs_copyonwrite() fails with anything other than EOPNOTSUPP the
  * buffer is completed with that error and never reaches GEOM.
  */
 static void
 ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
 {
 	struct vnode *vp;
 	struct buf *tbp;
 	int error, nocopy;
 
 	/*
 	 * This is the bufobj strategy for the private VCHR vnodes
 	 * used by FFS to access the underlying storage device.
 	 * We override the default bufobj strategy and thus bypass
 	 * VOP_STRATEGY() for these vnodes.
 	 */
 	vp = bo2vnode(bo);
 	KASSERT(bp->b_vp == NULL || bp->b_vp->v_type != VCHR ||
 	    bp->b_vp->v_rdev == NULL ||
 	    bp->b_vp->v_rdev->si_mountpt == NULL ||
 	    VFSTOUFS(bp->b_vp->v_rdev->si_mountpt) == NULL ||
 	    vp == VFSTOUFS(bp->b_vp->v_rdev->si_mountpt)->um_devvp,
 	    ("ffs_geom_strategy() with wrong vp"));
 	if (bp->b_iocmd == BIO_WRITE) {
 		if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
 		    bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
 		    (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
 			panic("ffs_geom_strategy: bad I/O");
 		/* Remember B_NOCOPY, then strip the one-shot flags. */
 		nocopy = bp->b_flags & B_NOCOPY;
 		bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY);
 		if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 &&
 		    vp->v_rdev->si_snapdata != NULL) {
 			if ((bp->b_flags & B_CLUSTER) != 0) {
 				/*
 				 * Cluster: copy-on-write each member.  The
 				 * running-buffer accounting is dropped for
 				 * the duration and re-added afterwards.
 				 */
 				runningbufwakeup(bp);
 				TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
 					      b_cluster.cluster_entry) {
 					error = ffs_copyonwrite(vp, tbp);
 					if (error != 0 &&
 					    error != EOPNOTSUPP) {
 						bp->b_error = error;
 						bp->b_ioflags |= BIO_ERROR;
 						bufdone(bp);
 						return;
 					}
 				}
 				bp->b_runningbufspace = bp->b_bufsize;
 				atomic_add_long(&runningbufspace,
 					       bp->b_runningbufspace);
 			} else {
 				error = ffs_copyonwrite(vp, bp);
 				if (error != 0 && error != EOPNOTSUPP) {
 					bp->b_error = error;
 					bp->b_ioflags |= BIO_ERROR;
 					bufdone(bp);
 					return;
 				}
 			}
 		}
 #ifdef SOFTUPDATES
 		/* Call buf_start() on every buffer that has dependencies. */
 		if ((bp->b_flags & B_CLUSTER) != 0) {
 			TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
 				      b_cluster.cluster_entry) {
 				if (!LIST_EMPTY(&tbp->b_dep))
 					buf_start(tbp);
 			}
 		} else {
 			if (!LIST_EMPTY(&bp->b_dep))
 				buf_start(bp);
 		}
 
 #endif
 		/*
 		 * Check for metadata that needs check-hashes and update them.
 		 */
 		switch (bp->b_xflags & BX_FSPRIV) {
 		case BX_CYLGRP:
 			/* The hash is computed with cg_ckhash zeroed. */
 			((struct cg *)bp->b_data)->cg_ckhash = 0;
 			((struct cg *)bp->b_data)->cg_ckhash =
 			    calculate_crc32c(~0L, bp->b_data, bp->b_bcount);
 			break;
 
 		case BX_SUPERBLOCK:
 		case BX_INODE:
 		case BX_INDIR:
 		case BX_DIR:
 			printf("Check-hash write is unimplemented!!!\n");
 			break;
 
 		case 0:
 			break;
 
 		default:
 			printf("multiple buffer types 0x%b\n",
 			    (u_int)(bp->b_xflags & BX_FSPRIV),
 			    PRINT_UFS_BUF_XFLAGS);
 			break;
 		}
 	}
 	/* Every non-read request is tagged when ffs_enxio_enable is set. */
 	if (bp->b_iocmd != BIO_READ && ffs_enxio_enable)
 		bp->b_xflags |= BX_CVTENXIO;
 	g_vfs_strategy(bo, bp);
 }
 
 /*
  * Report whether mp uses the UFS operations vector: returns 1 when
  * mnt_op points at ufs_vfsops and 0 otherwise.
  */
 int
 ffs_own_mount(const struct mount *mp)
 {
 
 	return (mp->mnt_op == &ufs_vfsops ? 1 : 0);
 }
 
 #ifdef	DDB
 #ifdef SOFTUPDATES
 
 /* defined in ffs_softdep.c */
 extern void db_print_ffs(struct ufsmount *ump);
 
 /*
  * DDB "show ffs" command.  With an address argument, the address is
  * treated as a struct mount pointer and that mount is printed; without
  * one, every entry of mountlist whose filesystem type name equals
  * ufs_vfsconf.vfc_name is printed.
  */
 DB_SHOW_COMMAND(ffs, db_show_ffs)
 {
 	struct mount *mp;
 
 	if (have_addr) {
 		db_print_ffs(VFSTOUFS((struct mount *)addr));
 		return;
 	}
 
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (strcmp(mp->mnt_stat.f_fstypename,
 		    ufs_vfsconf.vfc_name) != 0)
 			continue;
 		db_print_ffs(VFSTOUFS(mp));
 	}
 }
 
 #endif	/* SOFTUPDATES */
 #endif	/* DDB */
Index: projects/clang1100-import/sys/ufs/ufs/ufs_quota.c
===================================================================
--- projects/clang1100-import/sys/ufs/ufs/ufs_quota.c	(revision 364278)
+++ projects/clang1100-import/sys/ufs/ufs/ufs_quota.c	(revision 364279)
@@ -1,1886 +1,1885 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Robert Elz at The University of Melbourne.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ufs_quota.c	8.5 (Berkeley) 5/20/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ffs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/endian.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>
 
 /*
  * Compile-time check that the 64-bit quota file header and a 64-bit
  * quota record have the same size.
  */
 CTASSERT(sizeof(struct dqblk64) == sizeof(struct dqhdr64));
 
 /*
  * Read-write sysctl under _security_bsd.  When non-zero, _getquota()
  * skips its PRIV_VFS_GETQUOTA check for ids that are not the caller's own.
  */
 static int unprivileged_get_quota = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_get_quota, CTLFLAG_RW,
     &unprivileged_get_quota, 0,
     "Unprivileged processes may retrieve quotas for other uids and gids");
 
 /* Malloc type used for the dquot hash table and dquot entries. */
 static MALLOC_DEFINE(M_DQUOT, "ufs_quota", "UFS quota entries");
 
 /*
  * Quota name to error message mapping.  Indexed by quota type when
  * formatting the uprintf() warnings and failures in this file.
  */
 static char *quotatypes[] = INITQFNAMES;
 
 static int chkdqchg(struct inode *, ufs2_daddr_t, struct ucred *, int, int *);
 static int chkiqchg(struct inode *, int, struct ucred *, int, int *);
 static int dqopen(struct vnode *, struct ufsmount *, int);
 static int dqget(struct vnode *,
 	u_long, struct ufsmount *, int, struct dquot **);
 static int dqsync(struct vnode *, struct dquot *);
 static int dqflush(struct vnode *);
 static int quotaoff1(struct thread *td, struct mount *mp, int type);
 static int quotaoff_inchange(struct thread *td, struct mount *mp, int type);
 
 /* conversion functions - from_to() */
 static void dqb32_dq(const struct dqblk32 *, struct dquot *);
 static void dqb64_dq(const struct dqblk64 *, struct dquot *);
 static void dq_dqb32(const struct dquot *, struct dqblk32 *);
 static void dq_dqb64(const struct dquot *, struct dqblk64 *);
 static void dqb32_dqb64(const struct dqblk32 *, struct dqblk64 *);
 static void dqb64_dqb32(const struct dqblk64 *, struct dqblk32 *);
 
 #ifdef DIAGNOSTIC
 static void dqref(struct dquot *);
 static void chkdquot(struct inode *);
 #endif
 
 /*
  * Set up the quotas for an inode.
  *
  * This routine completely defines the semantics of quotas.
  * If other criterion want to be used to establish quotas, the
  * MAXQUOTAS value in quota.h should be increased, and the
  * additional dquots set up here.
  */
 int
 getinoquota(struct inode *ip)
 {
 	struct ufsmount *ump;
 	struct vnode *vp;
 	int error;
 
 	vp = ITOV(ip);
 
 	/*
 	 * Disk quotas must be turned off for system files.  Currently
 	 * snapshot and quota files.
 	 */
 	if ((vp->v_vflag & VV_SYSTEM) != 0)
 		return (0);
 	/*
 	 * XXX: Turn off quotas for files with a negative UID or GID.
 	 * This prevents the creation of 100GB+ quota files.
 	 */
 	if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0)
 		return (0);
 	ump = VFSTOUFS(vp->v_mount);
 	/*
 	 * Set up the user quota based on file uid.
 	 * EINVAL means that quotas are not enabled.
 	 */
 	if ((error =
 		dqget(vp, ip->i_uid, ump, USRQUOTA, &ip->i_dquot[USRQUOTA])) &&
 	    error != EINVAL)
 		return (error);
 	/*
 	 * Set up the group quota based on file gid.
 	 * EINVAL means that quotas are not enabled.
 	 */
 	if ((error =
 		dqget(vp, ip->i_gid, ump, GRPQUOTA, &ip->i_dquot[GRPQUOTA])) &&
 	    error != EINVAL)
 		return (error);
 	return (0);
 }
 
 /*
  * Update disk usage, and take corrective action.
  *
  * Applies `change' blocks to every dquot attached to the inode.
  *  - System vnodes, inodes with a negative uid/gid and change == 0 are
  *    no-ops returning 0.
  *  - A negative change lowers dq_curblocks (clamped at 0), clears DQ_BLKS
  *    and marks the dquot DQ_MOD.
  *  - A positive change is validated with chkdqchg() unless FORCE is set
  *    in flags or priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA) returns 0.
  *    If validation fails for one quota type, the usage already added to
  *    the lower-numbered types is subtracted again and the error returned.
  * Returns 0 on success.
  */
 int
 chkdq(struct inode *ip, ufs2_daddr_t change, struct ucred *cred, int flags)
 {
 	struct dquot *dq;
 	ufs2_daddr_t ncurblocks;
 	struct vnode *vp = ITOV(ip);
 	int i, error, warn, do_check;
 
 	/* A credential is required unless the caller forces the change. */
 	MPASS(cred != NOCRED || (flags & FORCE) != 0);
 	/*
 	 * Disk quotas must be turned off for system files.  Currently
 	 * snapshot and quota files.
 	 */
 	if ((vp->v_vflag & VV_SYSTEM) != 0)
 		return (0);
 	/*
 	 * XXX: Turn off quotas for files with a negative UID or GID.
 	 * This prevents the creation of 100GB+ quota files.
 	 */
 	if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0)
 		return (0);
 #ifdef DIAGNOSTIC
 	if ((flags & CHOWN) == 0)
 		chkdquot(ip);
 #endif
 	if (change == 0)
 		return (0);
 	/* Releasing blocks: no limit checks, just lower the usage. */
 	if (change < 0) {
 		for (i = 0; i < MAXQUOTAS; i++) {
 			if ((dq = ip->i_dquot[i]) == NODQUOT)
 				continue;
 			DQI_LOCK(dq);
 			DQI_WAIT(dq, PINOD+1, "chkdq1");
 			ncurblocks = dq->dq_curblocks + change;
 			if (ncurblocks >= 0)
 				dq->dq_curblocks = ncurblocks;
 			else
 				dq->dq_curblocks = 0;
 			dq->dq_flags &= ~DQ_BLKS;
 			dq->dq_flags |= DQ_MOD;
 			DQI_UNLOCK(dq);
 		}
 		return (0);
 	}
 	/* Limits are enforced only when not forced and not privileged. */
 	if ((flags & FORCE) == 0 &&
 	    priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA))
 		do_check = 1;
 	else
 		do_check = 0;
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if ((dq = ip->i_dquot[i]) == NODQUOT)
 			continue;
 		warn = 0;
 		DQI_LOCK(dq);
 		DQI_WAIT(dq, PINOD+1, "chkdq2");
 		if (do_check) {
 			error = chkdqchg(ip, change, cred, i, &warn);
 			if (error) {
 				/*
 				 * chkdqchg() has already unlocked the
 				 * failing dquot on its error returns.
 				 *
 				 * Roll back user quota changes when
 				 * group quota failed.
 				 */
 				while (i > 0) {
 					--i;
 					dq = ip->i_dquot[i];
 					if (dq == NODQUOT)
 						continue;
 					DQI_LOCK(dq);
 					DQI_WAIT(dq, PINOD+1, "chkdq3");
 					ncurblocks = dq->dq_curblocks - change;
 					if (ncurblocks >= 0)
 						dq->dq_curblocks = ncurblocks;
 					else
 						dq->dq_curblocks = 0;
 					dq->dq_flags &= ~DQ_BLKS;
 					dq->dq_flags |= DQ_MOD;
 					DQI_UNLOCK(dq);
 				}
 				return (error);
 			}
 		}
 		/* Reset timer when crossing soft limit */
 		if (dq->dq_curblocks + change >= dq->dq_bsoftlimit &&
 		    dq->dq_curblocks < dq->dq_bsoftlimit)
 			dq->dq_btime = time_second + ITOUMP(ip)->um_btime[i];
 		dq->dq_curblocks += change;
 		dq->dq_flags |= DQ_MOD;
 		DQI_UNLOCK(dq);
 		/* The warning is printed only after the dquot lock is dropped. */
 		if (warn)
 			uprintf("\n%s: warning, %s disk quota exceeded\n",
 			    ITOVFS(ip)->mnt_stat.f_mntonname,
 			    quotatypes[i]);
 	}
 	return (0);
 }
 
 /*
  * Check for a valid change to a user's block allocation.
  * Issue an error message if appropriate.
  *
  * Called from chkdq() with the dquot for `type' locked.  Returns 0 with
  * the dquot still locked when the change is allowed (setting *warn when
  * the inode owner has just crossed the soft limit), or EDQUOT after
  * unlocking the dquot when the hard limit would be reached or the soft
  * limit has been exceeded past dq_btime.  The failure message is printed
  * at most once per DQ_BLKS cycle and only to the owner of the inode.
  */
 static int
 chkdqchg(struct inode *ip, ufs2_daddr_t change, struct ucred *cred,
     int type, int *warn)
 {
 	struct dquot *dq;
 	ufs2_daddr_t ncurblocks;
 	int notify;
 
 	dq = ip->i_dquot[type];
 	ncurblocks = dq->dq_curblocks + change;
 
 	/*
 	 * If user would exceed their hard limit, disallow space allocation.
 	 */
 	if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) {
 		notify = (dq->dq_flags & DQ_BLKS) == 0 &&
 		    ip->i_uid == cred->cr_uid;
 		if (notify)
 			dq->dq_flags |= DQ_BLKS;
 		DQI_UNLOCK(dq);
 		if (notify)
 			uprintf("\n%s: write failed, %s disk limit reached\n",
 			    ITOVFS(ip)->mnt_stat.f_mntonname,
 			    quotatypes[type]);
 		return (EDQUOT);
 	}
 
 	/* No soft limit, or still below it: nothing more to check. */
 	if (ncurblocks < dq->dq_bsoftlimit || dq->dq_bsoftlimit == 0)
 		return (0);
 
 	/*
 	 * Crossing the soft limit right now: start the grace period and
 	 * let the caller warn the owner.
 	 */
 	if (dq->dq_curblocks < dq->dq_bsoftlimit) {
 		dq->dq_btime = time_second + ITOUMP(ip)->um_btime[type];
 		if (ip->i_uid == cred->cr_uid)
 			*warn = 1;
 		return (0);
 	}
 
 	/* Already over the soft limit but still within the grace period. */
 	if (time_second <= dq->dq_btime)
 		return (0);
 
 	/*
 	 * User is over their soft limit for too long, disallow space
 	 * allocation.
 	 */
 	notify = (dq->dq_flags & DQ_BLKS) == 0 && ip->i_uid == cred->cr_uid;
 	if (notify)
 		dq->dq_flags |= DQ_BLKS;
 	DQI_UNLOCK(dq);
 	if (notify)
 		uprintf("\n%s: write failed, %s "
 		    "disk quota exceeded for too long\n",
 		    ITOVFS(ip)->mnt_stat.f_mntonname,
 		    quotatypes[type]);
 	return (EDQUOT);
 }
 
 /*
  * Check the inode limit, applying corrective action.
  *
  * Applies `change' inodes to every dquot attached to the inode.
  *  - change == 0 is a no-op returning 0.
  *  - A negative change lowers dq_curinodes (clamped at 0), clears
  *    DQ_INODS and marks the dquot DQ_MOD.
  *  - A positive change is validated with chkiqchg() unless FORCE is set
  *    in flags or priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA) returns 0.
  *    If validation fails for one quota type, the usage already added to
  *    the lower-numbered types is subtracted again and the error returned.
  * Returns 0 on success.
  */
 int
 chkiq(struct inode *ip, int change, struct ucred *cred, int flags)
 {
 	struct dquot *dq;
 	int i, error, warn, do_check;
 
 	/* A credential is required unless the caller forces the change. */
 	MPASS(cred != NOCRED || (flags & FORCE) != 0);
 #ifdef DIAGNOSTIC
 	if ((flags & CHOWN) == 0)
 		chkdquot(ip);
 #endif
 	if (change == 0)
 		return (0);
 	/* Releasing inodes: no limit checks, just lower the usage. */
 	if (change < 0) {
 		for (i = 0; i < MAXQUOTAS; i++) {
 			if ((dq = ip->i_dquot[i]) == NODQUOT)
 				continue;
 			DQI_LOCK(dq);
 			DQI_WAIT(dq, PINOD+1, "chkiq1");
 			if (dq->dq_curinodes >= -change)
 				dq->dq_curinodes += change;
 			else
 				dq->dq_curinodes = 0;
 			dq->dq_flags &= ~DQ_INODS;
 			dq->dq_flags |= DQ_MOD;
 			DQI_UNLOCK(dq);
 		}
 		return (0);
 	}
 	/* Limits are enforced only when not forced and not privileged. */
 	if ((flags & FORCE) == 0 &&
 	    priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA))
 		do_check = 1;
 	else
 		do_check = 0;
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if ((dq = ip->i_dquot[i]) == NODQUOT)
 			continue;
 		warn = 0;
 		DQI_LOCK(dq);
 		DQI_WAIT(dq, PINOD+1, "chkiq2");
 		if (do_check) {
 			error = chkiqchg(ip, change, cred, i, &warn);
 			if (error) {
 				/*
 				 * chkiqchg() has already unlocked the
 				 * failing dquot on its error returns.
 				 *
 				 * Roll back user quota changes when
 				 * group quota failed.
 				 */
 				while (i > 0) {
 					--i;
 					dq = ip->i_dquot[i];
 					if (dq == NODQUOT)
 						continue;
 					DQI_LOCK(dq);
 					DQI_WAIT(dq, PINOD+1, "chkiq3");
 					if (dq->dq_curinodes >= change)
 						dq->dq_curinodes -= change;
 					else
 						dq->dq_curinodes = 0;
 					dq->dq_flags &= ~DQ_INODS;
 					dq->dq_flags |= DQ_MOD;
 					DQI_UNLOCK(dq);
 				}
 				return (error);
 			}
 		}
 		/* Reset timer when crossing soft limit */
 		if (dq->dq_curinodes + change >= dq->dq_isoftlimit &&
 		    dq->dq_curinodes < dq->dq_isoftlimit)
 			dq->dq_itime = time_second + ITOUMP(ip)->um_itime[i];
 		dq->dq_curinodes += change;
 		dq->dq_flags |= DQ_MOD;
 		DQI_UNLOCK(dq);
 		/* The warning is printed only after the dquot lock is dropped. */
 		if (warn)
 			uprintf("\n%s: warning, %s inode quota exceeded\n",
 			    ITOVFS(ip)->mnt_stat.f_mntonname,
 			    quotatypes[i]);
 	}
 	return (0);
 }
 
 /*
  * Check for a valid change to a user's inode allocation.
  * Issue an error message if appropriate.
  *
  * Called from chkiq() with the dquot for `type' locked.  Returns 0 with
  * the dquot still locked when the change is allowed (setting *warn when
  * the inode owner has just crossed the soft limit), or EDQUOT after
  * unlocking the dquot when the hard limit would be reached or the soft
  * limit has been exceeded past dq_itime.  The failure message is printed
  * at most once per DQ_INODS cycle and only to the owner of the inode.
  */
 static int
 chkiqchg(struct inode *ip, int change, struct ucred *cred, int type, int *warn)
 {
 	struct dquot *dq;
 	ino_t ncurinodes;
 	int notify;
 
 	dq = ip->i_dquot[type];
 	ncurinodes = dq->dq_curinodes + change;
 
 	/*
 	 * If user would exceed their hard limit, disallow inode allocation.
 	 */
 	if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) {
 		notify = (dq->dq_flags & DQ_INODS) == 0 &&
 		    ip->i_uid == cred->cr_uid;
 		if (notify)
 			dq->dq_flags |= DQ_INODS;
 		DQI_UNLOCK(dq);
 		if (notify)
 			uprintf("\n%s: write failed, %s inode limit reached\n",
 			    ITOVFS(ip)->mnt_stat.f_mntonname,
 			    quotatypes[type]);
 		return (EDQUOT);
 	}
 
 	/* No soft limit, or still below it: nothing more to check. */
 	if (ncurinodes < dq->dq_isoftlimit || dq->dq_isoftlimit == 0)
 		return (0);
 
 	/*
 	 * Crossing the soft limit right now: start the grace period and
 	 * let the caller warn the owner.
 	 */
 	if (dq->dq_curinodes < dq->dq_isoftlimit) {
 		dq->dq_itime = time_second + ITOUMP(ip)->um_itime[type];
 		if (ip->i_uid == cred->cr_uid)
 			*warn = 1;
 		return (0);
 	}
 
 	/* Already over the soft limit but still within the grace period. */
 	if (time_second <= dq->dq_itime)
 		return (0);
 
 	/*
 	 * User is over their soft limit for too long, disallow inode
 	 * allocation.
 	 */
 	notify = (dq->dq_flags & DQ_INODS) == 0 && ip->i_uid == cred->cr_uid;
 	if (notify)
 		dq->dq_flags |= DQ_INODS;
 	DQI_UNLOCK(dq);
 	if (notify)
 		uprintf("\n%s: write failed, %s "
 		    "inode quota exceeded for too long\n",
 		    ITOVFS(ip)->mnt_stat.f_mntonname,
 		    quotatypes[type]);
 	return (EDQUOT);
 }
 
 #ifdef DIAGNOSTIC
 /*
  * Diagnostic consistency check: on a filesystem with quotas enabled it
  * is an error for a file to change size without having a dquot
  * structure associated with it.  Panics when a quota type has a quota
  * vnode installed, is neither being opened nor closed, and the inode
  * has no dquot for that type.
  */
 static void
 chkdquot(struct inode *ip)
 {
 	struct ufsmount *ump;
 	struct vnode *vp;
 	int type;
 
 	vp = ITOV(ip);
 	ump = ITOUMP(ip);
 
 	/*
 	 * Disk quotas must be turned off for system files.  Currently
 	 * these are snapshots and quota files.
 	 */
 	if ((vp->v_vflag & VV_SYSTEM) != 0)
 		return;
 
 	/*
 	 * XXX: Files with a negative UID or GID are not accounted.
 	 * This prevents the creation of 100GB+ quota files.
 	 */
 	if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0)
 		return;
 
 	UFS_LOCK(ump);
 	for (type = 0; type < MAXQUOTAS; type++) {
 		if (ump->um_quotas[type] != NULLVP &&
 		    (ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) == 0 &&
 		    ip->i_dquot[type] == NODQUOT) {
 			/* Drop the mount lock before reporting and panicking. */
 			UFS_UNLOCK(ump);
 			vn_printf(vp, "chkdquot: missing dquot ");
 			panic("chkdquot: missing dquot");
 		}
 	}
 	UFS_UNLOCK(ump);
 }
 #endif
 
 /*
  * Code to process quotactl commands.
  */
 
 /*
  * Q_QUOTAON - set up a quota file for a particular filesystem.
  *
  * Every return path in this function leaves mp un-busied: it either
  * calls vfs_unbusy(mp) itself or returns after a failed vfs_busy().
  *
  * Outline:
  *  1. Require PRIV_UFS_QUOTAON and a read-write mount (EROFS otherwise).
  *  2. Open `fname' read/write with vn_open(); the busy state is swapped
  *     for a vfs_ref() across the open and re-acquired with MBF_NOWAIT.
  *     The file must be a regular file (EACCES otherwise).
  *  3. Fail with EALREADY if the type is being opened or closed; else
  *     set QTF_OPENING|QTF_CLOSING and let dqopen() detect the format.
  *  4. Set MNT_QUOTA, call quotaoff1() when a different quota vnode was
  *     installed for this type, mark the new vnode VV_SYSTEM and install
  *     it together with the caller's credential and the time limits.
  *  5. Clear QTF_CLOSING and attach dquots to every vnode that is open
  *     for writing; on failure undo everything with quotaoff_inchange().
  *  6. Clear QTF_OPENING and return the last error (0 on success).
  */
 int
 quotaon(struct thread *td, struct mount *mp, int type, void *fname)
 {
 	struct ufsmount *ump;
 	struct vnode *vp, **vpp;
 	struct vnode *mvp;
 	struct dquot *dq;
 	int error, flags;
 	struct nameidata nd;
 
 	error = priv_check(td, PRIV_UFS_QUOTAON);
 	if (error != 0) {
 		vfs_unbusy(mp);
 		return (error);
 	}
 
 	if ((mp->mnt_flag & MNT_RDONLY) != 0) {
 		vfs_unbusy(mp);
 		return (EROFS);
 	}
 
 	ump = VFSTOUFS(mp);
 	dq = NODQUOT;
 
 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, fname, td);
 	flags = FREAD | FWRITE;
 	/* Keep mp referenced, but not busied, while the file is opened. */
 	vfs_ref(mp);
 	vfs_unbusy(mp);
 	error = vn_open(&nd, &flags, 0, NULL);
 	if (error != 0) {
 		vfs_rel(mp);
 		return (error);
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	error = vfs_busy(mp, MBF_NOWAIT);
 	vfs_rel(mp);
 	if (error == 0) {
 		if (vp->v_type != VREG) {
 			error = EACCES;
 			vfs_unbusy(mp);
 		}
 	}
 	if (error != 0) {
 		VOP_UNLOCK(vp);
 		(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
 		return (error);
 	}
 
 	UFS_LOCK(ump);
 	if ((ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) != 0) {
 		UFS_UNLOCK(ump);
 		VOP_UNLOCK(vp);
 		(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
 		vfs_unbusy(mp);
 		return (EALREADY);
 	}
 	ump->um_qflags[type] |= QTF_OPENING|QTF_CLOSING;
 	UFS_UNLOCK(ump);
 	/* Determine whether this is a 32-bit or a 64-bit quota file. */
 	if ((error = dqopen(vp, ump, type)) != 0) {
 		VOP_UNLOCK(vp);
 		UFS_LOCK(ump);
 		ump->um_qflags[type] &= ~(QTF_OPENING|QTF_CLOSING);
 		UFS_UNLOCK(ump);
 		(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
 		vfs_unbusy(mp);
 		return (error);
 	}
 	VOP_UNLOCK(vp);
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_QUOTA;
 	MNT_IUNLOCK(mp);
 
 	/* Tear down whatever quota file was previously installed, if any. */
 	vpp = &ump->um_quotas[type];
 	if (*vpp != vp)
 		quotaoff1(td, mp, type);
 
 	/*
 	 * When the directory vnode containing the quota file is
 	 * inactivated, due to the shared lookup of the quota file
 	 * vput()ing the dvp, the qsyncvp() call for the containing
 	 * directory would try to acquire the quota lock exclusive.
 	 * At the same time, lookup already locked the quota vnode
 	 * shared.  Mark the quota vnode lock as allowing recursion
 	 * and automatically converting shared locks to exclusive.
 	 *
 	 * Also mark quota vnode as system.
 	 */
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	vp->v_vflag |= VV_SYSTEM;
 	VN_LOCK_AREC(vp);
 	VN_LOCK_DSHARE(vp);
 	VOP_UNLOCK(vp);
 	*vpp = vp;
 	/*
 	 * Save the credential of the process that turned on quotas.
 	 * Set up the time limits for this quota.
 	 */
 	ump->um_cred[type] = crhold(td->td_ucred);
 	ump->um_btime[type] = MAX_DQ_TIME;
 	ump->um_itime[type] = MAX_IQ_TIME;
 	/* The dquot for id 0 overrides the default time limits when set. */
 	if (dqget(NULLVP, 0, ump, type, &dq) == 0) {
 		if (dq->dq_btime > 0)
 			ump->um_btime[type] = dq->dq_btime;
 		if (dq->dq_itime > 0)
 			ump->um_itime[type] = dq->dq_itime;
 		dqrele(NULLVP, dq);
 	}
 	/*
 	 * Allow the getdq from getinoquota below to read the quota
 	 * from file.
 	 */
 	UFS_LOCK(ump);
 	ump->um_qflags[type] &= ~QTF_CLOSING;
 	UFS_UNLOCK(ump);
 	/*
 	 * Search vnodes associated with this mount point,
 	 * adding references to quota file being opened.
 	 * NB: only need to add dquot's for inodes being modified.
 	 */
 again:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
-		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
+		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) {
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			goto again;
 		}
 		if (vp->v_type == VNON || vp->v_writecount <= 0) {
 			VOP_UNLOCK(vp);
 			vrele(vp);
 			continue;
 		}
 		error = getinoquota(VTOI(vp));
 		VOP_UNLOCK(vp);
 		vrele(vp);
 		if (error) {
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			break;
 		}
 	}
 
         if (error)
 		quotaoff_inchange(td, mp, type);
 	UFS_LOCK(ump);
 	ump->um_qflags[type] &= ~QTF_OPENING;
 	KASSERT((ump->um_qflags[type] & QTF_CLOSING) == 0,
 		("quotaon: leaking flags"));
 	UFS_UNLOCK(ump);
 
 	vfs_unbusy(mp);
 	return (error);
 }
 
 /*
  * Main code to turn off disk quotas for a filesystem. Does not change
  * flags.
  *
  * The caller must already have set QTF_CLOSING for `type' (asserted).
  * Returns 0 immediately when no quota vnode is installed for the type.
  * Otherwise every vnode of the mount has its dquot reference for the
  * type dropped, the cached dquots for the quota vnode are flushed with
  * dqflush(), um_quotas[type] and um_cred[type] are cleared, and the
  * quota vnode loses VV_SYSTEM and is closed.  A dqflush() failure is
  * returned before um_quotas[type] is cleared or the vnode is closed;
  * otherwise the result of vn_close() is returned.
  */
 static int
 quotaoff1(struct thread *td, struct mount *mp, int type)
 {
 	struct vnode *vp;
 	struct vnode *qvp, *mvp;
 	struct ufsmount *ump;
 	struct dquot *dq;
 	struct inode *ip;
 	struct ucred *cr;
 	int error;
 
 	ump = VFSTOUFS(mp);
 
 	UFS_LOCK(ump);
 	KASSERT((ump->um_qflags[type] & QTF_CLOSING) != 0,
 		("quotaoff1: flags are invalid"));
 	if ((qvp = ump->um_quotas[type]) == NULLVP) {
 		UFS_UNLOCK(ump);
 		return (0);
 	}
 	/* Remember the saved credential; it is released after the close. */
 	cr = ump->um_cred[type];
 	UFS_UNLOCK(ump);
 
 	/*
 	 * Search vnodes associated with this mount point,
 	 * deleting any references to quota file being closed.
 	 */
 again:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		if (vp->v_type == VNON) {
 			VI_UNLOCK(vp);
 			continue;
 		}
-		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
+		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) {
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			goto again;
 		}
 		/* Detach the dquot from the inode before releasing it. */
 		ip = VTOI(vp);
 		dq = ip->i_dquot[type];
 		ip->i_dquot[type] = NODQUOT;
 		dqrele(vp, dq);
 		VOP_UNLOCK(vp);
 		vrele(vp);
 	}
 
 	error = dqflush(qvp);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Clear um_quotas before closing the quota vnode to prevent
 	 * access to the closed vnode from dqget/dqsync
 	 */
 	UFS_LOCK(ump);
 	ump->um_quotas[type] = NULLVP;
 	ump->um_cred[type] = NOCRED;
 	UFS_UNLOCK(ump);
 
 	/* Undo the VV_SYSTEM marking applied by quotaon(). */
 	vn_lock(qvp, LK_EXCLUSIVE | LK_RETRY);
 	qvp->v_vflag &= ~VV_SYSTEM;
 	VOP_UNLOCK(qvp);
 	error = vn_close(qvp, FREAD|FWRITE, td->td_ucred, td);
 	crfree(cr);
 
 	return (error);
 }
 
 /*
  * Run quotaoff1() with writes to the filesystem suspended.
  *
  * mp is already suspended on unmount.  If this thread is not the
  * suspension owner, suspend mp here, to avoid the situation where the
  * quotaoff operation eventually fails due to SU structures still
  * keeping references on dquots while the vnodes' references are already
  * clean.  That would cause a quota accounting leak and asserts.
  * Note that the thread has already called vn_start_write().
  *
  * Returns the vfs_write_suspend_umnt() error if suspension fails,
  * otherwise the result of quotaoff1().
  */
 static int
 quotaoff_inchange1(struct thread *td, struct mount *mp, int type)
 {
 	int error;
 	bool suspended_here;
 
 	suspended_here = (mp->mnt_susp_owner != td);
 	if (suspended_here) {
 		error = vfs_write_suspend_umnt(mp);
 		if (error != 0)
 			return (error);
 	}
 
 	error = quotaoff1(td, mp, type);
 
 	/* Only undo a suspension that was taken in this function. */
 	if (suspended_here)
 		vfs_write_resume(mp, VR_START_WRITE);
 	return (error);
 }
 
 /*
  * Turn quotas of the given type off.  Assumes that ump->um_qflags have
  * already been checked and that QTF_CLOSING is set to indicate the
  * operation in progress.  Afterwards QTF_CLOSING is cleared and, when
  * no quota vnode remains installed for any type, MNT_QUOTA is removed
  * from mp->mnt_flag.  Returns the result of quotaoff_inchange1().
  */
 int
 quotaoff_inchange(struct thread *td, struct mount *mp, int type)
 {
 	struct ufsmount *ump;
 	int error, qtype, any_active;
 
 	error = quotaoff_inchange1(td, mp, type);
 
 	ump = VFSTOUFS(mp);
 	UFS_LOCK(ump);
 	ump->um_qflags[type] &= ~QTF_CLOSING;
 
 	any_active = 0;
 	for (qtype = 0; qtype < MAXQUOTAS && !any_active; qtype++)
 		any_active = (ump->um_quotas[qtype] != NULLVP);
 
 	if (!any_active) {
 		MNT_ILOCK(mp);
 		mp->mnt_flag &= ~MNT_QUOTA;
 		MNT_IUNLOCK(mp);
 	}
 	UFS_UNLOCK(ump);
 	return (error);
 }
 
 /*
  * Q_QUOTAOFF - turn off disk quotas for a filesystem.
  *
  * Requires PRIV_UFS_QUOTAOFF.  Fails with EALREADY when the quota type
  * is currently being opened or closed; otherwise marks it QTF_CLOSING
  * and hands over to quotaoff_inchange().
  */
 int
 quotaoff(struct thread *td, struct mount *mp, int type)
 {
 	struct ufsmount *ump;
 	int error, in_transition;
 
 	error = priv_check(td, PRIV_UFS_QUOTAOFF);
 	if (error != 0)
 		return (error);
 
 	ump = VFSTOUFS(mp);
 
 	/* Test and set the transition flag under the mount lock. */
 	UFS_LOCK(ump);
 	in_transition =
 	    (ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) != 0;
 	if (!in_transition)
 		ump->um_qflags[type] |= QTF_CLOSING;
 	UFS_UNLOCK(ump);
 
 	if (in_transition)
 		return (EALREADY);
 	return (quotaoff_inchange(td, mp, type));
 }
 
 /*
  * Q_GETQUOTA - return current values in a dqblk structure.
  *
  * A caller asking about a uid other than its own, or about a group it
  * is not a member of, needs PRIV_VFS_GETQUOTA unless the
  * unprivileged_get_quota sysctl is set.  Quota types other than
  * USRQUOTA and GRPQUOTA yield EINVAL.  On success the dquot's dqblk is
  * copied into *dqb and 0 is returned; dqget() errors are passed through.
  */
 static int
 _getquota(struct thread *td, struct mount *mp, u_long id, int type,
     struct dqblk64 *dqb)
 {
 	struct dquot *dq;
 	int error, own_id;
 
 	switch (type) {
 	case USRQUOTA:
 		own_id = (td->td_ucred->cr_uid == id);
 		break;
 
 	case GRPQUOTA:
 		own_id = groupmember(id, td->td_ucred);
 		break;
 
 	default:
 		return (EINVAL);
 	}
 
 	if (!own_id && !unprivileged_get_quota) {
 		error = priv_check(td, PRIV_VFS_GETQUOTA);
 		if (error != 0)
 			return (error);
 	}
 
 	dq = NODQUOT;
 	error = dqget(NULLVP, id, VFSTOUFS(mp), type, &dq);
 	if (error != 0)
 		return (error);
 
 	*dqb = dq->dq_dqb;
 	dqrele(NULLVP, dq);
 	return (0);
 }
 
 /*
  * Q_SETQUOTA - assign an entire dqblk structure.
  *
  * Requires PRIV_VFS_SETQUOTA.  The limits from *dqb replace those of
  * the dquot for (id, type); the current usage counters are always
  * preserved, and for ids other than 0 the existing grace times are
  * preserved as well.  Returns 0, or the priv_check()/dqget() error.
  */
 static int
 _setquota(struct thread *td, struct mount *mp, u_long id, int type,
     struct dqblk64 *dqb)
 {
 	struct dqblk64 limits;
 	struct ufsmount *ump;
 	struct dquot *dq;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_SETQUOTA);
 	if (error != 0)
 		return (error);
 
 	limits = *dqb;
 	ump = VFSTOUFS(mp);
 
 	dq = NODQUOT;
 	error = dqget(NULLVP, id, ump, type, &dq);
 	if (error != 0)
 		return (error);
 
 	DQI_LOCK(dq);
 	DQI_WAIT(dq, PINOD+1, "setqta");
 
 	/* Copy all but the current values. */
 	limits.dqb_curblocks = dq->dq_curblocks;
 	limits.dqb_curinodes = dq->dq_curinodes;
 	if (dq->dq_id != 0) {
 		limits.dqb_btime = dq->dq_btime;
 		limits.dqb_itime = dq->dq_itime;
 	}
 
 	/*
 	 * Reset time limit if previously had no soft limit or were
 	 * under it, but now have a soft limit and are over it.
 	 */
 	if (limits.dqb_bsoftlimit != 0 &&
 	    dq->dq_curblocks >= limits.dqb_bsoftlimit &&
 	    (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit))
 		limits.dqb_btime = time_second + ump->um_btime[type];
 	if (limits.dqb_isoftlimit != 0 &&
 	    dq->dq_curinodes >= limits.dqb_isoftlimit &&
 	    (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit))
 		limits.dqb_itime = time_second + ump->um_itime[type];
 
 	dq->dq_dqb = limits;
 
 	/* Re-arm the one-shot failure messages when under the new limits. */
 	if (dq->dq_curblocks < dq->dq_bsoftlimit)
 		dq->dq_flags &= ~DQ_BLKS;
 	if (dq->dq_curinodes < dq->dq_isoftlimit)
 		dq->dq_flags &= ~DQ_INODS;
 
 	/* DQ_FAKE is set exactly when all four limits are zero. */
 	if (dq->dq_isoftlimit != 0 || dq->dq_bsoftlimit != 0 ||
 	    dq->dq_ihardlimit != 0 || dq->dq_bhardlimit != 0)
 		dq->dq_flags &= ~DQ_FAKE;
 	else
 		dq->dq_flags |= DQ_FAKE;
 
 	dq->dq_flags |= DQ_MOD;
 	DQI_UNLOCK(dq);
 	dqrele(NULLVP, dq);
 	return (0);
 }
 
 /*
  * Q_SETUSE - set current inode and block usage.
  *
  * Requires PRIV_UFS_SETUSE.  Only dqb_curblocks and dqb_curinodes of
  * *dqb are consumed; the limits of the dquot are left untouched.
  * Returns 0, or the priv_check()/dqget() error.
  */
 static int
 _setuse(struct thread *td, struct mount *mp, u_long id, int type,
     struct dqblk64 *dqb)
 {
 	struct dqblk64 newuse;
 	struct ufsmount *ump;
 	struct dquot *dq;
 	int error;
 
 	error = priv_check(td, PRIV_UFS_SETUSE);
 	if (error != 0)
 		return (error);
 
 	newuse = *dqb;
 	ump = VFSTOUFS(mp);
 
 	dq = NODQUOT;
 	error = dqget(NULLVP, id, ump, type, &dq);
 	if (error != 0)
 		return (error);
 
 	DQI_LOCK(dq);
 	DQI_WAIT(dq, PINOD+1, "setuse");
 
 	/*
 	 * Reset time limit if have a soft limit and were
 	 * previously under it, but are now over it.
 	 */
 	if (dq->dq_bsoftlimit != 0 &&
 	    dq->dq_curblocks < dq->dq_bsoftlimit &&
 	    newuse.dqb_curblocks >= dq->dq_bsoftlimit)
 		dq->dq_btime = time_second + ump->um_btime[type];
 	if (dq->dq_isoftlimit != 0 &&
 	    dq->dq_curinodes < dq->dq_isoftlimit &&
 	    newuse.dqb_curinodes >= dq->dq_isoftlimit)
 		dq->dq_itime = time_second + ump->um_itime[type];
 
 	dq->dq_curblocks = newuse.dqb_curblocks;
 	dq->dq_curinodes = newuse.dqb_curinodes;
 
 	/* Re-arm the one-shot failure messages when under the soft limits. */
 	if (dq->dq_curblocks < dq->dq_bsoftlimit)
 		dq->dq_flags &= ~DQ_BLKS;
 	if (dq->dq_curinodes < dq->dq_isoftlimit)
 		dq->dq_flags &= ~DQ_INODS;
 
 	dq->dq_flags |= DQ_MOD;
 	DQI_UNLOCK(dq);
 	dqrele(NULLVP, dq);
 	return (0);
 }
 
 /*
  * 32-bit flavour of Q_GETQUOTA: fetch the quota via _getquota(), convert
  * it to a struct dqblk32 and copy that out to the user address `addr'.
  */
 int
 getquota32(struct thread *td, struct mount *mp, u_long id, int type, void *addr)
 {
 	struct dqblk64 wide;
 	struct dqblk32 narrow;
 	int error;
 
 	error = _getquota(td, mp, id, type, &wide);
 	if (error != 0)
 		return (error);
 
 	dqb64_dqb32(&wide, &narrow);
 	return (copyout(&narrow, addr, sizeof(narrow)));
 }
 
 /*
  * 32-bit flavour of Q_SETQUOTA: copy a struct dqblk32 in from the user
  * address `addr', widen it to a struct dqblk64 and apply it with
  * _setquota().
  */
 int
 setquota32(struct thread *td, struct mount *mp, u_long id, int type, void *addr)
 {
 	struct dqblk64 wide;
 	struct dqblk32 narrow;
 	int error;
 
 	error = copyin(addr, &narrow, sizeof(narrow));
 	if (error != 0)
 		return (error);
 
 	dqb32_dqb64(&narrow, &wide);
 	return (_setquota(td, mp, id, type, &wide));
 }
 
 /*
  * 32-bit flavour of Q_SETUSE: copy a struct dqblk32 in from the user
  * address `addr', widen it to a struct dqblk64 and apply it with
  * _setuse().
  */
 int
 setuse32(struct thread *td, struct mount *mp, u_long id, int type, void *addr)
 {
 	struct dqblk64 wide;
 	struct dqblk32 narrow;
 	int error;
 
 	error = copyin(addr, &narrow, sizeof(narrow));
 	if (error != 0)
 		return (error);
 
 	dqb32_dqb64(&narrow, &wide);
 	return (_setuse(td, mp, id, type, &wide));
 }
 
 /*
  * 64-bit Q_GETQUOTA: fetch the quota via _getquota() and copy the
  * struct dqblk64 out to the user address `addr'.
  */
 int
 getquota(struct thread *td, struct mount *mp, u_long id, int type, void *addr)
 {
 	struct dqblk64 blk;
 	int error;
 
 	error = _getquota(td, mp, id, type, &blk);
 	if (error != 0)
 		return (error);
 	return (copyout(&blk, addr, sizeof(blk)));
 }
 
 /*
  * 64-bit Q_SETQUOTA: copy a struct dqblk64 in from the user address
  * `addr' and apply it with _setquota().
  */
 int
 setquota(struct thread *td, struct mount *mp, u_long id, int type, void *addr)
 {
 	struct dqblk64 blk;
 	int error;
 
 	error = copyin(addr, &blk, sizeof(blk));
 	if (error != 0)
 		return (error);
 	return (_setquota(td, mp, id, type, &blk));
 }
 
 /*
  * 64-bit Q_SETUSE: copy a struct dqblk64 in from the user address
  * `addr' and apply it with _setuse().
  */
 int
 setuse(struct thread *td, struct mount *mp, u_long id, int type, void *addr)
 {
 	struct dqblk64 blk;
 	int error;
 
 	error = copyin(addr, &blk, sizeof(blk));
 	if (error != 0)
 		return (error);
 	return (_setuse(td, mp, id, type, &blk));
 }
 
 /*
  * Q_GETQUOTASIZE - get bit-size of quota file fields
  *
  * Copies 64 or 32 (as an int) out to `sizep' depending on whether
  * QTF_64BIT is set for the quota type.  Returns EINVAL when no quota
  * vnode is installed for the type or the type is being closed.
  */
 int
 getquotasize(struct thread *td, struct mount *mp, u_long id, int type,
     void *sizep)
 {
 	struct ufsmount *ump;
 	int bitsize, usable;
 
 	ump = VFSTOUFS(mp);
 	bitsize = 32;
 
 	/* Sample the quota state under the mount lock. */
 	UFS_LOCK(ump);
 	usable = ump->um_quotas[type] != NULLVP &&
 	    (ump->um_qflags[type] & QTF_CLOSING) == 0;
 	if (usable && (ump->um_qflags[type] & QTF_64BIT) != 0)
 		bitsize = 64;
 	UFS_UNLOCK(ump);
 
 	if (!usable)
 		return (EINVAL);
 	return (copyout(&bitsize, sizep, sizeof(int)));
 }
 
 /*
  * Q_SYNC - sync quota files to disk.
  *
  * Returns 0 immediately when no quota vnode is installed for any type.
  * Otherwise every vnode of the mount is visited and dqsync() is called
  * for each dquot attached to its inode.  Errors from dqsync() are not
  * propagated; the function always returns 0.
  */
 int
 qsync(struct mount *mp)
 {
 	struct ufsmount *ump = VFSTOUFS(mp);
-	struct thread *td = curthread;		/* XXX */
 	struct vnode *vp, *mvp;
 	struct dquot *dq;
 	int i, error;
 
 	/*
 	 * Check if the mount point has any quotas.
 	 * If not, simply return.
 	 */
 	for (i = 0; i < MAXQUOTAS; i++)
 		if (ump->um_quotas[i] != NULLVP)
 			break;
 	if (i == MAXQUOTAS)
 		return (0);
 	/*
 	 * Search vnodes associated with this mount point,
 	 * synchronizing any modified dquot structures.
 	 */
 again:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		if (vp->v_type == VNON) {
 			VI_UNLOCK(vp);
 			continue;
 		}
-		error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td);
+		error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK);
 		if (error) {
 			/* ENOENT restarts the scan; other errors skip the vnode. */
 			if (error == ENOENT) {
 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 				goto again;
 			}
 			continue;
 		}
 		for (i = 0; i < MAXQUOTAS; i++) {
 			dq = VTOI(vp)->i_dquot[i];
 			if (dq != NODQUOT)
 				dqsync(vp, dq);
 		}
 		vput(vp);
 	}
 	return (0);
 }
 
 /*
  * Sync quota file for given vnode to disk.
  *
  * Does nothing when no quota vnode is installed for any type on the
  * vnode's mount; otherwise calls dqsync() for each dquot attached to
  * the vnode's inode.  Always returns 0.
  */
 int
 qsyncvp(struct vnode *vp)
 {
 	struct ufsmount *ump;
 	struct dquot *dq;
 	int type, enabled;
 
 	ump = VFSTOUFS(vp->v_mount);
 
 	/*
 	 * Check if the mount point has any quotas.
 	 * If not, simply return.
 	 */
 	enabled = 0;
 	for (type = 0; type < MAXQUOTAS && !enabled; type++)
 		enabled = (ump->um_quotas[type] != NULLVP);
 	if (!enabled)
 		return (0);
 
 	/*
 	 * Search quotas associated with this vnode
 	 * synchronizing any modified dquot structures.
 	 */
 	for (type = 0; type < MAXQUOTAS; type++) {
 		dq = VTOI(vp)->i_dquot[type];
 		if (dq == NODQUOT)
 			continue;
 		dqsync(vp, dq);
 	}
 	return (0);
 }
 
 /*
  * Code pertaining to management of the in-core dquot data structures.
  */
 /*
  * Map a (quota vnode, id) pair to its hash chain head in dqhashtbl.
  * The vnode pointer is shifted right by 8 bits, the id is added and the
  * sum is masked with dqhash.  Both macro arguments are parenthesized so
  * that expression arguments (e.g. a conditional) are evaluated as a unit.
  */
 #define DQHASH(dqvp, id) \
 	(&dqhashtbl[((((intptr_t)(dqvp)) >> 8) + (id)) & dqhash])
 static LIST_HEAD(dqhash, dquot) *dqhashtbl;
 static u_long dqhash;
 
 /*
  * Dquot free list.
  */
 #define	DQUOTINC	5	/* minimum free dquots desired */
 static TAILQ_HEAD(dqfreelist, dquot) dqfreelist;
 static long numdquot, desireddquot = DQUOTINC;
 
 /*
  * Lock to protect quota hash, dq free list and dq_cnt ref counters of
  * _all_ dqs.
  */
 struct mtx dqhlock;
 
 #define	DQH_LOCK()	mtx_lock(&dqhlock)
 #define	DQH_UNLOCK()	mtx_unlock(&dqhlock)
 
 static struct dquot *dqhashfind(struct dqhash *dqh, u_long id,
 	struct vnode *dqvp);
 
 /*
  * Initialize the quota system: set up the dqhlock mutex, allocate the
  * dquot hash table (requested size desiredvnodes, malloc type M_DQUOT;
  * hashinit() fills in dqhash, which DQHASH() uses as the index mask)
  * and initialize the empty dquot free list.
  */
 void
 dqinit(void)
 {
 
 	mtx_init(&dqhlock, "dqhlock", NULL, MTX_DEF);
 	dqhashtbl = hashinit(desiredvnodes, M_DQUOT, &dqhash);
 	TAILQ_INIT(&dqfreelist);
 }
 
 /*
  * Shut down the quota system: destroy the dquot hash table, free every
  * dquot left on the free list (destroying its per-dquot mutex first)
  * and finally destroy the dqhlock mutex.
  */
 void
 dquninit(void)
 {
 	struct dquot *dq;
 
 	hashdestroy(dqhashtbl, M_DQUOT, dqhash);
 
 	/* Drain the free list from the head until it is empty. */
 	for (;;) {
 		dq = TAILQ_FIRST(&dqfreelist);
 		if (dq == NULL)
 			break;
 		TAILQ_REMOVE(&dqfreelist, dq, dq_freelist);
 		mtx_destroy(&dq->dq_lock);
 		free(dq, M_DQUOT);
 	}
 
 	mtx_destroy(&dqhlock);
 }
 
 /*
  * Look up the dquot for `id' belonging to quota vnode `dqvp' on the
  * hash chain `dqh'.  Must be called with dqhlock held (asserted).
  * On a hit a reference is taken with DQREF() and the dquot is
  * returned; on a miss NODQUOT is returned.
  */
 static struct dquot *
 dqhashfind(struct dqhash *dqh, u_long id, struct vnode *dqvp)
 {
 	struct dquot *dq;
 
 	mtx_assert(&dqhlock, MA_OWNED);
 	LIST_FOREACH(dq, dqh, dq_hash) {
 		if (dq->dq_id == id &&
 		    dq->dq_ump->um_quotas[dq->dq_type] == dqvp) {
 			/*
 			 * Cache hit with no references.  Take
 			 * the structure off the free list.
 			 */
 			if (dq->dq_cnt == 0)
 				TAILQ_REMOVE(&dqfreelist, dq, dq_freelist);
 			DQREF(dq);
 			return (dq);
 		}
 	}
 	return (NODQUOT);
 }
 
 /*
  * Determine the quota file type.
  *
  * A 32-bit quota file is simply an array of struct dqblk32.
  *
  * A 64-bit quota file is a struct dqhdr64 followed by an array of struct
  * dqblk64.  The header contains various magic bits which allow us to be
  * reasonably confident that it is indeed a 64-bit quota file and not just
  * a 32-bit quota file that just happens to "look right".
  *
  * The vnode must be locked (asserted).  A header-sized read is issued at
  * offset 0.  A read error is returned as is.  If fewer bytes than a
  * full header were read, 0 is returned and QTF_64BIT is left untouched;
  * otherwise QTF_64BIT is set or cleared in ump->um_qflags[type]
  * according to the header contents and 0 is returned.
  */
 static int
 dqopen(struct vnode *vp, struct ufsmount *ump, int type)
 {
 	struct dqhdr64 dqh;
 	struct iovec aiov;
 	struct uio auio;
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "dqopen");
 
 	/* Describe a single kernel-space read of the header at offset 0. */
 	aiov.iov_base = &dqh;
 	aiov.iov_len = sizeof(dqh);
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = 0;
 	auio.uio_resid = sizeof(dqh);
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = (struct thread *)0;
 
 	error = VOP_READ(vp, &auio, 0, ump->um_cred[type]);
 	if (error != 0)
 		return (error);
 
 	/* Short read: the file cannot hold a 64-bit header, assume 32 bits. */
 	if (auio.uio_resid > 0)
 		return (0);
 
 	UFS_LOCK(ump);
 	/* XXX: what if the magic matches, but the sizes are wrong? */
 	if (strcmp(dqh.dqh_magic, Q_DQHDR64_MAGIC) != 0 ||
 	    be32toh(dqh.dqh_version) != Q_DQHDR64_VERSION ||
 	    be32toh(dqh.dqh_hdrlen) != (uint32_t)sizeof(struct dqhdr64) ||
 	    be32toh(dqh.dqh_reclen) != (uint32_t)sizeof(struct dqblk64))
 		ump->um_qflags[type] &= ~QTF_64BIT;
 	else
 		ump->um_qflags[type] |= QTF_64BIT;
 	UFS_UNLOCK(ump);
 
 	return (0);
 }
 
 /*
  * Obtain a dquot structure for the specified identifier and quota file
  * reading the information from the file if necessary.
  *
  * vp   - vnode on whose behalf the lookup is made; may be NULLVP.  When
  *        it is not NULLVP and *dqp is already set, nothing is done.
  * id   - identifier to look up; values that are negative when viewed as
  *        an int are rejected.
  * ump  - mount whose quota file of the given type is consulted.
  * type - quota type, used as index into ump->um_quotas[] and friends.
  * dqp  - out: the referenced dquot, or NODQUOT on failure.
  *
  * Returns 0 on success, EINVAL for a rejected id or when the quota file
  * is absent or closing, EIO when a cached entry turns out to have lost
  * its mount, EUSERS when no dquot can be allocated or recycled, or the
  * error reported by VOP_READ().
  */
 static int
 dqget(struct vnode *vp, u_long id, struct ufsmount *ump, int type,
     struct dquot **dqp)
 {
 	uint8_t buf[sizeof(struct dqblk64)];
 	off_t base, recsize;
 	struct dquot *dq, *dq1;
 	struct dqhash *dqh;
 	struct vnode *dqvp;
 	struct iovec aiov;
 	struct uio auio;
 	int dqvplocked, error;
 
 #ifdef DEBUG_VFS_LOCKS
 	if (vp != NULLVP)
 		ASSERT_VOP_ELOCKED(vp, "dqget");
 #endif
 
 	/* The caller already holds a dquot for this slot. */
 	if (vp != NULLVP && *dqp != NODQUOT) {
 		return (0);
 	}
 
 	/*
 	 * XXX: Disallow negative id values to prevent the
 	 * creation of 100GB+ quota data files.
 	 */
 	if ((int)id < 0)
 		return (EINVAL);
 
 	/*
 	 * Sample the quota vnode under the mount lock and take a
 	 * reference so that it stays around once the lock is dropped.
 	 */
 	UFS_LOCK(ump);
 	dqvp = ump->um_quotas[type];
 	if (dqvp == NULLVP || (ump->um_qflags[type] & QTF_CLOSING)) {
 		*dqp = NODQUOT;
 		UFS_UNLOCK(ump);
 		return (EINVAL);
 	}
 	vref(dqvp);
 	UFS_UNLOCK(ump);
 	error = 0;
 	dqvplocked = 0;
 
 	/*
 	 * Check the cache first.
 	 */
 	dqh = DQHASH(dqvp, id);
 	DQH_LOCK();
 	dq = dqhashfind(dqh, id, dqvp);
 	if (dq != NULL) {
 		DQH_UNLOCK();
 		/*
 		 * Common exit for every cache hit (also reached by goto
 		 * from the rechecks below).  The entry may still be
 		 * marked busy by whoever inserted it; wait for that, then
 		 * verify that it was not invalidated (dq_ump cleared)
 		 * in the meantime.
 		 */
 hfound:		DQI_LOCK(dq);
 		DQI_WAIT(dq, PINOD+1, "dqget");
 		DQI_UNLOCK(dq);
 		if (dq->dq_ump == NULL) {
 			dqrele(vp, dq);
 			dq = NODQUOT;
 			error = EIO;
 		}
 		*dqp = dq;
 		/* Undo the vref() and, if taken, the vn_lock() on dqvp. */
 		if (dqvplocked)
 			vput(dqvp);
 		else
 			vrele(dqvp);
 		return (error);
 	}
 
 	/*
 	 * Quota vnode lock is before DQ_LOCK. Acquire dqvp lock there
 	 * since new dq will appear on the hash chain DQ_LOCKed.
 	 */
 	if (vp != dqvp) {
 		DQH_UNLOCK();
 		vn_lock(dqvp, LK_SHARED | LK_RETRY);
 		dqvplocked = 1;
 		DQH_LOCK();
 		/*
 		 * Recheck the cache after sleep for quota vnode lock.
 		 */
 		dq = dqhashfind(dqh, id, dqvp);
 		if (dq != NULL) {
 			DQH_UNLOCK();
 			goto hfound;
 		}
 	}
 
 	/*
 	 * Not in cache, allocate a new one or take it from the
 	 * free list.
 	 */
 	if (TAILQ_FIRST(&dqfreelist) == NODQUOT &&
 	    numdquot < MAXQUOTAS * desiredvnodes)
 		desireddquot += DQUOTINC;
 	if (numdquot < desireddquot) {
 		/*
 		 * Account for the new entry before dropping the hash
 		 * lock; the allocation may sleep (M_WAITOK).
 		 */
 		numdquot++;
 		DQH_UNLOCK();
 		dq1 = malloc(sizeof *dq1, M_DQUOT, M_WAITOK | M_ZERO);
 		mtx_init(&dq1->dq_lock, "dqlock", NULL, MTX_DEF);
 		DQH_LOCK();
 		/*
 		 * Recheck the cache after sleep for memory.
 		 */
 		dq = dqhashfind(dqh, id, dqvp);
 		if (dq != NULL) {
 			/* Lost the race: discard the fresh allocation. */
 			numdquot--;
 			DQH_UNLOCK();
 			mtx_destroy(&dq1->dq_lock);
 			free(dq1, M_DQUOT);
 			goto hfound;
 		}
 		dq = dq1;
 	} else {
 		/* Recycle the entry at the head of the free list. */
 		if ((dq = TAILQ_FIRST(&dqfreelist)) == NULL) {
 			DQH_UNLOCK();
 			tablefull("dquot");
 			*dqp = NODQUOT;
 			if (dqvplocked)
 				vput(dqvp);
 			else
 				vrele(dqvp);
 			return (EUSERS);
 		}
 		if (dq->dq_cnt || (dq->dq_flags & DQ_MOD))
 			panic("dqget: free dquot isn't %p", dq);
 		TAILQ_REMOVE(&dqfreelist, dq, dq_freelist);
 		/* Entries with dq_ump == NULL are not on any hash chain. */
 		if (dq->dq_ump != NULL)
 			LIST_REMOVE(dq, dq_hash);
 	}
 
 	/*
 	 * Dq is put into hash already locked to prevent parallel
 	 * usage while it is being read from file.
 	 */
 	dq->dq_flags = DQ_LOCK;
 	dq->dq_id = id;
 	dq->dq_type = type;
 	dq->dq_ump = ump;
 	LIST_INSERT_HEAD(dqh, dq, dq_hash);
 	DQREF(dq);
 	DQH_UNLOCK();
 
 	/*
 	 * Read the requested quota record from the quota file, performing
 	 * any necessary conversions.
 	 */
 	if (ump->um_qflags[type] & QTF_64BIT) {
 		recsize = sizeof(struct dqblk64);
 		base = sizeof(struct dqhdr64);
 	} else {
 		recsize = sizeof(struct dqblk32);
 		base = 0;
 	}
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = buf;
 	aiov.iov_len = recsize;
 	auio.uio_resid = recsize;
 	auio.uio_offset = base + id * recsize;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = (struct thread *)0;
 
 	error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]);
 	/*
 	 * A successful read that transferred nothing (presumably a record
 	 * beyond the end of the file) yields an all-zero quota block.
 	 * NOTE(review): the conversion branch is also taken when the read
 	 * failed; the dquot is discarded by the error path below.
 	 */
 	if (auio.uio_resid == recsize && error == 0) {
 		bzero(&dq->dq_dqb, sizeof(dq->dq_dqb));
 	} else {
 		if (ump->um_qflags[type] & QTF_64BIT)
 			dqb64_dq((struct dqblk64 *)buf, dq);
 		else
 			dqb32_dq((struct dqblk32 *)buf, dq);
 	}
 	if (dqvplocked)
 		vput(dqvp);
 	else
 		vrele(dqvp);
 	/*
 	 * I/O error in reading quota file, release
 	 * quota structure and reflect problem to caller.
 	 */
 	if (error) {
 		DQH_LOCK();
 		dq->dq_ump = NULL;
 		LIST_REMOVE(dq, dq_hash);
 		DQH_UNLOCK();
 		DQI_LOCK(dq);
 		/* Wake anybody sleeping on the entry before clearing flags. */
 		if (dq->dq_flags & DQ_WANT)
 			wakeup(dq);
 		dq->dq_flags = 0;
 		DQI_UNLOCK(dq);
 		dqrele(vp, dq);
 		*dqp = NODQUOT;
 		return (error);
 	}
 	DQI_LOCK(dq);
 	/*
 	 * Check for no limit to enforce.
 	 * Initialize time values if necessary.
 	 */
 	if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 &&
 	    dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0)
 		dq->dq_flags |= DQ_FAKE;
 	if (dq->dq_id != 0) {
 		if (dq->dq_btime == 0) {
 			dq->dq_btime = time_second + ump->um_btime[type];
 			/* Already at or over the soft limit: mark dirty. */
 			if (dq->dq_bsoftlimit &&
 			    dq->dq_curblocks >= dq->dq_bsoftlimit)
 				dq->dq_flags |= DQ_MOD;
 		}
 		if (dq->dq_itime == 0) {
 			dq->dq_itime = time_second + ump->um_itime[type];
 			if (dq->dq_isoftlimit &&
 			    dq->dq_curinodes >= dq->dq_isoftlimit)
 				dq->dq_flags |= DQ_MOD;
 		}
 	}
 	DQI_WAKEUP(dq);
 	DQI_UNLOCK(dq);
 	*dqp = dq;
 	return (0);
 }
 
 #ifdef DIAGNOSTIC
 /*
  * Take one more reference on a dquot by bumping its reference count.
  * Only compiled for DIAGNOSTIC kernels.
  */
 static void
 dqref(struct dquot *dq)
 {
 
 	++dq->dq_cnt;
 }
 #endif
 
 /*
  * Drop one reference to a dquot.  NODQUOT is ignored.  When the last
  * reference goes away the dquot is synced via dqsync() and then parked
  * at the tail of the free list.
  */
 void
 dqrele(struct vnode *vp, struct dquot *dq)
 {
 
 	if (dq == NODQUOT)
 		return;
 
 	/* Fast path: other references remain, just decrement. */
 	DQH_LOCK();
 	KASSERT(dq->dq_cnt > 0, ("Lost dq %p reference 1", dq));
 	if (dq->dq_cnt > 1) {
 		dq->dq_cnt--;
 		DQH_UNLOCK();
 		return;
 	}
 	DQH_UNLOCK();
 
 	for (;;) {
 		(void) dqsync(vp, dq);
 
 		DQH_LOCK();
 		KASSERT(dq->dq_cnt > 0, ("Lost dq %p reference 2", dq));
 		if (--dq->dq_cnt > 0) {
 			/* Somebody re-referenced it while we were syncing. */
 			DQH_UNLOCK();
 			return;
 		}
 
 		/*
 		 * The dq may have been dirtied between the sync and this
 		 * point.  Testing DQ_MOD without the dq lock is fine here
 		 * because nobody else holds a reference.  If it is clean
 		 * we are done; otherwise retake the reference and sync
 		 * once more.
 		 */
 		if ((dq->dq_flags & DQ_MOD) == 0)
 			break;
 		dq->dq_cnt++;
 		DQH_UNLOCK();
 	}
 	TAILQ_INSERT_TAIL(&dqfreelist, dq, dq_freelist);
 	DQH_UNLOCK();
 }
 
 /*
  * Update the disk quota in the quota file.
  *
  * vp - vnode on whose behalf the sync is made; may be NULL.  When it is
  *      the quota vnode itself, that vnode is not locked again here.
  * dq - dquot to write back; must not be NODQUOT (panics otherwise).
  *
  * Returns 0 when there is nothing to do (the dquot has no mount, the
  * quota file is gone and vp is NULL, or DQ_MOD is not set), otherwise
  * the result of VOP_WRITE(), with a short write reported as EIO.
  */
 static int
 dqsync(struct vnode *vp, struct dquot *dq)
 {
 	uint8_t buf[sizeof(struct dqblk64)];
 	off_t base, recsize;
 	struct vnode *dqvp;
 	struct iovec aiov;
 	struct uio auio;
 	int error;
 	struct mount *mp;
 	struct ufsmount *ump;
 
 #ifdef DEBUG_VFS_LOCKS
 	if (vp != NULL)
 		ASSERT_VOP_ELOCKED(vp, "dqsync");
 #endif
 
 	mp = NULL;
 	error = 0;
 	if (dq == NODQUOT)
 		panic("dqsync: dquot");
 	/* A dquot without a mount has nowhere to be written. */
 	if ((ump = dq->dq_ump) == NULL)
 		return (0);
 	UFS_LOCK(ump);
 	if ((dqvp = ump->um_quotas[dq->dq_type]) == NULLVP) {
 		if (vp == NULL) {
 			UFS_UNLOCK(ump);
 			return (0);
 		} else
 			panic("dqsync: file");
 	}
 	/* Keep the quota vnode referenced once the mount lock is dropped. */
 	vref(dqvp);
 	UFS_UNLOCK(ump);
 
 	/* Cheap early check, before any vnode locking is done. */
 	DQI_LOCK(dq);
 	if ((dq->dq_flags & DQ_MOD) == 0) {
 		DQI_UNLOCK(dq);
 		vrele(dqvp);
 		return (0);
 	}
 	DQI_UNLOCK(dq);
 
 	(void) vn_start_secondary_write(dqvp, &mp, V_WAIT);
 	if (vp != dqvp)
 		vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY);
 
 	/*
 	 * Recheck DQ_MOD after the waits above, then mark the dquot
 	 * DQ_LOCK for the duration of the write.
 	 */
 	DQI_LOCK(dq);
 	DQI_WAIT(dq, PINOD+2, "dqsync");
 	if ((dq->dq_flags & DQ_MOD) == 0)
 		goto out;
 	dq->dq_flags |= DQ_LOCK;
 	DQI_UNLOCK(dq);
 
 	/*
 	 * Write the quota record to the quota file, performing any
 	 * necessary conversions.  See dqget() for additional details.
 	 */
 	if (ump->um_qflags[dq->dq_type] & QTF_64BIT) {
 		dq_dqb64(dq, (struct dqblk64 *)buf);
 		recsize = sizeof(struct dqblk64);
 		base = sizeof(struct dqhdr64);
 	} else {
 		dq_dqb32(dq, (struct dqblk32 *)buf);
 		recsize = sizeof(struct dqblk32);
 		base = 0;
 	}
 
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = buf;
 	aiov.iov_len = recsize;
 	auio.uio_resid = recsize;
 	auio.uio_offset = base + dq->dq_id * recsize;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = (struct thread *)0;
 	error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]);
 	/* A short write without an error code is reported as EIO. */
 	if (auio.uio_resid && error == 0)
 		error = EIO;
 
 	/* DQ_MOD is cleared here whether or not the write succeeded. */
 	DQI_LOCK(dq);
 	DQI_WAKEUP(dq);
 	dq->dq_flags &= ~DQ_MOD;
 out:
 	/* Reached with the dq lock held. */
 	DQI_UNLOCK(dq);
 	/* Drop the reference on dqvp, and the lock if it was taken above. */
 	if (vp != dqvp)
 		vput(dqvp);
 	else
 		vrele(dqvp);
 	vn_finished_secondary_write(mp);
 	return (error);
 }
 
 /*
  * Purge from the cache every dquot that belongs to the quota file vp.
  *
  * Unreferenced entries are unhooked from their hash chain and have
  * their mount pointer cleared; they stay on the free list and get
  * reused from there.  Returns EBUSY if any matching entry still has
  * references, 0 otherwise.
  */
 static int
 dqflush(struct vnode *vp)
 {
 	struct dquot *cur, *following;
 	struct dqhash *bucket;
 	int error;
 
 	error = 0;
 	DQH_LOCK();
 	for (bucket = &dqhashtbl[dqhash]; bucket >= dqhashtbl; bucket--) {
 		cur = LIST_FIRST(bucket);
 		while (cur != NULL) {
 			/* Fetch the successor first: cur may be unlinked. */
 			following = LIST_NEXT(cur, dq_hash);
 			if (cur->dq_ump->um_quotas[cur->dq_type] == vp) {
 				if (cur->dq_cnt != 0) {
 					error = EBUSY;
 				} else {
 					LIST_REMOVE(cur, dq_hash);
 					cur->dq_ump = NULL;
 				}
 			}
 			cur = following;
 		}
 	}
 	DQH_UNLOCK();
 	return (error);
 }
 
 /*
  * The following three functions are provided for the adjustment of
  * quotas by the soft updates code.
  */
 #ifdef SOFTUPDATES
 /*
  * Acquire a reference to the quota structures associated with a vnode.
  *
  * vp  - vnode whose inode's dquot pointers are examined.
  * qrp - out: array of MAXQUOTAS slots; every slot is first reset to
  *       NODQUOT and then filled with a referenced dquot where the
  *       inode has one.
  *
  * Returns the number of quota structures found (0 for system vnodes).
  */
 int
 quotaref(struct vnode *vp, struct dquot **qrp)
 {
 	struct inode *ip;
 	struct dquot *dq;
 	int i, found;
 
 	for (i = 0; i < MAXQUOTAS; i++)
 		qrp[i] = NODQUOT;
 	/*
 	 * Disk quotas must be turned off for system files.  Currently
 	 * snapshot and quota files.
 	 */
 	if ((vp->v_vflag & VV_SYSTEM) != 0)
 		return (0);
 	/*
 	 * Iterate through and copy active quotas.  The references are
 	 * taken with dqhlock held.
 	 */
 	found = 0;
 	ip = VTOI(vp);
 	mtx_lock(&dqhlock);
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if ((dq = ip->i_dquot[i]) == NODQUOT)
 			continue;
 		DQREF(dq);
 		qrp[i] = dq;
 		found++;
 	}
 	mtx_unlock(&dqhlock);
 	return (found);
 }
 
 /*
  * Release a set of quota structures obtained from a vnode.
  *
  * qrp - array of MAXQUOTAS slots as filled in by quotaref(); NODQUOT
  *       slots are skipped.  Each dquot is released with a NULL vnode.
  */
 void
 quotarele(struct dquot **qrp)
 {
 	struct dquot *dq;
 	int i;
 
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if ((dq = qrp[i]) == NODQUOT)
 			continue;
 		dqrele(NULL, dq);
 	}
 }
 
 /*
  * Adjust the number of blocks associated with a quota.
  * Positive numbers when adding blocks; negative numbers when freeing blocks.
  *
  * qrp      - array of MAXQUOTAS slots as filled in by quotaref();
  *            NODQUOT slots are skipped.
  * ump      - mount supplying the per-type grace period um_btime[].
  * blkcount - signed block delta; 0 is a no-op.
  *
  * The resulting usage is clamped at zero and every touched dquot is
  * marked DQ_MOD.
  */
 void
 quotaadj(struct dquot **qrp, struct ufsmount *ump, int64_t blkcount)
 {
 	struct dquot *dq;
 	ufs2_daddr_t ncurblocks;
 	int i;
 
 	if (blkcount == 0)
 		return;
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if ((dq = qrp[i]) == NODQUOT)
 			continue;
 		DQI_LOCK(dq);
 		DQI_WAIT(dq, PINOD+1, "adjqta");
 		ncurblocks = dq->dq_curblocks + blkcount;
 		if (ncurblocks >= 0)
 			dq->dq_curblocks = ncurblocks;
 		else
 			dq->dq_curblocks = 0;
 		/*
 		 * NOTE(review): the soft-limit test below runs after
 		 * dq_curblocks has already been updated, so blkcount is
 		 * effectively added a second time.  Left as is; confirm
 		 * the intent before changing it.
 		 */
 		if (blkcount < 0)
 			dq->dq_flags &= ~DQ_BLKS;
 		else if (dq->dq_curblocks + blkcount >= dq->dq_bsoftlimit &&
 			 dq->dq_curblocks < dq->dq_bsoftlimit)
 			dq->dq_btime = time_second + ump->um_btime[i];
 		dq->dq_flags |= DQ_MOD;
 		DQI_UNLOCK(dq);
 	}
 }
 #endif /* SOFTUPDATES */
 
 /*
  * 32-bit / 64-bit conversion functions.
  *
  * 32-bit quota records are stored in native byte order.  Attention must
  * be paid to overflow issues.
  *
  * 64-bit quota records are stored in network byte order.
  */
 
 /*
  * Saturate a value at UINT32_MAX; smaller values are converted to
  * uint32_t.  The argument is parenthesized so that expressions may be
  * passed safely, but it is evaluated twice: avoid side effects.
  */
 #define CLIP32(u64) ((u64) > UINT32_MAX ? UINT32_MAX : (uint32_t)(u64))
 
 /*
  * Fill in a dquot from a 32-bit, host-byte-order quota record.  Each
  * field is taken over by plain assignment.
  */
 static void
 dqb32_dq(const struct dqblk32 *dqb32, struct dquot *dq)
 {
 
 	/* Block limits, usage and grace time. */
 	dq->dq_bsoftlimit = dqb32->dqb_bsoftlimit;
 	dq->dq_bhardlimit = dqb32->dqb_bhardlimit;
 	dq->dq_curblocks = dqb32->dqb_curblocks;
 	dq->dq_btime = dqb32->dqb_btime;
 
 	/* Inode limits, usage and grace time. */
 	dq->dq_isoftlimit = dqb32->dqb_isoftlimit;
 	dq->dq_ihardlimit = dqb32->dqb_ihardlimit;
 	dq->dq_curinodes = dqb32->dqb_curinodes;
 	dq->dq_itime = dqb32->dqb_itime;
 }
 
 /*
  * Fill in a dquot from a 64-bit quota record stored in network
  * (big-endian) byte order; every field goes through be64toh().
  */
 static void
 dqb64_dq(const struct dqblk64 *dqb64, struct dquot *dq)
 {
 
 	/* Block limits, usage and grace time. */
 	dq->dq_bsoftlimit = be64toh(dqb64->dqb_bsoftlimit);
 	dq->dq_bhardlimit = be64toh(dqb64->dqb_bhardlimit);
 	dq->dq_curblocks = be64toh(dqb64->dqb_curblocks);
 	dq->dq_btime = be64toh(dqb64->dqb_btime);
 
 	/* Inode limits, usage and grace time. */
 	dq->dq_isoftlimit = be64toh(dqb64->dqb_isoftlimit);
 	dq->dq_ihardlimit = be64toh(dqb64->dqb_ihardlimit);
 	dq->dq_curinodes = be64toh(dqb64->dqb_curinodes);
 	dq->dq_itime = be64toh(dqb64->dqb_itime);
 }
 
 /*
  * Store a dquot into a 32-bit, host-byte-order quota record.  Every
  * value is passed through CLIP32() on its way out.
  */
 static void
 dq_dqb32(const struct dquot *dq, struct dqblk32 *dqb32)
 {
 
 	/* Block limits, usage and grace time. */
 	dqb32->dqb_bsoftlimit = CLIP32(dq->dq_bsoftlimit);
 	dqb32->dqb_bhardlimit = CLIP32(dq->dq_bhardlimit);
 	dqb32->dqb_curblocks = CLIP32(dq->dq_curblocks);
 	dqb32->dqb_btime = CLIP32(dq->dq_btime);
 
 	/* Inode limits, usage and grace time. */
 	dqb32->dqb_isoftlimit = CLIP32(dq->dq_isoftlimit);
 	dqb32->dqb_ihardlimit = CLIP32(dq->dq_ihardlimit);
 	dqb32->dqb_curinodes = CLIP32(dq->dq_curinodes);
 	dqb32->dqb_itime = CLIP32(dq->dq_itime);
 }
 
 /*
  * Store a dquot into a 64-bit quota record in network (big-endian)
  * byte order; every field goes through htobe64().
  */
 static void
 dq_dqb64(const struct dquot *dq, struct dqblk64 *dqb64)
 {
 
 	/* Block limits, usage and grace time. */
 	dqb64->dqb_bsoftlimit = htobe64(dq->dq_bsoftlimit);
 	dqb64->dqb_bhardlimit = htobe64(dq->dq_bhardlimit);
 	dqb64->dqb_curblocks = htobe64(dq->dq_curblocks);
 	dqb64->dqb_btime = htobe64(dq->dq_btime);
 
 	/* Inode limits, usage and grace time. */
 	dqb64->dqb_isoftlimit = htobe64(dq->dq_isoftlimit);
 	dqb64->dqb_ihardlimit = htobe64(dq->dq_ihardlimit);
 	dqb64->dqb_curinodes = htobe64(dq->dq_curinodes);
 	dqb64->dqb_itime = htobe64(dq->dq_itime);
 }
 
 /*
  * Narrow a 64-bit host-order quota record into a 32-bit host-order
  * one, passing every value through CLIP32().
  */
 static void
 dqb64_dqb32(const struct dqblk64 *dqb64, struct dqblk32 *dqb32)
 {
 
 	/* Block limits, usage and grace time. */
 	dqb32->dqb_bsoftlimit = CLIP32(dqb64->dqb_bsoftlimit);
 	dqb32->dqb_bhardlimit = CLIP32(dqb64->dqb_bhardlimit);
 	dqb32->dqb_curblocks = CLIP32(dqb64->dqb_curblocks);
 	dqb32->dqb_btime = CLIP32(dqb64->dqb_btime);
 
 	/* Inode limits, usage and grace time. */
 	dqb32->dqb_isoftlimit = CLIP32(dqb64->dqb_isoftlimit);
 	dqb32->dqb_ihardlimit = CLIP32(dqb64->dqb_ihardlimit);
 	dqb32->dqb_curinodes = CLIP32(dqb64->dqb_curinodes);
 	dqb32->dqb_itime = CLIP32(dqb64->dqb_itime);
 }
 
 /*
  * Copy a 32-bit host-order quota record into a 64-bit host-order
  * one, field by field, by plain assignment.
  */
 static void
 dqb32_dqb64(const struct dqblk32 *dqb32, struct dqblk64 *dqb64)
 {
 
 	/* Block limits, usage and grace time. */
 	dqb64->dqb_bsoftlimit = dqb32->dqb_bsoftlimit;
 	dqb64->dqb_bhardlimit = dqb32->dqb_bhardlimit;
 	dqb64->dqb_curblocks = dqb32->dqb_curblocks;
 	dqb64->dqb_btime = dqb32->dqb_btime;
 
 	/* Inode limits, usage and grace time. */
 	dqb64->dqb_isoftlimit = dqb32->dqb_isoftlimit;
 	dqb64->dqb_ihardlimit = dqb32->dqb_ihardlimit;
 	dqb64->dqb_curinodes = dqb32->dqb_curinodes;
 	dqb64->dqb_itime = dqb32->dqb_itime;
 }
Index: projects/clang1100-import/sys/vm/vm_fault.c
===================================================================
--- projects/clang1100-import/sys/vm/vm_fault.c	(revision 364278)
+++ projects/clang1100-import/sys/vm/vm_fault.c	(revision 364279)
@@ -1,2007 +1,2007 @@
 /*-
  * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  *
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_fault.c	8.4 (Berkeley) 1/12/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Page fault handling module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/refcount.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/signalvar.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_reserv.h>
 
 /*
  * Backward and forward counts handed to vm_fault_prefault() by the
  * fast fault path.
  */
 #define PFBAK 4
 #define PFFOR 4
 
 /*
  * Read sizes derived from the read-ahead constants: one for the
  * faulting page plus the initial / maximum read-ahead.
  */
 #define	VM_FAULT_READ_DEFAULT	(1 + VM_FAULT_READ_AHEAD_INIT)
 #define	VM_FAULT_READ_MAX	(1 + VM_FAULT_READ_AHEAD_MAX)
 
 /*
  * NOTE(review): presumably a byte threshold (1 MiB) for the
  * vm_fault_dontneed() logic; its use is outside this excerpt.
  */
 #define	VM_FAULT_DONTNEED_MIN	1048576
 
 /*
  * Working state of a single page fault, threaded through the helper
  * functions in this file.
  */
 struct faultstate {
 	/* Fault parameters. */
 	vm_offset_t	vaddr;
 	vm_page_t	*m_hold;	/* if non-NULL, receives the wired page */
 	vm_prot_t	fault_type;
 	vm_prot_t	prot;
 	int		fault_flags;	/* VM_FAULT_* flags */
 	int		oom;
 	boolean_t	wired;		/* enter mapping as PMAP_ENTER_WIRED */
 
 	/* Page reference for cow. */
 	vm_page_t m_cow;
 
 	/* Current object. */
 	vm_object_t	object;
 	vm_pindex_t	pindex;
 	vm_page_t	m;
 
 	/* Top-level map object. */
 	vm_object_t	first_object;
 	vm_pindex_t	first_pindex;
 	vm_page_t	first_m;
 
 	/* Map state. */
 	vm_map_t	map;
 	vm_map_entry_t	entry;
 	int		map_generation;	/* compared against map->timestamp */
 	bool		lookup_still_valid; /* vm_map_lookup_done() is owed */
 
 	/* Vnode if locked. */
 	struct vnode	*vp;
 };
 
 /* Forward declarations for helpers defined later in this file. */
 static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr,
 	    int ahead);
 static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
 	    int backward, int forward, bool obj_locked);
 
 /* Tunable vm.pfault_oom_attempts; defaults to 3. */
 static int vm_pfault_oom_attempts = 3;
 SYSCTL_INT(_vm, OID_AUTO, pfault_oom_attempts, CTLFLAG_RWTUN,
     &vm_pfault_oom_attempts, 0,
     "Number of page allocation attempts in page fault handler before it "
     "triggers OOM handling");
 
 /* Tunable vm.pfault_oom_wait; defaults to 10 (seconds). */
 static int vm_pfault_oom_wait = 10;
 SYSCTL_INT(_vm, OID_AUTO, pfault_oom_wait, CTLFLAG_RWTUN,
     &vm_pfault_oom_wait, 0,
     "Number of seconds to wait for free pages before retrying "
     "the page fault handler");
 
 /*
  * Give up the page referenced through *mp, if any: deactivate it, drop
  * its exclusive busy state and clear the caller's pointer.
  */
 static inline void
 fault_page_release(vm_page_t *mp)
 {
 	vm_page_t page;
 
 	page = *mp;
 	if (page == NULL)
 		return;
 
 	/*
 	 * The fault will probably be retried and the page busied again.
 	 * Deactivating keeps the page eligible for pageout while still
 	 * making a restarted fault cheap.
 	 */
 	vm_page_deactivate(page);
 	vm_page_xunbusy(page);
 	*mp = NULL;
 }
 
 /*
  * Dispose of the page referenced through *mp, if any, and clear the
  * caller's pointer.  A wired page is only unbusied; any other page is
  * freed.  The page's object must be write-locked.
  */
 static inline void
 fault_page_free(vm_page_t *mp)
 {
 	vm_page_t page;
 
 	page = *mp;
 	if (page == NULL)
 		return;
 
 	VM_OBJECT_ASSERT_WLOCKED(page->object);
 	if (vm_page_wired(page))
 		vm_page_xunbusy(page);
 	else
 		vm_page_free(page);
 	*mp = NULL;
 }
 
 /*
  * Finish the map lookup recorded in the fault state, if one is still
  * outstanding, and note that it is no longer valid.
  */
 static inline void
 unlock_map(struct faultstate *fs)
 {
 
 	if (!fs->lookup_still_valid)
 		return;
 	vm_map_lookup_done(fs->map, fs->entry);
 	fs->lookup_still_valid = false;
 }
 
 /*
  * vput() the vnode recorded in the fault state, if there is one, and
  * forget it.
  */
 static void
 unlock_vp(struct faultstate *fs)
 {
 
 	if (fs->vp == NULL)
 		return;
 	vput(fs->vp);
 	fs->vp = NULL;
 }
 
 static void
 fault_deallocate(struct faultstate *fs)
 {
 
 	fault_page_release(&fs->m_cow);
 	fault_page_release(&fs->m);
 	vm_object_pip_wakeup(fs->object);
 	if (fs->object != fs->first_object) {
 		VM_OBJECT_WLOCK(fs->first_object);
 		fault_page_free(&fs->first_m);
 		VM_OBJECT_WUNLOCK(fs->first_object);
 		vm_object_pip_wakeup(fs->first_object);
 	}
 	vm_object_deallocate(fs->first_object);
 	unlock_map(fs);
 	unlock_vp(fs);
 }
 
 /*
  * Drop the write lock on the current object, then release the rest of
  * the fault state via fault_deallocate().
  */
 static void
 unlock_and_deallocate(struct faultstate *fs)
 {
 
 	VM_OBJECT_WUNLOCK(fs->object);
 	fault_deallocate(fs);
 }
 
 /*
  * Do the dirty-tracking bookkeeping for page m after a fault.
  *
  * Nothing happens for unmanaged pages, or when the mapping is not
  * writable and VM_FAULT_DIRTY was not requested.  Otherwise the page's
  * object is marked writeable-dirty and, for a non-wiring write fault or
  * an explicit VM_FAULT_DIRTY, the page itself is dirtied.
  */
 static void
 vm_fault_dirty(struct faultstate *fs, vm_page_t m)
 {
 	bool force_dirty, writable, write_fault;
 
 	force_dirty = (fs->fault_flags & VM_FAULT_DIRTY) != 0;
 	writable = (fs->prot & VM_PROT_WRITE) != 0;
 	if (!writable && !force_dirty)
 		return;
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return;
 
 	VM_PAGE_OBJECT_BUSY_ASSERT(m);
 
 	write_fault = (fs->fault_type & VM_PROT_WRITE) != 0 &&
 	    (fs->fault_flags & VM_FAULT_WIRE) == 0;
 
 	vm_object_set_writeable_dirty(m->object);
 
 	/*
 	 * On a write fault the page is being written right now, so dirty
 	 * it explicitly; this saves pmap_is_modified() calls later, and
 	 * a dirty page may let the pager release swap backing it.
 	 */
 	if (!write_fault && !force_dirty)
 		return;
 	if (vm_page_set_dirty(m) != 0)
 		return;
 
 	/*
 	 * The page was clean until now.  PGA_NOSYNC is only ever set on
 	 * such a page, so that data written with the expectation of being
 	 * synced is not left unsynced.  When the entry does not ask for
 	 * NOSYNC the flag is cleared instead.  Applications sharing data
 	 * should use the same flags to avoid ping ponging.
 	 */
 	if ((fs->entry->eflags & MAP_ENTRY_NOSYNC) != 0)
 		vm_page_aflag_set(m, PGA_NOSYNC);
 	else
 		vm_page_aflag_clear(m, PGA_NOSYNC);
 }
 
 /*
  * Fast path: try to map a page that is already resident and fully
  * valid in the top-level object, without sleeping.
  *
  * Unlocks fs.first_object and fs.map on success.
  *
  * Returns KERN_FAILURE when the page is absent, not fully valid, or
  * busy while write access is requested; otherwise the result of
  * pmap_enter().
  */
 static int
 vm_fault_soft_fast(struct faultstate *fs)
 {
 	vm_page_t m, m_map;
 #if VM_NRESERVLEVEL > 0
 	vm_page_t m_super;
 	int flags;
 #endif
 	int psind, rv;
 	vm_offset_t vaddr;
 
 	MPASS(fs->vp == NULL);
 	vaddr = fs->vaddr;
 	vm_object_busy(fs->first_object);
 	m = vm_page_lookup(fs->first_object, fs->first_pindex);
 	/* A busy page can be mapped for read|execute access. */
 	if (m == NULL || ((fs->prot & VM_PROT_WRITE) != 0 &&
 	    vm_page_busied(m)) || !vm_page_all_valid(m)) {
 		rv = KERN_FAILURE;
 		goto out;
 	}
 	/* Default: map just the faulting base page. */
 	m_map = m;
 	psind = 0;
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Consider a superpage mapping: the page must belong to a
 	 * superpage that lies entirely inside the map entry, the virtual
 	 * and physical addresses must share their offset within the
 	 * superpage, the mapping must not be wired and the pmap must have
 	 * superpages enabled.
 	 */
 	if ((m->flags & PG_FICTITIOUS) == 0 &&
 	    (m_super = vm_reserv_to_superpage(m)) != NULL &&
 	    rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start &&
 	    roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end &&
 	    (vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) &
 	    (pagesizes[m_super->psind] - 1)) && !fs->wired &&
 	    pmap_ps_enabled(fs->map->pmap)) {
 		flags = PS_ALL_VALID;
 		if ((fs->prot & VM_PROT_WRITE) != 0) {
 			/*
 			 * Create a superpage mapping allowing write access
 			 * only if none of the constituent pages are busy and
 			 * all of them are already dirty (except possibly for
 			 * the page that was faulted on).
 			 */
 			flags |= PS_NONE_BUSY;
 			if ((fs->first_object->flags & OBJ_UNMANAGED) == 0)
 				flags |= PS_ALL_DIRTY;
 		}
 		if (vm_page_ps_test(m_super, flags, m)) {
 			m_map = m_super;
 			psind = m_super->psind;
 			vaddr = rounddown2(vaddr, pagesizes[psind]);
 			/* Preset the modified bit for dirty superpages. */
 			if ((flags & PS_ALL_DIRTY) != 0)
 				fs->fault_type |= VM_PROT_WRITE;
 		}
 	}
 #endif
 	/* PMAP_ENTER_NOSLEEP: fail rather than sleep in the pmap layer. */
 	rv = pmap_enter(fs->map->pmap, vaddr, m_map, fs->prot, fs->fault_type |
 	    PMAP_ENTER_NOSLEEP | (fs->wired ? PMAP_ENTER_WIRED : 0), psind);
 	if (rv != KERN_SUCCESS)
 		goto out;
 	/* Hand the faulting base page, wired, to the caller if requested. */
 	if (fs->m_hold != NULL) {
 		(*fs->m_hold) = m;
 		vm_page_wire(m);
 	}
 	/* Prefault neighbours only for unwired base-page mappings. */
 	if (psind == 0 && !fs->wired)
 		vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true);
 	VM_OBJECT_RUNLOCK(fs->first_object);
 	vm_fault_dirty(fs, m);
 	vm_map_lookup_done(fs->map, fs->entry);
 	curthread->td_ru.ru_minflt++;
 
 out:
 	/* Reached on success and on failure alike. */
 	vm_object_unbusy(fs->first_object);
 	return (rv);
 }
 
 /*
  * Re-acquire the map read lock while holding the write lock on the
  * top-level object, and mark the lookup as valid again.
  */
 static void
 vm_fault_restore_map_lock(struct faultstate *fs)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
 	MPASS(blockcount_read(&fs->first_object->paging_in_progress) > 0);
 
 	/*
 	 * Try the map lock without blocking first.  If that fails, drop
 	 * the object lock before the blocking acquisition and retake it
 	 * afterwards.
 	 */
 	if (!vm_map_trylock_read(fs->map)) {
 		VM_OBJECT_WUNLOCK(fs->first_object);
 		vm_map_lock_read(fs->map);
 		VM_OBJECT_WLOCK(fs->first_object);
 	}
 	fs->lookup_still_valid = true;
 }
 
 /*
  * Assert (via MPASS) that a page handed back by the pager's populate
  * method is usable.
  */
 static void
 vm_fault_populate_check_page(vm_page_t m)
 {
 
 	/*
 	 * Check each page to ensure that the pager is obeying the
 	 * interface: the page must be installed in the object, fully
 	 * valid, and exclusively busied.
 	 */
 	MPASS(m != NULL);
 	MPASS(vm_page_all_valid(m));
 	MPASS(vm_page_xbusied(m));
 }
 
 /*
  * Give back the populated pages with indices first..last (inclusive)
  * of the object: each one is checked, deactivated and unbusied.  The
  * object must be write-locked.
  */
 static void
 vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first,
     vm_pindex_t last)
 {
 	vm_page_t page;
 	vm_pindex_t idx;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	MPASS(first <= last);
 
 	idx = first;
 	page = vm_page_lookup(object, idx);
 	while (idx <= last) {
 		vm_fault_populate_check_page(page);
 		vm_page_deactivate(page);
 		vm_page_xunbusy(page);
 		idx++;
 		page = vm_page_next(page);
 	}
 }
 
 /*
  * Handle a fault by asking the pager of the top-level object to
  * populate a range of pages, then map whatever it returned.
  *
  * Entered with the top-level object write-locked and the map lookup
  * valid.
  *
  * Returns KERN_SUCCESS when the pages were mapped, KERN_RESTART when the
  * map changed while it was unlocked, KERN_NOT_RECEIVER when the pager
  * asks for normal fault handling (VM_PAGER_BAD), and KERN_FAILURE for
  * any other pager error.
  */
 static int
 vm_fault_populate(struct faultstate *fs)
 {
 	vm_offset_t vaddr;
 	vm_page_t m;
 	vm_pindex_t map_first, map_last, pager_first, pager_last, pidx;
 	int i, npages, psind, rv;
 
 	MPASS(fs->object == fs->first_object);
 	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
 	MPASS(blockcount_read(&fs->first_object->paging_in_progress) > 0);
 	MPASS(fs->first_object->backing_object == NULL);
 	MPASS(fs->lookup_still_valid);
 
 	/* Hint for the pager: the page index range covered by the entry. */
 	pager_first = OFF_TO_IDX(fs->entry->offset);
 	pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1;
 	unlock_map(fs);
 	unlock_vp(fs);
 
 	/*
 	 * Call the pager (driver) populate() method.
 	 *
 	 * There is no guarantee that the method will be called again
 	 * if the current fault is for read, and a future fault is
 	 * for write.  Report the entry's maximum allowed protection
 	 * to the driver.
 	 */
 	rv = vm_pager_populate(fs->first_object, fs->first_pindex,
 	    fs->fault_type, fs->entry->max_protection, &pager_first, &pager_last);
 
 	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
 	if (rv == VM_PAGER_BAD) {
 		/*
 		 * VM_PAGER_BAD is the backdoor for a pager to request
 		 * normal fault handling.
 		 */
 		vm_fault_restore_map_lock(fs);
 		if (fs->map->timestamp != fs->map_generation)
 			return (KERN_RESTART);
 		return (KERN_NOT_RECEIVER);
 	}
 	if (rv != VM_PAGER_OK)
 		return (KERN_FAILURE); /* AKA SIGSEGV */
 
 	/* Ensure that the driver is obeying the interface. */
 	MPASS(pager_first <= pager_last);
 	MPASS(fs->first_pindex <= pager_last);
 	MPASS(fs->first_pindex >= pager_first);
 	MPASS(pager_last < fs->first_object->size);
 
 	vm_fault_restore_map_lock(fs);
 	if (fs->map->timestamp != fs->map_generation) {
 		/* The map changed: give back everything and start over. */
 		vm_fault_populate_cleanup(fs->first_object, pager_first,
 		    pager_last);
 		return (KERN_RESTART);
 	}
 
 	/*
 	 * The map is unchanged after our last unlock.  Process the fault.
 	 *
 	 * The range [pager_first, pager_last] that is given to the
 	 * pager is only a hint.  The pager may populate any range
 	 * within the object that includes the requested page index.
 	 * In case the pager expanded the range, clip it to fit into
 	 * the map entry.
 	 */
 	map_first = OFF_TO_IDX(fs->entry->offset);
 	if (map_first > pager_first) {
 		vm_fault_populate_cleanup(fs->first_object, pager_first,
 		    map_first - 1);
 		pager_first = map_first;
 	}
 	map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1;
 	if (map_last < pager_last) {
 		vm_fault_populate_cleanup(fs->first_object, map_last + 1,
 		    pager_last);
 		pager_last = map_last;
 	}
 	/*
 	 * Walk the populated range, advancing by npages each iteration
 	 * (1 for a base page, more for a superpage).
 	 */
 	for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx);
 	    pidx <= pager_last;
 	    pidx += npages, m = vm_page_next(&m[npages - 1])) {
 		vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset;
 #if defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \
     __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)
 		/*
 		 * Use the page's superpage size only when the address is
 		 * aligned to it, the whole superpage fits in the range,
 		 * the pmap has superpages enabled and the mapping is not
 		 * wired.
 		 */
 		psind = m->psind;
 		if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 ||
 		    pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last ||
 		    !pmap_ps_enabled(fs->map->pmap) || fs->wired))
 			psind = 0;
 #else
 		psind = 0;
 #endif		
 		npages = atop(pagesizes[psind]);
 		for (i = 0; i < npages; i++) {
 			vm_fault_populate_check_page(&m[i]);
 			vm_fault_dirty(fs, &m[i]);
 		}
 		/* The object lock is dropped around pmap_enter(). */
 		VM_OBJECT_WUNLOCK(fs->first_object);
 		rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot, fs->fault_type |
 		    (fs->wired ? PMAP_ENTER_WIRED : 0), psind);
 #if defined(__amd64__)
 		/*
 		 * On amd64 a failed superpage mapping is retried as
 		 * individual base-page mappings.
 		 */
 		if (psind > 0 && rv == KERN_FAILURE) {
 			for (i = 0; i < npages; i++) {
 				rv = pmap_enter(fs->map->pmap, vaddr + ptoa(i),
 				    &m[i], fs->prot, fs->fault_type |
 				    (fs->wired ? PMAP_ENTER_WIRED : 0), 0);
 				MPASS(rv == KERN_SUCCESS);
 			}
 		}
 #else
 		MPASS(rv == KERN_SUCCESS);
 #endif
 		VM_OBJECT_WLOCK(fs->first_object);
 		for (i = 0; i < npages; i++) {
 			if ((fs->fault_flags & VM_FAULT_WIRE) != 0)
 				vm_page_wire(&m[i]);
 			else
 				vm_page_activate(&m[i]);
 			/* Hand the faulting page, wired, to the caller. */
 			if (fs->m_hold != NULL && m[i].pindex == fs->first_pindex) {
 				(*fs->m_hold) = &m[i];
 				vm_page_wire(&m[i]);
 			}
 			vm_page_xunbusy(&m[i]);
 		}
 	}
 	curthread->td_ru.ru_majflt++;
 	return (KERN_SUCCESS);
 }
 
 /*
  * Tunable machdep.prot_fault_translation: selects how a protection
  * fault is turned into a signal (0 = autodetect, 1 = always compat
  * SIGBUS; see vm_fault_trap()).
  */
 static int prot_fault_translation;
 SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN,
     &prot_fault_translation, 0,
     "Control signal to deliver on protection fault");
 
 /* compat definition to keep common code for signal translation */
 #define	UCODE_PAGEFLT	12
 #ifdef T_PAGEFLT
 /* Where T_PAGEFLT is defined it must agree with the value above. */
 _Static_assert(UCODE_PAGEFLT == T_PAGEFLT, "T_PAGEFLT");
 #endif
 
 /*
  *	vm_fault_trap:
  *
  *	Handle a page fault occurring at the given address,
  *	requiring the given permissions, in the map specified.
  *	If successful, the page is inserted into the
  *	associated physical map.
  *
  *	NOTE: the given address should be truncated to the
  *	proper page address.
  *
  *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
  *	a standard error specifying why the fault is fatal is returned.
  *
  *	The map in question must be referenced, and remains so.
  *	Caller may hold no locks.
  */
 int
 vm_fault_trap(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
     int fault_flags, int *signo, int *ucode)
 {
 	int rv;
 	bool segv;
 
 	/* A caller asking for a signal number must also accept a code. */
 	MPASS(signo == NULL || ucode != NULL);
 #ifdef KTRACE
 	if (map != kernel_map && KTRPOINT(curthread, KTR_FAULT))
 		ktrfault(vaddr, fault_type);
 #endif
 	rv = vm_fault(map, trunc_page(vaddr), fault_type, fault_flags, NULL);
 	KASSERT(rv == KERN_SUCCESS || rv == KERN_FAILURE ||
 	    rv == KERN_INVALID_ADDRESS ||
 	    rv == KERN_RESOURCE_SHORTAGE ||
 	    rv == KERN_PROTECTION_FAILURE ||
 	    rv == KERN_OUT_OF_BOUNDS,
 	    ("Unexpected Mach error %d from vm_fault()", rv));
 #ifdef KTRACE
 	if (map != kernel_map && KTRPOINT(curthread, KTR_FAULTEND))
 		ktrfaultend(rv);
 #endif
 
 	/* Nothing to translate on success or when no signal is wanted. */
 	if (rv == KERN_SUCCESS || signo == NULL)
 		return (rv);
 
 	switch (rv) {
 	case KERN_FAILURE:
 	case KERN_INVALID_ADDRESS:
 		*signo = SIGSEGV;
 		*ucode = SEGV_MAPERR;
 		break;
 	case KERN_RESOURCE_SHORTAGE:
 		*signo = SIGBUS;
 		*ucode = BUS_OOMERR;
 		break;
 	case KERN_OUT_OF_BOUNDS:
 		*signo = SIGBUS;
 		*ucode = BUS_OBJERR;
 		break;
 	case KERN_PROTECTION_FAILURE:
 		if (prot_fault_translation == 0) {
 			/*
 			 * Autodetect.  This check also covers the images
 			 * without the ABI-tag ELF note.
 			 */
 			segv = SV_CURPROC_ABI() == SV_ABI_FREEBSD &&
 			    curproc->p_osrel >= P_OSREL_SIGSEGV;
 		} else {
 			/* 1 forces compat mode, anything else SIGSEGV mode. */
 			segv = prot_fault_translation != 1;
 		}
 		if (segv) {
 			*signo = SIGSEGV;
 			*ucode = SEGV_ACCERR;
 		} else {
 			*signo = SIGBUS;
 			*ucode = UCODE_PAGEFLT;
 		}
 		break;
 	default:
 		KASSERT(0, ("Unexpected Mach error %d from vm_fault()",
 		    rv));
 		break;
 	}
 	return (rv);
 }
 
 /*
  * Make sure the vnode backing fs->object is locked and recorded in
  * fs->vp.  Objects whose type is not OBJT_VNODE need no vnode lock.
  *
  * Returns KERN_SUCCESS when no lock is needed, when the vnode is already
  * the saved one, or when the lock was obtained with LK_NOWAIT.  Otherwise
  * the fault state is released (via unlock_and_deallocate() when
  * "objlocked" is true, else fault_deallocate()), the vnode lock is taken
  * with LK_RETRY, and KERN_RESOURCE_SHORTAGE is returned; the callers in
  * this file restart the fault on that value, with fs->vp left set.
  */
 static int
 vm_fault_lock_vnode(struct faultstate *fs, bool objlocked)
 {
 	struct vnode *vp;
 	int error, locked;
 
 	if (fs->object->type != OBJT_VNODE)
 		return (KERN_SUCCESS);
 	vp = fs->object->handle;
 	if (vp == fs->vp) {
 		ASSERT_VOP_LOCKED(vp, "saved vnode is not locked");
 		return (KERN_SUCCESS);
 	}
 
 	/*
 	 * Perform an unlock in case the desired vnode changed while
 	 * the map was unlocked during a retry.
 	 */
 	unlock_vp(fs);
 
 	/* Request an exclusive lock only if one is already reported. */
 	locked = VOP_ISLOCKED(vp);
 	if (locked != LK_EXCLUSIVE)
 		locked = LK_SHARED;
 
 	/*
 	 * We must not sleep acquiring the vnode lock while we have
 	 * the page exclusive busied or the object's
 	 * paging-in-progress count incremented.  Otherwise, we could
 	 * deadlock.
 	 */
-	error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread);
+	error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT);
 	if (error == 0) {
 		fs->vp = vp;
 		return (KERN_SUCCESS);
 	}
 
 	/*
 	 * The non-blocking attempt failed.  Hold the vnode across the
 	 * release of the fault state, then retry with sleeping allowed.
 	 */
 	vhold(vp);
 	if (objlocked)
 		unlock_and_deallocate(fs);
 	else
 		fault_deallocate(fs);
-	error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread);
+	error = vget(vp, locked | LK_RETRY | LK_CANRECURSE);
 	vdrop(vp);
 	fs->vp = vp;
 	KASSERT(error == 0, ("vm_fault: vget failed %d", error));
 	return (KERN_RESOURCE_SHORTAGE);
 }
 
 /*
  * Work out how many pages of read-ahead to request from the pager for
  * this fault, applying drop-behind where the access looks sequential.
  *
  * The result is also stored back into the map entry's read_ahead field
  * when it differs from the previous value.
  */
 static int
 vm_fault_readahead(struct faultstate *fs)
 {
 	int nera, prev;
 	u_char behavior;
 
 	KASSERT(fs->lookup_still_valid, ("map unlocked"));
 	prev = fs->entry->read_ahead;
 	behavior = vm_map_entry_behavior(fs->entry);
 
 	/* Random access and non-sequential faults get no read-ahead. */
 	nera = 0;
 	if (behavior == MAP_ENTRY_BEHAV_RANDOM) {
 		/* Nothing to do. */
 	} else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
 		nera = VM_FAULT_READ_AHEAD_MAX;
 		if (fs->vaddr == fs->entry->next_read)
 			vm_fault_dontneed(fs, fs->vaddr, nera);
 	} else if (fs->vaddr == fs->entry->next_read) {
 		/*
 		 * Sequential fault: grow the window arithmetically.  The
 		 * request is "# of sequential faults x (read ahead min + 1)
 		 * + read ahead min", capped at the maximum.
 		 */
 		nera = VM_FAULT_READ_AHEAD_MIN;
 		if (prev > 0) {
 			nera += prev + 1;
 			if (nera > VM_FAULT_READ_AHEAD_MAX)
 				nera = VM_FAULT_READ_AHEAD_MAX;
 		}
 		if (prev == VM_FAULT_READ_AHEAD_MAX)
 			vm_fault_dontneed(fs, fs->vaddr, nera);
 	}
 
 	/*
 	 * A read lock on the map suffices to update the read ahead
 	 * count safely.
 	 */
 	if (nera != prev)
 		fs->entry->read_ahead = nera;
 
 	return (nera);
 }
 
 /*
  * Look up the map entry covering fs->vaddr and fill in fs->entry,
  * fs->first_object, fs->first_pindex, fs->prot and fs->wired.
  *
  * Returns KERN_SUCCESS with fs->lookup_still_valid set.  Returns
  * KERN_RESOURCE_SHORTAGE, with the map unlocked, when the entry was found
  * in transition; vm_fault() retries the fault on that value.  Any other
  * vm_map_lookup() failure is returned as is after unlock_vp().
  */
 static int
 vm_fault_lookup(struct faultstate *fs)
 {
 	int result;
 
 	KASSERT(!fs->lookup_still_valid,
 	   ("vm_fault_lookup: Map already locked."));
 	result = vm_map_lookup(&fs->map, fs->vaddr, fs->fault_type |
 	    VM_PROT_FAULT_LOOKUP, &fs->entry, &fs->first_object,
 	    &fs->first_pindex, &fs->prot, &fs->wired);
 	if (result != KERN_SUCCESS) {
 		unlock_vp(fs);
 		return (result);
 	}
 
 	/* Remembered so that vm_fault_relookup() can detect map changes. */
 	fs->map_generation = fs->map->timestamp;
 
 	if (fs->entry->eflags & MAP_ENTRY_NOFAULT) {
 		panic("%s: fault on nofault entry, addr: %#lx",
 		    __func__, (u_long)fs->vaddr);
 	}
 
 	/*
 	 * The entry is in transition on behalf of another thread.  Swap
 	 * the read lock for the map lock, re-check the entry, and if it
 	 * is still in transition sleep until woken before retrying.
 	 */
 	if (fs->entry->eflags & MAP_ENTRY_IN_TRANSITION &&
 	    fs->entry->wiring_thread != curthread) {
 		vm_map_unlock_read(fs->map);
 		vm_map_lock(fs->map);
 		if (vm_map_lookup_entry(fs->map, fs->vaddr, &fs->entry) &&
 		    (fs->entry->eflags & MAP_ENTRY_IN_TRANSITION)) {
 			unlock_vp(fs);
 			fs->entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 			vm_map_unlock_and_wait(fs->map, 0);
 		} else
 			vm_map_unlock(fs->map);
 		return (KERN_RESOURCE_SHORTAGE);
 	}
 
 	MPASS((fs->entry->eflags & MAP_ENTRY_GUARD) == 0);
 
 	/*
 	 * For a wired entry, fault with the entry's full protection,
 	 * keeping only the caller's VM_PROT_COPY request.
 	 */
 	if (fs->wired)
 		fs->fault_type = fs->prot | (fs->fault_type & VM_PROT_COPY);
 	else
 		KASSERT((fs->fault_flags & VM_FAULT_WIRE) == 0,
 		    ("!fs->wired && VM_FAULT_WIRE"));
 	fs->lookup_still_valid = true;
 
 	return (KERN_SUCCESS);
 }
 
 /*
  * Revalidate the map lookup after the map has been unlocked.
  *
  * Returns KERN_SUCCESS when the map timestamp is unchanged, or when a
  * fresh lookup yields the same first object and pindex and some
  * protection remains; fs->prot and fs->fault_type are narrowed to the
  * new protection in the latter case.  Returns KERN_RESTART when the map
  * read lock cannot be taken without blocking, when the lookup would have
  * blocked, or when the object, pindex or protection no longer permit
  * finishing this fault.  Other lookup errors are returned unchanged.
  */
 static int
 vm_fault_relookup(struct faultstate *fs)
 {
 	vm_object_t retry_object;
 	vm_pindex_t retry_pindex;
 	vm_prot_t retry_prot;
 	int result;
 
 	if (!vm_map_trylock_read(fs->map))
 		return (KERN_RESTART);
 
 	/* The map is read-locked from here on, including on error returns. */
 	fs->lookup_still_valid = true;
 	if (fs->map->timestamp == fs->map_generation)
 		return (KERN_SUCCESS);
 
 	result = vm_map_lookup_locked(&fs->map, fs->vaddr, fs->fault_type,
 	    &fs->entry, &retry_object, &retry_pindex, &retry_prot,
 	    &fs->wired);
 	if (result != KERN_SUCCESS) {
 		/*
 		 * If retry of map lookup would have blocked then
 		 * retry fault from start.
 		 */
 		if (result == KERN_FAILURE)
 			return (KERN_RESTART);
 		return (result);
 	}
 	if (retry_object != fs->first_object ||
 	    retry_pindex != fs->first_pindex)
 		return (KERN_RESTART);
 
 	/*
 	 * Check whether the protection has changed or the object has
 	 * been copied while we left the map unlocked. Changing from
 	 * read to write permission is OK - we leave the page
 	 * write-protected, and catch the write fault. Changing from
 	 * write to read permission means that we can't mark the page
 	 * write-enabled after all.
 	 */
 	fs->prot &= retry_prot;
 	fs->fault_type &= retry_prot;
 	if (fs->prot == 0)
 		return (KERN_RESTART);
 
 	/* Reassert because wired may have changed. */
 	KASSERT(fs->wired || (fs->fault_flags & VM_FAULT_WIRE) == 0,
 	    ("!wired && VM_FAULT_WIRE"));
 
 	return (KERN_SUCCESS);
 }
 
 /*
  * Give fs->first_object its own copy of the page that was found in a
  * backing object (fs->object != fs->first_object on entry).
  *
  * The page is either moved from the backing object into the first object
  * or copied into fs->first_m.  On return fs->object, fs->pindex and fs->m
  * all describe the page in the first object, and in the copy case the
  * source page is saved in fs->m_cow.
  */
 static void
 vm_fault_cow(struct faultstate *fs)
 {
 	bool is_first_object_locked;
 
 	/*
 	 * This allows pages to be virtually copied from a backing_object
 	 * into the first_object, where the backing object has no other
 	 * refs to it, and cannot gain any more refs.  Instead of a bcopy,
 	 * we just move the page from the backing object to the first
 	 * object.  Note that we must mark the page dirty in the first
 	 * object so that it will go out to swap when needed.
 	 */
 	is_first_object_locked = false;
 	if (
 	    /*
 	     * Only one shadow object and no other refs.
 	     */
 	    fs->object->shadow_count == 1 && fs->object->ref_count == 1 &&
 	    /*
 	     * No other ways to look the object up
 	     */
 	    fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0 &&
 	    /*
 	     * We don't chase down the shadow chain and we can acquire locks.
 	     */
 	    (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object)) &&
 	    fs->object == fs->first_object->backing_object &&
 	    VM_OBJECT_TRYWLOCK(fs->object)) {
 
 		/*
 		 * Remove but keep xbusy for replace.  fs->m is moved into
 		 * fs->first_object and left busy while fs->first_m is
 		 * conditionally freed.
 		 */
 		vm_page_remove_xbusy(fs->m);
 		vm_page_replace(fs->m, fs->first_object, fs->first_pindex,
 		    fs->first_m);
 		vm_page_dirty(fs->m);
 #if VM_NRESERVLEVEL > 0
 		/*
 		 * Rename the reservation.
 		 */
 		vm_reserv_rename(fs->m, fs->first_object, fs->object,
 		    OFF_TO_IDX(fs->first_object->backing_object_offset));
 #endif
 		VM_OBJECT_WUNLOCK(fs->object);
 		VM_OBJECT_WUNLOCK(fs->first_object);
 		fs->first_m = fs->m;
 		fs->m = NULL;
 		VM_CNT_INC(v_cow_optim);
 	} else {
 		/*
 		 * The first object's lock may have been taken before a
 		 * later test in the condition above failed.
 		 */
 		if (is_first_object_locked)
 			VM_OBJECT_WUNLOCK(fs->first_object);
 		/*
 		 * Oh, well, let's copy it.
 		 */
 		pmap_copy_page(fs->m, fs->first_m);
 		vm_page_valid(fs->first_m);
 		if (fs->wired && (fs->fault_flags & VM_FAULT_WIRE) == 0) {
 			vm_page_wire(fs->first_m);
 			vm_page_unwire(fs->m, PQ_INACTIVE);
 		}
 		/*
 		 * Save the cow page to be released after
 		 * pmap_enter is complete.
 		 */
 		fs->m_cow = fs->m;
 		fs->m = NULL;
 	}
 	/*
 	 * fs->object != fs->first_object due to above
 	 * conditional
 	 */
 	vm_object_pip_wakeup(fs->object);
 
 	/*
 	 * Only use the new page below...
 	 */
 	fs->object = fs->first_object;
 	fs->pindex = fs->first_pindex;
 	fs->m = fs->first_m;
 	VM_CNT_INC(v_cow_faults);
 	curthread->td_cow++;
 }
 
 /*
  * Step from fs->object to its backing object.
  *
  * Returns false, leaving fs->object unchanged and still locked, when
  * there is no backing object.  Otherwise returns true with fs->object
  * set to the backing object, which is write-locked and has its
  * paging-in-progress count raised, and with fs->pindex adjusted by the
  * previous object's backing_object_offset.
  */
 static bool
 vm_fault_next(struct faultstate *fs)
 {
 	vm_object_t next_object;
 
 	/*
 	 * The requested page does not exist at this object/
 	 * offset.  Remove the invalid page from the object,
 	 * waking up anyone waiting for it, and continue on to
 	 * the next object.  However, if this is the top-level
 	 * object, we must leave the busy page in place to
 	 * prevent another process from rushing past us, and
 	 * inserting the page in that object at the same time
 	 * that we are.
 	 */
 	if (fs->object == fs->first_object) {
 		fs->first_m = fs->m;
 		fs->m = NULL;
 	} else
 		fault_page_free(&fs->m);
 
 	/*
 	 * Move on to the next object.  Lock the next object before
 	 * unlocking the current one.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(fs->object);
 	next_object = fs->object->backing_object;
 	if (next_object == NULL)
 		return (false);
 	MPASS(fs->first_m != NULL);
 	KASSERT(fs->object != next_object, ("object loop %p", next_object));
 	VM_OBJECT_WLOCK(next_object);
 	vm_object_pip_add(next_object, 1);
 	/* The first object's paging-in-progress count is kept. */
 	if (fs->object != fs->first_object)
 		vm_object_pip_wakeup(fs->object);
 	fs->pindex += OFF_TO_IDX(fs->object->backing_object_offset);
 	VM_OBJECT_WUNLOCK(fs->object);
 	fs->object = next_object;
 
 	return (true);
 }
 
 static void
 vm_fault_zerofill(struct faultstate *fs)
 {
 
 	/*
 	 * If there's no object left, fill the page in the top
 	 * object with zeros.
 	 */
 	if (fs->object != fs->first_object) {
 		vm_object_pip_wakeup(fs->object);
 		fs->object = fs->first_object;
 		fs->pindex = fs->first_pindex;
 	}
 	MPASS(fs->first_m != NULL);
 	MPASS(fs->m == NULL);
 	fs->m = fs->first_m;
 	fs->first_m = NULL;
 
 	/*
 	 * Zero the page if necessary and mark it valid.
 	 */
 	if ((fs->m->flags & PG_ZERO) == 0) {
 		pmap_zero_page(fs->m);
 	} else {
 		VM_CNT_INC(v_ozfod);
 	}
 	VM_CNT_INC(v_zfod);
 	vm_page_valid(fs->m);
 }
 
 /*
  * Allocate a page directly or via the object populate method.
  *
  * Returns KERN_NOT_RECEIVER when a page was allocated into fs->m and the
  * caller should go on to fill it.  Returns KERN_RESOURCE_SHORTAGE, with
  * the fault state already released, when the vnode lock had to be slept
  * for or no page could be allocated.  Returns KERN_OUT_OF_BOUNDS when
  * fs->pindex lies beyond the object's size.  KERN_SUCCESS, KERN_FAILURE
  * and KERN_RESTART are passed through from vm_fault_populate().
  */
 static int
 vm_fault_allocate(struct faultstate *fs)
 {
 	struct domainset *dset;
 	int alloc_req;
 	int rv;
 
 
 	if ((fs->object->flags & OBJ_SIZEVNLOCK) != 0) {
 		rv = vm_fault_lock_vnode(fs, true);
 		MPASS(rv == KERN_SUCCESS || rv == KERN_RESOURCE_SHORTAGE);
 		if (rv == KERN_RESOURCE_SHORTAGE)
 			return (rv);
 	}
 
 	if (fs->pindex >= fs->object->size)
 		return (KERN_OUT_OF_BOUNDS);
 
 	/*
 	 * Populate is only attempted on the top-level object, and only
 	 * while nothing shadows it.
 	 */
 	if (fs->object == fs->first_object &&
 	    (fs->first_object->flags & OBJ_POPULATE) != 0 &&
 	    fs->first_object->shadow_count == 0) {
 		rv = vm_fault_populate(fs);
 		switch (rv) {
 		case KERN_SUCCESS:
 		case KERN_FAILURE:
 		case KERN_RESTART:
 			return (rv);
 		case KERN_NOT_RECEIVER:
 			/*
 			 * Pager's populate() method
 			 * returned VM_PAGER_BAD.
 			 */
 			break;
 		default:
 			panic("inconsistent return codes");
 		}
 	}
 
 	/*
 	 * Allocate a new page for this object/offset pair.
 	 *
 	 * Unlocked read of the p_flag is harmless. At worst, the P_KILLED
 	 * might be not observed there, and allocation can fail, causing
 	 * restart and new reading of the p_flag.
 	 */
 	dset = fs->object->domain.dr_policy;
 	if (dset == NULL)
 		dset = curthread->td_domain.dr_policy;
 	if (!vm_page_count_severe_set(&dset->ds_mask) || P_KILLED(curproc)) {
 #if VM_NRESERVLEVEL > 0
 		vm_object_color(fs->object, atop(fs->vaddr) - fs->pindex);
 #endif
 		alloc_req = P_KILLED(curproc) ?
 		    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL;
 		/* Ask for a zeroed page when nothing can supply contents. */
 		if (fs->object->type != OBJT_VNODE &&
 		    fs->object->backing_object == NULL)
 			alloc_req |= VM_ALLOC_ZERO;
 		fs->m = vm_page_alloc(fs->object, fs->pindex, alloc_req);
 	}
 	if (fs->m == NULL) {
 		unlock_and_deallocate(fs);
 		/*
 		 * Wait for free pages until the configured number of
 		 * attempts is used up (a negative limit means wait
 		 * forever), then invoke the OOM handler.
 		 */
 		if (vm_pfault_oom_attempts < 0 ||
 		    fs->oom < vm_pfault_oom_attempts) {
 			fs->oom++;
 			vm_waitpfault(dset, vm_pfault_oom_wait * hz);
 		} else 	{
 			if (bootverbose)
 				printf(
 		"proc %d (%s) failed to alloc page on fault, starting OOM\n",
 				    curproc->p_pid, curproc->p_comm);
 			vm_pageout_oom(VM_OOM_MEM_PF);
 			fs->oom = 0;
 		}
 		return (KERN_RESOURCE_SHORTAGE);
 	}
 	fs->oom = 0;
 
 	return (KERN_NOT_RECEIVER);
 }
 
 /*
  * Call the pager to retrieve the page if there is a chance
  * that the pager has it, and potentially retrieve additional
  * pages at the same time.
  *
  * "nera" is the read-ahead count computed by vm_fault_readahead(), or -1
  * if none was computed.  The requested behind/ahead counts are passed to
  * the pager through "behindp" and "aheadp".
  *
  * Returns KERN_SUCCESS when the pager supplied the page,
  * KERN_RESOURCE_SHORTAGE when the vnode lock could not be taken without
  * sleeping, KERN_OUT_OF_BOUNDS when the pager reported VM_PAGER_ERROR or
  * VM_PAGER_BAD, and KERN_NOT_RECEIVER for any other pager result.
  */
 static int
 vm_fault_getpages(struct faultstate *fs, int nera, int *behindp, int *aheadp)
 {
 	vm_offset_t e_end, e_start;
 	int ahead, behind, cluster_offset, rv;
 	u_char behavior;
 
 	/*
 	 * Prepare for unlocking the map.  Save the map
 	 * entry's start and end addresses, which are used to
 	 * optimize the size of the pager operation below.
 	 * Even if the map entry's addresses change after
 	 * unlocking the map, using the saved addresses is
 	 * safe.
 	 */
 	e_start = fs->entry->start;
 	e_end = fs->entry->end;
 	behavior = vm_map_entry_behavior(fs->entry);
 
 	/*
 	 * Release the map lock before locking the vnode or
 	 * sleeping in the pager.  (If the current object has
 	 * a shadow, then an earlier iteration of this loop
 	 * may have already unlocked the map.)
 	 */
 	unlock_map(fs);
 
 	rv = vm_fault_lock_vnode(fs, false);
 	MPASS(rv == KERN_SUCCESS || rv == KERN_RESOURCE_SHORTAGE);
 	if (rv == KERN_RESOURCE_SHORTAGE)
 		return (rv);
 	KASSERT(fs->vp == NULL || !fs->map->system_map,
 	    ("vm_fault: vnode-backed object mapped by system map"));
 
 	/*
 	 * Page in the requested page and hint the pager,
 	 * that it may bring up surrounding pages.
 	 */
 	if (nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM ||
 	    P_KILLED(curproc)) {
 		behind = 0;
 		ahead = 0;
 	} else {
 		/* Is this a sequential fault? */
 		if (nera > 0) {
 			behind = 0;
 			ahead = nera;
 		} else {
 			/*
 			 * Request a cluster of pages that is
 			 * aligned to a VM_FAULT_READ_DEFAULT
 			 * page offset boundary within the
 			 * object.  Alignment to a page offset
 			 * boundary is more likely to coincide
 			 * with the underlying file system
 			 * block than alignment to a virtual
 			 * address boundary.
 			 */
 			cluster_offset = fs->pindex % VM_FAULT_READ_DEFAULT;
 			behind = ulmin(cluster_offset,
 			    atop(fs->vaddr - e_start));
 			ahead = VM_FAULT_READ_DEFAULT - 1 - cluster_offset;
 		}
 		/* Never read ahead past the end of the saved map entry. */
 		ahead = ulmin(ahead, atop(e_end - fs->vaddr) - 1);
 	}
 	*behindp = behind;
 	*aheadp = ahead;
 	rv = vm_pager_get_pages(fs->object, &fs->m, 1, behindp, aheadp);
 	if (rv == VM_PAGER_OK)
 		return (KERN_SUCCESS);
 	if (rv == VM_PAGER_ERROR)
 		printf("vm_fault: pager read error, pid %d (%s)\n",
 		    curproc->p_pid, curproc->p_comm);
 	/*
 	 * If an I/O error occurred or the requested page was
 	 * outside the range of the pager, clean up and return
 	 * an error.
 	 */
 	if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD)
 		return (KERN_OUT_OF_BOUNDS);
 	return (KERN_NOT_RECEIVER);
 }
 
 /*
  * Wait/Retry if the page is busy.  We have to do this if the page is
  * either exclusive or shared busy because the vm_pager may be using
  * read busy for pageouts (and even pageins if it is the vnode pager),
  * and we could end up trying to pagein and pageout the same page
  * simultaneously.
  *
  * We can theoretically allow the busy case on a read fault if the page
  * is marked valid, but since such pages are typically already pmap'd,
  * putting that special case in might be more effort than it is worth.
  * We cannot under any circumstances mess around with a shared busied
  * page except, perhaps, to pmap it.
  *
  * The fault state is torn down here (paging-in-progress counts, map
  * lock, and the reference on the first object); vm_fault() restarts the
  * fault after this function returns.
  */
 static void
 vm_fault_busy_sleep(struct faultstate *fs)
 {
 	/*
 	 * Reference the page before unlocking and
 	 * sleeping so that the page daemon is less
 	 * likely to reclaim it.
 	 */
 	vm_page_aflag_set(fs->m, PGA_REFERENCED);
 	if (fs->object != fs->first_object) {
 		fault_page_release(&fs->first_m);
 		vm_object_pip_wakeup(fs->first_object);
 	}
 	vm_object_pip_wakeup(fs->object);
 	unlock_map(fs);
 	/*
 	 * Sleep only if the page is still the one at this object/pindex;
 	 * otherwise just drop the object lock.
 	 */
 	if (fs->m == vm_page_lookup(fs->object, fs->pindex))
 		vm_page_busy_sleep(fs->m, "vmpfw", false);
 	else
 		VM_OBJECT_WUNLOCK(fs->object);
 	VM_CNT_INC(v_intrans);
 	vm_object_deallocate(fs->first_object);
 }
 
 /*
  * Resolve a page fault at "vaddr" in "map" for the access "fault_type".
  *
  * The shadow chain is searched from the map entry's object for the page.
  * If needed, the page is allocated, read in by a pager, zero-filled or
  * copied for copy-on-write, and then entered into the map's pmap.  When
  * "m_hold" is not NULL the resulting page is also wired and returned
  * through it.
  *
  * Returns KERN_SUCCESS on success.  Returns KERN_PROTECTION_FAILURE
  * immediately when the current thread has TDP_NOFAULTING set, and
  * otherwise the error reported by the lookup, allocation or pager
  * helpers.  Helper results that ask for a retry jump back to RetryFault.
  */
 int
 vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
     int fault_flags, vm_page_t *m_hold)
 {
 	struct faultstate fs;
 	int ahead, behind, faultcount;
 	int nera, result, rv;
 	bool dead, hardfault;
 
 	VM_CNT_INC(v_vm_faults);
 
 	if ((curthread->td_pflags & TDP_NOFAULTING) != 0)
 		return (KERN_PROTECTION_FAILURE);
 
 	fs.vp = NULL;
 	fs.vaddr = vaddr;
 	fs.m_hold = m_hold;
 	fs.fault_flags = fault_flags;
 	fs.map = map;
 	fs.lookup_still_valid = false;
 	fs.oom = 0;
 	/*
 	 * faultcount == 0 means no page count has been decided yet;
 	 * nera == -1 means read-ahead has not been computed yet.
 	 */
 	faultcount = 0;
 	nera = -1;
 	hardfault = false;
 
 RetryFault:
 	/* vm_fault_lookup() may rewrite fs.fault_type, so reset it. */
 	fs.fault_type = fault_type;
 
 	/*
 	 * Find the backing store object and offset into it to begin the
 	 * search.
 	 */
 	result = vm_fault_lookup(&fs);
 	if (result != KERN_SUCCESS) {
 		if (result == KERN_RESOURCE_SHORTAGE)
 			goto RetryFault;
 		return (result);
 	}
 
 	/*
 	 * Try to avoid lock contention on the top-level object through
 	 * special-case handling of some types of page faults, specifically,
 	 * those that are mapping an existing page from the top-level object.
 	 * Under this condition, a read lock on the object suffices, allowing
 	 * multiple page faults of a similar type to run in parallel.
 	 */
 	if (fs.vp == NULL /* avoid locked vnode leak */ &&
 	    (fs.fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0) {
 		VM_OBJECT_RLOCK(fs.first_object);
 		rv = vm_fault_soft_fast(&fs);
 		if (rv == KERN_SUCCESS)
 			return (rv);
 		if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) {
 			VM_OBJECT_RUNLOCK(fs.first_object);
 			VM_OBJECT_WLOCK(fs.first_object);
 		}
 	} else {
 		VM_OBJECT_WLOCK(fs.first_object);
 	}
 
 	/*
 	 * Make a reference to this object to prevent its disposal while we
 	 * are messing with it.  Once we have the reference, the map is free
 	 * to be diddled.  Since objects reference their shadows (and copies),
 	 * they will stay around as well.
 	 *
 	 * Bump the paging-in-progress count to prevent size changes (e.g.
 	 * truncation operations) during I/O.
 	 */
 	vm_object_reference_locked(fs.first_object);
 	vm_object_pip_add(fs.first_object, 1);
 
 	fs.m_cow = fs.m = fs.first_m = NULL;
 
 	/*
 	 * Search for the page at object/offset.
 	 */
 	fs.object = fs.first_object;
 	fs.pindex = fs.first_pindex;
 	while (TRUE) {
 		KASSERT(fs.m == NULL,
 		    ("page still set %p at loop start", fs.m));
 		/*
 		 * If the object is marked for imminent termination,
 		 * we retry here, since the collapse pass has raced
 		 * with us.  Otherwise, if we see terminally dead
 		 * object, return fail.
 		 */
 		if ((fs.object->flags & OBJ_DEAD) != 0) {
 			dead = fs.object->type == OBJT_DEAD;
 			unlock_and_deallocate(&fs);
 			if (dead)
 				return (KERN_PROTECTION_FAILURE);
 			pause("vmf_de", 1);
 			goto RetryFault;
 		}
 
 		/*
 		 * See if page is resident
 		 */
 		fs.m = vm_page_lookup(fs.object, fs.pindex);
 		if (fs.m != NULL) {
 			if (vm_page_tryxbusy(fs.m) == 0) {
 				vm_fault_busy_sleep(&fs);
 				goto RetryFault;
 			}
 
 			/*
 			 * The page is marked busy for other processes and the
 			 * pagedaemon.  If it still is completely valid we
 			 * are done.
 			 */
 			if (vm_page_all_valid(fs.m)) {
 				VM_OBJECT_WUNLOCK(fs.object);
 				break; /* break to PAGE HAS BEEN FOUND. */
 			}
 		}
 		VM_OBJECT_ASSERT_WLOCKED(fs.object);
 
 		/*
 		 * Page is not resident.  If the pager might contain the page
 		 * or this is the beginning of the search, allocate a new
 		 * page.  (Default objects are zero-fill, so there is no real
 		 * pager for them.)
 		 */
 		if (fs.m == NULL && (fs.object->type != OBJT_DEFAULT ||
 		    fs.object == fs.first_object)) {
 			rv = vm_fault_allocate(&fs);
 			switch (rv) {
 			case KERN_RESTART:
 				unlock_and_deallocate(&fs);
 				/* FALLTHROUGH */
 			case KERN_RESOURCE_SHORTAGE:
 				goto RetryFault;
 			case KERN_SUCCESS:
 			case KERN_FAILURE:
 			case KERN_OUT_OF_BOUNDS:
 				unlock_and_deallocate(&fs);
 				return (rv);
 			case KERN_NOT_RECEIVER:
 				break;
 			default:
 				panic("vm_fault: Unhandled rv %d", rv);
 			}
 		}
 
 		/*
 		 * Default objects have no pager so no exclusive busy exists
 		 * to protect this page in the chain.  Skip to the next
 		 * object without dropping the lock to preserve atomicity of
 		 * shadow faults.
 		 */
 		if (fs.object->type != OBJT_DEFAULT) {
 			/*
 			 * At this point, we have either allocated a new page
 			 * or found an existing page that is only partially
 			 * valid.
 			 *
 			 * We hold a reference on the current object and the
 			 * page is exclusive busied.  The exclusive busy
 			 * prevents simultaneous faults and collapses while
 			 * the object lock is dropped.
 			 */
 			VM_OBJECT_WUNLOCK(fs.object);
 
 			/*
 			 * If the pager for the current object might have
 			 * the page, then determine the number of additional
 			 * pages to read and potentially reprioritize
 			 * previously read pages for earlier reclamation.
 			 * These operations should only be performed once per
 			 * page fault.  Even if the current pager doesn't
 			 * have the page, the number of additional pages to
 			 * read will apply to subsequent objects in the
 			 * shadow chain.
 			 */
 			if (nera == -1 && !P_KILLED(curproc))
 				nera = vm_fault_readahead(&fs);
 
 			rv = vm_fault_getpages(&fs, nera, &behind, &ahead);
 			if (rv == KERN_SUCCESS) {
 				faultcount = behind + 1 + ahead;
 				hardfault = true;
 				break; /* break to PAGE HAS BEEN FOUND. */
 			}
 			if (rv == KERN_RESOURCE_SHORTAGE)
 				goto RetryFault;
 			VM_OBJECT_WLOCK(fs.object);
 			if (rv == KERN_OUT_OF_BOUNDS) {
 				fault_page_free(&fs.m);
 				unlock_and_deallocate(&fs);
 				return (rv);
 			}
 		}
 
 		/*
 		 * The page was not found in the current object.  Try to
 		 * traverse into a backing object or zero fill if none is
 		 * found.
 		 */
 		if (vm_fault_next(&fs))
 			continue;
 		VM_OBJECT_WUNLOCK(fs.object);
 		vm_fault_zerofill(&fs);
 		/* Don't try to prefault neighboring pages. */
 		faultcount = 1;
 		break;	/* break to PAGE HAS BEEN FOUND. */
 	}
 
 	/*
 	 * PAGE HAS BEEN FOUND.  A valid page has been found and exclusively
 	 * busied.  The object lock must no longer be held.
 	 */
 	vm_page_assert_xbusied(fs.m);
 	VM_OBJECT_ASSERT_UNLOCKED(fs.object);
 
 	/*
 	 * If the page is being written, but isn't already owned by the
 	 * top-level object, we have to copy it into a new page owned by the
 	 * top-level object.
 	 */
 	if (fs.object != fs.first_object) {
 		/*
 		 * We only really need to copy if we want to write it.
 		 */
 		if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
 			vm_fault_cow(&fs);
 			/*
 			 * We only try to prefault read-only mappings to the
 			 * neighboring pages when this copy-on-write fault is
 			 * a hard fault.  In other cases, trying to prefault
 			 * is typically wasted effort.
 			 */
 			if (faultcount == 0)
 				faultcount = 1;
 
 		} else {
 			/* Map the backing object's page read-only. */
 			fs.prot &= ~VM_PROT_WRITE;
 		}
 	}
 
 	/*
 	 * We must verify that the maps have not changed since our last
 	 * lookup.
 	 */
 	if (!fs.lookup_still_valid) {
 		result = vm_fault_relookup(&fs);
 		if (result != KERN_SUCCESS) {
 			fault_deallocate(&fs);
 			if (result == KERN_RESTART)
 				goto RetryFault;
 			return (result);
 		}
 	}
 	VM_OBJECT_ASSERT_UNLOCKED(fs.object);
 
 	/*
 	 * If the page was filled by a pager, save the virtual address that
 	 * should be faulted on next under a sequential access pattern to the
 	 * map entry.  A read lock on the map suffices to update this address
 	 * safely.
 	 */
 	if (hardfault)
 		fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE;
 
 	/*
 	 * Page must be completely valid or it is not fit to
 	 * map into user space.  vm_pager_get_pages() ensures this.
 	 */
 	vm_page_assert_xbusied(fs.m);
 	KASSERT(vm_page_all_valid(fs.m),
 	    ("vm_fault: page %p partially invalid", fs.m));
 
 	vm_fault_dirty(&fs, fs.m);
 
 	/*
 	 * Put this page into the physical map.  We had to do the unlock above
 	 * because pmap_enter() may sleep.  We don't put the page
 	 * back on the active queue until later so that the pageout daemon
 	 * won't find it (yet).
 	 */
 	pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot,
 	    fs.fault_type | (fs.wired ? PMAP_ENTER_WIRED : 0), 0);
 	/*
 	 * Prefault neighbors unless exactly one page was decided on or
 	 * the mapping is wired.  With no page count decided
 	 * (faultcount == 0) the default PFBAK/PFFOR window is used.
 	 */
 	if (faultcount != 1 && (fs.fault_flags & VM_FAULT_WIRE) == 0 &&
 	    fs.wired == 0)
 		vm_fault_prefault(&fs, vaddr,
 		    faultcount > 0 ? behind : PFBAK,
 		    faultcount > 0 ? ahead : PFFOR, false);
 
 	/*
 	 * If the page is not wired down, then put it where the pageout daemon
 	 * can find it.
 	 */
 	if ((fs.fault_flags & VM_FAULT_WIRE) != 0)
 		vm_page_wire(fs.m);
 	else
 		vm_page_activate(fs.m);
 	if (fs.m_hold != NULL) {
 		(*fs.m_hold) = fs.m;
 		vm_page_wire(fs.m);
 	}
 	vm_page_xunbusy(fs.m);
 	fs.m = NULL;
 
 	/*
 	 * Unlock everything, and return
 	 */
 	fault_deallocate(&fs);
 	/* A fault that went to the pager is major; any other is minor. */
 	if (hardfault) {
 		VM_CNT_INC(v_io_faults);
 		curthread->td_ru.ru_majflt++;
 #ifdef RACCT
 		if (racct_enable && fs.object->type == OBJT_VNODE) {
 			PROC_LOCK(curproc);
 			if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
 				racct_add_force(curproc, RACCT_WRITEBPS,
 				    PAGE_SIZE + behind * PAGE_SIZE);
 				racct_add_force(curproc, RACCT_WRITEIOPS, 1);
 			} else {
 				racct_add_force(curproc, RACCT_READBPS,
 				    PAGE_SIZE + ahead * PAGE_SIZE);
 				racct_add_force(curproc, RACCT_READIOPS, 1);
 			}
 			PROC_UNLOCK(curproc);
 		}
 #endif
 	} else
 		curthread->td_ru.ru_minflt++;
 
 	return (KERN_SUCCESS);
 }
 
 /*
  * Speed up the reclamation of pages that precede the faulting pindex within
  * the first object of the shadow chain.  Essentially, perform the equivalent
  * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes
  * the faulting pindex by the cluster size when the pages read by vm_fault()
  * cross a cluster-size boundary.  The cluster size is the greater of the
  * smallest superpage size and VM_FAULT_DONTNEED_MIN.
  *
  * When "fs->first_object" is a shadow object, the pages in the backing object
  * that precede the faulting pindex are deactivated by vm_fault().  So, this
  * function must only be concerned with pages in the first object.
  *
  * "vaddr" is the faulting address and "ahead" the number of pages being
  * read ahead of it.
  */
 static void
 vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead)
 {
 	vm_map_entry_t entry;
 	vm_object_t first_object, object;
 	vm_offset_t end, start;
 	vm_page_t m, m_next;
 	vm_pindex_t pend, pstart;
 	vm_size_t size;
 
 	object = fs->object;
 	VM_OBJECT_ASSERT_UNLOCKED(object);
 	first_object = fs->first_object;
 	/* Neither fictitious nor unmanaged pages can be reclaimed. */
 	if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) {
 		VM_OBJECT_RLOCK(first_object);
 		size = VM_FAULT_DONTNEED_MIN;
 		if (MAXPAGESIZES > 1 && size < pagesizes[1])
 			size = pagesizes[1];
 		/* "end" is the cluster boundary at or below the fault. */
 		end = rounddown2(vaddr, size);
 		if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) &&
 		    (entry = fs->entry)->start < end) {
 			/* Clip the advised range to the start of the entry. */
 			if (end - entry->start < size)
 				start = entry->start;
 			else
 				start = end - size;
 			pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED);
 			pstart = OFF_TO_IDX(entry->offset) + atop(start -
 			    entry->start);
 			m_next = vm_page_find_least(first_object, pstart);
 			pend = OFF_TO_IDX(entry->offset) + atop(end -
 			    entry->start);
 			while ((m = m_next) != NULL && m->pindex < pend) {
 				m_next = TAILQ_NEXT(m, listq);
 				if (!vm_page_all_valid(m) ||
 				    vm_page_busied(m))
 					continue;
 
 				/*
 				 * Don't clear PGA_REFERENCED, since it would
 				 * likely represent a reference by a different
 				 * process.
 				 *
 				 * Typically, at this point, prefetched pages
 				 * are still in the inactive queue.  Only
 				 * pages that triggered page faults are in the
 				 * active queue.  The test for whether the page
 				 * is in the inactive queue is racy; in the
 				 * worst case we will requeue the page
 				 * unnecessarily.
 				 */
 				if (!vm_page_inactive(m))
 					vm_page_deactivate(m);
 			}
 		}
 		VM_OBJECT_RUNLOCK(first_object);
 	}
 }
 
 /*
  * vm_fault_prefault provides a quick way of clustering
  * pagefaults into a process's address space.  It is a "cousin"
  * of vm_map_pmap_enter, except it runs at page fault time instead
  * of mmap time.
  *
  * "addra" is the faulting address; up to "backward" pages before it and
  * "forward" pages after it are candidates.  "obj_locked" tells whether
  * the caller already holds the lock of the entry's object, in which case
  * that lock is neither taken nor released here.  Nothing is done unless
  * the fault's pmap belongs to the current process.
  */
 static void
 vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
     int backward, int forward, bool obj_locked)
 {
 	pmap_t pmap;
 	vm_map_entry_t entry;
 	vm_object_t backing_object, lobject;
 	vm_offset_t addr, starta;
 	vm_pindex_t pindex;
 	vm_page_t m;
 	int i;
 
 	pmap = fs->map->pmap;
 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
 		return;
 
 	entry = fs->entry;
 
 	/*
 	 * Compute the lowest candidate address, guarding against
 	 * underflow and clamping to the start of the entry.
 	 */
 	if (addra < backward * PAGE_SIZE) {
 		starta = entry->start;
 	} else {
 		starta = addra - backward * PAGE_SIZE;
 		if (starta < entry->start)
 			starta = entry->start;
 	}
 
 	/*
 	 * Generate the sequence of virtual addresses that are candidates for
 	 * prefaulting in an outward spiral from the faulting virtual address,
 	 * "addra".  Specifically, the sequence is "addra - PAGE_SIZE", "addra
 	 * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ...
 	 * If the candidate address doesn't have a backing physical page, then
 	 * the loop immediately terminates.
 	 */
 	for (i = 0; i < 2 * imax(backward, forward); i++) {
 		addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE :
 		    PAGE_SIZE);
 		/* Past the forward limit: set to 0 so the range check skips. */
 		if (addr > addra + forward * PAGE_SIZE)
 			addr = 0;
 
 		if (addr < starta || addr >= entry->end)
 			continue;
 
 		if (!pmap_is_prefaultable(pmap, addr))
 			continue;
 
 		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
 		lobject = entry->object.vm_object;
 		if (!obj_locked)
 			VM_OBJECT_RLOCK(lobject);
 		/*
 		 * Walk down through OBJT_DEFAULT objects until a page is
 		 * found, locking each backing object before unlocking the
 		 * object above it.
 		 */
 		while ((m = vm_page_lookup(lobject, pindex)) == NULL &&
 		    lobject->type == OBJT_DEFAULT &&
 		    (backing_object = lobject->backing_object) != NULL) {
 			KASSERT((lobject->backing_object_offset & PAGE_MASK) ==
 			    0, ("vm_fault_prefault: unaligned object offset"));
 			pindex += lobject->backing_object_offset >> PAGE_SHIFT;
 			VM_OBJECT_RLOCK(backing_object);
 			if (!obj_locked || lobject != entry->object.vm_object)
 				VM_OBJECT_RUNLOCK(lobject);
 			lobject = backing_object;
 		}
 		if (m == NULL) {
 			if (!obj_locked || lobject != entry->object.vm_object)
 				VM_OBJECT_RUNLOCK(lobject);
 			break;
 		}
 		/* Only fully valid, non-fictitious pages are mapped. */
 		if (vm_page_all_valid(m) &&
 		    (m->flags & PG_FICTITIOUS) == 0)
 			pmap_enter_quick(pmap, addr, m, entry->protection);
 		if (!obj_locked || lobject != entry->object.vm_object)
 			VM_OBJECT_RUNLOCK(lobject);
 	}
 }
 
 /*
  * Hold each of the physical pages that are mapped by the specified range of
  * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid
  * and allow the specified types of access, "prot".  If all of the implied
  * pages are successfully held, then the number of held pages is returned
  * together with pointers to those pages in the array "ma".  However, if any
  * of the pages cannot be held, -1 is returned.
  */
 int
 vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
     vm_prot_t prot, vm_page_t *ma, int max_count)
 {
 	vm_offset_t end, va;
 	vm_page_t pg;
 	boolean_t need_fault;
 	int count, i;
 
 	/* An empty range trivially holds zero pages. */
 	if (len == 0)
 		return (0);
 
 	/* Widen the request to whole pages and validate it against the map. */
 	end = round_page(addr + len);
 	addr = trunc_page(addr);
 	if (!vm_map_range_valid(map, addr, end))
 		return (-1);
 
 	/* The caller's array must be able to hold every page in the range. */
 	if (atop(end - addr) > max_count)
 		panic("vm_fault_quick_hold_pages: count > max_count");
 	count = atop(end - addr);
 
 	/*
 	 * First pass: the pages are most likely already resident in the
 	 * pmap, so try pmap_extract_and_hold() on each one and remember
 	 * whether any lookup came back empty.
 	 */
 	need_fault = FALSE;
 	for (i = 0, va = addr; va < end; i++, va += PAGE_SIZE) {
 		pg = pmap_extract_and_hold(map->pmap, va, prot);
 		ma[i] = pg;
 		if (pg == NULL) {
 			need_fault = TRUE;
 			continue;
 		}
 		if ((prot & VM_PROT_WRITE) != 0 &&
 		    pg->dirty != VM_PAGE_BITS_ALL) {
 			/*
 			 * Dirty the page explicitly; the caller may modify
 			 * it through an unmanaged mapping or by DMA, which
 			 * would otherwise go unnoticed.
 			 *
 			 * The object lock is not held here.
 			 * See vm_page_clear_dirty_mask().
 			 */
 			vm_page_dirty(pg);
 		}
 	}
 	if (!need_fault)
 		return (count);
 
 	/*
 	 * Second pass: at least one address had no usable mapping (nothing
 	 * mapped, or insufficient permissions), so fault those pages in.
 	 *
 	 * When vm_fault_disable_pagefaults() is in effect, i.e.
 	 * TDP_NOFAULTING is set, we must neither sleep nor take MD VM
 	 * locks, and so must not call vm_fault().  Because some (out of
 	 * tree) callers disable page faults over too wide a region, the
 	 * bail-out is only taken when VM_PROT_QUICK_NOFAULT asks for it.
 	 */
 	if ((prot & VM_PROT_QUICK_NOFAULT) != 0 &&
 	    (curthread->td_pflags & TDP_NOFAULTING) != 0)
 		goto release;
 	for (i = 0, va = addr; va < end; i++, va += PAGE_SIZE) {
 		if (ma[i] != NULL)
 			continue;
 		if (vm_fault(map, va, prot, VM_FAULT_NORMAL, &ma[i]) !=
 		    KERN_SUCCESS)
 			goto release;
 	}
 	return (count);
 
 release:
 	/* Undo every hold obtained so far and report failure. */
 	for (i = 0; i < count; i++) {
 		if (ma[i] != NULL)
 			vm_page_unwire(ma[i], PQ_INACTIVE);
 	}
 	return (-1);
 }
 
 /*
  *	Routine:
  *		vm_fault_copy_entry
  *	Function:
  *		Create new shadow object backing dst_entry with private copy of
  *		all underlying pages. When src_entry is equal to dst_entry,
  *		function implements COW for wired-down map entry. Otherwise,
  *		it forks wired entry into dst_map.
  *
  *	In/out conditions:
  *		The source and destination maps must be locked for write.
  *		The source map entry must be wired down (or be a sharing map
  *		entry corresponding to a main map entry that is wired down).
  *
  *	Parameters:
  *		dst_map		map whose pmap receives the new mappings
  *		src_map		not otherwise referenced by this function
  *		dst_entry	entry whose range is populated
  *		src_entry	entry whose object supplies the pages
  *		fork_charge	if not NULL, the new object is charged to
  *				the current thread's credentials and its
  *				charge is added to *fork_charge
  */
 void
 vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
     vm_map_entry_t dst_entry, vm_map_entry_t src_entry,
     vm_ooffset_t *fork_charge)
 {
 	vm_object_t backing_object, dst_object, object, src_object;
 	vm_pindex_t dst_pindex, pindex, src_pindex;
 	vm_prot_t access, prot;
 	vm_offset_t vaddr;
 	vm_page_t dst_m;
 	vm_page_t src_m;
 	boolean_t upgrade;
 
 	/* Touch the otherwise unused parameter for the benefit of lint. */
 #ifdef	lint
 	src_map++;
 #endif	/* lint */
 
 	/* "upgrade" is the in-place case: source and destination coincide. */
 	upgrade = src_entry == dst_entry;
 	access = prot = dst_entry->protection;
 
 	src_object = src_entry->object.vm_object;
 	src_pindex = OFF_TO_IDX(src_entry->offset);
 
 	if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
 		/* Reuse the source object, taking an extra reference on it. */
 		dst_object = src_object;
 		vm_object_reference(dst_object);
 	} else {
 		/*
 		 * Create the top-level object for the destination entry.
 		 * Doesn't actually shadow anything - we copy the pages
 		 * directly.
 		 */
 		dst_object = vm_object_allocate_anon(atop(dst_entry->end -
 		    dst_entry->start), NULL, NULL, 0);
 #if VM_NRESERVLEVEL > 0
 		dst_object->flags |= OBJ_COLORED;
 		dst_object->pg_color = atop(dst_entry->start);
 #endif
 		dst_object->domain = src_object->domain;
 		dst_object->charge = dst_entry->end - dst_entry->start;
 	}
 
 	VM_OBJECT_WLOCK(dst_object);
 	KASSERT(upgrade || dst_entry->object.vm_object == NULL,
 	    ("vm_fault_copy_entry: vm_object not NULL"));
 	if (src_object != dst_object) {
 		/* Install the new object at offset zero of the entry. */
 		dst_entry->object.vm_object = dst_object;
 		dst_entry->offset = 0;
 		dst_entry->eflags &= ~MAP_ENTRY_VN_EXEC;
 	}
 	if (fork_charge != NULL) {
 		/* Charge the object to the current thread's credentials. */
 		KASSERT(dst_entry->cred == NULL,
 		    ("vm_fault_copy_entry: leaked swp charge"));
 		dst_object->cred = curthread->td_ucred;
 		crhold(dst_object->cred);
 		*fork_charge += dst_object->charge;
 	} else if ((dst_object->type == OBJT_DEFAULT ||
 	    dst_object->type == OBJT_SWAP) &&
 	    dst_object->cred == NULL) {
 		/* Move the entry's credential reference to the object. */
 		KASSERT(dst_entry->cred != NULL, ("no cred for entry %p",
 		    dst_entry));
 		dst_object->cred = dst_entry->cred;
 		dst_entry->cred = NULL;
 	}
 
 	/*
 	 * If not an upgrade, then enter the mappings in the pmap as
 	 * read and/or execute accesses.  Otherwise, enter them as
 	 * write accesses.
 	 *
 	 * A writeable large page mapping is only created if all of
 	 * the constituent small page mappings are modified. Marking
 	 * PTEs as modified on inception allows promotion to happen
 	 * without taking potentially large number of soft faults.
 	 */
 	if (!upgrade)
 		access &= ~VM_PROT_WRITE;
 
 	/*
 	 * Loop through all of the virtual pages within the entry's
 	 * range, copying each page from the source object to the
 	 * destination object.  Since the source is wired, those pages
 	 * must exist.  In contrast, the destination is pageable.
 	 * Since the destination object doesn't share any backing storage
 	 * with the source object, all of its pages must be dirtied,
 	 * regardless of whether they can be written.
 	 */
 	for (vaddr = dst_entry->start, dst_pindex = 0;
 	    vaddr < dst_entry->end;
 	    vaddr += PAGE_SIZE, dst_pindex++) {
 again:
 		/*
 		 * Find the page in the source object, and copy it in.
 		 * Because the source is wired down, the page will be
 		 * in memory.
 		 */
 		if (src_object != dst_object)
 			VM_OBJECT_RLOCK(src_object);
 		object = src_object;
 		pindex = src_pindex + dst_pindex;
 		while ((src_m = vm_page_lookup(object, pindex)) == NULL &&
 		    (backing_object = object->backing_object) != NULL) {
 			/*
 			 * Unless the source mapping is read-only or
 			 * it is presently being upgraded from
 			 * read-only, the first object in the shadow
 			 * chain should provide all of the pages.  In
 			 * other words, this loop body should never be
 			 * executed when the source mapping is already
 			 * read/write.
 			 */
 			KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 ||
 			    upgrade,
 			    ("vm_fault_copy_entry: main object missing page"));
 
 			/*
 			 * Lock the backing object before releasing the
 			 * current one; dst_object stays write-locked.
 			 */
 			VM_OBJECT_RLOCK(backing_object);
 			pindex += OFF_TO_IDX(object->backing_object_offset);
 			if (object != dst_object)
 				VM_OBJECT_RUNLOCK(object);
 			object = backing_object;
 		}
 		KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing"));
 
 		if (object != dst_object) {
 			/*
 			 * Allocate a page in the destination object.
 			 */
 			dst_m = vm_page_alloc(dst_object, (src_object ==
 			    dst_object ? src_pindex : 0) + dst_pindex,
 			    VM_ALLOC_NORMAL);
 			if (dst_m == NULL) {
 				/*
 				 * Allocation failed: drop both object
 				 * locks, call vm_wait(), and retry this
 				 * index from the top.
 				 */
 				VM_OBJECT_WUNLOCK(dst_object);
 				VM_OBJECT_RUNLOCK(object);
 				vm_wait(dst_object);
 				VM_OBJECT_WLOCK(dst_object);
 				goto again;
 			}
 			pmap_copy_page(src_m, dst_m);
 			VM_OBJECT_RUNLOCK(object);
 			/* The copy is as valid as the source and fully dirty. */
 			dst_m->dirty = dst_m->valid = src_m->valid;
 		} else {
 			/* The page already lives in dst_object; busy it. */
 			dst_m = src_m;
 			if (vm_page_busy_acquire(dst_m, VM_ALLOC_WAITFAIL) == 0)
 				goto again;
 			if (dst_m->pindex >= dst_object->size) {
 				/*
 				 * We are upgrading.  Index can occur
 				 * out of bounds if the object type is
 				 * vnode and the file was truncated.
 				 */
 				vm_page_xunbusy(dst_m);
 				break;
 			}
 		}
 		VM_OBJECT_WUNLOCK(dst_object);
 
 		/*
 		 * Enter it in the pmap. If a wired, copy-on-write
 		 * mapping is being replaced by a write-enabled
 		 * mapping, then wire that new mapping.
 		 *
 		 * The page can be invalid if the user called
 		 * msync(MS_INVALIDATE) or truncated the backing vnode
 		 * or shared memory object.  In this case, do not
 		 * insert it into pmap, but still do the copy so that
 		 * all copies of the wired map entry have similar
 		 * backing pages.
 		 */
 		if (vm_page_all_valid(dst_m)) {
 			pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
 			    access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);
 		}
 
 		/*
 		 * Mark it no longer busy, and put it on the active list.
 		 */
 		VM_OBJECT_WLOCK(dst_object);
 		
 		if (upgrade) {
 			if (src_m != dst_m) {
 				/* Move the wiring from the old page to the copy. */
 				vm_page_unwire(src_m, PQ_INACTIVE);
 				vm_page_wire(dst_m);
 			} else {
 				KASSERT(vm_page_wired(dst_m),
 				    ("dst_m %p is not wired", dst_m));
 			}
 		} else {
 			vm_page_activate(dst_m);
 		}
 		vm_page_xunbusy(dst_m);
 	}
 	VM_OBJECT_WUNLOCK(dst_object);
 	if (upgrade) {
 		/* The entry no longer needs a copy; drop one source reference. */
 		dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY);
 		vm_object_deallocate(src_object);
 	}
 }
 
 /*
  * Block entry into the machine-independent layer's page fault handler by
  * the calling thread.  Subsequent calls to vm_fault() by that thread will
  * return KERN_PROTECTION_FAILURE.  Enable machine-dependent handling of
  * spurious page faults. 
  */
 int
 vm_fault_disable_pagefaults(void)
 {
 	int save;
 
 	/*
 	 * Set both flags on the current thread and hand back the value
 	 * produced by curthread_pflags_set() so that the caller can pass
 	 * it to vm_fault_enable_pagefaults() later.
 	 */
 	save = curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR);
 	return (save);
 }
 
 /*
  * Counterpart of vm_fault_disable_pagefaults(): pass "save" to
  * curthread_pflags_restore().  NOTE(review): "save" is presumably the
  * value returned by the matching vm_fault_disable_pagefaults() call --
  * confirm against callers.
  */
 void
 vm_fault_enable_pagefaults(int save)
 {
 
 	curthread_pflags_restore(save);
 }
Index: projects/clang1100-import/sys/vm/vm_mmap.c
===================================================================
--- projects/clang1100-import/sys/vm/vm_mmap.c	(revision 364278)
+++ projects/clang1100-import/sys/vm/vm_mmap.c	(revision 364279)
@@ -1,1671 +1,1671 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
  *
  *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
  */
 
 /*
  * Mapped file (mmap) interface to VM
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/elf.h>
 #include <sys/filedesc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/procctl.h>
 #include <sys/racct.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/conf.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/vmmeter.h>
 #if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
 #include <machine/md_var.h>
 #endif
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_page.h>
 #include <vm/vnode_pager.h>
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 /*
  * Integer knobs exported under the "vm" sysctl node.  All three are
  * registered with CTLFLAG_RWTUN.
  */
 /* When set: "Do not apply RLIMIT_MEMLOCK on mlockall". */
 int old_mlock = 0;
 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
     "Do not apply RLIMIT_MEMLOCK on mlockall");
 /* When set (the default), mincore reports mappings rather than residency. */
 static int mincore_mapped = 1;
 SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
     "mincore reports mappings, not residency");
 /* When set, kern_mmap_maxprot() may use the requested prot as the maximum. */
 static int imply_prot_max = 0;
 SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
     "Imply maximum page protections in mmap() when none are specified");
 
 /* Exclusive upper bound (2^31) on addresses for MAP_32BIT mappings. */
 #ifdef MAP_32BIT
 #define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
 #endif
 
 #ifndef _SYS_SYSPROTO_H_
 struct sbrk_args {
 	int incr;
 };
 #endif
 
 /*
  * sbrk system call entry point.  Not implemented: both arguments are
  * ignored and the call always fails with EOPNOTSUPP.
  */
 int
 sys_sbrk(struct thread *td, struct sbrk_args *uap)
 {
 	int error;
 
 	error = EOPNOTSUPP;
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sstk_args {
 	int incr;
 };
 #endif
 
 /*
  * sstk system call entry point.  Not implemented: both arguments are
  * ignored and the call always fails with EOPNOTSUPP.
  */
 int
 sys_sstk(struct thread *td, struct sstk_args *uap)
 {
 	int error;
 
 	error = EOPNOTSUPP;
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * Old getpagesize system call (built only with COMPAT_43): store
  * PAGE_SIZE as the first return value and report success.  "uap" is
  * not used.
  */
 int
 ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
 {
 
 	td->td_retval[0] = PAGE_SIZE;
 	return (0);
 }
 #endif				/* COMPAT_43 */
 
 
 /*
  * Memory Map (mmap) system call.  Note that the file offset
  * and address are allowed to be NOT page aligned, though if
  * the MAP_FIXED flag is set, both must have the same remainder
  * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
  * page-aligned, the actual mapping starts at trunc_page(addr)
  * and the return value is adjusted up by the page offset.
  *
  * Generally speaking, only character devices which are themselves
  * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
  * there would be no cache coherency between a descriptor and a VM mapping
  * both to the same character device.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mmap_args {
 	void *addr;
 	size_t len;
 	int prot;
 	int flags;
 	int fd;
 	long pad;
 	off_t pos;
 };
 #endif
 
 int
 sys_mmap(struct thread *td, struct mmap_args *uap)
 {
 
 	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
 	    uap->flags, uap->fd, uap->pos));
 }
 
 int
 kern_mmap_maxprot(struct proc *p, int prot)
 {
 
 	if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 ||
 	    (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0)
 		return (_PROT_ALL);
 	if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) &&
 	    prot != PROT_NONE)
 		 return (prot);
 	return (_PROT_ALL);
 }
 
 /*
  * Convenience wrapper around kern_mmap_req(): package the individual
  * mmap arguments into a request structure, leaving every field not
  * named below zero-initialized, and submit it.
  */
 int
 kern_mmap(struct thread *td, uintptr_t addr0, size_t len, int prot, int flags,
     int fd, off_t pos)
 {
 	const struct mmap_req req = {
 		.mr_hint = addr0,
 		.mr_len = len,
 		.mr_prot = prot,
 		.mr_flags = flags,
 		.mr_fd = fd,
 		.mr_pos = pos
 	};
 	int error;
 
 	error = kern_mmap_req(td, &req);
 	return (error);
 }
 
 /*
  * Common implementation behind the mmap entry points.  Validates the
  * request described by "mrp" and then creates a guard, anonymous, or
  * descriptor-backed mapping in the address space of td's process.  On
  * success the mapped address, adjusted up by the page offset of the
  * file position, is stored in td->td_retval[0] and 0 is returned;
  * otherwise an errno value is returned.
  */
 int
 kern_mmap_req(struct thread *td, const struct mmap_req *mrp)
 {
 	struct vmspace *vms;
 	struct file *fp;
 	struct proc *p;
 	off_t pos;
 	vm_offset_t addr;
 	vm_size_t len, pageoff, size;
 	vm_prot_t cap_maxprot;
 	int align, error, fd, flags, max_prot, prot;
 	cap_rights_t rights;
 	mmap_check_fp_fn check_fp_fn;
 
 	/* Work on local copies; the request itself is const. */
 	addr  = mrp->mr_hint;
 	len = mrp->mr_len;
 	prot = mrp->mr_prot;
 	flags = mrp->mr_flags;
 	fd = mrp->mr_fd;
 	pos = mrp->mr_pos;
 	check_fp_fn = mrp->mr_check_fp_fn;
 
 	/*
 	 * Split "prot" into the requested and maximum protections.  An
 	 * explicit maximum must include every requested bit.
 	 */
 	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
 		return (EINVAL);
 	max_prot = PROT_MAX_EXTRACT(prot);
 	prot = PROT_EXTRACT(prot);
 	if (max_prot != 0 && (max_prot & prot) != prot)
 		return (ENOTSUP);
 
 	p = td->td_proc;
 
 	/*
 	 * Always honor PROT_MAX if set.  If not, default to all
 	 * permissions unless we're implying maximum permissions.
 	 */
 	if (max_prot == 0)
 		max_prot = kern_mmap_maxprot(p, prot);
 
 	vms = p->p_vmspace;
 	fp = NULL;
 	AUDIT_ARG_FD(fd);
 
 	/*
 	 * Ignore old flags that used to be defined but did not do anything.
 	 */
 	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
 	
 	/*
 	 * Enforce the constraints.
 	 * Mapping of length 0 is only allowed for old binaries.
 	 * Anonymous mapping shall specify -1 as filedescriptor and
 	 * zero position for new code. Be nice to ancient a.out
 	 * binaries and correct pos for anonymous mapping, since old
 	 * ld.so sometimes issues anonymous map requests with non-zero
 	 * pos.
 	 */
 	if (!SV_CURPROC_FLAG(SV_AOUT)) {
 		if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) ||
 		    ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
 			return (EINVAL);
 	} else {
 		if ((flags & MAP_ANON) != 0)
 			pos = 0;
 	}
 
 	/*
 	 * A MAP_STACK request must not name a descriptor and must ask for
 	 * both read and write access; it is then handled as an anonymous
 	 * mapping.
 	 */
 	if (flags & MAP_STACK) {
 		if ((fd != -1) ||
 		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
 			return (EINVAL);
 		flags |= MAP_ANON;
 		pos = 0;
 	}
 	/* Reject unknown flags and inconsistent flag combinations. */
 	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
 	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
 	    MAP_PREFAULT_READ | MAP_GUARD |
 #ifdef MAP_32BIT
 	    MAP_32BIT |
 #endif
 	    MAP_ALIGNMENT_MASK)) != 0)
 		return (EINVAL);
 	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
 		return (EINVAL);
 	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
 		return (EINVAL);
 	if (prot != PROT_NONE &&
 	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
 		return (EINVAL);
 	/*
 	 * A guard request must use PROT_NONE, no descriptor, a zero
 	 * position, and only the flags listed here.
 	 */
 	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
 	    pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
 #ifdef MAP_32BIT
 	    MAP_32BIT |
 #endif
 	    MAP_ALIGNMENT_MASK)) != 0))
 		return (EINVAL);
 
 	/*
 	 * Align the file position to a page boundary,
 	 * and save its page offset component.
 	 */
 	pageoff = (pos & PAGE_MASK);
 	pos -= pageoff;
 
 	/* Compute size from len by rounding (on both ends). */
 	size = len + pageoff;			/* low end... */
 	size = round_page(size);		/* hi end */
 	/* Check for rounding up to zero. */
 	if (len > size)
 		return (ENOMEM);
 
 	/* Ensure alignment is at least a page and fits in a pointer. */
 	align = flags & MAP_ALIGNMENT_MASK;
 	if (align != 0 && align != MAP_ALIGNED_SUPER &&
 	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
 	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
 		return (EINVAL);
 
 	/*
 	 * Check for illegal addresses.  Watch out for address wrap... Note
 	 * that VM_*_ADDRESS are not constants due to casts (argh).
 	 */
 	if (flags & MAP_FIXED) {
 		/*
 		 * The specified address must have the same remainder
 		 * as the file offset taken modulo PAGE_SIZE, so it
 		 * should be aligned after adjustment by pageoff.
 		 */
 		addr -= pageoff;
 		if (addr & PAGE_MASK)
 			return (EINVAL);
 
 		/* Address range must be all in user VM space. */
 		if (!vm_map_range_valid(&vms->vm_map, addr, addr + size))
 			return (EINVAL);
 #ifdef MAP_32BIT
 		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
 			return (EINVAL);
 	} else if (flags & MAP_32BIT) {
 		/*
 		 * For MAP_32BIT, override the hint if it is too high and
 		 * do not bother moving the mapping past the heap (since
 		 * the heap is usually above 2GB).
 		 */
 		if (addr + size > MAP_32BIT_MAX_ADDR)
 			addr = 0;
 #endif
 	} else {
 		/*
 		 * XXX for non-fixed mappings where no hint is provided or
 		 * the hint would fall in the potential heap space,
 		 * place it after the end of the largest possible heap.
 		 *
 		 * There should really be a pmap call to determine a reasonable
 		 * location.
 		 */
 		if (addr == 0 ||
 		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
 		    addr < round_page((vm_offset_t)vms->vm_daddr +
 		    lim_max(td, RLIMIT_DATA))))
 			addr = round_page((vm_offset_t)vms->vm_daddr +
 			    lim_max(td, RLIMIT_DATA));
 	}
 	if (len == 0) {
 		/*
 		 * Return success without mapping anything for old
 		 * binaries that request a page-aligned mapping of
 		 * length 0.  For modern binaries, this function
 		 * returns an error earlier.
 		 */
 		error = 0;
 	} else if ((flags & MAP_GUARD) != 0) {
 		/* Guards carry no protections and no backing handle. */
 		error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
 		    VM_PROT_NONE, flags, NULL, pos, FALSE, td);
 	} else if ((flags & MAP_ANON) != 0) {
 		/*
 		 * Mapping blank space is trivial.
 		 *
 		 * This relies on VM_PROT_* matching PROT_*.
 		 */
 		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
 		    max_prot, flags, NULL, pos, FALSE, td);
 	} else {
 		/*
 		 * Mapping file, get fp for validation and don't let the
 		 * descriptor disappear on us if we block. Check capability
 		 * rights, but also return the maximum rights to be combined
 		 * with maxprot later.
 		 */
 		cap_rights_init_one(&rights, CAP_MMAP);
 		if (prot & PROT_READ)
 			cap_rights_set_one(&rights, CAP_MMAP_R);
 		/* Write rights are requested only for shared mappings. */
 		if ((flags & MAP_SHARED) != 0) {
 			if (prot & PROT_WRITE)
 				cap_rights_set_one(&rights, CAP_MMAP_W);
 		}
 		if (prot & PROT_EXEC)
 			cap_rights_set_one(&rights, CAP_MMAP_X);
 		error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
 		if (error != 0)
 			goto done;
 		/*
 		 * Binaries at or above P_OSREL_MAP_FSTRICT must specify
 		 * MAP_SHARED or MAP_PRIVATE.
 		 */
 		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
 		    p->p_osrel >= P_OSREL_MAP_FSTRICT) {
 			error = EINVAL;
 			goto done;
 		}
 		/* Let the optional per-request callback veto the file. */
 		if (check_fp_fn != NULL) {
 			error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
 			    flags);
 			if (error != 0)
 				goto done;
 		}
 		/* This relies on VM_PROT_* matching PROT_*. */
 		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
 		    max_prot & cap_maxprot, flags, pos, td);
 	}
 
 	/* Report the address with the sub-page offset added back. */
 	if (error == 0)
 		td->td_retval[0] = (register_t) (addr + pageoff);
 done:
 	/* Drop the file reference taken by fget_mmap(), if any. */
 	if (fp)
 		fdrop(fp, td);
 
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD6)
 /*
  * FreeBSD 6 compatible mmap entry point (built only with
  * COMPAT_FREEBSD6): unpack the arguments and forward them to
  * kern_mmap().
  */
 int
 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
 {
 	uintptr_t hint;
 	int error;
 
 	hint = (uintptr_t)uap->addr;
 	error = kern_mmap(td, hint, uap->len, uap->prot, uap->flags,
 	    uap->fd, uap->pos);
 	return (error);
 }
 #endif
 
 #ifdef COMPAT_43
 #ifndef _SYS_SYSPROTO_H_
 struct ommap_args {
 	caddr_t addr;
 	int len;
 	int prot;
 	int flags;
 	int fd;
 	long pos;
 };
 #endif
 /*
  * Old mmap entry point (built only with COMPAT_43).  Translates the
  * old protection encoding and the OMAP_* flag bits into their current
  * PROT_* and MAP_* equivalents and then calls kern_mmap().
  */
 int
 ommap(struct thread *td, struct ommap_args *uap)
 {
 	/* Indexed by the low three bits of the old protection value. */
 	static const char cvtbsdprot[8] = {
 		0,					/* 0 */
 		PROT_EXEC,				/* 1 */
 		PROT_WRITE,				/* 2 */
 		PROT_EXEC | PROT_WRITE,			/* 3 */
 		PROT_READ,				/* 4 */
 		PROT_EXEC | PROT_READ,			/* 5 */
 		PROT_WRITE | PROT_READ,			/* 6 */
 		PROT_EXEC | PROT_WRITE | PROT_READ,	/* 7 */
 	};
 	int newflags, newprot, oflags;
 
 #define	OMAP_ANON	0x0002
 #define	OMAP_COPY	0x0020
 #define	OMAP_SHARED	0x0010
 #define	OMAP_FIXED	0x0100
 
 	newprot = cvtbsdprot[uap->prot & 0x7];
 #if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
 	/* Any access implies execute when i386_read_exec is enabled. */
 	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		if (newprot != 0)
 			newprot |= PROT_EXEC;
 	}
 #endif
 
 	/* A mapping is private unless OMAP_SHARED was requested. */
 	oflags = uap->flags;
 	newflags = (oflags & OMAP_SHARED) != 0 ? MAP_SHARED : MAP_PRIVATE;
 	if ((oflags & OMAP_ANON) != 0)
 		newflags |= MAP_ANON;
 	if ((oflags & OMAP_COPY) != 0)
 		newflags |= MAP_COPY;
 	if ((oflags & OMAP_FIXED) != 0)
 		newflags |= MAP_FIXED;
 
 	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, newprot,
 	    newflags, uap->fd, uap->pos));
 }
 #endif				/* COMPAT_43 */
 
 
 #ifndef _SYS_SYSPROTO_H_
 struct msync_args {
 	void *addr;
 	size_t len;
 	int flags;
 };
 #endif
 int
 sys_msync(struct thread *td, struct msync_args *uap)
 {
 
 	return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
 }
 
 /*
  * Synchronize the page-rounded range that covers [addr0, addr0 + size)
  * in the address space of td's process by calling vm_map_sync(), and
  * translate its result into an errno value.
  */
 int
 kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
 {
 	vm_map_t map;
 	vm_offset_t start;
 	vm_size_t slop;
 	int error, rv;
 
 	/* Extend the range downward and upward to page boundaries. */
 	start = addr0;
 	slop = (start & PAGE_MASK);
 	start -= slop;
 	size += slop;
 	size = (vm_size_t) round_page(size);
 
 	/* Reject a range that wraps around the address space. */
 	if (start + size < start)
 		return (EINVAL);
 
 	/* MS_ASYNC and MS_INVALIDATE may not be combined. */
 	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
 		return (EINVAL);
 
 	map = &td->td_proc->p_vmspace->vm_map;
 
 	/*
 	 * Clean the pages, then map the VM status onto an errno value.
 	 */
 	rv = vm_map_sync(map, start, start + size, (flags & MS_ASYNC) == 0,
 	    (flags & MS_INVALIDATE) != 0);
 	if (rv == KERN_SUCCESS)
 		error = 0;
 	else if (rv == KERN_INVALID_ADDRESS)
 		error = ENOMEM;
 	else if (rv == KERN_INVALID_ARGUMENT)
 		error = EBUSY;
 	else if (rv == KERN_FAILURE)
 		error = EIO;
 	else
 		error = EINVAL;
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct munmap_args {
 	void *addr;
 	size_t len;
 };
 #endif
 int
 sys_munmap(struct thread *td, struct munmap_args *uap)
 {
 
 	return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
 }
 
 /*
  * Remove the page-rounded range that covers [addr0, addr0 + size) from
  * the address space of td's process.  Returns EINVAL for a zero size
  * or a range that vm_map_range_valid() rejects, and 0 otherwise.  With
  * HWPMC_HOOKS, an installed PMC_FN_MUNMAP hook is called when the
  * range contained an entry that passed the VM_PROT_EXECUTE check.
  */
 int
 kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
 {
 #ifdef HWPMC_HOOKS
 	struct pmckern_map_out pkm;
 	vm_map_entry_t entry;
 	bool pmc_handled;
 #endif
 	vm_offset_t addr, end;
 	vm_size_t pageoff;
 	vm_map_t map;
 
 	if (size == 0)
 		return (EINVAL);
 
 	/* Extend the range downward and upward to page boundaries. */
 	addr = addr0;
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
 	size = (vm_size_t) round_page(size);
 	end = addr + size;
 	map = &td->td_proc->p_vmspace->vm_map;
 	if (!vm_map_range_valid(map, addr, end))
 		return (EINVAL);
 
 	vm_map_lock(map);
 #ifdef HWPMC_HOOKS
 	pmc_handled = false;
 	if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
 		pmc_handled = true;
 		/*
 		 * Inform hwpmc if the address range being unmapped contains
 		 * an executable region.
 		 */
 		pkm.pm_address = (uintptr_t) NULL;
 		if (vm_map_lookup_entry(map, addr, &entry)) {
 			/* Stop at the first entry passing the execute check. */
 			for (; entry->start < end;
 			    entry = vm_map_entry_succ(entry)) {
 				if (vm_map_check_protection(map, entry->start,
 					entry->end, VM_PROT_EXECUTE) == TRUE) {
 					pkm.pm_address = (uintptr_t) addr;
 					pkm.pm_size = (size_t) size;
 					break;
 				}
 			}
 		}
 	}
 #endif
 	vm_map_delete(map, addr, end);
 
 	/*
 	 * Without HWPMC_HOOKS only the plain vm_map_unlock() below is
 	 * compiled; with it, that call becomes the "else" branch.
 	 */
 #ifdef HWPMC_HOOKS
 	if (__predict_false(pmc_handled)) {
 		/* downgrade the lock to prevent a LOR with the pmc-sx lock */
 		vm_map_lock_downgrade(map);
 		if (pkm.pm_address != (uintptr_t) NULL)
 			PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
 		vm_map_unlock_read(map);
 	} else
 #endif
 		vm_map_unlock(map);
 
 	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mprotect_args {
 	const void *addr;
 	size_t len;
 	int prot;
 };
 #endif
 int
 sys_mprotect(struct thread *td, struct mprotect_args *uap)
 {
 
 	return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot));
 }
 
 /*
  * Change the protection of the page-rounded range that covers
  * [addr0, addr0 + size) in the address space of td's process.  "prot"
  * may also carry a maximum protection (see PROT_MAX_EXTRACT()); when
  * it does, the maximum is applied first and the current protection
  * second.  Returns 0 on success or an errno value.
  */
 int
 kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot)
 {
 	vm_offset_t addr;
 	vm_size_t pageoff;
 	int vm_error, max_prot;
 
 	addr = addr0;
 	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
 		return (EINVAL);
 	max_prot = PROT_MAX_EXTRACT(prot);
 	prot = PROT_EXTRACT(prot);
 	/* Extend the range downward and upward to page boundaries. */
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
 	size = (vm_size_t) round_page(size);
 	/*
 	 * Reject ranges that wrap.  For SV_ILP32 processes the end is
 	 * truncated to 32 bits before the comparison.  Note that the
 	 * "else" below binds to the plain wrap check after the #endif.
 	 */
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		if (((addr + size) & 0xffffffff) < addr)
 			return (EINVAL);
 	} else
 #endif
 	if (addr + size < addr)
 		return (EINVAL);
 
 	vm_error = KERN_SUCCESS;
 	if (max_prot != 0) {
 		/* The maximum must include every requested bit. */
 		if ((max_prot & prot) != prot)
 			return (ENOTSUP);
 		vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
 		    addr, addr + size, max_prot, TRUE);
 	}
 	if (vm_error == KERN_SUCCESS)
 		vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
 		    addr, addr + size, prot, FALSE);
 
 	/* Any status not listed here is reported as EINVAL. */
 	switch (vm_error) {
 	case KERN_SUCCESS:
 		return (0);
 	case KERN_PROTECTION_FAILURE:
 		return (EACCES);
 	case KERN_RESOURCE_SHORTAGE:
 		return (ENOMEM);
 	}
 	return (EINVAL);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct minherit_args {
 	void *addr;
 	size_t len;
 	int inherit;
 };
 #endif
 int
 sys_minherit(struct thread *td, struct minherit_args *uap)
 {
 
 	return (kern_minherit(td, (uintptr_t)uap->addr, uap->len,
 	    uap->inherit));
 }
 
 /*
  * Set the inheritance attribute of the page-rounded range that covers
  * [addr0, addr0 + len) in the address space of td's process by calling
  * vm_map_inherit(), and translate its result into an errno value.
  */
 int
 kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0)
 {
 	vm_map_t map;
 	vm_offset_t start;
 	vm_size_t length, slop;
 	int rv;
 
 	/* Extend the range downward and upward to page boundaries. */
 	start = (vm_offset_t)addr0;
 	slop = (start & PAGE_MASK);
 	start -= slop;
 	length = len;
 	length += slop;
 	length = (vm_size_t) round_page(length);
 
 	/* Reject a range that wraps around the address space. */
 	if (start + length < start)
 		return (EINVAL);
 
 	map = &td->td_proc->p_vmspace->vm_map;
 	rv = vm_map_inherit(map, start, start + length,
 	    (vm_inherit_t)inherit0);
 	if (rv == KERN_SUCCESS)
 		return (0);
 	if (rv == KERN_PROTECTION_FAILURE)
 		return (EACCES);
 	return (EINVAL);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct madvise_args {
 	void *addr;
 	size_t len;
 	int behav;
 };
 #endif
 
 int
 sys_madvise(struct thread *td, struct madvise_args *uap)
 {
 
 	return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
 }
 
 /*
  * Apply the advice "behav" to [addr0, addr0 + len) in the address
  * space of td's process.  MADV_PROTECT is not range based and is
  * redirected to kern_procctl(); everything else goes to
  * vm_map_madvise() on the page-rounded range.
  */
 int
 kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
 {
 	vm_map_t map;
 	vm_offset_t base, first, last;
 	int pprot;
 
 	/*
 	 * Special case: advising the swap pager that we are "immortal"
 	 * is handled as a PROC_SPROTECT request on our own process.
 	 */
 	if (behav == MADV_PROTECT) {
 		pprot = PPROT_SET;
 		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
 		    PROC_SPROTECT, &pprot));
 	}
 
 	/*
 	 * Check for illegal addresses, watching out for address wrap.
 	 * Note that VM_*_ADDRESS are not constants due to casts (argh).
 	 */
 	base = addr0;
 	map = &td->td_proc->p_vmspace->vm_map;
 	if (!vm_map_range_valid(map, base, base + len))
 		return (EINVAL);
 
 	/*
 	 * This routine is only advisory, so be conservative and cover
 	 * every page that the range touches.
 	 */
 	first = trunc_page(base);
 	last = round_page(base + len);
 
 	/* Illegal values of behav are rejected by vm_map_madvise(). */
 	return (vm_map_madvise(map, first, last, behav));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mincore_args {
 	const void *addr;
 	size_t len;
 	char *vec;
 };
 #endif
 
 int
 sys_mincore(struct thread *td, struct mincore_args *uap)
 {
 
 	return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
 }
 
 /*
  * Report residency information for the pages of the calling process that
  * lie in [addr0, addr0 + len), widened outwards to page boundaries.  One
  * status byte per page is stored into the user buffer "vec" with subyte();
  * pages belonging to skipped map entries (submaps, entries without an
  * object) are reported as zero.
  *
  * The map read lock is dropped around every subyte() call because the store
  * may fault.  If the map's timestamp changed in the meantime the scan is
  * restarted.
  *
  * Returns 0 on success, ENOMEM if the range extends beyond the map or
  * contains an unmapped hole, and EFAULT if a store to "vec" fails.
  */
 int
 kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
 {
 	pmap_t pmap;
 	vm_map_t map;
 	vm_map_entry_t current, entry;
 	vm_object_t object;
 	vm_offset_t addr, cend, end, first_addr;
 	vm_paddr_t pa;
 	vm_page_t m;
 	vm_pindex_t pindex;
 	int error, lastvecindex, mincoreinfo, vecindex;
 	unsigned int timestamp;
 
 	/*
 	 * "error" is otherwise only assigned by the subyte() calls below.  A
 	 * request that covers no pages (e.g. len == 0 at a page-aligned
 	 * address) never reaches one of them, so start from success rather
 	 * than returning an indeterminate value at done2.
 	 */
 	error = 0;
 
 	/*
 	 * Make sure that the addresses presented are valid for user
 	 * mode.
 	 */
 	first_addr = addr = trunc_page(addr0);
 	end = round_page(addr0 + len);
 	map = &td->td_proc->p_vmspace->vm_map;
 	if (end > vm_map_max(map) || end < addr)
 		return (ENOMEM);
 
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 
 	vm_map_lock_read(map);
 	/*
 	 * NOTE(review): a restart resumes at the current "addr" but resets
 	 * lastvecindex below, so the zero-fill loop appears to rewrite vec
 	 * entries that were already reported -- confirm whether intended.
 	 */
 RestartScan:
 	timestamp = map->timestamp;
 
 	if (!vm_map_lookup_entry(map, addr, &entry)) {
 		vm_map_unlock_read(map);
 		return (ENOMEM);
 	}
 
 	/*
 	 * Do this on a map entry basis so that if the pages are not
 	 * in the current processes address space, we can easily look
 	 * up the pages elsewhere.
 	 */
 	lastvecindex = -1;
 	while (entry->start < end) {
 
 		/*
 		 * check for contiguity
 		 */
 		current = entry;
 		entry = vm_map_entry_succ(current);
 		if (current->end < end &&
 		    entry->start > current->end) {
 			vm_map_unlock_read(map);
 			return (ENOMEM);
 		}
 
 		/*
 		 * ignore submaps (for now) or null objects
 		 */
 		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
 		    current->object.vm_object == NULL)
 			continue;
 
 		/*
 		 * limit this scan to the current map entry and the
 		 * limits for the mincore call
 		 */
 		if (addr < current->start)
 			addr = current->start;
 		cend = current->end;
 		if (cend > end)
 			cend = end;
 
 		for (; addr < cend; addr += PAGE_SIZE) {
 			/*
 			 * Check pmap first, it is likely faster, also
 			 * it can provide info as to whether we are the
 			 * one referencing or modifying the page.
 			 */
 			m = NULL;
 			object = NULL;
 retry:
 			pa = 0;
 			mincoreinfo = pmap_mincore(pmap, addr, &pa);
 			if (mincore_mapped) {
 				/*
 				 * We only care about this pmap's
 				 * mapping of the page, if any.
 				 */
 				;
 			} else if (pa != 0) {
 				/*
 				 * The page is mapped by this process but not
 				 * both accessed and modified.  It is also
 				 * managed.  Acquire the object lock so that
 				 * other mappings might be examined.  The page's
 				 * identity may change at any point before its
 				 * object lock is acquired, so re-validate if
 				 * necessary.
 				 */
 				m = PHYS_TO_VM_PAGE(pa);
 				while (object == NULL || m->object != object) {
 					if (object != NULL)
 						VM_OBJECT_WUNLOCK(object);
 					object = atomic_load_ptr(&m->object);
 					if (object == NULL)
 						goto retry;
 					VM_OBJECT_WLOCK(object);
 				}
 				if (pa != pmap_extract(pmap, addr))
 					goto retry;
 				KASSERT(vm_page_all_valid(m),
 				    ("mincore: page %p is mapped but invalid",
 				    m));
 			} else if (mincoreinfo == 0) {
 				/*
 				 * The page is not mapped by this process.  If
 				 * the object implements managed pages, then
 				 * determine if the page is resident so that
 				 * the mappings might be examined.
 				 */
 				if (current->object.vm_object != object) {
 					if (object != NULL)
 						VM_OBJECT_WUNLOCK(object);
 					object = current->object.vm_object;
 					VM_OBJECT_WLOCK(object);
 				}
 				if (object->type == OBJT_DEFAULT ||
 				    object->type == OBJT_SWAP ||
 				    object->type == OBJT_VNODE) {
 					pindex = OFF_TO_IDX(current->offset +
 					    (addr - current->start));
 					m = vm_page_lookup(object, pindex);
 					if (m != NULL && vm_page_none_valid(m))
 						m = NULL;
 					if (m != NULL)
 						mincoreinfo = MINCORE_INCORE;
 				}
 			}
 			if (m != NULL) {
 				VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 				/* Examine other mappings of the page. */
 				if (m->dirty == 0 && pmap_is_modified(m))
 					vm_page_dirty(m);
 				if (m->dirty != 0)
 					mincoreinfo |= MINCORE_MODIFIED_OTHER;
 
 				/*
 				 * The first test for PGA_REFERENCED is an
 				 * optimization.  The second test is
 				 * required because a concurrent pmap
 				 * operation could clear the last reference
 				 * and set PGA_REFERENCED before the call to
 				 * pmap_is_referenced().
 				 */
 				if ((m->a.flags & PGA_REFERENCED) != 0 ||
 				    pmap_is_referenced(m) ||
 				    (m->a.flags & PGA_REFERENCED) != 0)
 					mincoreinfo |= MINCORE_REFERENCED_OTHER;
 			}
 			if (object != NULL)
 				VM_OBJECT_WUNLOCK(object);
 
 			/*
 			 * subyte may page fault.  In case it needs to modify
 			 * the map, we release the lock.
 			 */
 			vm_map_unlock_read(map);
 
 			/*
 			 * calculate index into user supplied byte vector
 			 */
 			vecindex = atop(addr - first_addr);
 
 			/*
 			 * If we have skipped map entries, we need to make sure that
 			 * the byte vector is zeroed for those skipped entries.
 			 */
 			while ((lastvecindex + 1) < vecindex) {
 				++lastvecindex;
 				error = subyte(vec + lastvecindex, 0);
 				if (error) {
 					error = EFAULT;
 					goto done2;
 				}
 			}
 
 			/*
 			 * Pass the page information to the user
 			 */
 			error = subyte(vec + vecindex, mincoreinfo);
 			if (error) {
 				error = EFAULT;
 				goto done2;
 			}
 
 			/*
 			 * If the map has changed, due to the subyte, the previous
 			 * output may be invalid.
 			 */
 			vm_map_lock_read(map);
 			if (timestamp != map->timestamp)
 				goto RestartScan;
 
 			lastvecindex = vecindex;
 		}
 	}
 
 	/*
 	 * subyte may page fault.  In case it needs to modify
 	 * the map, we release the lock.
 	 */
 	vm_map_unlock_read(map);
 
 	/*
 	 * Zero the last entries in the byte vector.
 	 */
 	vecindex = atop(end - first_addr);
 	while ((lastvecindex + 1) < vecindex) {
 		++lastvecindex;
 		error = subyte(vec + lastvecindex, 0);
 		if (error) {
 			error = EFAULT;
 			goto done2;
 		}
 	}
 
 	/*
 	 * If the map has changed, due to the subyte, the previous
 	 * output may be invalid.
 	 */
 	vm_map_lock_read(map);
 	if (timestamp != map->timestamp)
 		goto RestartScan;
 	vm_map_unlock_read(map);
 	/* Reached with the map lock released on every path. */
 done2:
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 /*
  * Argument block handed to sys_mlock().  This local definition is only
  * compiled when _SYS_SYSPROTO_H_ has not been defined.
  */
 struct mlock_args {
 	const void *addr;	/* start of the range to wire */
 	size_t len;		/* length of the range */
 };
 #endif
 /*
  * System call entry point: wire the requested range on behalf of the
  * calling thread's process, using the thread's credentials.
  */
 int
 sys_mlock(struct thread *td, struct mlock_args *uap)
 {
 	uintptr_t uaddr;
 
 	uaddr = __DECONST(uintptr_t, uap->addr);
 	return (kern_mlock(td->td_proc, td->td_ucred, uaddr, uap->len));
 }
 
 /*
  * Wire the pages of "proc" that cover [addr0, addr0 + len).
  *
  * The request is refused with the error from priv_check_cred() when the
  * credential lacks PRIV_VM_MLOCK, with EINVAL when the range wraps, and with
  * ENOMEM when it exceeds vm_page_max_user_wired, the process's
  * RLIMIT_MEMLOCK, the RACCT_MEMLOCK accounting limit (RACCT kernels only),
  * or when vm_map_wire() itself fails.  Returns 0 on success.
  */
 int
 kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
 {
 	vm_offset_t first, last, pg_end, pg_start;
 	vm_size_t nbytes, pgcnt;
 	vm_map_t map;
 	unsigned long wired_total;
 	bool over_limit;
 	int rv;
 
 	rv = priv_check_cred(cred, PRIV_VM_MLOCK);
 	if (rv != 0)
 		return (rv);
 
 	/* Widen to whole pages and reject ranges that wrap around. */
 	first = addr0;
 	nbytes = len;
 	last = first + nbytes;
 	pg_start = trunc_page(first);
 	pg_end = round_page(last);
 	if (last < first || pg_end < first)
 		return (EINVAL);
 
 	pgcnt = atop(pg_end - pg_start);
 	if (pgcnt > vm_page_max_user_wired)
 		return (ENOMEM);
 
 	/*
 	 * Compare what would be wired after this call (already wired pages
 	 * plus the new ones) against the resource limit.
 	 */
 	map = &proc->p_vmspace->vm_map;
 	PROC_LOCK(proc);
 	wired_total = ptoa(pgcnt + pmap_wired_count(map->pmap));
 	over_limit = wired_total > lim_cur_proc(proc, RLIMIT_MEMLOCK);
 	PROC_UNLOCK(proc);
 	if (over_limit)
 		return (ENOMEM);
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(proc);
 		rv = racct_set(proc, RACCT_MEMLOCK, wired_total);
 		PROC_UNLOCK(proc);
 		if (rv != 0)
 			return (ENOMEM);
 	}
 #endif
 	rv = vm_map_wire(map, pg_start, pg_end,
 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 #ifdef RACCT
 	/* Wiring failed: reset the accounting to the real wired count. */
 	if (racct_enable && rv != KERN_SUCCESS) {
 		PROC_LOCK(proc);
 		racct_set(proc, RACCT_MEMLOCK,
 		    ptoa(pmap_wired_count(map->pmap)));
 		PROC_UNLOCK(proc);
 	}
 #endif
 	if (rv != KERN_SUCCESS)
 		return (ENOMEM);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 /*
  * Argument block handed to sys_mlockall().  This local definition is only
  * compiled when _SYS_SYSPROTO_H_ has not been defined.
  */
 struct mlockall_args {
 	int	how;	/* non-zero combination of MCL_CURRENT and MCL_FUTURE */
 };
 #endif
 
 int
 sys_mlockall(struct thread *td, struct mlockall_args *uap)
 {
 	vm_map_t map;
 	int error;
 
 	map = &td->td_proc->p_vmspace->vm_map;
 	error = priv_check(td, PRIV_VM_MLOCK);
 	if (error)
 		return (error);
 
 	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
 		return (EINVAL);
 
 	/*
 	 * If wiring all pages in the process would cause it to exceed
 	 * a hard resource limit, return ENOMEM.
 	 */
 	if (!old_mlock && uap->how & MCL_CURRENT) {
 		if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
 			return (ENOMEM);
 	}
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(td->td_proc);
 		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
 		PROC_UNLOCK(td->td_proc);
 		if (error != 0)
 			return (ENOMEM);
 	}
 #endif
 
 	if (uap->how & MCL_FUTURE) {
 		vm_map_lock(map);
 		vm_map_modflags(map, MAP_WIREFUTURE, 0);
 		vm_map_unlock(map);
 		error = 0;
 	}
 
 	if (uap->how & MCL_CURRENT) {
 		/*
 		 * P1003.1-2001 mandates that all currently mapped pages
 		 * will be memory resident and locked (wired) upon return
 		 * from mlockall(). vm_map_wire() will wire pages, by
 		 * calling vm_fault_wire() for each page in the region.
 		 */
 		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
 		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 		if (error == KERN_SUCCESS)
 			error = 0;
 		else if (error == KERN_RESOURCE_SHORTAGE)
 			error = ENOMEM;
 		else
 			error = EAGAIN;
 	}
 #ifdef RACCT
 	if (racct_enable && error != KERN_SUCCESS) {
 		PROC_LOCK(td->td_proc);
 		racct_set(td->td_proc, RACCT_MEMLOCK,
 		    ptoa(pmap_wired_count(map->pmap)));
 		PROC_UNLOCK(td->td_proc);
 	}
 #endif
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 /*
  * Argument block handed to sys_munlockall().  This local definition is only
  * compiled when _SYS_SYSPROTO_H_ has not been defined.
  */
 struct munlockall_args {
 	register_t dummy;	/* not read by sys_munlockall() */
 };
 #endif
 
 /*
  * System call entry point: clear MAP_WIREFUTURE on the calling process's map
  * and unwire every user-wired page in it.
  *
  * Returns the priv_check() error without PRIV_VM_MUNLOCK.  Otherwise the
  * KERN_* status of vm_map_unwire() is translated with vm_mmap_to_errno(), so
  * that callers receive 0 or an errno value rather than a raw Mach return
  * code (as kern_munlock() and sys_mlockall() already ensure for their own
  * results).
  */
 int
 sys_munlockall(struct thread *td, struct munlockall_args *uap)
 {
 	vm_map_t map;
 	int error, rv;
 
 	map = &td->td_proc->p_vmspace->vm_map;
 	error = priv_check(td, PRIV_VM_MUNLOCK);
 	if (error)
 		return (error);
 
 	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
 	vm_map_lock(map);
 	vm_map_modflags(map, 0, MAP_WIREFUTURE);
 	vm_map_unlock(map);
 
 	/* Forcibly unwire all pages. */
 	rv = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
 	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 #ifdef RACCT
 	/* Everything was unwired, so nothing is accounted as locked. */
 	if (racct_enable && rv == KERN_SUCCESS) {
 		PROC_LOCK(td->td_proc);
 		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
 		PROC_UNLOCK(td->td_proc);
 	}
 #endif
 
 	return (vm_mmap_to_errno(rv));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 /*
  * Argument block handed to sys_munlock().  This local definition is only
  * compiled when _SYS_SYSPROTO_H_ has not been defined.
  */
 struct munlock_args {
 	const void *addr;	/* start of the range to unwire */
 	size_t len;		/* length of the range */
 };
 #endif
 /*
  * System call entry point: unpack the argument block and let
  * kern_munlock() do the work.
  */
 int
 sys_munlock(struct thread *td, struct munlock_args *uap)
 {
 	uintptr_t uaddr;
 
 	uaddr = (uintptr_t)uap->addr;
 	return (kern_munlock(td, uaddr, uap->len));
 }
 
 int
 kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
 {
 	vm_offset_t addr, end, last, start;
 #ifdef RACCT
 	vm_map_t map;
 #endif
 	int error;
 
 	error = priv_check(td, PRIV_VM_MUNLOCK);
 	if (error)
 		return (error);
 	addr = addr0;
 	last = addr + size;
 	start = trunc_page(addr);
 	end = round_page(last);
 	if (last < addr || end < addr)
 		return (EINVAL);
 	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 #ifdef RACCT
 	if (racct_enable && error == KERN_SUCCESS) {
 		PROC_LOCK(td->td_proc);
 		map = &td->td_proc->p_vmspace->vm_map;
 		racct_set(td->td_proc, RACCT_MEMLOCK,
 		    ptoa(pmap_wired_count(map->pmap)));
 		PROC_UNLOCK(td->td_proc);
 	}
 #endif
 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
 }
 
 /*
  * vm_mmap_vnode()
  *
  * Helper function for vm_mmap.  Perform sanity check specific for mmap
  * operations on vnodes.
  *
  * On success stores the object to map in *objp (with a reference for the
  * caller), the possibly adjusted flags in *flagsp, may clear VM_PROT_WRITE
  * in *maxprotp, and sets *writecounted when the mapping was added to the
  * object's write count.  Only VREG vnodes with a VM object are accepted;
  * anything else yields EINVAL.  The vnode is locked shared with vget() for
  * the duration of the checks and released with vput() before returning.
  */
 int
 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
     struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
     boolean_t *writecounted)
 {
 	struct vattr va;
 	vm_object_t obj;
 	vm_ooffset_t foff;
 	struct ucred *cred;
 	int error, flags;
 	bool writex;
 
 	cred = td->td_ucred;
 	/* Only shared mappings that may become writable are write-counted. */
 	writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
 	    (*flagsp & MAP_SHARED) != 0;
-	if ((error = vget(vp, LK_SHARED, td)) != 0)
+	if ((error = vget(vp, LK_SHARED)) != 0)
 		return (error);
 	AUDIT_ARG_VNODE1(vp);
 	foff = *foffp;
 	flags = *flagsp;
 	obj = vp->v_object;
 	if (vp->v_type == VREG) {
 		/*
 		 * Get the proper underlying object
 		 */
 		if (obj == NULL) {
 			error = EINVAL;
 			goto done;
 		}
 		/*
 		 * The object is backed by a different vnode than the one we
 		 * were given: switch to the vnode recorded in the object.
 		 */
 		if (obj->type == OBJT_VNODE && obj->handle != vp) {
 			vput(vp);
 			vp = (struct vnode *)obj->handle;
 			/*
 			 * Bypass filesystems obey the mpsafety of the
 			 * underlying fs.  Tmpfs never bypasses.
 			 */
-			error = vget(vp, LK_SHARED, td);
+			error = vget(vp, LK_SHARED);
 			if (error != 0)
 				return (error);
 		}
 		if (writex) {
 			*writecounted = TRUE;
 			vm_pager_update_writecount(obj, 0, objsize);
 		}
 	} else {
 		error = EINVAL;
 		goto done;
 	}
 	if ((error = VOP_GETATTR(vp, &va, cred)))
 		goto done;
 #ifdef MAC
 	/* This relies on VM_PROT_* matching PROT_*. */
 	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
 	if (error != 0)
 		goto done;
 #endif
 	/*
 	 * Shared mappings of snapshot, immutable or append-only files can
 	 * never be written: fail a write request, otherwise cap maxprot.
 	 */
 	if ((flags & MAP_SHARED) != 0) {
 		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
 			if (prot & VM_PROT_WRITE) {
 				error = EPERM;
 				goto done;
 			}
 			*maxprotp &= ~VM_PROT_WRITE;
 		}
 	}
 	/*
 	 * If it is a regular file without any references
 	 * we do not need to sync it.
 	 * Adjust object size to be the size of actual file.
 	 */
 	objsize = round_page(va.va_size);
 	if (va.va_nlink == 0)
 		flags |= MAP_NOSYNC;
 	if (obj->type == OBJT_VNODE) {
 		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
 		    cred);
 		if (obj == NULL) {
 			error = ENOMEM;
 			goto done;
 		}
 	} else {
 		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
 		    ("wrong object type"));
 		vm_object_reference(obj);
 #if VM_NRESERVLEVEL > 0
 		if ((obj->flags & OBJ_COLORED) == 0) {
 			VM_OBJECT_WLOCK(obj);
 			vm_object_color(obj, 0);
 			VM_OBJECT_WUNLOCK(obj);
 		}
 #endif
 	}
 	*objp = obj;
 	*flagsp = flags;
 
 	VOP_MMAPPED(vp);
 
 done:
 	/*
 	 * Undo the write count taken above if we are failing.
 	 *
 	 * NOTE(review): by this point objsize may have been replaced by the
 	 * rounded file size, and obj is NULL when vm_pager_allocate() failed,
 	 * so this call may not mirror the earlier
 	 * vm_pager_update_writecount(obj, 0, objsize) -- confirm.
 	 */
 	if (error != 0 && *writecounted) {
 		*writecounted = FALSE;
 		vm_pager_update_writecount(obj, objsize, 0);
 	}
 	vput(vp);
 	return (error);
 }
 
 /*
  * vm_mmap_cdev()
  *
  * Helper for vm_mmap() that vets a mapping request against a character
  * device and produces the VM object to map.
  *
  * Devices flagged D_MMAP_ANON are turned into anonymous mappings (no
  * object, offset 0, VM_PROT_ALL, MAP_ANON added to *flagsp).  Otherwise
  * write access beyond *maxprotp yields EACCES, private/copy mappings yield
  * EINVAL, and the object comes from the driver's d_mmap_single() or, when
  * that reports ENODEV, from the device pager (EINVAL if it cannot allocate
  * one).  *flagsp gains MAP_SHARED only on the device pager path.
  */
 int
 vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
     vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
     vm_ooffset_t *foff, vm_object_t *objp)
 {
 	vm_object_t devobj;
 	int error, flags;
 
 	flags = *flagsp;
 
 	if ((dsw->d_flags & D_MMAP_ANON) != 0) {
 		*objp = NULL;
 		*foff = 0;
 		*maxprotp = VM_PROT_ALL;
 		*flagsp = flags | MAP_ANON;
 		return (0);
 	}
 
 	/* Writing requires that the maximum protection permits it. */
 	if ((prot & VM_PROT_WRITE) != 0 &&
 	    (*maxprotp & VM_PROT_WRITE) == 0)
 		return (EACCES);
 
 	/* cdevs do not provide private mappings of any kind. */
 	if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0)
 		return (EINVAL);
 #ifdef MAC_XXX
 	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
 	if (error != 0)
 		return (error);
 #endif
 	/*
 	 * Give the driver's d_mmap_single() the first shot.  Any result
 	 * other than ENODEV (which means "not implemented") is final; on
 	 * success the driver must hand back a referenced object.
 	 *
 	 * XXX assumes VM_PROT_* == PROT_*
 	 */
 	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
 	if (error != ENODEV)
 		return (error);
 
 	/* Fall back to the device pager. */
 	devobj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
 	    td->td_ucred);
 	if (devobj == NULL)
 		return (EINVAL);
 	*objp = devobj;
 	/* Device mappings are forced to be shared. */
 	*flagsp = flags | MAP_SHARED;
 	return (0);
 }
 
 /*
  * vm_mmap()
  *
  * In-kernel flavor of mmap used by exec, sys5 shared memory, and various
  * device drivers.  "handle" is interpreted according to "handle_type": a
  * character device for OBJT_DEVICE, a vnode for OBJT_VNODE, and it must be
  * NULL (MAP_ANON) for OBJT_DEFAULT.  Any other combination is EINVAL, as is
  * a zero size.  The object found for the handle is then mapped with
  * vm_mmap_object(); if that fails the object reference (and any write count
  * taken for it) is dropped again.
  */
 int
 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 	vm_prot_t maxprot, int flags,
 	objtype_t handle_type, void *handle,
 	vm_ooffset_t foff)
 {
 	struct thread *td = curthread;
 	struct cdevsw *dsw;
 	struct cdev *cdev;
 	vm_object_t object;
 	boolean_t writecounted;
 	int error, ref;
 
 	if (size == 0)
 		return (EINVAL);
 
 	size = round_page(size);
 	object = NULL;
 	writecounted = FALSE;
 
 	/*
 	 * Lookup/allocate object.
 	 */
 	switch (handle_type) {
 	case OBJT_DEVICE:
 		cdev = handle;
 		dsw = dev_refthread(cdev, &ref);
 		if (dsw == NULL)
 			return (ENXIO);
 		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
 		    dsw, &foff, &object);
 		dev_relthread(cdev, ref);
 		break;
 	case OBJT_VNODE:
 		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
 		    handle, &foff, &object, &writecounted);
 		break;
 	case OBJT_DEFAULT:
 		/* Anonymous memory takes no handle. */
 		error = (handle == NULL) ? 0 : EINVAL;
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	if (error != 0)
 		return (error);
 
 	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
 	    foff, writecounted, td);
 	if (error == 0 || object == NULL)
 		return (error);
 
 	/*
 	 * The mapping failed: give back the write count charged for it, if
 	 * any, and then the object reference itself.
 	 */
 	if (writecounted)
 		vm_pager_release_writecount(object, 0, size);
 	vm_object_deallocate(object);
 	return (error);
 }
 
 /*
  * Internal version of mmap that maps a specific VM object into an
  * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
  *
  * When "map" is the calling thread's own map, the request is first checked
  * against RLIMIT_VMEM and the RACCT_VMEM accounting (and, with
  * MAP_WIREFUTURE set and old_mlock clear, against RLIMIT_MEMLOCK and
  * RACCT_MEMLOCK).  The mmap flags are then translated into vm_map "docow"
  * flags and the object is entered either at a fixed address (MAP_FIXED) or
  * wherever vm_map_find*() places it; *addr is updated accordingly.
  *
  * Returns ENOMEM/EINVAL (or the racct_set() error) from the checks below,
  * otherwise the vm_map result translated by vm_mmap_to_errno().
  */
 int
 vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
     vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
     boolean_t writecounted, struct thread *td)
 {
 	boolean_t curmap, fitit;
 	vm_offset_t max_addr;
 	int docow, error, findspace, rv;
 
 	/* Resource limits only apply when mapping into our own map. */
 	curmap = map == &td->td_proc->p_vmspace->vm_map;
 	if (curmap) {
 		RACCT_PROC_LOCK(td->td_proc);
 		if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
 			RACCT_PROC_UNLOCK(td->td_proc);
 			return (ENOMEM);
 		}
 		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
 			RACCT_PROC_UNLOCK(td->td_proc);
 			return (ENOMEM);
 		}
 		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
 			/* The new mapping will be wired: check that too. */
 			if (ptoa(pmap_wired_count(map->pmap)) + size >
 			    lim_cur(td, RLIMIT_MEMLOCK)) {
 				/* Undo the RACCT_VMEM charge made above. */
 				racct_set_force(td->td_proc, RACCT_VMEM,
 				    map->size);
 				RACCT_PROC_UNLOCK(td->td_proc);
 				return (ENOMEM);
 			}
 			error = racct_set(td->td_proc, RACCT_MEMLOCK,
 			    ptoa(pmap_wired_count(map->pmap)) + size);
 			if (error != 0) {
 				/* Undo the RACCT_VMEM charge made above. */
 				racct_set_force(td->td_proc, RACCT_VMEM,
 				    map->size);
 				RACCT_PROC_UNLOCK(td->td_proc);
 				return (error);
 			}
 		}
 		RACCT_PROC_UNLOCK(td->td_proc);
 	}
 
 	/*
 	 * We currently can only deal with page aligned file offsets.
 	 * The mmap() system call already enforces this by subtracting
 	 * the page offset from the file offset, but checking here
 	 * catches errors in device drivers (e.g. d_single_mmap()
 	 * callbacks) and other internal mapping requests (such as in
 	 * exec).
 	 *
 	 * NOTE(review): this and the later EINVAL returns happen after the
 	 * racct charges above with no rollback visible here -- confirm
 	 * whether the accounting is corrected elsewhere.
 	 */
 	if (foff & PAGE_MASK)
 		return (EINVAL);
 
 	/*
 	 * Without MAP_FIXED the address is only a hint and is rounded up to
 	 * a page boundary; with MAP_FIXED it must already be page aligned.
 	 */
 	if ((flags & MAP_FIXED) == 0) {
 		fitit = TRUE;
 		*addr = round_page(*addr);
 	} else {
 		if (*addr != trunc_page(*addr))
 			return (EINVAL);
 		fitit = FALSE;
 	}
 
 	/* Translate the mmap flags into vm_map insertion ("cow") flags. */
 	if (flags & MAP_ANON) {
 		if (object != NULL || foff != 0)
 			return (EINVAL);
 		docow = 0;
 	} else if (flags & MAP_PREFAULT_READ)
 		docow = MAP_PREFAULT;
 	else
 		docow = MAP_PREFAULT_PARTIAL;
 
 	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
 		docow |= MAP_COPY_ON_WRITE;
 	if (flags & MAP_NOSYNC)
 		docow |= MAP_DISABLE_SYNCER;
 	if (flags & MAP_NOCORE)
 		docow |= MAP_DISABLE_COREDUMP;
 	/* Shared memory is also shared with children. */
 	if (flags & MAP_SHARED)
 		docow |= MAP_INHERIT_SHARE;
 	if (writecounted)
 		docow |= MAP_WRITECOUNT;
 	if (flags & MAP_STACK) {
 		/* Stack mappings cannot be backed by an object. */
 		if (object != NULL)
 			return (EINVAL);
 		docow |= MAP_STACK_GROWS_DOWN;
 	}
 	if ((flags & MAP_EXCL) != 0)
 		docow |= MAP_CHECK_EXCL;
 	if ((flags & MAP_GUARD) != 0)
 		docow |= MAP_CREATE_GUARD;
 
 	if (fitit) {
 		/* Pick the placement policy from the alignment request. */
 		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
 			findspace = VMFS_SUPER_SPACE;
 		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
 			findspace = VMFS_ALIGNED_SPACE(flags >>
 			    MAP_ALIGNMENT_SHIFT);
 		else
 			findspace = VMFS_OPTIMAL_SPACE;
 		max_addr = 0;
 #ifdef MAP_32BIT
 		if ((flags & MAP_32BIT) != 0)
 			max_addr = MAP_32BIT_MAX_ADDR;
 #endif
 		if (curmap) {
 			/*
 			 * For our own map, pass the page-rounded end of the
 			 * maximum data segment (vm_daddr + RLIMIT_DATA max)
 			 * as the minimum address to vm_map_find_min().
 			 */
 			rv = vm_map_find_min(map, object, foff, addr, size,
 			    round_page((vm_offset_t)td->td_proc->p_vmspace->
 			    vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr,
 			    findspace, prot, maxprot, docow);
 		} else {
 			rv = vm_map_find(map, object, foff, addr, size,
 			    max_addr, findspace, prot, maxprot, docow);
 		}
 	} else {
 		rv = vm_map_fixed(map, object, foff, *addr, size,
 		    prot, maxprot, docow);
 	}
 
 	if (rv == KERN_SUCCESS) {
 		/*
 		 * If the process has requested that all future mappings
 		 * be wired, then heed this.
 		 */
 		if ((map->flags & MAP_WIREFUTURE) != 0) {
 			vm_map_lock(map);
 			/* Re-check under the lock; the result is ignored. */
 			if ((map->flags & MAP_WIREFUTURE) != 0)
 				(void)vm_map_wire_locked(map, *addr,
 				    *addr + size, VM_MAP_WIRE_USER |
 				    ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
 				    VM_MAP_WIRE_NOHOLES));
 			vm_map_unlock(map);
 		}
 	}
 	return (vm_mmap_to_errno(rv));
 }
 
 /*
  * Map a Mach VM status code onto an errno value: success becomes 0, an
  * invalid address or lack of space becomes ENOMEM, a protection failure
  * becomes EACCES, and every other code becomes EINVAL.
  */
 int
 vm_mmap_to_errno(int rv)
 {
 
 	if (rv == KERN_SUCCESS)
 		return (0);
 	if (rv == KERN_INVALID_ADDRESS || rv == KERN_NO_SPACE)
 		return (ENOMEM);
 	if (rv == KERN_PROTECTION_FAILURE)
 		return (EACCES);
 	return (EINVAL);
 }
Index: projects/clang1100-import/sys/vm/vm_pageout.c
===================================================================
--- projects/clang1100-import/sys/vm/vm_pageout.c	(revision 364278)
+++ projects/clang1100-import/sys/vm/vm_pageout.c	(revision 364279)
@@ -1,2406 +1,2406 @@
 /*-
  * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2005 Yahoo! Technologies Norway AS
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	The proverbial page-out daemon.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/blockcount.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/mount.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/smp.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_pagequeue.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 /*
  * System initialization
  */
 
 /* Forward declarations; vm_pageout() is the body of the kernel process. */
 static void vm_pageout(void);
 static void vm_pageout_init(void);
 static int vm_pageout_clean(vm_page_t m, int *numpagedout);
 static int vm_pageout_cluster(vm_page_t m);
 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
     int starting_page_shortage);
 
 /* Register vm_pageout_init() at SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST. */
 SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
     NULL);
 
 /* Filled in through page_kp below. */
 struct proc *pageproc;
 
 /*
  * Descriptor passed to kproc_start(): process name, entry function and the
  * location that receives the process pointer.
  */
 static struct kproc_desc page_kp = {
 	"pagedaemon",
 	vm_pageout,
 	&pageproc
 };
 /* Same subsystem as above, but ordered after vm_pageout_init(). */
 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
     &page_kp);
 
 SDT_PROVIDER_DEFINE(vm);
 SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
 
 /* Pagedaemon activity rates, in subdivisions of one second. */
 #define	VM_LAUNDER_RATE		10
 #define	VM_INACT_SCAN_RATE	10
 
 /* Exported below as vm.pageout_oom_seq. */
 static int vm_pageout_oom_seq = 12;
 
 /* Exported as vm.pageout_update_period. */
 static int vm_pageout_update_period;
 /* Exported below as vm.disable_swapspace_pageouts. */
 static int disable_swap_pageouts;
 /* Exported below as vm.lowmem_period. */
 static int lowmem_period = 10;
 static int swapdev_enabled;
 
 /* Exported as vm.panic_on_oom. */
 static int vm_panic_on_oom = 0;
 
 SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
 	CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
 	"Panic on the given number of out-of-memory errors instead of killing the largest process");
 
 SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
 	CTLFLAG_RWTUN, &vm_pageout_update_period, 0,
 	"Maximum active LRU update period");
 
 /* Access with get_pageout_threads_per_domain(). */
 static int pageout_threads_per_domain = 1;
 SYSCTL_INT(_vm, OID_AUTO, pageout_threads_per_domain, CTLFLAG_RDTUN,
     &pageout_threads_per_domain, 0,
     "Number of worker threads comprising each per-domain pagedaemon");
   
 SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0,
 	"Low memory callback period");
 
 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
 	CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
 
 /* Read-only counter exported as vm.pageout_lock_miss. */
 static int pageout_lock_miss;
 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
 
 SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
 	CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0,
 	"back-to-back calls to oom detector to start OOM");
 
 /* Exported as vm.act_scan_laundry_weight. */
 static int act_scan_laundry_weight = 3;
 SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN,
     &act_scan_laundry_weight, 0,
     "weight given to clean vs. dirty pages in active queue scans");
 
 /* Exported as vm.background_launder_rate (kilobytes per second). */
 static u_int vm_background_launder_rate = 4096;
 SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN,
     &vm_background_launder_rate, 0,
     "background laundering rate, in kilobytes per second");
 
 /* Exported as vm.background_launder_max (kilobytes). */
 static u_int vm_background_launder_max = 20 * 1024;
 SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN,
     &vm_background_launder_max, 0, "background laundering cap, in kilobytes");
 
 /* Bounds the cluster size built by vm_pageout_cluster(). */
 int vm_pageout_page_count = 32;
 
 /* Exported as vm.max_user_wired; a page count. */
 u_long vm_page_max_user_wired;
 SYSCTL_ULONG(_vm, OID_AUTO, max_user_wired, CTLFLAG_RW,
     &vm_page_max_user_wired, 0,
     "system-wide limit to user-wired page count");
 
 /* Forward declarations. */
 static u_int isqrt(u_int num);
 static int vm_pageout_launder(struct vm_domain *vmd, int launder,
     bool in_shortfall);
 static void vm_pageout_laundry_worker(void *arg);
 
 /*
  * Bookkeeping for one scan of a page queue.  Initialized by
  * vm_pageout_init_scan() and consumed by vm_pageout_collect_batch(),
  * vm_pageout_next() and vm_pageout_end_scan().
  */
 struct scan_state {
 	struct vm_batchqueue bq;	/* pages collected but not yet handed out */
 	struct vm_pagequeue *pq;	/* the queue being scanned */
 	vm_page_t	marker;		/* marker page recording the scan position */
 	int		maxscan;	/* stop collecting once scanned reaches this */
 	int		scanned;	/* queue entries visited so far */
 };
 
 /*
  * Begin a scan of page queue "pq": record the scan parameters in "ss" and
  * link the marker page into the queue, either at the head (after == NULL)
  * or immediately behind "after".  The caller must hold the page queue lock;
  * it is released before returning.
  */
 static void
 vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq,
     vm_page_t marker, vm_page_t after, int maxscan)
 {
 
 	vm_pagequeue_assert_locked(pq);
 	KASSERT((marker->a.flags & PGA_ENQUEUED) == 0,
 	    ("marker %p already enqueued", marker));
 
 	/* Reset the scan bookkeeping. */
 	vm_batchqueue_init(&ss->bq);
 	ss->scanned = 0;
 	ss->maxscan = maxscan;
 	ss->marker = marker;
 	ss->pq = pq;
 
 	/* Place the marker at the requested starting point. */
 	if (after != NULL) {
 		TAILQ_INSERT_AFTER(&pq->pq_pl, after, marker, plinks.q);
 	} else {
 		TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q);
 	}
 	vm_page_aflag_set(marker, PGA_ENQUEUED);
 
 	vm_pagequeue_unlock(pq);
 }
 
 static void
 vm_pageout_end_scan(struct scan_state *ss)
 {
 	struct vm_pagequeue *pq;
 
 	pq = ss->pq;
 	vm_pagequeue_assert_locked(pq);
 	KASSERT((ss->marker->a.flags & PGA_ENQUEUED) != 0,
 	    ("marker %p not enqueued", ss->marker));
 
 	TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q);
 	vm_page_aflag_clear(ss->marker, PGA_ENQUEUED);
 	pq->pq_pdpages += ss->scanned;
 }
 
 /*
  * Add a small number of queued pages to a batch queue for later processing
  * without the corresponding queue lock held.  The caller must have enqueued a
  * marker page at the desired start point for the scan.  Pages will be
  * physically dequeued if the caller so requests.  Otherwise, the returned
  * batch may contain marker pages, and it is up to the caller to handle them.
  *
  * When processing the batch queue, vm_pageout_defer() must be used to
  * determine whether the page has been logically dequeued since the batch was
  * collected.
  *
  * The page queue lock is taken and released inside this function.
  */
 static __always_inline void
 vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue)
 {
 	struct vm_pagequeue *pq;
 	vm_page_t m, marker, n;
 
 	marker = ss->marker;
 	pq = ss->pq;
 
 	KASSERT((marker->a.flags & PGA_ENQUEUED) != 0,
 	    ("marker %p not enqueued", ss->marker));
 
 	vm_pagequeue_lock(pq);
 	/*
 	 * Walk forward from the marker until the queue ends, the scan budget
 	 * (maxscan) is used up, or the batch is full.  Every entry visited,
 	 * including skipped markers, counts towards ss->scanned.
 	 */
 	for (m = TAILQ_NEXT(marker, plinks.q); m != NULL &&
 	    ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE;
 	    m = n, ss->scanned++) {
 		/* Fetch the successor first: m may be unlinked below. */
 		n = TAILQ_NEXT(m, plinks.q);
 		if ((m->flags & PG_MARKER) == 0) {
 			KASSERT((m->a.flags & PGA_ENQUEUED) != 0,
 			    ("page %p not enqueued", m));
 			KASSERT((m->flags & PG_FICTITIOUS) == 0,
 			    ("Fictitious page %p cannot be in page queue", m));
 			KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 			    ("Unmanaged page %p cannot be in page queue", m));
 		} else if (dequeue)
 			/* Marker pages are never dequeued into the batch. */
 			continue;
 
 		(void)vm_batchqueue_insert(&ss->bq, m);
 		if (dequeue) {
 			TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
 			vm_page_aflag_clear(m, PGA_ENQUEUED);
 		}
 	}
 	/*
 	 * Move our marker to just before the first entry that was not
 	 * visited, or to the tail if the whole queue was consumed.
 	 */
 	TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q);
 	if (__predict_true(m != NULL))
 		TAILQ_INSERT_BEFORE(m, marker, plinks.q);
 	else
 		TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q);
 	/* Dequeued pages no longer count towards the queue length. */
 	if (dequeue)
 		vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt);
 	vm_pagequeue_unlock(pq);
 }
 
 /*
  * Hand out the next page of the scan, refilling the batch from the page
  * queue first when it has run empty.  The result is whatever
  * vm_batchqueue_pop() yields, i.e. NULL once the scan is complete.
  */
 static __always_inline vm_page_t
 vm_pageout_next(struct scan_state *ss, const bool dequeue)
 {
 	struct vm_batchqueue *batch;
 
 	batch = &ss->bq;
 	if (batch->bq_cnt == 0)
 		vm_pageout_collect_batch(ss, dequeue);
 	return (vm_batchqueue_pop(batch));
 }
 
 /*
  * Decide whether the scan should leave a page alone for now.  Returns true
  * when the page's queue index or enqueued state no longer matches what the
  * caller expects, or when a queue operation is still pending on the page; in
  * the latter case the pending operation is submitted before returning.
  * Returns false when the page can be processed.
  */
 static __always_inline bool
 vm_pageout_defer(vm_page_t m, const uint8_t queue, const bool enqueued)
 {
 	vm_page_astate_t state;
 	bool on_queue;
 
 	state = vm_page_astate_load(m);
 	on_queue = (state.flags & PGA_ENQUEUED) != 0;
 
 	/* The page's queue state changed after the batch was collected. */
 	if (__predict_false(state.queue != queue || on_queue != enqueued))
 		return (true);
 
 	/* Nothing outstanding: the caller may go ahead. */
 	if ((state.flags & PGA_QUEUE_OP_MASK) == 0)
 		return (false);
 
 	/* Push the pending queue operation through and skip the page. */
 	vm_page_pqbatch_submit(m, queue);
 	return (true);
 }
 
 /*
  * Scan for pages at adjacent offsets within the given page's object that are
  * eligible for laundering, form a cluster of these pages and the given page,
  * and launder that cluster.
  *
  * The object must be write-locked and the page exclusively busied, as the
  * assertions below check.  A neighboring page joins the cluster only if it
  * can be exclusively busied, is not wired, is dirty, is in the laundry queue,
  * and has its write mappings removed successfully.  The cluster holds at
  * most vm_pageout_page_count pages.
  *
  * Returns the value produced by vm_pageout_flush() for the cluster.
  */
 static int
 vm_pageout_cluster(vm_page_t m)
 {
 	vm_object_t object;
 	/*
 	 * mc[] is twice the maximum cluster size: the given page sits in the
 	 * middle slot and the cluster may grow in either direction from it.
 	 */
 	vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps;
 	vm_pindex_t pindex;
 	int ib, is, page_base, pageout_count;
 
 	object = m->object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	pindex = m->pindex;
 
 	vm_page_assert_xbusied(m);
 
 	/*
 	 * pb and ps track the lowest and highest pages of the cluster.  ib
 	 * and is are the distances from m of the next backward and forward
 	 * candidates; ib == 0 means the reverse scan is finished for good.
 	 */
 	mc[vm_pageout_page_count] = pb = ps = m;
 	pageout_count = 1;
 	page_base = vm_pageout_page_count;
 	ib = 1;
 	is = 1;
 
 	/*
 	 * We can cluster only if the page is not clean, busy, or held, and
 	 * the page is in the laundry queue.
 	 *
 	 * During heavy mmap/modification loads the pageout
 	 * daemon can really fragment the underlying file
 	 * due to flushing pages out of order and not trying to
 	 * align the clusters (which leaves sporadic out-of-order
 	 * holes).  To solve this problem we do the reverse scan
 	 * first and attempt to align our cluster, then do a
 	 * forward scan if room remains.
 	 */
 more:
 	while (ib != 0 && pageout_count < vm_pageout_page_count) {
 		/* Do not step before the start of the object. */
 		if (ib > pindex) {
 			ib = 0;
 			break;
 		}
 		if ((p = vm_page_prev(pb)) == NULL ||
 		    vm_page_tryxbusy(p) == 0) {
 			ib = 0;
 			break;
 		}
 		/* From here on p is busied and must be unbusied on rejection. */
 		if (vm_page_wired(p)) {
 			ib = 0;
 			vm_page_xunbusy(p);
 			break;
 		}
 		vm_page_test_dirty(p);
 		if (p->dirty == 0) {
 			ib = 0;
 			vm_page_xunbusy(p);
 			break;
 		}
 		if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) {
 			vm_page_xunbusy(p);
 			ib = 0;
 			break;
 		}
 		mc[--page_base] = pb = p;
 		++pageout_count;
 		++ib;
 
 		/*
 		 * We are at an alignment boundary.  Stop here, and switch
 		 * directions.  Do not clear ib.
 		 */
 		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
 			break;
 	}
 	/* Forward scan: extend the cluster towards the end of the object. */
 	while (pageout_count < vm_pageout_page_count &&
 	    pindex + is < object->size) {
 		if ((p = vm_page_next(ps)) == NULL ||
 		    vm_page_tryxbusy(p) == 0)
 			break;
 		if (vm_page_wired(p)) {
 			vm_page_xunbusy(p);
 			break;
 		}
 		vm_page_test_dirty(p);
 		if (p->dirty == 0) {
 			vm_page_xunbusy(p);
 			break;
 		}
 		if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) {
 			vm_page_xunbusy(p);
 			break;
 		}
 		mc[page_base + pageout_count] = ps = p;
 		++pageout_count;
 		++is;
 	}
 
 	/*
 	 * If we exhausted our forward scan, continue with the reverse scan
 	 * when possible, even past an alignment boundary.  This catches
 	 * boundary conditions.
 	 */
 	if (ib != 0 && pageout_count < vm_pageout_page_count)
 		goto more;
 
 	return (vm_pageout_flush(&mc[page_base], pageout_count,
 	    VM_PAGER_PUT_NOREUSE, 0, NULL, NULL));
 }
 
 /*
  * vm_pageout_flush() - launder the given pages
  *
  *	The given pages are laundered.  Note that we setup for the start of
  *	I/O ( i.e. busy the page ), mark it read-only, and bump the object
  *	reference count all in here rather than in the parent.  If we want
  *	the parent to do more sophisticated things we may have to change
  *	the ordering.
  *
  *	All pages in mc[] are taken to belong to the object of mc[0], which
  *	must be write-locked.  prunlen and eio are optional and may be NULL.
  *
  *	Returned runlen is the count of pages between mreq and first
  *	page after mreq with status VM_PAGER_AGAIN.
  *	*eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL
  *	for any page in runlen set.
  *
  *	The return value counts the pages whose status was VM_PAGER_OK or
  *	VM_PAGER_PEND, plus pages of swap objects that failed with
  *	VM_PAGER_FAIL and were handed to vm_page_unswappable().
  */
 int
 vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
     boolean_t *eio)
 {
 	vm_object_t object = mc[0]->object;
 	int pageout_status[count];
 	int numpagedout = 0;
 	int i, runlen;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * Initiate I/O.  Mark the pages shared busy and verify that they're
 	 * valid and read-only.
 	 *
 	 * We do not have to fixup the clean/dirty bits here... we can
 	 * allow the pager to do it after the I/O completes.
 	 *
 	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
 	 * edge case with file fragments.
 	 */
 	for (i = 0; i < count; i++) {
 		KASSERT(vm_page_all_valid(mc[i]),
 		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
 			mc[i], i, count));
 		KASSERT((mc[i]->a.flags & PGA_WRITEABLE) == 0,
 		    ("vm_pageout_flush: writeable page %p", mc[i]));
 		vm_page_busy_downgrade(mc[i]);
 	}
 	/* One paging-in-progress reference per page; dropped per page below. */
 	vm_object_pip_add(object, count);
 
 	vm_pager_put_pages(object, mc, count, flags, pageout_status);
 
 	/* Start with the longest possible run; VM_PAGER_AGAIN shortens it. */
 	runlen = count - mreq;
 	if (eio != NULL)
 		*eio = FALSE;
 	for (i = 0; i < count; i++) {
 		vm_page_t mt = mc[i];
 
 		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
 		    !pmap_page_is_write_mapped(mt),
 		    ("vm_pageout_flush: page %p is not write protected", mt));
 		switch (pageout_status[i]) {
 		case VM_PAGER_OK:
 			/*
 			 * The page may have moved since laundering started, in
 			 * which case it should be left alone.
 			 */
 			if (vm_page_in_laundry(mt))
 				vm_page_deactivate_noreuse(mt);
 			/* FALLTHROUGH */
 		case VM_PAGER_PEND:
 			numpagedout++;
 			break;
 		case VM_PAGER_BAD:
 			/*
 			 * The page is outside the object's range.  We pretend
 			 * that the page out worked and clean the page, so the
 			 * changes will be lost if the page is reclaimed by
 			 * the page daemon.
 			 */
 			vm_page_undirty(mt);
 			if (vm_page_in_laundry(mt))
 				vm_page_deactivate_noreuse(mt);
 			break;
 		case VM_PAGER_ERROR:
 		case VM_PAGER_FAIL:
 			/*
 			 * If the page couldn't be paged out to swap because the
 			 * pager wasn't able to find space, place the page in
 			 * the PQ_UNSWAPPABLE holding queue.  This is an
 			 * optimization that prevents the page daemon from
 			 * wasting CPU cycles on pages that cannot be reclaimed
 			 * because no swap device is configured.
 			 *
 			 * Otherwise, reactivate the page so that it doesn't
 			 * clog the laundry and inactive queues.  (We will try
 			 * paging it out again later.)
 			 */
 			if (object->type == OBJT_SWAP &&
 			    pageout_status[i] == VM_PAGER_FAIL) {
 				vm_page_unswappable(mt);
 				numpagedout++;
 			} else
 				vm_page_activate(mt);
 			/* Only errors inside the current run are reported. */
 			if (eio != NULL && i >= mreq && i - mreq < runlen)
 				*eio = TRUE;
 			break;
 		case VM_PAGER_AGAIN:
 			/* Truncate the run at the first such page at or after mreq. */
 			if (i >= mreq && i - mreq < runlen)
 				runlen = i - mreq;
 			break;
 		}
 
 		/*
 		 * If the operation is still going, leave the page busy to
 		 * block all other accesses. Also, leave the paging in
 		 * progress indicator set so that we don't attempt an object
 		 * collapse.
 		 */
 		if (pageout_status[i] != VM_PAGER_PEND) {
 			vm_object_pip_wakeup(object);
 			vm_page_sunbusy(mt);
 		}
 	}
 	if (prunlen != NULL)
 		*prunlen = runlen;
 	return (numpagedout);
 }
 
 /*
  * "swapon" event handler: record that a swap device is enabled.  Both
  * arguments are unused.  The flag is read with acquire semantics by
  * vm_pageout_launder().
  */
 static void
 vm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused)
 {
 
 	atomic_store_rel_int(&swapdev_enabled, 1);
 }
 
 /*
  * "swapoff" event handler: clear the swap-enabled flag when the swap pager
  * reports exactly one device.  Both arguments are unused.
  * NOTE(review): presumably the handler runs before the departing device is
  * removed from the count, so a count of one means the last device is going
  * away -- confirm against the swap pager.
  */
 static void
 vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused)
 {
 
 	if (swap_pager_nswapdev() != 1)
 		return;
 	atomic_store_rel_int(&swapdev_enabled, 0);
 }
 
 /*
  * Attempt to acquire all of the necessary locks to launder a page and
  * then call through the clustering layer to PUTPAGES.  Wait a short
  * time for a vnode lock.
  *
  * Requires the page and object lock on entry, releases both before return.
  * Returns 0 on success and an errno otherwise.
  *
  * NOTE(review): the code below only asserts the object write lock; the
  * page appears to be passed in exclusively busied rather than locked (it is
  * xunbusied on the vnode path and on failure) -- confirm and update the
  * sentence above.
  *
  * On return *numpagedout holds the result of vm_pageout_cluster() if the
  * clustering layer was reached; it is left untouched on earlier failures.
  * Errors: EDEADLK when the vnode or its mount could not be acquired without
  * waiting too long, ENOENT when the vnode and object were disassociated,
  * ENXIO when the page changed identity or state while unlocked, EBUSY when
  * the page was busied or its write mappings could not be removed, and EIO
  * when no page was paged out.
  */
 static int
 vm_pageout_clean(vm_page_t m, int *numpagedout)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	vm_object_t object;
 	vm_pindex_t pindex;
 	int error, lockmode;
 
 	object = m->object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	error = 0;
 	vp = NULL;
 	mp = NULL;
 
 	/*
 	 * The object is already known NOT to be dead.   It
 	 * is possible for the vget() to block the whole
 	 * pageout daemon, but the new low-memory handling
 	 * code should prevent it.
 	 *
 	 * We can't wait forever for the vnode lock, we might
 	 * deadlock due to a vn_read() getting stuck in
 	 * vm_wait while holding this vnode.  We skip the
 	 * vnode if we can't get it in a reasonable amount
 	 * of time.
 	 */
 	if (object->type == OBJT_VNODE) {
 		/*
 		 * The page cannot stay busied while the object lock is
 		 * dropped below; it is re-busied once the locks are held.
 		 */
 		vm_page_xunbusy(m);
 		vp = object->handle;
 		if (vp->v_type == VREG &&
 		    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 			mp = NULL;
 			error = EDEADLK;
 			goto unlock_all;
 		}
 		KASSERT(mp != NULL,
 		    ("vp %p with NULL v_mount", vp));
 		/* Keep the object alive across the unlocked window. */
 		vm_object_reference_locked(object);
 		pindex = m->pindex;
 		VM_OBJECT_WUNLOCK(object);
 		lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
 		    LK_SHARED : LK_EXCLUSIVE;
-		if (vget(vp, lockmode | LK_TIMELOCK, curthread)) {
+		if (vget(vp, lockmode | LK_TIMELOCK)) {
 			vp = NULL;
 			error = EDEADLK;
 			goto unlock_mp;
 		}
 		VM_OBJECT_WLOCK(object);
 
 		/*
 		 * Ensure that the object and vnode were not disassociated
 		 * while locks were dropped.
 		 */
 		if (vp->v_object != object) {
 			error = ENOENT;
 			goto unlock_all;
 		}
 
 		/*
 		 * While the object was unlocked, the page may have been:
 		 * (1) moved to a different queue,
 		 * (2) reallocated to a different object,
 		 * (3) reallocated to a different offset, or
 		 * (4) cleaned.
 		 */
 		if (!vm_page_in_laundry(m) || m->object != object ||
 		    m->pindex != pindex || m->dirty == 0) {
 			error = ENXIO;
 			goto unlock_all;
 		}
 
 		/*
 		 * The page may have been busied while the object lock was
 		 * released.
 		 */
 		if (vm_page_tryxbusy(m) == 0) {
 			error = EBUSY;
 			goto unlock_all;
 		}
 	}
 
 	/*
 	 * Remove all writeable mappings, failing if the page is wired.
 	 */
 	if (!vm_page_try_remove_write(m)) {
 		vm_page_xunbusy(m);
 		error = EBUSY;
 		goto unlock_all;
 	}
 
 	/*
 	 * If a page is dirty, then it is either being washed
 	 * (but not yet cleaned) or it is still in the
 	 * laundry.  If it is still in the laundry, then we
 	 * start the cleaning operation.
 	 */
 	if ((*numpagedout = vm_pageout_cluster(m)) == 0)
 		error = EIO;
 
 unlock_all:
 	VM_OBJECT_WUNLOCK(object);
 
 unlock_mp:
 	/*
 	 * mp is non-NULL only on the vnode path after vn_start_write()
 	 * succeeded, in which case an object reference was also taken.  vp is
 	 * reset to NULL when vget() failed, so vput() is skipped then.
 	 */
 	if (mp != NULL) {
 		if (vp != NULL)
 			vput(vp);
 		vm_object_deallocate(object);
 		vn_finished_write(mp);
 	}
 
 	return (error);
 }
 
 /*
  * Attempt to launder the specified number of pages.
  *
  * "launder" is the target for this call and "in_shortfall" tells whether the
  * laundering was requested to relieve a page shortage.  When it was not
  * (background laundering), pages reactivated during the scan also count
  * towards the target.
  *
  * Returns the number of pages successfully laundered, computed as the
  * starting target minus whatever remains of it.
  */
 static int
 vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
 {
 	struct scan_state ss;
 	struct vm_pagequeue *pq;
 	vm_object_t object;
 	vm_page_t m, marker;
 	vm_page_astate_t new, old;
 	int act_delta, error, numpagedout, queue, refs, starting_target;
 	int vnodes_skipped;
 	bool pageout_ok;
 
 	object = NULL;
 	starting_target = launder;
 	vnodes_skipped = 0;
 
 	/*
 	 * Scan the laundry queues for pages eligible to be laundered.  We stop
 	 * once the target number of dirty pages have been laundered, or once
 	 * we've reached the end of the queue.  A single iteration of this loop
 	 * may cause more than one page to be laundered because of clustering.
 	 *
 	 * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no
 	 * swap devices are configured.
 	 */
 	if (atomic_load_acq_int(&swapdev_enabled))
 		queue = PQ_UNSWAPPABLE;
 	else
 		queue = PQ_LAUNDRY;
 
 scan:
 	marker = &vmd->vmd_markers[queue];
 	pq = &vmd->vmd_pagequeues[queue];
 	vm_pagequeue_lock(pq);
 	vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
 	while (launder > 0 && (m = vm_pageout_next(&ss, false)) != NULL) {
 		/* Pages are not dequeued, so the batch may hold markers. */
 		if (__predict_false((m->flags & PG_MARKER) != 0))
 			continue;
 
 		/*
 		 * Don't touch a page that was removed from the queue after the
 		 * page queue lock was released.  Otherwise, ensure that any
 		 * pending queue operations, such as dequeues for wired pages,
 		 * are handled.
 		 */
 		if (vm_pageout_defer(m, queue, true))
 			continue;
 
 		/*
 		 * Lock the page's object.
 		 */
 		if (object == NULL || object != m->object) {
 			if (object != NULL)
 				VM_OBJECT_WUNLOCK(object);
 			object = atomic_load_ptr(&m->object);
 			if (__predict_false(object == NULL))
 				/* The page is being freed by another thread. */
 				continue;
 
 			/* Depends on type-stability. */
 			VM_OBJECT_WLOCK(object);
 			if (__predict_false(m->object != object)) {
 				VM_OBJECT_WUNLOCK(object);
 				object = NULL;
 				continue;
 			}
 		}
 
 		if (vm_page_tryxbusy(m) == 0)
 			continue;
 
 		/*
 		 * Check for wirings now that we hold the object lock and have
 		 * exclusively busied the page.  If the page is mapped, it may
 		 * still be wired by pmap lookups.  The call to
 		 * vm_page_try_remove_all() below atomically checks for such
 		 * wirings and removes mappings.  If the page is unmapped, the
 		 * wire count is guaranteed not to increase after this check.
 		 */
 		if (__predict_false(vm_page_wired(m)))
 			goto skip_page;
 
 		/*
 		 * Invalid pages can be easily freed.  They cannot be
 		 * mapped; vm_page_free() asserts this.
 		 */
 		if (vm_page_none_valid(m))
 			goto free_page;
 
 		refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0;
 
 		/*
 		 * Retry loop: vm_page_pqstate_commit() failing sends us back
 		 * to the top via "continue" to recompute the new state from
 		 * "old" (presumably refreshed by the failed commit).
 		 */
 		for (old = vm_page_astate_load(m);;) {
 			/*
 			 * Check to see if the page has been removed from the
 			 * queue since the first such check.  Leave it alone if
 			 * so, discarding any references collected by
 			 * pmap_ts_referenced().
 			 */
 			if (__predict_false(_vm_page_queue(old) == PQ_NONE))
 				goto skip_page;
 
 			new = old;
 			act_delta = refs;
 			if ((old.flags & PGA_REFERENCED) != 0) {
 				new.flags &= ~PGA_REFERENCED;
 				act_delta++;
 			}
 			if (act_delta == 0) {
 				/* Unreferenced: fall out and try to launder. */
 				;
 			} else if (object->ref_count != 0) {
 				/*
 				 * Increase the activation count if the page was
 				 * referenced while in the laundry queue.  This
 				 * makes it less likely that the page will be
 				 * returned prematurely to the laundry queue.
 				 */
 				new.act_count += ACT_ADVANCE +
 				    act_delta;
 				if (new.act_count > ACT_MAX)
 					new.act_count = ACT_MAX;
 
 				new.flags &= ~PGA_QUEUE_OP_MASK;
 				new.flags |= PGA_REQUEUE;
 				new.queue = PQ_ACTIVE;
 				if (!vm_page_pqstate_commit(m, &old, new))
 					continue;
 
 				/*
 				 * If this was a background laundering, count
 				 * activated pages towards our target.  The
 				 * purpose of background laundering is to ensure
 				 * that pages are eventually cycled through the
 				 * laundry queue, and an activation is a valid
 				 * way out.
 				 */
 				if (!in_shortfall)
 					launder--;
 				VM_CNT_INC(v_reactivated);
 				goto skip_page;
 			} else if ((object->flags & OBJ_DEAD) == 0) {
 				/*
 				 * Referenced page of an unreferenced, live
 				 * object: requeue it in its current queue.
 				 */
 				new.flags |= PGA_REQUEUE;
 				if (!vm_page_pqstate_commit(m, &old, new))
 					continue;
 				goto skip_page;
 			}
 			break;
 		}
 
 		/*
 		 * If the page appears to be clean at the machine-independent
 		 * layer, then remove all of its mappings from the pmap in
 		 * anticipation of freeing it.  If, however, any of the page's
 		 * mappings allow write access, then the page may still be
 		 * modified until the last of those mappings are removed.
 		 */
 		if (object->ref_count != 0) {
 			vm_page_test_dirty(m);
 			if (m->dirty == 0 && !vm_page_try_remove_all(m))
 				goto skip_page;
 		}
 
 		/*
 		 * Clean pages are freed, and dirty pages are paged out unless
 		 * they belong to a dead object.  Requeueing dirty pages from
 		 * dead objects is pointless, as they are being paged out and
 		 * freed by the thread that destroyed the object.
 		 */
 		if (m->dirty == 0) {
 free_page:
 			/*
 			 * Now we are guaranteed that no other threads are
 			 * manipulating the page, check for a last-second
 			 * reference.
 			 */
 			if (vm_pageout_defer(m, queue, true))
 				goto skip_page;
 			vm_page_free(m);
 			VM_CNT_INC(v_dfree);
 		} else if ((object->flags & OBJ_DEAD) == 0) {
 			/*
 			 * Pageouts are always allowed for objects that are
 			 * neither swap nor default objects; for those two
 			 * types they are subject to disable_swap_pageouts.
 			 */
 			if (object->type != OBJT_SWAP &&
 			    object->type != OBJT_DEFAULT)
 				pageout_ok = true;
 			else if (disable_swap_pageouts)
 				pageout_ok = false;
 			else
 				pageout_ok = true;
 			if (!pageout_ok) {
 				vm_page_launder(m);
 				goto skip_page;
 			}
 
 			/*
 			 * Form a cluster with adjacent, dirty pages from the
 			 * same object, and page out that entire cluster.
 			 *
 			 * The adjacent, dirty pages must also be in the
 			 * laundry.  However, their mappings are not checked
 			 * for new references.  Consequently, a recently
 			 * referenced page may be paged out.  However, that
 			 * page will not be prematurely reclaimed.  After page
 			 * out, the page will be placed in the inactive queue,
 			 * where any new references will be detected and the
 			 * page reactivated.
 			 */
 			error = vm_pageout_clean(m, &numpagedout);
 			if (error == 0) {
 				launder -= numpagedout;
 				ss.scanned += numpagedout;
 			} else if (error == EDEADLK) {
 				pageout_lock_miss++;
 				vnodes_skipped++;
 			}
 			/* vm_pageout_clean() dropped the object lock. */
 			object = NULL;
 		} else {
 skip_page:
 			vm_page_xunbusy(m);
 		}
 	}
 	if (object != NULL) {
 		VM_OBJECT_WUNLOCK(object);
 		object = NULL;
 	}
 	vm_pagequeue_lock(pq);
 	vm_pageout_end_scan(&ss);
 	vm_pagequeue_unlock(pq);
 
 	/*
 	 * The unswappable queue did not satisfy the target; continue with the
 	 * regular laundry queue.
 	 */
 	if (launder > 0 && queue == PQ_UNSWAPPABLE) {
 		queue = PQ_LAUNDRY;
 		goto scan;
 	}
 
 	/*
 	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
 	 * and we didn't launder enough pages.
 	 */
 	if (vnodes_skipped > 0 && launder > 0)
 		(void)speedup_syncer();
 
 	return (starting_target - launder);
 }
 
 /*
  * Integer square root: return the largest root such that root * root does
  * not exceed num.  Computed bit by bit, starting from the highest even bit
  * position that is not above the most significant set bit of num.
  */
 static u_int
 isqrt(u_int num)
 {
 	u_int bit, candidate, root;
 
 	/* fls(0) would yield a bogus shift count; zero is its own root. */
 	if (num == 0)
 		return (0);
 
 	root = 0;
 	for (bit = 1u << ((fls(num) - 1) & ~1); bit != 0; bit >>= 2) {
 		candidate = root + bit;
 		root >>= 1;
 		if (num >= candidate) {
 			num -= candidate;
 			root += bit;
 		}
 	}
 	return (root);
 }
 
 /*
  * Perform the work of the laundry thread: periodically wake up and determine
  * whether any pages need to be laundered.  If so, determine the number of pages
  * that need to be laundered, and launder them.
  *
  * "arg" carries the index of the VM domain served by this thread, passed as
  * an integer cast to a pointer.  The function never returns.
  */
 static void
 vm_pageout_laundry_worker(void *arg)
 {
 	struct vm_domain *vmd;
 	struct vm_pagequeue *pq;
 	uint64_t nclean, ndirty, nfreed;
 	int domain, last_target, launder, shortfall, shortfall_cycle, target;
 	bool in_shortfall;
 
 	domain = (uintptr_t)arg;
 	vmd = VM_DOMAIN(domain);
 	pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
 	KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
 
 	/*
 	 * shortfall: size of a newly posted shortfall request, if any.
 	 * shortfall_cycle: passes left in the current shortfall run.
 	 * target: pages still to be laundered in the current run.
 	 * nfreed: clean pages freed by the page daemon since the last
 	 * background laundering decision consumed the count.
 	 */
 	shortfall = 0;
 	in_shortfall = false;
 	shortfall_cycle = 0;
 	last_target = target = 0;
 	nfreed = 0;
 
 	/*
 	 * Calls to these handlers are serialized by the swap syscall lock.
 	 */
 	(void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, vmd,
 	    EVENTHANDLER_PRI_ANY);
 	(void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, vmd,
 	    EVENTHANDLER_PRI_ANY);
 
 	/*
 	 * The pageout laundry worker is never done, so loop forever.
 	 */
 	for (;;) {
 		KASSERT(target >= 0, ("negative target %d", target));
 		KASSERT(shortfall_cycle >= 0,
 		    ("negative cycle %d", shortfall_cycle));
 		launder = 0;
 
 		/*
 		 * First determine whether we need to launder pages to meet a
 		 * shortage of free pages.
 		 */
 		if (shortfall > 0) {
 			in_shortfall = true;
 			shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;
 			target = shortfall;
 		} else if (!in_shortfall)
 			goto trybackground;
 		else if (shortfall_cycle == 0 || vm_laundry_target(vmd) <= 0) {
 			/*
 			 * We recently entered shortfall and began laundering
 			 * pages.  If we have completed that laundering run
 			 * (and we are no longer in shortfall) or we have met
 			 * our laundry target through other activity, then we
 			 * can stop laundering pages.
 			 */
 			in_shortfall = false;
 			target = 0;
 			goto trybackground;
 		}
 		/*
 		 * Spread the remaining target evenly over the remaining
 		 * passes.  shortfall_cycle is non-zero here: it was either
 		 * just set above or checked against zero.
 		 */
 		launder = target / shortfall_cycle--;
 		goto dolaundry;
 
 		/*
 		 * There's no immediate need to launder any pages; see if we
 		 * meet the conditions to perform background laundering:
 		 *
 		 * 1. The ratio of dirty to clean inactive pages exceeds the
 		 *    background laundering threshold, or
 		 * 2. we haven't yet reached the target of the current
 		 *    background laundering run.
 		 *
 		 * The background laundering threshold is not a constant.
 		 * Instead, it is a slowly growing function of the number of
 		 * clean pages freed by the page daemon since the last
 		 * background laundering.  Thus, as the ratio of dirty to
 		 * clean inactive pages grows, the amount of memory pressure
 		 * required to trigger laundering decreases.  We ensure
 		 * that the threshold is non-zero after an inactive queue
 		 * scan, even if that scan failed to free a single clean page.
 		 */
 trybackground:
 		nclean = vmd->vmd_free_count +
 		    vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt;
 		ndirty = vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt;
 		if (target == 0 && ndirty * isqrt(howmany(nfreed + 1,
 		    vmd->vmd_free_target - vmd->vmd_free_min)) >= nclean) {
 			target = vmd->vmd_background_launder_target;
 		}
 
 		/*
 		 * We have a non-zero background laundering target.  If we've
 		 * laundered up to our maximum without observing a page daemon
 		 * request, just stop.  This is a safety belt that ensures we
 		 * don't launder an excessive amount if memory pressure is low
 		 * and the ratio of dirty to clean pages is large.  Otherwise,
 		 * proceed at the background laundering rate.
 		 */
 		if (target > 0) {
 			if (nfreed > 0) {
 				nfreed = 0;
 				last_target = target;
 			} else if (last_target - target >=
 			    vm_background_launder_max * PAGE_SIZE / 1024) {
 				target = 0;
 			}
 			launder = vm_background_launder_rate * PAGE_SIZE / 1024;
 			launder /= VM_LAUNDER_RATE;
 			if (launder > target)
 				launder = target;
 		}
 
 dolaundry:
 		if (launder > 0) {
 			/*
 			 * Because of I/O clustering, the number of laundered
 			 * pages could exceed "target" by the maximum size of
 			 * a cluster minus one.
 			 */
 			target -= min(vm_pageout_launder(vmd, launder,
 			    in_shortfall), target);
 			pause("laundp", hz / VM_LAUNDER_RATE);
 		}
 
 		/*
 		 * If we're not currently laundering pages and the page daemon
 		 * hasn't posted a new request, sleep until the page daemon
 		 * kicks us.
 		 */
 		vm_pagequeue_lock(pq);
 		if (target == 0 && vmd->vmd_laundry_request == VM_LAUNDRY_IDLE)
 			(void)mtx_sleep(&vmd->vmd_laundry_request,
 			    vm_pagequeue_lockptr(pq), PVM, "launds", 0);
 
 		/*
 		 * If the pagedaemon has indicated that it's in shortfall, start
 		 * a shortfall laundering unless we're already in the middle of
 		 * one.  This may preempt a background laundering.
 		 */
 		if (vmd->vmd_laundry_request == VM_LAUNDRY_SHORTFALL &&
 		    (!in_shortfall || shortfall_cycle == 0)) {
 			shortfall = vm_laundry_target(vmd) +
 			    vmd->vmd_pageout_deficit;
 			target = 0;
 		} else
 			shortfall = 0;
 
 		/*
 		 * With no run in progress the request is consumed.  Collect
 		 * the page daemon's count of freed clean pages while the
 		 * laundry queue lock is held.
 		 */
 		if (target == 0)
 			vmd->vmd_laundry_request = VM_LAUNDRY_IDLE;
 		nfreed += vmd->vmd_clean_pages_freed;
 		vmd->vmd_clean_pages_freed = 0;
 		vm_pagequeue_unlock(pq);
 	}
 }
 
 /*
  * Work out how many pages the active queue scan should try to move to the
  * inactive or laundry queue.
  *
  * The estimate is the inactive target plus the paging target, less the pages
  * already sitting in the inactive queue and less the laundry queue length
  * divided by act_scan_laundry_weight.  Laundry pages are discounted because
  * dirty pages must be laundered before they can be reused, so they help less
  * when a free page shortage has to be relieved quickly.  The result is scaled
  * by the same weight, which also makes the scan deactivate dirty pages more
  * aggressively and thereby improves clustering.
  */
 static int
 vm_pageout_active_target(struct vm_domain *vmd)
 {
 	int queued, shortage;
 
 	/* Pages already on their way out, with laundry pages discounted. */
 	queued = vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt +
 	    vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight;
 
 	shortage = vmd->vmd_inactive_target + vm_paging_target(vmd) - queued;
 	return (shortage * act_scan_laundry_weight);
 }
 
 /*
  * Scan the active queue.  If there is no shortage of inactive pages, scan a
  * small portion of the queue in order to maintain quasi-LRU.
  *
  * "page_shortage" is the number of pages the caller would like to see moved
  * out of the active queue; each deactivated page reduces it by a weight that
  * depends on whether the page is dirty.  A value that is zero or negative
  * limits the scan to the periodic minimum derived from
  * vm_pageout_update_period.
  */
 static void
 vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage)
 {
 	struct scan_state ss;
 	vm_object_t object;
 	vm_page_t m, marker;
 	struct vm_pagequeue *pq;
 	vm_page_astate_t old, new;
 	long min_scan;
 	int act_delta, max_scan, ps_delta, refs, scan_tick;
 	uint8_t nqueue;
 
 	marker = &vmd->vmd_markers[PQ_ACTIVE];
 	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
 	vm_pagequeue_lock(pq);
 
 	/*
 	 * If we're just idle polling attempt to visit every
 	 * active page within 'update_period' seconds.
 	 */
 	scan_tick = ticks;
 	if (vm_pageout_update_period != 0) {
 		min_scan = pq->pq_cnt;
 		min_scan *= scan_tick - vmd->vmd_last_active_scan;
 		min_scan /= hz * vm_pageout_update_period;
 	} else
 		min_scan = 0;
 	if (min_scan > 0 || (page_shortage > 0 && pq->pq_cnt > 0))
 		vmd->vmd_last_active_scan = scan_tick;
 
 	/*
 	 * Scan the active queue for pages that can be deactivated.  Update
 	 * the per-page activity counter and use it to identify deactivation
 	 * candidates.  Held pages may be deactivated.
 	 *
 	 * To avoid requeuing each page that remains in the active queue, we
 	 * implement the CLOCK algorithm.  To keep the implementation of the
 	 * enqueue operation consistent for all page queues, we use two hands,
 	 * represented by marker pages. Scans begin at the first hand, which
 	 * precedes the second hand in the queue.  When the two hands meet,
 	 * they are moved back to the head and tail of the queue, respectively,
 	 * and scanning resumes.
 	 */
 	max_scan = page_shortage > 0 ? pq->pq_cnt : min_scan;
 act_scan:
 	vm_pageout_init_scan(&ss, pq, marker, &vmd->vmd_clock[0], max_scan);
 	while ((m = vm_pageout_next(&ss, false)) != NULL) {
 		if (__predict_false(m == &vmd->vmd_clock[1])) {
 			/*
 			 * The hands met: reset them to the ends of the queue
 			 * and restart with whatever scan budget is left.  The
 			 * queue lock stays held for vm_pageout_init_scan().
 			 */
 			vm_pagequeue_lock(pq);
 			TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
 			TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q);
 			TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0],
 			    plinks.q);
 			TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1],
 			    plinks.q);
 			max_scan -= ss.scanned;
 			vm_pageout_end_scan(&ss);
 			goto act_scan;
 		}
 		if (__predict_false((m->flags & PG_MARKER) != 0))
 			continue;
 
 		/*
 		 * Don't touch a page that was removed from the queue after the
 		 * page queue lock was released.  Otherwise, ensure that any
 		 * pending queue operations, such as dequeues for wired pages,
 		 * are handled.
 		 */
 		if (vm_pageout_defer(m, PQ_ACTIVE, true))
 			continue;
 
 		/*
 		 * A page's object pointer may be set to NULL before
 		 * the object lock is acquired.
 		 */
 		object = atomic_load_ptr(&m->object);
 		if (__predict_false(object == NULL))
 			/*
 			 * The page has been removed from its object.
 			 */
 			continue;
 
 		/* Deferred free of swap space. */
 		if ((m->a.flags & PGA_SWAP_FREE) != 0 &&
 		    VM_OBJECT_TRYWLOCK(object)) {
 			if (m->object == object)
 				vm_pager_page_unswapped(m);
 			VM_OBJECT_WUNLOCK(object);
 		}
 
 		/*
 		 * Check to see "how much" the page has been used.
 		 *
 		 * Test PGA_REFERENCED after calling pmap_ts_referenced() so
 		 * that a reference from a concurrently destroyed mapping is
 		 * observed here and now.
 		 *
 		 * Perform an unsynchronized object ref count check.  While
 		 * the page lock ensures that the page is not reallocated to
 		 * another object, in particular, one with unmanaged mappings
 		 * that cannot support pmap_ts_referenced(), two races are,
 		 * nonetheless, possible:
 		 * 1) The count was transitioning to zero, but we saw a non-
 		 *    zero value.  pmap_ts_referenced() will return zero
 		 *    because the page is not mapped.
 		 * 2) The count was transitioning to one, but we saw zero.
 		 *    This race delays the detection of a new reference.  At
 		 *    worst, we will deactivate and reactivate the page.
 		 */
 		refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0;
 
 		old = vm_page_astate_load(m);
 		do {
 			/*
 			 * Check to see if the page has been removed from the
 			 * queue since the first such check.  Leave it alone if
 			 * so, discarding any references collected by
 			 * pmap_ts_referenced().
 			 *
 			 * Such a page does not relieve the shortage, so make
 			 * sure that ps_delta is zero rather than
 			 * uninitialized or left over from the previous page.
 			 */
 			if (__predict_false(_vm_page_queue(old) == PQ_NONE)) {
 				ps_delta = 0;
 				break;
 			}
 
 			/*
 			 * Advance or decay the act_count based on recent usage.
 			 */
 			new = old;
 			act_delta = refs;
 			if ((old.flags & PGA_REFERENCED) != 0) {
 				new.flags &= ~PGA_REFERENCED;
 				act_delta++;
 			}
 			if (act_delta != 0) {
 				new.act_count += ACT_ADVANCE + act_delta;
 				if (new.act_count > ACT_MAX)
 					new.act_count = ACT_MAX;
 			} else {
 				new.act_count -= min(new.act_count,
 				    ACT_DECLINE);
 			}
 
 			if (new.act_count > 0) {
 				/*
 				 * Adjust the activation count and keep the page
 				 * in the active queue.  The count might be left
 				 * unchanged if it is saturated.  The page may
 				 * have been moved to a different queue since we
 				 * started the scan, in which case we move it
 				 * back.
 				 */
 				ps_delta = 0;
 				if (old.queue != PQ_ACTIVE) {
 					new.flags &= ~PGA_QUEUE_OP_MASK;
 					new.flags |= PGA_REQUEUE;
 					new.queue = PQ_ACTIVE;
 				}
 			} else {
 				/*
 				 * When not short for inactive pages, let dirty
 				 * pages go through the inactive queue before
 				 * moving to the laundry queue.  This gives them
 				 * some extra time to be reactivated,
 				 * potentially avoiding an expensive pageout.
 				 * However, during a page shortage, the inactive
 				 * queue is necessarily small, and so dirty
 				 * pages would only spend a trivial amount of
 				 * time in the inactive queue.  Therefore, we
 				 * might as well place them directly in the
 				 * laundry queue to reduce queuing overhead.
 				 *
 				 * Calling vm_page_test_dirty() here would
 				 * require acquisition of the object's write
 				 * lock.  However, during a page shortage,
 				 * directing dirty pages into the laundry queue
 				 * is only an optimization and not a
 				 * requirement.  Therefore, we simply rely on
 				 * the opportunistic updates to the page's dirty
 				 * field by the pmap.
 				 */
 				if (page_shortage <= 0) {
 					nqueue = PQ_INACTIVE;
 					ps_delta = 0;
 				} else if (m->dirty == 0) {
 					nqueue = PQ_INACTIVE;
 					ps_delta = act_scan_laundry_weight;
 				} else {
 					nqueue = PQ_LAUNDRY;
 					ps_delta = 1;
 				}
 
 				new.flags &= ~PGA_QUEUE_OP_MASK;
 				new.flags |= PGA_REQUEUE;
 				new.queue = nqueue;
 			}
 		} while (!vm_page_pqstate_commit(m, &old, new));
 
 		page_shortage -= ps_delta;
 	}
 	/*
 	 * Park the first clock hand right behind the scan marker so that the
 	 * next scan resumes where this one stopped.
 	 */
 	vm_pagequeue_lock(pq);
 	TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
 	TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q);
 	vm_pageout_end_scan(&ss);
 	vm_pagequeue_unlock(pq);
 }
 
 /*
  * Put a single page back into the inactive queue, directly in front of the
  * scan marker, provided that the page still belongs to PQ_INACTIVE and is
  * not already enqueued.  The page queue lock must be held.
  *
  * Returns 1 if the page was reinserted and 0 if it was left alone, so that
  * callers can sum the results to adjust the queue's page count.
  */
 static int
 vm_pageout_reinsert_inactive_page(struct vm_pagequeue *pq, vm_page_t marker,
     vm_page_t m)
 {
 	vm_page_astate_t state;
 
 	vm_pagequeue_assert_locked(pq);
 
 	state = vm_page_astate_load(m);
 	if (state.queue == PQ_INACTIVE && (state.flags & PGA_ENQUEUED) == 0) {
 		vm_page_aflag_set(m, PGA_ENQUEUED);
 		TAILQ_INSERT_BEFORE(marker, m, plinks.q);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Return stuck pages to the inactive queue so that a later scan looks at
  * them again.  A page whose queue state changed after it was physically
  * removed by vm_pageout_collect_batch() is left untouched.
  *
  * A non-NULL page is first stashed in the batch queue "bq"; only when the
  * batch cannot take it is the whole batch, followed by nothing else, written
  * back under the page queue lock.  Passing a NULL page forces the batch to
  * be written back.  The batch queue is reinitialized after a write-back.
  */
 static void
 vm_pageout_reinsert_inactive(struct scan_state *ss, struct vm_batchqueue *bq,
     vm_page_t m)
 {
 	struct vm_pagequeue *pq;
 	vm_page_t marker, p;
 	int reinserted;
 
 	/* Common case: the batch had room, so defer the locked work. */
 	if (m != NULL && vm_batchqueue_insert(bq, m))
 		return;
 
 	marker = ss->marker;
 	pq = ss->pq;
 	reinserted = 0;
 
 	vm_pagequeue_lock(pq);
 	/* The page that did not fit goes in first, then the batch drains. */
 	if (m != NULL)
 		reinserted += vm_pageout_reinsert_inactive_page(pq, marker, m);
 	for (p = vm_batchqueue_pop(bq); p != NULL; p = vm_batchqueue_pop(bq))
 		reinserted += vm_pageout_reinsert_inactive_page(pq, marker, p);
 	vm_pagequeue_cnt_add(pq, reinserted);
 	vm_pagequeue_unlock(pq);
 	vm_batchqueue_init(bq);
 }
 
 /*
  * Scan the inactive queue of domain "vmd", trying to free up to
  * "page_shortage" pages.
  *
  * For each page returned by vm_pageout_next() the scan locks the page's
  * object, exclusively busies the page and then either frees it, reactivates
  * it, requeues it in the inactive queue, hands it to vm_page_launder(), or
  * skips it.  Pages that could not be processed (object changed underneath
  * us, page busy) are batched and reinserted into the inactive queue.
  *
  * Results are published with atomic adds so that several threads may run
  * this function for the same domain:
  *   vmd_addl_shortage  += number of busy pages encountered,
  *   vmd_inactive_us    += elapsed time of this scan in microseconds,
  *   vmd_inactive_freed += number of pages freed by this scan.
  */
 static void
 vm_pageout_scan_inactive(struct vm_domain *vmd, int page_shortage)
 {
 	struct timeval start, end;
 	struct scan_state ss;
 	struct vm_batchqueue rq;
 	struct vm_page marker_page;
 	vm_page_t m, marker;
 	struct vm_pagequeue *pq;
 	vm_object_t object;
 	vm_page_astate_t old, new;
 	int act_delta, addl_page_shortage, starting_page_shortage, refs;
 
 	object = NULL;
 	vm_batchqueue_init(&rq);
 	getmicrouptime(&start);
 
 	/*
 	 * The addl_page_shortage is an estimate of the number of temporarily
 	 * stuck pages in the inactive queue.  In other words, the
 	 * number of pages from the inactive count that should be
 	 * discounted in setting the target for the active queue scan.
 	 */
 	addl_page_shortage = 0;
 
 	/*
 	 * Start scanning the inactive queue for pages that we can free.  The
 	 * scan will stop when we reach the target or we have scanned the
 	 * entire queue.  (Note that m->a.act_count is not used to make
 	 * decisions for the inactive queue, only for the active queue.)
 	 */
 	starting_page_shortage = page_shortage;
 	marker = &marker_page;
 	vm_page_init_marker(marker, PQ_INACTIVE, 0);
 	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
 	vm_pagequeue_lock(pq);
 	vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
 	while (page_shortage > 0 && (m = vm_pageout_next(&ss, true)) != NULL) {
 		KASSERT((m->flags & PG_MARKER) == 0,
 		    ("marker page %p was dequeued", m));
 
 		/*
 		 * Don't touch a page that was removed from the queue after the
 		 * page queue lock was released.  Otherwise, ensure that any
 		 * pending queue operations, such as dequeues for wired pages,
 		 * are handled.
 		 */
 		if (vm_pageout_defer(m, PQ_INACTIVE, false))
 			continue;
 
 		/*
 		 * Lock the page's object.  The lock is carried over from the
 		 * previous iteration when consecutive pages share an object.
 		 */
 		if (object == NULL || object != m->object) {
 			if (object != NULL)
 				VM_OBJECT_WUNLOCK(object);
 			object = atomic_load_ptr(&m->object);
 			if (__predict_false(object == NULL))
 				/* The page is being freed by another thread. */
 				continue;
 
 			/* Depends on type-stability. */
 			VM_OBJECT_WLOCK(object);
 			if (__predict_false(m->object != object)) {
 				/*
 				 * The page changed identity while we were
 				 * acquiring the lock; put it back for a later
 				 * scan.
 				 */
 				VM_OBJECT_WUNLOCK(object);
 				object = NULL;
 				goto reinsert;
 			}
 		}
 
 		if (vm_page_tryxbusy(m) == 0) {
 			/*
 			 * Don't mess with busy pages.  Leave them at
 			 * the front of the queue.  Most likely, they
 			 * are being paged out and will leave the
 			 * queue shortly after the scan finishes.  So,
 			 * they ought to be discounted from the
 			 * inactive count.
 			 */
 			addl_page_shortage++;
 			goto reinsert;
 		}
 
 		/* Deferred free of swap space. */
 		if ((m->a.flags & PGA_SWAP_FREE) != 0)
 			vm_pager_page_unswapped(m);
 
 		/*
 		 * Check for wirings now that we hold the object lock and have
 		 * exclusively busied the page.  If the page is mapped, it may
 		 * still be wired by pmap lookups.  The call to
 		 * vm_page_try_remove_all() below atomically checks for such
 		 * wirings and removes mappings.  If the page is unmapped, the
 		 * wire count is guaranteed not to increase after this check.
 		 */
 		if (__predict_false(vm_page_wired(m)))
 			goto skip_page;
 
 		/*
 		 * Invalid pages can be easily freed. They cannot be
 		 * mapped, vm_page_free() asserts this.
 		 */
 		if (vm_page_none_valid(m))
 			goto free_page;
 
 		/*
 		 * Only consult the pmap for references when the object itself
 		 * is still referenced.
 		 */
 		refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0;
 
 		/*
 		 * Retry loop: each failed vm_page_pqstate_commit() below
 		 * "continue"s so that the decision is re-evaluated against
 		 * "old".
 		 */
 		for (old = vm_page_astate_load(m);;) {
 			/*
 			 * Check to see if the page has been removed from the
 			 * queue since the first such check.  Leave it alone if
 			 * so, discarding any references collected by
 			 * pmap_ts_referenced().
 			 */
 			if (__predict_false(_vm_page_queue(old) == PQ_NONE))
 				goto skip_page;
 
 			new = old;
 			act_delta = refs;
 			if ((old.flags & PGA_REFERENCED) != 0) {
 				new.flags &= ~PGA_REFERENCED;
 				act_delta++;
 			}
 			if (act_delta == 0) {
 				/* Unreferenced: break out and try to reclaim. */
 				;
 			} else if (object->ref_count != 0) {
 				/*
 				 * Increase the activation count if the
 				 * page was referenced while in the
 				 * inactive queue.  This makes it less
 				 * likely that the page will be returned
 				 * prematurely to the inactive queue.
 				 */
 				new.act_count += ACT_ADVANCE +
 				    act_delta;
 				if (new.act_count > ACT_MAX)
 					new.act_count = ACT_MAX;
 
 				new.flags &= ~PGA_QUEUE_OP_MASK;
 				new.flags |= PGA_REQUEUE;
 				new.queue = PQ_ACTIVE;
 				if (!vm_page_pqstate_commit(m, &old, new))
 					continue;
 
 				VM_CNT_INC(v_reactivated);
 				goto skip_page;
 			} else if ((object->flags & OBJ_DEAD) == 0) {
 				/*
 				 * Referenced page of an unreferenced but live
 				 * object: requeue it in the inactive queue.
 				 */
 				new.queue = PQ_INACTIVE;
 				new.flags |= PGA_REQUEUE;
 				if (!vm_page_pqstate_commit(m, &old, new))
 					continue;
 				goto skip_page;
 			}
 			break;
 		}
 
 		/*
 		 * If the page appears to be clean at the machine-independent
 		 * layer, then remove all of its mappings from the pmap in
 		 * anticipation of freeing it.  If, however, any of the page's
 		 * mappings allow write access, then the page may still be
 		 * modified until the last of those mappings are removed.
 		 */
 		if (object->ref_count != 0) {
 			vm_page_test_dirty(m);
 			if (m->dirty == 0 && !vm_page_try_remove_all(m))
 				goto skip_page;
 		}
 
 		/*
 		 * Clean pages can be freed, but dirty pages must be sent back
 		 * to the laundry, unless they belong to a dead object.
 		 * Requeueing dirty pages from dead objects is pointless, as
 		 * they are being paged out and freed by the thread that
 		 * destroyed the object.
 		 */
 		if (m->dirty == 0) {
 free_page:
 			/*
 			 * Now we are guaranteed that no other threads are
 			 * manipulating the page, check for a last-second
 			 * reference that would save it from doom.
 			 */
 			if (vm_pageout_defer(m, PQ_INACTIVE, false))
 				goto skip_page;
 
 			/*
 			 * Because we dequeued the page and have already checked
 			 * for pending dequeue and enqueue requests, we can
 			 * safely disassociate the page from the inactive queue
 			 * without holding the queue lock.
 			 */
 			m->a.queue = PQ_NONE;
 			vm_page_free(m);
 			page_shortage--;
 			continue;
 		}
 		if ((object->flags & OBJ_DEAD) == 0)
 			vm_page_launder(m);
 skip_page:
 		/* Done with this page: drop the exclusive busy and move on. */
 		vm_page_xunbusy(m);
 		continue;
 reinsert:
 		/* The page was not busied by us; queue it for reinsertion. */
 		vm_pageout_reinsert_inactive(&ss, &rq, m);
 	}
 	if (object != NULL)
 		VM_OBJECT_WUNLOCK(object);
 
 	/*
 	 * Drain both the local reinsertion batch and whatever is left in the
 	 * scan's own batch back into the inactive queue, then finish the scan
 	 * under the page queue lock.
 	 */
 	vm_pageout_reinsert_inactive(&ss, &rq, NULL);
 	vm_pageout_reinsert_inactive(&ss, &ss.bq, NULL);
 	vm_pagequeue_lock(pq);
 	vm_pageout_end_scan(&ss);
 	vm_pagequeue_unlock(pq);
 
 	/*
 	 * Record the remaining shortage and the progress and rate it was made.
 	 */
 	atomic_add_int(&vmd->vmd_addl_shortage, addl_page_shortage);
 	getmicrouptime(&end);
 	timevalsub(&end, &start);
 	atomic_add_int(&vmd->vmd_inactive_us,
 	    end.tv_sec * 1000000 + end.tv_usec);
 	atomic_add_int(&vmd->vmd_inactive_freed,
 	    starting_page_shortage - page_shortage);
 }
 
 /*
  * Dispatch a number of inactive scan threads according to load and collect
  * the results to present a coherent view of paging activity on this domain.
  *
  * "shortage" is the total number of pages to reclaim.  When helpers are
  * used, vmd_inactive_shortage is divided by the thread count so that every
  * thread, including the caller, scans for an equal share.
  *
  * Returns "shortage" minus the number of pages freed by all threads.
  */
 static int
 vm_pageout_inactive_dispatch(struct vm_domain *vmd, int shortage)
 {
 	u_int freed, pps, threads, us;
 
 	vmd->vmd_inactive_shortage = shortage;
 
 	/*
 	 * If we have more work than we can do in a quarter of our interval, we
 	 * fire off multiple threads to process it.
 	 */
 	if (vmd->vmd_inactive_threads > 1 && vmd->vmd_inactive_pps != 0 &&
 	    shortage > vmd->vmd_inactive_pps / VM_INACT_SCAN_RATE / 4) {
 		threads = vmd->vmd_inactive_threads;
 		vm_domain_pageout_lock(vmd);
 		vmd->vmd_inactive_shortage /= threads;
 		/*
 		 * Account for the helpers (all threads but this one) before
 		 * waking the sleepers on vmd_inactive_shortage.
 		 */
 		blockcount_acquire(&vmd->vmd_inactive_starting, threads - 1);
 		blockcount_acquire(&vmd->vmd_inactive_running, threads - 1);
 		wakeup(&vmd->vmd_inactive_shortage);
 		vm_domain_pageout_unlock(vmd);
 	}
 
 	/* Run the local thread scan. */
 	vm_pageout_scan_inactive(vmd, vmd->vmd_inactive_shortage);
 
 	/*
 	 * Block until helper threads report results and then accumulate
 	 * totals.
 	 */
 	blockcount_wait(&vmd->vmd_inactive_running, NULL, "vmpoid", PVM);
 	freed = atomic_readandclear_int(&vmd->vmd_inactive_freed);
 	VM_CNT_ADD(v_dfree, freed);
 
 	/*
 	 * Calculate the per-thread paging rate with an exponential decay of
 	 * prior results.  Careful to avoid integer rounding errors with large
 	 * us values.
 	 *
 	 * NOTE(review): all operands are u_int, so "(1000000 / us) * freed"
 	 * and "us * 10" can wrap for extreme inputs (a very short scan that
 	 * freed many pages, or a very long accumulated scan time) -- confirm
 	 * whether that matters for the vmd_inactive_pps estimate.
 	 */
 	us = max(atomic_readandclear_int(&vmd->vmd_inactive_us), 1);
 	if (us > 1000000)
 		/* Keep rounding to tenths */
 		pps = (freed * 10) / ((us * 10) / 1000000);
 	else
 		pps = (1000000 / us) * freed;
 	/* New estimate is the average of the old estimate and this sample. */
 	vmd->vmd_inactive_pps = (vmd->vmd_inactive_pps / 2) + (pps / 2);
 
 	return (shortage - freed);
 }
 
 /*
  * Attempt to reclaim the requested number of pages from the inactive queue.
  * Returns true if the shortage was addressed.
  *
  * "shortage" is the number of pages the caller wants freed; the pending
  * vmd_pageout_deficit is read, cleared and added to it to form the scan
  * target.  On return, *addl_shortage holds the accumulated
  * vmd_addl_shortage (read and cleared here) plus that deficit, for use by
  * the caller when sizing the active queue scan.
  *
  * Side effects visible in this function: may wake the laundry thread, may
  * run the swapout daemon, updates the OOM bookkeeping through
  * vm_pageout_mightbe_oom(), and always calls vm_swapout_run_idle().
  */
 static int
 vm_pageout_inactive(struct vm_domain *vmd, int shortage, int *addl_shortage)
 {
 	struct vm_pagequeue *pq;
 	/*
 	 * NOTE(review): these counters are u_int, so the "> 0" tests below
 	 * are effectively "!= 0" and the final "<= 0" is "== 0" -- confirm
 	 * that vm_pageout_inactive_dispatch() can never report more pages
 	 * freed than were requested.
 	 */
 	u_int addl_page_shortage, deficit, page_shortage;
 	u_int starting_page_shortage;
 
 	/*
 	 * vmd_pageout_deficit counts the number of pages requested in
 	 * allocations that failed because of a free page shortage.  We assume
 	 * that the allocations will be reattempted and thus include the deficit
 	 * in our scan target.
 	 */
 	deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit);
 	starting_page_shortage = shortage + deficit;
 
 	/*
 	 * Run the inactive scan on as many threads as is necessary.
 	 */
 	page_shortage = vm_pageout_inactive_dispatch(vmd, starting_page_shortage);
 	addl_page_shortage = atomic_readandclear_int(&vmd->vmd_addl_shortage);
 
 	/*
 	 * Wake up the laundry thread so that it can perform any needed
 	 * laundering.  If we didn't meet our target, we're in shortfall and
 	 * need to launder more aggressively.  If PQ_LAUNDRY is empty and no
 	 * swap devices are configured, the laundry thread has no work to do, so
 	 * don't bother waking it up.
 	 *
 	 * The laundry thread uses the number of inactive queue scans elapsed
 	 * since the last laundering to determine whether to launder again, so
 	 * keep count.
 	 */
 	if (starting_page_shortage > 0) {
 		pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
 		vm_pagequeue_lock(pq);
 		if (vmd->vmd_laundry_request == VM_LAUNDRY_IDLE &&
 		    (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled))) {
 			if (page_shortage > 0) {
 				vmd->vmd_laundry_request = VM_LAUNDRY_SHORTFALL;
 				VM_CNT_INC(v_pdshortfalls);
 			} else if (vmd->vmd_laundry_request !=
 			    VM_LAUNDRY_SHORTFALL)
 				vmd->vmd_laundry_request =
 				    VM_LAUNDRY_BACKGROUND;
 			wakeup(&vmd->vmd_laundry_request);
 		}
 		/* Credit the pages freed by this scan. */
 		vmd->vmd_clean_pages_freed +=
 		    starting_page_shortage - page_shortage;
 		vm_pagequeue_unlock(pq);
 	}
 
 	/*
 	 * Wakeup the swapout daemon if we didn't free the targeted number of
 	 * pages.
 	 */
 	if (page_shortage > 0)
 		vm_swapout_run();
 
 	/*
 	 * If the inactive queue scan fails repeatedly to meet its
 	 * target, kill the largest process.
 	 */
 	vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);
 
 	/*
 	 * Reclaim pages by swapping out idle processes, if configured to do so.
 	 */
 	vm_swapout_run_idle();
 
 	/*
 	 * Report the additional shortage: the stuck-page count gathered by
 	 * the inactive scans plus the allocation deficit consumed above.
 	 */
 	*addl_shortage = addl_page_shortage + deficit;
 
 	return (page_shortage <= 0);
 }
 
 static int vm_pageout_oom_vote;
 
 /*
  * The pagedaemon threads randlomly select one to perform the
  * OOM.  Trying to kill processes before all pagedaemons
  * failed to reach free target is premature.
  */
 static void
 vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
     int starting_page_shortage)
 {
 	int old_vote;
 
 	if (starting_page_shortage <= 0 || starting_page_shortage !=
 	    page_shortage)
 		vmd->vmd_oom_seq = 0;
 	else
 		vmd->vmd_oom_seq++;
 	if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
 		if (vmd->vmd_oom) {
 			vmd->vmd_oom = FALSE;
 			atomic_subtract_int(&vm_pageout_oom_vote, 1);
 		}
 		return;
 	}
 
 	/*
 	 * Do not follow the call sequence until OOM condition is
 	 * cleared.
 	 */
 	vmd->vmd_oom_seq = 0;
 
 	if (vmd->vmd_oom)
 		return;
 
 	vmd->vmd_oom = TRUE;
 	old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
 	if (old_vote != vm_ndomains - 1)
 		return;
 
 	/*
 	 * The current pagedaemon thread is the last in the quorum to
 	 * start OOM.  Initiate the selection and signaling of the
 	 * victim.
 	 */
 	vm_pageout_oom(VM_OOM_MEM);
 
 	/*
 	 * After one round of OOM terror, recall our vote.  On the
 	 * next pass, current pagedaemon would vote again if the low
 	 * memory condition is still there, due to vmd_oom being
 	 * false.
 	 */
 	vmd->vmd_oom = FALSE;
 	atomic_subtract_int(&vm_pageout_oom_vote, 1);
 }
 
 /*
  * The OOM killer is the page daemon's action of last resort when
  * memory allocation requests have been stalled for a prolonged period
  * of time because it cannot reclaim memory.  This function computes
  * the approximate number of physical pages that could be reclaimed if
  * the specified address space is destroyed.
  *
  * Private, anonymous memory owned by the address space is the
  * principal resource that we expect to recover after an OOM kill.
  * Since the physical pages mapped by the address space's COW entries
  * are typically shared pages, they are unlikely to be released and so
  * they are not counted.
  *
  * To get to the point where the page daemon runs the OOM killer, its
  * efforts to write-back vnode-backed pages may have stalled.  This
  * could be caused by a memory allocation deadlock in the write path
  * that might be resolved by an OOM kill.  Therefore, physical pages
  * belonging to vnode-backed objects are counted, because they might
  * be freed without being written out first if the address space holds
  * the last reference to an unlinked vnode.
  *
  * Similarly, physical pages belonging to OBJT_PHYS objects are
  * counted because the address space might hold the last reference to
  * the object.
  */
 static long
 vm_pageout_oom_pagecount(struct vmspace *vmspace)
 {
 	vm_map_entry_t ent;
 	vm_object_t object;
 	vm_map_t vmap;
 	long npages;
 
 	vmap = &vmspace->vm_map;
 	KASSERT(!vmap->system_map, ("system map"));
 	sx_assert(&vmap->lock, SA_LOCKED);
 
 	npages = 0;
 	VM_MAP_ENTRY_FOREACH(ent, vmap) {
 		/* Submap entries are not counted. */
 		if ((ent->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 			continue;
 
 		/*
 		 * Skip entries without a backing object, and copy-on-write
 		 * entries whose object is referenced by anyone else.
 		 */
 		object = ent->object.vm_object;
 		if (object == NULL ||
 		    ((ent->eflags & MAP_ENTRY_NEEDS_COPY) != 0 &&
 		    object->ref_count != 1))
 			continue;
 
 		/* Only these object types contribute resident pages. */
 		switch (object->type) {
 		case OBJT_DEFAULT:
 		case OBJT_SWAP:
 		case OBJT_PHYS:
 		case OBJT_VNODE:
 			npages += object->resident_page_count;
 			break;
 		default:
 			break;
 		}
 	}
 	return (npages);
 }
 
 /* Value of "ticks" when vm_pageout_oom() last proceeded past its rate limit. */
 static int vm_oom_ratelim_last;
 /*
  * Minimum number of seconds between OOM kills requested with
  * VM_OOM_MEM_PF; exported as the vm.oom_pf_secs sysctl/tunable.
  */
 static int vm_oom_pf_secs = 10;
 SYSCTL_INT(_vm, OID_AUTO, oom_pf_secs, CTLFLAG_RWTUN, &vm_oom_pf_secs, 0,
     "");
 /* Serializes the rate-limit check and update in vm_pageout_oom(). */
 static struct mtx vm_oom_ratelim_mtx;
 
 /*
  * Select the largest eligible process and kill it.
  *
  * "shortage" is the reason code.  For VM_OOM_MEM_PF the request is dropped
  * if the previous OOM pass happened less than vm_oom_pf_secs seconds ago.
  * A process's size is its swap usage as reported by vmspace_swap_count(),
  * plus, for VM_OOM_MEM and VM_OOM_MEM_PF, the resident page estimate from
  * vm_pageout_oom_pagecount().
  *
  * The chosen process is kept held (not locked) while the scan continues;
  * it is finally locked, killed with killproc() and given priority
  * PRIO_MIN.  If vm_panic_on_oom is nonzero it is decremented and the
  * kernel panics when it reaches zero instead of killing.
  */
 void
 vm_pageout_oom(int shortage)
 {
 	struct proc *p, *bigproc;
 	vm_offset_t size, bigsize;
 	struct thread *td;
 	struct vmspace *vm;
 	int now;
 	bool breakout;
 
 	/*
 	 * For OOM requests originating from vm_fault(), there is a high
 	 * chance that a single large process faults simultaneously in
 	 * several threads.  Also, on an active system running many
 	 * processes of middle-size, like buildworld, all of them
 	 * could fault almost simultaneously as well.
 	 *
 	 * To avoid killing too many processes, rate-limit OOMs
 	 * initiated by vm_fault() time-outs on the waits for free
 	 * pages.
 	 */
 	mtx_lock(&vm_oom_ratelim_mtx);
 	now = ticks;
 	if (shortage == VM_OOM_MEM_PF &&
 	    (u_int)(now - vm_oom_ratelim_last) < hz * vm_oom_pf_secs) {
 		mtx_unlock(&vm_oom_ratelim_mtx);
 		return;
 	}
 	vm_oom_ratelim_last = now;
 	mtx_unlock(&vm_oom_ratelim_mtx);
 
 	/*
 	 * We keep the process bigproc locked once we find it to keep anyone
 	 * from messing with it; however, there is a possibility of
 	 * deadlock if process B is bigproc and one of its child processes
 	 * attempts to propagate a signal to B while we are waiting for A's
 	 * lock while walking this list.  To avoid this, we don't block on
 	 * the process lock but just skip a process if it is already locked.
 	 */
 	bigproc = NULL;
 	bigsize = 0;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 
 		/*
 		 * If this is a system, protected or killed process, skip it.
 		 * Also skipped: processes not in PRS_NORMAL state, processes
 		 * in exec or exit, pid 1, and pids below 48 while
 		 * swap_pager_avail is nonzero.
 		 */
 		if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
 		    P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
 		    p->p_pid == 1 || P_KILLED(p) ||
 		    (p->p_pid < 48 && swap_pager_avail != 0)) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/*
 		 * If the process is in a non-running type state,
 		 * don't touch it.  Check all the threads individually.
 		 */
 		breakout = false;
 		FOREACH_THREAD_IN_PROC(p, td) {
 			thread_lock(td);
 			if (!TD_ON_RUNQ(td) &&
 			    !TD_IS_RUNNING(td) &&
 			    !TD_IS_SLEEPING(td) &&
 			    !TD_IS_SUSPENDED(td) &&
 			    !TD_IS_SWAPPED(td)) {
 				thread_unlock(td);
 				breakout = true;
 				break;
 			}
 			thread_unlock(td);
 		}
 		if (breakout) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/*
 		 * get the process size
 		 */
 		vm = vmspace_acquire_ref(p);
 		if (vm == NULL) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/*
 		 * Hold the process, then drop both the process lock and
 		 * allproc_lock while its map is examined.
 		 */
 		_PHOLD_LITE(p);
 		PROC_UNLOCK(p);
 		sx_sunlock(&allproc_lock);
 		if (!vm_map_trylock_read(&vm->vm_map)) {
 			/* Map is busy: undo the references and skip. */
 			vmspace_free(vm);
 			sx_slock(&allproc_lock);
 			PRELE(p);
 			continue;
 		}
 		size = vmspace_swap_count(vm);
 		if (shortage == VM_OOM_MEM || shortage == VM_OOM_MEM_PF)
 			size += vm_pageout_oom_pagecount(vm);
 		vm_map_unlock_read(&vm->vm_map);
 		vmspace_free(vm);
 		sx_slock(&allproc_lock);
 
 		/*
 		 * If this process is bigger than the biggest one,
 		 * remember it.  The hold taken above is kept on the new
 		 * candidate and released on whichever process loses.
 		 */
 		if (size > bigsize) {
 			if (bigproc != NULL)
 				PRELE(bigproc);
 			bigproc = p;
 			bigsize = size;
 		} else {
 			PRELE(p);
 		}
 	}
 	sx_sunlock(&allproc_lock);
 	if (bigproc != NULL) {
 		if (vm_panic_on_oom != 0 && --vm_panic_on_oom == 0)
 			panic("out of swap space");
 		PROC_LOCK(bigproc);
 		killproc(bigproc, "out of swap space");
 		sched_nice(bigproc, PRIO_MIN);
 		/* Release the hold kept on the victim since the scan. */
 		_PRELE(bigproc);
 		PROC_UNLOCK(bigproc);
 	}
 }
 
 /*
  * Signal a free page shortage to subsystems that have registered an event
  * handler.  Reclaim memory from UMA in the event of a severe shortage.
  * Return true if the free page count should be re-evaluated.
  *
  * The handlers and the UMA trim run at most once per lowmem_period
  * seconds, measured in ticks against the function-static timestamp
  * lowmem_ticks; true is returned only when they actually ran.
  */
 static bool
 vm_pageout_lowmem(void)
 {
 	static int lowmem_ticks = 0;
 	int last;
 	bool ret;
 
 	ret = false;
 
 	last = atomic_load_int(&lowmem_ticks);
 	while ((u_int)(ticks - last) / hz >= lowmem_period) {
 		/*
 		 * Claim this period by advancing the timestamp.  On failure
 		 * the loop condition is re-checked; presumably "last" has
 		 * been refreshed with the current value by the failed
 		 * compare-and-set.
 		 */
 		if (atomic_fcmpset_int(&lowmem_ticks, &last, ticks) == 0)
 			continue;
 
 		/*
 		 * Decrease registered cache sizes.
 		 */
 		SDT_PROBE0(vm, , , vm__lowmem_scan);
 		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES);
 
 		/*
 		 * We do this explicitly after the caches have been
 		 * drained above.
 		 */
 		uma_reclaim(UMA_RECLAIM_TRIM);
 		ret = true;
 		break;
 	}
 
 	/*
 	 * Kick off an asynchronous reclaim of cached memory if one of the
 	 * page daemons is failing to keep up with demand.  Use the "severe"
 	 * threshold instead of "min" to ensure that we do not blow away the
 	 * caches if a subset of the NUMA domains are depleted by kernel memory
 	 * allocations; the domainset iterators automatically skip domains
 	 * below the "min" threshold on the first pass.
 	 *
 	 * UMA reclaim worker has its own rate-limiting mechanism, so don't
 	 * worry about kicking it too often.
 	 */
 	if (vm_page_count_severe())
 		uma_reclaim_wakeup();
 
 	return (ret);
 }
 
 /*
  * Main loop of a domain's page daemon thread.  "arg" carries the domain
  * index cast to a pointer.  Each iteration sleeps or pauses as appropriate,
  * asks the PID controller for a free page target, runs the inactive queue
  * scan if that target is positive, and then always runs the active queue
  * scan.  This function never returns.
  */
 static void
 vm_pageout_worker(void *arg)
 {
 	struct vm_domain *vmd;
 	u_int ofree;
 	int addl_shortage, domain, shortage;
 	bool target_met;
 
 	domain = (uintptr_t)arg;
 	vmd = VM_DOMAIN(domain);
 	shortage = 0;
 	target_met = true;
 
 	/*
 	 * XXXKIB It could be useful to bind pageout daemon threads to
 	 * the cores belonging to the domain, from which vm_page_array
 	 * is allocated.
 	 */
 
 	KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
 	vmd->vmd_last_active_scan = ticks;
 
 	/*
 	 * The pageout daemon worker is never done, so loop forever.
 	 */
 	while (TRUE) {
 		vm_domain_pageout_lock(vmd);
 
 		/*
 		 * We need to clear wanted before we check the limits.  This
 		 * prevents races with wakers who will check wanted after they
 		 * reach the limit.
 		 */
 		atomic_store_int(&vmd->vmd_pageout_wanted, 0);
 
 		/*
 		 * Might the page daemon need to run again?
 		 */
 		if (vm_paging_needed(vmd, vmd->vmd_free_count)) {
 			/*
 			 * Yes.  If the scan failed to produce enough free
 			 * pages, sleep uninterruptibly for some time in the
 			 * hope that the laundry thread will clean some pages.
 			 */
 			vm_domain_pageout_unlock(vmd);
 			if (!target_met)
 				pause("pwait", hz / VM_INACT_SCAN_RATE);
 		} else {
 			/*
 			 * No, sleep until the next wakeup or until pages
 			 * need to have their reference stats updated.
 			 * PDROP releases the pageout lock; a zero return
 			 * (as opposed to a timeout) is counted as a wakeup.
 			 */
 			if (mtx_sleep(&vmd->vmd_pageout_wanted,
 			    vm_domain_pageout_lockptr(vmd), PDROP | PVM,
 			    "psleep", hz / VM_INACT_SCAN_RATE) == 0)
 				VM_CNT_INC(v_pdwakeups);
 		}
 
 		/* Prevent spurious wakeups by ensuring that wanted is set. */
 		atomic_store_int(&vmd->vmd_pageout_wanted, 1);
 
 		/*
 		 * Use the controller to calculate how many pages to free in
 		 * this interval, and scan the inactive queue.  If the lowmem
 		 * handlers appear to have freed up some pages, subtract the
 		 * difference from the inactive queue scan target.
 		 */
 		shortage = pidctrl_daemon(&vmd->vmd_pid, vmd->vmd_free_count);
 		if (shortage > 0) {
 			ofree = vmd->vmd_free_count;
 			/* The min() keeps shortage from going negative. */
 			if (vm_pageout_lowmem() && vmd->vmd_free_count > ofree)
 				shortage -= min(vmd->vmd_free_count - ofree,
 				    (u_int)shortage);
 			target_met = vm_pageout_inactive(vmd, shortage,
 			    &addl_shortage);
 		} else
 			addl_shortage = 0;
 
 		/*
 		 * Scan the active queue.  A positive value for shortage
 		 * indicates that we must aggressively deactivate pages to avoid
 		 * a shortfall.
 		 */
 		shortage = vm_pageout_active_target(vmd) + addl_shortage;
 		vm_pageout_scan_active(vmd, shortage);
 	}
 }
 
 /*
  * vm_pageout_helper runs additional pageout daemons in times of high paging
  * activity.
  *
  * "arg" carries the domain index cast to a pointer.  The helper sleeps on
  * vmd_inactive_shortage until woken, then scans the inactive queue for
  * vmd_inactive_shortage pages and reports completion through the
  * vmd_inactive_running block count.  This function never returns.
  */
 static void
 vm_pageout_helper(void *arg)
 {
 	struct vm_domain *vmd;
 	int domain;
 
 	domain = (uintptr_t)arg;
 	vmd = VM_DOMAIN(domain);
 
 	vm_domain_pageout_lock(vmd);
 	for (;;) {
 		/* Wait, with the pageout lock as interlock, for work. */
 		msleep(&vmd->vmd_inactive_shortage,
 		    vm_domain_pageout_lockptr(vmd), PVM, "psleep", 0);
 		/* Signal that this helper has picked up the request. */
 		blockcount_release(&vmd->vmd_inactive_starting, 1);
 
 		/* The scan itself runs without the pageout lock. */
 		vm_domain_pageout_unlock(vmd);
 		vm_pageout_scan_inactive(vmd, vmd->vmd_inactive_shortage);
 		vm_domain_pageout_lock(vmd);
 
 		/*
 		 * Release the running count while the pageout lock is held to
 		 * prevent wakeup races.
 		 */
 		blockcount_release(&vmd->vmd_inactive_running, 1);
 	}
 }
 
 /*
  * Return the number of pageout threads to run per domain.
  *
  * On the first call the vm.pageout_threads_per_domain setting is validated
  * against the range [1, half the CPUs per domain], clamped if necessary
  * (with a console message), and the result is cached in
  * pageout_threads_per_domain.  Later calls return the cached value.
  */
 static int
 get_pageout_threads_per_domain(void)
 {
 	static bool resolved = false;
 	int half_cpus_per_dom;
 
 	/*
 	 * This is serialized externally by the sorted autoconfig portion of
 	 * boot.
 	 */
 	if (__predict_true(resolved))
 		return (pageout_threads_per_domain);
 
 	/*
 	 * Semi-arbitrarily constrain pagedaemon threads to less than half the
 	 * total number of threads in the system as an insane upper limit.
 	 */
 	half_cpus_per_dom = howmany(mp_ncpus / vm_ndomains, 2);
 
 	/*
 	 * With more domains than CPUs the integer division above yields
 	 * zero.  Never let the upper bound drop below the lower bound of
 	 * one, otherwise a valid setting of 1 would be reported as invalid
 	 * and "clamped" to 0 threads.
 	 */
 	if (half_cpus_per_dom < 1)
 		half_cpus_per_dom = 1;
 
 	if (pageout_threads_per_domain < 1) {
 		printf("Invalid tuneable vm.pageout_threads_per_domain value: "
 		    "%d out of valid range: [1-%d]; clamping to 1\n",
 		    pageout_threads_per_domain, half_cpus_per_dom);
 		pageout_threads_per_domain = 1;
 	} else if (pageout_threads_per_domain > half_cpus_per_dom) {
 		printf("Invalid tuneable vm.pageout_threads_per_domain value: "
 		    "%d out of valid range: [1-%d]; clamping to %d\n",
 		    pageout_threads_per_domain, half_cpus_per_dom,
 		    half_cpus_per_dom);
 		pageout_threads_per_domain = half_cpus_per_dom;
 	}
 	resolved = true;
 	return (pageout_threads_per_domain);
 }
 
 /*
  * Initialize basic pageout daemon settings.  See the comment above the
  * definition of vm_domain for some explanation of how these thresholds are
  * used.
  *
  * "domain" is the index of the domain to set up.  Besides the free page
  * thresholds this also initializes the domain's PID controller, attaches
  * its "pidctrl" sysctl node, and records the number of inactive scan
  * threads.
  */
 static void
 vm_pageout_init_domain(int domain)
 {
 	struct vm_domain *vmd;
 	struct sysctl_oid *oid;
 
 	vmd = VM_DOMAIN(domain);
 	vmd->vmd_interrupt_free_min = 2;
 
 	/*
 	 * v_free_reserved needs to include enough for the largest
 	 * swap pager structures plus enough for any pv_entry structs
 	 * when paging. 
 	 */
 	vmd->vmd_pageout_free_min = 2 * MAXBSIZE / PAGE_SIZE +
 	    vmd->vmd_interrupt_free_min;
 	vmd->vmd_free_reserved = vm_pageout_page_count +
 	    vmd->vmd_pageout_free_min + vmd->vmd_page_count / 768;
 	/*
 	 * The order of the next statements matters: vmd_free_severe and
 	 * vmd_free_target are derived from the base vmd_free_min (0.5% of
 	 * the domain's pages) before vmd_free_reserved is added to
 	 * vmd_free_min and vmd_free_severe.
 	 */
 	vmd->vmd_free_min = vmd->vmd_page_count / 200;
 	vmd->vmd_free_severe = vmd->vmd_free_min / 2;
 	vmd->vmd_free_target = 4 * vmd->vmd_free_min + vmd->vmd_free_reserved;
 	vmd->vmd_free_min += vmd->vmd_free_reserved;
 	vmd->vmd_free_severe += vmd->vmd_free_reserved;
 	/* 1.5 times the free target, capped at a third of the free pages. */
 	vmd->vmd_inactive_target = (3 * vmd->vmd_free_target) / 2;
 	if (vmd->vmd_inactive_target > vmd->vmd_free_count / 3)
 		vmd->vmd_inactive_target = vmd->vmd_free_count / 3;
 
 	/*
 	 * Set the default wakeup threshold to be 10% below the paging
 	 * target.  This keeps the steady state out of shortfall.
 	 */
 	vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_target / 10) * 9;
 
 	/*
 	 * Target amount of memory to move out of the laundry queue during a
 	 * background laundering.  This is proportional to the amount of system
 	 * memory.
 	 */
 	vmd->vmd_background_launder_target = (vmd->vmd_free_target -
 	    vmd->vmd_free_min) / 10;
 
 	/* Initialize the pageout daemon pid controller. */
 	pidctrl_init(&vmd->vmd_pid, hz / VM_INACT_SCAN_RATE,
 	    vmd->vmd_free_target, PIDCTRL_BOUND,
 	    PIDCTRL_KPD, PIDCTRL_KID, PIDCTRL_KDD);
 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
 	    "pidctrl", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	pidctrl_init_sysctl(&vmd->vmd_pid, SYSCTL_CHILDREN(oid));
 
 	vmd->vmd_inactive_threads = get_pageout_threads_per_domain();
 }
 
 static void
 vm_pageout_init(void)
 {
 	u_int freecount;
 	int i;
 
 	/*
 	 * Initialize some paging parameters.
 	 */
 	if (vm_cnt.v_page_count < 2000)
 		vm_pageout_page_count = 8;
 
 	freecount = 0;
 	for (i = 0; i < vm_ndomains; i++) {
 		struct vm_domain *vmd;
 
 		vm_pageout_init_domain(i);
 		vmd = VM_DOMAIN(i);
 		vm_cnt.v_free_reserved += vmd->vmd_free_reserved;
 		vm_cnt.v_free_target += vmd->vmd_free_target;
 		vm_cnt.v_free_min += vmd->vmd_free_min;
 		vm_cnt.v_inactive_target += vmd->vmd_inactive_target;
 		vm_cnt.v_pageout_free_min += vmd->vmd_pageout_free_min;
 		vm_cnt.v_interrupt_free_min += vmd->vmd_interrupt_free_min;
 		vm_cnt.v_free_severe += vmd->vmd_free_severe;
 		freecount += vmd->vmd_free_count;
 	}
 
 	/*
 	 * Set interval in seconds for active scan.  We want to visit each
 	 * page at least once every ten minutes.  This is to prevent worst
 	 * case paging behaviors with stale active LRU.
 	 */
 	if (vm_pageout_update_period == 0)
 		vm_pageout_update_period = 600;
 
 	if (vm_page_max_user_wired == 0)
 		vm_page_max_user_wired = freecount / 3;
 }
 
 /*
  *     vm_pageout is the high level pageout daemon.
  *
  * It starts, for every non-empty domain, a worker thread (except for the
  * first such domain, which is served by the calling thread itself), the
  * configured number of helper threads minus one, and a laundry thread.
  * It then starts the UMA reclaim thread, renames the calling thread and
  * enters vm_pageout_worker() for the first non-empty domain.  Failure to
  * create any of the threads is fatal.
  */
 static void
 vm_pageout(void)
 {
 	struct proc *p;
 	struct thread *td;
 	int error, first, i, j, pageout_threads;
 
 	p = curproc;
 	td = curthread;
 	pageout_threads = get_pageout_threads_per_domain();
 
 	mtx_init(&vm_oom_ratelim_mtx, "vmoomr", NULL, MTX_DEF);
 	swap_pager_swap_init();
 	for (first = -1, i = 0; i < vm_ndomains; i++) {
 		if (VM_DOMAIN_EMPTY(i)) {
 			if (bootverbose)
 				printf("domain %d empty; skipping pageout\n",
 				    i);
 			continue;
 		}
 		/*
 		 * The first non-empty domain is handled by this thread at
 		 * the end of the function; all others get their own worker.
 		 */
 		if (first == -1)
 			first = i;
 		else {
 			error = kthread_add(vm_pageout_worker,
 			    (void *)(uintptr_t)i, p, NULL, 0, 0, "dom%d", i);
 			if (error != 0)
 				panic("starting pageout for domain %d: %d",
 				    i, error);
 		}
 		for (j = 0; j < pageout_threads - 1; j++) {
 			error = kthread_add(vm_pageout_helper,
 			    (void *)(uintptr_t)i, p, NULL, 0, 0,
 			    "dom%d helper%d", i, j);
 			if (error != 0)
 				panic("starting pageout helper %d for domain "
 				    "%d: %d", j, i, error);
 		}
 		error = kthread_add(vm_pageout_laundry_worker,
 		    (void *)(uintptr_t)i, p, NULL, 0, 0, "laundry: dom%d", i);
 		if (error != 0)
 			panic("starting laundry for domain %d: %d", i, error);
 	}
 	error = kthread_add(uma_reclaim_worker, NULL, p, NULL, 0, 0, "uma");
 	if (error != 0)
 		panic("starting uma_reclaim helper, error %d", error);
 
 	snprintf(td->td_name, sizeof(td->td_name), "dom%d", first);
 	vm_pageout_worker((void *)(uintptr_t)first);
 }
 
 /*
  * Perform an advisory wakeup of the page daemon.
  *
  * "domain" selects the domain whose page daemon is to be woken.  The call
  * is a no-op when made from the page daemon process itself.  The caller
  * must not hold the domain's pageout lock.
  */
 void
 pagedaemon_wakeup(int domain)
 {
 	struct vm_domain *vmd;
 
 	vmd = VM_DOMAIN(domain);
 	vm_domain_pageout_assert_unlocked(vmd);
 	if (curproc == pageproc)
 		return;
 
 	/*
 	 * Only the caller that sees vmd_pageout_wanted change from zero
 	 * takes the lock and issues the wakeup; the flag is then stored
 	 * back as exactly 1 under the lock.
 	 */
 	if (atomic_fetchadd_int(&vmd->vmd_pageout_wanted, 1) == 0) {
 		vm_domain_pageout_lock(vmd);
 		atomic_store_int(&vmd->vmd_pageout_wanted, 1);
 		wakeup(&vmd->vmd_pageout_wanted);
 		vm_domain_pageout_unlock(vmd);
 	}
 }
Index: projects/clang1100-import
===================================================================
--- projects/clang1100-import	(revision 364278)
+++ projects/clang1100-import	(revision 364279)

Property changes on: projects/clang1100-import
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r364264-364278