Index: projects/capsicum-test/lib/clang/llvm.build.mk
===================================================================
--- projects/capsicum-test/lib/clang/llvm.build.mk	(revision 345709)
+++ projects/capsicum-test/lib/clang/llvm.build.mk	(revision 345710)
@@ -1,106 +1,106 @@
 # $FreeBSD$
 
 .include <src.opts.mk>
 
 .ifndef LLVM_SRCS
 .error Please define LLVM_SRCS before including this file
 .endif
 
 .ifndef SRCDIR
 .error Please define SRCDIR before including this file
 .endif
 
 .PATH:		${LLVM_SRCS}/${SRCDIR}
 
 CFLAGS+=	-I${SRCTOP}/lib/clang/include
 CFLAGS+=	-I${LLVM_SRCS}/include
 CFLAGS+=	-DLLVM_BUILD_GLOBAL_ISEL
 CFLAGS+=	-D__STDC_LIMIT_MACROS
 CFLAGS+=	-D__STDC_CONSTANT_MACROS
 #CFLAGS+=	-DNDEBUG
 
 TARGET_ARCH?=	${MACHINE_ARCH}
 BUILD_ARCH?=	${MACHINE_ARCH}
 
 # Armv6 and armv7 uses hard float abi, unless the CPUTYPE has soft in it.
 # arm (for armv4 and armv5 CPUs) always uses the soft float ABI.
 # For all other targets, we stick with 'unknown'.
 .if ${TARGET_ARCH:Marmv[67]*} && (!defined(CPUTYPE) || ${CPUTYPE:M*soft*} == "")
 TARGET_ABI=	-gnueabihf
 .elif ${TARGET_ARCH:Marm*}
 TARGET_ABI=	-gnueabi
 .else
 TARGET_ABI=
 .endif
 VENDOR=		unknown
 OS_VERSION=	freebsd13.0
 
 LLVM_TARGET_TRIPLE?=	${TARGET_ARCH:C/amd64/x86_64/:C/arm64/aarch64/}-${VENDOR}-${OS_VERSION}${TARGET_ABI}
 LLVM_BUILD_TRIPLE?=	${BUILD_ARCH:C/amd64/x86_64/:C/arm64/aarch64/}-${VENDOR}-${OS_VERSION}
 
 CFLAGS+=	-DLLVM_DEFAULT_TARGET_TRIPLE=\"${LLVM_TARGET_TRIPLE}\"
 CFLAGS+=	-DLLVM_HOST_TRIPLE=\"${LLVM_BUILD_TRIPLE}\"
 CFLAGS+=	-DDEFAULT_SYSROOT=\"${TOOLS_PREFIX}\"
 
 .if ${MK_LLVM_TARGET_AARCH64} != "no"
 CFLAGS+=	-DLLVM_TARGET_ENABLE_AARCH64
 . if ${MACHINE_CPUARCH} == "aarch64"
 LLVM_NATIVE_ARCH=	AArch64
 . endif
 .endif
 .if ${MK_LLVM_TARGET_ARM} != "no"
 CFLAGS+=	-DLLVM_TARGET_ENABLE_ARM
 . if ${MACHINE_CPUARCH} == "arm"
 LLVM_NATIVE_ARCH=	ARM
 . endif
 .endif
 .if ${MK_LLVM_TARGET_BPF} != "no"
 CFLAGS+=	-DLLVM_TARGET_ENABLE_BPF
 .endif
 .if ${MK_LLVM_TARGET_MIPS} != "no"
 CFLAGS+=	-DLLVM_TARGET_ENABLE_MIPS
 . if ${MACHINE_CPUARCH} == "mips"
 LLVM_NATIVE_ARCH=	Mips
 . endif
 .endif
 .if ${MK_LLVM_TARGET_POWERPC} != "no"
 CFLAGS+=	-DLLVM_TARGET_ENABLE_POWERPC
 . if ${MACHINE_CPUARCH} == "powerpc"
 LLVM_NATIVE_ARCH=	PowerPC
 . endif
 .endif
 .if ${MK_LLVM_TARGET_SPARC} != "no"
 CFLAGS+=	-DLLVM_TARGET_ENABLE_SPARC
 . if ${MACHINE_CPUARCH} == "sparc64"
 LLVM_NATIVE_ARCH=	Sparc
 . endif
 .endif
 .if ${MK_LLVM_TARGET_X86} != "no"
 CFLAGS+=	-DLLVM_TARGET_ENABLE_X86
 . if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64"
 LLVM_NATIVE_ARCH=	X86
 . endif
 .endif
 
 .ifdef LLVM_NATIVE_ARCH
 CFLAGS+=	-DLLVM_NATIVE_ASMPARSER=LLVMInitialize${LLVM_NATIVE_ARCH}AsmParser
 CFLAGS+=	-DLLVM_NATIVE_ASMPRINTER=LLVMInitialize${LLVM_NATIVE_ARCH}AsmPrinter
 CFLAGS+=	-DLLVM_NATIVE_DISASSEMBLER=LLVMInitialize${LLVM_NATIVE_ARCH}Disassembler
 CFLAGS+=	-DLLVM_NATIVE_TARGET=LLVMInitialize${LLVM_NATIVE_ARCH}Target
 CFLAGS+=	-DLLVM_NATIVE_TARGETINFO=LLVMInitialize${LLVM_NATIVE_ARCH}TargetInfo
 CFLAGS+=	-DLLVM_NATIVE_TARGETMC=LLVMInitialize${LLVM_NATIVE_ARCH}TargetMC
 .endif
 
 CFLAGS+=	-ffunction-sections
 CFLAGS+=	-fdata-sections
 LDFLAGS+=	-Wl,--gc-sections
 
-CXXFLAGS+=	-std=c++11
+CXXSTD?=	c++11
 CXXFLAGS+=	-fno-exceptions
 CXXFLAGS+=	-fno-rtti
 CXXFLAGS.clang+= -stdlib=libc++
 
 .if ${MACHINE_CPUARCH} == "arm"
 STATIC_CFLAGS+= -mlong-calls
 STATIC_CXXFLAGS+= -mlong-calls
 .endif
Index: projects/capsicum-test/lib/libc/aarch64/static_tls.h
===================================================================
--- projects/capsicum-test/lib/libc/aarch64/static_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libc/aarch64/static_tls.h	(revision 345710)
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _LIBC_AARCH64_STATIC_TLS_H
+#define _LIBC_AARCH64_STATIC_TLS_H
+
+static __inline uintptr_t
+_libc_get_static_tls_base(size_t offset)
+{
+	uintptr_t tlsbase;
+
+	__asm __volatile("mrs	%x0, tpidr_el0" : "=r" (tlsbase));
+	tlsbase += offset;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libc/aarch64/static_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libc/amd64/static_tls.h
===================================================================
--- projects/capsicum-test/lib/libc/amd64/static_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libc/amd64/static_tls.h	(revision 345710)
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _LIBC_AMD64_STATIC_TLS_H
+#define	_LIBC_AMD64_STATIC_TLS_H
+
+static __inline uintptr_t
+_libc_get_static_tls_base(size_t offset)
+{
+	uintptr_t tlsbase;
+
+	__asm __volatile("movq %%fs:0, %0" : "=r" (tlsbase));
+	tlsbase -= offset;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libc/amd64/static_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libc/arm/static_tls.h
===================================================================
--- projects/capsicum-test/lib/libc/arm/static_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libc/arm/static_tls.h	(revision 345710)
@@ -0,0 +1,51 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _LIBC_ARM_STATIC_TLS_H
+#define _LIBC_ARM_STATIC_TLS_H
+
+static __inline uintptr_t
+_libc_get_static_tls_base(size_t offset)
+{
+	uintptr_t tlsbase;
+
+#ifdef ARM_TP_ADDRESS
+	tlsbase = *(uintptr_t *)ARM_TP_ADDRESS;
+#else
+	__asm __volatile("mrc  p15, 0, %0, c13, c0, 3" : "=r" (tlsbase));
+#endif
+
+	tlsbase += offset;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libc/arm/static_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libc/gen/Symbol.map
===================================================================
--- projects/capsicum-test/lib/libc/gen/Symbol.map	(revision 345709)
+++ projects/capsicum-test/lib/libc/gen/Symbol.map	(revision 345710)
@@ -1,549 +1,550 @@
 /*
  * $FreeBSD$
  */
 
 FBSD_1.0 {
 	__xuname;
 	pthread_atfork;
 	pthread_attr_destroy;
 	pthread_attr_getdetachstate;
 	pthread_attr_getguardsize;
 	pthread_attr_getinheritsched;
 	pthread_attr_getschedparam;
 	pthread_attr_getschedpolicy;
 	pthread_attr_getscope;
 	pthread_attr_getstackaddr;
 	pthread_attr_getstacksize;
 	pthread_attr_init;
 	pthread_attr_setdetachstate;
 	pthread_attr_setguardsize;
 	pthread_attr_setinheritsched;
 	pthread_attr_setschedparam;
 	pthread_attr_setschedpolicy;
 	pthread_attr_setscope;
 	pthread_attr_setstackaddr;
 	pthread_attr_setstacksize;
 	pthread_cancel;
 	pthread_cleanup_pop;
 	pthread_cleanup_push;
 	pthread_cond_broadcast;
 	pthread_cond_destroy;
 	pthread_cond_init;
 	pthread_cond_signal;
 	pthread_cond_timedwait;
 	pthread_cond_wait;
 	pthread_detach;
 	pthread_equal;
 	pthread_exit;
 	pthread_getspecific;
 	pthread_join;
 	pthread_key_create;
 	pthread_key_delete;
 	pthread_kill;
 	pthread_main_np;
 	pthread_mutex_destroy;
 	pthread_mutex_init;
 	pthread_mutex_lock;
 	pthread_mutex_trylock;
 	pthread_mutex_unlock;
 	pthread_mutexattr_destroy;
 	pthread_mutexattr_init;
 	pthread_mutexattr_settype;
 	pthread_once;
 	pthread_rwlock_destroy;
 	pthread_rwlock_init;
 	pthread_rwlock_rdlock;
 	pthread_rwlock_tryrdlock;
 	pthread_rwlock_trywrlock;
 	pthread_rwlock_unlock;
 	pthread_rwlock_wrlock;
 	pthread_self;
 	pthread_setcancelstate;
 	pthread_setcanceltype;
 	pthread_setspecific;
 	pthread_sigmask;
 	pthread_testcancel;
 	alarm;
 	arc4random;
 	__assert;
 	check_utility_compat;
 	clock;
 	closedir;
 	confstr;
 	ctermid;
 	ctermid_r;
 	daemon;
 	getdiskbyname;
 	dladdr;
 	dlclose;
 	dlerror;
 	dlfunc;
 	dllockinit;
 	dlopen;
 	dlsym;
 	dlvsym;
 	dlinfo;
 	dl_iterate_phdr;
 	drand48;
 	erand48;
 	err_set_file;
 	err_set_exit;
 	err;
 	verr;
 	errc;
 	verrc;
 	errx;
 	verrx;
 	warn;
 	vwarn;
 	warnc;
 	vwarnc;
 	warnx;
 	vwarnx;
 	sys_errlist;
 	sys_nerr;
 	errno;
 	exect;
 	execl;
 	execle;
 	execlp;
 	execv;
 	execvp;
 	execvP;
 	fmtcheck;
 	fmtmsg;
 	fnmatch;
 	__fpclassifyf;
 	__fpclassifyd;
 	__fpclassifyl;
 	frexp;
 	setfstab;
 	getfstab;
 	getfsent;
 	getfsspec;
 	getfsfile;
 	setfsent;
 	endfsent;
 	ftok;
 	getbootfile;
 	getbsize;
 	cgetset;
 	cgetcap;
 	cgetent;
 	cgetmatch;
 	cgetfirst;
 	cgetclose;
 	cgetnext;
 	cgetstr;
 	cgetustr;
 	cgetnum;
 	getcwd;
 	getdomainname;
 	setgrent;
 	setgroupent;
 	endgrent;
 	getgrent_r;
 	getgrnam_r;
 	getgrgid_r;
 	getgrnam;
 	getgrgid;
 	getgrent;
 	/*
 	 * Why are __gr_parse_entry() and __gr_match_entry() not static in
 	 * gen/getgrent.c?
 	 */
 	getgrouplist;
 	gethostname;
 	getloadavg;
 	getlogin;
 	getlogin_r;
 	setnetgrent;
 	getnetgrent;
 	endnetgrent;
 	innetgr;
 	getosreldate;
 	getpagesize;
 	getpeereid;
 	_getprogname;
 	getprogname;
 	setpwent;
 	setpassent;
 	endpwent;
 	getpwent_r;
 	getpwnam_r;
 	getpwuid_r;
 	getpwnam;
 	getpwuid;
 	getpwent;
 	getttynam;
 	getttyent;
 	setttyent;
 	endttyent;
 	isdialuptty;
 	isnettty;
 	getusershell;
 	endusershell;
 	setusershell;
 	getvfsbyname;
 	__isnan;
 	isnan;
 	__isnanf;
 	isnanf;
 	__isinf;
 	isinf;
 	__isinff;
 	__isinfl;
 	isatty;
 	initgroups;
 	jrand48;
 	lcong48;
 	ldexp;
 	lockf;
 	lrand48;
 	modf;
 	mrand48;
 	nice;
 	nlist;
 	nrand48;
 	opendir;
 	pause;
 	posix_madvise;
 	popen;
 	pclose;
 	psignal;
 	raise;
 	readpassphrase;
 	getpass;
 	rewinddir;
 	seed48;
 	seekdir;
 	user_from_uid;
 	group_from_gid;
 	setdomainname;
 	sethostname;
 	longjmperror;
 	getmode;
 	setmode;
 	setproctitle;
 	setprogname;
 	siginterrupt;
 	sys_signame;
 	sys_siglist;
 	sys_nsig;
 	signal;
 	sigaddset;
 	sigdelset;
 	sigemptyset;
 	sigfillset;
 	sigismember;
 	sleep;
 	srand48;
 	fstatvfs;
 	statvfs;
 	sl_init;
 	sl_add;
 	sl_free;
 	sl_find;
 	fflagstostr;
 	strtofflags;
 	sysconf;
 	sysctl;
 	sysctlbyname;
 	sysctlnametomib;
 	syslog;
 	vsyslog;
 	openlog;
 	closelog;
 	setlogmask;
 	ttyname_r;
 	ttyname;
 	timezone;
 	times;
 	time;
 	telldir;
 	tcgetattr;
 	tcsetattr;
 	tcsetpgrp;
 	tcgetpgrp;
 	cfgetospeed;
 	cfgetispeed;
 	cfsetospeed;
 	cfsetispeed;
 	cfsetspeed;
 	cfmakeraw;
 	tcsendbreak;
 	_init_tls;
 	__tls_get_addr;
 	tcdrain;
 	tcflush;
 	tcflow;
 	ualarm;
 	ulimit;
 	uname;
 	strunvis;
 	strunvisx;
 	usleep;
 	utime;
 	valloc;
 	vis;
 	strvis;
 	strvisx;
 	wait;
 	wait3;
 	waitpid;
 	wordexp;
 	wordfree;
 };
 
 FBSD_1.1 {
 	arc4random_buf;
 	arc4random_uniform;
 	fdevname;
 	fdevname_r;
 	fdopendir;
 	feature_present;
 	posix_spawn;
 	posix_spawn_file_actions_addclose;
 	posix_spawn_file_actions_adddup2;
 	posix_spawn_file_actions_addopen;
 	posix_spawn_file_actions_destroy;
 	posix_spawn_file_actions_init;
 	posix_spawnattr_destroy;
 	posix_spawnattr_getflags;
 	posix_spawnattr_getpgroup;
 	posix_spawnattr_getschedparam;
 	posix_spawnattr_getschedpolicy;
 	posix_spawnattr_getsigdefault;
 	posix_spawnattr_getsigmask;
 	posix_spawnattr_init;
 	posix_spawnattr_setflags;
 	posix_spawnattr_setpgroup;
 	posix_spawnattr_setschedparam;
 	posix_spawnattr_setschedpolicy;
 	posix_spawnattr_setsigdefault;
 	posix_spawnattr_setsigmask;
 	posix_spawnp;
 	semctl;
 	tcgetsid;
 	tcsetsid;
 	__pthread_cleanup_pop_imp;
 	__pthread_cleanup_push_imp;
 };
 
 FBSD_1.2 {
 	cfmakesane;
 	endutxent;
 	getpagesizes;
 	getutxent;
 	getutxid;
 	getutxline;
 	getutxuser;
 	pututxline;
 	sem_close;
 	sem_destroy;
 	sem_getvalue;
 	sem_init;
 	sem_open;
 	sem_post;
 	sem_timedwait;
 	sem_trywait;
 	sem_unlink;
 	sem_wait;
 	setutxdb;
 	setutxent;
 };
 
 FBSD_1.3 {
 	clock_getcpuclockid;
 	dirfd;
 	dup3;
 	fdclosedir;
 	fdlopen;
 	__FreeBSD_libc_enter_restricted_mode;
 	getcontextx;
 	gid_from_group;
 	nvis;
 	pwcache_userdb;
 	pwcache_groupdb;
 	snvis;
 	strenvisx;
 	strnunvis;
 	strnunvisx;
 	strnvis;
 	strnvisx;
 	strsenvisx;
 	strsnvis;
 	strsnvisx;
 	strsvis;
 	strsvisx;
 	svis;
 	uid_from_user;
 	unvis;
 	waitid;
 };
 
 FBSD_1.4 {
 	getnetgrent_r;
 	pthread_mutex_consistent;
 	pthread_mutexattr_getrobust;
 	pthread_mutexattr_setrobust;
 	stravis;
 };
 
 FBSD_1.5 {
 	alphasort;
 	basename;
 	daemonfd;
 	devname;
 	devname_r;
 	dirname;
 	elf_aux_info;
 	fts_children;
 	fts_close;
 	fts_get_clientptr;
 	fts_get_stream;
 	fts_open;
 	fts_read;
 	fts_set;
 	fts_set_clientptr;
 	ftw;
 	getentropy;
 	getmntinfo;
 	glob;
 	globfree;
 	nftw;
 	readdir;
 	readdir_r;
 	scandir;
 	scandir_b;
 	sem_clockwait_np;
 	setproctitle_fast;
 	timespec_get;
 };
 
 FBSDprivate_1.0 {
 	/* needed by thread libraries */
 	__thr_jtable;
 
 	_pthread_atfork;
 	_pthread_attr_destroy;
 	_pthread_attr_getdetachstate;
 	_pthread_attr_getguardsize;
 	_pthread_attr_getinheritsched;
 	_pthread_attr_getschedparam;
 	_pthread_attr_getschedpolicy;
 	_pthread_attr_getscope;
 	_pthread_attr_getstackaddr;
 	_pthread_attr_getstacksize;
 	_pthread_attr_init;
 	_pthread_attr_setdetachstate;
 	_pthread_attr_setguardsize;
 	_pthread_attr_setinheritsched;
 	_pthread_attr_setschedparam;
 	_pthread_attr_setschedpolicy;
 	_pthread_attr_setscope;
 	_pthread_attr_setstackaddr;
 	_pthread_attr_setstacksize;
 	_pthread_cancel;
 	_pthread_cancel_enter;
 	_pthread_cancel_leave;
 	_pthread_cleanup_pop;
 	_pthread_cleanup_push;
 	_pthread_cond_broadcast;
 	_pthread_cond_destroy;
 	_pthread_cond_init;
 	_pthread_cond_signal;
 	_pthread_cond_timedwait;
 	_pthread_cond_wait;
 	_pthread_detach;
 	_pthread_equal;
 	_pthread_exit;
 	_pthread_getspecific;
 	_pthread_join;
 	_pthread_key_create;
 	_pthread_key_delete;
 	_pthread_kill;
 	_pthread_main_np;
 	_pthread_mutex_destroy;
 	_pthread_mutex_init_calloc_cb;
 	_pthread_mutex_init;
 	_pthread_mutex_lock;
 	_pthread_mutex_trylock;
 	_pthread_mutex_unlock;
 	_pthread_mutexattr_destroy;
 	_pthread_mutexattr_init;
 	_pthread_mutexattr_settype;
 	_pthread_once;
 	_pthread_rwlock_destroy;
 	_pthread_rwlock_init;
 	_pthread_rwlock_rdlock;
 	_pthread_rwlock_tryrdlock;
 	_pthread_rwlock_trywrlock;
 	_pthread_rwlock_unlock;
 	_pthread_rwlock_wrlock;
 	_pthread_self;
 	_pthread_setcancelstate;
 	_pthread_setcanceltype;
 	_pthread_setspecific;
 	_pthread_sigmask;
 	_pthread_testcancel;
 	_spinlock;
 	_spinunlock;
 	_rtld_addr_phdr;
 	_rtld_atfork_pre;
 	_rtld_atfork_post;
 	_rtld_error;		/* for private use */
 	_rtld_get_stack_prot;
 	_rtld_is_dlopened;
 	_rtld_thread_init;	/* for private use */
 	__elf_phdr_match_addr;
 	_err;
 	_warn;
 	__fmtcheck;
 	/* __pw_match_entry; */
 	/* __pw_parse_entry; */
 	__fdnlist;	/* used by libkvm */
 	/* __aout_fdnlist; */
 	/* __elf_is_okay__; */
 	/* __elf_fdnlist; */
 	__opendir2;
 	__pause;
 	_pause;
 	__raise;
 	_raise;
 	__sleep;
 	_sleep;
 	_rtld_allocate_tls;
 	_rtld_free_tls;
 #if defined(i386)
 	___libc_tls_get_addr;	/* x86 only */
 #endif
 	__libc_tls_get_addr;
 	__tcdrain;
 	_tcdrain;
 	__usleep;
 	_usleep;
 	__wait;
 	_wait;
 	__waitpid;
 	_waitpid;
 
 	_libc_sem_init_compat;
 	_libc_sem_destroy_compat;
 	_libc_sem_open_compat;
 	_libc_sem_close_compat;
 	_libc_sem_unlink_compat;
 	_libc_sem_wait_compat;
 	_libc_sem_trywait_compat;
 	_libc_sem_timedwait_compat;
 	_libc_sem_post_compat;
 	_libc_sem_getvalue_compat;
 
 	__libc_tcdrain;
 
 	__elf_aux_vector;
+	__pthread_distribute_static_tls;
 	__pthread_map_stacks_exec;
 	__fillcontextx;
 	__fillcontextx2;
 	__getcontextx_size;
 };
Index: projects/capsicum-test/lib/libc/gen/elf_utils.c
===================================================================
--- projects/capsicum-test/lib/libc/gen/elf_utils.c	(revision 345709)
+++ projects/capsicum-test/lib/libc/gen/elf_utils.c	(revision 345710)
@@ -1,99 +1,123 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2010 Konstantin Belousov <kib@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/types.h>
 #include <sys/mman.h>
 #include <sys/resource.h>
 #include <sys/sysctl.h>
 #include <link.h>
 #include <stddef.h>
+#include <string.h>
 #include "libc_private.h"
+#include "static_tls.h"
 
 int __elf_phdr_match_addr(struct dl_phdr_info *, void *);
 void __pthread_map_stacks_exec(void);
+void __pthread_distribute_static_tls(size_t, void *, size_t, size_t);
 
 int
 __elf_phdr_match_addr(struct dl_phdr_info *phdr_info, void *addr)
 {
 	const Elf_Phdr *ph;
 	int i;
 
 	for (i = 0; i < phdr_info->dlpi_phnum; i++) {
 		ph = &phdr_info->dlpi_phdr[i];
 		if (ph->p_type != PT_LOAD)
 			continue;
 
 		/* ELFv1 ABI for powerpc64 passes function descriptor
 		 * pointers around, not function pointers.  The function
 		 * descriptors live in .opd, which is a non-executable segment.
 		 * The PF_X check would therefore make all address checks fail,
 		 * causing a crash in some instances.  Don't skip over
 		 * non-executable segments in the ELFv1 powerpc64 case.
 		 */
 #if !defined(__powerpc64__) || (defined(_CALL_ELF) && _CALL_ELF == 2)
 		if ((ph->p_flags & PF_X) == 0)
 			continue;
 #endif
 
 		if (phdr_info->dlpi_addr + ph->p_vaddr <= (uintptr_t)addr &&
 		    (uintptr_t)addr + sizeof(addr) < phdr_info->dlpi_addr +
 		    ph->p_vaddr + ph->p_memsz)
 			break;
 	}
 	return (i != phdr_info->dlpi_phnum);
 }
 
 void
 __libc_map_stacks_exec(void)
 {
 	int mib[2];
 	struct rlimit rlim;
 	u_long usrstack;
 	size_t len;
 	
 	mib[0] = CTL_KERN;
 	mib[1] = KERN_USRSTACK;
 	len = sizeof(usrstack);
 	if (sysctl(mib, sizeof(mib) / sizeof(mib[0]), &usrstack, &len, NULL, 0)
 	    == -1)
 		return;
 	if (getrlimit(RLIMIT_STACK, &rlim) == -1)
 		return;
 	mprotect((void *)(uintptr_t)(usrstack - rlim.rlim_cur),
 	    rlim.rlim_cur, _rtld_get_stack_prot());
 }
 
 #pragma weak __pthread_map_stacks_exec
 void
 __pthread_map_stacks_exec(void)
 {
 
 	((void (*)(void))__libc_interposing[INTERPOS_map_stacks_exec])();
+}
+
+void
+__libc_distribute_static_tls(size_t offset, void *src, size_t len,
+    size_t total_len)
+{
+	uintptr_t tlsbase;
+
+	tlsbase = _libc_get_static_tls_base(offset);
+	memcpy((void *)tlsbase, src, len);
+	memset((char *)tlsbase + len, 0, total_len - len);
+}
+
+#pragma weak __pthread_distribute_static_tls
+void
+__pthread_distribute_static_tls(size_t offset, void *src, size_t len,
+    size_t total_len)
+{
+
+	((void (*)(size_t, void *, size_t, size_t))__libc_interposing[
+	    INTERPOS_distribute_static_tls])(offset, src, len, total_len);
 }
Index: projects/capsicum-test/lib/libc/i386/static_tls.h
===================================================================
--- projects/capsicum-test/lib/libc/i386/static_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libc/i386/static_tls.h	(revision 345710)
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _LIBC_I386_STATIC_TLS_H
+#define _LIBC_I386_STATIC_TLS_H
+
+static __inline uintptr_t
+_libc_get_static_tls_base(size_t offset)
+{
+	uintptr_t tlsbase;
+
+	__asm __volatile("movl %%gs:0, %0" : "=r" (tlsbase));
+	tlsbase -= offset;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libc/i386/static_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libc/include/libc_private.h
===================================================================
--- projects/capsicum-test/lib/libc/include/libc_private.h	(revision 345709)
+++ projects/capsicum-test/lib/libc/include/libc_private.h	(revision 345710)
@@ -1,427 +1,430 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1998 John Birrell <jb@cimlogic.com.au>.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  * Private definitions for libc, libc_r and libpthread.
  *
  */
 
 #ifndef _LIBC_PRIVATE_H_
 #define _LIBC_PRIVATE_H_
 #include <sys/_types.h>
 #include <sys/_pthreadtypes.h>
 
 /*
  * This global flag is non-zero when a process has created one
  * or more threads. It is used to avoid calling locking functions
  * when they are not required.
  */
 #ifndef __LIBC_ISTHREADED_DECLARED
 #define __LIBC_ISTHREADED_DECLARED
 extern int	__isthreaded;
 #endif
 
 /*
  * Elf_Auxinfo *__elf_aux_vector, the pointer to the ELF aux vector
  * provided by kernel. Either set for us by rtld, or found at runtime
  * on stack for static binaries.
  *
  * Type is void to avoid polluting whole libc with ELF types.
  */
 extern void	*__elf_aux_vector;
 
 /*
  * libc should use libc_dlopen internally, which respects a global
  * flag where loading of new shared objects can be restricted.
  */
 void *libc_dlopen(const char *, int);
 
 /*
  * For dynamic linker.
  */
 void _rtld_error(const char *fmt, ...);
 
 /*
  * File lock contention is difficult to diagnose without knowing
  * where locks were set. Allow a debug library to be built which
  * records the source file and line number of each lock call.
  */
 #ifdef	_FLOCK_DEBUG
 #define _FLOCKFILE(x)	_flockfile_debug(x, __FILE__, __LINE__)
 #else
 #define _FLOCKFILE(x)	_flockfile(x)
 #endif
 
 /*
  * Macros for locking and unlocking FILEs. These test if the
  * process is threaded to avoid locking when not required.
  */
 #define	FLOCKFILE(fp)		if (__isthreaded) _FLOCKFILE(fp)
 #define	FUNLOCKFILE(fp)		if (__isthreaded) _funlockfile(fp)
 
 struct _spinlock;
 extern struct _spinlock __stdio_thread_lock __hidden;
 #define STDIO_THREAD_LOCK()				\
 do {							\
 	if (__isthreaded)				\
 		_SPINLOCK(&__stdio_thread_lock);	\
 } while (0)
 #define STDIO_THREAD_UNLOCK()				\
 do {							\
 	if (__isthreaded)				\
 		_SPINUNLOCK(&__stdio_thread_lock);	\
 } while (0)
 
 void		__libc_spinlock_stub(struct _spinlock *);
 void		__libc_spinunlock_stub(struct _spinlock *);
 
 /*
  * Indexes into the pthread jump table.
  *
  * Warning! If you change this type, you must also change the threads
  * libraries that reference it (libc_r, libpthread).
  */
 typedef enum {
 	PJT_ATFORK,
 	PJT_ATTR_DESTROY,
 	PJT_ATTR_GETDETACHSTATE,
 	PJT_ATTR_GETGUARDSIZE,
 	PJT_ATTR_GETINHERITSCHED,
 	PJT_ATTR_GETSCHEDPARAM,
 	PJT_ATTR_GETSCHEDPOLICY,
 	PJT_ATTR_GETSCOPE,
 	PJT_ATTR_GETSTACKADDR,
 	PJT_ATTR_GETSTACKSIZE,
 	PJT_ATTR_INIT,
 	PJT_ATTR_SETDETACHSTATE,
 	PJT_ATTR_SETGUARDSIZE,
 	PJT_ATTR_SETINHERITSCHED,
 	PJT_ATTR_SETSCHEDPARAM,
 	PJT_ATTR_SETSCHEDPOLICY,
 	PJT_ATTR_SETSCOPE,
 	PJT_ATTR_SETSTACKADDR,
 	PJT_ATTR_SETSTACKSIZE,
 	PJT_CANCEL,
 	PJT_CLEANUP_POP,
 	PJT_CLEANUP_PUSH,
 	PJT_COND_BROADCAST,
 	PJT_COND_DESTROY,
 	PJT_COND_INIT,
 	PJT_COND_SIGNAL,
 	PJT_COND_TIMEDWAIT,
 	PJT_COND_WAIT,
 	PJT_DETACH,
 	PJT_EQUAL,
 	PJT_EXIT,
 	PJT_GETSPECIFIC,
 	PJT_JOIN,
 	PJT_KEY_CREATE,
 	PJT_KEY_DELETE,
 	PJT_KILL,
 	PJT_MAIN_NP,
 	PJT_MUTEXATTR_DESTROY,
 	PJT_MUTEXATTR_INIT,
 	PJT_MUTEXATTR_SETTYPE,
 	PJT_MUTEX_DESTROY,
 	PJT_MUTEX_INIT,
 	PJT_MUTEX_LOCK,
 	PJT_MUTEX_TRYLOCK,
 	PJT_MUTEX_UNLOCK,
 	PJT_ONCE,
 	PJT_RWLOCK_DESTROY,
 	PJT_RWLOCK_INIT,
 	PJT_RWLOCK_RDLOCK,
 	PJT_RWLOCK_TRYRDLOCK,
 	PJT_RWLOCK_TRYWRLOCK,
 	PJT_RWLOCK_UNLOCK,
 	PJT_RWLOCK_WRLOCK,
 	PJT_SELF,
 	PJT_SETCANCELSTATE,
 	PJT_SETCANCELTYPE,
 	PJT_SETSPECIFIC,
 	PJT_SIGMASK,
 	PJT_TESTCANCEL,
 	PJT_CLEANUP_POP_IMP,
 	PJT_CLEANUP_PUSH_IMP,
 	PJT_CANCEL_ENTER,
 	PJT_CANCEL_LEAVE,
 	PJT_MUTEX_CONSISTENT,
 	PJT_MUTEXATTR_GETROBUST,
 	PJT_MUTEXATTR_SETROBUST,
 	PJT_MAX
 } pjt_index_t;
 
 typedef int (*pthread_func_t)(void);
 typedef pthread_func_t pthread_func_entry_t[2];
 
 extern pthread_func_entry_t __thr_jtable[];
 
 void	__set_error_selector(int *(*arg)(void));
 int	_pthread_mutex_init_calloc_cb_stub(pthread_mutex_t *mutex,
 	    void *(calloc_cb)(__size_t, __size_t));
 
 typedef int (*interpos_func_t)(void);
 interpos_func_t *__libc_interposing_slot(int interposno);
 extern interpos_func_t __libc_interposing[] __hidden;
 
 enum {
 	INTERPOS_accept,
 	INTERPOS_accept4,
 	INTERPOS_aio_suspend,
 	INTERPOS_close,
 	INTERPOS_connect,
 	INTERPOS_fcntl,
 	INTERPOS_fsync,
 	INTERPOS_fork,
 	INTERPOS_msync,
 	INTERPOS_nanosleep,
 	INTERPOS_openat,
 	INTERPOS_poll,
 	INTERPOS_pselect,
 	INTERPOS_recvfrom,
 	INTERPOS_recvmsg,
 	INTERPOS_select,
 	INTERPOS_sendmsg,
 	INTERPOS_sendto,
 	INTERPOS_setcontext,
 	INTERPOS_sigaction,
 	INTERPOS_sigprocmask,
 	INTERPOS_sigsuspend,
 	INTERPOS_sigwait,
 	INTERPOS_sigtimedwait,
 	INTERPOS_sigwaitinfo,
 	INTERPOS_swapcontext,
 	INTERPOS_system,
 	INTERPOS_tcdrain,
 	INTERPOS_read,
 	INTERPOS_readv,
 	INTERPOS_wait4,
 	INTERPOS_write,
 	INTERPOS_writev,
 	INTERPOS__pthread_mutex_init_calloc_cb,
 	INTERPOS_spinlock,
 	INTERPOS_spinunlock,
 	INTERPOS_kevent,
 	INTERPOS_wait6,
 	INTERPOS_ppoll,
 	INTERPOS_map_stacks_exec,
 	INTERPOS_fdatasync,
 	INTERPOS_clock_nanosleep,
+	INTERPOS_distribute_static_tls,
 	INTERPOS_MAX
 };
 
 /*
  * yplib internal interfaces
  */
 #ifdef YP
 int _yp_check(char **);
 #endif
 
 /*
  * Initialise TLS for static programs
  */
 void _init_tls(void);
 
 /*
  * Provides pthread_once()-like functionality for both single-threaded
  * and multi-threaded applications.
  */
 int _once(pthread_once_t *, void (*)(void));
 
 /*
  * Set the TLS thread pointer
  */
 void _set_tp(void *tp);
 
 /*
  * This is a pointer in the C run-time startup code. It is used
  * by getprogname() and setprogname().
  */
 extern const char *__progname;
 
 /*
  * This function is used by the threading libraries to notify malloc that a
  * thread is exiting.
  */
 void _malloc_thread_cleanup(void);
 
 /*
  * This function is used by the threading libraries to notify libc that a
  * thread is exiting, so its thread-local dtors should be called.
  */
 void __cxa_thread_call_dtors(void);
 int __cxa_thread_atexit_hidden(void (*dtor_func)(void *), void *obj,
     void *dso_symbol) __hidden;
 
 /*
  * These functions are used by the threading libraries in order to protect
  * malloc across fork().
  */
 void _malloc_prefork(void);
 void _malloc_postfork(void);
 
 void _malloc_first_thread(void);
 
 /*
  * Function to clean up streams, called from abort() and exit().
  */
 extern void (*__cleanup)(void) __hidden;
 
 /*
  * Get kern.osreldate to detect ABI revisions.  Explicitly
  * ignores value of $OSVERSION and caches result.
  */
 int __getosreldate(void);
 #include <sys/_types.h>
 #include <sys/_sigset.h>
 
 struct aiocb;
 struct fd_set;
 struct iovec;
 struct kevent;
 struct msghdr;
 struct pollfd;
 struct rusage;
 struct sigaction;
 struct sockaddr;
 struct stat;
 struct statfs;
 struct timespec;
 struct timeval;
 struct timezone;
 struct __siginfo;
 struct __ucontext;
 struct __wrusage;
 enum idtype;
 int		__sys_aio_suspend(const struct aiocb * const[], int,
 		    const struct timespec *);
 int		__sys_accept(int, struct sockaddr *, __socklen_t *);
 int		__sys_accept4(int, struct sockaddr *, __socklen_t *, int);
 int		__sys_clock_gettime(__clockid_t, struct timespec *ts);
 int		__sys_clock_nanosleep(__clockid_t, int,
 		    const struct timespec *, struct timespec *);
 int		__sys_close(int);
 int		__sys_connect(int, const struct sockaddr *, __socklen_t);
 int		__sys_fcntl(int, int, ...);
 int		__sys_fdatasync(int);
 int		__sys_fstat(int fd, struct stat *);
 int		__sys_fstatfs(int fd, struct statfs *);
 int		__sys_fstatat(int, const char *, struct stat *, int);
 int		__sys_fsync(int);
 __pid_t		__sys_fork(void);
 int		__sys_ftruncate(int, __off_t);
 __ssize_t	__sys_getdirentries(int, char *, __size_t, __off_t *);
 int		__sys_getfsstat(struct statfs *, long, int);
 int		__sys_gettimeofday(struct timeval *, struct timezone *);
 int		__sys_kevent(int, const struct kevent *, int, struct kevent *,
 		    int, const struct timespec *);
 __off_t		__sys_lseek(int, __off_t, int);
 void	       *__sys_mmap(void *, __size_t, int, int, int, __off_t);
 int		__sys_msync(void *, __size_t, int);
 int		__sys_nanosleep(const struct timespec *, struct timespec *);
 int		__sys_open(const char *, int, ...);
 int		__sys_openat(int, const char *, int, ...);
 int		__sys_pselect(int, struct fd_set *, struct fd_set *,
 		    struct fd_set *, const struct timespec *,
 		    const __sigset_t *);
 int		__sys_ptrace(int, __pid_t, char *, int);
 int		__sys_poll(struct pollfd *, unsigned, int);
 int		__sys_ppoll(struct pollfd *, unsigned, const struct timespec *,
 		    const __sigset_t *);
 __ssize_t	__sys_pread(int, void *, __size_t, __off_t);
 __ssize_t	__sys_pwrite(int, const void *, __size_t, __off_t);
 __ssize_t	__sys_read(int, void *, __size_t);
 __ssize_t	__sys_readv(int, const struct iovec *, int);
 __ssize_t	__sys_recv(int, void *, __size_t, int);
 __ssize_t	__sys_recvfrom(int, void *, __size_t, int, struct sockaddr *,
 		    __socklen_t *);
 __ssize_t	__sys_recvmsg(int, struct msghdr *, int);
 int		__sys_select(int, struct fd_set *, struct fd_set *,
 		    struct fd_set *, struct timeval *);
 __ssize_t	__sys_sendmsg(int, const struct msghdr *, int);
 __ssize_t	__sys_sendto(int, const void *, __size_t, int,
 		    const struct sockaddr *, __socklen_t);
 int		__sys_setcontext(const struct __ucontext *);
 int		__sys_sigaction(int, const struct sigaction *,
 		    struct sigaction *);
 int		__sys_sigprocmask(int, const __sigset_t *, __sigset_t *);
 int		__sys_sigsuspend(const __sigset_t *);
 int		__sys_sigtimedwait(const __sigset_t *, struct __siginfo *,
 		    const struct timespec *);
 int		__sys_sigwait(const __sigset_t *, int *);
 int		__sys_sigwaitinfo(const __sigset_t *, struct __siginfo *);
 int		__sys_statfs(const char *, struct statfs *);
 int		__sys_swapcontext(struct __ucontext *,
 		    const struct __ucontext *);
 int		__sys_thr_kill(long, int);
 int		__sys_thr_self(long *);
 int		__sys_truncate(const char *, __off_t);
 __pid_t		__sys_wait4(__pid_t, int *, int, struct rusage *);
 __pid_t		__sys_wait6(enum idtype, __id_t, int *, int,
 		    struct __wrusage *, struct __siginfo *);
 __ssize_t	__sys_write(int, const void *, __size_t);
 __ssize_t	__sys_writev(int, const struct iovec *, int);
 
 int		__libc_sigaction(int, const struct sigaction *,
 		    struct sigaction *) __hidden;
 int		__libc_sigprocmask(int, const __sigset_t *, __sigset_t *)
 		    __hidden;
 int		__libc_sigsuspend(const __sigset_t *) __hidden;
 int		__libc_sigwait(const __sigset_t * __restrict,
 		    int * restrict sig);
 int		__libc_system(const char *);
 int		__libc_tcdrain(int);
 int		__fcntl_compat(int fd, int cmd, ...);
 
 int		__sys_futimens(int fd, const struct timespec *times) __hidden;
 int		__sys_utimensat(int fd, const char *path,
 		    const struct timespec *times, int flag) __hidden;
 
 /* execve() with PATH processing to implement posix_spawnp() */
 int _execvpe(const char *, char * const *, char * const *);
 
 int _elf_aux_info(int aux, void *buf, int buflen);
 struct dl_phdr_info;
 int __elf_phdr_match_addr(struct dl_phdr_info *, void *);
 void __init_elf_aux_vector(void);
 void __libc_map_stacks_exec(void);
+void __libc_distribute_static_tls(__size_t, void *, __size_t, __size_t);
+__uintptr_t __libc_static_tls_base(__size_t);
 
 void	_pthread_cancel_enter(int);
 void	_pthread_cancel_leave(int);
 
 struct _pthread_cleanup_info;
 void	___pthread_cleanup_push_imp(void (*)(void *), void *,
 	    struct _pthread_cleanup_info *);
 void	___pthread_cleanup_pop_imp(int);
 
 void __throw_constraint_handler_s(const char * restrict msg, int error);
 
 #endif /* _LIBC_PRIVATE_H_ */
Index: projects/capsicum-test/lib/libc/mips/static_tls.h
===================================================================
--- projects/capsicum-test/lib/libc/mips/static_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libc/mips/static_tls.h	(revision 345710)
@@ -0,0 +1,64 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _LIBC_MIPS_STATIC_TLS_H
+#define _LIBC_MIPS_STATIC_TLS_H
+
+#include <machine/tls.h>
+
+static __inline uintptr_t
+_libc_get_static_tls_base(size_t offset)
+{
+	uintptr_t tlsbase;
+
+#if defined(__mips_n64)
+	__asm__ __volatile__ (
+	    ".set\tpush\n\t"
+	    ".set\tmips64r2\n\t"
+	    "rdhwr\t%0, $29\n\t"
+	    ".set\tpop"
+	    : "=r" (tlsbase));
+	tlsbase -= TLS_TP_OFFSET + TLS_TCB_SIZE;
+#else /* mips 32 */
+	__asm__ __volatile__ (
+	    ".set\tpush\n\t"
+	    ".set\tmips32r2\n\t"
+	    "rdhwr\t%0, $29\n\t"
+	    ".set\tpop"
+	    : "=r" (tlsbase));
+	tlsbase -= TLS_TP_OFFSET + TLS_TCB_SIZE;
+#endif /* ! __mips_n64 */
+	tlsbase += offset;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libc/mips/static_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libc/powerpc/static_tls.h
===================================================================
--- projects/capsicum-test/lib/libc/powerpc/static_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libc/powerpc/static_tls.h	(revision 345710)
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _LIBC_POWERPC_STATIC_TLS_H
+#define _LIBC_POWERPC_STATIC_TLS_H
+
+static __inline uintptr_t
+_libc_get_static_tls_base(size_t offset)
+{
+	uintptr_t tlsbase;
+
+	__asm __volatile("mr %0,2" : "=r"(tlsbase));
+	tlsbase += offset - 0x7008;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libc/powerpc/static_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libc/powerpc64/static_tls.h
===================================================================
--- projects/capsicum-test/lib/libc/powerpc64/static_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libc/powerpc64/static_tls.h	(revision 345710)
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _LIBC_POWERPC64_STATIC_TLS_H
+#define _LIBC_POWERPC64_STATIC_TLS_H
+
+static __inline uintptr_t
+_libc_get_static_tls_base(size_t offset)
+{
+	uintptr_t tlsbase;
+
+	__asm __volatile("mr %0,13" : "=r"(tlsbase));
+	tlsbase += offset - 0x7010;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libc/powerpc64/static_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libc/riscv/static_tls.h
===================================================================
--- projects/capsicum-test/lib/libc/riscv/static_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libc/riscv/static_tls.h	(revision 345710)
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _LIBC_RISCV_STATIC_TLS_H
+#define _LIBC_RISCV_STATIC_TLS_H
+
+static __inline uintptr_t
+_libc_get_static_tls_base(size_t offset)
+{
+	uintptr_t tlsbase;
+
+	__asm __volatile("mv %0, tp" : "=r"(tlsbase));
+	tlsbase += offset;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libc/riscv/static_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libc/sparc64/static_tls.h
===================================================================
--- projects/capsicum-test/lib/libc/sparc64/static_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libc/sparc64/static_tls.h	(revision 345710)
@@ -0,0 +1,44 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _LIBC_SPARC64_STATIC_TLS_H
+#define _LIBC_SPARC64_STATIC_TLS_H
+
+static __inline uintptr_t
+_libc_get_static_tls_base(size_t offset)
+{
+	register uintptr_t tlsbase __asm("%g7");
+
+	return (tlsbase + offset);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libc/sparc64/static_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libc/sys/interposing_table.c
===================================================================
--- projects/capsicum-test/lib/libc/sys/interposing_table.c	(revision 345709)
+++ projects/capsicum-test/lib/libc/sys/interposing_table.c	(revision 345710)
@@ -1,92 +1,93 @@
 /*
  * Copyright (c) 2014 The FreeBSD Foundation.
  * All rights reserved.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice(s), this list of conditions and the following disclaimer as
  *    the first lines of this file unmodified other than the possible
  *    addition of one or more copyright notices.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice(s), this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the
  *    distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include "libc_private.h"
 
 #define	SLOT(a, b) \
 	[INTERPOS_##a] = (interpos_func_t)b
 interpos_func_t __libc_interposing[INTERPOS_MAX] = {
 	SLOT(accept, __sys_accept),
 	SLOT(accept4, __sys_accept4),
 	SLOT(aio_suspend, __sys_aio_suspend),
 	SLOT(close, __sys_close),
 	SLOT(connect, __sys_connect),
 	SLOT(fcntl, __sys_fcntl),
 	SLOT(fsync, __sys_fsync),
 	SLOT(fork, __sys_fork),
 	SLOT(msync, __sys_msync),
 	SLOT(nanosleep, __sys_nanosleep),
 	SLOT(openat, __sys_openat),
 	SLOT(poll, __sys_poll),
 	SLOT(pselect, __sys_pselect),
 	SLOT(read, __sys_read),
 	SLOT(readv, __sys_readv),
 	SLOT(recvfrom, __sys_recvfrom),
 	SLOT(recvmsg, __sys_recvmsg),
 	SLOT(select, __sys_select),
 	SLOT(sendmsg, __sys_sendmsg),
 	SLOT(sendto, __sys_sendto),
 	SLOT(setcontext, __sys_setcontext),
 	SLOT(sigaction, __sys_sigaction),
 	SLOT(sigprocmask, __sys_sigprocmask),
 	SLOT(sigsuspend, __sys_sigsuspend),
 	SLOT(sigwait, __libc_sigwait),
 	SLOT(sigtimedwait, __sys_sigtimedwait),
 	SLOT(sigwaitinfo, __sys_sigwaitinfo),
 	SLOT(swapcontext, __sys_swapcontext),
 	SLOT(system, __libc_system),
 	SLOT(tcdrain, __libc_tcdrain),
 	SLOT(wait4, __sys_wait4),
 	SLOT(write, __sys_write),
 	SLOT(writev, __sys_writev),
 	SLOT(_pthread_mutex_init_calloc_cb, _pthread_mutex_init_calloc_cb_stub),
 	SLOT(spinlock, __libc_spinlock_stub),
 	SLOT(spinunlock, __libc_spinunlock_stub),
 	SLOT(kevent, __sys_kevent),
 	SLOT(wait6, __sys_wait6),
 	SLOT(ppoll, __sys_ppoll),
 	SLOT(map_stacks_exec, __libc_map_stacks_exec),
 	SLOT(fdatasync, __sys_fdatasync),
 	SLOT(clock_nanosleep, __sys_clock_nanosleep),
+	SLOT(distribute_static_tls, __libc_distribute_static_tls),
 };
 #undef SLOT
 
 interpos_func_t *
 __libc_interposing_slot(int interposno)
 {
 
 	return (&__libc_interposing[interposno]);
 }
Index: projects/capsicum-test/lib/libc/tests/stdlib/Makefile
===================================================================
--- projects/capsicum-test/lib/libc/tests/stdlib/Makefile	(revision 345709)
+++ projects/capsicum-test/lib/libc/tests/stdlib/Makefile	(revision 345710)
@@ -1,68 +1,68 @@
 # $FreeBSD$
 
 .include <src.opts.mk>
 
 ATF_TESTS_C+=		dynthr_test
 ATF_TESTS_C+=		heapsort_test
 ATF_TESTS_C+=		mergesort_test
 ATF_TESTS_C+=		qsort_test
 ATF_TESTS_C+=		set_constraint_handler_s_test
 ATF_TESTS_C+=		strfmon_test
 ATF_TESTS_C+=		tsearch_test
 .if ${COMPILER_FEATURES:Mc++11}
 ATF_TESTS_CXX+=		cxa_thread_atexit_test
 ATF_TESTS_CXX+=		cxa_thread_atexit_nothr_test
 .endif
 
 # Not sure why this isn't defined for all architectures, since most
 # have long double.
 .if ${MACHINE_CPUARCH} == "aarch64" || \
     ${MACHINE_CPUARCH} == "amd64" || \
     ${MACHINE_CPUARCH} == "i386"
 CFLAGS+=	-D__HAVE_LONG_DOUBLE
 .endif
 
 # TODO: t_getenv_thread, t_mi_vector_hash, t_strtoi
 NETBSD_ATF_TESTS_C+=	abs_test
 NETBSD_ATF_TESTS_C+=	atoi_test
 NETBSD_ATF_TESTS_C+=	div_test
 NETBSD_ATF_TESTS_C+=	getenv_test
 NETBSD_ATF_TESTS_C+=	exit_test
 NETBSD_ATF_TESTS_C+=	hsearch_test
 NETBSD_ATF_TESTS_C+=	posix_memalign_test
 NETBSD_ATF_TESTS_C+=	random_test
 NETBSD_ATF_TESTS_C+=	strtod_test
 NETBSD_ATF_TESTS_C+=	strtol_test
 NETBSD_ATF_TESTS_C+=	system_test
 
 # TODO: need to come up with a correct explanation of what the patch pho does
 # with h_atexit
 #ATF_TESTS_SH=	atexit_test
 NETBSD_ATF_TESTS_SH=	getopt_test
 
 .include "../Makefile.netbsd-tests"
 
 BINDIR=		${TESTSDIR}
 
 # TODO: see comment above
 #PROGS+=		h_atexit
 PROGS+=		h_getopt h_getopt_long
 
 CFLAGS+=	-I${.CURDIR}
 
-CXXFLAGS.cxa_thread_atexit_test+=	-std=c++11
-CXXFLAGS.cxa_thread_atexit_nothr_test+=	-std=c++11
+CXXSTD.cxa_thread_atexit_test=	c++11
+CXXSTD.cxa_thread_atexit_nothr_test=	c++11
 LIBADD.cxa_thread_atexit_test+=		pthread
 
 .for t in h_getopt h_getopt_long
 CFLAGS.$t+=	-I${LIBNETBSD_SRCDIR} -I${SRCTOP}/contrib/netbsd-tests
 LDFLAGS.$t+=	-L${LIBNETBSD_OBJDIR}
 
 LIBADD.${t}+=	netbsd util
 .endfor
 
 LIBADD.strtod_test+=		m
 
 SUBDIR+=	dynthr_mod
 
 .include <bsd.test.mk>
Index: projects/capsicum-test/lib/libc++/Makefile
===================================================================
--- projects/capsicum-test/lib/libc++/Makefile	(revision 345709)
+++ projects/capsicum-test/lib/libc++/Makefile	(revision 345710)
@@ -1,275 +1,273 @@
 # $FreeBSD$
 
 .include <src.opts.mk>
 
 PACKAGE=	clibs
 _LIBCXXRTDIR=	${SRCTOP}/contrib/libcxxrt
 HDRDIR=		${SRCTOP}/contrib/libc++/include
 SRCDIR=		${SRCTOP}/contrib/libc++/src
 CXXINCLUDEDIR=	${INCLUDEDIR}/c++/v${SHLIB_MAJOR}
 .if ${MACHINE_CPUARCH} == "arm"
 STATIC_CXXFLAGS+= -mlong-calls
 .endif
 
 .PATH: ${SRCDIR}
 
 LIB=		c++
 SHLIB_MAJOR=	1
 SHLIB_LDSCRIPT=	libc++.ldscript
 
 SRCS+=		algorithm.cpp
 SRCS+=		any.cpp
 SRCS+=		bind.cpp
 SRCS+=		charconv.cpp
 SRCS+=		chrono.cpp
 SRCS+=		condition_variable.cpp
 SRCS+=		debug.cpp
 SRCS+=		exception.cpp
 SRCS+=		functional.cpp
 SRCS+=		future.cpp
 SRCS+=		hash.cpp
 SRCS+=		ios.cpp
 SRCS+=		iostream.cpp
 SRCS+=		locale.cpp
 SRCS+=		memory.cpp
 SRCS+=		mutex.cpp
 SRCS+=		new.cpp
 SRCS+=		optional.cpp
 SRCS+=		random.cpp
 SRCS+=		regex.cpp
 SRCS+=		shared_mutex.cpp
 SRCS+=		stdexcept.cpp
 SRCS+=		string.cpp
 SRCS+=		strstream.cpp
 SRCS+=		system_error.cpp
 SRCS+=		thread.cpp
 SRCS+=		typeinfo.cpp
 SRCS+=		utility.cpp
 SRCS+=		valarray.cpp
 SRCS+=		variant.cpp
 SRCS+=		vector.cpp
 SRCS+=		filesystem/directory_iterator.cpp
 SRCS+=		filesystem/int128_builtins.cpp
 SRCS+=		filesystem/operations.cpp
 
 CXXRT_SRCS+=	auxhelper.cc
 CXXRT_SRCS+=	dynamic_cast.cc
 CXXRT_SRCS+=	exception.cc
 CXXRT_SRCS+=	guard.cc
 CXXRT_SRCS+=	libelftc_dem_gnu3.c
 CXXRT_SRCS+=	memory.cc
 CXXRT_SRCS+=	stdexcept.cc
 CXXRT_SRCS+=	terminate.cc
 CXXRT_SRCS+=	typeinfo.cc
 
 .for _S in ${CXXRT_SRCS}
 CLEANFILES+=	cxxrt_${_S}
 STATICOBJS+=	cxxrt_${_S:R}.o
 cxxrt_${_S}: ${_LIBCXXRTDIR}/${_S} .NOMETA
 	ln -sf ${.ALLSRC} ${.TARGET}
 .endfor
 
 WARNS=		0
 CFLAGS+=	-isystem ${HDRDIR}
 CFLAGS+=	-isystem ${_LIBCXXRTDIR}
 CFLAGS+=	-nostdinc++
 CFLAGS+=	-nostdlib
 CFLAGS+=	-D_LIBCPP_BUILDING_LIBRARY
 CFLAGS+=	-DLIBCXXRT
-.if empty(CXXFLAGS:M-std=*)
-CXXFLAGS+=	-std=c++11
-.endif
+CXXSTD=		c++11
 
 LIBADD+=	cxxrt
 INCSGROUPS=	STD EXP EXT
 
 STD_HEADERS+=	__bit_reference
 STD_HEADERS+=	__bsd_locale_defaults.h
 STD_HEADERS+=	__bsd_locale_fallbacks.h
 STD_HEADERS+=	__config
 STD_HEADERS+=	__debug
 STD_HEADERS+=	__errc
 STD_HEADERS+=	__functional_03
 STD_HEADERS+=	__functional_base
 STD_HEADERS+=	__functional_base_03
 STD_HEADERS+=	__hash_table
 STD_HEADERS+=	__libcpp_version
 STD_HEADERS+=	__locale
 STD_HEADERS+=	__mutex_base
 STD_HEADERS+=	__node_handle
 STD_HEADERS+=	__nullptr
 STD_HEADERS+=	__split_buffer
 STD_HEADERS+=	__sso_allocator
 STD_HEADERS+=	__std_stream
 STD_HEADERS+=	__string
 STD_HEADERS+=	__threading_support
 STD_HEADERS+=	__tree
 STD_HEADERS+=	__tuple
 STD_HEADERS+=	__undef_macros
 STD_HEADERS+=	algorithm
 STD_HEADERS+=	any
 STD_HEADERS+=	array
 STD_HEADERS+=	atomic
 STD_HEADERS+=	bit
 STD_HEADERS+=	bitset
 STD_HEADERS+=	cassert
 STD_HEADERS+=	ccomplex
 STD_HEADERS+=	cctype
 STD_HEADERS+=	cerrno
 STD_HEADERS+=	cfenv
 STD_HEADERS+=	cfloat
 STD_HEADERS+=	charconv
 STD_HEADERS+=	chrono
 STD_HEADERS+=	cinttypes
 STD_HEADERS+=	ciso646
 STD_HEADERS+=	climits
 STD_HEADERS+=	clocale
 STD_HEADERS+=	cmath
 STD_HEADERS+=	codecvt
 STD_HEADERS+=	compare
 STD_HEADERS+=	complex
 STD_HEADERS+=	complex.h
 STD_HEADERS+=	condition_variable
 STD_HEADERS+=	csetjmp
 STD_HEADERS+=	csignal
 STD_HEADERS+=	cstdarg
 STD_HEADERS+=	cstdbool
 STD_HEADERS+=	cstddef
 STD_HEADERS+=	cstdint
 STD_HEADERS+=	cstdio
 STD_HEADERS+=	cstdlib
 STD_HEADERS+=	cstring
 STD_HEADERS+=	ctgmath
 STD_HEADERS+=	ctime
 STD_HEADERS+=	ctype.h
 STD_HEADERS+=	cwchar
 STD_HEADERS+=	cwctype
 STD_HEADERS+=	deque
 STD_HEADERS+=	errno.h
 STD_HEADERS+=	exception
 STD_HEADERS+=	filesystem
 STD_HEADERS+=	float.h
 STD_HEADERS+=	forward_list
 STD_HEADERS+=	fstream
 STD_HEADERS+=	functional
 STD_HEADERS+=	future
 STD_HEADERS+=	initializer_list
 STD_HEADERS+=	inttypes.h
 STD_HEADERS+=	iomanip
 STD_HEADERS+=	ios
 STD_HEADERS+=	iosfwd
 STD_HEADERS+=	iostream
 STD_HEADERS+=	istream
 STD_HEADERS+=	iterator
 STD_HEADERS+=	limits
 STD_HEADERS+=	limits.h
 STD_HEADERS+=	list
 STD_HEADERS+=	locale
 STD_HEADERS+=	locale.h
 STD_HEADERS+=	map
 STD_HEADERS+=	math.h
 STD_HEADERS+=	memory
 STD_HEADERS+=	mutex
 STD_HEADERS+=	new
 STD_HEADERS+=	numeric
 STD_HEADERS+=	optional
 STD_HEADERS+=	ostream
 STD_HEADERS+=	queue
 STD_HEADERS+=	random
 STD_HEADERS+=	ratio
 STD_HEADERS+=	regex
 STD_HEADERS+=	scoped_allocator
 STD_HEADERS+=	set
 STD_HEADERS+=	setjmp.h
 STD_HEADERS+=	shared_mutex
 STD_HEADERS+=	span
 STD_HEADERS+=	sstream
 STD_HEADERS+=	stack
 STD_HEADERS+=	stdbool.h
 STD_HEADERS+=	stddef.h
 STD_HEADERS+=	stdexcept
 STD_HEADERS+=	stdint.h
 STD_HEADERS+=	stdio.h
 STD_HEADERS+=	stdlib.h
 STD_HEADERS+=	streambuf
 STD_HEADERS+=	string
 STD_HEADERS+=	string.h
 STD_HEADERS+=	string_view
 STD_HEADERS+=	strstream
 STD_HEADERS+=	system_error
 STD_HEADERS+=	tgmath.h
 STD_HEADERS+=	thread
 STD_HEADERS+=	tuple
 STD_HEADERS+=	type_traits
 STD_HEADERS+=	typeindex
 STD_HEADERS+=	typeinfo
 STD_HEADERS+=	unordered_map
 STD_HEADERS+=	unordered_set
 STD_HEADERS+=	utility
 STD_HEADERS+=	valarray
 STD_HEADERS+=	variant
 STD_HEADERS+=	vector
 STD_HEADERS+=	version
 STD_HEADERS+=	wchar.h
 STD_HEADERS+=	wctype.h
 
 RT_HEADERS+=	cxxabi.h
 RT_HEADERS+=	unwind-arm.h
 RT_HEADERS+=	unwind-itanium.h
 RT_HEADERS+=	unwind.h
 
 .for hdr in ${STD_HEADERS}
 STD+=		${HDRDIR}/${hdr}
 INCSLINKS+=	../${hdr} ${CXXINCLUDEDIR}/tr1/${hdr}
 .endfor
 .for hdr in ${RT_HEADERS}
 STD+=		${_LIBCXXRTDIR}/${hdr}
 .endfor
 STDDIR=		${CXXINCLUDEDIR}
 
 EXP_HEADERS+=	__config
 EXP_HEADERS+=	__memory
 EXP_HEADERS+=	algorithm
 EXP_HEADERS+=	any
 EXP_HEADERS+=	chrono
 EXP_HEADERS+=	coroutine
 EXP_HEADERS+=	deque
 EXP_HEADERS+=	filesystem
 EXP_HEADERS+=	forward_list
 EXP_HEADERS+=	functional
 EXP_HEADERS+=	iterator
 EXP_HEADERS+=	list
 EXP_HEADERS+=	map
 EXP_HEADERS+=	memory_resource
 EXP_HEADERS+=	numeric
 EXP_HEADERS+=	optional
 EXP_HEADERS+=	propagate_const
 EXP_HEADERS+=	ratio
 EXP_HEADERS+=	regex
 EXP_HEADERS+=	set
 EXP_HEADERS+=	simd
 EXP_HEADERS+=	string
 EXP_HEADERS+=	string_view
 EXP_HEADERS+=	system_error
 EXP_HEADERS+=	tuple
 EXP_HEADERS+=	type_traits
 EXP_HEADERS+=	unordered_map
 EXP_HEADERS+=	unordered_set
 EXP_HEADERS+=	utility
 EXP_HEADERS+=	vector
 
 .for hdr in ${EXP_HEADERS}
 EXP+=		${HDRDIR}/experimental/${hdr}
 .endfor
 EXPDIR=		${CXXINCLUDEDIR}/experimental
 
 EXT_HEADERS+=	__hash
 EXT_HEADERS+=	hash_map
 EXT_HEADERS+=	hash_set
 
 .for hdr in ${EXT_HEADERS}
 EXT+=		${HDRDIR}/ext/${hdr}
 .endfor
 EXTDIR=		${CXXINCLUDEDIR}/ext
 
 .include <bsd.lib.mk>
Index: projects/capsicum-test/lib/libc++experimental/Makefile
===================================================================
--- projects/capsicum-test/lib/libc++experimental/Makefile	(revision 345709)
+++ projects/capsicum-test/lib/libc++experimental/Makefile	(revision 345710)
@@ -1,27 +1,25 @@
 # $FreeBSD$
 
 .include <src.opts.mk>
 
 PACKAGE=	clibs
 SRCDIR=		${SRCTOP}/contrib/libc++
 
 LIB=		c++experimental
 NO_PIC=
 MK_PROFILE=	no
 
 .PATH:		${SRCDIR}/src/experimental
 
 SRCS+=		memory_resource.cpp
 
 WARNS?=		0
 CXXFLAGS+=	${PICFLAG}
 CXXFLAGS+=	-isystem ${SRCDIR}/include
 CXXFLAGS+=	-nostdinc++
 CXXFLAGS+=	-nostdlib
 CXXFLAGS+=	-D_LIBCPP_BUILDING_LIBRARY
 CXXFLAGS+=	-DLIBCXXRT
-.if empty(CXXFLAGS:M-std=*)
-CXXFLAGS+=	-std=c++14
-.endif
+CXXSTD=		c++14
 
 .include <bsd.lib.mk>
Index: projects/capsicum-test/lib/libc++fs/Makefile
===================================================================
--- projects/capsicum-test/lib/libc++fs/Makefile	(revision 345709)
+++ projects/capsicum-test/lib/libc++fs/Makefile	(revision 345710)
@@ -1,29 +1,27 @@
 # $FreeBSD$
 
 .include <src.opts.mk>
 
 PACKAGE=	clibs
 SRCDIR=		${SRCTOP}/contrib/libc++
 
 LIB=		c++fs
 NO_PIC=
 MK_PROFILE=	no
 
 .PATH:		${SRCDIR}/src/filesystem
 
 SRCS+=		directory_iterator.cpp
 SRCS+=		int128_builtins.cpp
 SRCS+=		operations.cpp
 
 WARNS?=		0
 CXXFLAGS+=	${PICFLAG}
 CXXFLAGS+=	-isystem ${SRCDIR}/include
 CXXFLAGS+=	-nostdinc++
 CXXFLAGS+=	-nostdlib
 CXXFLAGS+=	-D_LIBCPP_BUILDING_LIBRARY
 CXXFLAGS+=	-DLIBCXXRT
-.if empty(CXXFLAGS:M-std=*)
-CXXFLAGS+=	-std=c++14
-.endif
+CXXSTD=		c++14
 
 .include <bsd.lib.mk>
Index: projects/capsicum-test/lib/libclang_rt/Makefile.inc
===================================================================
--- projects/capsicum-test/lib/libclang_rt/Makefile.inc	(revision 345709)
+++ projects/capsicum-test/lib/libclang_rt/Makefile.inc	(revision 345710)
@@ -1,44 +1,44 @@
 # $FreeBSD$
 
 .include <bsd.compiler.mk>
 
 # armv[67] is a bit special since we allow a soft-floating version via
 # CPUTYPE matching *soft*. This variant may not actually work though.
 .if ${MACHINE_ARCH:Marmv[67]*} != "" && \
     (!defined(CPUTYPE) || ${CPUTYPE:M*soft*} == "")
 CRTARCH?=	armhf
 .else
 CRTARCH?=	${MACHINE_CPUARCH:C/amd64/x86_64/}
 .endif
 CRTSRC=		${SRCTOP}/contrib/compiler-rt
 
 .PATH:		${CRTSRC}/lib
 
 CLANGDIR=	/usr/lib/clang/8.0.0
 LIBDIR=		${CLANGDIR}/lib/freebsd
 SHLIBDIR=	${LIBDIR}
 
 NO_PIC=
 MK_PROFILE=	no
 
 WARNS?=		0
 
 SSP_CFLAGS=
 CFLAGS+=	-DNDEBUG
 CFLAGS+=	-DHAVE_RPC_XDR_H=0
 CFLAGS+=	-DHAVE_TIRPC_RPC_XDR_H=0
 CFLAGS+=	-DSANITIZER_SUPPORTS_WEAK_HOOKS=0
 CFLAGS+=	-DUBSAN_CAN_USE_CXXABI
 CFLAGS+=	${PICFLAG}
 CFLAGS+=	-fno-builtin
 CFLAGS+=	-fno-exceptions
 CXXFLAGS+=	-fno-rtti
 .if ${COMPILER_TYPE} == clang && ${COMPILER_VERSION} >= 30700
 CFLAGS+=	-fno-sanitize=safe-stack
 .endif
 CFLAGS+=	-fno-stack-protector
 CFLAGS+=	-funwind-tables
 CXXFLAGS+=	-fvisibility-inlines-hidden
 CXXFLAGS+=	-fvisibility=hidden
 CFLAGS+=	-I${CRTSRC}/lib
-CXXFLAGS+=	-std=c++11
+CXXSTD=		c++11
Index: projects/capsicum-test/lib/libcxxrt/Makefile
===================================================================
--- projects/capsicum-test/lib/libcxxrt/Makefile	(revision 345709)
+++ projects/capsicum-test/lib/libcxxrt/Makefile	(revision 345710)
@@ -1,30 +1,28 @@
 # $FreeBSD$
 
 PACKAGE=	clibs
 SRCDIR=		${SRCTOP}/contrib/libcxxrt
 
 SHLIB_MAJOR=	1
 SHLIBDIR?=	/lib
 
 .PATH: ${SRCDIR}
 
 LIB=		cxxrt
 
 SRCS+=		libelftc_dem_gnu3.c\
 		terminate.cc\
 		dynamic_cast.cc\
 		memory.cc\
 		auxhelper.cc\
 		exception.cc\
 		stdexcept.cc\
 		typeinfo.cc\
 		guard.cc
 
 WARNS=		0
 CFLAGS+=	-isystem ${SRCDIR} -nostdinc++
-.if empty(CXXFLAGS:M-std=*)
-CXXFLAGS+=	-std=c++11
-.endif
+CXXSTD=		c++11
 VERSION_MAP=	${.CURDIR}/Version.map
 
 .include <bsd.lib.mk>
Index: projects/capsicum-test/lib/libgcc_eh/Makefile.inc
===================================================================
--- projects/capsicum-test/lib/libgcc_eh/Makefile.inc	(revision 345709)
+++ projects/capsicum-test/lib/libgcc_eh/Makefile.inc	(revision 345710)
@@ -1,39 +1,37 @@
 # $FreeBSD$
 
 COMPILERRTDIR=	${SRCTOP}/contrib/compiler-rt
 UNWINDINCDIR=	${SRCTOP}/contrib/libunwind/include
 UNWINDSRCDIR=	${SRCTOP}/contrib/libunwind/src
 
 STATIC_CFLAGS+=${PICFLAG} -fvisibility=hidden -DVISIBILITY_HIDDEN
 
 .PATH: ${COMPILERRTDIR}/lib/builtins
 .PATH: ${UNWINDSRCDIR}
 SRCS_EXC+=	gcc_personality_v0.c
 SRCS_EXC+=	int_util.c
 SRCS_EXC+=	Unwind-EHABI.cpp
 SRCS_EXC+=	Unwind-sjlj.c
 SRCS_EXC+=	UnwindLevel1-gcc-ext.c
 SRCS_EXC+=	UnwindLevel1.c
 SRCS_EXC+=	UnwindRegistersRestore.S
 SRCS_EXC+=	UnwindRegistersSave.S
 SRCS_EXC+=	libunwind.cpp
 
 SRCS+=		${SRCS_EXC}
 .for file in ${SRCS_EXC:M*.c}
 CFLAGS.${file}+=	-fno-exceptions -funwind-tables
 .endfor
 .for file in ${SRCS_EXC:M*.cpp}
 CXXFLAGS.${file}+=	-fno-exceptions -funwind-tables
 .endfor
 
 CFLAGS+=	-I${UNWINDINCDIR} -I${.CURDIR} -D_LIBUNWIND_IS_NATIVE_ONLY
-.if empty(CXXFLAGS:M-std=*)
-CXXFLAGS+=	-std=c++11
-.endif
 CXXFLAGS+=	-fno-rtti
+CXXSTD=		c++11
 STATIC_CXXFLAGS+= -fvisibility=hidden -fPIC
 # Probably need to just move this earlier or use CXXFLAGS
 .if ${MK_DIRDEPS_BUILD} == "yes"
 # Avoid dependency on lib/libc++
 CFLAGS+=	-isystem ${SRCTOP}/contrib/libc++/include -nostdinc++
 .endif
Index: projects/capsicum-test/lib/libomp/Makefile
===================================================================
--- projects/capsicum-test/lib/libomp/Makefile	(revision 345709)
+++ projects/capsicum-test/lib/libomp/Makefile	(revision 345710)
@@ -1,71 +1,71 @@
 # $FreeBSD$
 
 SHLIB_NAME=	libomp.so
 
 OMPSRC=		${SRCTOP}/contrib/openmp/runtime/src
 ITTSRC=		${OMPSRC}/thirdparty/ittnotify
 .PATH:		${OMPSRC}
 .PATH:		${ITTSRC}
 
 SRCS+=		ittnotify_static.c
 SRCS+=		kmp_affinity.cpp
 SRCS+=		kmp_alloc.cpp
 SRCS+=		kmp_atomic.cpp
 SRCS+=		kmp_barrier.cpp
 SRCS+=		kmp_cancel.cpp
 SRCS+=		kmp_csupport.cpp
 SRCS+=		kmp_debug.cpp
 SRCS+=		kmp_dispatch.cpp
 SRCS+=		kmp_environment.cpp
 SRCS+=		kmp_error.cpp
 SRCS+=		kmp_ftn_cdecl.cpp
 SRCS+=		kmp_ftn_extra.cpp
 SRCS+=		kmp_global.cpp
 SRCS+=		kmp_gsupport.cpp
 SRCS+=		kmp_i18n.cpp
 SRCS+=		kmp_io.cpp
 SRCS+=		kmp_itt.cpp
 SRCS+=		kmp_lock.cpp
 SRCS+=		kmp_runtime.cpp
 SRCS+=		kmp_sched.cpp
 SRCS+=		kmp_settings.cpp
 SRCS+=		kmp_str.cpp
 SRCS+=		kmp_taskdeps.cpp
 SRCS+=		kmp_tasking.cpp
 SRCS+=		kmp_taskq.cpp
 SRCS+=		kmp_threadprivate.cpp
 SRCS+=		kmp_utility.cpp
 SRCS+=		kmp_version.cpp
 SRCS+=		kmp_wait_release.cpp
 SRCS+=		ompt-general.cpp
 SRCS+=		z_Linux_asm.S
 SRCS+=		z_Linux_util.cpp
 INCS+=		omp.h
 
 WARNS?=		1
 
 CFLAGS+=	-D__STDC_CONSTANT_MACROS
 CFLAGS+=	-D__STDC_FORMAT_MACROS
 CFLAGS+=	-D__STDC_LIMIT_MACROS
 CFLAGS+=	-I${.CURDIR}
 CFLAGS+=	-I${OMPSRC}
 CFLAGS+=	-I${ITTSRC}
 CFLAGS+=	-ffunction-sections
 CFLAGS+=	-fdata-sections
 CXXFLAGS+=	-fvisibility-inlines-hidden
-CXXFLAGS+=	-std=c++11
 CXXFLAGS+=	-fno-exceptions
 CXXFLAGS+=	-fno-rtti
+CXXSTD=		c++11
 
 LDFLAGS+=	-Wl,--warn-shared-textrel
 LDFLAGS+=	-Wl,--gc-sections
 LDFLAGS+=	-Wl,-z,noexecstack
 LDFLAGS+=	-Wl,-fini=__kmp_internal_end_fini
 LDFLAGS+=	-Wl,-soname,libomp.so
 
 VERSION_MAP=	${OMPSRC}/exports_so.txt
 
 LIBADD+=	pthread
 LIBADD+=	m
 
 .include <bsd.lib.mk>
Index: projects/capsicum-test/lib/libthr/Makefile
===================================================================
--- projects/capsicum-test/lib/libthr/Makefile	(revision 345709)
+++ projects/capsicum-test/lib/libthr/Makefile	(revision 345710)
@@ -1,75 +1,77 @@
 # $FreeBSD$
 #
 # All library objects contain FreeBSD revision strings by default; they may be
 # excluded as a space-saving measure.  To produce a library that does
 # not contain these strings, add -DSTRIP_FBSDID (see <sys/cdefs.h>) to CFLAGS
 # below.
 
 PACKAGE=	clibs
 SHLIBDIR?= /lib
 
 .include <src.opts.mk>
 MK_SSP=	no
 
 LIB=thr
 SHLIB_MAJOR= 3
 NO_WTHREAD_SAFETY=1
 NO_WCAST_ALIGN.gcc=1    # for gcc 4.2
 CFLAGS+=-DPTHREAD_KERNEL
-CFLAGS+=-I${SRCTOP}/lib/libc/include -I${.CURDIR}/thread \
-	-I${SRCTOP}/include
+CFLAGS+=-I${SRCTOP}/lib/libc/include
+CFLAGS+=-I${SRCTOP}/lib/libc/${MACHINE_CPUARCH}
+CFLAGS+=-I${.CURDIR}/thread
+CFLAGS+=-I${SRCTOP}/include
 CFLAGS+=-I${.CURDIR}/arch/${MACHINE_CPUARCH}/include
 CFLAGS+=-I${.CURDIR}/sys
 CFLAGS+=-I${SRCTOP}/libexec/rtld-elf
 CFLAGS+=-I${SRCTOP}/libexec/rtld-elf/${MACHINE_CPUARCH}
 CFLAGS+=-I${SRCTOP}/lib/libthread_db
 CFLAGS+=-Winline
 
 CFLAGS.thr_stack.c+=	-Wno-cast-align
 CFLAGS.rtld_malloc.c+=	-Wno-cast-align
 .include <bsd.compiler.mk>
 .if !(${COMPILER_TYPE} == "gcc" && ${COMPILER_VERSION} < 40300)
 CFLAGS.thr_symbols.c+=	-Wno-missing-variable-declarations
 .endif
 
 .ifndef NO_THREAD_UNWIND_STACK
 CFLAGS+=-fexceptions
 CFLAGS+=-D_PTHREAD_FORCED_UNWIND
 .endif
 
 LDFLAGS+=-Wl,-znodelete
 
 VERSION_DEF=${SRCTOP}/lib/libc/Versions.def
 SYMBOL_MAPS=${.CURDIR}/pthread.map
 
 MAN=	libthr.3
 
 # enable extra internal consistency checks
 CFLAGS+=-D_PTHREADS_INVARIANTS
 
 PRECIOUSLIB=
 
 .PATH: ${.CURDIR}/arch/${MACHINE_CPUARCH}/${MACHINE_CPUARCH}
 .PATH: ${SRCTOP}/libexec/rtld-elf
 
 .if exists(${.CURDIR}/arch/${MACHINE_CPUARCH}/Makefile.inc)
 .include "${.CURDIR}/arch/${MACHINE_CPUARCH}/Makefile.inc"
 .endif
 .include "${.CURDIR}/sys/Makefile.inc"
 .include "${.CURDIR}/thread/Makefile.inc"
 SRCS+= rtld_malloc.c
 
 .if ${MK_INSTALLLIB} != "no"
 SYMLINKS+=lib${LIB}.a ${LIBDIR}/libpthread.a
 .endif
 .if !defined(NO_PIC)
 SYMLINKS+=lib${LIB}.so ${LIBDIR}/libpthread.so
 .endif
 .if ${MK_PROFILE} != "no"
 SYMLINKS+=lib${LIB}_p.a ${LIBDIR}/libpthread_p.a
 .endif
 
 HAS_TESTS=
 SUBDIR.${MK_TESTS}+= tests
 
 .include <bsd.lib.mk>
Index: projects/capsicum-test/lib/libthr/arch/aarch64/include/pthread_tls.h
===================================================================
--- projects/capsicum-test/lib/libthr/arch/aarch64/include/pthread_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libthr/arch/aarch64/include/pthread_tls.h	(revision 345710)
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ARCH_AARCH64_PTHREAD_TLS_H
+#define	_ARCH_AARCH64_PTHREAD_TLS_H
+
+static __inline uintptr_t
+_get_static_tls_base(struct pthread *thr, size_t offset)
+{
+	uintptr_t tlsbase;
+
+	tlsbase = (uintptr_t)thr->tcb;
+	tlsbase += offset;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libthr/arch/aarch64/include/pthread_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libthr/arch/amd64/include/pthread_tls.h
===================================================================
--- projects/capsicum-test/lib/libthr/arch/amd64/include/pthread_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libthr/arch/amd64/include/pthread_tls.h	(revision 345710)
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ARCH_AMD64_PTHREAD_TLS_H
+#define	_ARCH_AMD64_PTHREAD_TLS_H
+
+static __inline uintptr_t
+_get_static_tls_base(struct pthread *thr, size_t offset)
+{
+	uintptr_t tlsbase;
+
+	tlsbase = (uintptr_t)thr->tcb;
+	tlsbase -= offset;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libthr/arch/amd64/include/pthread_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libthr/arch/arm/include/pthread_tls.h
===================================================================
--- projects/capsicum-test/lib/libthr/arch/arm/include/pthread_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libthr/arch/arm/include/pthread_tls.h	(revision 345710)
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ARCH_ARM_PTHREAD_TLS_H
+#define	_ARCH_ARM_PTHREAD_TLS_H
+
+static __inline uintptr_t
+_get_static_tls_base(struct pthread *thr, size_t offset)
+{
+	uintptr_t tlsbase;
+
+	tlsbase = (uintptr_t)thr->tcb;
+	tlsbase += offset;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libthr/arch/arm/include/pthread_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libthr/arch/i386/include/pthread_tls.h
===================================================================
--- projects/capsicum-test/lib/libthr/arch/i386/include/pthread_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libthr/arch/i386/include/pthread_tls.h	(revision 345710)
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ARCH_I386_PTHREAD_TLS_H
+#define	_ARCH_I386_PTHREAD_TLS_H
+
+static __inline uintptr_t
+_get_static_tls_base(struct pthread *thr, size_t offset)
+{
+	uintptr_t tlsbase;
+
+	tlsbase = (uintptr_t)thr->tcb;
+	tlsbase -= offset;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libthr/arch/i386/include/pthread_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libthr/arch/mips/include/pthread_tls.h
===================================================================
--- projects/capsicum-test/lib/libthr/arch/mips/include/pthread_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libthr/arch/mips/include/pthread_tls.h	(revision 345710)
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ARCH_MIPS_PTHREAD_TLS_H
+#define	_ARCH_MIPS_PTHREAD_TLS_H
+
+static __inline uintptr_t
+_get_static_tls_base(struct pthread *thr, size_t offset)
+{
+	uintptr_t tlsbase;
+
+	tlsbase = (uintptr_t)thr->tcb;
+	tlsbase += offset;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libthr/arch/mips/include/pthread_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libthr/arch/powerpc/include/pthread_tls.h
===================================================================
--- projects/capsicum-test/lib/libthr/arch/powerpc/include/pthread_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libthr/arch/powerpc/include/pthread_tls.h	(revision 345710)
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ARCH_POWERPC_PTHREAD_TLS_H
+#define	_ARCH_POWERPC_PTHREAD_TLS_H
+
+static __inline uintptr_t
+_get_static_tls_base(struct pthread *thr, size_t offset)
+{
+	uintptr_t tlsbase;
+
+	tlsbase = (uintptr_t)thr->tcb;
+	tlsbase += offset;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libthr/arch/powerpc/include/pthread_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libthr/arch/riscv/include/pthread_tls.h
===================================================================
--- projects/capsicum-test/lib/libthr/arch/riscv/include/pthread_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libthr/arch/riscv/include/pthread_tls.h	(revision 345710)
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ARCH_RISCV_PTHREAD_TLS_H
+#define	_ARCH_RISCV_PTHREAD_TLS_H
+
+static __inline uintptr_t
+_get_static_tls_base(struct pthread *thr, size_t offset)
+{
+	uintptr_t tlsbase;
+
+	tlsbase = (uintptr_t)thr->tcb;
+	tlsbase += offset;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libthr/arch/riscv/include/pthread_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libthr/arch/sparc64/include/pthread_tls.h
===================================================================
--- projects/capsicum-test/lib/libthr/arch/sparc64/include/pthread_tls.h	(nonexistent)
+++ projects/capsicum-test/lib/libthr/arch/sparc64/include/pthread_tls.h	(revision 345710)
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ARCH_SPARC64_PTHREAD_TLS_H
+#define	_ARCH_SPARC64_PTHREAD_TLS_H
+
+static __inline uintptr_t
+_get_static_tls_base(struct pthread *thr, size_t offset)
+{
+	uintptr_t tlsbase;
+
+	tlsbase = (uintptr_t)thr->tcb;
+	tlsbase -= offset;
+	return (tlsbase);
+}
+
+#endif

Property changes on: projects/capsicum-test/lib/libthr/arch/sparc64/include/pthread_tls.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/capsicum-test/lib/libthr/pthread.map
===================================================================
--- projects/capsicum-test/lib/libthr/pthread.map	(revision 345709)
+++ projects/capsicum-test/lib/libthr/pthread.map	(revision 345710)
@@ -1,327 +1,328 @@
 /*
  * $FreeBSD$
  */
 
 /*
  * Use the same naming scheme as libc.
  */
 FBSD_1.0 {
 	pthread_atfork;
 	pthread_barrier_destroy;
 	pthread_barrier_init;
 	pthread_barrier_wait;
 	pthread_barrierattr_destroy;
 	pthread_barrierattr_getpshared;
 	pthread_barrierattr_init;
 	pthread_barrierattr_setpshared;
 	pthread_attr_destroy;
 	pthread_attr_get_np;
 	pthread_attr_getdetachstate;
 	pthread_attr_getguardsize;
 	pthread_attr_getinheritsched;
 	pthread_attr_getschedparam;
 	pthread_attr_getschedpolicy;
 	pthread_attr_getscope;
 	pthread_attr_getstack;
 	pthread_attr_getstackaddr;
 	pthread_attr_getstacksize;
 	pthread_attr_init;
 	pthread_attr_setcreatesuspend_np;
 	pthread_attr_setdetachstate;
 	pthread_attr_setguardsize;
 	pthread_attr_setinheritsched;
 	pthread_attr_setschedparam;
 	pthread_attr_setschedpolicy;
 	pthread_attr_setscope;
 	pthread_attr_setstack;
 	pthread_attr_setstackaddr;
 	pthread_attr_setstacksize;
 	pthread_cancel;
 	pthread_cleanup_pop;
 	pthread_cleanup_push;
 	pthread_cond_broadcast;
 	pthread_cond_destroy;
 	pthread_cond_init;
 	pthread_cond_signal;
 	pthread_cond_timedwait;
 	pthread_cond_wait;
 	pthread_condattr_destroy;
 	pthread_condattr_getclock;
 	pthread_condattr_getpshared;
 	pthread_condattr_init;
 	pthread_condattr_setclock;
 	pthread_condattr_setpshared;
 	pthread_create;
 	pthread_detach;
 	pthread_equal;
 	pthread_exit;
 	pthread_getconcurrency;
 	pthread_getprio;
 	pthread_getschedparam;
 	pthread_getspecific;
 	pthread_join;
 	pthread_key_create;
 	pthread_key_delete;
 	pthread_kill;
 	pthread_main_np;
 	pthread_multi_np;
 	pthread_mutex_destroy;
 	pthread_mutex_getprioceiling;
 	pthread_mutex_init;
 	pthread_mutex_lock;
 	pthread_mutex_setprioceiling;
 	pthread_mutex_timedlock;
 	pthread_mutex_trylock;
 	pthread_mutex_unlock;
 	pthread_mutexattr_destroy;
 	pthread_mutexattr_getkind_np;
 	pthread_mutexattr_getprioceiling;
 	pthread_mutexattr_getpshared;
 	pthread_mutexattr_getprotocol;
 	pthread_mutexattr_gettype;
 	pthread_mutexattr_init;
 	pthread_mutexattr_setkind_np;
 	pthread_mutexattr_setprioceiling;
 	pthread_mutexattr_setprotocol;
 	pthread_mutexattr_setpshared;
 	pthread_mutexattr_settype;
 	pthread_once;
 	pthread_resume_all_np;
 	pthread_resume_np;
 	pthread_rwlock_destroy;
 	pthread_rwlock_init;
 	pthread_rwlock_rdlock;
 	pthread_rwlock_timedrdlock;
 	pthread_rwlock_timedwrlock;
 	pthread_rwlock_tryrdlock;
 	pthread_rwlock_trywrlock;
 	pthread_rwlock_unlock;
 	pthread_rwlock_wrlock;
 	pthread_rwlockattr_destroy;
 	pthread_rwlockattr_getpshared;
 	pthread_rwlockattr_init;
 	pthread_rwlockattr_setpshared;
 	pthread_set_name_np;
 	pthread_self;
 	pthread_setcancelstate;
 	pthread_setcanceltype;
 	pthread_setconcurrency;
 	pthread_setprio;
 	pthread_setschedparam;
 	pthread_setspecific;
 	pthread_sigmask;
 	pthread_single_np;
 	pthread_spin_destroy;
 	pthread_spin_init;
 	pthread_spin_lock;
 	pthread_spin_trylock;
 	pthread_spin_unlock;
 	pthread_suspend_all_np;
 	pthread_suspend_np;
 	pthread_switch_add_np;
 	pthread_switch_delete_np;
 	pthread_testcancel;
 	pthread_timedjoin_np;
 	pthread_yield;
 };
 
 /*
  * List the private interfaces reserved for use in FreeBSD libraries.
  * These are not part of our application ABI.
  */
 FBSDprivate_1.0 {
 	__pthread_cond_timedwait;
 	__pthread_cond_wait;
 	__pthread_cxa_finalize;
 	__pthread_mutex_init;
 	__pthread_mutex_lock;
 	__pthread_mutex_timedlock;
 	__pthread_mutex_trylock;
+	__pthread_distribute_static_tls;
 	_pthread_atfork;
 	_pthread_barrier_destroy;
 	_pthread_barrier_init;
 	_pthread_barrier_wait;
 	_pthread_barrierattr_destroy;
 	_pthread_barrierattr_getpshared;
 	_pthread_barrierattr_init;
 	_pthread_barrierattr_setpshared;
 	_pthread_attr_destroy;
 	_pthread_attr_get_np;
 	_pthread_attr_getaffinity_np;
 	_pthread_attr_getdetachstate;
 	_pthread_attr_getguardsize;
 	_pthread_attr_getinheritsched;
 	_pthread_attr_getschedparam;
 	_pthread_attr_getschedpolicy;
 	_pthread_attr_getscope;
 	_pthread_attr_getstack;
 	_pthread_attr_getstackaddr;
 	_pthread_attr_getstacksize;
 	_pthread_attr_init;
 	_pthread_attr_setaffinity_np;
 	_pthread_attr_setcreatesuspend_np;
 	_pthread_attr_setdetachstate;
 	_pthread_attr_setguardsize;
 	_pthread_attr_setinheritsched;
 	_pthread_attr_setschedparam;
 	_pthread_attr_setschedpolicy;
 	_pthread_attr_setscope;
 	_pthread_attr_setstack;
 	_pthread_attr_setstackaddr;
 	_pthread_attr_setstacksize;
 	_pthread_cancel;
 	_pthread_cancel_enter;
 	_pthread_cancel_leave;
 	_pthread_cleanup_pop;
 	_pthread_cleanup_push;
 	_pthread_cond_broadcast;
 	_pthread_cond_destroy;
 	_pthread_cond_init;
 	_pthread_cond_signal;
 	_pthread_cond_timedwait;
 	_pthread_cond_wait;
 	_pthread_condattr_destroy;
 	_pthread_condattr_getclock;
 	_pthread_condattr_getpshared;
 	_pthread_condattr_init;
 	_pthread_condattr_setclock;
 	_pthread_condattr_setpshared;
 	_pthread_create;
 	_pthread_detach;
 	_pthread_equal;
 	_pthread_exit;
 	_pthread_getaffinity_np;
 	_pthread_getconcurrency;
 	_pthread_getcpuclockid;
 	_pthread_getprio;
 	_pthread_getschedparam;
 	_pthread_getspecific;
 	_pthread_getthreadid_np;
 	_pthread_join;
 	_pthread_key_create;
 	_pthread_key_delete;
 	_pthread_kill;
 	_pthread_main_np;
 	_pthread_multi_np;
 	_pthread_mutex_destroy;
 	_pthread_mutex_getprioceiling;
 	_pthread_mutex_getspinloops_np;
 	_pthread_mutex_getyieldloops_np;
 	_pthread_mutex_init;
 	_pthread_mutex_init_calloc_cb;
 	_pthread_mutex_isowned_np;
 	_pthread_mutex_lock;
 	_pthread_mutex_setprioceiling;
 	_pthread_mutex_setspinloops_np;
 	_pthread_mutex_setyieldloops_np;
 	_pthread_mutex_timedlock;
 	_pthread_mutex_trylock;
 	_pthread_mutex_unlock;
 	_pthread_mutexattr_destroy;
 	_pthread_mutexattr_getkind_np;
 	_pthread_mutexattr_getprioceiling;
 	_pthread_mutexattr_getprotocol;
 	_pthread_mutexattr_getpshared;
 	_pthread_mutexattr_gettype;
 	_pthread_mutexattr_init;
 	_pthread_mutexattr_setkind_np;
 	_pthread_mutexattr_setprioceiling;
 	_pthread_mutexattr_setprotocol;
 	_pthread_mutexattr_setpshared;
 	_pthread_mutexattr_settype;
 	_pthread_once;
 	_pthread_resume_all_np;
 	_pthread_resume_np;
 	_pthread_rwlock_destroy;
 	_pthread_rwlock_init;
 	_pthread_rwlock_rdlock;
 	_pthread_rwlock_timedrdlock;
 	_pthread_rwlock_timedwrlock;
 	_pthread_rwlock_tryrdlock;
 	_pthread_rwlock_trywrlock;
 	_pthread_rwlock_unlock;
 	_pthread_rwlock_wrlock;
 	_pthread_rwlockattr_destroy;
 	_pthread_rwlockattr_getpshared;
 	_pthread_rwlockattr_init;
 	_pthread_rwlockattr_setpshared;
 	_pthread_self;
 	_pthread_set_name_np;
 	_pthread_setaffinity_np;
 	_pthread_setcancelstate;
 	_pthread_setcanceltype;
 	_pthread_setconcurrency;
 	_pthread_setprio;
 	_pthread_setschedparam;
 	_pthread_setspecific;
 	_pthread_sigmask;
 	_pthread_single_np;
 	_pthread_spin_destroy;
 	_pthread_spin_init;
 	_pthread_spin_lock;
 	_pthread_spin_trylock;
 	_pthread_spin_unlock;
 	_pthread_suspend_all_np;
 	_pthread_suspend_np;
 	_pthread_switch_add_np;
 	_pthread_switch_delete_np;
 	_pthread_testcancel;
 	_pthread_timedjoin_np;
 	_pthread_yield;
 
 	/* Debugger needs these. */
 	_libthr_debug;
 	_thread_active_threads;
 	_thread_bp_create;
 	_thread_bp_death;
 	_thread_event_mask;
 	_thread_keytable;
 	_thread_last_event;
 	_thread_list;
 	_thread_max_keys;
 	_thread_off_attr_flags;
 	_thread_off_dtv;
 	_thread_off_event_buf;
 	_thread_off_event_mask;
 	_thread_off_key_allocated;
 	_thread_off_key_destructor;
 	_thread_off_linkmap;
 	_thread_off_next;
 	_thread_off_report_events;
 	_thread_off_state;
 	_thread_off_tcb;
 	_thread_off_tid;
 	_thread_off_tlsindex;
 	_thread_size_key;
 	_thread_state_running;
 	_thread_state_zoombie;
 };
 
 FBSD_1.1 {
 	__pthread_cleanup_pop_imp;
 	__pthread_cleanup_push_imp;
 	pthread_attr_getaffinity_np;
 	pthread_attr_setaffinity_np;
 	pthread_getaffinity_np;
 	pthread_getcpuclockid;
 	pthread_setaffinity_np;
 	pthread_mutex_getspinloops_np;
 	pthread_mutex_getyieldloops_np;
 	pthread_mutex_isowned_np;
 	pthread_mutex_setspinloops_np;
 	pthread_mutex_setyieldloops_np;
 };
 
 FBSD_1.2 {
 	pthread_getthreadid_np;
 };
 
 FBSD_1.4 {
 	 pthread_mutex_consistent;
 	 pthread_mutexattr_getrobust;
 	 pthread_mutexattr_setrobust;
 };
 
 FBSD_1.5 {
 	 pthread_get_name_np;
 };
Index: projects/capsicum-test/lib/libthr/thread/thr_list.c
===================================================================
--- projects/capsicum-test/lib/libthr/thread/thr_list.c	(revision 345709)
+++ projects/capsicum-test/lib/libthr/thread/thr_list.c	(revision 345710)
@@ -1,364 +1,397 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2005 David Xu <davidxu@freebsd.org>
  * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/queue.h>
 
 #include <stdlib.h>
 #include <string.h>
 #include <pthread.h>
 
 #include "libc_private.h"
 #include "thr_private.h"
+#include "static_tls.h"
 
 /*#define DEBUG_THREAD_LIST */
 #ifdef DEBUG_THREAD_LIST
 #define DBG_MSG		stdout_debug
 #else
 #define DBG_MSG(x...)
 #endif
 
 #define MAX_THREADS		100000
 
 /*
  * Define a high water mark for the maximum number of threads that
  * will be cached.  Once this level is reached, any extra threads
  * will be free()'d.
  */
 #define	MAX_CACHED_THREADS	100
 
 /*
  * We've got to keep track of everything that is allocated, not only
  * to have a speedy free list, but also so they can be deallocated
  * after a fork().
  */
 static TAILQ_HEAD(, pthread)	free_threadq;
 static struct umutex		free_thread_lock = DEFAULT_UMUTEX;
 static struct umutex		tcb_lock = DEFAULT_UMUTEX;
 static int			free_thread_count = 0;
 static int			inited = 0;
 static int			total_threads;
 
 LIST_HEAD(thread_hash_head, pthread);
 #define HASH_QUEUES	128
 static struct thread_hash_head	thr_hashtable[HASH_QUEUES];
 #define	THREAD_HASH(thrd)	(((unsigned long)thrd >> 8) % HASH_QUEUES)
 
 static void thr_destroy(struct pthread *curthread, struct pthread *thread);
 
 void
 _thr_list_init(void)
 {
 	int i;
 
 	_gc_count = 0;
 	total_threads = 1;
 	_thr_urwlock_init(&_thr_list_lock);
 	TAILQ_INIT(&_thread_list);
 	TAILQ_INIT(&free_threadq);
 	_thr_umutex_init(&free_thread_lock);
 	_thr_umutex_init(&tcb_lock);
 	if (inited) {
 		for (i = 0; i < HASH_QUEUES; ++i)
 			LIST_INIT(&thr_hashtable[i]);
 	}
 	inited = 1;
 }
 
 void
 _thr_gc(struct pthread *curthread)
 {
 	struct pthread *td, *td_next;
 	TAILQ_HEAD(, pthread) worklist;
 
 	TAILQ_INIT(&worklist);
 	THREAD_LIST_WRLOCK(curthread);
 
 	/* Check the threads waiting for GC. */
 	TAILQ_FOREACH_SAFE(td, &_thread_gc_list, gcle, td_next) {
 		if (td->tid != TID_TERMINATED) {
 			/* make sure we are not still in userland */
 			continue;
 		}
 		_thr_stack_free(&td->attr);
 		THR_GCLIST_REMOVE(td);
 		TAILQ_INSERT_HEAD(&worklist, td, gcle);
 	}
 	THREAD_LIST_UNLOCK(curthread);
 
 	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
 		TAILQ_REMOVE(&worklist, td, gcle);
 		/*
 		 * XXX we don't free initial thread, because there might
 		 * have some code referencing initial thread.
 		 */
 		if (td == _thr_initial) {
 			DBG_MSG("Initial thread won't be freed\n");
 			continue;
 		}
 
 		_thr_free(curthread, td);
 	}
 }
 
 struct pthread *
 _thr_alloc(struct pthread *curthread)
 {
 	struct pthread	*thread = NULL;
 	struct tcb	*tcb;
 
 	if (curthread != NULL) {
 		if (GC_NEEDED())
 			_thr_gc(curthread);
 		if (free_thread_count > 0) {
 			THR_LOCK_ACQUIRE(curthread, &free_thread_lock);
 			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
 				TAILQ_REMOVE(&free_threadq, thread, tle);
 				free_thread_count--;
 			}
 			THR_LOCK_RELEASE(curthread, &free_thread_lock);
 		}
 	}
 	if (thread == NULL) {
 		if (total_threads > MAX_THREADS)
 			return (NULL);
 		atomic_fetchadd_int(&total_threads, 1);
 		thread = calloc(1, sizeof(struct pthread));
 		if (thread == NULL) {
 			atomic_fetchadd_int(&total_threads, -1);
 			return (NULL);
 		}
 		if ((thread->sleepqueue = _sleepq_alloc()) == NULL ||
 		    (thread->wake_addr = _thr_alloc_wake_addr()) == NULL) {
 			thr_destroy(curthread, thread);
 			atomic_fetchadd_int(&total_threads, -1);
 			return (NULL);
 		}
 	} else {
 		bzero(&thread->_pthread_startzero, 
 			__rangeof(struct pthread, _pthread_startzero, _pthread_endzero));
 	}
 	if (curthread != NULL) {
 		THR_LOCK_ACQUIRE(curthread, &tcb_lock);
 		tcb = _tcb_ctor(thread, 0 /* not initial tls */);
 		THR_LOCK_RELEASE(curthread, &tcb_lock);
 	} else {
 		tcb = _tcb_ctor(thread, 1 /* initial tls */);
 	}
 	if (tcb != NULL) {
 		thread->tcb = tcb;
 	} else {
 		thr_destroy(curthread, thread);
 		atomic_fetchadd_int(&total_threads, -1);
 		thread = NULL;
 	}
 	return (thread);
 }
 
 void
 _thr_free(struct pthread *curthread, struct pthread *thread)
 {
 	DBG_MSG("Freeing thread %p\n", thread);
 
 	/*
 	 * Always free tcb, as we only know it is part of RTLD TLS
 	 * block, but don't know its detail and can not assume how
 	 * it works, so better to avoid caching it here.
 	 */
 	if (curthread != NULL) {
 		THR_LOCK_ACQUIRE(curthread, &tcb_lock);
 		_tcb_dtor(thread->tcb);
 		THR_LOCK_RELEASE(curthread, &tcb_lock);
 	} else {
 		_tcb_dtor(thread->tcb);
 	}
 	thread->tcb = NULL;
 	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
 		thr_destroy(curthread, thread);
 		atomic_fetchadd_int(&total_threads, -1);
 	} else {
 		/*
 		 * Add the thread to the free thread list, this also avoids
 		 * pthread id is reused too quickly, may help some buggy apps.
 		 */
 		THR_LOCK_ACQUIRE(curthread, &free_thread_lock);
 		TAILQ_INSERT_TAIL(&free_threadq, thread, tle);
 		free_thread_count++;
 		THR_LOCK_RELEASE(curthread, &free_thread_lock);
 	}
 }
 
 static void
 thr_destroy(struct pthread *curthread __unused, struct pthread *thread)
 {
 	if (thread->sleepqueue != NULL)
 		_sleepq_free(thread->sleepqueue);
 	if (thread->wake_addr != NULL)
 		_thr_release_wake_addr(thread->wake_addr);
 	free(thread);
 }
 
 /*
  * Add the thread to the list of all threads and increment
  * number of active threads.
  */
 void
 _thr_link(struct pthread *curthread, struct pthread *thread)
 {
 	THREAD_LIST_WRLOCK(curthread);
 	THR_LIST_ADD(thread);
 	THREAD_LIST_UNLOCK(curthread);
 	atomic_add_int(&_thread_active_threads, 1);
 }
 
 /*
  * Remove an active thread.
  */
 void
 _thr_unlink(struct pthread *curthread, struct pthread *thread)
 {
 	THREAD_LIST_WRLOCK(curthread);
 	THR_LIST_REMOVE(thread);
 	THREAD_LIST_UNLOCK(curthread);
 	atomic_add_int(&_thread_active_threads, -1);
 }
 
 void
 _thr_hash_add(struct pthread *thread)
 {
 	struct thread_hash_head *head;
 
 	head = &thr_hashtable[THREAD_HASH(thread)];
 	LIST_INSERT_HEAD(head, thread, hle);
 }
 
 void
 _thr_hash_remove(struct pthread *thread)
 {
 	LIST_REMOVE(thread, hle);
 }
 
 struct pthread *
 _thr_hash_find(struct pthread *thread)
 {
 	struct pthread *td;
 	struct thread_hash_head *head;
 
 	head = &thr_hashtable[THREAD_HASH(thread)];
 	LIST_FOREACH(td, head, hle) {
 		if (td == thread)
 			return (thread);
 	}
 	return (NULL);
 }
 
 /*
  * Find a thread in the linked list of active threads and add a reference
  * to it.  Threads with positive reference counts will not be deallocated
  * until all references are released.
  */
 int
 _thr_ref_add(struct pthread *curthread, struct pthread *thread,
     int include_dead)
 {
 	int ret;
 
 	if (thread == NULL)
 		/* Invalid thread: */
 		return (EINVAL);
 
 	if ((ret = _thr_find_thread(curthread, thread, include_dead)) == 0) {
 		thread->refcount++;
 		THR_CRITICAL_ENTER(curthread);
 		THR_THREAD_UNLOCK(curthread, thread);
 	}
 
 	/* Return zero if the thread exists: */
 	return (ret);
 }
 
 void
 _thr_ref_delete(struct pthread *curthread, struct pthread *thread)
 {
 	THR_THREAD_LOCK(curthread, thread);
 	thread->refcount--;
 	_thr_try_gc(curthread, thread);
 	THR_CRITICAL_LEAVE(curthread);
 }
 
 /* entered with thread lock held, exit with thread lock released */
 void
 _thr_try_gc(struct pthread *curthread, struct pthread *thread)
 {
 	if (THR_SHOULD_GC(thread)) {
 		THR_REF_ADD(curthread, thread);
 		THR_THREAD_UNLOCK(curthread, thread);
 		THREAD_LIST_WRLOCK(curthread);
 		THR_THREAD_LOCK(curthread, thread);
 		THR_REF_DEL(curthread, thread);
 		if (THR_SHOULD_GC(thread)) {
 			THR_LIST_REMOVE(thread);
 			THR_GCLIST_ADD(thread);
 		}
 		THR_THREAD_UNLOCK(curthread, thread);
 		THREAD_LIST_UNLOCK(curthread);
 	} else {
 		THR_THREAD_UNLOCK(curthread, thread);
 	}
 }
 
 /* return with thread lock held if thread is found */
 int
 _thr_find_thread(struct pthread *curthread, struct pthread *thread,
     int include_dead)
 {
 	struct pthread *pthread;
 	int ret;
 
 	if (thread == NULL)
 		return (EINVAL);
 
 	ret = 0;
 	THREAD_LIST_RDLOCK(curthread);
 	pthread = _thr_hash_find(thread);
 	if (pthread) {
 		THR_THREAD_LOCK(curthread, pthread);
 		if (include_dead == 0 && pthread->state == PS_DEAD) {
 			THR_THREAD_UNLOCK(curthread, pthread);
 			ret = ESRCH;
 		}
 	} else {
 		ret = ESRCH;
 	}
 	THREAD_LIST_UNLOCK(curthread);
 	return (ret);
+}
+
+#include "pthread_tls.h"
+
+static void
+thr_distribute_static_tls(uintptr_t tlsbase, void *src, size_t len,
+    size_t total_len)
+{
+
+	memcpy((void *)tlsbase, src, len);
+	memset((char *)tlsbase + len, 0, total_len - len);
+}
+
+void
+__pthread_distribute_static_tls(size_t offset, void *src, size_t len,
+    size_t total_len)
+{
+	struct pthread *curthread, *thrd;
+	uintptr_t tlsbase;
+
+	if (!_thr_is_inited()) {
+		tlsbase = _libc_get_static_tls_base(offset);
+		thr_distribute_static_tls(tlsbase, src, len, total_len);
+		return;
+	}
+	curthread = _get_curthread();
+	THREAD_LIST_RDLOCK(curthread);
+	TAILQ_FOREACH(thrd, &_thread_list, tle) {
+		tlsbase = _get_static_tls_base(thrd, offset);
+		thr_distribute_static_tls(tlsbase, src, len, total_len);
+	}
+	THREAD_LIST_UNLOCK(curthread);
 }
Index: projects/capsicum-test/lib/libthr/thread/thr_private.h
===================================================================
--- projects/capsicum-test/lib/libthr/thread/thr_private.h	(revision 345709)
+++ projects/capsicum-test/lib/libthr/thread/thr_private.h	(revision 345710)
@@ -1,1017 +1,1019 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (C) 2005 Daniel M. Eischen <deischen@freebsd.org>
  * Copyright (c) 2005 David Xu <davidxu@freebsd.org>
  * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>.
  *
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _THR_PRIVATE_H
 #define _THR_PRIVATE_H
 
 /*
  * Include files.
  */
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/cdefs.h>
 #include <sys/queue.h>
 #include <sys/param.h>
 #include <sys/cpuset.h>
 #include <machine/atomic.h>
 #include <errno.h>
 #include <limits.h>
 #include <signal.h>
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <ucontext.h>
 #include <sys/thr.h>
 #include <pthread.h>
 
 __NULLABILITY_PRAGMA_PUSH
 
 #define	SYM_FB10(sym)			__CONCAT(sym, _fb10)
 #define	SYM_FBP10(sym)			__CONCAT(sym, _fbp10)
 #define	WEAK_REF(sym, alias)		__weak_reference(sym, alias)
 #define	SYM_COMPAT(sym, impl, ver)	__sym_compat(sym, impl, ver)
 #define	SYM_DEFAULT(sym, impl, ver)	__sym_default(sym, impl, ver)
 
 #define	FB10_COMPAT(func, sym)				\
 	WEAK_REF(func, SYM_FB10(sym));			\
 	SYM_COMPAT(sym, SYM_FB10(sym), FBSD_1.0)
 
 #define	FB10_COMPAT_PRIVATE(func, sym)			\
 	WEAK_REF(func, SYM_FBP10(sym));			\
 	SYM_DEFAULT(sym, SYM_FBP10(sym), FBSDprivate_1.0)
 
 struct pthread;
 extern struct pthread	*_thr_initial __hidden;
 
 #include "pthread_md.h"
 #include "thr_umtx.h"
 #include "thread_db.h"
 
 #ifdef _PTHREAD_FORCED_UNWIND
 #define _BSD_SOURCE
 #include <unwind.h>
 #endif
 
 typedef TAILQ_HEAD(pthreadlist, pthread) pthreadlist;
 typedef TAILQ_HEAD(atfork_head, pthread_atfork) atfork_head;
 TAILQ_HEAD(mutex_queue, pthread_mutex);
 
 /* Signal to do cancellation */
 #define	SIGCANCEL		SIGTHR
 
 /*
  * Kernel fatal error handler macro.
  */
 #define PANIC(args...)		_thread_exitf(__FILE__, __LINE__, ##args)
 
 /* Output debug messages like this: */
 #define stdout_debug(args...)	_thread_printf(STDOUT_FILENO, ##args)
 #define stderr_debug(args...)	_thread_printf(STDERR_FILENO, ##args)
 
 #ifdef _PTHREADS_INVARIANTS
 #define THR_ASSERT(cond, msg) do {	\
 	if (__predict_false(!(cond)))	\
 		PANIC(msg);		\
 } while (0)
 #else
 #define THR_ASSERT(cond, msg)
 #endif
 
 #ifdef PIC
 # define STATIC_LIB_REQUIRE(name)
 #else
 # define STATIC_LIB_REQUIRE(name) __asm (".globl " #name)
 #endif
 
 #define	TIMESPEC_ADD(dst, src, val)				\
 	do { 							\
 		(dst)->tv_sec = (src)->tv_sec + (val)->tv_sec;	\
 		(dst)->tv_nsec = (src)->tv_nsec + (val)->tv_nsec; \
 		if ((dst)->tv_nsec >= 1000000000) {		\
 			(dst)->tv_sec++;			\
 			(dst)->tv_nsec -= 1000000000;		\
 		}						\
 	} while (0)
 
 #define	TIMESPEC_SUB(dst, src, val)				\
 	do { 							\
 		(dst)->tv_sec = (src)->tv_sec - (val)->tv_sec;	\
 		(dst)->tv_nsec = (src)->tv_nsec - (val)->tv_nsec; \
 		if ((dst)->tv_nsec < 0) {			\
 			(dst)->tv_sec--;			\
 			(dst)->tv_nsec += 1000000000;		\
 		}						\
 	} while (0)
 
 /* Magic cookie set for shared pthread locks and cv's pointers */
 #define	THR_PSHARED_PTR						\
     ((void *)(uintptr_t)((1ULL << (NBBY * sizeof(long) - 1)) | 1))
 
 /* XXX These values should be same as those defined in pthread.h */
 #define	THR_MUTEX_INITIALIZER		((struct pthread_mutex *)NULL)
 #define	THR_ADAPTIVE_MUTEX_INITIALIZER	((struct pthread_mutex *)1)
 #define	THR_MUTEX_DESTROYED		((struct pthread_mutex *)2)
 #define	THR_COND_INITIALIZER		((struct pthread_cond *)NULL)
 #define	THR_COND_DESTROYED		((struct pthread_cond *)1)
 #define	THR_RWLOCK_INITIALIZER		((struct pthread_rwlock *)NULL)
 #define	THR_RWLOCK_DESTROYED		((struct pthread_rwlock *)1)
 
 #define PMUTEX_FLAG_TYPE_MASK	0x0ff
 #define PMUTEX_FLAG_PRIVATE	0x100
 #define PMUTEX_FLAG_DEFERRED	0x200
 #define PMUTEX_TYPE(mtxflags)	((mtxflags) & PMUTEX_FLAG_TYPE_MASK)
 
 #define	PMUTEX_OWNER_ID(m)	((m)->m_lock.m_owner & ~UMUTEX_CONTESTED)
 
 #define MAX_DEFER_WAITERS       50
 
 /*
  * Values for pthread_mutex m_ps indicator.
  */
 #define	PMUTEX_INITSTAGE_ALLOC	0
 #define	PMUTEX_INITSTAGE_BUSY	1
 #define	PMUTEX_INITSTAGE_DONE	2
 
 struct pthread_mutex {
 	/*
 	 * Lock for accesses to this structure.
 	 */
 	struct umutex			m_lock;
 	int				m_flags;
 	int				m_count;
 	int				m_spinloops;
 	int				m_yieldloops;
 	int				m_ps;	/* pshared init stage */
 	/*
 	 * Link for all mutexes a thread currently owns, of the same
 	 * prio type.
 	 */
 	TAILQ_ENTRY(pthread_mutex)	m_qe;
 	/* Link for all private mutexes a thread currently owns. */
 	TAILQ_ENTRY(pthread_mutex)	m_pqe;
 	struct pthread_mutex		*m_rb_prev;
 };
 
 struct pthread_mutex_attr {
 	enum pthread_mutextype	m_type;
 	int			m_protocol;
 	int			m_ceiling;
 	int			m_pshared;
 	int			m_robust;
 };
 
 #define PTHREAD_MUTEXATTR_STATIC_INITIALIZER \
 	{ PTHREAD_MUTEX_DEFAULT, PTHREAD_PRIO_NONE, 0, MUTEX_FLAGS_PRIVATE, \
 	    PTHREAD_MUTEX_STALLED }
 
 struct pthread_cond {
 	__uint32_t	__has_user_waiters;
 	struct ucond	kcond;
 };
 
 struct pthread_cond_attr {
 	int		c_pshared;
 	int		c_clockid;
 };
 
 struct pthread_barrier {
 	struct umutex		b_lock;
 	struct ucond		b_cv;
 	int64_t			b_cycle;
 	int			b_count;
 	int			b_waiters;
 	int			b_refcount;
 	int			b_destroying;
 };
 
 struct pthread_barrierattr {
 	int		pshared;
 };
 
 struct pthread_spinlock {
 	struct umutex	s_lock;
 };
 
 /*
  * Flags for condition variables.
  */
 #define COND_FLAGS_PRIVATE	0x01
 #define COND_FLAGS_INITED	0x02
 #define COND_FLAGS_BUSY		0x04
 
 /*
  * Cleanup definitions.
  */
 struct pthread_cleanup {
 	struct pthread_cleanup	*prev;
 	void			(*routine)(void *);
 	void			*routine_arg;
 	int			onheap;
 };
 
 #define	THR_CLEANUP_PUSH(td, func, arg) {		\
 	struct pthread_cleanup __cup;			\
 							\
 	__cup.routine = func;				\
 	__cup.routine_arg = arg;			\
 	__cup.onheap = 0;				\
 	__cup.prev = (td)->cleanup;			\
 	(td)->cleanup = &__cup;
 
 #define	THR_CLEANUP_POP(td, exec)			\
 	(td)->cleanup = __cup.prev;			\
 	if ((exec) != 0)				\
 		__cup.routine(__cup.routine_arg);	\
 }
 
 struct pthread_atfork {
 	TAILQ_ENTRY(pthread_atfork) qe;
 	void (*prepare)(void);
 	void (*parent)(void);
 	void (*child)(void);
 };
 
 struct pthread_attr {
 #define pthread_attr_start_copy	sched_policy
 	int	sched_policy;
 	int	sched_inherit;
 	int	prio;
 	int	suspend;
 #define	THR_STACK_USER		0x100	/* 0xFF reserved for <pthread.h> */
 	int	flags;
 	void	*stackaddr_attr;
 	size_t	stacksize_attr;
 	size_t	guardsize_attr;
 #define pthread_attr_end_copy	cpuset
 	cpuset_t	*cpuset;
 	size_t	cpusetsize;
 };
 
 struct wake_addr {
 	struct wake_addr *link;
 	unsigned int	value;
 	char		pad[12];
 };
 
 struct sleepqueue {
 	TAILQ_HEAD(, pthread)    sq_blocked;
 	SLIST_HEAD(, sleepqueue) sq_freeq;
 	LIST_ENTRY(sleepqueue)   sq_hash;
 	SLIST_ENTRY(sleepqueue)  sq_flink;
 	void			 *sq_wchan;
 	int			 sq_type;
 };
 
 /*
  * Thread creation state attributes.
  */
 #define THR_CREATE_RUNNING		0
 #define THR_CREATE_SUSPENDED		1
 
 /*
  * Miscellaneous definitions.
  */
 #define THR_STACK_DEFAULT		(sizeof(void *) / 4 * 1024 * 1024)
 
 /*
  * Maximum size of initial thread's stack.  This perhaps deserves to be larger
  * than the stacks of other threads, since many applications are likely to run
  * almost entirely on this stack.
  */
 #define THR_STACK_INITIAL		(THR_STACK_DEFAULT * 2)
 
 /*
  * Define priorities returned by kernel.
  */
 #define THR_MIN_PRIORITY		(_thr_priorities[SCHED_OTHER-1].pri_min)
 #define THR_MAX_PRIORITY		(_thr_priorities[SCHED_OTHER-1].pri_max)
 #define THR_DEF_PRIORITY		(_thr_priorities[SCHED_OTHER-1].pri_default)
 
 #define THR_MIN_RR_PRIORITY		(_thr_priorities[SCHED_RR-1].pri_min)
 #define THR_MAX_RR_PRIORITY		(_thr_priorities[SCHED_RR-1].pri_max)
 #define THR_DEF_RR_PRIORITY		(_thr_priorities[SCHED_RR-1].pri_default)
 
 /* XXX The SCHED_FIFO should have same priority range as SCHED_RR */
 #define THR_MIN_FIFO_PRIORITY		(_thr_priorities[SCHED_FIFO_1].pri_min)
 #define THR_MAX_FIFO_PRIORITY		(_thr_priorities[SCHED_FIFO-1].pri_max)
 #define THR_DEF_FIFO_PRIORITY		(_thr_priorities[SCHED_FIFO-1].pri_default)
 
 struct pthread_prio {
 	int	pri_min;
 	int	pri_max;
 	int	pri_default;
 };
 
 struct pthread_rwlockattr {
 	int		pshared;
 };
 
 struct pthread_rwlock {
 	struct urwlock 	lock;
 	uint32_t	owner;
 };
 
 /*
  * Thread states.
  */
 enum pthread_state {
 	PS_RUNNING,
 	PS_DEAD
 };
 
 struct pthread_specific_elem {
 	const void	*data;
 	int		seqno;
 };
 
 struct pthread_key {
 	volatile int	allocated;
 	int		seqno;
 	void            (*destructor)(void *);
 };
 
 /*
  * lwpid_t is 32bit but kernel thr API exports tid as long type
  * to preserve the ABI for M:N model in very early date (r131431).
  */
 #define TID(thread)	((uint32_t) ((thread)->tid))
 
 /*
  * Thread structure.
  */
 struct pthread {
 #define _pthread_startzero	tid
 	/* Kernel thread id. */
 	long			tid;
 #define	TID_TERMINATED		1
 
 	/*
 	 * Lock for accesses to this thread structure.
 	 */
 	struct umutex		lock;
 
 	/* Internal condition variable cycle number. */
 	uint32_t		cycle;
 
 	/* How many low level locks the thread held. */
 	int			locklevel;
 
 	/*
 	 * Set to non-zero when this thread has entered a critical
 	 * region.  We allow for recursive entries into critical regions.
 	 */
 	int			critical_count;
 
 	/* Signal blocked counter. */
 	int			sigblock;
 
 	/* Queue entry for list of all threads. */
 	TAILQ_ENTRY(pthread)	tle;	/* link for all threads in process */
 
 	/* Queue entry for GC lists. */
 	TAILQ_ENTRY(pthread)	gcle;
 
 	/* Hash queue entry. */
 	LIST_ENTRY(pthread)	hle;
 
 	/* Sleep queue entry */
 	TAILQ_ENTRY(pthread)    wle;
 
 	/* Threads reference count. */
 	int			refcount;
 
 	/*
 	 * Thread start routine, argument, stack pointer and thread
 	 * attributes.
 	 */
 	void			*(*start_routine)(void *);
 	void			*arg;
 	struct pthread_attr	attr;
 
 #define	SHOULD_CANCEL(thr)					\
 	((thr)->cancel_pending && (thr)->cancel_enable &&	\
 	 (thr)->no_cancel == 0)
 
 	/* Cancellation is enabled */
 	int			cancel_enable;
 
 	/* Cancellation request is pending */
 	int			cancel_pending;
 
 	/* Thread is at cancellation point */
 	int			cancel_point;
 
 	/* Cancellation is temporarily disabled */
 	int			no_cancel;
 
 	/* Asynchronouse cancellation is enabled */
 	int			cancel_async;
 
 	/* Cancellation is in progress */
 	int			cancelling;
 
 	/* Thread temporary signal mask. */
 	sigset_t		sigmask;
 
 	/* Thread should unblock SIGCANCEL. */
 	int			unblock_sigcancel;
 
 	/* In sigsuspend state */
 	int			in_sigsuspend;
 
 	/* deferred signal info	*/
 	siginfo_t		deferred_siginfo;
 
 	/* signal mask to restore. */
 	sigset_t		deferred_sigmask;
 
 	/* the sigaction should be used for deferred signal. */
 	struct sigaction	deferred_sigact;
 
 	/* deferred signal delivery is performed, do not reenter. */
 	int			deferred_run;
 
 	/* Force new thread to exit. */
 	int			force_exit;
 
 	/* Thread state: */
 	enum pthread_state 	state;
 
 	/*
 	 * Error variable used instead of errno. The function __error()
 	 * returns a pointer to this. 
 	 */
 	int			error;
 
 	/*
 	 * The joiner is the thread that is joining to this thread.  The
 	 * join status keeps track of a join operation to another thread.
 	 */
 	struct pthread		*joiner;
 
 	/* Miscellaneous flags; only set with scheduling lock held. */
 	int			flags;
 #define THR_FLAGS_PRIVATE	0x0001
 #define	THR_FLAGS_NEED_SUSPEND	0x0002	/* thread should be suspended */
 #define	THR_FLAGS_SUSPENDED	0x0004	/* thread is suspended */
 #define	THR_FLAGS_DETACHED	0x0008	/* thread is detached */
 
 	/* Thread list flags; only set with thread list lock held. */
 	int			tlflags;
 #define	TLFLAGS_GC_SAFE		0x0001	/* thread safe for cleaning */
 #define	TLFLAGS_IN_TDLIST	0x0002	/* thread in all thread list */
 #define	TLFLAGS_IN_GCLIST	0x0004	/* thread in gc list */
 
 	/*
 	 * Queues of the owned mutexes.  Private queue must have index
 	 * + 1 of the corresponding full queue.
 	 */
 #define	TMQ_NORM		0	/* NORMAL or PRIO_INHERIT normal */
 #define	TMQ_NORM_PRIV		1	/* NORMAL or PRIO_INHERIT normal priv */
 #define	TMQ_NORM_PP		2	/* PRIO_PROTECT normal mutexes */
 #define	TMQ_NORM_PP_PRIV	3	/* PRIO_PROTECT normal priv */
 #define	TMQ_ROBUST_PP		4	/* PRIO_PROTECT robust mutexes */
 #define	TMQ_ROBUST_PP_PRIV	5	/* PRIO_PROTECT robust priv */	
 #define	TMQ_NITEMS		6
 	struct mutex_queue	mq[TMQ_NITEMS];
 
 	void				*ret;
 	struct pthread_specific_elem	*specific;
 	int				specific_data_count;
 
 	/* Number rwlocks rdlocks held. */
 	int			rdlock_count;
 
 	/*
 	 * Current locks bitmap for rtld. */
 	int			rtld_bits;
 
 	/* Thread control block */
 	struct tcb		*tcb;
 
 	/* Cleanup handlers Link List */
 	struct pthread_cleanup	*cleanup;
 
 #ifdef _PTHREAD_FORCED_UNWIND
 	struct _Unwind_Exception	ex;
 	void			*unwind_stackend;
 	int			unwind_disabled;
 #endif
 
 	/*
 	 * Magic value to help recognize a valid thread structure
 	 * from an invalid one:
 	 */
 #define	THR_MAGIC		((u_int32_t) 0xd09ba115)
 	u_int32_t		magic;
 
 	/* Enable event reporting */
 	int			report_events;
 
 	/* Event mask */
 	int			event_mask;
 
 	/* Event */
 	td_event_msg_t		event_buf;
 
 	/* Wait channel */
 	void			*wchan;
 
 	/* Referenced mutex. */
 	struct pthread_mutex	*mutex_obj;
 
 	/* Thread will sleep. */
 	int			will_sleep;
 
 	/* Number of threads deferred. */
 	int			nwaiter_defer;
 
 	int			robust_inited;
 	uintptr_t		robust_list;
 	uintptr_t		priv_robust_list;
 	uintptr_t		inact_mtx;
 
 	/* Deferred threads from pthread_cond_signal. */
 	unsigned int 		*defer_waiters[MAX_DEFER_WAITERS];
 #define _pthread_endzero	wake_addr
 
 	struct wake_addr	*wake_addr;
 #define WAKE_ADDR(td)           ((td)->wake_addr)
 
 	/* Sleep queue */
 	struct	sleepqueue	*sleepqueue;
 
 	/* pthread_set/get_name_np */
 	char			*name;
 };
 
 #define THR_SHOULD_GC(thrd) 						\
 	((thrd)->refcount == 0 && (thrd)->state == PS_DEAD &&		\
 	 ((thrd)->flags & THR_FLAGS_DETACHED) != 0)
 
 #define	THR_IN_CRITICAL(thrd)				\
 	(((thrd)->locklevel > 0) ||			\
 	((thrd)->critical_count > 0))
 
 #define	THR_CRITICAL_ENTER(thrd)			\
 	(thrd)->critical_count++
 
 #define	THR_CRITICAL_LEAVE(thrd)			\
 	do {						\
 		(thrd)->critical_count--;		\
 		_thr_ast(thrd);				\
 	} while (0)
 
 #define THR_UMUTEX_TRYLOCK(thrd, lck)			\
 	_thr_umutex_trylock((lck), TID(thrd))
 
 #define	THR_UMUTEX_LOCK(thrd, lck)			\
 	_thr_umutex_lock((lck), TID(thrd))
 
 #define	THR_UMUTEX_TIMEDLOCK(thrd, lck, timo)		\
 	_thr_umutex_timedlock((lck), TID(thrd), (timo))
 
 #define	THR_UMUTEX_UNLOCK(thrd, lck)			\
 	_thr_umutex_unlock((lck), TID(thrd))
 
 #define	THR_LOCK_ACQUIRE(thrd, lck)			\
 do {							\
 	(thrd)->locklevel++;				\
 	_thr_umutex_lock(lck, TID(thrd));		\
 } while (0)
 
 #define	THR_LOCK_ACQUIRE_SPIN(thrd, lck)		\
 do {							\
 	(thrd)->locklevel++;				\
 	_thr_umutex_lock_spin(lck, TID(thrd));		\
 } while (0)
 
 #ifdef	_PTHREADS_INVARIANTS
 #define	THR_ASSERT_LOCKLEVEL(thrd)			\
 do {							\
 	if (__predict_false((thrd)->locklevel <= 0))	\
 		_thr_assert_lock_level();		\
 } while (0)
 #else
 #define THR_ASSERT_LOCKLEVEL(thrd)
 #endif
 
 #define	THR_LOCK_RELEASE(thrd, lck)			\
 do {							\
 	THR_ASSERT_LOCKLEVEL(thrd);			\
 	_thr_umutex_unlock((lck), TID(thrd));		\
 	(thrd)->locklevel--;				\
 	_thr_ast(thrd);					\
 } while (0)
 
 #define	THR_LOCK(curthrd)		THR_LOCK_ACQUIRE(curthrd, &(curthrd)->lock)
 #define	THR_UNLOCK(curthrd)		THR_LOCK_RELEASE(curthrd, &(curthrd)->lock)
 #define	THR_THREAD_LOCK(curthrd, thr)	THR_LOCK_ACQUIRE(curthrd, &(thr)->lock)
 #define	THR_THREAD_UNLOCK(curthrd, thr)	THR_LOCK_RELEASE(curthrd, &(thr)->lock)
 
 #define	THREAD_LIST_RDLOCK(curthrd)				\
 do {								\
 	(curthrd)->locklevel++;					\
 	_thr_rwl_rdlock(&_thr_list_lock);			\
 } while (0)
 
 #define	THREAD_LIST_WRLOCK(curthrd)				\
 do {								\
 	(curthrd)->locklevel++;					\
 	_thr_rwl_wrlock(&_thr_list_lock);			\
 } while (0)
 
 #define	THREAD_LIST_UNLOCK(curthrd)				\
 do {								\
 	_thr_rwl_unlock(&_thr_list_lock);			\
 	(curthrd)->locklevel--;					\
 	_thr_ast(curthrd);					\
 } while (0)
 
 /*
  * Macros to insert/remove threads to the all thread list and
  * the gc list.
  */
 #define	THR_LIST_ADD(thrd) do {					\
 	if (((thrd)->tlflags & TLFLAGS_IN_TDLIST) == 0) {	\
 		TAILQ_INSERT_HEAD(&_thread_list, thrd, tle);	\
 		_thr_hash_add(thrd);				\
 		(thrd)->tlflags |= TLFLAGS_IN_TDLIST;		\
 	}							\
 } while (0)
 #define	THR_LIST_REMOVE(thrd) do {				\
 	if (((thrd)->tlflags & TLFLAGS_IN_TDLIST) != 0) {	\
 		TAILQ_REMOVE(&_thread_list, thrd, tle);		\
 		_thr_hash_remove(thrd);				\
 		(thrd)->tlflags &= ~TLFLAGS_IN_TDLIST;		\
 	}							\
 } while (0)
 #define	THR_GCLIST_ADD(thrd) do {				\
 	if (((thrd)->tlflags & TLFLAGS_IN_GCLIST) == 0) {	\
 		TAILQ_INSERT_HEAD(&_thread_gc_list, thrd, gcle);\
 		(thrd)->tlflags |= TLFLAGS_IN_GCLIST;		\
 		_gc_count++;					\
 	}							\
 } while (0)
 #define	THR_GCLIST_REMOVE(thrd) do {				\
 	if (((thrd)->tlflags & TLFLAGS_IN_GCLIST) != 0) {	\
 		TAILQ_REMOVE(&_thread_gc_list, thrd, gcle);	\
 		(thrd)->tlflags &= ~TLFLAGS_IN_GCLIST;		\
 		_gc_count--;					\
 	}							\
 } while (0)
 
 #define THR_REF_ADD(curthread, pthread) {			\
 	THR_CRITICAL_ENTER(curthread);				\
 	pthread->refcount++;					\
 } while (0)
 
 #define THR_REF_DEL(curthread, pthread) {			\
 	pthread->refcount--;					\
 	THR_CRITICAL_LEAVE(curthread);				\
 } while (0)
 
 #define GC_NEEDED()	(_gc_count >= 5)
 
 #define SHOULD_REPORT_EVENT(curthr, e)			\
 	(curthr->report_events && 			\
 	 (((curthr)->event_mask | _thread_event_mask ) & e) != 0)
 
 #ifndef __LIBC_ISTHREADED_DECLARED
 #define __LIBC_ISTHREADED_DECLARED
 extern int __isthreaded;
 #endif
 
 /*
  * Global variables for the pthread kernel.
  */
 
 extern char		*_usrstack __hidden;
 
 /* For debugger */
 extern int		_libthr_debug;
 extern int		_thread_event_mask;
 extern struct pthread	*_thread_last_event;
 /* Used in symbol lookup of libthread_db */
 extern struct pthread_key	_thread_keytable[];
 
 /* List of all threads: */
 extern pthreadlist	_thread_list;
 
 /* List of threads needing GC: */
 extern pthreadlist	_thread_gc_list __hidden;
 
 extern int		_thread_active_threads;
 extern atfork_head	_thr_atfork_list __hidden;
 extern struct urwlock	_thr_atfork_lock __hidden;
 
 /* Default thread attributes: */
 extern struct pthread_attr _pthread_attr_default __hidden;
 
 /* Default mutex attributes: */
 extern struct pthread_mutex_attr _pthread_mutexattr_default __hidden;
 extern struct pthread_mutex_attr _pthread_mutexattr_adaptive_default __hidden;
 
 /* Default condition variable attributes: */
 extern struct pthread_cond_attr _pthread_condattr_default __hidden;
 
 extern struct pthread_prio _thr_priorities[] __hidden;
 
 extern int	_thr_is_smp __hidden;
 
 extern size_t	_thr_guard_default __hidden;
 extern size_t	_thr_stack_default __hidden;
 extern size_t	_thr_stack_initial __hidden;
 extern int	_thr_page_size __hidden;
 extern int	_thr_spinloops __hidden;
 extern int	_thr_yieldloops __hidden;
 extern int	_thr_queuefifo __hidden;
 
 /* Garbage thread count. */
 extern int	_gc_count __hidden;
 
 extern struct umutex	_mutex_static_lock __hidden;
 extern struct umutex	_cond_static_lock __hidden;
 extern struct umutex	_rwlock_static_lock __hidden;
 extern struct umutex	_keytable_lock __hidden;
 extern struct urwlock	_thr_list_lock __hidden;
 extern struct umutex	_thr_event_lock __hidden;
 extern struct umutex	_suspend_all_lock __hidden;
 extern int		_suspend_all_waiters __hidden;
 extern int		_suspend_all_cycle __hidden;
 extern struct pthread	*_single_thread __hidden;
 
 /*
  * Function prototype definitions.
  */
 __BEGIN_DECLS
 void	_thr_setthreaded(int) __hidden;
 int	_mutex_cv_lock(struct pthread_mutex *, int, bool) __hidden;
 int	_mutex_cv_unlock(struct pthread_mutex *, int *, int *) __hidden;
 int     _mutex_cv_attach(struct pthread_mutex *, int) __hidden;
 int     _mutex_cv_detach(struct pthread_mutex *, int *) __hidden;
 int     _mutex_owned(struct pthread *, const struct pthread_mutex *) __hidden;
 int	_mutex_reinit(pthread_mutex_t *) __hidden;
 void	_mutex_fork(struct pthread *curthread) __hidden;
 int	_mutex_enter_robust(struct pthread *curthread, struct pthread_mutex *m)
 	    __hidden;
 void	_mutex_leave_robust(struct pthread *curthread, struct pthread_mutex *m)
 	    __hidden;
 void	_libpthread_init(struct pthread *) __hidden;
 struct pthread *_thr_alloc(struct pthread *) __hidden;
 void	_thread_exit(const char *, int, const char *) __hidden __dead2;
 void	_thread_exitf(const char *, int, const char *, ...) __hidden __dead2
 	    __printflike(3, 4);
 int	_thr_ref_add(struct pthread *, struct pthread *, int) __hidden;
 void	_thr_ref_delete(struct pthread *, struct pthread *) __hidden;
 void	_thr_ref_delete_unlocked(struct pthread *, struct pthread *) __hidden;
 int	_thr_find_thread(struct pthread *, struct pthread *, int) __hidden;
 void	_thr_rtld_init(void) __hidden;
 void	_thr_rtld_postfork_child(void) __hidden;
 int	_thr_stack_alloc(struct pthread_attr *) __hidden;
 void	_thr_stack_free(struct pthread_attr *) __hidden;
 void	_thr_free(struct pthread *, struct pthread *) __hidden;
 void	_thr_gc(struct pthread *) __hidden;
 void    _thread_cleanupspecific(void) __hidden;
 void	_thread_printf(int, const char *, ...) __hidden __printflike(2, 3);
 void	_thread_vprintf(int, const char *, va_list) __hidden;
 void	_thr_spinlock_init(void) __hidden;
 void	_thr_cancel_enter(struct pthread *) __hidden;
 void	_thr_cancel_enter2(struct pthread *, int) __hidden;
 void	_thr_cancel_leave(struct pthread *, int) __hidden;
 void	_thr_testcancel(struct pthread *) __hidden;
 void	_thr_signal_block(struct pthread *) __hidden;
 void	_thr_signal_unblock(struct pthread *) __hidden;
 void	_thr_signal_init(int) __hidden;
 void	_thr_signal_deinit(void) __hidden;
 int	_thr_send_sig(struct pthread *, int sig) __hidden;
 void	_thr_list_init(void) __hidden;
 void	_thr_hash_add(struct pthread *) __hidden;
 void	_thr_hash_remove(struct pthread *) __hidden;
 struct pthread *_thr_hash_find(struct pthread *) __hidden;
 void	_thr_link(struct pthread *, struct pthread *) __hidden;
 void	_thr_unlink(struct pthread *, struct pthread *) __hidden;
 void	_thr_assert_lock_level(void) __hidden __dead2;
 void	_thr_ast(struct pthread *) __hidden;
 void	_thr_report_creation(struct pthread *curthread,
 	    struct pthread *newthread) __hidden;
 void	_thr_report_death(struct pthread *curthread) __hidden;
 int	_thr_getscheduler(lwpid_t, int *, struct sched_param *) __hidden;
 int	_thr_setscheduler(lwpid_t, int, const struct sched_param *) __hidden;
 void	_thr_signal_prefork(void) __hidden;
 void	_thr_signal_postfork(void) __hidden;
 void	_thr_signal_postfork_child(void) __hidden;
 void	_thr_suspend_all_lock(struct pthread *) __hidden;
 void	_thr_suspend_all_unlock(struct pthread *) __hidden;
 void	_thr_try_gc(struct pthread *, struct pthread *) __hidden;
 int	_rtp_to_schedparam(const struct rtprio *rtp, int *policy,
 		struct sched_param *param) __hidden;
 int	_schedparam_to_rtp(int policy, const struct sched_param *param,
 		struct rtprio *rtp) __hidden;
 void	_thread_bp_create(void);
 void	_thread_bp_death(void);
 int	_sched_yield(void);
 
 void	_pthread_cleanup_push(void (*)(void *), void *);
 void	_pthread_cleanup_pop(int);
 void	_pthread_exit_mask(void *status, sigset_t *mask) __dead2 __hidden;
 #ifndef _LIBC_PRIVATE_H_
 void	_pthread_cancel_enter(int maycancel);
 void 	_pthread_cancel_leave(int maycancel);
 #endif
 int	_pthread_mutex_consistent(pthread_mutex_t * _Nonnull);
 int	_pthread_mutexattr_getrobust(pthread_mutexattr_t * _Nonnull __restrict,
 	    int * _Nonnull __restrict);
 int	_pthread_mutexattr_setrobust(pthread_mutexattr_t * _Nonnull, int);
 
 /* #include <fcntl.h> */
 #ifdef  _SYS_FCNTL_H_
 #ifndef _LIBC_PRIVATE_H_
 int     __sys_fcntl(int, int, ...);
 int     __sys_openat(int, const char *, int, ...);
 #endif /* _LIBC_PRIVATE_H_ */
 #endif /* _SYS_FCNTL_H_ */
 
 /* #include <signal.h> */
 #ifdef _SIGNAL_H_
 #ifndef _LIBC_PRIVATE_H_
 int     __sys_sigaction(int, const struct sigaction *, struct sigaction *);
 int     __sys_sigprocmask(int, const sigset_t *, sigset_t *);
 int     __sys_sigsuspend(const sigset_t *);
 int	__sys_sigtimedwait(const sigset_t *, siginfo_t *,
 		const struct timespec *);
 int	__sys_sigwait(const sigset_t *, int *);
 int	__sys_sigwaitinfo(const sigset_t *set, siginfo_t *info);
 #endif /* _LIBC_PRIVATE_H_ */
 #endif /* _SYS_FCNTL_H_ */
 
 /* #include <time.h> */
 #ifdef	_TIME_H_
 #ifndef _LIBC_PRIVATE_H_
 int	__sys_clock_nanosleep(clockid_t, int, const struct timespec *,
 	    struct timespec *);
 int	__sys_nanosleep(const struct timespec *, struct timespec *);
 #endif /* _LIBC_PRIVATE_H_ */
 #endif /* _SYS_FCNTL_H_ */
 
 /* #include <sys/ucontext.h> */
 #ifdef _SYS_UCONTEXT_H_
 #ifndef _LIBC_PRIVATE_H_
 int	__sys_setcontext(const ucontext_t *ucp);
 int	__sys_swapcontext(ucontext_t *oucp, const ucontext_t *ucp);
 #endif /* _LIBC_PRIVATE_H_ */
 #endif /* _SYS_FCNTL_H_ */
 
 /* #include <unistd.h> */
 #ifdef  _UNISTD_H_
 #ifndef _LIBC_PRIVATE_H_
 int     __sys_close(int);
 int	__sys_fork(void);
 ssize_t __sys_read(int, void *, size_t);
 #endif /* _LIBC_PRIVATE_H_ */
 #endif /* _SYS_FCNTL_H_ */
 
 static inline int
 _thr_isthreaded(void)
 {
 	return (__isthreaded != 0);
 }
 
 static inline int
 _thr_is_inited(void)
 {
 	return (_thr_initial != NULL);
 }
 
 static inline void
 _thr_check_init(void)
 {
 	if (_thr_initial == NULL)
 		_libpthread_init(NULL);
 }
 
 struct wake_addr *_thr_alloc_wake_addr(void);
 void	_thr_release_wake_addr(struct wake_addr *);
 int	_thr_sleep(struct pthread *, int, const struct timespec *);
 
 void _thr_wake_addr_init(void) __hidden;
 
 static inline void
 _thr_clear_wake(struct pthread *td)
 {
 	td->wake_addr->value = 0;
 }
 
 static inline int
 _thr_is_woken(struct pthread *td)
 {
 	return td->wake_addr->value != 0;
 }
 
 static inline void
 _thr_set_wake(unsigned int *waddr)
 {
 	*waddr = 1;
 	_thr_umtx_wake(waddr, INT_MAX, 0);
 }
 
 void _thr_wake_all(unsigned int *waddrs[], int) __hidden;
 
 static inline struct pthread *
 _sleepq_first(struct sleepqueue *sq)
 {
 	return TAILQ_FIRST(&sq->sq_blocked);
 }
 
 void	_sleepq_init(void) __hidden;
 struct sleepqueue *_sleepq_alloc(void) __hidden;
 void	_sleepq_free(struct sleepqueue *) __hidden;
 void	_sleepq_lock(void *) __hidden;
 void	_sleepq_unlock(void *) __hidden;
 struct sleepqueue *_sleepq_lookup(void *) __hidden;
 void	_sleepq_add(void *, struct pthread *) __hidden;
 int	_sleepq_remove(struct sleepqueue *, struct pthread *) __hidden;
 void	_sleepq_drop(struct sleepqueue *,
 		void (*cb)(struct pthread *, void *arg), void *) __hidden;
 
 int	_pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex,
 	    void *(calloc_cb)(size_t, size_t));
 
 struct dl_phdr_info;
 void __pthread_cxa_finalize(struct dl_phdr_info *phdr_info);
 void _thr_tsd_unload(struct dl_phdr_info *phdr_info) __hidden;
 void _thr_sigact_unload(struct dl_phdr_info *phdr_info) __hidden;
 void _thr_stack_fix_protection(struct pthread *thrd);
+void __pthread_distribute_static_tls(size_t offset, void *src, size_t len,
+    size_t total_len);
 
 int *__error_threaded(void) __hidden;
 void __thr_interpose_libc(void) __hidden;
 pid_t __thr_fork(void);
 int __thr_setcontext(const ucontext_t *ucp);
 int __thr_sigaction(int sig, const struct sigaction *act,
     struct sigaction *oact) __hidden;
 int __thr_sigprocmask(int how, const sigset_t *set, sigset_t *oset);
 int __thr_sigsuspend(const sigset_t * set);
 int __thr_sigtimedwait(const sigset_t *set, siginfo_t *info,
     const struct timespec * timeout);
 int __thr_sigwait(const sigset_t *set, int *sig);
 int __thr_sigwaitinfo(const sigset_t *set, siginfo_t *info);
 int __thr_swapcontext(ucontext_t *oucp, const ucontext_t *ucp);
 
 void __thr_map_stacks_exec(void);
 
 struct _spinlock;
 void __thr_spinunlock(struct _spinlock *lck);
 void __thr_spinlock(struct _spinlock *lck);
 
 struct tcb *_tcb_ctor(struct pthread *, int);
 void	_tcb_dtor(struct tcb *);
 
 void __thr_pshared_init(void) __hidden;
 void *__thr_pshared_offpage(void *key, int doalloc) __hidden;
 void __thr_pshared_destroy(void *key) __hidden;
 void __thr_pshared_atfork_pre(void) __hidden;
 void __thr_pshared_atfork_post(void) __hidden;
 
 void *__thr_calloc(size_t num, size_t size);
 void __thr_free(void *cp);
 void *__thr_malloc(size_t nbytes);
 void *__thr_realloc(void *cp, size_t nbytes);
 void __thr_malloc_init(void);
 void __thr_malloc_prefork(struct pthread *curthread);
 void __thr_malloc_postfork(struct pthread *curthread);
 
 __END_DECLS
 __NULLABILITY_PRAGMA_POP
 
 #endif  /* !_THR_PRIVATE_H */
Index: projects/capsicum-test/lib/libvgl/main.c
===================================================================
--- projects/capsicum-test/lib/libvgl/main.c	(revision 345709)
+++ projects/capsicum-test/lib/libvgl/main.c	(revision 345710)
@@ -1,585 +1,593 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1991-1997 Søren Schmidt
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include <signal.h>
 #include <stdio.h>
 #include <sys/types.h>
-#include <sys/signal.h>
 #include <sys/file.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <sys/fbio.h>
 #include <sys/kbio.h>
 #include <sys/consio.h>
 #include "vgl.h"
 
 /* XXX Direct Color 24bits modes unsupported */
 
 #define min(x, y)	(((x) < (y)) ? (x) : (y))
 #define max(x, y)	(((x) > (y)) ? (x) : (y))
 
 VGLBitmap *VGLDisplay;
 video_info_t VGLModeInfo;
 video_adapter_info_t VGLAdpInfo;
 byte *VGLBuf;
 
 static int VGLMode;
 static int VGLOldMode;
 static size_t VGLBufSize;
 static byte *VGLMem = MAP_FAILED;
 static int VGLSwitchPending;
 static int VGLAbortPending;
 static int VGLOnDisplay;
 static unsigned int VGLCurWindow;
 static int VGLInitDone = 0;
+static video_info_t VGLOldModeInfo;
 static vid_info_t VGLOldVInfo;
 
 void
 VGLEnd()
 {
 struct vt_mode smode;
+  int size[3];
 
   if (!VGLInitDone)
     return;
   VGLInitDone = 0;
   VGLSwitchPending = 0;
   VGLAbortPending = 0;
 
   signal(SIGUSR1, SIG_IGN);
 
   if (VGLMem != MAP_FAILED) {
     VGLClear(VGLDisplay, 0);
     munmap(VGLMem, VGLAdpInfo.va_window_size);
   }
 
-  if (VGLOldMode >= M_VESA_BASE) {
-    /* ugly, but necessary */
+  if (VGLOldMode >= M_VESA_BASE)
     ioctl(0, _IO('V', VGLOldMode - M_VESA_BASE), 0);
-    if (VGLOldMode == M_VESA_800x600) {
-      int size[3];
-      size[0] = VGLOldVInfo.mv_csz;
-      size[1] = VGLOldVInfo.mv_rsz;
-      size[2] = 16;
-      ioctl(0, KDRASTER, size);
-    }
-  } else {
+  else
     ioctl(0, _IO('S', VGLOldMode), 0);
+  if (VGLOldModeInfo.vi_flags & V_INFO_GRAPHICS) {
+    size[0] = VGLOldVInfo.mv_csz;
+    size[1] = VGLOldVInfo.mv_rsz;
+    size[2] = VGLOldVInfo.font_size;;
+    ioctl(0, KDRASTER, size);
   }
   ioctl(0, KDDISABIO, 0);
   ioctl(0, KDSETMODE, KD_TEXT);
   smode.mode = VT_AUTO;
   ioctl(0, VT_SETMODE, &smode);
   if (VGLBuf)
     free(VGLBuf);
   VGLBuf = NULL;
   free(VGLDisplay);
   VGLDisplay = NULL;
   VGLKeyboardEnd();
 }
 
 static void 
-VGLAbort(int arg __unused)
+VGLAbort(int arg)
 {
+  sigset_t mask;
+
   VGLAbortPending = 1;
   signal(SIGINT, SIG_IGN);
   signal(SIGTERM, SIG_IGN);
-  signal(SIGSEGV, SIG_IGN);
-  signal(SIGBUS, SIG_IGN);
   signal(SIGUSR2, SIG_IGN);
+  if (arg == SIGBUS || arg == SIGSEGV) {
+    signal(arg, SIG_DFL);
+    sigemptyset(&mask);
+    sigaddset(&mask, arg);
+    sigprocmask(SIG_UNBLOCK, &mask, NULL);
+    VGLEnd();
+    kill(getpid(), arg);
+  }
 }
 
 static void
 VGLSwitch(int arg __unused)
 {
   if (!VGLOnDisplay)
     VGLOnDisplay = 1;
   else
     VGLOnDisplay = 0;
   VGLSwitchPending = 1;
   signal(SIGUSR1, VGLSwitch);
 }
 
 int
 VGLInit(int mode)
 {
   struct vt_mode smode;
   int adptype, depth;
 
   if (VGLInitDone)
     return -1;
 
   signal(SIGUSR1, VGLSwitch);
   signal(SIGINT, VGLAbort);
   signal(SIGTERM, VGLAbort);
   signal(SIGSEGV, VGLAbort);
   signal(SIGBUS, VGLAbort);
   signal(SIGUSR2, SIG_IGN);
 
   VGLOnDisplay = 1;
   VGLSwitchPending = 0;
   VGLAbortPending = 0;
 
   if (ioctl(0, CONS_GET, &VGLOldMode) || ioctl(0, CONS_CURRENT, &adptype))
     return -1;
   if (IOCGROUP(mode) == 'V')	/* XXX: this is ugly */
     VGLModeInfo.vi_mode = (mode & 0x0ff) + M_VESA_BASE;
   else
     VGLModeInfo.vi_mode = mode & 0x0ff;
   if (ioctl(0, CONS_MODEINFO, &VGLModeInfo))	/* FBIO_MODEINFO */
     return -1;
 
-  /* If current mode is VESA_800x600 then save its geometry to restore later */
-  if ((VGLOldMode >= M_VESA_BASE) && (VGLOldMode == M_VESA_800x600)) {
-    VGLOldVInfo.size = sizeof(VGLOldVInfo);
-    if (ioctl(0, CONS_GETINFO, &VGLOldVInfo))
-      return -1;
-  }
+  /* Save info for old mode to restore font size if old mode is graphics. */
+  VGLOldModeInfo.vi_mode = VGLOldMode;
+  if (ioctl(0, CONS_MODEINFO, &VGLOldModeInfo))
+    return -1;
+  VGLOldVInfo.size = sizeof(VGLOldVInfo);
+  if (ioctl(0, CONS_GETINFO, &VGLOldVInfo))
+    return -1;
 
   VGLDisplay = (VGLBitmap *)malloc(sizeof(VGLBitmap));
   if (VGLDisplay == NULL)
     return -2;
 
   if (ioctl(0, KDENABIO, 0)) {
     free(VGLDisplay);
     return -3;
   }
 
   VGLInitDone = 1;
 
   /*
    * vi_mem_model specifies the memory model of the current video mode
    * in -CURRENT.
    */
   switch (VGLModeInfo.vi_mem_model) {
   case V_INFO_MM_PLANAR:
     /* we can handle EGA/VGA planner modes only */
     if (VGLModeInfo.vi_depth != 4 || VGLModeInfo.vi_planes != 4
 	|| (adptype != KD_EGA && adptype != KD_VGA)) {
       VGLEnd();
       return -4;
     }
     VGLDisplay->Type = VIDBUF4;
     VGLDisplay->PixelBytes = 1;
     break;
   case V_INFO_MM_PACKED:
     /* we can do only 256 color packed modes */
     if (VGLModeInfo.vi_depth != 8) {
       VGLEnd();
       return -4;
     }
     VGLDisplay->Type = VIDBUF8;
     VGLDisplay->PixelBytes = 1;
     break;
   case V_INFO_MM_VGAX:
     VGLDisplay->Type = VIDBUF8X;
     VGLDisplay->PixelBytes = 1;
     break;
   case V_INFO_MM_DIRECT:
     VGLDisplay->PixelBytes = VGLModeInfo.vi_pixel_size;
     switch (VGLDisplay->PixelBytes) {
     case 2:
       VGLDisplay->Type = VIDBUF16;
       break;
 #if notyet
     case 3:
       VGLDisplay->Type = VIDBUF24;
       break;
 #endif
     case 4:
       VGLDisplay->Type = VIDBUF32;
       break;
     default:
       VGLEnd();
       return -4;
     }
     break;
   default:
     VGLEnd();
     return -4;
   }
 
   ioctl(0, VT_WAITACTIVE, 0);
   ioctl(0, KDSETMODE, KD_GRAPHICS);
   if (ioctl(0, mode, 0)) {
     VGLEnd();
     return -5;
   }
   if (ioctl(0, CONS_ADPINFO, &VGLAdpInfo)) {	/* FBIO_ADPINFO */
     VGLEnd();
     return -6;
   }
 
   /*
    * Calculate the shadow screen buffer size.  In -CURRENT, va_buffer_size
    * always holds the entire frame buffer size, wheather it's in the linear
    * mode or windowed mode.  
    *     VGLBufSize = VGLAdpInfo.va_buffer_size;
    * In -STABLE, va_buffer_size holds the frame buffer size, only if
    * the linear frame buffer mode is supported. Otherwise the field is zero.
    * We shall calculate the minimal size in this case:
    *     VGLAdpInfo.va_line_width*VGLModeInfo.vi_height*VGLModeInfo.vi_planes
    * or
    *     VGLAdpInfo.va_window_size*VGLModeInfo.vi_planes;
    * Use whichever is larger.
    */
   if (VGLAdpInfo.va_buffer_size != 0)
     VGLBufSize = VGLAdpInfo.va_buffer_size;
   else
     VGLBufSize = max(VGLAdpInfo.va_line_width*VGLModeInfo.vi_height,
 		     VGLAdpInfo.va_window_size)*VGLModeInfo.vi_planes;
   VGLBuf = malloc(VGLBufSize);
   if (VGLBuf == NULL) {
     VGLEnd();
     return -7;
   }
 
 #ifdef LIBVGL_DEBUG
   fprintf(stderr, "VGLBufSize:0x%x\n", VGLBufSize);
 #endif
 
   /* see if we are in the windowed buffer mode or in the linear buffer mode */
   if (VGLBufSize/VGLModeInfo.vi_planes > VGLAdpInfo.va_window_size) {
     switch (VGLDisplay->Type) {
     case VIDBUF4:
       VGLDisplay->Type = VIDBUF4S;
       break;
     case VIDBUF8:
       VGLDisplay->Type = VIDBUF8S;
       break;
     case VIDBUF16:
       VGLDisplay->Type = VIDBUF16S;
       break;
     case VIDBUF24:
       VGLDisplay->Type = VIDBUF24S;
       break;
     case VIDBUF32:
       VGLDisplay->Type = VIDBUF32S;
       break;
     default:
       VGLEnd();
       return -8;
     }
   }
 
   VGLMode = mode;
   VGLCurWindow = 0;
 
   VGLDisplay->Xsize = VGLModeInfo.vi_width;
   VGLDisplay->Ysize = VGLModeInfo.vi_height;
   depth = VGLModeInfo.vi_depth;
   if (depth == 15)
     depth = 16;
   VGLDisplay->VXsize = VGLAdpInfo.va_line_width
 			   *8/(depth/VGLModeInfo.vi_planes);
   VGLDisplay->VYsize = VGLBufSize/VGLModeInfo.vi_planes/VGLAdpInfo.va_line_width;
   VGLDisplay->Xorigin = 0;
   VGLDisplay->Yorigin = 0;
 
   VGLMem = (byte*)mmap(0, VGLAdpInfo.va_window_size, PROT_READ|PROT_WRITE,
 		       MAP_FILE | MAP_SHARED, 0, 0);
   if (VGLMem == MAP_FAILED) {
     VGLEnd();
     return -7;
   }
   VGLDisplay->Bitmap = VGLMem;
 
   VGLSavePalette();
 
 #ifdef LIBVGL_DEBUG
   fprintf(stderr, "va_line_width:%d\n", VGLAdpInfo.va_line_width);
   fprintf(stderr, "VGLXsize:%d, Ysize:%d, VXsize:%d, VYsize:%d\n",
 	  VGLDisplay->Xsize, VGLDisplay->Ysize, 
 	  VGLDisplay->VXsize, VGLDisplay->VYsize);
 #endif
 
   smode.mode = VT_PROCESS;
   smode.waitv = 0;
   smode.relsig = SIGUSR1;
   smode.acqsig = SIGUSR1;
   smode.frsig  = SIGINT;	
   if (ioctl(0, VT_SETMODE, &smode)) {
     VGLEnd();
     return -9;
   }
   VGLTextSetFontFile((byte*)0);
   VGLClear(VGLDisplay, 0);
   return 0;
 }
 
 void
 VGLCheckSwitch()
 {
   if (VGLAbortPending) {
     VGLEnd();
     exit(0);
   }
   while (VGLSwitchPending) {
     unsigned int offset;
     unsigned int len;
     int i;
 
     VGLSwitchPending = 0;
     if (VGLOnDisplay) {
       ioctl(0, KDENABIO, 0);
       ioctl(0, KDSETMODE, KD_GRAPHICS);
       ioctl(0, VGLMode, 0);
       VGLCurWindow = 0;
       VGLMem = (byte*)mmap(0, VGLAdpInfo.va_window_size, PROT_READ|PROT_WRITE,
 			   MAP_FILE | MAP_SHARED, 0, 0);
 
       /* XXX: what if mmap() has failed! */
       VGLDisplay->Type = VIDBUF8;	/* XXX */
       switch (VGLModeInfo.vi_mem_model) {
       case V_INFO_MM_PLANAR:
 	if (VGLModeInfo.vi_depth == 4 && VGLModeInfo.vi_planes == 4) {
 	  if (VGLBufSize/VGLModeInfo.vi_planes > VGLAdpInfo.va_window_size)
 	    VGLDisplay->Type = VIDBUF4S;
 	  else
 	    VGLDisplay->Type = VIDBUF4;
 	} else {
 	  /* shouldn't be happening */
 	}
         break;
       case V_INFO_MM_PACKED:
 	if (VGLModeInfo.vi_depth == 8) {
 	  if (VGLBufSize/VGLModeInfo.vi_planes > VGLAdpInfo.va_window_size)
 	    VGLDisplay->Type = VIDBUF8S;
 	  else
 	    VGLDisplay->Type = VIDBUF8;
 	}
         break;
       case V_INFO_MM_VGAX:
 	VGLDisplay->Type = VIDBUF8X;
 	break;
       case V_INFO_MM_DIRECT:
 	switch (VGLModeInfo.vi_pixel_size) {
 	  case 2:
 	    if (VGLBufSize/VGLModeInfo.vi_planes > VGLAdpInfo.va_window_size)
 	      VGLDisplay->Type = VIDBUF16S;
 	    else
 	      VGLDisplay->Type = VIDBUF16;
 	    break;
 	  case 3:
 	    if (VGLBufSize/VGLModeInfo.vi_planes > VGLAdpInfo.va_window_size)
 	      VGLDisplay->Type = VIDBUF24S;
 	    else
 	      VGLDisplay->Type = VIDBUF24;
 	    break;
 	  case 4:
 	    if (VGLBufSize/VGLModeInfo.vi_planes > VGLAdpInfo.va_window_size)
 	      VGLDisplay->Type = VIDBUF32S;
 	    else
 	      VGLDisplay->Type = VIDBUF32;
 	    break;
 	  default:
 	  /* shouldn't be happening */
           break;
         }
       default:
 	/* shouldn't be happening */
         break;
       }
 
       VGLDisplay->Bitmap = VGLMem;
       VGLDisplay->Xsize = VGLModeInfo.vi_width;
       VGLDisplay->Ysize = VGLModeInfo.vi_height;
       VGLSetVScreenSize(VGLDisplay, VGLDisplay->VXsize, VGLDisplay->VYsize);
       VGLPanScreen(VGLDisplay, VGLDisplay->Xorigin, VGLDisplay->Yorigin);
       switch (VGLDisplay->Type) {
       case VIDBUF4S:
 	outb(0x3c6, 0xff);
 	outb(0x3ce, 0x01); outb(0x3cf, 0x00);		/* set/reset enable */
 	outb(0x3ce, 0x08); outb(0x3cf, 0xff);		/* bit mask */
 	for (offset = 0; offset < VGLBufSize/VGLModeInfo.vi_planes;
 	     offset += len) {
 	  VGLSetSegment(offset);
 	  len = min(VGLBufSize/VGLModeInfo.vi_planes - offset,
 		    VGLAdpInfo.va_window_size);
 	  for (i = 0; i < VGLModeInfo.vi_planes; i++) {
 	    outb(0x3c4, 0x02);
 	    outb(0x3c5, 0x01<<i);
 	    bcopy(&VGLBuf[i*VGLBufSize/VGLModeInfo.vi_planes + offset],
 		  VGLMem, len);
 	  }
 	}
 	break;
       case VIDBUF4:
       case VIDBUF8X:
 	outb(0x3c6, 0xff);
 	outb(0x3ce, 0x01); outb(0x3cf, 0x00);		/* set/reset enable */
 	outb(0x3ce, 0x08); outb(0x3cf, 0xff);		/* bit mask */
 	for (i = 0; i < VGLModeInfo.vi_planes; i++) {
 	  outb(0x3c4, 0x02);
 	  outb(0x3c5, 0x01<<i);
 	  bcopy(&VGLBuf[i*VGLAdpInfo.va_window_size], VGLMem,
 		VGLAdpInfo.va_window_size);
 	}
 	break;
       case VIDBUF8:
       case VIDBUF8S:
       case VIDBUF16:
       case VIDBUF16S:
       case VIDBUF24:
       case VIDBUF24S:
       case VIDBUF32:
       case VIDBUF32S:
 	for (offset = 0; offset < VGLBufSize; offset += len) {
 	  VGLSetSegment(offset);
 	  len = min(VGLBufSize - offset, VGLAdpInfo.va_window_size);
           bcopy(&VGLBuf[offset], VGLMem, len);
 	}
 	break;
       }
       VGLRestorePalette();
       ioctl(0, VT_RELDISP, VT_ACKACQ);
     }
     else {
       switch (VGLDisplay->Type) {
       case VIDBUF4S:
 	for (offset = 0; offset < VGLBufSize/VGLModeInfo.vi_planes;
 	     offset += len) {
 	  VGLSetSegment(offset);
 	  len = min(VGLBufSize/VGLModeInfo.vi_planes - offset,
 		    VGLAdpInfo.va_window_size);
 	  for (i = 0; i < VGLModeInfo.vi_planes; i++) {
 	    outb(0x3ce, 0x04);
 	    outb(0x3cf, i);
 	    bcopy(VGLMem, &VGLBuf[i*VGLBufSize/VGLModeInfo.vi_planes + offset],
 		  len);
 	  }
 	}
 	break;
       case VIDBUF4:
       case VIDBUF8X:
 	/*
 	 * NOTE: the saved buffer is NOT in the MEMBUF format which 
 	 * the ordinary memory bitmap object is stored in. XXX
 	 */
 	for (i = 0; i < VGLModeInfo.vi_planes; i++) {
 	  outb(0x3ce, 0x04);
 	  outb(0x3cf, i);
 	  bcopy(VGLMem, &VGLBuf[i*VGLAdpInfo.va_window_size],
 		VGLAdpInfo.va_window_size);
 	}
 	break;
       case VIDBUF8:
       case VIDBUF8S:
       case VIDBUF16:
       case VIDBUF16S:
       case VIDBUF24:
       case VIDBUF24S:
       case VIDBUF32:
       case VIDBUF32S:
 	for (offset = 0; offset < VGLBufSize; offset += len) {
 	  VGLSetSegment(offset);
 	  len = min(VGLBufSize - offset, VGLAdpInfo.va_window_size);
           bcopy(VGLMem, &VGLBuf[offset], len);
 	}
 	break;
       }
       VGLMem = MAP_FAILED;
       munmap(VGLDisplay->Bitmap, VGLAdpInfo.va_window_size);
       ioctl(0, VGLOldMode, 0);
       ioctl(0, KDSETMODE, KD_TEXT);
       ioctl(0, KDDISABIO, 0);
       ioctl(0, VT_RELDISP, VT_TRUE);
       VGLDisplay->Bitmap = VGLBuf;
       VGLDisplay->Type = MEMBUF;
       VGLDisplay->Xsize = VGLDisplay->VXsize;
       VGLDisplay->Ysize = VGLDisplay->VYsize;
       while (!VGLOnDisplay) pause();
     }
   }
 }
 
 int
 VGLSetSegment(unsigned int offset)
 {
   if (offset/VGLAdpInfo.va_window_size != VGLCurWindow) {
     ioctl(0, CONS_SETWINORG, offset);		/* FBIO_SETWINORG */
     VGLCurWindow = offset/VGLAdpInfo.va_window_size;
   }
   return (offset%VGLAdpInfo.va_window_size);
 }
 
 int
 VGLSetVScreenSize(VGLBitmap *object, int VXsize, int VYsize)
 {
   int depth;
 
   if (VXsize < object->Xsize || VYsize < object->Ysize)
     return -1;
   if (object->Type == MEMBUF)
     return -1;
   if (ioctl(0, FBIO_SETLINEWIDTH, &VXsize))
     return -1;
   ioctl(0, CONS_ADPINFO, &VGLAdpInfo);	/* FBIO_ADPINFO */
   depth = VGLModeInfo.vi_depth;
   if (depth == 15)
     depth = 16;
   object->VXsize = VGLAdpInfo.va_line_width
 			   *8/(depth/VGLModeInfo.vi_planes);
   object->VYsize = VGLBufSize/VGLModeInfo.vi_planes/VGLAdpInfo.va_line_width;
   if (VYsize < object->VYsize)
     object->VYsize = VYsize;
 
 #ifdef LIBVGL_DEBUG
   fprintf(stderr, "new size: VGLXsize:%d, Ysize:%d, VXsize:%d, VYsize:%d\n",
 	  object->Xsize, object->Ysize, object->VXsize, object->VYsize);
 #endif
 
   return 0;
 }
 
 int
 VGLPanScreen(VGLBitmap *object, int x, int y)
 {
   video_display_start_t origin;
 
   if (x < 0 || x + object->Xsize > object->VXsize
       || y < 0 || y + object->Ysize > object->VYsize)
     return -1;
   if (object->Type == MEMBUF)
     return 0;
   origin.x = x;
   origin.y = y;
   if (ioctl(0, FBIO_SETDISPSTART, &origin))
     return -1;
   object->Xorigin = x;
   object->Yorigin = y;
 
 #ifdef LIBVGL_DEBUG
   fprintf(stderr, "new origin: (%d, %d)\n", x, y);
 #endif
 
   return 0;
 }
Index: projects/capsicum-test/lib/libvgl/mouse.c
===================================================================
--- projects/capsicum-test/lib/libvgl/mouse.c	(revision 345709)
+++ projects/capsicum-test/lib/libvgl/mouse.c	(revision 345710)
@@ -1,323 +1,337 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1991-1997 Søren Schmidt
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <stdio.h>
 #include <sys/types.h>
 #include <sys/ioctl.h>
 #include <sys/signal.h>
 #include <sys/consio.h>
 #include <sys/fbio.h>
 #include "vgl.h"
 
 #define X 0xff
 static byte StdAndMask[MOUSE_IMG_SIZE*MOUSE_IMG_SIZE] = {
 	X,X,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	X,X,X,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	X,X,X,X,0,0,0,0,0,0,0,0,0,0,0,0,
 	X,X,X,X,X,0,0,0,0,0,0,0,0,0,0,0,
 	X,X,X,X,X,X,0,0,0,0,0,0,0,0,0,0,
 	X,X,X,X,X,X,X,0,0,0,0,0,0,0,0,0,
 	X,X,X,X,X,X,X,X,0,0,0,0,0,0,0,0,
 	X,X,X,X,X,X,X,X,X,0,0,0,0,0,0,0,
 	X,X,X,X,X,X,X,0,0,0,0,0,0,0,0,0,
 	0,0,0,X,X,X,X,0,0,0,0,0,0,0,0,0,
 	0,0,0,X,X,X,X,X,0,0,0,0,0,0,0,0,
 	0,0,0,0,X,X,X,X,0,0,0,0,0,0,0,0,
 	0,0,0,0,X,X,X,X,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 };
 static byte StdOrMask[MOUSE_IMG_SIZE*MOUSE_IMG_SIZE] = {
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,X,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,X,X,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,X,X,X,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,X,X,X,X,0,0,0,0,0,0,0,0,0,0,0,
 	0,X,X,X,X,X,0,0,0,0,0,0,0,0,0,0,
 	0,X,X,X,X,X,X,0,0,0,0,0,0,0,0,0,
 	0,X,X,0,X,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,X,X,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,X,X,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,X,X,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,X,X,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 };
 #undef X
 static VGLBitmap VGLMouseStdAndMask = 
     VGLBITMAP_INITIALIZER(MEMBUF, MOUSE_IMG_SIZE, MOUSE_IMG_SIZE, StdAndMask);
 static VGLBitmap VGLMouseStdOrMask = 
     VGLBITMAP_INITIALIZER(MEMBUF, MOUSE_IMG_SIZE, MOUSE_IMG_SIZE, StdOrMask);
 static VGLBitmap *VGLMouseAndMask, *VGLMouseOrMask;
 static byte map[MOUSE_IMG_SIZE*MOUSE_IMG_SIZE*4];
 static VGLBitmap VGLMouseSave = 
     VGLBITMAP_INITIALIZER(MEMBUF, MOUSE_IMG_SIZE, MOUSE_IMG_SIZE, map);
 static int VGLMouseVisible = 0;
-static int VGLMouseFrozen = 0;
 static int VGLMouseShown = 0;
 static int VGLMouseXpos = 0;
 static int VGLMouseYpos = 0;
 static int VGLMouseButtons = 0;
+static volatile sig_atomic_t VGLMintpending;
+static volatile sig_atomic_t VGLMsuppressint;
 
+#define	INTOFF()	(VGLMsuppressint++)
+#define	INTON()		do { 						\
+				if (--VGLMsuppressint == 0 && VGLMintpending) \
+					VGLMouseAction(0);		\
+			} while (0)
+
 void
 VGLMousePointerShow()
 {
   byte buf[MOUSE_IMG_SIZE*MOUSE_IMG_SIZE*4];
   VGLBitmap buffer =
     VGLBITMAP_INITIALIZER(MEMBUF, MOUSE_IMG_SIZE, MOUSE_IMG_SIZE, buf);
   byte crtcidx, crtcval, gdcidx, gdcval;
   int i, pos, pos1;
 
   if (!VGLMouseVisible) {
-    VGLMouseFrozen++;
+    INTOFF();
     VGLMouseVisible = 1;
     crtcidx = inb(0x3c4);
     crtcval = inb(0x3c5);
     gdcidx = inb(0x3ce);
     gdcval = inb(0x3cf);
     __VGLBitmapCopy(VGLDisplay, VGLMouseXpos, VGLMouseYpos, 
 		  &VGLMouseSave, 0, 0, MOUSE_IMG_SIZE, MOUSE_IMG_SIZE);
     bcopy(VGLMouseSave.Bitmap, buffer.Bitmap,
           MOUSE_IMG_SIZE*MOUSE_IMG_SIZE*VGLDisplay->PixelBytes);
     for (pos = 0; pos <  MOUSE_IMG_SIZE*MOUSE_IMG_SIZE; pos++)
       for (i = 0; i < VGLDisplay->PixelBytes; i++) {
         pos1 = pos * VGLDisplay->PixelBytes + i;
         buffer.Bitmap[pos1] = (buffer.Bitmap[pos1] &
                                ~VGLMouseAndMask->Bitmap[pos]) |
                               VGLMouseOrMask->Bitmap[pos];
       }
     __VGLBitmapCopy(&buffer, 0, 0, VGLDisplay, 
 		  VGLMouseXpos, VGLMouseYpos, MOUSE_IMG_SIZE, MOUSE_IMG_SIZE);
     outb(0x3c4, crtcidx);
     outb(0x3c5, crtcval);
     outb(0x3ce, gdcidx);
     outb(0x3cf, gdcval);
-    VGLMouseFrozen--;
+    INTON();
   }
 }
 
 void
 VGLMousePointerHide()
 {
   byte crtcidx, crtcval, gdcidx, gdcval;
 
   if (VGLMouseVisible) {
-    VGLMouseFrozen++;
+    INTOFF();
     VGLMouseVisible = 0;
     crtcidx = inb(0x3c4);
     crtcval = inb(0x3c5);
     gdcidx = inb(0x3ce);
     gdcval = inb(0x3cf);
     __VGLBitmapCopy(&VGLMouseSave, 0, 0, VGLDisplay, 
 		  VGLMouseXpos, VGLMouseYpos, MOUSE_IMG_SIZE, MOUSE_IMG_SIZE);
     outb(0x3c4, crtcidx);
     outb(0x3c5, crtcval);
     outb(0x3ce, gdcidx);
     outb(0x3cf, gdcval);
-    VGLMouseFrozen--;
+    INTON();
   }
 }
 
 void
 VGLMouseMode(int mode)
 {
   if (mode == VGL_MOUSESHOW) {
     if (VGLMouseShown == VGL_MOUSEHIDE) {
       VGLMousePointerShow();
       VGLMouseShown = VGL_MOUSESHOW;
     }
   }
   else {
     if (VGLMouseShown == VGL_MOUSESHOW) {
       VGLMousePointerHide();
       VGLMouseShown = VGL_MOUSEHIDE;
     }
   }
 }
 
 void
 VGLMouseAction(int dummy)	
 {
   struct mouse_info mouseinfo;
 
-  if (VGLMouseFrozen) {
-    VGLMouseFrozen += 8;
+  if (VGLMsuppressint) {
+    VGLMintpending = 1;
     return;
   }
+again:
+  INTOFF();
+  VGLMintpending = 0;
   mouseinfo.operation = MOUSE_GETINFO;
   ioctl(0, CONS_MOUSECTL, &mouseinfo);
   if (VGLMouseShown == VGL_MOUSESHOW)
     VGLMousePointerHide();
   VGLMouseXpos = mouseinfo.u.data.x;
   VGLMouseYpos = mouseinfo.u.data.y;
   VGLMouseButtons = mouseinfo.u.data.buttons;
   if (VGLMouseShown == VGL_MOUSESHOW)
     VGLMousePointerShow();
+
+  /* 
+   * Loop to handle any new (suppressed) signals.  This is INTON() without
+   * recursion.  !SA_RESTART prevents recursion in signal handling.  So the
+   * maximum recursion is 2 levels.
+   */
+  VGLMsuppressint = 0;
+  if (VGLMintpending)
+    goto again;
 }
 
 void
 VGLMouseSetImage(VGLBitmap *AndMask, VGLBitmap *OrMask)
 {
   if (VGLMouseShown == VGL_MOUSESHOW)
     VGLMousePointerHide();
   VGLMouseAndMask = AndMask;
   VGLMouseOrMask = OrMask;
   if (VGLMouseShown == VGL_MOUSESHOW)
     VGLMousePointerShow();
 }
 
 void
 VGLMouseSetStdImage()
 {
   if (VGLMouseShown == VGL_MOUSESHOW)
     VGLMousePointerHide();
   VGLMouseAndMask = &VGLMouseStdAndMask;
   VGLMouseOrMask = &VGLMouseStdOrMask;
   if (VGLMouseShown == VGL_MOUSESHOW)
     VGLMousePointerShow();
 }
 
 int
 VGLMouseInit(int mode)
 {
   struct mouse_info mouseinfo;
   int error, i, mask;
 
   switch (VGLModeInfo.vi_mem_model) {
   case V_INFO_MM_PACKED:
   case V_INFO_MM_PLANAR:
     mask = 0x0f;
     break;
   case V_INFO_MM_VGAX:
     mask = 0x3f;
     break;
   default:
     mask = 0xff;
     break;
   }
   for (i = 0; i < 256; i++)
     VGLMouseStdOrMask.Bitmap[i] &= mask;
   VGLMouseSetStdImage();
   mouseinfo.operation = MOUSE_MODE;
   mouseinfo.u.mode.signal = SIGUSR2;
   if ((error = ioctl(0, CONS_MOUSECTL, &mouseinfo)))
     return error;
   signal(SIGUSR2, VGLMouseAction);
   mouseinfo.operation = MOUSE_GETINFO;
   ioctl(0, CONS_MOUSECTL, &mouseinfo);
   VGLMouseXpos = mouseinfo.u.data.x;
   VGLMouseYpos = mouseinfo.u.data.y;
   VGLMouseButtons = mouseinfo.u.data.buttons;
   VGLMouseMode(mode);
   return 0;
 }
 
 int
 VGLMouseStatus(int *x, int *y, char *buttons)
 {
-  signal(SIGUSR2, SIG_IGN);
+  INTOFF();
   *x =  VGLMouseXpos;
   *y =  VGLMouseYpos;
   *buttons =  VGLMouseButtons;
-  signal(SIGUSR2, VGLMouseAction);
+  INTON();
   return VGLMouseShown;
 }
 
 int
 VGLMouseFreeze(int x, int y, int width, int hight, u_long color)
 {
   int i, xstride, ystride;
 
-    VGLMouseFrozen++;
+    INTOFF();
     if (width > 1 || hight > 1 || (color & 0xc0000000) == 0) { /* bitmap */
       if (VGLMouseShown == 1) {
         int overlap;
 
         if (x > VGLMouseXpos)
           overlap = (VGLMouseXpos + MOUSE_IMG_SIZE) - x;
         else
           overlap = (x + width) - VGLMouseXpos;
         if (overlap > 0) {
           if (y > VGLMouseYpos)
             overlap = (VGLMouseYpos + MOUSE_IMG_SIZE) - y;
           else
             overlap = (y + hight) - VGLMouseYpos;
           if (overlap > 0)
             VGLMousePointerHide();
         } 
       }
     }
     else {				/* bit */
       if (VGLMouseShown &&
           x >= VGLMouseXpos && x < VGLMouseXpos + MOUSE_IMG_SIZE &&
           y >= VGLMouseYpos && y < VGLMouseYpos + MOUSE_IMG_SIZE) {
         xstride = VGLDisplay->PixelBytes;
         ystride = MOUSE_IMG_SIZE * xstride;
         if (color & 0x40000000) {	/* Get */
           color = 0;
           for (i = xstride - 1; i >= 0; i--)
             color = (color << 8) |
                     VGLMouseSave.Bitmap[(y-VGLMouseYpos)*ystride+
                                         (x-VGLMouseXpos)*xstride+i];
           return 0x40000000 | (color & 0xffffff);
         } else {			/* Set */
           color &= 0xffffff;		/* discard flag and other garbage */
           for (i = 0; i < xstride; i++, color >>= 8)
             VGLMouseSave.Bitmap[(y-VGLMouseYpos)*ystride+
                                 (x-VGLMouseXpos)*xstride+i] = color;
           if (VGLMouseAndMask->Bitmap 
             [(y-VGLMouseYpos)*MOUSE_IMG_SIZE+(x-VGLMouseXpos)]) {
             return 1;
           }   
         }   
       }       
     }
   return 0;
 }
 
 void
 VGLMouseUnFreeze()
 {
-  if (VGLMouseFrozen > 8) {
-    VGLMouseFrozen = 0;
-    VGLMouseAction(0);
-  }
-  else {
-    if (VGLMouseShown == VGL_MOUSESHOW && !VGLMouseVisible)
-      VGLMousePointerShow();
-    VGLMouseFrozen = 0;
-  }
+  if (VGLMouseShown == VGL_MOUSESHOW && !VGLMouseVisible && !VGLMintpending)
+    VGLMousePointerShow();
+  while (VGLMsuppressint)
+    INTON();
 }
Index: projects/capsicum-test/lib/ofed/libibnetdisc/Makefile
===================================================================
--- projects/capsicum-test/lib/ofed/libibnetdisc/Makefile	(revision 345709)
+++ projects/capsicum-test/lib/ofed/libibnetdisc/Makefile	(revision 345710)
@@ -1,39 +1,36 @@
 # $FreeBSD$
 
 .include <src.opts.mk>
 
 _spath=	${SRCTOP}/contrib/ofed/libibnetdisc
 .PATH: ${_spath} ${_spath}/man
 
 SHLIBDIR?=	/usr/lib
 LIB_CXX=	ibnetdisc
 SHLIB_MAJOR=	5
 MK_PROFILE=	no
 
 SRCS= \
 chassis.c \
 g_hash_table.cpp \
 ibnetdisc.c \
 ibnetdisc_cache.c \
 query_smp.c
 
 MAN= \
 ibnd_debug.3 \
 ibnd_destroy_fabric.3 \
 ibnd_discover_fabric.3 \
 ibnd_find_node_dr.3 \
 ibnd_find_node_guid.3 \
 ibnd_iter_nodes.3 \
 ibnd_iter_nodes_type.3 \
 ibnd_show_progress.3
 
 LIBADD=		osmcomp ibmad ibumad
 CFLAGS+=	-DHAVE_CONFIG_H=1
 CFLAGS+=	-I${_spath}
 CFLAGS+=	-I${SYSROOT:U${DESTDIR}}/${INCLUDEDIR}/infiniband
-.if ${COMPILER_FEATURES:Mc++11}
-CXXFLAGS+=	-std=c++11
-.endif
 VERSION_MAP=	${_spath}/libibnetdisc.map
 
 .include <bsd.lib.mk>
Index: projects/capsicum-test/libexec/rtld-elf/arm/reloc.c
===================================================================
--- projects/capsicum-test/libexec/rtld-elf/arm/reloc.c	(revision 345709)
+++ projects/capsicum-test/libexec/rtld-elf/arm/reloc.c	(revision 345710)
@@ -1,526 +1,526 @@
 /*	$NetBSD: mdreloc.c,v 1.23 2003/07/26 15:04:38 mrg Exp $	*/
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/stat.h>
 #include <sys/mman.h>
 
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
 #include "machine/sysarch.h"
 
 #include "debug.h"
 #include "rtld.h"
 #include "paths.h"
 
 #ifdef __ARM_FP
 /*
  * On processors that have hard floating point supported, we also support
  * running soft float binaries. If we're being built with hard float support,
  * check the ELF headers to make sure that this is a hard float binary. If it is
  * a soft float binary, force the dynamic linker to use the alternative soft
  * float path.
  */
 void
 arm_abi_variant_hook(Elf_Auxinfo **aux_info)
 {
 	Elf_Word ehdr;
 
 	/*
 	 * If we're running an old kernel that doesn't provide any data fail
 	 * safe by doing nothing.
 	 */
 	if (aux_info[AT_EHDRFLAGS] == NULL)
 		return;
 	ehdr = aux_info[AT_EHDRFLAGS]->a_un.a_val;
 
 	/*
 	 * Hard float ABI binaries are the default, and use the default paths
 	 * and such.
 	 */
 	if ((ehdr & EF_ARM_VFP_FLOAT) != 0)
 		return;
 
 	/*
 	 * This is a soft float ABI binary. We need to use the soft float
 	 * settings.
 	 */
 	ld_elf_hints_default = _PATH_SOFT_ELF_HINTS;
 	ld_path_libmap_conf = _PATH_SOFT_LIBMAP_CONF;
 	ld_path_rtld = _PATH_SOFT_RTLD;
 	ld_standard_library_path = SOFT_STANDARD_LIBRARY_PATH;
 	ld_env_prefix = LD_SOFT_;
 }
 #endif
 
 void
 init_pltgot(Obj_Entry *obj)
 {       
 	if (obj->pltgot != NULL) {
 		obj->pltgot[1] = (Elf_Addr) obj;
 		obj->pltgot[2] = (Elf_Addr) &_rtld_bind_start;
 	}
 }
 
 int             
 do_copy_relocations(Obj_Entry *dstobj)
 {
 	const Elf_Rel *rellim;
 	const Elf_Rel *rel;
 
 	assert(dstobj->mainprog);	/* COPY relocations are invalid elsewhere */
 
 	rellim = (const Elf_Rel *)((const char *) dstobj->rel + dstobj->relsize);
 	for (rel = dstobj->rel;  rel < rellim;  rel++) {
 		if (ELF_R_TYPE(rel->r_info) == R_ARM_COPY) {
 	    		void *dstaddr;
 			const Elf_Sym *dstsym;
 			const char *name;
 			size_t size;
 			const void *srcaddr;
 			const Elf_Sym *srcsym;
 			const Obj_Entry *srcobj, *defobj;
 			SymLook req;
 			int res;
 			
 			dstaddr = (void *)(dstobj->relocbase + rel->r_offset);
 			dstsym = dstobj->symtab + ELF_R_SYM(rel->r_info);
 			name = dstobj->strtab + dstsym->st_name;
 			size = dstsym->st_size;
 
 			symlook_init(&req, name);
 			req.ventry = fetch_ventry(dstobj,
 			    ELF_R_SYM(rel->r_info));
 			req.flags = SYMLOOK_EARLY;
 
 			for (srcobj = globallist_next(dstobj); srcobj != NULL;
 			    srcobj = globallist_next(srcobj)) {
 				res = symlook_obj(&req, srcobj);
 				if (res == 0) {
 					srcsym = req.sym_out;
 					defobj = req.defobj_out;
 					break;
 				}
 			}			
 			if (srcobj == NULL) {
 				_rtld_error(
 "Undefined symbol \"%s\" referenced from COPY relocation in %s",
 				    name, dstobj->path);
 				return (-1);
 			}
 			
 			srcaddr = (const void *)(defobj->relocbase +
 			    srcsym->st_value);
 			memcpy(dstaddr, srcaddr, size);
 		}
 	}
 	return 0;			     
 }
 
 void _rtld_bind_start(void);
 void _rtld_relocate_nonplt_self(Elf_Dyn *, Elf_Addr);
 
 void
 _rtld_relocate_nonplt_self(Elf_Dyn *dynp, Elf_Addr relocbase)
 {
 	const Elf_Rel *rel = NULL, *rellim;
 	Elf_Addr relsz = 0;
 	Elf_Addr *where;
 	uint32_t size;
 
 	for (; dynp->d_tag != DT_NULL; dynp++) {
 		switch (dynp->d_tag) {
 		case DT_REL:
 			rel = (const Elf_Rel *)(relocbase + dynp->d_un.d_ptr);
 			break;
 		case DT_RELSZ:
 			relsz = dynp->d_un.d_val;
 			break;
 		}
 	}
 	rellim = (const Elf_Rel *)((const char *)rel + relsz);
 	size = (rellim - 1)->r_offset - rel->r_offset;
 	for (; rel < rellim; rel++) {
 		where = (Elf_Addr *)(relocbase + rel->r_offset);
 		
 		*where += (Elf_Addr)relocbase;
 	}
 }
 /*
  * It is possible for the compiler to emit relocations for unaligned data.
  * We handle this situation with these inlines.
  */
 #define	RELOC_ALIGNED_P(x) \
 	(((uintptr_t)(x) & (sizeof(void *) - 1)) == 0)
 
 static __inline Elf_Addr
 load_ptr(void *where)
 {
 	Elf_Addr res;
 
 	memcpy(&res, where, sizeof(res));
 
 	return (res);
 }
 
 static __inline void
 store_ptr(void *where, Elf_Addr val)
 {
 
 	memcpy(where, &val, sizeof(val));
 }
 
 static int
 reloc_nonplt_object(Obj_Entry *obj, const Elf_Rel *rel, SymCache *cache,
     int flags, RtldLockState *lockstate)
 {
 	Elf_Addr        *where;
 	const Elf_Sym   *def;
 	const Obj_Entry *defobj;
 	Elf_Addr         tmp;
 	unsigned long	 symnum;
 
 	where = (Elf_Addr *)(obj->relocbase + rel->r_offset);
 	symnum = ELF_R_SYM(rel->r_info);
 
 	switch (ELF_R_TYPE(rel->r_info)) {
 	case R_ARM_NONE:
 		break;
 		
 #if 1 /* XXX should not occur */
 	case R_ARM_PC24: {	/* word32 S - P + A */
 		Elf32_Sword addend;
 		
 		/*
 		 * Extract addend and sign-extend if needed.
 		 */
 		addend = *where;
 		if (addend & 0x00800000)
 			addend |= 0xff000000;
 		
 		def = find_symdef(symnum, obj, &defobj, flags, cache,
 		    lockstate);
 		if (def == NULL)
 				return -1;
 			tmp = (Elf_Addr)obj->relocbase + def->st_value
 			    - (Elf_Addr)where + (addend << 2);
 			if ((tmp & 0xfe000000) != 0xfe000000 &&
 			    (tmp & 0xfe000000) != 0) {
 				_rtld_error(
 				"%s: R_ARM_PC24 relocation @ %p to %s failed "
 				"(displacement %ld (%#lx) out of range)",
 				    obj->path, where,
 				    obj->strtab + obj->symtab[symnum].st_name,
 				    (long) tmp, (long) tmp);
 				return -1;
 			}
 			tmp >>= 2;
 			*where = (*where & 0xff000000) | (tmp & 0x00ffffff);
 			dbg("PC24 %s in %s --> %p @ %p in %s",
 			    obj->strtab + obj->symtab[symnum].st_name,
 			    obj->path, (void *)*where, where, defobj->path);
 			break;
 		}
 #endif
 
 		case R_ARM_ABS32:	/* word32 B + S + A */
 		case R_ARM_GLOB_DAT:	/* word32 B + S */
 			def = find_symdef(symnum, obj, &defobj, flags, cache,
 			    lockstate);
 			if (def == NULL)
 				return -1;
 			if (__predict_true(RELOC_ALIGNED_P(where))) {
 				tmp =  *where + (Elf_Addr)defobj->relocbase +
 				    def->st_value;
 				*where = tmp;
 			} else {
 				tmp = load_ptr(where) +
 				    (Elf_Addr)defobj->relocbase +
 				    def->st_value;
 				store_ptr(where, tmp);
 			}
 			dbg("ABS32/GLOB_DAT %s in %s --> %p @ %p in %s",
 			    obj->strtab + obj->symtab[symnum].st_name,
 			    obj->path, (void *)tmp, where, defobj->path);
 			break;
 
 		case R_ARM_RELATIVE:	/* word32 B + A */
 			if (__predict_true(RELOC_ALIGNED_P(where))) {
 				tmp = *where + (Elf_Addr)obj->relocbase;
 				*where = tmp;
 			} else {
 				tmp = load_ptr(where) +
 				    (Elf_Addr)obj->relocbase;
 				store_ptr(where, tmp);
 			}
 			dbg("RELATIVE in %s --> %p", obj->path,
 			    (void *)tmp);
 			break;
 
 		case R_ARM_COPY:
 			/*
 			 * These are deferred until all other relocations have
 			 * been done.  All we do here is make sure that the
 			 * COPY relocation is not in a shared library.  They
 			 * are allowed only in executable files.
 			 */
 			if (!obj->mainprog) {
 				_rtld_error(
 			"%s: Unexpected R_COPY relocation in shared library",
 				    obj->path);
 				return -1;
 			}
 			dbg("COPY (avoid in main)");
 			break;
 
 		case R_ARM_TLS_DTPOFF32:
 			def = find_symdef(symnum, obj, &defobj, flags, cache,
 			    lockstate);
 			if (def == NULL)
 				return -1;
 
 			tmp = (Elf_Addr)(def->st_value);
 			if (__predict_true(RELOC_ALIGNED_P(where)))
 				*where = tmp;
 			else
 				store_ptr(where, tmp);
 
 			dbg("TLS_DTPOFF32 %s in %s --> %p",
 			    obj->strtab + obj->symtab[symnum].st_name,
 			    obj->path, (void *)tmp);
 
 			break;
 		case R_ARM_TLS_DTPMOD32:
 			def = find_symdef(symnum, obj, &defobj, flags, cache,
 			    lockstate);
 			if (def == NULL)
 				return -1;
 
 			tmp = (Elf_Addr)(defobj->tlsindex);
 			if (__predict_true(RELOC_ALIGNED_P(where)))
 				*where = tmp;
 			else
 				store_ptr(where, tmp);
 
 			dbg("TLS_DTPMOD32 %s in %s --> %p",
 			    obj->strtab + obj->symtab[symnum].st_name,
 			    obj->path, (void *)tmp);
 
 			break;
 
 		case R_ARM_TLS_TPOFF32:
 			def = find_symdef(symnum, obj, &defobj, flags, cache,
 			    lockstate);
 			if (def == NULL)
 				return -1;
 
-			if (!defobj->tls_done && allocate_tls_offset(obj))
+			if (!defobj->tls_done && !allocate_tls_offset(obj))
 				return -1;
 
 			tmp = (Elf_Addr)def->st_value + defobj->tlsoffset;
 			if (__predict_true(RELOC_ALIGNED_P(where)))
 				*where = tmp;
 			else
 				store_ptr(where, tmp);
 			dbg("TLS_TPOFF32 %s in %s --> %p",
 			    obj->strtab + obj->symtab[symnum].st_name,
 			    obj->path, (void *)tmp);
 			break;
 
 
 		default:
 			dbg("sym = %lu, type = %lu, offset = %p, "
 			    "contents = %p, symbol = %s",
 			    symnum, (u_long)ELF_R_TYPE(rel->r_info),
 			    (void *)rel->r_offset, (void *)load_ptr(where),
 			    obj->strtab + obj->symtab[symnum].st_name);
 			_rtld_error("%s: Unsupported relocation type %ld "
 			    "in non-PLT relocations\n",
 			    obj->path, (u_long) ELF_R_TYPE(rel->r_info));
 			return -1;
 	}
 	return 0;
 }
 
 /*
  *  * Process non-PLT relocations
  *   */
 int
 reloc_non_plt(Obj_Entry *obj, Obj_Entry *obj_rtld, int flags,
     RtldLockState *lockstate)
 {
 	const Elf_Rel *rellim;
 	const Elf_Rel *rel;
 	SymCache *cache;
 	int r = -1;
 	
 	/* The relocation for the dynamic loader has already been done. */
 	if (obj == obj_rtld)
 		return (0);
 	if ((flags & SYMLOOK_IFUNC) != 0)
 		/* XXX not implemented */
 		return (0);
 
 	/*
  	 * The dynamic loader may be called from a thread, we have
 	 * limited amounts of stack available so we cannot use alloca().
 	 */
 	cache = calloc(obj->dynsymcount, sizeof(SymCache));
 	/* No need to check for NULL here */
 
 	rellim = (const Elf_Rel *)((const char *)obj->rel + obj->relsize);
 	for (rel = obj->rel; rel < rellim; rel++) {
 		if (reloc_nonplt_object(obj, rel, cache, flags, lockstate) < 0)
 			goto done;
 	}
 	r = 0;
 done:
 	if (cache != NULL)
 		free(cache);
 	return (r);
 }
 
 /*
  *  * Process the PLT relocations.
  *   */
 int
 reloc_plt(Obj_Entry *obj, int flags __unused, RtldLockState *lockstate __unused)
 {
 	const Elf_Rel *rellim;
 	const Elf_Rel *rel;
 		
 	rellim = (const Elf_Rel *)((const char *)obj->pltrel +
 	    obj->pltrelsize);
 	for (rel = obj->pltrel;  rel < rellim;  rel++) {
 		Elf_Addr *where;
 
 		assert(ELF_R_TYPE(rel->r_info) == R_ARM_JUMP_SLOT);
 		
 		where = (Elf_Addr *)(obj->relocbase + rel->r_offset);
 		*where += (Elf_Addr )obj->relocbase;
 	}
 	
 	return (0);
 }
 
 /*
  *  * LD_BIND_NOW was set - force relocation for all jump slots
  *   */
 int
 reloc_jmpslots(Obj_Entry *obj, int flags, RtldLockState *lockstate)
 {
 	const Obj_Entry *defobj;
 	const Elf_Rel *rellim;
 	const Elf_Rel *rel;
 	const Elf_Sym *def;
 	Elf_Addr *where;
 	Elf_Addr target;
 	
 	rellim = (const Elf_Rel *)((const char *)obj->pltrel + obj->pltrelsize);
 	for (rel = obj->pltrel; rel < rellim; rel++) {
 		assert(ELF_R_TYPE(rel->r_info) == R_ARM_JUMP_SLOT);
 		where = (Elf_Addr *)(obj->relocbase + rel->r_offset);
 		def = find_symdef(ELF_R_SYM(rel->r_info), obj, &defobj,
 		    SYMLOOK_IN_PLT | flags, NULL, lockstate);
 		if (def == NULL) {
 			dbg("reloc_jmpslots: sym not found");
 			return (-1);
 		}
 		
 		target = (Elf_Addr)(defobj->relocbase + def->st_value);		
 		reloc_jmpslot(where, target, defobj, obj,
 		    (const Elf_Rel *) rel);
 	}
 	
 	obj->jmpslots_done = true;
 	
 	return (0);
 }
 
 int
 reloc_iresolve(Obj_Entry *obj __unused,
     struct Struct_RtldLockState *lockstate __unused)
 {
 
 	/* XXX not implemented */
 	return (0);
 }
 
 int
 reloc_gnu_ifunc(Obj_Entry *obj __unused, int flags __unused,
     struct Struct_RtldLockState *lockstate __unused)
 {
 
 	/* XXX not implemented */
 	return (0);
 }
 
 Elf_Addr
 reloc_jmpslot(Elf_Addr *where, Elf_Addr target,
     const Obj_Entry *defobj __unused, const Obj_Entry *obj __unused,
     const Elf_Rel *rel)
 {
 
 	assert(ELF_R_TYPE(rel->r_info) == R_ARM_JUMP_SLOT);
 
 	if (*where != target && !ld_bind_not)
 		*where = target;
 	return (target);
 }
 
 void
 ifunc_init(Elf_Auxinfo aux_info[__min_size(AT_COUNT)] __unused)
 {
 
 }
 
 void
 pre_init(void)
 {
 
 }
 
 void
 allocate_initial_tls(Obj_Entry *objs)
 {
 #ifdef ARM_TP_ADDRESS
 	void **_tp = (void **)ARM_TP_ADDRESS;
 #endif
 
 	/*
 	* Fix the size of the static TLS block by using the maximum
 	* offset allocated so far and adding a bit for dynamic modules to
 	* use.
 	*/
 
 	tls_static_space = tls_last_offset + tls_last_size + RTLD_STATIC_TLS_EXTRA;
 
 #ifdef ARM_TP_ADDRESS
 	(*_tp) = (void *) allocate_tls(objs, NULL, TLS_TCB_SIZE, 8);
 #else
 	sysarch(ARM_SET_TP, allocate_tls(objs, NULL, TLS_TCB_SIZE, 8));
 #endif
 }
 
 void *
 __tls_get_addr(tls_index* ti)
 {
 	char *p;
 #ifdef ARM_TP_ADDRESS
 	void **_tp = (void **)ARM_TP_ADDRESS;
 
 	p = tls_get_addr_common((Elf_Addr **)(*_tp), ti->ti_module, ti->ti_offset);
 #else
 	void *_tp;
 	__asm __volatile("mrc  p15, 0, %0, c13, c0, 3"		\
 	    : "=r" (_tp));
 	p = tls_get_addr_common((Elf_Addr **)(_tp), ti->ti_module, ti->ti_offset);
 #endif
 
 	return (p);
 }
Index: projects/capsicum-test/libexec/rtld-elf/mips/reloc.c
===================================================================
--- projects/capsicum-test/libexec/rtld-elf/mips/reloc.c	(revision 345709)
+++ projects/capsicum-test/libexec/rtld-elf/mips/reloc.c	(revision 345710)
@@ -1,841 +1,841 @@
 /*	$NetBSD: mips_reloc.c,v 1.58 2010/01/14 11:57:06 skrll Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright 1997 Michael L. Hitch <mhitch@montana.edu>
  * Portions copyright 2002 Charles M. Hannum <root@ihack.net>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/endian.h>
 
 #include <stdlib.h>
 #include <string.h>
 #include <inttypes.h>
 
 #include <machine/sysarch.h>
 #include <machine/tls.h>
 
 #include "debug.h"
 #include "rtld.h"
 
 #ifdef __mips_n64
 #define	GOT1_MASK	0x8000000000000000UL
 #else
 #define	GOT1_MASK	0x80000000UL
 #endif
 
 /*
  * Determine if the second GOT entry is reserved for rtld or if it is
  * the first "real" GOT entry.
  *
  * This must be a macro rather than a function so that
  * _rtld_relocate_nonplt_self doesn't trigger a GOT invocation trying
  * to use it before the local GOT entries in rtld are adjusted.
  */
 #ifdef __mips_n64
 /* Old binutils uses the 32-bit GOT1 mask value for N64. */
 #define GOT1_RESERVED_FOR_RTLD(got)					\
 	(((got)[1] == 0x80000000) || (got)[1] & GOT1_MASK)
 #else
 #define GOT1_RESERVED_FOR_RTLD(got)	((got)[1] & GOT1_MASK)
 #endif
 
 #ifdef __mips_n64
 /*
  * ELF64 MIPS encodes the relocs uniquely.  The first 32-bits of info contain
  * the symbol index.  The top 32-bits contain three relocation types encoded
  * in big-endian integer with first relocation in LSB.  This means for little
  * endian we have to byte swap that integer (r_type).
  */
 #define	Elf_Sxword			Elf64_Sxword
 #define	ELF_R_NXTTYPE_64_P(r_type)	((((r_type) >> 8) & 0xff) == R_TYPE(64))
 #if BYTE_ORDER == LITTLE_ENDIAN
 #undef ELF_R_SYM
 #undef ELF_R_TYPE
 #define ELF_R_SYM(r_info)		((r_info) & 0xffffffff)
 #define ELF_R_TYPE(r_info)		bswap32((r_info) >> 32)
 #endif
 #else
 #define	ELF_R_NXTTYPE_64_P(r_type)	(0)
 #define	Elf_Sxword			Elf32_Sword
 #endif
 
 void _rtld_pltbind_start(void);
 
 void
 init_pltgot(Obj_Entry *obj)
 {
 
 	if (obj->pltgot != NULL) {
 		obj->pltgot[0] = (Elf_Addr) &_rtld_bind_start;
 		if (GOT1_RESERVED_FOR_RTLD(obj->pltgot))
 			obj->pltgot[1] = (Elf_Addr) obj | GOT1_MASK;
 	}
 	if (obj->mips_pltgot != NULL) {
 		obj->mips_pltgot[0] = (Elf_Addr) &_rtld_pltbind_start;
 		obj->mips_pltgot[1] = (Elf_Addr) obj;
 	}
 }
 
 int
 do_copy_relocations(Obj_Entry *dstobj)
 {
 	const Obj_Entry *srcobj, *defobj;
 	const Elf_Rel *rellim;
 	const Elf_Rel *rel;
 	const Elf_Sym *srcsym;
 	const Elf_Sym *dstsym;
 	const void *srcaddr;
 	const char *name;
 	void *dstaddr;
 	SymLook req;
 	size_t size;
 	int res;
 
 	/*
 	 * COPY relocs are invalid outside of the main program
 	 */
 	assert(dstobj->mainprog);
 
 	rellim = (const Elf_Rel *)((const char *)dstobj->rel + dstobj->relsize);
 	for (rel = dstobj->rel; rel < rellim; rel++) {
 		if (ELF_R_TYPE(rel->r_info) != R_MIPS_COPY)
 			continue;
 
 		dstaddr = (void *)(dstobj->relocbase + rel->r_offset);
 		dstsym = dstobj->symtab + ELF_R_SYM(rel->r_info);
 		name = dstobj->strtab + dstsym->st_name;
 		size = dstsym->st_size;
 
 		symlook_init(&req, name);
 		req.ventry = fetch_ventry(dstobj, ELF_R_SYM(rel->r_info));
 		req.flags = SYMLOOK_EARLY;
 
 		for (srcobj = globallist_next(dstobj); srcobj != NULL;
 		     srcobj = globallist_next(srcobj)) {
 			res = symlook_obj(&req, srcobj);
 			if (res == 0) {
 				srcsym = req.sym_out;
 				defobj = req.defobj_out;
 				break;
 			}
 		}
 		if (srcobj == NULL) {
 			_rtld_error(
 "Undefined symbol \"%s\" referenced from COPY relocation in %s",
 			    name, dstobj->path);
 			return (-1);
 		}
 
 		srcaddr = (const void *)(defobj->relocbase + srcsym->st_value);
 		memcpy(dstaddr, srcaddr, size);
 	}
 
 	return (0);
 }
 
 void _rtld_relocate_nonplt_self(Elf_Dyn *, Elf_Addr);
 
 /*
  * It is possible for the compiler to emit relocations for unaligned data.
  * We handle this situation with these inlines.
  */
 static __inline Elf_Sxword
 load_ptr(void *where, size_t len)
 {
 	Elf_Sxword val;
 
 	if (__predict_true(((uintptr_t)where & (len - 1)) == 0)) {
 #ifdef __mips_n64
 		if (len == sizeof(Elf_Sxword))
 			return *(Elf_Sxword *)where;
 #endif
 		return *(Elf_Sword *)where;
 	}
 
 	val = 0;
 #if BYTE_ORDER == LITTLE_ENDIAN
 	(void)memcpy(&val, where, len);
 #endif
 #if BYTE_ORDER == BIG_ENDIAN
 	(void)memcpy((uint8_t *)((&val)+1) - len, where, len);
 #endif
 	return (len == sizeof(Elf_Sxword)) ? val : (Elf_Sword)val;
 }
 
 static __inline void
 store_ptr(void *where, Elf_Sxword val, size_t len)
 {
 	if (__predict_true(((uintptr_t)where & (len - 1)) == 0)) {
 #ifdef __mips_n64
 		if (len == sizeof(Elf_Sxword)) {
 			*(Elf_Sxword *)where = val;
 			return;
 		}
 #endif
 		*(Elf_Sword *)where = val;
 		return;
 	}
 #if BYTE_ORDER == LITTLE_ENDIAN
 	(void)memcpy(where, &val, len);
 #endif
 #if BYTE_ORDER == BIG_ENDIAN
 	(void)memcpy(where, (const uint8_t *)((&val)+1) - len, len);
 #endif
 }
 
 void
 _rtld_relocate_nonplt_self(Elf_Dyn *dynp, Elf_Addr relocbase)
 {
 	const Elf_Rel *rel = NULL, *rellim;
 	Elf_Addr relsz = 0;
 	const Elf_Sym *symtab = NULL, *sym;
 	Elf_Addr *where;
 	Elf_Addr *got = NULL;
 	Elf_Word local_gotno = 0, symtabno = 0, gotsym = 0;
 	size_t i;
 
 	for (; dynp->d_tag != DT_NULL; dynp++) {
 		switch (dynp->d_tag) {
 		case DT_REL:
 			rel = (const Elf_Rel *)(relocbase + dynp->d_un.d_ptr);
 			break;
 		case DT_RELSZ:
 			relsz = dynp->d_un.d_val;
 			break;
 		case DT_SYMTAB:
 			symtab = (const Elf_Sym *)(relocbase + dynp->d_un.d_ptr);
 			break;
 		case DT_PLTGOT:
 			got = (Elf_Addr *)(relocbase + dynp->d_un.d_ptr);
 			break;
 		case DT_MIPS_LOCAL_GOTNO:
 			local_gotno = dynp->d_un.d_val;
 			break;
 		case DT_MIPS_SYMTABNO:
 			symtabno = dynp->d_un.d_val;
 			break;
 		case DT_MIPS_GOTSYM:
 			gotsym = dynp->d_un.d_val;
 			break;
 		}
 	}
 
 	i = GOT1_RESERVED_FOR_RTLD(got) ? 2 : 1;
 	/* Relocate the local GOT entries */
 	got += i;
 	for (; i < local_gotno; i++) {
 		*got++ += relocbase;
 	}
 
 	sym = symtab + gotsym;
 	/* Now do the global GOT entries */
 	for (i = gotsym; i < symtabno; i++) {
 		*got = sym->st_value + relocbase;
 		++sym;
 		++got;
 	}
 
 	rellim = (const Elf_Rel *)((const char *)rel + relsz);
 	for (; rel < rellim; rel++) {
 		Elf_Word r_symndx, r_type;
 
 		where = (void *)(relocbase + rel->r_offset);
 
 		r_symndx = ELF_R_SYM(rel->r_info);
 		r_type = ELF_R_TYPE(rel->r_info);
 
 		switch (r_type & 0xff) {
 		case R_TYPE(REL32): {
 			const size_t rlen =
 			    ELF_R_NXTTYPE_64_P(r_type)
 				? sizeof(Elf_Sxword)
 				: sizeof(Elf_Sword);
 			Elf_Sxword old = load_ptr(where, rlen);
 			Elf_Sxword val = old;
 #ifdef __mips_n64
 			assert(r_type == R_TYPE(REL32)
 			    || r_type == (R_TYPE(REL32)|(R_TYPE(64) << 8)));
 #endif
 			assert(r_symndx < gotsym);
 			sym = symtab + r_symndx;
 			assert(ELF_ST_BIND(sym->st_info) == STB_LOCAL);
 			val += relocbase;
 			dbg("REL32/L(%p) %p -> %p in <self>",
 			    where, (void *)old, (void *)val);
 			store_ptr(where, val, rlen);
 			break;
 		}
 
 		case R_TYPE(GPREL32):
 		case R_TYPE(NONE):
 			break;
 
 
 		default:
 			abort();
 			break;
 		}
 	}
 }
 
 Elf_Addr
 _mips_rtld_bind(Obj_Entry *obj, Elf_Size reloff)
 {
         Elf_Addr *got = obj->pltgot;
         const Elf_Sym *def;
         const Obj_Entry *defobj;
         Elf_Addr *where;
         Elf_Addr target;
         RtldLockState lockstate;
 
 	rlock_acquire(rtld_bind_lock, &lockstate);
 	if (sigsetjmp(lockstate.env, 0) != 0)
 		lock_upgrade(rtld_bind_lock, &lockstate);
 
 	where = &got[obj->local_gotno + reloff - obj->gotsym];
         def = find_symdef(reloff, obj, &defobj, SYMLOOK_IN_PLT, NULL,
            &lockstate);
         if (def == NULL)
 		rtld_die();
 
         target = (Elf_Addr)(defobj->relocbase + def->st_value);
         dbg("bind now/fixup at %s sym # %jd in %s --> was=%p new=%p",
 	    obj->path,
 	    (intmax_t)reloff, defobj->strtab + def->st_name, 
 	    (void *)*where, (void *)target);
 	if (!ld_bind_not)
 		*where = target;
 	lock_release(rtld_bind_lock, &lockstate);
 	return (Elf_Addr)target;
 }
 
 int
 reloc_non_plt(Obj_Entry *obj, Obj_Entry *obj_rtld, int flags,
     RtldLockState *lockstate)
 {
 	const Elf_Rel *rel;
 	const Elf_Rel *rellim;
 	Elf_Addr *got = obj->pltgot;
 	const Elf_Sym *sym, *def;
 	const Obj_Entry *defobj;
 	Elf_Word i;
 #ifdef SUPPORT_OLD_BROKEN_LD
 	int broken;
 #endif
 
 	/* The relocation for the dynamic loader has already been done. */
 	if (obj == obj_rtld)
 		return (0);
 
 	if ((flags & SYMLOOK_IFUNC) != 0)
 		/* XXX not implemented */
 		return (0);
 
 #ifdef SUPPORT_OLD_BROKEN_LD
 	broken = 0;
 	sym = obj->symtab;
 	for (i = 1; i < 12; i++)
 		if (sym[i].st_info == ELF_ST_INFO(STB_LOCAL, STT_NOTYPE))
 			broken = 1;
 	dbg("%s: broken=%d", obj->path, broken);
 #endif
 
 	i = GOT1_RESERVED_FOR_RTLD(got) ? 2 : 1;
 
 	/* Relocate the local GOT entries */
 	got += i;
 	dbg("got:%p for %d entries adding %p",
 	    got, obj->local_gotno, obj->relocbase);
 	for (; i < obj->local_gotno; i++) {
 		*got += (Elf_Addr)obj->relocbase;
 		got++;
 	}
 	sym = obj->symtab + obj->gotsym;
 
 	dbg("got:%p for %d entries",
 	    got, obj->symtabno);
 	/* Now do the global GOT entries */
 	for (i = obj->gotsym; i < obj->symtabno; i++) {
 		dbg(" doing got %d sym %p (%s, %lx)", i - obj->gotsym, sym,
 		    sym->st_name + obj->strtab, (u_long) *got);
 
 #ifdef SUPPORT_OLD_BROKEN_LD
 		if (ELF_ST_TYPE(sym->st_info) == STT_FUNC &&
 		    broken && sym->st_shndx == SHN_UNDEF) {
 			/*
 			 * XXX DANGER WILL ROBINSON!
 			 * You might think this is stupid, as it intentionally
 			 * defeats lazy binding -- and you'd be right.
 			 * Unfortunately, for lazy binding to work right, we
 			 * need to a way to force the GOT slots used for
 			 * function pointers to be resolved immediately.  This
 			 * is supposed to be done automatically by the linker,
 			 * by not outputting a PLT slot and setting st_value
 			 * to 0 if there are non-PLT references, but older
 			 * versions of GNU ld do not do this.
 			 */
 			def = find_symdef(i, obj, &defobj, flags, NULL,
 			    lockstate);
 			if (def == NULL)
 				return -1;
 			*got = def->st_value + (Elf_Addr)defobj->relocbase;
 		} else
 #endif
 		if (ELF_ST_TYPE(sym->st_info) == STT_FUNC &&
 		    sym->st_value != 0 && sym->st_shndx == SHN_UNDEF) {
 			/*
 			 * If there are non-PLT references to the function,
 			 * st_value should be 0, forcing us to resolve the
 			 * address immediately.
 			 *
 			 * XXX DANGER WILL ROBINSON!
 			 * The linker is not outputting PLT slots for calls to
 			 * functions that are defined in the same shared
 			 * library.  This is a bug, because it can screw up
 			 * link ordering rules if the symbol is defined in
 			 * more than one module.  For now, if there is a
 			 * definition, we fail the test above and force a full
 			 * symbol lookup.  This means that all intra-module
 			 * calls are bound immediately.  - mycroft, 2003/09/24
 			 */
 			*got = sym->st_value + (Elf_Addr)obj->relocbase;
 			if ((Elf_Addr)(*got) == (Elf_Addr)obj->relocbase) {
 				dbg("Warning2, i:%d maps to relocbase address:%p",
 				    i, obj->relocbase);
 			}
 
 		} else if (sym->st_info == ELF_ST_INFO(STB_GLOBAL, STT_SECTION)) {
 			/* Symbols with index SHN_ABS are not relocated. */
 			if (sym->st_shndx != SHN_ABS) {
 				*got = sym->st_value +
 				    (Elf_Addr)obj->relocbase;
 				if ((Elf_Addr)(*got) == (Elf_Addr)obj->relocbase) {
 					dbg("Warning3, i:%d maps to relocbase address:%p",
 					    i, obj->relocbase);
 				}
 			}
 		} else {
 			/* TODO: add cache here */
 			def = find_symdef(i, obj, &defobj, flags, NULL,
 			    lockstate);
 			if (def == NULL) {
 				dbg("Warning4, can't find symbole %d", i);
 				return -1;
 			}
 			*got = def->st_value + (Elf_Addr)defobj->relocbase;
 			if ((Elf_Addr)(*got) == (Elf_Addr)obj->relocbase) {
 				dbg("Warning4, i:%d maps to relocbase address:%p",
 				    i, obj->relocbase);
 				dbg("via first obj symbol %s",
 				    obj->strtab + obj->symtab[i].st_name);
 				dbg("found in obj %p:%s",
 				    defobj, defobj->path);
 			}
 		}
 
 		dbg("  --> now %lx", (u_long) *got);
 		++sym;
 		++got;
 	}
 
 	got = obj->pltgot;
 	rellim = (const Elf_Rel *)((const char *)obj->rel + obj->relsize);
 	for (rel = obj->rel; rel < rellim; rel++) {
 		Elf_Word	r_symndx, r_type;
 		void		*where;
 
 		where = obj->relocbase + rel->r_offset;
 		r_symndx = ELF_R_SYM(rel->r_info);
 		r_type = ELF_R_TYPE(rel->r_info);
 
 		switch (r_type & 0xff) {
 		case R_TYPE(NONE):
 			break;
 
 		case R_TYPE(REL32): {
 			/* 32-bit PC-relative reference */
 			const size_t rlen =
 			    ELF_R_NXTTYPE_64_P(r_type)
 				? sizeof(Elf_Sxword)
 				: sizeof(Elf_Sword);
 			Elf_Sxword old = load_ptr(where, rlen);
 			Elf_Sxword val = old;
 
 			def = obj->symtab + r_symndx;
 
 			if (r_symndx >= obj->gotsym) {
 				val += got[obj->local_gotno + r_symndx - obj->gotsym];
 				dbg("REL32/G(%p) %p --> %p (%s) in %s",
 				    where, (void *)old, (void *)val,
 				    obj->strtab + def->st_name,
 				    obj->path);
 			} else {
 				/*
 				 * XXX: ABI DIFFERENCE!
 				 *
 				 * Old NetBSD binutils would generate shared
 				 * libs with section-relative relocations being
 				 * already adjusted for the start address of
 				 * the section.
 				 *
 				 * New binutils, OTOH, generate shared libs
 				 * with the same relocations being based at
 				 * zero, so we need to add in the start address
 				 * of the section.
 				 *
 				 * --rkb, Oct 6, 2001
 				 */
 
 				if (def->st_info ==
 				    ELF_ST_INFO(STB_LOCAL, STT_SECTION)
 #ifdef SUPPORT_OLD_BROKEN_LD
 				    && !broken
 #endif
 				    )
 					val += (Elf_Addr)def->st_value;
 
 				val += (Elf_Addr)obj->relocbase;
 
 				dbg("REL32/L(%p) %p -> %p (%s) in %s",
 				    where, (void *)old, (void *)val,
 				    obj->strtab + def->st_name, obj->path);
 			}
 			store_ptr(where, val, rlen);
 			break;
 		}
 
 		case R_TYPE(COPY):
 			/*
 			 * These are deferred until all other relocations have
 			 * been done. All we do here is make sure that the
 			 * COPY relocation is not in a shared library. They
 			 * are allowed only in executable files.
 			 */
 			if (!obj->mainprog) {
 				_rtld_error("%s: Unexpected R_MIPS_COPY "
 				    "relocation in shared library", obj->path);
 				return (-1);
 			}
 			break;
 			
 #ifdef __mips_n64
 		case R_TYPE(TLS_DTPMOD64):
 #else
 		case R_TYPE(TLS_DTPMOD32): 
 #endif
 		{
 
 			const size_t rlen = sizeof(Elf_Addr);
 			Elf_Addr old = load_ptr(where, rlen);
 			Elf_Addr val = old;
 
         		def = find_symdef(r_symndx, obj, &defobj, flags, NULL,
 	    			lockstate);
 			if (def == NULL)
 				return -1;
 
 			val += (Elf_Addr)defobj->tlsindex;
 
 			store_ptr(where, val, rlen);
 			dbg("DTPMOD %s in %s %p --> %p in %s",
 			    obj->strtab + obj->symtab[r_symndx].st_name,
 			    obj->path, (void *)old, (void*)val, defobj->path);
 			break;
 		}
 
 #ifdef __mips_n64
 		case R_TYPE(TLS_DTPREL64):
 #else
 		case R_TYPE(TLS_DTPREL32):
 #endif
 		{
 			const size_t rlen = sizeof(Elf_Addr);
 			Elf_Addr old = load_ptr(where, rlen);
 			Elf_Addr val = old;
 
         		def = find_symdef(r_symndx, obj, &defobj, flags, NULL,
 	    			lockstate);
 			if (def == NULL)
 				return -1;
 
-			if (!defobj->tls_done && allocate_tls_offset(obj))
+			if (!defobj->tls_done && !allocate_tls_offset(obj))
 				return -1;
 
 			val += (Elf_Addr)def->st_value - TLS_DTP_OFFSET;
 			store_ptr(where, val, rlen);
 
 			dbg("DTPREL %s in %s %p --> %p in %s",
 			    obj->strtab + obj->symtab[r_symndx].st_name,
 			    obj->path, (void*)old, (void *)val, defobj->path);
 			break;
 		}
 
 #ifdef __mips_n64
 		case R_TYPE(TLS_TPREL64):
 #else
 		case R_TYPE(TLS_TPREL32):
 #endif
 		{
 			const size_t rlen = sizeof(Elf_Addr);
 			Elf_Addr old = load_ptr(where, rlen);
 			Elf_Addr val = old;
 
         		def = find_symdef(r_symndx, obj, &defobj, flags, NULL,
 	    			lockstate);
 
 			if (def == NULL)
 				return -1;
 
-			if (!defobj->tls_done && allocate_tls_offset(obj))
+			if (!defobj->tls_done && !allocate_tls_offset(obj))
 				return -1;
 
 			val += (Elf_Addr)(def->st_value + defobj->tlsoffset
 			    - TLS_TP_OFFSET - TLS_TCB_SIZE);
 			store_ptr(where, val, rlen);
 
 			dbg("TPREL %s in %s %p --> %p in %s",
 			    obj->strtab + obj->symtab[r_symndx].st_name,
 			    obj->path, (void*)old, (void *)val, defobj->path);
 			break;
 		}
 
 
 
 		default:
 			dbg("sym = %lu, type = %lu, offset = %p, "
 			    "contents = %p, symbol = %s",
 			    (u_long)r_symndx, (u_long)ELF_R_TYPE(rel->r_info),
 			    (void *)rel->r_offset,
 			    (void *)load_ptr(where, sizeof(Elf_Sword)),
 			    obj->strtab + obj->symtab[r_symndx].st_name);
 			_rtld_error("%s: Unsupported relocation type %ld "
 			    "in non-PLT relocations",
 			    obj->path, (u_long) ELF_R_TYPE(rel->r_info));
 			return -1;
 		}
 	}
 
 	return 0;
 }
 
 /*
  *  Process the PLT relocations.
  */
 int
 reloc_plt(Obj_Entry *obj, int flags __unused, RtldLockState *lockstate __unused)
 {
 	const Elf_Rel *rellim;
 	const Elf_Rel *rel;
 
 	rellim = (const Elf_Rel *)((const char *)obj->pltrel + obj->pltrelsize);
 	for (rel = obj->pltrel; rel < rellim; rel++) {
 		Elf_Addr *where;
 
 		switch (ELF_R_TYPE(rel->r_info)) {
 		case R_MIPS_JUMP_SLOT:
 			where = (Elf_Addr *)(obj->relocbase + rel->r_offset);
 			*where += (Elf_Addr )obj->relocbase;
 			break;
 		default:
 			_rtld_error("Unknown relocation type %u in PLT",
 			    (unsigned int)ELF_R_TYPE(rel->r_info));
 			return (-1);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * LD_BIND_NOW was set - force relocation for all jump slots
  */
 int
 reloc_jmpslots(Obj_Entry *obj, int flags, RtldLockState *lockstate)
 {
 	const Obj_Entry *defobj;
 	const Elf_Rel *rellim;
 	const Elf_Rel *rel;
 	const Elf_Sym *def;
 
 	rellim = (const Elf_Rel *)((const char *)obj->pltrel + obj->pltrelsize);
 	for (rel = obj->pltrel; rel < rellim; rel++) {
 		Elf_Addr *where;
 
 		switch (ELF_R_TYPE(rel->r_info)) {
 		case R_MIPS_JUMP_SLOT:
 			def = find_symdef(ELF_R_SYM(rel->r_info), obj,
 			    &defobj, SYMLOOK_IN_PLT | flags, NULL, lockstate);
 			if (def == NULL) {
 				dbg("reloc_jmpslots: sym not found");
 				return (-1);
 			}
 
 			where = (Elf_Addr *)(obj->relocbase + rel->r_offset);
 			*where = (Elf_Addr)(defobj->relocbase + def->st_value);
 			break;
 		default:
 			_rtld_error("Unknown relocation type %u in PLT",
 			    (unsigned int)ELF_R_TYPE(rel->r_info));
 			return (-1);
 		}
 	}
 
 	return (0);
 }
 
 int
 reloc_iresolve(Obj_Entry *obj __unused,
     struct Struct_RtldLockState *lockstate __unused)
 {
 
 	/* XXX not implemented */
 	return (0);
 }
 
 int
 reloc_gnu_ifunc(Obj_Entry *obj __unused, int flags __unused,
     struct Struct_RtldLockState *lockstate __unused)
 {
 
 	/* XXX not implemented */
 	return (0);
 }
 
 Elf_Addr
 reloc_jmpslot(Elf_Addr *where, Elf_Addr target,
     const Obj_Entry *defobj __unused, const Obj_Entry *obj __unused,
     const Elf_Rel *rel)
 {
 
 	assert(ELF_R_TYPE(rel->r_info) == R_MIPS_JUMP_SLOT);
 
 	if (*where != target && !ld_bind_not)
 		*where = target;
 	return (target);
 }
 
 void
 ifunc_init(Elf_Auxinfo aux_info[__min_size(AT_COUNT)] __unused)
 {
 
 }
 
 void
 pre_init(void)
 {
 
 }
 
 void
 allocate_initial_tls(Obj_Entry *objs)
 {
 	char *tls;
 	
 	/*
 	 * Fix the size of the static TLS block by using the maximum
 	 * offset allocated so far and adding a bit for dynamic modules to
 	 * use.
 	 */
 	tls_static_space = tls_last_offset + tls_last_size + RTLD_STATIC_TLS_EXTRA;
 
 	tls = (char *) allocate_tls(objs, NULL, TLS_TCB_SIZE, 8);
 
 	sysarch(MIPS_SET_TLS, tls);
 }
 
 #ifdef __mips_n64
 void *
 _mips_get_tls(void)
 {
 	uint64_t _rv;
 
 	__asm__ __volatile__ (
 	    ".set\tpush\n\t"
 	    ".set\tmips64r2\n\t"
 	    "rdhwr\t%0, $29\n\t"
 	    ".set\tpop"
 	    : "=r" (_rv));
 	/*
 	 * XXXSS See 'git show c6be4f4d2d1b71c04de5d3bbb6933ce2dbcdb317'
 	 *
 	 * Remove the offset since this really a request to get the TLS
 	 * pointer via sysarch() (in theory).  Of course, this may go away
 	 * once the TLS code is rewritten.
 	 */
 	_rv = _rv - TLS_TP_OFFSET - TLS_TCB_SIZE;
 
 	return (void *)_rv;
 }
 
 #else /* mips 32 */
 
 void *
 _mips_get_tls(void)
 {
 	uint32_t _rv;
 
 	__asm__ __volatile__ (
 	    ".set\tpush\n\t"
 	    ".set\tmips32r2\n\t"
 	    "rdhwr\t%0, $29\n\t"
 	    ".set\tpop"
 	    : "=r" (_rv));
 	/*
 	 * XXXSS See 'git show c6be4f4d2d1b71c04de5d3bbb6933ce2dbcdb317'
 	 *
 	 * Remove the offset since this really a request to get the TLS
 	 * pointer via sysarch() (in theory).  Of course, this may go away
 	 * once the TLS code is rewritten.
 	 */
 	_rv = _rv - TLS_TP_OFFSET - TLS_TCB_SIZE;
 
 	return (void *)_rv;
 }
 #endif /* ! __mips_n64 */
 
 void *
 __tls_get_addr(tls_index* ti)
 {
 	Elf_Addr** tls;
 	char *p;
 
 #ifdef TLS_USE_SYSARCH
 	sysarch(MIPS_GET_TLS, &tls);
 #else
 	tls = _mips_get_tls();
 #endif
 
 	p = tls_get_addr_common(tls, ti->ti_module, ti->ti_offset + TLS_DTP_OFFSET);
 
 	return (p);
 }
Index: projects/capsicum-test/libexec/rtld-elf/rtld.c
===================================================================
--- projects/capsicum-test/libexec/rtld-elf/rtld.c	(revision 345709)
+++ projects/capsicum-test/libexec/rtld-elf/rtld.c	(revision 345710)
@@ -1,5669 +1,5703 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright 1996, 1997, 1998, 1999, 2000 John D. Polstra.
  * Copyright 2003 Alexander Kabaev <kan@FreeBSD.ORG>.
  * Copyright 2009-2013 Konstantin Belousov <kib@FreeBSD.ORG>.
  * Copyright 2012 John Marino <draco@marino.st>.
  * Copyright 2014-2017 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * Dynamic linker for ELF.
  *
  * John Polstra <jdp@polstra.com>.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/mount.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 #include <sys/utsname.h>
 #include <sys/ktrace.h>
 
 #include <dlfcn.h>
 #include <err.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
 #include "debug.h"
 #include "rtld.h"
 #include "libmap.h"
 #include "paths.h"
 #include "rtld_tls.h"
 #include "rtld_printf.h"
 #include "rtld_malloc.h"
 #include "rtld_utrace.h"
 #include "notes.h"
 
 /* Types. */
 typedef void (*func_ptr_type)(void);
 typedef void * (*path_enum_proc) (const char *path, size_t len, void *arg);
 
 
 /* Variables that cannot be static: */
 extern struct r_debug r_debug; /* For GDB */
 extern int _thread_autoinit_dummy_decl;
 extern char* __progname;
 extern void (*__cleanup)(void);
 
 
 /*
  * Function declarations.
  */
 static const char *basename(const char *);
 static void digest_dynamic1(Obj_Entry *, int, const Elf_Dyn **,
     const Elf_Dyn **, const Elf_Dyn **);
 static void digest_dynamic2(Obj_Entry *, const Elf_Dyn *, const Elf_Dyn *,
     const Elf_Dyn *);
 static void digest_dynamic(Obj_Entry *, int);
 static Obj_Entry *digest_phdr(const Elf_Phdr *, int, caddr_t, const char *);
+static void distribute_static_tls(Objlist *, RtldLockState *);
 static Obj_Entry *dlcheck(void *);
 static int dlclose_locked(void *, RtldLockState *);
 static Obj_Entry *dlopen_object(const char *name, int fd, Obj_Entry *refobj,
     int lo_flags, int mode, RtldLockState *lockstate);
 static Obj_Entry *do_load_object(int, const char *, char *, struct stat *, int);
 static int do_search_info(const Obj_Entry *obj, int, struct dl_serinfo *);
 static bool donelist_check(DoneList *, const Obj_Entry *);
 static void errmsg_restore(char *);
 static char *errmsg_save(void);
 static void *fill_search_info(const char *, size_t, void *);
 static char *find_library(const char *, const Obj_Entry *, int *);
 static const char *gethints(bool);
 static void hold_object(Obj_Entry *);
 static void unhold_object(Obj_Entry *);
 static void init_dag(Obj_Entry *);
 static void init_marker(Obj_Entry *);
 static void init_pagesizes(Elf_Auxinfo **aux_info);
 static void init_rtld(caddr_t, Elf_Auxinfo **);
 static void initlist_add_neededs(Needed_Entry *, Objlist *);
 static void initlist_add_objects(Obj_Entry *, Obj_Entry *, Objlist *);
 static int initlist_objects_ifunc(Objlist *, bool, int, RtldLockState *);
 static void linkmap_add(Obj_Entry *);
 static void linkmap_delete(Obj_Entry *);
 static void load_filtees(Obj_Entry *, int flags, RtldLockState *);
 static void unload_filtees(Obj_Entry *, RtldLockState *);
 static int load_needed_objects(Obj_Entry *, int);
 static int load_preload_objects(void);
 static Obj_Entry *load_object(const char *, int fd, const Obj_Entry *, int);
 static void map_stacks_exec(RtldLockState *);
 static int obj_disable_relro(Obj_Entry *);
 static int obj_enforce_relro(Obj_Entry *);
 static Obj_Entry *obj_from_addr(const void *);
 static void objlist_call_fini(Objlist *, Obj_Entry *, RtldLockState *);
 static void objlist_call_init(Objlist *, RtldLockState *);
 static void objlist_clear(Objlist *);
 static Objlist_Entry *objlist_find(Objlist *, const Obj_Entry *);
 static void objlist_init(Objlist *);
 static void objlist_push_head(Objlist *, Obj_Entry *);
 static void objlist_push_tail(Objlist *, Obj_Entry *);
 static void objlist_put_after(Objlist *, Obj_Entry *, Obj_Entry *);
 static void objlist_remove(Objlist *, Obj_Entry *);
 static int open_binary_fd(const char *argv0, bool search_in_path);
 static int parse_args(char* argv[], int argc, bool *use_pathp, int *fdp);
 static int parse_integer(const char *);
 static void *path_enumerate(const char *, path_enum_proc, const char *, void *);
 static void print_usage(const char *argv0);
 static void release_object(Obj_Entry *);
 static int relocate_object_dag(Obj_Entry *root, bool bind_now,
     Obj_Entry *rtldobj, int flags, RtldLockState *lockstate);
 static int relocate_object(Obj_Entry *obj, bool bind_now, Obj_Entry *rtldobj,
     int flags, RtldLockState *lockstate);
 static int relocate_objects(Obj_Entry *, bool, Obj_Entry *, int,
     RtldLockState *);
 static int resolve_object_ifunc(Obj_Entry *, bool, int, RtldLockState *);
 static int rtld_dirname(const char *, char *);
 static int rtld_dirname_abs(const char *, char *);
 static void *rtld_dlopen(const char *name, int fd, int mode);
 static void rtld_exit(void);
 static char *search_library_path(const char *, const char *, const char *,
     int *);
 static char *search_library_pathfds(const char *, const char *, int *);
 static const void **get_program_var_addr(const char *, RtldLockState *);
 static void set_program_var(const char *, const void *);
 static int symlook_default(SymLook *, const Obj_Entry *refobj);
 static int symlook_global(SymLook *, DoneList *);
 static void symlook_init_from_req(SymLook *, const SymLook *);
 static int symlook_list(SymLook *, const Objlist *, DoneList *);
 static int symlook_needed(SymLook *, const Needed_Entry *, DoneList *);
 static int symlook_obj1_sysv(SymLook *, const Obj_Entry *);
 static int symlook_obj1_gnu(SymLook *, const Obj_Entry *);
 static void trace_loaded_objects(Obj_Entry *);
 static void unlink_object(Obj_Entry *);
 static void unload_object(Obj_Entry *, RtldLockState *lockstate);
 static void unref_dag(Obj_Entry *);
 static void ref_dag(Obj_Entry *);
 static char *origin_subst_one(Obj_Entry *, char *, const char *,
     const char *, bool);
 static char *origin_subst(Obj_Entry *, const char *);
 static bool obj_resolve_origin(Obj_Entry *obj);
 static void preinit_main(void);
 static int  rtld_verify_versions(const Objlist *);
 static int  rtld_verify_object_versions(Obj_Entry *);
 static void object_add_name(Obj_Entry *, const char *);
 static int  object_match_name(const Obj_Entry *, const char *);
 static void ld_utrace_log(int, void *, void *, size_t, int, const char *);
 static void rtld_fill_dl_phdr_info(const Obj_Entry *obj,
     struct dl_phdr_info *phdr_info);
 static uint32_t gnu_hash(const char *);
 static bool matched_symbol(SymLook *, const Obj_Entry *, Sym_Match_Result *,
     const unsigned long);
 
 void r_debug_state(struct r_debug *, struct link_map *) __noinline __exported;
 void _r_debug_postinit(struct link_map *) __noinline __exported;
 
 int __sys_openat(int, const char *, int, ...);
 
 /*
  * Data declarations.
  */
 static char *error_message;	/* Message for dlerror(), or NULL */
 struct r_debug r_debug __exported;	/* for GDB; */
 static bool libmap_disable;	/* Disable libmap */
 static bool ld_loadfltr;	/* Immediate filters processing */
 static char *libmap_override;	/* Maps to use in addition to libmap.conf */
 static bool trust;		/* False for setuid and setgid programs */
 static bool dangerous_ld_env;	/* True if environment variables have been
 				   used to affect the libraries loaded */
 bool ld_bind_not;		/* Disable PLT update */
 static char *ld_bind_now;	/* Environment variable for immediate binding */
 static char *ld_debug;		/* Environment variable for debugging */
 static char *ld_library_path;	/* Environment variable for search path */
 static char *ld_library_dirs;	/* Environment variable for library descriptors */
 static char *ld_preload;	/* Environment variable for libraries to
 				   load first */
 static const char *ld_elf_hints_path;	/* Environment variable for alternative hints path */
 static const char *ld_tracing;	/* Called from ldd to print libs */
 static char *ld_utrace;		/* Use utrace() to log events. */
 static struct obj_entry_q obj_list;	/* Queue of all loaded objects */
 static Obj_Entry *obj_main;	/* The main program shared object */
 static Obj_Entry obj_rtld;	/* The dynamic linker shared object */
 static unsigned int obj_count;	/* Number of objects in obj_list */
 static unsigned int obj_loads;	/* Number of loads of objects (gen count) */
 
 static Objlist list_global =	/* Objects dlopened with RTLD_GLOBAL */
   STAILQ_HEAD_INITIALIZER(list_global);
 static Objlist list_main =	/* Objects loaded at program startup */
   STAILQ_HEAD_INITIALIZER(list_main);
 static Objlist list_fini =	/* Objects needing fini() calls */
   STAILQ_HEAD_INITIALIZER(list_fini);
 
 Elf_Sym sym_zero;		/* For resolving undefined weak refs. */
 
 #define GDB_STATE(s,m)	r_debug.r_state = s; r_debug_state(&r_debug,m);
 
 extern Elf_Dyn _DYNAMIC;
 #pragma weak _DYNAMIC
 
 int dlclose(void *) __exported;
 char *dlerror(void) __exported;
 void *dlopen(const char *, int) __exported;
 void *fdlopen(int, int) __exported;
 void *dlsym(void *, const char *) __exported;
 dlfunc_t dlfunc(void *, const char *) __exported;
 void *dlvsym(void *, const char *, const char *) __exported;
 int dladdr(const void *, Dl_info *) __exported;
 void dllockinit(void *, void *(*)(void *), void (*)(void *), void (*)(void *),
     void (*)(void *), void (*)(void *), void (*)(void *)) __exported;
 int dlinfo(void *, int , void *) __exported;
 int dl_iterate_phdr(__dl_iterate_hdr_callback, void *) __exported;
 int _rtld_addr_phdr(const void *, struct dl_phdr_info *) __exported;
 int _rtld_get_stack_prot(void) __exported;
 int _rtld_is_dlopened(void *) __exported;
 void _rtld_error(const char *, ...) __exported;
 
 /* Only here to fix -Wmissing-prototypes warnings */
 int __getosreldate(void);
 void __pthread_cxa_finalize(struct dl_phdr_info *a);
 func_ptr_type _rtld(Elf_Addr *sp, func_ptr_type *exit_proc, Obj_Entry **objp);
 Elf_Addr _rtld_bind(Obj_Entry *obj, Elf_Size reloff);
 
 
 int npagesizes;
 static int osreldate;
 size_t *pagesizes;
 
 static int stack_prot = PROT_READ | PROT_WRITE | RTLD_DEFAULT_STACK_EXEC;
 static int max_stack_flags;
 
 /*
  * Global declarations normally provided by crt1.  The dynamic linker is
  * not built with crt1, so we have to provide them ourselves.
  */
 char *__progname;
 char **environ;
 
 /*
  * Used to pass argc, argv to init functions.
  */
 int main_argc;
 char **main_argv;
 
 /*
  * Globals to control TLS allocation.
  */
 size_t tls_last_offset;		/* Static TLS offset of last module */
 size_t tls_last_size;		/* Static TLS size of last module */
 size_t tls_static_space;	/* Static TLS space allocated */
 static size_t tls_static_max_align;
 Elf_Addr tls_dtv_generation = 1;	/* Used to detect when dtv size changes */
 int tls_max_index = 1;		/* Largest module index allocated */
 
 static bool ld_library_path_rpath = false;
 
 /*
  * Globals for path names, and such
  */
 const char *ld_elf_hints_default = _PATH_ELF_HINTS;
 const char *ld_path_libmap_conf = _PATH_LIBMAP_CONF;
 const char *ld_path_rtld = _PATH_RTLD;
 const char *ld_standard_library_path = STANDARD_LIBRARY_PATH;
 const char *ld_env_prefix = LD_;
 
 /*
  * Fill in a DoneList with an allocation large enough to hold all of
  * the currently-loaded objects.  Keep this as a macro since it calls
  * alloca and we want that to occur within the scope of the caller.
  */
 #define donelist_init(dlp)					\
     ((dlp)->objs = alloca(obj_count * sizeof (dlp)->objs[0]),	\
     assert((dlp)->objs != NULL),				\
     (dlp)->num_alloc = obj_count,				\
     (dlp)->num_used = 0)
 
 #define	LD_UTRACE(e, h, mb, ms, r, n) do {			\
 	if (ld_utrace != NULL)					\
 		ld_utrace_log(e, h, mb, ms, r, n);		\
 } while (0)
 
 static void
 ld_utrace_log(int event, void *handle, void *mapbase, size_t mapsize,
     int refcnt, const char *name)
 {
 	struct utrace_rtld ut;
 	static const char rtld_utrace_sig[RTLD_UTRACE_SIG_SZ] = RTLD_UTRACE_SIG;
 
 	memcpy(ut.sig, rtld_utrace_sig, sizeof(ut.sig));
 	ut.event = event;
 	ut.handle = handle;
 	ut.mapbase = mapbase;
 	ut.mapsize = mapsize;
 	ut.refcnt = refcnt;
 	bzero(ut.name, sizeof(ut.name));
 	if (name)
 		strlcpy(ut.name, name, sizeof(ut.name));
 	utrace(&ut, sizeof(ut));
 }
 
 #ifdef RTLD_VARIANT_ENV_NAMES
 /*
  * construct the env variable based on the type of binary that's
  * running.
  */
 static inline const char *
 _LD(const char *var)
 {
 	static char buffer[128];
 
 	strlcpy(buffer, ld_env_prefix, sizeof(buffer));
 	strlcat(buffer, var, sizeof(buffer));
 	return (buffer);
 }
 #else
 #define _LD(x)	LD_ x
 #endif
 
 /*
  * Main entry point for dynamic linking.  The first argument is the
  * stack pointer.  The stack is expected to be laid out as described
  * in the SVR4 ABI specification, Intel 386 Processor Supplement.
  * Specifically, the stack pointer points to a word containing
  * ARGC.  Following that in the stack is a null-terminated sequence
  * of pointers to argument strings.  Then comes a null-terminated
  * sequence of pointers to environment strings.  Finally, there is a
  * sequence of "auxiliary vector" entries.
  *
  * The second argument points to a place to store the dynamic linker's
  * exit procedure pointer and the third to a place to store the main
  * program's object.
  *
  * The return value is the main program's entry point.
  */
 func_ptr_type
 _rtld(Elf_Addr *sp, func_ptr_type *exit_proc, Obj_Entry **objp)
 {
     Elf_Auxinfo *aux, *auxp, *auxpf, *aux_info[AT_COUNT];
     Objlist_Entry *entry;
     Obj_Entry *last_interposer, *obj, *preload_tail;
     const Elf_Phdr *phdr;
     Objlist initlist;
     RtldLockState lockstate;
     struct stat st;
     Elf_Addr *argcp;
     char **argv, **env, **envp, *kexecpath, *library_path_rpath;
     const char *argv0;
     caddr_t imgentry;
     char buf[MAXPATHLEN];
     int argc, fd, i, phnum, rtld_argc;
     bool dir_enable, explicit_fd, search_in_path;
 
     /*
      * On entry, the dynamic linker itself has not been relocated yet.
      * Be very careful not to reference any global data until after
      * init_rtld has returned.  It is OK to reference file-scope statics
      * and string constants, and to call static and global functions.
      */
 
     /* Find the auxiliary vector on the stack. */
     argcp = sp;
     argc = *sp++;
     argv = (char **) sp;
     sp += argc + 1;	/* Skip over arguments and NULL terminator */
     env = (char **) sp;
     while (*sp++ != 0)	/* Skip over environment, and NULL terminator */
 	;
     aux = (Elf_Auxinfo *) sp;
 
     /* Digest the auxiliary vector. */
     for (i = 0;  i < AT_COUNT;  i++)
 	aux_info[i] = NULL;
     for (auxp = aux;  auxp->a_type != AT_NULL;  auxp++) {
 	if (auxp->a_type < AT_COUNT)
 	    aux_info[auxp->a_type] = auxp;
     }
 
     /* Initialize and relocate ourselves. */
     assert(aux_info[AT_BASE] != NULL);
     init_rtld((caddr_t) aux_info[AT_BASE]->a_un.a_ptr, aux_info);
 
     __progname = obj_rtld.path;
     argv0 = argv[0] != NULL ? argv[0] : "(null)";
     environ = env;
     main_argc = argc;
     main_argv = argv;
 
     trust = !issetugid();
 
     md_abi_variant_hook(aux_info);
 
     fd = -1;
     if (aux_info[AT_EXECFD] != NULL) {
 	fd = aux_info[AT_EXECFD]->a_un.a_val;
     } else {
 	assert(aux_info[AT_PHDR] != NULL);
 	phdr = (const Elf_Phdr *)aux_info[AT_PHDR]->a_un.a_ptr;
 	if (phdr == obj_rtld.phdr) {
 	    if (!trust) {
 		_rtld_error("Tainted process refusing to run binary %s",
 		    argv0);
 		rtld_die();
 	    }
 	    dbg("opening main program in direct exec mode");
 	    if (argc >= 2) {
 		rtld_argc = parse_args(argv, argc, &search_in_path, &fd);
 		argv0 = argv[rtld_argc];
 		explicit_fd = (fd != -1);
 		if (!explicit_fd)
 		    fd = open_binary_fd(argv0, search_in_path);
 		if (fstat(fd, &st) == -1) {
 		    _rtld_error("Failed to fstat FD %d (%s): %s", fd,
 		      explicit_fd ? "user-provided descriptor" : argv0,
 		      rtld_strerror(errno));
 		    rtld_die();
 		}
 
 		/*
 		 * Rough emulation of the permission checks done by
 		 * execve(2), only Unix DACs are checked, ACLs are
 		 * ignored.  Preserve the semantic of disabling owner
 		 * to execute if owner x bit is cleared, even if
 		 * others x bit is enabled.
 		 * mmap(2) does not allow to mmap with PROT_EXEC if
 		 * binary' file comes from noexec mount.  We cannot
 		 * set VV_TEXT on the binary.
 		 */
 		dir_enable = false;
 		if (st.st_uid == geteuid()) {
 		    if ((st.st_mode & S_IXUSR) != 0)
 			dir_enable = true;
 		} else if (st.st_gid == getegid()) {
 		    if ((st.st_mode & S_IXGRP) != 0)
 			dir_enable = true;
 		} else if ((st.st_mode & S_IXOTH) != 0) {
 		    dir_enable = true;
 		}
 		if (!dir_enable) {
 		    _rtld_error("No execute permission for binary %s",
 		        argv0);
 		    rtld_die();
 		}
 
 		/*
 		 * For direct exec mode, argv[0] is the interpreter
 		 * name, we must remove it and shift arguments left
 		 * before invoking binary main.  Since stack layout
 		 * places environment pointers and aux vectors right
 		 * after the terminating NULL, we must shift
 		 * environment and aux as well.
 		 */
 		main_argc = argc - rtld_argc;
 		for (i = 0; i <= main_argc; i++)
 		    argv[i] = argv[i + rtld_argc];
 		*argcp -= rtld_argc;
 		environ = env = envp = argv + main_argc + 1;
 		do {
 		    *envp = *(envp + rtld_argc);
 		    envp++;
 		} while (*envp != NULL);
 		aux = auxp = (Elf_Auxinfo *)envp;
 		auxpf = (Elf_Auxinfo *)(envp + rtld_argc);
 		for (;; auxp++, auxpf++) {
 		    *auxp = *auxpf;
 		    if (auxp->a_type == AT_NULL)
 			    break;
 		}
 	    } else {
 		_rtld_error("No binary");
 		rtld_die();
 	    }
 	}
     }
 
     ld_bind_now = getenv(_LD("BIND_NOW"));
 
     /* 
      * If the process is tainted, then we un-set the dangerous environment
      * variables.  The process will be marked as tainted until setuid(2)
      * is called.  If any child process calls setuid(2) we do not want any
      * future processes to honor the potentially un-safe variables.
      */
     if (!trust) {
 	if (unsetenv(_LD("PRELOAD")) || unsetenv(_LD("LIBMAP")) ||
 	    unsetenv(_LD("LIBRARY_PATH")) || unsetenv(_LD("LIBRARY_PATH_FDS")) ||
 	    unsetenv(_LD("LIBMAP_DISABLE")) || unsetenv(_LD("BIND_NOT")) ||
 	    unsetenv(_LD("DEBUG")) || unsetenv(_LD("ELF_HINTS_PATH")) ||
 	    unsetenv(_LD("LOADFLTR")) || unsetenv(_LD("LIBRARY_PATH_RPATH"))) {
 		_rtld_error("environment corrupt; aborting");
 		rtld_die();
 	}
     }
     ld_debug = getenv(_LD("DEBUG"));
     if (ld_bind_now == NULL)
 	    ld_bind_not = getenv(_LD("BIND_NOT")) != NULL;
     libmap_disable = getenv(_LD("LIBMAP_DISABLE")) != NULL;
     libmap_override = getenv(_LD("LIBMAP"));
     ld_library_path = getenv(_LD("LIBRARY_PATH"));
     ld_library_dirs = getenv(_LD("LIBRARY_PATH_FDS"));
     ld_preload = getenv(_LD("PRELOAD"));
     ld_elf_hints_path = getenv(_LD("ELF_HINTS_PATH"));
     ld_loadfltr = getenv(_LD("LOADFLTR")) != NULL;
     library_path_rpath = getenv(_LD("LIBRARY_PATH_RPATH"));
     if (library_path_rpath != NULL) {
 	    if (library_path_rpath[0] == 'y' ||
 		library_path_rpath[0] == 'Y' ||
 		library_path_rpath[0] == '1')
 		    ld_library_path_rpath = true;
 	    else
 		    ld_library_path_rpath = false;
     }
     dangerous_ld_env = libmap_disable || (libmap_override != NULL) ||
 	(ld_library_path != NULL) || (ld_preload != NULL) ||
 	(ld_elf_hints_path != NULL) || ld_loadfltr;
     ld_tracing = getenv(_LD("TRACE_LOADED_OBJECTS"));
     ld_utrace = getenv(_LD("UTRACE"));
 
     if ((ld_elf_hints_path == NULL) || strlen(ld_elf_hints_path) == 0)
 	ld_elf_hints_path = ld_elf_hints_default;
 
     if (ld_debug != NULL && *ld_debug != '\0')
 	debug = 1;
     dbg("%s is initialized, base address = %p", __progname,
 	(caddr_t) aux_info[AT_BASE]->a_un.a_ptr);
     dbg("RTLD dynamic = %p", obj_rtld.dynamic);
     dbg("RTLD pltgot  = %p", obj_rtld.pltgot);
 
     dbg("initializing thread locks");
     lockdflt_init();
 
     /*
      * Load the main program, or process its program header if it is
      * already loaded.
      */
     if (fd != -1) {	/* Load the main program. */
 	dbg("loading main program");
 	obj_main = map_object(fd, argv0, NULL);
 	close(fd);
 	if (obj_main == NULL)
 	    rtld_die();
 	max_stack_flags = obj_main->stack_flags;
     } else {				/* Main program already loaded. */
 	dbg("processing main program's program header");
 	assert(aux_info[AT_PHDR] != NULL);
 	phdr = (const Elf_Phdr *) aux_info[AT_PHDR]->a_un.a_ptr;
 	assert(aux_info[AT_PHNUM] != NULL);
 	phnum = aux_info[AT_PHNUM]->a_un.a_val;
 	assert(aux_info[AT_PHENT] != NULL);
 	assert(aux_info[AT_PHENT]->a_un.a_val == sizeof(Elf_Phdr));
 	assert(aux_info[AT_ENTRY] != NULL);
 	imgentry = (caddr_t) aux_info[AT_ENTRY]->a_un.a_ptr;
 	if ((obj_main = digest_phdr(phdr, phnum, imgentry, argv0)) == NULL)
 	    rtld_die();
     }
 
     if (aux_info[AT_EXECPATH] != NULL && fd == -1) {
 	    kexecpath = aux_info[AT_EXECPATH]->a_un.a_ptr;
 	    dbg("AT_EXECPATH %p %s", kexecpath, kexecpath);
 	    if (kexecpath[0] == '/')
 		    obj_main->path = kexecpath;
 	    else if (getcwd(buf, sizeof(buf)) == NULL ||
 		     strlcat(buf, "/", sizeof(buf)) >= sizeof(buf) ||
 		     strlcat(buf, kexecpath, sizeof(buf)) >= sizeof(buf))
 		    obj_main->path = xstrdup(argv0);
 	    else
 		    obj_main->path = xstrdup(buf);
     } else {
 	    dbg("No AT_EXECPATH or direct exec");
 	    obj_main->path = xstrdup(argv0);
     }
     dbg("obj_main path %s", obj_main->path);
     obj_main->mainprog = true;
 
     if (aux_info[AT_STACKPROT] != NULL &&
       aux_info[AT_STACKPROT]->a_un.a_val != 0)
 	    stack_prot = aux_info[AT_STACKPROT]->a_un.a_val;
 
 #ifndef COMPAT_32BIT
     /*
      * Get the actual dynamic linker pathname from the executable if
      * possible.  (It should always be possible.)  That ensures that
      * gdb will find the right dynamic linker even if a non-standard
      * one is being used.
      */
     if (obj_main->interp != NULL &&
       strcmp(obj_main->interp, obj_rtld.path) != 0) {
 	free(obj_rtld.path);
 	obj_rtld.path = xstrdup(obj_main->interp);
         __progname = obj_rtld.path;
     }
 #endif
 
     digest_dynamic(obj_main, 0);
     dbg("%s valid_hash_sysv %d valid_hash_gnu %d dynsymcount %d",
 	obj_main->path, obj_main->valid_hash_sysv, obj_main->valid_hash_gnu,
 	obj_main->dynsymcount);
 
     linkmap_add(obj_main);
     linkmap_add(&obj_rtld);
 
     /* Link the main program into the list of objects. */
     TAILQ_INSERT_HEAD(&obj_list, obj_main, next);
     obj_count++;
     obj_loads++;
 
     /* Initialize a fake symbol for resolving undefined weak references. */
     sym_zero.st_info = ELF_ST_INFO(STB_GLOBAL, STT_NOTYPE);
     sym_zero.st_shndx = SHN_UNDEF;
     sym_zero.st_value = -(uintptr_t)obj_main->relocbase;
 
     if (!libmap_disable)
         libmap_disable = (bool)lm_init(libmap_override);
 
     dbg("loading LD_PRELOAD libraries");
     if (load_preload_objects() == -1)
 	rtld_die();
     preload_tail = globallist_curr(TAILQ_LAST(&obj_list, obj_entry_q));
 
     dbg("loading needed objects");
     if (load_needed_objects(obj_main, 0) == -1)
 	rtld_die();
 
     /* Make a list of all objects loaded at startup. */
     last_interposer = obj_main;
     TAILQ_FOREACH(obj, &obj_list, next) {
 	if (obj->marker)
 	    continue;
 	if (obj->z_interpose && obj != obj_main) {
 	    objlist_put_after(&list_main, last_interposer, obj);
 	    last_interposer = obj;
 	} else {
 	    objlist_push_tail(&list_main, obj);
 	}
     	obj->refcount++;
     }
 
     dbg("checking for required versions");
     if (rtld_verify_versions(&list_main) == -1 && !ld_tracing)
 	rtld_die();
 
     if (ld_tracing) {		/* We're done */
 	trace_loaded_objects(obj_main);
 	exit(0);
     }
 
     if (getenv(_LD("DUMP_REL_PRE")) != NULL) {
        dump_relocations(obj_main);
        exit (0);
     }
 
     /*
      * Processing tls relocations requires having the tls offsets
      * initialized.  Prepare offsets before starting initial
      * relocation processing.
      */
     dbg("initializing initial thread local storage offsets");
     STAILQ_FOREACH(entry, &list_main, link) {
 	/*
 	 * Allocate all the initial objects out of the static TLS
 	 * block even if they didn't ask for it.
 	 */
 	allocate_tls_offset(entry->obj);
     }
 
     if (relocate_objects(obj_main,
       ld_bind_now != NULL && *ld_bind_now != '\0',
       &obj_rtld, SYMLOOK_EARLY, NULL) == -1)
 	rtld_die();
 
     dbg("doing copy relocations");
     if (do_copy_relocations(obj_main) == -1)
 	rtld_die();
 
     if (getenv(_LD("DUMP_REL_POST")) != NULL) {
        dump_relocations(obj_main);
        exit (0);
     }
 
     ifunc_init(aux);
 
     /*
      * Setup TLS for main thread.  This must be done after the
      * relocations are processed, since tls initialization section
      * might be the subject for relocations.
      */
     dbg("initializing initial thread local storage");
     allocate_initial_tls(globallist_curr(TAILQ_FIRST(&obj_list)));
 
     dbg("initializing key program variables");
     set_program_var("__progname", argv[0] != NULL ? basename(argv[0]) : "");
     set_program_var("environ", env);
     set_program_var("__elf_aux_vector", aux);
 
     /* Make a list of init functions to call. */
     objlist_init(&initlist);
     initlist_add_objects(globallist_curr(TAILQ_FIRST(&obj_list)),
       preload_tail, &initlist);
 
     r_debug_state(NULL, &obj_main->linkmap); /* say hello to gdb! */
 
     map_stacks_exec(NULL);
 
     if (!obj_main->crt_no_init) {
 	/*
 	 * Make sure we don't call the main program's init and fini
 	 * functions for binaries linked with old crt1 which calls
 	 * _init itself.
 	 */
 	obj_main->init = obj_main->fini = (Elf_Addr)NULL;
 	obj_main->preinit_array = obj_main->init_array =
 	    obj_main->fini_array = (Elf_Addr)NULL;
     }
 
     /*
      * Execute MD initializers required before we call the objects'
      * init functions.
      */
     pre_init();
 
     wlock_acquire(rtld_bind_lock, &lockstate);
 
     dbg("resolving ifuncs");
     if (initlist_objects_ifunc(&initlist, ld_bind_now != NULL &&
       *ld_bind_now != '\0', SYMLOOK_EARLY, &lockstate) == -1)
 	rtld_die();
 
     if (obj_main->crt_no_init)
 	preinit_main();
     objlist_call_init(&initlist, &lockstate);
     _r_debug_postinit(&obj_main->linkmap);
     objlist_clear(&initlist);
     dbg("loading filtees");
     TAILQ_FOREACH(obj, &obj_list, next) {
 	if (obj->marker)
 	    continue;
 	if (ld_loadfltr || obj->z_loadfltr)
 	    load_filtees(obj, 0, &lockstate);
     }
 
     dbg("enforcing main obj relro");
     if (obj_enforce_relro(obj_main) == -1)
 	rtld_die();
 
     lock_release(rtld_bind_lock, &lockstate);
 
     dbg("transferring control to program entry point = %p", obj_main->entry);
 
     /* Return the exit procedure and the program entry point. */
     *exit_proc = rtld_exit;
     *objp = obj_main;
     return (func_ptr_type) obj_main->entry;
 }
 
 void *
 rtld_resolve_ifunc(const Obj_Entry *obj, const Elf_Sym *def)
 {
 	void *ptr;
 	Elf_Addr target;
 
 	ptr = (void *)make_function_pointer(def, obj);
 	target = call_ifunc_resolver(ptr);
 	return ((void *)target);
 }
 
 /*
  * NB: MIPS uses a private version of this function (_mips_rtld_bind).
  * Changes to this function should be applied there as well.
  */
 Elf_Addr
 _rtld_bind(Obj_Entry *obj, Elf_Size reloff)
 {
     const Elf_Rel *rel;
     const Elf_Sym *def;
     const Obj_Entry *defobj;
     Elf_Addr *where;
     Elf_Addr target;
     RtldLockState lockstate;
 
     rlock_acquire(rtld_bind_lock, &lockstate);
     if (sigsetjmp(lockstate.env, 0) != 0)
 	    lock_upgrade(rtld_bind_lock, &lockstate);
     if (obj->pltrel)
 	rel = (const Elf_Rel *)((const char *)obj->pltrel + reloff);
     else
 	rel = (const Elf_Rel *)((const char *)obj->pltrela + reloff);
 
     where = (Elf_Addr *)(obj->relocbase + rel->r_offset);
     def = find_symdef(ELF_R_SYM(rel->r_info), obj, &defobj, SYMLOOK_IN_PLT,
 	NULL, &lockstate);
     if (def == NULL)
 	rtld_die();
     if (ELF_ST_TYPE(def->st_info) == STT_GNU_IFUNC)
 	target = (Elf_Addr)rtld_resolve_ifunc(defobj, def);
     else
 	target = (Elf_Addr)(defobj->relocbase + def->st_value);
 
     dbg("\"%s\" in \"%s\" ==> %p in \"%s\"",
       defobj->strtab + def->st_name, basename(obj->path),
       (void *)target, basename(defobj->path));
 
     /*
      * Write the new contents for the jmpslot. Note that depending on
      * architecture, the value which we need to return back to the
      * lazy binding trampoline may or may not be the target
      * address. The value returned from reloc_jmpslot() is the value
      * that the trampoline needs.
      */
     target = reloc_jmpslot(where, target, defobj, obj, rel);
     lock_release(rtld_bind_lock, &lockstate);
     return target;
 }
 
 /*
  * Error reporting function.  Use it like printf.  If formats the message
  * into a buffer, and sets things up so that the next call to dlerror()
  * will return the message.
  */
 void
 _rtld_error(const char *fmt, ...)
 {
     static char buf[512];
     va_list ap;
 
     va_start(ap, fmt);
     rtld_vsnprintf(buf, sizeof buf, fmt, ap);
     error_message = buf;
     va_end(ap);
     LD_UTRACE(UTRACE_RTLD_ERROR, NULL, NULL, 0, 0, error_message);
 }
 
 /*
  * Return a dynamically-allocated copy of the current error message, if any.
  */
 static char *
 errmsg_save(void)
 {
     return error_message == NULL ? NULL : xstrdup(error_message);
 }
 
 /*
  * Restore the current error message from a copy which was previously saved
  * by errmsg_save().  The copy is freed.
  */
 static void
 errmsg_restore(char *saved_msg)
 {
     if (saved_msg == NULL)
 	error_message = NULL;
     else {
 	_rtld_error("%s", saved_msg);
 	free(saved_msg);
     }
 }
 
 static const char *
 basename(const char *name)
 {
     const char *p = strrchr(name, '/');
     return p != NULL ? p + 1 : name;
 }
 
 static struct utsname uts;
 
 static char *
 origin_subst_one(Obj_Entry *obj, char *real, const char *kw,
     const char *subst, bool may_free)
 {
 	char *p, *p1, *res, *resp;
 	int subst_len, kw_len, subst_count, old_len, new_len;
 
 	kw_len = strlen(kw);
 
 	/*
 	 * First, count the number of the keyword occurrences, to
 	 * preallocate the final string.
 	 */
 	for (p = real, subst_count = 0;; p = p1 + kw_len, subst_count++) {
 		p1 = strstr(p, kw);
 		if (p1 == NULL)
 			break;
 	}
 
 	/*
 	 * If the keyword is not found, just return.
 	 *
 	 * Return non-substituted string if resolution failed.  We
 	 * cannot do anything more reasonable, the failure mode of the
 	 * caller is unresolved library anyway.
 	 */
 	if (subst_count == 0 || (obj != NULL && !obj_resolve_origin(obj)))
 		return (may_free ? real : xstrdup(real));
 	if (obj != NULL)
 		subst = obj->origin_path;
 
 	/*
 	 * There is indeed something to substitute.  Calculate the
 	 * length of the resulting string, and allocate it.
 	 */
 	subst_len = strlen(subst);
 	old_len = strlen(real);
 	new_len = old_len + (subst_len - kw_len) * subst_count;
 	res = xmalloc(new_len + 1);
 
 	/*
 	 * Now, execute the substitution loop.
 	 */
 	for (p = real, resp = res, *resp = '\0';;) {
 		p1 = strstr(p, kw);
 		if (p1 != NULL) {
 			/* Copy the prefix before keyword. */
 			memcpy(resp, p, p1 - p);
 			resp += p1 - p;
 			/* Keyword replacement. */
 			memcpy(resp, subst, subst_len);
 			resp += subst_len;
 			*resp = '\0';
 			p = p1 + kw_len;
 		} else
 			break;
 	}
 
 	/* Copy to the end of string and finish. */
 	strcat(resp, p);
 	if (may_free)
 		free(real);
 	return (res);
 }
 
 static char *
 origin_subst(Obj_Entry *obj, const char *real)
 {
 	char *res1, *res2, *res3, *res4;
 
 	if (obj == NULL || !trust)
 		return (xstrdup(real));
 	if (uts.sysname[0] == '\0') {
 		if (uname(&uts) != 0) {
 			_rtld_error("utsname failed: %d", errno);
 			return (NULL);
 		}
 	}
 	/* __DECONST is safe here since without may_free real is unchanged */
 	res1 = origin_subst_one(obj, __DECONST(char *, real), "$ORIGIN", NULL,
 	    false);
 	res2 = origin_subst_one(NULL, res1, "$OSNAME", uts.sysname, true);
 	res3 = origin_subst_one(NULL, res2, "$OSREL", uts.release, true);
 	res4 = origin_subst_one(NULL, res3, "$PLATFORM", uts.machine, true);
 	return (res4);
 }
 
 void
 rtld_die(void)
 {
     const char *msg = dlerror();
 
     if (msg == NULL)
 	msg = "Fatal error";
     rtld_fdputstr(STDERR_FILENO, _BASENAME_RTLD ": ");
     rtld_fdputstr(STDERR_FILENO, msg);
     rtld_fdputchar(STDERR_FILENO, '\n');
     _exit(1);
 }
 
 /*
  * Process a shared object's DYNAMIC section, and save the important
  * information in its Obj_Entry structure.
  */
 static void
 digest_dynamic1(Obj_Entry *obj, int early, const Elf_Dyn **dyn_rpath,
     const Elf_Dyn **dyn_soname, const Elf_Dyn **dyn_runpath)
 {
     const Elf_Dyn *dynp;
     Needed_Entry **needed_tail = &obj->needed;
     Needed_Entry **needed_filtees_tail = &obj->needed_filtees;
     Needed_Entry **needed_aux_filtees_tail = &obj->needed_aux_filtees;
     const Elf_Hashelt *hashtab;
     const Elf32_Word *hashval;
     Elf32_Word bkt, nmaskwords;
     int bloom_size32;
     int plttype = DT_REL;
 
     *dyn_rpath = NULL;
     *dyn_soname = NULL;
     *dyn_runpath = NULL;
 
     obj->bind_now = false;
     for (dynp = obj->dynamic;  dynp->d_tag != DT_NULL;  dynp++) {
 	switch (dynp->d_tag) {
 
 	case DT_REL:
 	    obj->rel = (const Elf_Rel *)(obj->relocbase + dynp->d_un.d_ptr);
 	    break;
 
 	case DT_RELSZ:
 	    obj->relsize = dynp->d_un.d_val;
 	    break;
 
 	case DT_RELENT:
 	    assert(dynp->d_un.d_val == sizeof(Elf_Rel));
 	    break;
 
 	case DT_JMPREL:
 	    obj->pltrel = (const Elf_Rel *)
 	      (obj->relocbase + dynp->d_un.d_ptr);
 	    break;
 
 	case DT_PLTRELSZ:
 	    obj->pltrelsize = dynp->d_un.d_val;
 	    break;
 
 	case DT_RELA:
 	    obj->rela = (const Elf_Rela *)(obj->relocbase + dynp->d_un.d_ptr);
 	    break;
 
 	case DT_RELASZ:
 	    obj->relasize = dynp->d_un.d_val;
 	    break;
 
 	case DT_RELAENT:
 	    assert(dynp->d_un.d_val == sizeof(Elf_Rela));
 	    break;
 
 	case DT_PLTREL:
 	    plttype = dynp->d_un.d_val;
 	    assert(dynp->d_un.d_val == DT_REL || plttype == DT_RELA);
 	    break;
 
 	case DT_SYMTAB:
 	    obj->symtab = (const Elf_Sym *)
 	      (obj->relocbase + dynp->d_un.d_ptr);
 	    break;
 
 	case DT_SYMENT:
 	    assert(dynp->d_un.d_val == sizeof(Elf_Sym));
 	    break;
 
 	case DT_STRTAB:
 	    obj->strtab = (const char *)(obj->relocbase + dynp->d_un.d_ptr);
 	    break;
 
 	case DT_STRSZ:
 	    obj->strsize = dynp->d_un.d_val;
 	    break;
 
 	case DT_VERNEED:
 	    obj->verneed = (const Elf_Verneed *)(obj->relocbase +
 		dynp->d_un.d_val);
 	    break;
 
 	case DT_VERNEEDNUM:
 	    obj->verneednum = dynp->d_un.d_val;
 	    break;
 
 	case DT_VERDEF:
 	    obj->verdef = (const Elf_Verdef *)(obj->relocbase +
 		dynp->d_un.d_val);
 	    break;
 
 	case DT_VERDEFNUM:
 	    obj->verdefnum = dynp->d_un.d_val;
 	    break;
 
 	case DT_VERSYM:
 	    obj->versyms = (const Elf_Versym *)(obj->relocbase +
 		dynp->d_un.d_val);
 	    break;
 
 	case DT_HASH:
 	    {
 		hashtab = (const Elf_Hashelt *)(obj->relocbase +
 		    dynp->d_un.d_ptr);
 		obj->nbuckets = hashtab[0];
 		obj->nchains = hashtab[1];
 		obj->buckets = hashtab + 2;
 		obj->chains = obj->buckets + obj->nbuckets;
 		obj->valid_hash_sysv = obj->nbuckets > 0 && obj->nchains > 0 &&
 		  obj->buckets != NULL;
 	    }
 	    break;
 
 	case DT_GNU_HASH:
 	    {
 		hashtab = (const Elf_Hashelt *)(obj->relocbase +
 		    dynp->d_un.d_ptr);
 		obj->nbuckets_gnu = hashtab[0];
 		obj->symndx_gnu = hashtab[1];
 		nmaskwords = hashtab[2];
 		bloom_size32 = (__ELF_WORD_SIZE / 32) * nmaskwords;
 		obj->maskwords_bm_gnu = nmaskwords - 1;
 		obj->shift2_gnu = hashtab[3];
 		obj->bloom_gnu = (const Elf_Addr *)(hashtab + 4);
 		obj->buckets_gnu = hashtab + 4 + bloom_size32;
 		obj->chain_zero_gnu = obj->buckets_gnu + obj->nbuckets_gnu -
 		  obj->symndx_gnu;
 		/* Number of bitmask words is required to be power of 2 */
 		obj->valid_hash_gnu = powerof2(nmaskwords) &&
 		    obj->nbuckets_gnu > 0 && obj->buckets_gnu != NULL;
 	    }
 	    break;
 
 	case DT_NEEDED:
 	    if (!obj->rtld) {
 		Needed_Entry *nep = NEW(Needed_Entry);
 		nep->name = dynp->d_un.d_val;
 		nep->obj = NULL;
 		nep->next = NULL;
 
 		*needed_tail = nep;
 		needed_tail = &nep->next;
 	    }
 	    break;
 
 	case DT_FILTER:
 	    if (!obj->rtld) {
 		Needed_Entry *nep = NEW(Needed_Entry);
 		nep->name = dynp->d_un.d_val;
 		nep->obj = NULL;
 		nep->next = NULL;
 
 		*needed_filtees_tail = nep;
 		needed_filtees_tail = &nep->next;
 	    }
 	    break;
 
 	case DT_AUXILIARY:
 	    if (!obj->rtld) {
 		Needed_Entry *nep = NEW(Needed_Entry);
 		nep->name = dynp->d_un.d_val;
 		nep->obj = NULL;
 		nep->next = NULL;
 
 		*needed_aux_filtees_tail = nep;
 		needed_aux_filtees_tail = &nep->next;
 	    }
 	    break;
 
 	case DT_PLTGOT:
 	    obj->pltgot = (Elf_Addr *)(obj->relocbase + dynp->d_un.d_ptr);
 	    break;
 
 	case DT_TEXTREL:
 	    obj->textrel = true;
 	    break;
 
 	case DT_SYMBOLIC:
 	    obj->symbolic = true;
 	    break;
 
 	case DT_RPATH:
 	    /*
 	     * We have to wait until later to process this, because we
 	     * might not have gotten the address of the string table yet.
 	     */
 	    *dyn_rpath = dynp;
 	    break;
 
 	case DT_SONAME:
 	    *dyn_soname = dynp;
 	    break;
 
 	case DT_RUNPATH:
 	    *dyn_runpath = dynp;
 	    break;
 
 	case DT_INIT:
 	    obj->init = (Elf_Addr)(obj->relocbase + dynp->d_un.d_ptr);
 	    break;
 
 	case DT_PREINIT_ARRAY:
 	    obj->preinit_array = (Elf_Addr)(obj->relocbase + dynp->d_un.d_ptr);
 	    break;
 
 	case DT_PREINIT_ARRAYSZ:
 	    obj->preinit_array_num = dynp->d_un.d_val / sizeof(Elf_Addr);
 	    break;
 
 	case DT_INIT_ARRAY:
 	    obj->init_array = (Elf_Addr)(obj->relocbase + dynp->d_un.d_ptr);
 	    break;
 
 	case DT_INIT_ARRAYSZ:
 	    obj->init_array_num = dynp->d_un.d_val / sizeof(Elf_Addr);
 	    break;
 
 	case DT_FINI:
 	    obj->fini = (Elf_Addr)(obj->relocbase + dynp->d_un.d_ptr);
 	    break;
 
 	case DT_FINI_ARRAY:
 	    obj->fini_array = (Elf_Addr)(obj->relocbase + dynp->d_un.d_ptr);
 	    break;
 
 	case DT_FINI_ARRAYSZ:
 	    obj->fini_array_num = dynp->d_un.d_val / sizeof(Elf_Addr);
 	    break;
 
 	/*
 	 * Don't process DT_DEBUG on MIPS as the dynamic section
 	 * is mapped read-only. DT_MIPS_RLD_MAP is used instead.
 	 */
 
 #ifndef __mips__
 	case DT_DEBUG:
 	    if (!early)
 		dbg("Filling in DT_DEBUG entry");
 	    (__DECONST(Elf_Dyn *, dynp))->d_un.d_ptr = (Elf_Addr)&r_debug;
 	    break;
 #endif
 
 	case DT_FLAGS:
 		if (dynp->d_un.d_val & DF_ORIGIN)
 		    obj->z_origin = true;
 		if (dynp->d_un.d_val & DF_SYMBOLIC)
 		    obj->symbolic = true;
 		if (dynp->d_un.d_val & DF_TEXTREL)
 		    obj->textrel = true;
 		if (dynp->d_un.d_val & DF_BIND_NOW)
 		    obj->bind_now = true;
-		/*if (dynp->d_un.d_val & DF_STATIC_TLS)
-		    ;*/
+		if (dynp->d_un.d_val & DF_STATIC_TLS)
+		    obj->static_tls = true;
 	    break;
 #ifdef __mips__
 	case DT_MIPS_LOCAL_GOTNO:
 		obj->local_gotno = dynp->d_un.d_val;
 		break;
 
 	case DT_MIPS_SYMTABNO:
 		obj->symtabno = dynp->d_un.d_val;
 		break;
 
 	case DT_MIPS_GOTSYM:
 		obj->gotsym = dynp->d_un.d_val;
 		break;
 
 	case DT_MIPS_RLD_MAP:
 		*((Elf_Addr *)(dynp->d_un.d_ptr)) = (Elf_Addr) &r_debug;
 		break;
 
 	case DT_MIPS_RLD_MAP_REL:
 		// The MIPS_RLD_MAP_REL tag stores the offset to the .rld_map
 		// section relative to the address of the tag itself.
 		*((Elf_Addr *)(__DECONST(char*, dynp) + dynp->d_un.d_val)) =
 		    (Elf_Addr) &r_debug;
 		break;
 
 	case DT_MIPS_PLTGOT:
 		obj->mips_pltgot = (Elf_Addr *)(obj->relocbase +
 		    dynp->d_un.d_ptr);
 		break;
 		
 #endif
 
 #ifdef __powerpc64__
 	case DT_PPC64_GLINK:
 		obj->glink = (Elf_Addr)(obj->relocbase + dynp->d_un.d_ptr);
 		break;
 #endif
 
 	case DT_FLAGS_1:
 		if (dynp->d_un.d_val & DF_1_NOOPEN)
 		    obj->z_noopen = true;
 		if (dynp->d_un.d_val & DF_1_ORIGIN)
 		    obj->z_origin = true;
 		if (dynp->d_un.d_val & DF_1_GLOBAL)
 		    obj->z_global = true;
 		if (dynp->d_un.d_val & DF_1_BIND_NOW)
 		    obj->bind_now = true;
 		if (dynp->d_un.d_val & DF_1_NODELETE)
 		    obj->z_nodelete = true;
 		if (dynp->d_un.d_val & DF_1_LOADFLTR)
 		    obj->z_loadfltr = true;
 		if (dynp->d_un.d_val & DF_1_INTERPOSE)
 		    obj->z_interpose = true;
 		if (dynp->d_un.d_val & DF_1_NODEFLIB)
 		    obj->z_nodeflib = true;
 	    break;
 
 	default:
 	    if (!early) {
 		dbg("Ignoring d_tag %ld = %#lx", (long)dynp->d_tag,
 		    (long)dynp->d_tag);
 	    }
 	    break;
 	}
     }
 
     obj->traced = false;
 
     if (plttype == DT_RELA) {
 	obj->pltrela = (const Elf_Rela *) obj->pltrel;
 	obj->pltrel = NULL;
 	obj->pltrelasize = obj->pltrelsize;
 	obj->pltrelsize = 0;
     }
 
     /* Determine size of dynsym table (equal to nchains of sysv hash) */
     if (obj->valid_hash_sysv)
 	obj->dynsymcount = obj->nchains;
     else if (obj->valid_hash_gnu) {
 	obj->dynsymcount = 0;
 	for (bkt = 0; bkt < obj->nbuckets_gnu; bkt++) {
 	    if (obj->buckets_gnu[bkt] == 0)
 		continue;
 	    hashval = &obj->chain_zero_gnu[obj->buckets_gnu[bkt]];
 	    do
 		obj->dynsymcount++;
 	    while ((*hashval++ & 1u) == 0);
 	}
 	obj->dynsymcount += obj->symndx_gnu;
     }
 }
 
 static bool
 obj_resolve_origin(Obj_Entry *obj)
 {
 
 	if (obj->origin_path != NULL)
 		return (true);
 	obj->origin_path = xmalloc(PATH_MAX);
 	return (rtld_dirname_abs(obj->path, obj->origin_path) != -1);
 }
 
 static void
 digest_dynamic2(Obj_Entry *obj, const Elf_Dyn *dyn_rpath,
     const Elf_Dyn *dyn_soname, const Elf_Dyn *dyn_runpath)
 {
 
 	if (obj->z_origin && !obj_resolve_origin(obj))
 		rtld_die();
 
 	if (dyn_runpath != NULL) {
 		obj->runpath = (const char *)obj->strtab + dyn_runpath->d_un.d_val;
 		obj->runpath = origin_subst(obj, obj->runpath);
 	} else if (dyn_rpath != NULL) {
 		obj->rpath = (const char *)obj->strtab + dyn_rpath->d_un.d_val;
 		obj->rpath = origin_subst(obj, obj->rpath);
 	}
 	if (dyn_soname != NULL)
 		object_add_name(obj, obj->strtab + dyn_soname->d_un.d_val);
 }
 
 static void
 digest_dynamic(Obj_Entry *obj, int early)
 {
 	const Elf_Dyn *dyn_rpath;
 	const Elf_Dyn *dyn_soname;
 	const Elf_Dyn *dyn_runpath;
 
 	digest_dynamic1(obj, early, &dyn_rpath, &dyn_soname, &dyn_runpath);
 	digest_dynamic2(obj, dyn_rpath, dyn_soname, dyn_runpath);
 }
 
 /*
  * Process a shared object's program header.  This is used only for the
  * main program, when the kernel has already loaded the main program
  * into memory before calling the dynamic linker.  It creates and
  * returns an Obj_Entry structure.
  */
 static Obj_Entry *
 digest_phdr(const Elf_Phdr *phdr, int phnum, caddr_t entry, const char *path)
 {
     Obj_Entry *obj;
     const Elf_Phdr *phlimit = phdr + phnum;
     const Elf_Phdr *ph;
     Elf_Addr note_start, note_end;
     int nsegs = 0;
 
     obj = obj_new();
     for (ph = phdr;  ph < phlimit;  ph++) {
 	if (ph->p_type != PT_PHDR)
 	    continue;
 
 	obj->phdr = phdr;
 	obj->phsize = ph->p_memsz;
 	obj->relocbase = __DECONST(char *, phdr) - ph->p_vaddr;
 	break;
     }
 
     obj->stack_flags = PF_X | PF_R | PF_W;
 
     for (ph = phdr;  ph < phlimit;  ph++) {
 	switch (ph->p_type) {
 
 	case PT_INTERP:
 	    obj->interp = (const char *)(ph->p_vaddr + obj->relocbase);
 	    break;
 
 	case PT_LOAD:
 	    if (nsegs == 0) {	/* First load segment */
 		obj->vaddrbase = trunc_page(ph->p_vaddr);
 		obj->mapbase = obj->vaddrbase + obj->relocbase;
 	    } else {		/* Last load segment */
 		obj->mapsize = round_page(ph->p_vaddr + ph->p_memsz) -
 		  obj->vaddrbase;
 	    }
 	    nsegs++;
 	    break;
 
 	case PT_DYNAMIC:
 	    obj->dynamic = (const Elf_Dyn *)(ph->p_vaddr + obj->relocbase);
 	    break;
 
 	case PT_TLS:
 	    obj->tlsindex = 1;
 	    obj->tlssize = ph->p_memsz;
 	    obj->tlsalign = ph->p_align;
 	    obj->tlsinitsize = ph->p_filesz;
 	    obj->tlsinit = (void*)(ph->p_vaddr + obj->relocbase);
 	    break;
 
 	case PT_GNU_STACK:
 	    obj->stack_flags = ph->p_flags;
 	    break;
 
 	case PT_GNU_RELRO:
 	    obj->relro_page = obj->relocbase + trunc_page(ph->p_vaddr);
 	    obj->relro_size = round_page(ph->p_memsz);
 	    break;
 
 	case PT_NOTE:
 	    note_start = (Elf_Addr)obj->relocbase + ph->p_vaddr;
 	    note_end = note_start + ph->p_filesz;
 	    digest_notes(obj, note_start, note_end);
 	    break;
 	}
     }
     if (nsegs < 1) {
 	_rtld_error("%s: too few PT_LOAD segments", path);
 	return NULL;
     }
 
     obj->entry = entry;
     return obj;
 }
 
 void
 digest_notes(Obj_Entry *obj, Elf_Addr note_start, Elf_Addr note_end)
 {
 	const Elf_Note *note;
 	const char *note_name;
 	uintptr_t p;
 
 	for (note = (const Elf_Note *)note_start; (Elf_Addr)note < note_end;
 	    note = (const Elf_Note *)((const char *)(note + 1) +
 	      roundup2(note->n_namesz, sizeof(Elf32_Addr)) +
 	      roundup2(note->n_descsz, sizeof(Elf32_Addr)))) {
 		if (note->n_namesz != sizeof(NOTE_FREEBSD_VENDOR) ||
 		    note->n_descsz != sizeof(int32_t))
 			continue;
 		if (note->n_type != NT_FREEBSD_ABI_TAG &&
 		    note->n_type != NT_FREEBSD_FEATURE_CTL &&
 		    note->n_type != NT_FREEBSD_NOINIT_TAG)
 			continue;
 		note_name = (const char *)(note + 1);
 		if (strncmp(NOTE_FREEBSD_VENDOR, note_name,
 		    sizeof(NOTE_FREEBSD_VENDOR)) != 0)
 			continue;
 		switch (note->n_type) {
 		case NT_FREEBSD_ABI_TAG:
 			/* FreeBSD osrel note */
 			p = (uintptr_t)(note + 1);
 			p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
 			obj->osrel = *(const int32_t *)(p);
 			dbg("note osrel %d", obj->osrel);
 			break;
 		case NT_FREEBSD_FEATURE_CTL:
 			/* FreeBSD ABI feature control note */
 			p = (uintptr_t)(note + 1);
 			p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
 			obj->fctl0 = *(const uint32_t *)(p);
 			dbg("note fctl0 %#x", obj->fctl0);
 			break;
 		case NT_FREEBSD_NOINIT_TAG:
 			/* FreeBSD 'crt does not call init' note */
 			obj->crt_no_init = true;
 			dbg("note crt_no_init");
 			break;
 		}
 	}
 }
 
 static Obj_Entry *
 dlcheck(void *handle)
 {
     Obj_Entry *obj;
 
     TAILQ_FOREACH(obj, &obj_list, next) {
 	if (obj == (Obj_Entry *) handle)
 	    break;
     }
 
     if (obj == NULL || obj->refcount == 0 || obj->dl_refcount == 0) {
 	_rtld_error("Invalid shared object handle %p", handle);
 	return NULL;
     }
     return obj;
 }
 
 /*
  * If the given object is already in the donelist, return true.  Otherwise
  * add the object to the list and return false.
  */
 static bool
 donelist_check(DoneList *dlp, const Obj_Entry *obj)
 {
     unsigned int i;
 
     for (i = 0;  i < dlp->num_used;  i++)
 	if (dlp->objs[i] == obj)
 	    return true;
     /*
      * Our donelist allocation should always be sufficient.  But if
      * our threads locking isn't working properly, more shared objects
      * could have been loaded since we allocated the list.  That should
      * never happen, but we'll handle it properly just in case it does.
      */
     if (dlp->num_used < dlp->num_alloc)
 	dlp->objs[dlp->num_used++] = obj;
     return false;
 }
 
 /*
  * Hash function for symbol table lookup.  Don't even think about changing
  * this.  It is specified by the System V ABI.
  */
 unsigned long
 elf_hash(const char *name)
 {
     const unsigned char *p = (const unsigned char *) name;
     unsigned long h = 0;
     unsigned long g;
 
     while (*p != '\0') {
 	h = (h << 4) + *p++;
 	if ((g = h & 0xf0000000) != 0)
 	    h ^= g >> 24;
 	h &= ~g;
     }
     return h;
 }
 
 /*
  * The GNU hash function is the Daniel J. Bernstein hash clipped to 32 bits
  * unsigned in case it's implemented with a wider type.
  */
 static uint32_t
 gnu_hash(const char *s)
 {
 	uint32_t h;
 	unsigned char c;
 
 	h = 5381;
 	for (c = *s; c != '\0'; c = *++s)
 		h = h * 33 + c;
 	return (h & 0xffffffff);
 }
 
 
 /*
  * Find the library with the given name, and return its full pathname.
  * The returned string is dynamically allocated.  Generates an error
  * message and returns NULL if the library cannot be found.
  *
  * If the second argument is non-NULL, then it refers to an already-
  * loaded shared object, whose library search path will be searched.
  *
  * If a library is successfully located via LD_LIBRARY_PATH_FDS, its
  * descriptor (which is close-on-exec) will be passed out via the third
  * argument.
  *
  * The search order is:
  *   DT_RPATH in the referencing file _unless_ DT_RUNPATH is present (1)
  *   DT_RPATH of the main object if DSO without defined DT_RUNPATH (1)
  *   LD_LIBRARY_PATH
  *   DT_RUNPATH in the referencing file
  *   ldconfig hints (if -z nodefaultlib, filter out default library directories
  *	 from list)
  *   /lib:/usr/lib _unless_ the referencing file is linked with -z nodefaultlib
  *
  * (1) Handled in digest_dynamic2 - rpath left NULL if runpath defined.
  */
 static char *
 find_library(const char *xname, const Obj_Entry *refobj, int *fdp)
 {
 	char *pathname, *refobj_path;
 	const char *name;
 	bool nodeflib, objgiven;
 
 	objgiven = refobj != NULL;
 
 	if (libmap_disable || !objgiven ||
 	    (name = lm_find(refobj->path, xname)) == NULL)
 		name = xname;
 
 	if (strchr(name, '/') != NULL) {	/* Hard coded pathname */
 		if (name[0] != '/' && !trust) {
 			_rtld_error("Absolute pathname required "
 			    "for shared object \"%s\"", name);
 			return (NULL);
 		}
 		return (origin_subst(__DECONST(Obj_Entry *, refobj),
 		    __DECONST(char *, name)));
 	}
 
 	dbg(" Searching for \"%s\"", name);
 	refobj_path = objgiven ? refobj->path : NULL;
 
 	/*
 	 * If refobj->rpath != NULL, then refobj->runpath is NULL.  Fall
 	 * back to pre-conforming behaviour if user requested so with
 	 * LD_LIBRARY_PATH_RPATH environment variable and ignore -z
 	 * nodeflib.
 	 */
 	if (objgiven && refobj->rpath != NULL && ld_library_path_rpath) {
 		pathname = search_library_path(name, ld_library_path,
 		    refobj_path, fdp);
 		if (pathname != NULL)
 			return (pathname);
 		if (refobj != NULL) {
 			pathname = search_library_path(name, refobj->rpath,
 			    refobj_path, fdp);
 			if (pathname != NULL)
 				return (pathname);
 		}
 		pathname = search_library_pathfds(name, ld_library_dirs, fdp);
 		if (pathname != NULL)
 			return (pathname);
 		pathname = search_library_path(name, gethints(false),
 		    refobj_path, fdp);
 		if (pathname != NULL)
 			return (pathname);
 		pathname = search_library_path(name, ld_standard_library_path,
 		    refobj_path, fdp);
 		if (pathname != NULL)
 			return (pathname);
 	} else {
 		nodeflib = objgiven ? refobj->z_nodeflib : false;
 		if (objgiven) {
 			pathname = search_library_path(name, refobj->rpath,
 			    refobj->path, fdp);
 			if (pathname != NULL)
 				return (pathname);
 		}
 		if (objgiven && refobj->runpath == NULL && refobj != obj_main) {
 			pathname = search_library_path(name, obj_main->rpath,
 			    refobj_path, fdp);
 			if (pathname != NULL)
 				return (pathname);
 		}
 		pathname = search_library_path(name, ld_library_path,
 		    refobj_path, fdp);
 		if (pathname != NULL)
 			return (pathname);
 		if (objgiven) {
 			pathname = search_library_path(name, refobj->runpath,
 			    refobj_path, fdp);
 			if (pathname != NULL)
 				return (pathname);
 		}
 		pathname = search_library_pathfds(name, ld_library_dirs, fdp);
 		if (pathname != NULL)
 			return (pathname);
 		pathname = search_library_path(name, gethints(nodeflib),
 		    refobj_path, fdp);
 		if (pathname != NULL)
 			return (pathname);
 		if (objgiven && !nodeflib) {
 			pathname = search_library_path(name,
 			    ld_standard_library_path, refobj_path, fdp);
 			if (pathname != NULL)
 				return (pathname);
 		}
 	}
 
 	if (objgiven && refobj->path != NULL) {
 		_rtld_error("Shared object \"%s\" not found, "
 		    "required by \"%s\"", name, basename(refobj->path));
 	} else {
 		_rtld_error("Shared object \"%s\" not found", name);
 	}
 	return (NULL);
 }
 
 /*
  * Given a symbol number in a referencing object, find the corresponding
  * definition of the symbol.  Returns a pointer to the symbol, or NULL if
  * no definition was found.  Returns a pointer to the Obj_Entry of the
  * defining object via the reference parameter DEFOBJ_OUT.
  */
 const Elf_Sym *
 find_symdef(unsigned long symnum, const Obj_Entry *refobj,
     const Obj_Entry **defobj_out, int flags, SymCache *cache,
     RtldLockState *lockstate)
 {
     const Elf_Sym *ref;
     const Elf_Sym *def;
     const Obj_Entry *defobj;
     const Ver_Entry *ve;
     SymLook req;
     const char *name;
     int res;
 
     /*
      * If we have already found this symbol, get the information from
      * the cache.
      */
     if (symnum >= refobj->dynsymcount)
 	return NULL;	/* Bad object */
     if (cache != NULL && cache[symnum].sym != NULL) {
 	*defobj_out = cache[symnum].obj;
 	return cache[symnum].sym;
     }
 
     ref = refobj->symtab + symnum;
     name = refobj->strtab + ref->st_name;
     def = NULL;
     defobj = NULL;
     ve = NULL;
 
     /*
      * We don't have to do a full scale lookup if the symbol is local.
      * We know it will bind to the instance in this load module; to
      * which we already have a pointer (ie ref). By not doing a lookup,
      * we not only improve performance, but it also avoids unresolvable
      * symbols when local symbols are not in the hash table. This has
      * been seen with the ia64 toolchain.
      */
     if (ELF_ST_BIND(ref->st_info) != STB_LOCAL) {
 	if (ELF_ST_TYPE(ref->st_info) == STT_SECTION) {
 	    _rtld_error("%s: Bogus symbol table entry %lu", refobj->path,
 		symnum);
 	}
 	symlook_init(&req, name);
 	req.flags = flags;
 	ve = req.ventry = fetch_ventry(refobj, symnum);
 	req.lockstate = lockstate;
 	res = symlook_default(&req, refobj);
 	if (res == 0) {
 	    def = req.sym_out;
 	    defobj = req.defobj_out;
 	}
     } else {
 	def = ref;
 	defobj = refobj;
     }
 
     /*
      * If we found no definition and the reference is weak, treat the
      * symbol as having the value zero.
      */
     if (def == NULL && ELF_ST_BIND(ref->st_info) == STB_WEAK) {
 	def = &sym_zero;
 	defobj = obj_main;
     }
 
     if (def != NULL) {
 	*defobj_out = defobj;
 	/* Record the information in the cache to avoid subsequent lookups. */
 	if (cache != NULL) {
 	    cache[symnum].sym = def;
 	    cache[symnum].obj = defobj;
 	}
     } else {
 	if (refobj != &obj_rtld)
 	    _rtld_error("%s: Undefined symbol \"%s%s%s\"", refobj->path, name,
 	      ve != NULL ? "@" : "", ve != NULL ? ve->name : "");
     }
     return def;
 }
 
 /*
  * Return the search path from the ldconfig hints file, reading it if
  * necessary.  If nostdlib is true, then the default search paths are
  * not added to result.
  *
  * Returns NULL if there are problems with the hints file,
  * or if the search path there is empty.
  */
 static const char *
 gethints(bool nostdlib)
 {
 	static char *filtered_path;
 	static const char *hints;
 	static struct elfhints_hdr hdr;
 	struct fill_search_info_args sargs, hargs;
 	struct dl_serinfo smeta, hmeta, *SLPinfo, *hintinfo;
 	struct dl_serpath *SLPpath, *hintpath;
 	char *p;
 	struct stat hint_stat;
 	unsigned int SLPndx, hintndx, fndx, fcount;
 	int fd;
 	size_t flen;
 	uint32_t dl;
 	bool skip;
 
 	/* First call, read the hints file */
 	if (hints == NULL) {
 		/* Keep from trying again in case the hints file is bad. */
 		hints = "";
 
 		if ((fd = open(ld_elf_hints_path, O_RDONLY | O_CLOEXEC)) == -1)
 			return (NULL);
 
 		/*
 		 * Check of hdr.dirlistlen value against type limit
 		 * intends to pacify static analyzers.  Further
 		 * paranoia leads to checks that dirlist is fully
 		 * contained in the file range.
 		 */
 		if (read(fd, &hdr, sizeof hdr) != sizeof hdr ||
 		    hdr.magic != ELFHINTS_MAGIC ||
 		    hdr.version != 1 || hdr.dirlistlen > UINT_MAX / 2 ||
 		    fstat(fd, &hint_stat) == -1) {
 cleanup1:
 			close(fd);
 			hdr.dirlistlen = 0;
 			return (NULL);
 		}
 		dl = hdr.strtab;
 		if (dl + hdr.dirlist < dl)
 			goto cleanup1;
 		dl += hdr.dirlist;
 		if (dl + hdr.dirlistlen < dl)
 			goto cleanup1;
 		dl += hdr.dirlistlen;
 		if (dl > hint_stat.st_size)
 			goto cleanup1;
 		p = xmalloc(hdr.dirlistlen + 1);
 		if (pread(fd, p, hdr.dirlistlen + 1,
 		    hdr.strtab + hdr.dirlist) != (ssize_t)hdr.dirlistlen + 1 ||
 		    p[hdr.dirlistlen] != '\0') {
 			free(p);
 			goto cleanup1;
 		}
 		hints = p;
 		close(fd);
 	}
 
 	/*
 	 * If caller agreed to receive list which includes the default
 	 * paths, we are done. Otherwise, if we still did not
 	 * calculated filtered result, do it now.
 	 */
 	if (!nostdlib)
 		return (hints[0] != '\0' ? hints : NULL);
 	if (filtered_path != NULL)
 		goto filt_ret;
 
 	/*
 	 * Obtain the list of all configured search paths, and the
 	 * list of the default paths.
 	 *
 	 * First estimate the size of the results.
 	 */
 	smeta.dls_size = __offsetof(struct dl_serinfo, dls_serpath);
 	smeta.dls_cnt = 0;
 	hmeta.dls_size = __offsetof(struct dl_serinfo, dls_serpath);
 	hmeta.dls_cnt = 0;
 
 	sargs.request = RTLD_DI_SERINFOSIZE;
 	sargs.serinfo = &smeta;
 	hargs.request = RTLD_DI_SERINFOSIZE;
 	hargs.serinfo = &hmeta;
 
 	path_enumerate(ld_standard_library_path, fill_search_info, NULL,
 	    &sargs);
 	path_enumerate(hints, fill_search_info, NULL, &hargs);
 
 	SLPinfo = xmalloc(smeta.dls_size);
 	hintinfo = xmalloc(hmeta.dls_size);
 
 	/*
 	 * Next fetch both sets of paths.
 	 */
 	sargs.request = RTLD_DI_SERINFO;
 	sargs.serinfo = SLPinfo;
 	sargs.serpath = &SLPinfo->dls_serpath[0];
 	sargs.strspace = (char *)&SLPinfo->dls_serpath[smeta.dls_cnt];
 
 	hargs.request = RTLD_DI_SERINFO;
 	hargs.serinfo = hintinfo;
 	hargs.serpath = &hintinfo->dls_serpath[0];
 	hargs.strspace = (char *)&hintinfo->dls_serpath[hmeta.dls_cnt];
 
 	path_enumerate(ld_standard_library_path, fill_search_info, NULL,
 	    &sargs);
 	path_enumerate(hints, fill_search_info, NULL, &hargs);
 
 	/*
 	 * Now calculate the difference between two sets, by excluding
 	 * standard paths from the full set.
 	 */
 	fndx = 0;
 	fcount = 0;
 	filtered_path = xmalloc(hdr.dirlistlen + 1);
 	hintpath = &hintinfo->dls_serpath[0];
 	for (hintndx = 0; hintndx < hmeta.dls_cnt; hintndx++, hintpath++) {
 		skip = false;
 		SLPpath = &SLPinfo->dls_serpath[0];
 		/*
 		 * Check each standard path against current.
 		 */
 		for (SLPndx = 0; SLPndx < smeta.dls_cnt; SLPndx++, SLPpath++) {
 			/* matched, skip the path */
 			if (!strcmp(hintpath->dls_name, SLPpath->dls_name)) {
 				skip = true;
 				break;
 			}
 		}
 		if (skip)
 			continue;
 		/*
 		 * Not matched against any standard path, add the path
 		 * to result. Separate consequtive paths with ':'.
 		 */
 		if (fcount > 0) {
 			filtered_path[fndx] = ':';
 			fndx++;
 		}
 		fcount++;
 		flen = strlen(hintpath->dls_name);
 		strncpy((filtered_path + fndx),	hintpath->dls_name, flen);
 		fndx += flen;
 	}
 	filtered_path[fndx] = '\0';
 
 	free(SLPinfo);
 	free(hintinfo);
 
 filt_ret:
 	return (filtered_path[0] != '\0' ? filtered_path : NULL);
 }
 
 static void
 init_dag(Obj_Entry *root)
 {
     const Needed_Entry *needed;
     const Objlist_Entry *elm;
     DoneList donelist;
 
     if (root->dag_inited)
 	return;
     donelist_init(&donelist);
 
     /* Root object belongs to own DAG. */
     objlist_push_tail(&root->dldags, root);
     objlist_push_tail(&root->dagmembers, root);
     donelist_check(&donelist, root);
 
     /*
      * Add dependencies of root object to DAG in breadth order
      * by exploiting the fact that each new object get added
      * to the tail of the dagmembers list.
      */
     STAILQ_FOREACH(elm, &root->dagmembers, link) {
 	for (needed = elm->obj->needed; needed != NULL; needed = needed->next) {
 	    if (needed->obj == NULL || donelist_check(&donelist, needed->obj))
 		continue;
 	    objlist_push_tail(&needed->obj->dldags, root);
 	    objlist_push_tail(&root->dagmembers, needed->obj);
 	}
     }
     root->dag_inited = true;
 }
 
 static void
 init_marker(Obj_Entry *marker)
 {
 
 	bzero(marker, sizeof(*marker));
 	marker->marker = true;
 }
 
 Obj_Entry *
 globallist_curr(const Obj_Entry *obj)
 {
 
 	for (;;) {
 		if (obj == NULL)
 			return (NULL);
 		if (!obj->marker)
 			return (__DECONST(Obj_Entry *, obj));
 		obj = TAILQ_PREV(obj, obj_entry_q, next);
 	}
 }
 
 Obj_Entry *
 globallist_next(const Obj_Entry *obj)
 {
 
 	for (;;) {
 		obj = TAILQ_NEXT(obj, next);
 		if (obj == NULL)
 			return (NULL);
 		if (!obj->marker)
 			return (__DECONST(Obj_Entry *, obj));
 	}
 }
 
 /* Prevent the object from being unmapped while the bind lock is dropped. */
 static void
 hold_object(Obj_Entry *obj)
 {
 
 	obj->holdcount++;
 }
 
 static void
 unhold_object(Obj_Entry *obj)
 {
 
 	assert(obj->holdcount > 0);
 	if (--obj->holdcount == 0 && obj->unholdfree)
 		release_object(obj);
 }
 
 static void
 process_z(Obj_Entry *root)
 {
 	const Objlist_Entry *elm;
 	Obj_Entry *obj;
 
 	/*
 	 * Walk over object DAG and process every dependent object
 	 * that is marked as DF_1_NODELETE or DF_1_GLOBAL. They need
 	 * to grow their own DAG.
 	 *
 	 * For DF_1_GLOBAL, DAG is required for symbol lookups in
 	 * symlook_global() to work.
 	 *
 	 * For DF_1_NODELETE, the DAG should have its reference upped.
 	 */
 	STAILQ_FOREACH(elm, &root->dagmembers, link) {
 		obj = elm->obj;
 		if (obj == NULL)
 			continue;
 		if (obj->z_nodelete && !obj->ref_nodel) {
 			dbg("obj %s -z nodelete", obj->path);
 			init_dag(obj);
 			ref_dag(obj);
 			obj->ref_nodel = true;
 		}
 		if (obj->z_global && objlist_find(&list_global, obj) == NULL) {
 			dbg("obj %s -z global", obj->path);
 			objlist_push_tail(&list_global, obj);
 			init_dag(obj);
 		}
 	}
 }
 /*
  * Initialize the dynamic linker.  The argument is the address at which
  * the dynamic linker has been mapped into memory.  The primary task of
  * this function is to relocate the dynamic linker.
  */
 static void
 init_rtld(caddr_t mapbase, Elf_Auxinfo **aux_info)
 {
     Obj_Entry objtmp;	/* Temporary rtld object */
     const Elf_Ehdr *ehdr;
     const Elf_Dyn *dyn_rpath;
     const Elf_Dyn *dyn_soname;
     const Elf_Dyn *dyn_runpath;
 
 #ifdef RTLD_INIT_PAGESIZES_EARLY
     /* The page size is required by the dynamic memory allocator. */
     init_pagesizes(aux_info);
 #endif
 
     /*
      * Conjure up an Obj_Entry structure for the dynamic linker.
      *
      * The "path" member can't be initialized yet because string constants
      * cannot yet be accessed. Below we will set it correctly.
      */
     memset(&objtmp, 0, sizeof(objtmp));
     objtmp.path = NULL;
     objtmp.rtld = true;
     objtmp.mapbase = mapbase;
 #ifdef PIC
     objtmp.relocbase = mapbase;
 #endif
 
     objtmp.dynamic = rtld_dynamic(&objtmp);
     digest_dynamic1(&objtmp, 1, &dyn_rpath, &dyn_soname, &dyn_runpath);
     assert(objtmp.needed == NULL);
 #if !defined(__mips__)
     /* MIPS has a bogus DT_TEXTREL. */
     assert(!objtmp.textrel);
 #endif
     /*
      * Temporarily put the dynamic linker entry into the object list, so
      * that symbols can be found.
      */
     relocate_objects(&objtmp, true, &objtmp, 0, NULL);
 
     ehdr = (Elf_Ehdr *)mapbase;
     objtmp.phdr = (Elf_Phdr *)((char *)mapbase + ehdr->e_phoff);
     objtmp.phsize = ehdr->e_phnum * sizeof(objtmp.phdr[0]);
 
     /* Initialize the object list. */
     TAILQ_INIT(&obj_list);
 
     /* Now that non-local variables can be accesses, copy out obj_rtld. */
     memcpy(&obj_rtld, &objtmp, sizeof(obj_rtld));
 
 #ifndef RTLD_INIT_PAGESIZES_EARLY
     /* The page size is required by the dynamic memory allocator. */
     init_pagesizes(aux_info);
 #endif
 
     if (aux_info[AT_OSRELDATE] != NULL)
 	    osreldate = aux_info[AT_OSRELDATE]->a_un.a_val;
 
     digest_dynamic2(&obj_rtld, dyn_rpath, dyn_soname, dyn_runpath);
 
     /* Replace the path with a dynamically allocated copy. */
     obj_rtld.path = xstrdup(ld_path_rtld);
 
     r_debug.r_brk = r_debug_state;
     r_debug.r_state = RT_CONSISTENT;
 }
 
 /*
  * Retrieve the array of supported page sizes.  The kernel provides the page
  * sizes in increasing order.
  */
 static void
 init_pagesizes(Elf_Auxinfo **aux_info)
 {
 	static size_t psa[MAXPAGESIZES];
 	int mib[2];
 	size_t len, size;
 
 	if (aux_info[AT_PAGESIZES] != NULL && aux_info[AT_PAGESIZESLEN] !=
 	    NULL) {
 		size = aux_info[AT_PAGESIZESLEN]->a_un.a_val;
 		pagesizes = aux_info[AT_PAGESIZES]->a_un.a_ptr;
 	} else {
 		len = 2;
 		if (sysctlnametomib("hw.pagesizes", mib, &len) == 0)
 			size = sizeof(psa);
 		else {
 			/* As a fallback, retrieve the base page size. */
 			size = sizeof(psa[0]);
 			if (aux_info[AT_PAGESZ] != NULL) {
 				psa[0] = aux_info[AT_PAGESZ]->a_un.a_val;
 				goto psa_filled;
 			} else {
 				mib[0] = CTL_HW;
 				mib[1] = HW_PAGESIZE;
 				len = 2;
 			}
 		}
 		if (sysctl(mib, len, psa, &size, NULL, 0) == -1) {
 			_rtld_error("sysctl for hw.pagesize(s) failed");
 			rtld_die();
 		}
 psa_filled:
 		pagesizes = psa;
 	}
 	npagesizes = size / sizeof(pagesizes[0]);
 	/* Discard any invalid entries at the end of the array. */
 	while (npagesizes > 0 && pagesizes[npagesizes - 1] == 0)
 		npagesizes--;
 }
 
 /*
  * Add the init functions from a needed object list (and its recursive
  * needed objects) to "list".  This is not used directly; it is a helper
  * function for initlist_add_objects().  The write lock must be held
  * when this function is called.
  */
 static void
 initlist_add_neededs(Needed_Entry *needed, Objlist *list)
 {
     /* Recursively process the successor needed objects. */
     if (needed->next != NULL)
 	initlist_add_neededs(needed->next, list);
 
     /* Process the current needed object. */
     if (needed->obj != NULL)
 	initlist_add_objects(needed->obj, needed->obj, list);
 }
 
 /*
  * Scan all of the DAGs rooted in the range of objects from "obj" to
  * "tail" and add their init functions to "list".  This recurses over
  * the DAGs and ensure the proper init ordering such that each object's
  * needed libraries are initialized before the object itself.  At the
  * same time, this function adds the objects to the global finalization
  * list "list_fini" in the opposite order.  The write lock must be
  * held when this function is called.
  */
 static void
 initlist_add_objects(Obj_Entry *obj, Obj_Entry *tail, Objlist *list)
 {
     Obj_Entry *nobj;
 
     if (obj->init_scanned || obj->init_done)
 	return;
     obj->init_scanned = true;
 
     /* Recursively process the successor objects. */
     nobj = globallist_next(obj);
     if (nobj != NULL && obj != tail)
 	initlist_add_objects(nobj, tail, list);
 
     /* Recursively process the needed objects. */
     if (obj->needed != NULL)
 	initlist_add_neededs(obj->needed, list);
     if (obj->needed_filtees != NULL)
 	initlist_add_neededs(obj->needed_filtees, list);
     if (obj->needed_aux_filtees != NULL)
 	initlist_add_neededs(obj->needed_aux_filtees, list);
 
     /* Add the object to the init list. */
     objlist_push_tail(list, obj);
 
     /* Add the object to the global fini list in the reverse order. */
     if ((obj->fini != (Elf_Addr)NULL || obj->fini_array != (Elf_Addr)NULL)
       && !obj->on_fini_list) {
 	objlist_push_head(&list_fini, obj);
 	obj->on_fini_list = true;
     }
 }
 
 #ifndef FPTR_TARGET
 #define FPTR_TARGET(f)	((Elf_Addr) (f))
 #endif
 
 static void
 free_needed_filtees(Needed_Entry *n, RtldLockState *lockstate)
 {
     Needed_Entry *needed, *needed1;
 
     for (needed = n; needed != NULL; needed = needed->next) {
 	if (needed->obj != NULL) {
 	    dlclose_locked(needed->obj, lockstate);
 	    needed->obj = NULL;
 	}
     }
     for (needed = n; needed != NULL; needed = needed1) {
 	needed1 = needed->next;
 	free(needed);
     }
 }
 
 static void
 unload_filtees(Obj_Entry *obj, RtldLockState *lockstate)
 {
 
 	free_needed_filtees(obj->needed_filtees, lockstate);
 	obj->needed_filtees = NULL;
 	free_needed_filtees(obj->needed_aux_filtees, lockstate);
 	obj->needed_aux_filtees = NULL;
 	obj->filtees_loaded = false;
 }
 
 static void
 load_filtee1(Obj_Entry *obj, Needed_Entry *needed, int flags,
     RtldLockState *lockstate)
 {
 
     for (; needed != NULL; needed = needed->next) {
 	needed->obj = dlopen_object(obj->strtab + needed->name, -1, obj,
 	  flags, ((ld_loadfltr || obj->z_loadfltr) ? RTLD_NOW : RTLD_LAZY) |
 	  RTLD_LOCAL, lockstate);
     }
 }
 
 static void
 load_filtees(Obj_Entry *obj, int flags, RtldLockState *lockstate)
 {
 
     lock_restart_for_upgrade(lockstate);
     if (!obj->filtees_loaded) {
 	load_filtee1(obj, obj->needed_filtees, flags, lockstate);
 	load_filtee1(obj, obj->needed_aux_filtees, flags, lockstate);
 	obj->filtees_loaded = true;
     }
 }
 
 static int
 process_needed(Obj_Entry *obj, Needed_Entry *needed, int flags)
 {
     Obj_Entry *obj1;
 
     for (; needed != NULL; needed = needed->next) {
 	obj1 = needed->obj = load_object(obj->strtab + needed->name, -1, obj,
 	  flags & ~RTLD_LO_NOLOAD);
 	if (obj1 == NULL && !ld_tracing && (flags & RTLD_LO_FILTEES) == 0)
 	    return (-1);
     }
     return (0);
 }
 
 /*
  * Given a shared object, traverse its list of needed objects, and load
  * each of them.  Returns 0 on success.  Generates an error message and
  * returns -1 on failure.
  */
 static int
 load_needed_objects(Obj_Entry *first, int flags)
 {
     Obj_Entry *obj;
 
     for (obj = first; obj != NULL; obj = TAILQ_NEXT(obj, next)) {
 	if (obj->marker)
 	    continue;
 	if (process_needed(obj, obj->needed, flags) == -1)
 	    return (-1);
     }
     return (0);
 }
 
 static int
 load_preload_objects(void)
 {
     char *p = ld_preload;
     Obj_Entry *obj;
     static const char delim[] = " \t:;";
 
     if (p == NULL)
 	return 0;
 
     p += strspn(p, delim);
     while (*p != '\0') {
 	size_t len = strcspn(p, delim);
 	char savech;
 
 	savech = p[len];
 	p[len] = '\0';
 	obj = load_object(p, -1, NULL, 0);
 	if (obj == NULL)
 	    return -1;	/* XXX - cleanup */
 	obj->z_interpose = true;
 	p[len] = savech;
 	p += len;
 	p += strspn(p, delim);
     }
     LD_UTRACE(UTRACE_PRELOAD_FINISHED, NULL, NULL, 0, 0, NULL);
     return 0;
 }
 
 static const char *
 printable_path(const char *path)
 {
 
 	return (path == NULL ? "<unknown>" : path);
 }
 
 /*
  * Load a shared object into memory, if it is not already loaded.  The
  * object may be specified by name or by user-supplied file descriptor
  * fd_u. In the later case, the fd_u descriptor is not closed, but its
  * duplicate is.
  *
  * Returns a pointer to the Obj_Entry for the object.  Returns NULL
  * on failure.
  */
 static Obj_Entry *
 load_object(const char *name, int fd_u, const Obj_Entry *refobj, int flags)
 {
     Obj_Entry *obj;
     int fd;
     struct stat sb;
     char *path;
 
     fd = -1;
     if (name != NULL) {
 	TAILQ_FOREACH(obj, &obj_list, next) {
 	    if (obj->marker || obj->doomed)
 		continue;
 	    if (object_match_name(obj, name))
 		return (obj);
 	}
 
 	path = find_library(name, refobj, &fd);
 	if (path == NULL)
 	    return (NULL);
     } else
 	path = NULL;
 
     if (fd >= 0) {
 	/*
 	 * search_library_pathfds() opens a fresh file descriptor for the
 	 * library, so there is no need to dup().
 	 */
     } else if (fd_u == -1) {
 	/*
 	 * If we didn't find a match by pathname, or the name is not
 	 * supplied, open the file and check again by device and inode.
 	 * This avoids false mismatches caused by multiple links or ".."
 	 * in pathnames.
 	 *
 	 * To avoid a race, we open the file and use fstat() rather than
 	 * using stat().
 	 */
 	if ((fd = open(path, O_RDONLY | O_CLOEXEC | O_VERIFY)) == -1) {
 	    _rtld_error("Cannot open \"%s\"", path);
 	    free(path);
 	    return (NULL);
 	}
     } else {
 	fd = fcntl(fd_u, F_DUPFD_CLOEXEC, 0);
 	if (fd == -1) {
 	    _rtld_error("Cannot dup fd");
 	    free(path);
 	    return (NULL);
 	}
     }
     if (fstat(fd, &sb) == -1) {
 	_rtld_error("Cannot fstat \"%s\"", printable_path(path));
 	close(fd);
 	free(path);
 	return NULL;
     }
     TAILQ_FOREACH(obj, &obj_list, next) {
 	if (obj->marker || obj->doomed)
 	    continue;
 	if (obj->ino == sb.st_ino && obj->dev == sb.st_dev)
 	    break;
     }
     if (obj != NULL && name != NULL) {
 	object_add_name(obj, name);
 	free(path);
 	close(fd);
 	return obj;
     }
     if (flags & RTLD_LO_NOLOAD) {
 	free(path);
 	close(fd);
 	return (NULL);
     }
 
     /* First use of this object, so we must map it in */
     obj = do_load_object(fd, name, path, &sb, flags);
     if (obj == NULL)
 	free(path);
     close(fd);
 
     return obj;
 }
 
 static Obj_Entry *
 do_load_object(int fd, const char *name, char *path, struct stat *sbp,
   int flags)
 {
     Obj_Entry *obj;
     struct statfs fs;
 
     /*
      * but first, make sure that environment variables haven't been
      * used to circumvent the noexec flag on a filesystem.
      */
     if (dangerous_ld_env) {
 	if (fstatfs(fd, &fs) != 0) {
 	    _rtld_error("Cannot fstatfs \"%s\"", printable_path(path));
 	    return NULL;
 	}
 	if (fs.f_flags & MNT_NOEXEC) {
 	    _rtld_error("Cannot execute objects on %s", fs.f_mntonname);
 	    return NULL;
 	}
     }
     dbg("loading \"%s\"", printable_path(path));
     obj = map_object(fd, printable_path(path), sbp);
     if (obj == NULL)
         return NULL;
 
     /*
      * If DT_SONAME is present in the object, digest_dynamic2 already
      * added it to the object names.
      */
     if (name != NULL)
 	object_add_name(obj, name);
     obj->path = path;
     digest_dynamic(obj, 0);
     dbg("%s valid_hash_sysv %d valid_hash_gnu %d dynsymcount %d", obj->path,
 	obj->valid_hash_sysv, obj->valid_hash_gnu, obj->dynsymcount);
     if (obj->z_noopen && (flags & (RTLD_LO_DLOPEN | RTLD_LO_TRACE)) ==
       RTLD_LO_DLOPEN) {
 	dbg("refusing to load non-loadable \"%s\"", obj->path);
 	_rtld_error("Cannot dlopen non-loadable %s", obj->path);
 	munmap(obj->mapbase, obj->mapsize);
 	obj_free(obj);
 	return (NULL);
     }
 
     obj->dlopened = (flags & RTLD_LO_DLOPEN) != 0;
     TAILQ_INSERT_TAIL(&obj_list, obj, next);
     obj_count++;
     obj_loads++;
     linkmap_add(obj);	/* for GDB & dlinfo() */
     max_stack_flags |= obj->stack_flags;
 
     dbg("  %p .. %p: %s", obj->mapbase,
          obj->mapbase + obj->mapsize - 1, obj->path);
     if (obj->textrel)
 	dbg("  WARNING: %s has impure text", obj->path);
     LD_UTRACE(UTRACE_LOAD_OBJECT, obj, obj->mapbase, obj->mapsize, 0,
 	obj->path);    
 
     return obj;
 }
 
 static Obj_Entry *
 obj_from_addr(const void *addr)
 {
     Obj_Entry *obj;
 
     TAILQ_FOREACH(obj, &obj_list, next) {
 	if (obj->marker)
 	    continue;
 	if (addr < (void *) obj->mapbase)
 	    continue;
 	if (addr < (void *)(obj->mapbase + obj->mapsize))
 	    return obj;
     }
     return NULL;
 }
 
 static void
 preinit_main(void)
 {
     Elf_Addr *preinit_addr;
     int index;
 
     preinit_addr = (Elf_Addr *)obj_main->preinit_array;
     if (preinit_addr == NULL)
 	return;
 
     for (index = 0; index < obj_main->preinit_array_num; index++) {
 	if (preinit_addr[index] != 0 && preinit_addr[index] != 1) {
 	    dbg("calling preinit function for %s at %p", obj_main->path,
 	      (void *)preinit_addr[index]);
 	    LD_UTRACE(UTRACE_INIT_CALL, obj_main, (void *)preinit_addr[index],
 	      0, 0, obj_main->path);
 	    call_init_pointer(obj_main, preinit_addr[index]);
 	}
     }
 }
 
 /*
  * Call the finalization functions for each of the objects in "list"
  * belonging to the DAG of "root" and referenced once. If NULL "root"
  * is specified, every finalization function will be called regardless
  * of the reference count and the list elements won't be freed. All of
  * the objects are expected to have non-NULL fini functions.
  */
 static void
 objlist_call_fini(Objlist *list, Obj_Entry *root, RtldLockState *lockstate)
 {
     Objlist_Entry *elm;
     char *saved_msg;
     Elf_Addr *fini_addr;
     int index;
 
     assert(root == NULL || root->refcount == 1);
 
     if (root != NULL)
 	root->doomed = true;
 
     /*
      * Preserve the current error message since a fini function might
      * call into the dynamic linker and overwrite it.
      */
     saved_msg = errmsg_save();
     do {
 	STAILQ_FOREACH(elm, list, link) {
 	    if (root != NULL && (elm->obj->refcount != 1 ||
 	      objlist_find(&root->dagmembers, elm->obj) == NULL))
 		continue;
 	    /* Remove object from fini list to prevent recursive invocation. */
 	    STAILQ_REMOVE(list, elm, Struct_Objlist_Entry, link);
 	    /* Ensure that new references cannot be acquired. */
 	    elm->obj->doomed = true;
 
 	    hold_object(elm->obj);
 	    lock_release(rtld_bind_lock, lockstate);
 	    /*
 	     * It is legal to have both DT_FINI and DT_FINI_ARRAY defined.
 	     * When this happens, DT_FINI_ARRAY is processed first.
 	     */
 	    fini_addr = (Elf_Addr *)elm->obj->fini_array;
 	    if (fini_addr != NULL && elm->obj->fini_array_num > 0) {
 		for (index = elm->obj->fini_array_num - 1; index >= 0;
 		  index--) {
 		    if (fini_addr[index] != 0 && fini_addr[index] != 1) {
 			dbg("calling fini function for %s at %p",
 			    elm->obj->path, (void *)fini_addr[index]);
 			LD_UTRACE(UTRACE_FINI_CALL, elm->obj,
 			    (void *)fini_addr[index], 0, 0, elm->obj->path);
 			call_initfini_pointer(elm->obj, fini_addr[index]);
 		    }
 		}
 	    }
 	    if (elm->obj->fini != (Elf_Addr)NULL) {
 		dbg("calling fini function for %s at %p", elm->obj->path,
 		    (void *)elm->obj->fini);
 		LD_UTRACE(UTRACE_FINI_CALL, elm->obj, (void *)elm->obj->fini,
 		    0, 0, elm->obj->path);
 		call_initfini_pointer(elm->obj, elm->obj->fini);
 	    }
 	    wlock_acquire(rtld_bind_lock, lockstate);
 	    unhold_object(elm->obj);
 	    /* No need to free anything if process is going down. */
 	    if (root != NULL)
 	    	free(elm);
 	    /*
 	     * We must restart the list traversal after every fini call
 	     * because a dlclose() call from the fini function or from
 	     * another thread might have modified the reference counts.
 	     */
 	    break;
 	}
     } while (elm != NULL);
     errmsg_restore(saved_msg);
 }
 
 /*
  * Call the initialization functions for each of the objects in
  * "list".  All of the objects are expected to have non-NULL init
  * functions.
  */
 static void
 objlist_call_init(Objlist *list, RtldLockState *lockstate)
 {
     Objlist_Entry *elm;
     Obj_Entry *obj;
     char *saved_msg;
     Elf_Addr *init_addr;
     int index;
 
     /*
      * Clean init_scanned flag so that objects can be rechecked and
      * possibly initialized earlier if any of vectors called below
      * cause the change by using dlopen.
      */
     TAILQ_FOREACH(obj, &obj_list, next) {
 	if (obj->marker)
 	    continue;
 	obj->init_scanned = false;
     }
 
     /*
      * Preserve the current error message since an init function might
      * call into the dynamic linker and overwrite it.
      */
     saved_msg = errmsg_save();
     STAILQ_FOREACH(elm, list, link) {
 	if (elm->obj->init_done) /* Initialized early. */
 	    continue;
 	/*
 	 * Race: other thread might try to use this object before current
 	 * one completes the initialization. Not much can be done here
 	 * without better locking.
 	 */
 	elm->obj->init_done = true;
 	hold_object(elm->obj);
 	lock_release(rtld_bind_lock, lockstate);
 
         /*
          * It is legal to have both DT_INIT and DT_INIT_ARRAY defined.
          * When this happens, DT_INIT is processed first.
          */
 	if (elm->obj->init != (Elf_Addr)NULL) {
 	    dbg("calling init function for %s at %p", elm->obj->path,
 	        (void *)elm->obj->init);
 	    LD_UTRACE(UTRACE_INIT_CALL, elm->obj, (void *)elm->obj->init,
 	        0, 0, elm->obj->path);
 	    call_initfini_pointer(elm->obj, elm->obj->init);
 	}
 	init_addr = (Elf_Addr *)elm->obj->init_array;
 	if (init_addr != NULL) {
 	    for (index = 0; index < elm->obj->init_array_num; index++) {
 		if (init_addr[index] != 0 && init_addr[index] != 1) {
 		    dbg("calling init function for %s at %p", elm->obj->path,
 			(void *)init_addr[index]);
 		    LD_UTRACE(UTRACE_INIT_CALL, elm->obj,
 			(void *)init_addr[index], 0, 0, elm->obj->path);
 		    call_init_pointer(elm->obj, init_addr[index]);
 		}
 	    }
 	}
 	wlock_acquire(rtld_bind_lock, lockstate);
 	unhold_object(elm->obj);
     }
     errmsg_restore(saved_msg);
 }
 
 static void
 objlist_clear(Objlist *list)
 {
     Objlist_Entry *elm;
 
     while (!STAILQ_EMPTY(list)) {
 	elm = STAILQ_FIRST(list);
 	STAILQ_REMOVE_HEAD(list, link);
 	free(elm);
     }
 }
 
 static Objlist_Entry *
 objlist_find(Objlist *list, const Obj_Entry *obj)
 {
     Objlist_Entry *elm;
 
     STAILQ_FOREACH(elm, list, link)
 	if (elm->obj == obj)
 	    return elm;
     return NULL;
 }
 
 static void
 objlist_init(Objlist *list)
 {
     STAILQ_INIT(list);
 }
 
 static void
 objlist_push_head(Objlist *list, Obj_Entry *obj)
 {
     Objlist_Entry *elm;
 
     elm = NEW(Objlist_Entry);
     elm->obj = obj;
     STAILQ_INSERT_HEAD(list, elm, link);
 }
 
 static void
 objlist_push_tail(Objlist *list, Obj_Entry *obj)
 {
     Objlist_Entry *elm;
 
     elm = NEW(Objlist_Entry);
     elm->obj = obj;
     STAILQ_INSERT_TAIL(list, elm, link);
 }
 
 static void
 objlist_put_after(Objlist *list, Obj_Entry *listobj, Obj_Entry *obj)
 {
 	Objlist_Entry *elm, *listelm;
 
 	STAILQ_FOREACH(listelm, list, link) {
 		if (listelm->obj == listobj)
 			break;
 	}
 	elm = NEW(Objlist_Entry);
 	elm->obj = obj;
 	if (listelm != NULL)
 		STAILQ_INSERT_AFTER(list, listelm, elm, link);
 	else
 		STAILQ_INSERT_TAIL(list, elm, link);
 }
 
 static void
 objlist_remove(Objlist *list, Obj_Entry *obj)
 {
     Objlist_Entry *elm;
 
     if ((elm = objlist_find(list, obj)) != NULL) {
 	STAILQ_REMOVE(list, elm, Struct_Objlist_Entry, link);
 	free(elm);
     }
 }
 
 /*
  * Relocate dag rooted in the specified object.
  * Returns 0 on success, or -1 on failure.
  */
 
 static int
 relocate_object_dag(Obj_Entry *root, bool bind_now, Obj_Entry *rtldobj,
     int flags, RtldLockState *lockstate)
 {
 	Objlist_Entry *elm;
 	int error;
 
 	error = 0;
 	STAILQ_FOREACH(elm, &root->dagmembers, link) {
 		error = relocate_object(elm->obj, bind_now, rtldobj, flags,
 		    lockstate);
 		if (error == -1)
 			break;
 	}
 	return (error);
 }
 
 /*
  * Prepare for, or clean after, relocating an object marked with
  * DT_TEXTREL or DF_TEXTREL.  Before relocating, all read-only
  * segments are remapped read-write.  After relocations are done, the
  * segment's permissions are returned back to the modes specified in
  * the phdrs.  If any relocation happened, or always for wired
  * program, COW is triggered.
  */
 static int
 reloc_textrel_prot(Obj_Entry *obj, bool before)
 {
 	const Elf_Phdr *ph;
 	void *base;
 	size_t l, sz;
 	int prot;
 
 	for (l = obj->phsize / sizeof(*ph), ph = obj->phdr; l > 0;
 	    l--, ph++) {
 		if (ph->p_type != PT_LOAD || (ph->p_flags & PF_W) != 0)
 			continue;
 		base = obj->relocbase + trunc_page(ph->p_vaddr);
 		sz = round_page(ph->p_vaddr + ph->p_filesz) -
 		    trunc_page(ph->p_vaddr);
 		prot = convert_prot(ph->p_flags) | (before ? PROT_WRITE : 0);
 		if (mprotect(base, sz, prot) == -1) {
 			_rtld_error("%s: Cannot write-%sable text segment: %s",
 			    obj->path, before ? "en" : "dis",
 			    rtld_strerror(errno));
 			return (-1);
 		}
 	}
 	return (0);
 }
 
 /*
  * Relocate single object.
  * Returns 0 on success, or -1 on failure.
  */
 static int
 relocate_object(Obj_Entry *obj, bool bind_now, Obj_Entry *rtldobj,
     int flags, RtldLockState *lockstate)
 {
 
 	if (obj->relocated)
 		return (0);
 	obj->relocated = true;
 	if (obj != rtldobj)
 		dbg("relocating \"%s\"", obj->path);
 
 	if (obj->symtab == NULL || obj->strtab == NULL ||
 	    !(obj->valid_hash_sysv || obj->valid_hash_gnu)) {
 		_rtld_error("%s: Shared object has no run-time symbol table",
 			    obj->path);
 		return (-1);
 	}
 
 	/* There are relocations to the write-protected text segment. */
 	if (obj->textrel && reloc_textrel_prot(obj, true) != 0)
 		return (-1);
 
 	/* Process the non-PLT non-IFUNC relocations. */
 	if (reloc_non_plt(obj, rtldobj, flags, lockstate))
 		return (-1);
 
 	/* Re-protected the text segment. */
 	if (obj->textrel && reloc_textrel_prot(obj, false) != 0)
 		return (-1);
 
 	/* Set the special PLT or GOT entries. */
 	init_pltgot(obj);
 
 	/* Process the PLT relocations. */
 	if (reloc_plt(obj, flags, lockstate) == -1)
 		return (-1);
 	/* Relocate the jump slots if we are doing immediate binding. */
 	if ((obj->bind_now || bind_now) && reloc_jmpslots(obj, flags,
 	    lockstate) == -1)
 		return (-1);
 
 	if (!obj->mainprog && obj_enforce_relro(obj) == -1)
 		return (-1);
 
 	/*
 	 * Set up the magic number and version in the Obj_Entry.  These
 	 * were checked in the crt1.o from the original ElfKit, so we
 	 * set them for backward compatibility.
 	 */
 	obj->magic = RTLD_MAGIC;
 	obj->version = RTLD_VERSION;
 
 	return (0);
 }
 
 /*
  * Relocate newly-loaded shared objects.  The argument is a pointer to
  * the Obj_Entry for the first such object.  All objects from the first
  * to the end of the list of objects are relocated.  Returns 0 on success,
  * or -1 on failure.
  */
 static int
 relocate_objects(Obj_Entry *first, bool bind_now, Obj_Entry *rtldobj,
     int flags, RtldLockState *lockstate)
 {
 	Obj_Entry *obj;
 	int error;
 
 	for (error = 0, obj = first;  obj != NULL;
 	    obj = TAILQ_NEXT(obj, next)) {
 		if (obj->marker)
 			continue;
 		error = relocate_object(obj, bind_now, rtldobj, flags,
 		    lockstate);
 		if (error == -1)
 			break;
 	}
 	return (error);
 }
 
 /*
  * The handling of R_MACHINE_IRELATIVE relocations and jumpslots
  * referencing STT_GNU_IFUNC symbols is postponed till the other
  * relocations are done.  The indirect functions specified as
  * ifunc are allowed to call other symbols, so we need to have
  * objects relocated before asking for resolution from indirects.
  *
  * The R_MACHINE_IRELATIVE slots are resolved in greedy fashion,
  * instead of the usual lazy handling of PLT slots.  It is
  * consistent with how GNU does it.
  */
 static int
 resolve_object_ifunc(Obj_Entry *obj, bool bind_now, int flags,
     RtldLockState *lockstate)
 {
 
 	if (obj->ifuncs_resolved)
 		return (0);
 	obj->ifuncs_resolved = true;
 	if (!obj->irelative && !((obj->bind_now || bind_now) && obj->gnu_ifunc))
 		return (0);
 	if (obj_disable_relro(obj) == -1 ||
 	    (obj->irelative && reloc_iresolve(obj, lockstate) == -1) ||
 	    ((obj->bind_now || bind_now) && obj->gnu_ifunc &&
 	    reloc_gnu_ifunc(obj, flags, lockstate) == -1) ||
 	    obj_enforce_relro(obj) == -1)
 		return (-1);
 	return (0);
 }
 
 static int
 initlist_objects_ifunc(Objlist *list, bool bind_now, int flags,
     RtldLockState *lockstate)
 {
 	Objlist_Entry *elm;
 	Obj_Entry *obj;
 
 	STAILQ_FOREACH(elm, list, link) {
 		obj = elm->obj;
 		if (obj->marker)
 			continue;
 		if (resolve_object_ifunc(obj, bind_now, flags,
 		    lockstate) == -1)
 			return (-1);
 	}
 	return (0);
 }
 
 /*
  * Cleanup procedure.  It will be called (by the atexit mechanism) just
  * before the process exits.
  */
 static void
 rtld_exit(void)
 {
     RtldLockState lockstate;
 
     wlock_acquire(rtld_bind_lock, &lockstate);
     dbg("rtld_exit()");
     objlist_call_fini(&list_fini, NULL, &lockstate);
     /* No need to remove the items from the list, since we are exiting. */
     if (!libmap_disable)
         lm_fini();
     lock_release(rtld_bind_lock, &lockstate);
 }
 
 /*
  * Iterate over a search path, translate each element, and invoke the
  * callback on the result.
  */
 static void *
 path_enumerate(const char *path, path_enum_proc callback,
     const char *refobj_path, void *arg)
 {
     const char *trans;
     if (path == NULL)
 	return (NULL);
 
     path += strspn(path, ":;");
     while (*path != '\0') {
 	size_t len;
 	char  *res;
 
 	len = strcspn(path, ":;");
 	trans = lm_findn(refobj_path, path, len);
 	if (trans)
 	    res = callback(trans, strlen(trans), arg);
 	else
 	    res = callback(path, len, arg);
 
 	if (res != NULL)
 	    return (res);
 
 	path += len;
 	path += strspn(path, ":;");
     }
 
     return (NULL);
 }
 
 struct try_library_args {
     const char	*name;
     size_t	 namelen;
     char	*buffer;
     size_t	 buflen;
     int		 fd;
 };
 
 static void *
 try_library_path(const char *dir, size_t dirlen, void *param)
 {
     struct try_library_args *arg;
     int fd;
 
     arg = param;
     if (*dir == '/' || trust) {
 	char *pathname;
 
 	if (dirlen + 1 + arg->namelen + 1 > arg->buflen)
 		return (NULL);
 
 	pathname = arg->buffer;
 	strncpy(pathname, dir, dirlen);
 	pathname[dirlen] = '/';
 	strcpy(pathname + dirlen + 1, arg->name);
 
 	dbg("  Trying \"%s\"", pathname);
 	fd = open(pathname, O_RDONLY | O_CLOEXEC | O_VERIFY);
 	if (fd >= 0) {
 	    dbg("  Opened \"%s\", fd %d", pathname, fd);
 	    pathname = xmalloc(dirlen + 1 + arg->namelen + 1);
 	    strcpy(pathname, arg->buffer);
 	    arg->fd = fd;
 	    return (pathname);
 	} else {
 	    dbg("  Failed to open \"%s\": %s",
 		pathname, rtld_strerror(errno));
 	}
     }
     return (NULL);
 }
 
 static char *
 search_library_path(const char *name, const char *path,
     const char *refobj_path, int *fdp)
 {
     char *p;
     struct try_library_args arg;
 
     if (path == NULL)
 	return NULL;
 
     arg.name = name;
     arg.namelen = strlen(name);
     arg.buffer = xmalloc(PATH_MAX);
     arg.buflen = PATH_MAX;
     arg.fd = -1;
 
     p = path_enumerate(path, try_library_path, refobj_path, &arg);
     *fdp = arg.fd;
 
     free(arg.buffer);
 
     return (p);
 }
 
 
 /*
  * Finds the library with the given name using the directory descriptors
  * listed in the LD_LIBRARY_PATH_FDS environment variable.
  *
  * Returns a freshly-opened close-on-exec file descriptor for the library,
  * or -1 if the library cannot be found.
  */
 static char *
 search_library_pathfds(const char *name, const char *path, int *fdp)
 {
 	char *envcopy, *fdstr, *found, *last_token;
 	size_t len;
 	int dirfd, fd;
 
 	dbg("%s('%s', '%s', fdp)", __func__, name, path);
 
 	/* Don't load from user-specified libdirs into setuid binaries. */
 	if (!trust)
 		return (NULL);
 
 	/* We can't do anything if LD_LIBRARY_PATH_FDS isn't set. */
 	if (path == NULL)
 		return (NULL);
 
 	/* LD_LIBRARY_PATH_FDS only works with relative paths. */
 	if (name[0] == '/') {
 		dbg("Absolute path (%s) passed to %s", name, __func__);
 		return (NULL);
 	}
 
 	/*
 	 * Use strtok_r() to walk the FD:FD:FD list.  This requires a local
 	 * copy of the path, as strtok_r rewrites separator tokens
 	 * with '\0'.
 	 */
 	found = NULL;
 	envcopy = xstrdup(path);
 	for (fdstr = strtok_r(envcopy, ":", &last_token); fdstr != NULL;
 	    fdstr = strtok_r(NULL, ":", &last_token)) {
 		dirfd = parse_integer(fdstr);
 		if (dirfd < 0) {
 			_rtld_error("failed to parse directory FD: '%s'",
 				fdstr);
 			break;
 		}
 		fd = __sys_openat(dirfd, name, O_RDONLY | O_CLOEXEC | O_VERIFY);
 		if (fd >= 0) {
 			*fdp = fd;
 			len = strlen(fdstr) + strlen(name) + 3;
 			found = xmalloc(len);
 			if (rtld_snprintf(found, len, "#%d/%s", dirfd, name) < 0) {
 				_rtld_error("error generating '%d/%s'",
 				    dirfd, name);
 				rtld_die();
 			}
 			dbg("open('%s') => %d", found, fd);
 			break;
 		}
 	}
 	free(envcopy);
 
 	return (found);
 }
 
 
 int
 dlclose(void *handle)
 {
 	RtldLockState lockstate;
 	int error;
 
 	wlock_acquire(rtld_bind_lock, &lockstate);
 	error = dlclose_locked(handle, &lockstate);
 	lock_release(rtld_bind_lock, &lockstate);
 	return (error);
 }
 
 static int
 dlclose_locked(void *handle, RtldLockState *lockstate)
 {
     Obj_Entry *root;
 
     root = dlcheck(handle);
     if (root == NULL)
 	return -1;
     LD_UTRACE(UTRACE_DLCLOSE_START, handle, NULL, 0, root->dl_refcount,
 	root->path);
 
     /* Unreference the object and its dependencies. */
     root->dl_refcount--;
 
     if (root->refcount == 1) {
 	/*
 	 * The object will be no longer referenced, so we must unload it.
 	 * First, call the fini functions.
 	 */
 	objlist_call_fini(&list_fini, root, lockstate);
 
 	unref_dag(root);
 
 	/* Finish cleaning up the newly-unreferenced objects. */
 	GDB_STATE(RT_DELETE,&root->linkmap);
 	unload_object(root, lockstate);
 	GDB_STATE(RT_CONSISTENT,NULL);
     } else
 	unref_dag(root);
 
     LD_UTRACE(UTRACE_DLCLOSE_STOP, handle, NULL, 0, 0, NULL);
     return 0;
 }
 
 char *
 dlerror(void)
 {
     char *msg = error_message;
     error_message = NULL;
     return msg;
 }
 
 /*
  * This function is deprecated and has no effect.
  */
 void
 dllockinit(void *context,
     void *(*_lock_create)(void *context) __unused,
     void (*_rlock_acquire)(void *lock) __unused,
     void (*_wlock_acquire)(void *lock)  __unused,
     void (*_lock_release)(void *lock) __unused,
     void (*_lock_destroy)(void *lock) __unused,
     void (*context_destroy)(void *context))
 {
     static void *cur_context;
     static void (*cur_context_destroy)(void *);
 
     /* Just destroy the context from the previous call, if necessary. */
     if (cur_context_destroy != NULL)
 	cur_context_destroy(cur_context);
     cur_context = context;
     cur_context_destroy = context_destroy;
 }
 
 void *
 dlopen(const char *name, int mode)
 {
 
 	return (rtld_dlopen(name, -1, mode));
 }
 
 void *
 fdlopen(int fd, int mode)
 {
 
 	return (rtld_dlopen(NULL, fd, mode));
 }
 
 static void *
 rtld_dlopen(const char *name, int fd, int mode)
 {
     RtldLockState lockstate;
     int lo_flags;
 
     LD_UTRACE(UTRACE_DLOPEN_START, NULL, NULL, 0, mode, name);
     ld_tracing = (mode & RTLD_TRACE) == 0 ? NULL : "1";
     if (ld_tracing != NULL) {
 	rlock_acquire(rtld_bind_lock, &lockstate);
 	if (sigsetjmp(lockstate.env, 0) != 0)
 	    lock_upgrade(rtld_bind_lock, &lockstate);
 	environ = __DECONST(char **, *get_program_var_addr("environ", &lockstate));
 	lock_release(rtld_bind_lock, &lockstate);
     }
     lo_flags = RTLD_LO_DLOPEN;
     if (mode & RTLD_NODELETE)
 	    lo_flags |= RTLD_LO_NODELETE;
     if (mode & RTLD_NOLOAD)
 	    lo_flags |= RTLD_LO_NOLOAD;
     if (ld_tracing != NULL)
 	    lo_flags |= RTLD_LO_TRACE;
 
     return (dlopen_object(name, fd, obj_main, lo_flags,
       mode & (RTLD_MODEMASK | RTLD_GLOBAL), NULL));
 }
 
 static void
 dlopen_cleanup(Obj_Entry *obj, RtldLockState *lockstate)
 {
 
 	obj->dl_refcount--;
 	unref_dag(obj);
 	if (obj->refcount == 0)
 		unload_object(obj, lockstate);
 }
 
 static Obj_Entry *
 dlopen_object(const char *name, int fd, Obj_Entry *refobj, int lo_flags,
     int mode, RtldLockState *lockstate)
 {
     Obj_Entry *old_obj_tail;
     Obj_Entry *obj;
     Objlist initlist;
     RtldLockState mlockstate;
     int result;
 
     objlist_init(&initlist);
 
     if (lockstate == NULL && !(lo_flags & RTLD_LO_EARLY)) {
 	wlock_acquire(rtld_bind_lock, &mlockstate);
 	lockstate = &mlockstate;
     }
     GDB_STATE(RT_ADD,NULL);
 
     old_obj_tail = globallist_curr(TAILQ_LAST(&obj_list, obj_entry_q));
     obj = NULL;
     if (name == NULL && fd == -1) {
 	obj = obj_main;
 	obj->refcount++;
     } else {
 	obj = load_object(name, fd, refobj, lo_flags);
     }
 
     if (obj) {
 	obj->dl_refcount++;
 	if (mode & RTLD_GLOBAL && objlist_find(&list_global, obj) == NULL)
 	    objlist_push_tail(&list_global, obj);
 	if (globallist_next(old_obj_tail) != NULL) {
 	    /* We loaded something new. */
 	    assert(globallist_next(old_obj_tail) == obj);
-	    result = load_needed_objects(obj,
-		lo_flags & (RTLD_LO_DLOPEN | RTLD_LO_EARLY));
+	    result = 0;
+	    if ((lo_flags & RTLD_LO_EARLY) == 0 && obj->static_tls &&
+	      !allocate_tls_offset(obj)) {
+		_rtld_error("%s: No space available "
+		  "for static Thread Local Storage", obj->path);
+		result = -1;
+	    }
+	    if (result != -1)
+		result = load_needed_objects(obj, lo_flags & (RTLD_LO_DLOPEN |
+		    RTLD_LO_EARLY));
 	    init_dag(obj);
 	    ref_dag(obj);
 	    if (result != -1)
 		result = rtld_verify_versions(&obj->dagmembers);
 	    if (result != -1 && ld_tracing)
 		goto trace;
 	    if (result == -1 || relocate_object_dag(obj,
 	      (mode & RTLD_MODEMASK) == RTLD_NOW, &obj_rtld,
 	      (lo_flags & RTLD_LO_EARLY) ? SYMLOOK_EARLY : 0,
 	      lockstate) == -1) {
 		dlopen_cleanup(obj, lockstate);
 		obj = NULL;
 	    } else if (lo_flags & RTLD_LO_EARLY) {
 		/*
 		 * Do not call the init functions for early loaded
 		 * filtees.  The image is still not initialized enough
 		 * for them to work.
 		 *
 		 * Our object is found by the global object list and
 		 * will be ordered among all init calls done right
 		 * before transferring control to main.
 		 */
 	    } else {
 		/* Make list of init functions to call. */
 		initlist_add_objects(obj, obj, &initlist);
 	    }
 	    /*
 	     * Process all no_delete or global objects here, given
 	     * them own DAGs to prevent their dependencies from being
 	     * unloaded.  This has to be done after we have loaded all
 	     * of the dependencies, so that we do not miss any.
 	     */
 	    if (obj != NULL)
 		process_z(obj);
 	} else {
 	    /*
 	     * Bump the reference counts for objects on this DAG.  If
 	     * this is the first dlopen() call for the object that was
 	     * already loaded as a dependency, initialize the dag
 	     * starting at it.
 	     */
 	    init_dag(obj);
 	    ref_dag(obj);
 
 	    if ((lo_flags & RTLD_LO_TRACE) != 0)
 		goto trace;
 	}
 	if (obj != NULL && ((lo_flags & RTLD_LO_NODELETE) != 0 ||
 	  obj->z_nodelete) && !obj->ref_nodel) {
 	    dbg("obj %s nodelete", obj->path);
 	    ref_dag(obj);
 	    obj->z_nodelete = obj->ref_nodel = true;
 	}
     }
 
     LD_UTRACE(UTRACE_DLOPEN_STOP, obj, NULL, 0, obj ? obj->dl_refcount : 0,
 	name);
     GDB_STATE(RT_CONSISTENT,obj ? &obj->linkmap : NULL);
 
-    if (!(lo_flags & RTLD_LO_EARLY)) {
+    if ((lo_flags & RTLD_LO_EARLY) == 0) {
 	map_stacks_exec(lockstate);
+	if (obj != NULL)
+	    distribute_static_tls(&initlist, lockstate);
     }
 
     if (initlist_objects_ifunc(&initlist, (mode & RTLD_MODEMASK) == RTLD_NOW,
       (lo_flags & RTLD_LO_EARLY) ? SYMLOOK_EARLY : 0,
       lockstate) == -1) {
 	objlist_clear(&initlist);
 	dlopen_cleanup(obj, lockstate);
 	if (lockstate == &mlockstate)
 	    lock_release(rtld_bind_lock, lockstate);
 	return (NULL);
     }
 
     if (!(lo_flags & RTLD_LO_EARLY)) {
 	/* Call the init functions. */
 	objlist_call_init(&initlist, lockstate);
     }
     objlist_clear(&initlist);
     if (lockstate == &mlockstate)
 	lock_release(rtld_bind_lock, lockstate);
     return obj;
 trace:
     trace_loaded_objects(obj);
     if (lockstate == &mlockstate)
 	lock_release(rtld_bind_lock, lockstate);
     exit(0);
 }
 
 static void *
 do_dlsym(void *handle, const char *name, void *retaddr, const Ver_Entry *ve,
     int flags)
 {
     DoneList donelist;
     const Obj_Entry *obj, *defobj;
     const Elf_Sym *def;
     SymLook req;
     RtldLockState lockstate;
     tls_index ti;
     void *sym;
     int res;
 
     def = NULL;
     defobj = NULL;
     symlook_init(&req, name);
     req.ventry = ve;
     req.flags = flags | SYMLOOK_IN_PLT;
     req.lockstate = &lockstate;
 
     LD_UTRACE(UTRACE_DLSYM_START, handle, NULL, 0, 0, name);
     rlock_acquire(rtld_bind_lock, &lockstate);
     if (sigsetjmp(lockstate.env, 0) != 0)
 	    lock_upgrade(rtld_bind_lock, &lockstate);
     if (handle == NULL || handle == RTLD_NEXT ||
 	handle == RTLD_DEFAULT || handle == RTLD_SELF) {
 
 	if ((obj = obj_from_addr(retaddr)) == NULL) {
 	    _rtld_error("Cannot determine caller's shared object");
 	    lock_release(rtld_bind_lock, &lockstate);
 	    LD_UTRACE(UTRACE_DLSYM_STOP, handle, NULL, 0, 0, name);
 	    return NULL;
 	}
 	if (handle == NULL) {	/* Just the caller's shared object. */
 	    res = symlook_obj(&req, obj);
 	    if (res == 0) {
 		def = req.sym_out;
 		defobj = req.defobj_out;
 	    }
 	} else if (handle == RTLD_NEXT || /* Objects after caller's */
 		   handle == RTLD_SELF) { /* ... caller included */
 	    if (handle == RTLD_NEXT)
 		obj = globallist_next(obj);
 	    for (; obj != NULL; obj = TAILQ_NEXT(obj, next)) {
 		if (obj->marker)
 		    continue;
 		res = symlook_obj(&req, obj);
 		if (res == 0) {
 		    if (def == NULL ||
 		      ELF_ST_BIND(req.sym_out->st_info) != STB_WEAK) {
 			def = req.sym_out;
 			defobj = req.defobj_out;
 			if (ELF_ST_BIND(def->st_info) != STB_WEAK)
 			    break;
 		    }
 		}
 	    }
 	    /*
 	     * Search the dynamic linker itself, and possibly resolve the
 	     * symbol from there.  This is how the application links to
 	     * dynamic linker services such as dlopen.
 	     */
 	    if (def == NULL || ELF_ST_BIND(def->st_info) == STB_WEAK) {
 		res = symlook_obj(&req, &obj_rtld);
 		if (res == 0) {
 		    def = req.sym_out;
 		    defobj = req.defobj_out;
 		}
 	    }
 	} else {
 	    assert(handle == RTLD_DEFAULT);
 	    res = symlook_default(&req, obj);
 	    if (res == 0) {
 		defobj = req.defobj_out;
 		def = req.sym_out;
 	    }
 	}
     } else {
 	if ((obj = dlcheck(handle)) == NULL) {
 	    lock_release(rtld_bind_lock, &lockstate);
 	    LD_UTRACE(UTRACE_DLSYM_STOP, handle, NULL, 0, 0, name);
 	    return NULL;
 	}
 
 	donelist_init(&donelist);
 	if (obj->mainprog) {
             /* Handle obtained by dlopen(NULL, ...) implies global scope. */
 	    res = symlook_global(&req, &donelist);
 	    if (res == 0) {
 		def = req.sym_out;
 		defobj = req.defobj_out;
 	    }
 	    /*
 	     * Search the dynamic linker itself, and possibly resolve the
 	     * symbol from there.  This is how the application links to
 	     * dynamic linker services such as dlopen.
 	     */
 	    if (def == NULL || ELF_ST_BIND(def->st_info) == STB_WEAK) {
 		res = symlook_obj(&req, &obj_rtld);
 		if (res == 0) {
 		    def = req.sym_out;
 		    defobj = req.defobj_out;
 		}
 	    }
 	}
 	else {
 	    /* Search the whole DAG rooted at the given object. */
 	    res = symlook_list(&req, &obj->dagmembers, &donelist);
 	    if (res == 0) {
 		def = req.sym_out;
 		defobj = req.defobj_out;
 	    }
 	}
     }
 
     if (def != NULL) {
 	lock_release(rtld_bind_lock, &lockstate);
 
 	/*
 	 * The value required by the caller is derived from the value
 	 * of the symbol. this is simply the relocated value of the
 	 * symbol.
 	 */
 	if (ELF_ST_TYPE(def->st_info) == STT_FUNC)
 	    sym = make_function_pointer(def, defobj);
 	else if (ELF_ST_TYPE(def->st_info) == STT_GNU_IFUNC)
 	    sym = rtld_resolve_ifunc(defobj, def);
 	else if (ELF_ST_TYPE(def->st_info) == STT_TLS) {
 	    ti.ti_module = defobj->tlsindex;
 	    ti.ti_offset = def->st_value;
 	    sym = __tls_get_addr(&ti);
 	} else
 	    sym = defobj->relocbase + def->st_value;
 	LD_UTRACE(UTRACE_DLSYM_STOP, handle, sym, 0, 0, name);
 	return (sym);
     }
 
     _rtld_error("Undefined symbol \"%s%s%s\"", name, ve != NULL ? "@" : "",
       ve != NULL ? ve->name : "");
     lock_release(rtld_bind_lock, &lockstate);
     LD_UTRACE(UTRACE_DLSYM_STOP, handle, NULL, 0, 0, name);
     return NULL;
 }
 
 void *
 dlsym(void *handle, const char *name)
 {
 	return do_dlsym(handle, name, __builtin_return_address(0), NULL,
 	    SYMLOOK_DLSYM);
 }
 
 dlfunc_t
 dlfunc(void *handle, const char *name)
 {
 	union {
 		void *d;
 		dlfunc_t f;
 	} rv;
 
 	rv.d = do_dlsym(handle, name, __builtin_return_address(0), NULL,
 	    SYMLOOK_DLSYM);
 	return (rv.f);
 }
 
 void *
 dlvsym(void *handle, const char *name, const char *version)
 {
 	Ver_Entry ventry;
 
 	ventry.name = version;
 	ventry.file = NULL;
 	ventry.hash = elf_hash(version);
 	ventry.flags= 0;
 	return do_dlsym(handle, name, __builtin_return_address(0), &ventry,
 	    SYMLOOK_DLSYM);
 }
 
 int
 _rtld_addr_phdr(const void *addr, struct dl_phdr_info *phdr_info)
 {
     const Obj_Entry *obj;
     RtldLockState lockstate;
 
     rlock_acquire(rtld_bind_lock, &lockstate);
     obj = obj_from_addr(addr);
     if (obj == NULL) {
         _rtld_error("No shared object contains address");
 	lock_release(rtld_bind_lock, &lockstate);
         return (0);
     }
     rtld_fill_dl_phdr_info(obj, phdr_info);
     lock_release(rtld_bind_lock, &lockstate);
     return (1);
 }
 
 int
 dladdr(const void *addr, Dl_info *info)
 {
     const Obj_Entry *obj;
     const Elf_Sym *def;
     void *symbol_addr;
     unsigned long symoffset;
     RtldLockState lockstate;
 
     rlock_acquire(rtld_bind_lock, &lockstate);
     obj = obj_from_addr(addr);
     if (obj == NULL) {
         _rtld_error("No shared object contains address");
 	lock_release(rtld_bind_lock, &lockstate);
         return 0;
     }
     info->dli_fname = obj->path;
     info->dli_fbase = obj->mapbase;
     info->dli_saddr = (void *)0;
     info->dli_sname = NULL;
 
     /*
      * Walk the symbol list looking for the symbol whose address is
      * closest to the address sent in.
      */
     for (symoffset = 0; symoffset < obj->dynsymcount; symoffset++) {
         def = obj->symtab + symoffset;
 
         /*
          * For skip the symbol if st_shndx is either SHN_UNDEF or
          * SHN_COMMON.
          */
         if (def->st_shndx == SHN_UNDEF || def->st_shndx == SHN_COMMON)
             continue;
 
         /*
          * If the symbol is greater than the specified address, or if it
          * is further away from addr than the current nearest symbol,
          * then reject it.
          */
         symbol_addr = obj->relocbase + def->st_value;
         if (symbol_addr > addr || symbol_addr < info->dli_saddr)
             continue;
 
         /* Update our idea of the nearest symbol. */
         info->dli_sname = obj->strtab + def->st_name;
         info->dli_saddr = symbol_addr;
 
         /* Exact match? */
         if (info->dli_saddr == addr)
             break;
     }
     lock_release(rtld_bind_lock, &lockstate);
     return 1;
 }
 
 int
 dlinfo(void *handle, int request, void *p)
 {
     const Obj_Entry *obj;
     RtldLockState lockstate;
     int error;
 
     rlock_acquire(rtld_bind_lock, &lockstate);
 
     if (handle == NULL || handle == RTLD_SELF) {
 	void *retaddr;
 
 	retaddr = __builtin_return_address(0);	/* __GNUC__ only */
 	if ((obj = obj_from_addr(retaddr)) == NULL)
 	    _rtld_error("Cannot determine caller's shared object");
     } else
 	obj = dlcheck(handle);
 
     if (obj == NULL) {
 	lock_release(rtld_bind_lock, &lockstate);
 	return (-1);
     }
 
     error = 0;
     switch (request) {
     case RTLD_DI_LINKMAP:
 	*((struct link_map const **)p) = &obj->linkmap;
 	break;
     case RTLD_DI_ORIGIN:
 	error = rtld_dirname(obj->path, p);
 	break;
 
     case RTLD_DI_SERINFOSIZE:
     case RTLD_DI_SERINFO:
 	error = do_search_info(obj, request, (struct dl_serinfo *)p);
 	break;
 
     default:
 	_rtld_error("Invalid request %d passed to dlinfo()", request);
 	error = -1;
     }
 
     lock_release(rtld_bind_lock, &lockstate);
 
     return (error);
 }
 
 static void
 rtld_fill_dl_phdr_info(const Obj_Entry *obj, struct dl_phdr_info *phdr_info)
 {
 
 	phdr_info->dlpi_addr = (Elf_Addr)obj->relocbase;
 	phdr_info->dlpi_name = obj->path;
 	phdr_info->dlpi_phdr = obj->phdr;
 	phdr_info->dlpi_phnum = obj->phsize / sizeof(obj->phdr[0]);
 	phdr_info->dlpi_tls_modid = obj->tlsindex;
 	phdr_info->dlpi_tls_data = obj->tlsinit;
 	phdr_info->dlpi_adds = obj_loads;
 	phdr_info->dlpi_subs = obj_loads - obj_count;
 }
 
 int
 dl_iterate_phdr(__dl_iterate_hdr_callback callback, void *param)
 {
 	struct dl_phdr_info phdr_info;
 	Obj_Entry *obj, marker;
 	RtldLockState bind_lockstate, phdr_lockstate;
 	int error;
 
 	init_marker(&marker);
 	error = 0;
 
 	wlock_acquire(rtld_phdr_lock, &phdr_lockstate);
 	wlock_acquire(rtld_bind_lock, &bind_lockstate);
 	for (obj = globallist_curr(TAILQ_FIRST(&obj_list)); obj != NULL;) {
 		TAILQ_INSERT_AFTER(&obj_list, obj, &marker, next);
 		rtld_fill_dl_phdr_info(obj, &phdr_info);
 		hold_object(obj);
 		lock_release(rtld_bind_lock, &bind_lockstate);
 
 		error = callback(&phdr_info, sizeof phdr_info, param);
 
 		wlock_acquire(rtld_bind_lock, &bind_lockstate);
 		unhold_object(obj);
 		obj = globallist_next(&marker);
 		TAILQ_REMOVE(&obj_list, &marker, next);
 		if (error != 0) {
 			lock_release(rtld_bind_lock, &bind_lockstate);
 			lock_release(rtld_phdr_lock, &phdr_lockstate);
 			return (error);
 		}
 	}
 
 	if (error == 0) {
 		rtld_fill_dl_phdr_info(&obj_rtld, &phdr_info);
 		lock_release(rtld_bind_lock, &bind_lockstate);
 		error = callback(&phdr_info, sizeof(phdr_info), param);
 	}
 	lock_release(rtld_phdr_lock, &phdr_lockstate);
 	return (error);
 }
 
 static void *
 fill_search_info(const char *dir, size_t dirlen, void *param)
 {
     struct fill_search_info_args *arg;
 
     arg = param;
 
     if (arg->request == RTLD_DI_SERINFOSIZE) {
 	arg->serinfo->dls_cnt ++;
 	arg->serinfo->dls_size += sizeof(struct dl_serpath) + dirlen + 1;
     } else {
 	struct dl_serpath *s_entry;
 
 	s_entry = arg->serpath;
 	s_entry->dls_name  = arg->strspace;
 	s_entry->dls_flags = arg->flags;
 
 	strncpy(arg->strspace, dir, dirlen);
 	arg->strspace[dirlen] = '\0';
 
 	arg->strspace += dirlen + 1;
 	arg->serpath++;
     }
 
     return (NULL);
 }
 
 static int
 do_search_info(const Obj_Entry *obj, int request, struct dl_serinfo *info)
 {
     struct dl_serinfo _info;
     struct fill_search_info_args args;
 
     args.request = RTLD_DI_SERINFOSIZE;
     args.serinfo = &_info;
 
     _info.dls_size = __offsetof(struct dl_serinfo, dls_serpath);
     _info.dls_cnt  = 0;
 
     path_enumerate(obj->rpath, fill_search_info, NULL, &args);
     path_enumerate(ld_library_path, fill_search_info, NULL, &args);
     path_enumerate(obj->runpath, fill_search_info, NULL, &args);
     path_enumerate(gethints(obj->z_nodeflib), fill_search_info, NULL, &args);
     if (!obj->z_nodeflib)
       path_enumerate(ld_standard_library_path, fill_search_info, NULL, &args);
 
 
     if (request == RTLD_DI_SERINFOSIZE) {
 	info->dls_size = _info.dls_size;
 	info->dls_cnt = _info.dls_cnt;
 	return (0);
     }
 
     if (info->dls_cnt != _info.dls_cnt || info->dls_size != _info.dls_size) {
 	_rtld_error("Uninitialized Dl_serinfo struct passed to dlinfo()");
 	return (-1);
     }
 
     args.request  = RTLD_DI_SERINFO;
     args.serinfo  = info;
     args.serpath  = &info->dls_serpath[0];
     args.strspace = (char *)&info->dls_serpath[_info.dls_cnt];
 
     args.flags = LA_SER_RUNPATH;
     if (path_enumerate(obj->rpath, fill_search_info, NULL, &args) != NULL)
 	return (-1);
 
     args.flags = LA_SER_LIBPATH;
     if (path_enumerate(ld_library_path, fill_search_info, NULL, &args) != NULL)
 	return (-1);
 
     args.flags = LA_SER_RUNPATH;
     if (path_enumerate(obj->runpath, fill_search_info, NULL, &args) != NULL)
 	return (-1);
 
     args.flags = LA_SER_CONFIG;
     if (path_enumerate(gethints(obj->z_nodeflib), fill_search_info, NULL, &args)
       != NULL)
 	return (-1);
 
     args.flags = LA_SER_DEFAULT;
     if (!obj->z_nodeflib && path_enumerate(ld_standard_library_path,
       fill_search_info, NULL, &args) != NULL)
 	return (-1);
     return (0);
 }
 
 static int
 rtld_dirname(const char *path, char *bname)
 {
     const char *endp;
 
     /* Empty or NULL string gets treated as "." */
     if (path == NULL || *path == '\0') {
 	bname[0] = '.';
 	bname[1] = '\0';
 	return (0);
     }
 
     /* Strip trailing slashes */
     endp = path + strlen(path) - 1;
     while (endp > path && *endp == '/')
 	endp--;
 
     /* Find the start of the dir */
     while (endp > path && *endp != '/')
 	endp--;
 
     /* Either the dir is "/" or there are no slashes */
     if (endp == path) {
 	bname[0] = *endp == '/' ? '/' : '.';
 	bname[1] = '\0';
 	return (0);
     } else {
 	do {
 	    endp--;
 	} while (endp > path && *endp == '/');
     }
 
     if (endp - path + 2 > PATH_MAX)
     {
 	_rtld_error("Filename is too long: %s", path);
 	return(-1);
     }
 
     strncpy(bname, path, endp - path + 1);
     bname[endp - path + 1] = '\0';
     return (0);
 }
 
 static int
 rtld_dirname_abs(const char *path, char *base)
 {
 	char *last;
 
 	if (realpath(path, base) == NULL)
 		return (-1);
 	dbg("%s -> %s", path, base);
 	last = strrchr(base, '/');
 	if (last == NULL)
 		return (-1);
 	if (last != base)
 		*last = '\0';
 	return (0);
 }
 
 static void
 linkmap_add(Obj_Entry *obj)
 {
     struct link_map *l = &obj->linkmap;
     struct link_map *prev;
 
     obj->linkmap.l_name = obj->path;
     obj->linkmap.l_addr = obj->mapbase;
     obj->linkmap.l_ld = obj->dynamic;
 #ifdef __mips__
     /* GDB needs load offset on MIPS to use the symbols */
     obj->linkmap.l_offs = obj->relocbase;
 #endif
 
     if (r_debug.r_map == NULL) {
 	r_debug.r_map = l;
 	return;
     }
 
     /*
      * Scan to the end of the list, but not past the entry for the
      * dynamic linker, which we want to keep at the very end.
      */
     for (prev = r_debug.r_map;
       prev->l_next != NULL && prev->l_next != &obj_rtld.linkmap;
       prev = prev->l_next)
 	;
 
     /* Link in the new entry. */
     l->l_prev = prev;
     l->l_next = prev->l_next;
     if (l->l_next != NULL)
 	l->l_next->l_prev = l;
     prev->l_next = l;
 }
 
 static void
 linkmap_delete(Obj_Entry *obj)
 {
     struct link_map *l = &obj->linkmap;
 
     if (l->l_prev == NULL) {
 	if ((r_debug.r_map = l->l_next) != NULL)
 	    l->l_next->l_prev = NULL;
 	return;
     }
 
     if ((l->l_prev->l_next = l->l_next) != NULL)
 	l->l_next->l_prev = l->l_prev;
 }
 
 /*
  * Function for the debugger to set a breakpoint on to gain control.
  *
  * The two parameters allow the debugger to easily find and determine
  * what the runtime loader is doing and to whom it is doing it.
  *
  * When the loadhook trap is hit (r_debug_state, set at program
  * initialization), the arguments can be found on the stack:
  *
  *  +8   struct link_map *m
  *  +4   struct r_debug  *rd
  *  +0   RetAddr
  */
 void
 r_debug_state(struct r_debug* rd __unused, struct link_map *m  __unused)
 {
     /*
      * The following is a hack to force the compiler to emit calls to
      * this function, even when optimizing.  If the function is empty,
      * the compiler is not obliged to emit any code for calls to it,
      * even when marked __noinline.  However, gdb depends on those
      * calls being made.
      */
     __compiler_membar();
 }
 
 /*
  * A function called after init routines have completed. This can be used to
  * break before a program's entry routine is called, and can be used when
  * main is not available in the symbol table.
  */
 void
 _r_debug_postinit(struct link_map *m __unused)
 {
 
 	/* See r_debug_state(). */
 	__compiler_membar();
 }
 
 static void
 release_object(Obj_Entry *obj)
 {
 
 	if (obj->holdcount > 0) {
 		obj->unholdfree = true;
 		return;
 	}
 	munmap(obj->mapbase, obj->mapsize);
 	linkmap_delete(obj);
 	obj_free(obj);
 }
 
 /*
  * Get address of the pointer variable in the main program.
  * Prefer non-weak symbol over the weak one.
  */
 static const void **
 get_program_var_addr(const char *name, RtldLockState *lockstate)
 {
     SymLook req;
     DoneList donelist;
 
     symlook_init(&req, name);
     req.lockstate = lockstate;
     donelist_init(&donelist);
     if (symlook_global(&req, &donelist) != 0)
 	return (NULL);
     if (ELF_ST_TYPE(req.sym_out->st_info) == STT_FUNC)
 	return ((const void **)make_function_pointer(req.sym_out,
 	  req.defobj_out));
     else if (ELF_ST_TYPE(req.sym_out->st_info) == STT_GNU_IFUNC)
 	return ((const void **)rtld_resolve_ifunc(req.defobj_out, req.sym_out));
     else
 	return ((const void **)(req.defobj_out->relocbase +
 	  req.sym_out->st_value));
 }
 
 /*
  * Set a pointer variable in the main program to the given value.  This
  * is used to set key variables such as "environ" before any of the
  * init functions are called.
  */
 static void
 set_program_var(const char *name, const void *value)
 {
     const void **addr;
 
     if ((addr = get_program_var_addr(name, NULL)) != NULL) {
 	dbg("\"%s\": *%p <-- %p", name, addr, value);
 	*addr = value;
     }
 }
 
 /*
  * Search the global objects, including dependencies and main object,
  * for the given symbol.
  */
 static int
 symlook_global(SymLook *req, DoneList *donelist)
 {
     SymLook req1;
     const Objlist_Entry *elm;
     int res;
 
     symlook_init_from_req(&req1, req);
 
     /* Search all objects loaded at program start up. */
     if (req->defobj_out == NULL ||
       ELF_ST_BIND(req->sym_out->st_info) == STB_WEAK) {
 	res = symlook_list(&req1, &list_main, donelist);
 	if (res == 0 && (req->defobj_out == NULL ||
 	  ELF_ST_BIND(req1.sym_out->st_info) != STB_WEAK)) {
 	    req->sym_out = req1.sym_out;
 	    req->defobj_out = req1.defobj_out;
 	    assert(req->defobj_out != NULL);
 	}
     }
 
     /* Search all DAGs whose roots are RTLD_GLOBAL objects. */
     STAILQ_FOREACH(elm, &list_global, link) {
 	if (req->defobj_out != NULL &&
 	  ELF_ST_BIND(req->sym_out->st_info) != STB_WEAK)
 	    break;
 	res = symlook_list(&req1, &elm->obj->dagmembers, donelist);
 	if (res == 0 && (req->defobj_out == NULL ||
 	  ELF_ST_BIND(req1.sym_out->st_info) != STB_WEAK)) {
 	    req->sym_out = req1.sym_out;
 	    req->defobj_out = req1.defobj_out;
 	    assert(req->defobj_out != NULL);
 	}
     }
 
     return (req->sym_out != NULL ? 0 : ESRCH);
 }
 
 /*
  * Given a symbol name in a referencing object, find the corresponding
  * definition of the symbol.  Returns a pointer to the symbol, or NULL if
  * no definition was found.  Returns a pointer to the Obj_Entry of the
  * defining object via the reference parameter DEFOBJ_OUT.
  */
 static int
 symlook_default(SymLook *req, const Obj_Entry *refobj)
 {
     DoneList donelist;
     const Objlist_Entry *elm;
     SymLook req1;
     int res;
 
     donelist_init(&donelist);
     symlook_init_from_req(&req1, req);
 
     /*
      * Look first in the referencing object if linked symbolically,
      * and similarly handle protected symbols.
      */
     res = symlook_obj(&req1, refobj);
     if (res == 0 && (refobj->symbolic ||
       ELF_ST_VISIBILITY(req1.sym_out->st_other) == STV_PROTECTED)) {
 	req->sym_out = req1.sym_out;
 	req->defobj_out = req1.defobj_out;
 	assert(req->defobj_out != NULL);
     }
     if (refobj->symbolic || req->defobj_out != NULL)
 	donelist_check(&donelist, refobj);
 
     symlook_global(req, &donelist);
 
     /* Search all dlopened DAGs containing the referencing object. */
     STAILQ_FOREACH(elm, &refobj->dldags, link) {
 	if (req->sym_out != NULL &&
 	  ELF_ST_BIND(req->sym_out->st_info) != STB_WEAK)
 	    break;
 	res = symlook_list(&req1, &elm->obj->dagmembers, &donelist);
 	if (res == 0 && (req->sym_out == NULL ||
 	  ELF_ST_BIND(req1.sym_out->st_info) != STB_WEAK)) {
 	    req->sym_out = req1.sym_out;
 	    req->defobj_out = req1.defobj_out;
 	    assert(req->defobj_out != NULL);
 	}
     }
 
     /*
      * Search the dynamic linker itself, and possibly resolve the
      * symbol from there.  This is how the application links to
      * dynamic linker services such as dlopen.
      */
     if (req->sym_out == NULL ||
       ELF_ST_BIND(req->sym_out->st_info) == STB_WEAK) {
 	res = symlook_obj(&req1, &obj_rtld);
 	if (res == 0) {
 	    req->sym_out = req1.sym_out;
 	    req->defobj_out = req1.defobj_out;
 	    assert(req->defobj_out != NULL);
 	}
     }
 
     return (req->sym_out != NULL ? 0 : ESRCH);
 }
 
 static int
 symlook_list(SymLook *req, const Objlist *objlist, DoneList *dlp)
 {
     const Elf_Sym *def;
     const Obj_Entry *defobj;
     const Objlist_Entry *elm;
     SymLook req1;
     int res;
 
     def = NULL;
     defobj = NULL;
     STAILQ_FOREACH(elm, objlist, link) {
 	if (donelist_check(dlp, elm->obj))
 	    continue;
 	symlook_init_from_req(&req1, req);
 	if ((res = symlook_obj(&req1, elm->obj)) == 0) {
 	    if (def == NULL || ELF_ST_BIND(req1.sym_out->st_info) != STB_WEAK) {
 		def = req1.sym_out;
 		defobj = req1.defobj_out;
 		if (ELF_ST_BIND(def->st_info) != STB_WEAK)
 		    break;
 	    }
 	}
     }
     if (def != NULL) {
 	req->sym_out = def;
 	req->defobj_out = defobj;
 	return (0);
     }
     return (ESRCH);
 }
 
 /*
  * Search the chain of DAGS cointed to by the given Needed_Entry
  * for a symbol of the given name.  Each DAG is scanned completely
  * before advancing to the next one.  Returns a pointer to the symbol,
  * or NULL if no definition was found.
  */
 static int
 symlook_needed(SymLook *req, const Needed_Entry *needed, DoneList *dlp)
 {
     const Elf_Sym *def;
     const Needed_Entry *n;
     const Obj_Entry *defobj;
     SymLook req1;
     int res;
 
     def = NULL;
     defobj = NULL;
     symlook_init_from_req(&req1, req);
     for (n = needed; n != NULL; n = n->next) {
 	if (n->obj == NULL ||
 	    (res = symlook_list(&req1, &n->obj->dagmembers, dlp)) != 0)
 	    continue;
 	if (def == NULL || ELF_ST_BIND(req1.sym_out->st_info) != STB_WEAK) {
 	    def = req1.sym_out;
 	    defobj = req1.defobj_out;
 	    if (ELF_ST_BIND(def->st_info) != STB_WEAK)
 		break;
 	}
     }
     if (def != NULL) {
 	req->sym_out = def;
 	req->defobj_out = defobj;
 	return (0);
     }
     return (ESRCH);
 }
 
 /*
  * Search the symbol table of a single shared object for a symbol of
  * the given name and version, if requested.  Returns a pointer to the
  * symbol, or NULL if no definition was found.  If the object is
  * filter, return filtered symbol from filtee.
  *
  * The symbol's hash value is passed in for efficiency reasons; that
  * eliminates many recomputations of the hash value.
  */
 int
 symlook_obj(SymLook *req, const Obj_Entry *obj)
 {
     DoneList donelist;
     SymLook req1;
     int flags, res, mres;
 
     /*
      * If there is at least one valid hash at this point, we prefer to
      * use the faster GNU version if available.
      */
     if (obj->valid_hash_gnu)
 	mres = symlook_obj1_gnu(req, obj);
     else if (obj->valid_hash_sysv)
 	mres = symlook_obj1_sysv(req, obj);
     else
 	return (EINVAL);
 
     if (mres == 0) {
 	if (obj->needed_filtees != NULL) {
 	    flags = (req->flags & SYMLOOK_EARLY) ? RTLD_LO_EARLY : 0;
 	    load_filtees(__DECONST(Obj_Entry *, obj), flags, req->lockstate);
 	    donelist_init(&donelist);
 	    symlook_init_from_req(&req1, req);
 	    res = symlook_needed(&req1, obj->needed_filtees, &donelist);
 	    if (res == 0) {
 		req->sym_out = req1.sym_out;
 		req->defobj_out = req1.defobj_out;
 	    }
 	    return (res);
 	}
 	if (obj->needed_aux_filtees != NULL) {
 	    flags = (req->flags & SYMLOOK_EARLY) ? RTLD_LO_EARLY : 0;
 	    load_filtees(__DECONST(Obj_Entry *, obj), flags, req->lockstate);
 	    donelist_init(&donelist);
 	    symlook_init_from_req(&req1, req);
 	    res = symlook_needed(&req1, obj->needed_aux_filtees, &donelist);
 	    if (res == 0) {
 		req->sym_out = req1.sym_out;
 		req->defobj_out = req1.defobj_out;
 		return (res);
 	    }
 	}
     }
     return (mres);
 }
 
 /* Symbol match routine common to both hash functions */
 static bool
 matched_symbol(SymLook *req, const Obj_Entry *obj, Sym_Match_Result *result,
     const unsigned long symnum)
 {
 	Elf_Versym verndx;
 	const Elf_Sym *symp;
 	const char *strp;
 
 	symp = obj->symtab + symnum;
 	strp = obj->strtab + symp->st_name;
 
 	switch (ELF_ST_TYPE(symp->st_info)) {
 	case STT_FUNC:
 	case STT_NOTYPE:
 	case STT_OBJECT:
 	case STT_COMMON:
 	case STT_GNU_IFUNC:
 		if (symp->st_value == 0)
 			return (false);
 		/* fallthrough */
 	case STT_TLS:
 		if (symp->st_shndx != SHN_UNDEF)
 			break;
 #ifndef __mips__
 		else if (((req->flags & SYMLOOK_IN_PLT) == 0) &&
 		    (ELF_ST_TYPE(symp->st_info) == STT_FUNC))
 			break;
 #endif
 		/* fallthrough */
 	default:
 		return (false);
 	}
 	if (req->name[0] != strp[0] || strcmp(req->name, strp) != 0)
 		return (false);
 
 	if (req->ventry == NULL) {
 		if (obj->versyms != NULL) {
 			verndx = VER_NDX(obj->versyms[symnum]);
 			if (verndx > obj->vernum) {
 				_rtld_error(
 				    "%s: symbol %s references wrong version %d",
 				    obj->path, obj->strtab + symnum, verndx);
 				return (false);
 			}
 			/*
 			 * If we are not called from dlsym (i.e. this
 			 * is a normal relocation from unversioned
 			 * binary), accept the symbol immediately if
 			 * it happens to have first version after this
 			 * shared object became versioned.  Otherwise,
 			 * if symbol is versioned and not hidden,
 			 * remember it. If it is the only symbol with
 			 * this name exported by the shared object, it
 			 * will be returned as a match by the calling
 			 * function. If symbol is global (verndx < 2)
 			 * accept it unconditionally.
 			 */
 			if ((req->flags & SYMLOOK_DLSYM) == 0 &&
 			    verndx == VER_NDX_GIVEN) {
 				result->sym_out = symp;
 				return (true);
 			}
 			else if (verndx >= VER_NDX_GIVEN) {
 				if ((obj->versyms[symnum] & VER_NDX_HIDDEN)
 				    == 0) {
 					if (result->vsymp == NULL)
 						result->vsymp = symp;
 					result->vcount++;
 				}
 				return (false);
 			}
 		}
 		result->sym_out = symp;
 		return (true);
 	}
 	if (obj->versyms == NULL) {
 		if (object_match_name(obj, req->ventry->name)) {
 			_rtld_error("%s: object %s should provide version %s "
 			    "for symbol %s", obj_rtld.path, obj->path,
 			    req->ventry->name, obj->strtab + symnum);
 			return (false);
 		}
 	} else {
 		verndx = VER_NDX(obj->versyms[symnum]);
 		if (verndx > obj->vernum) {
 			_rtld_error("%s: symbol %s references wrong version %d",
 			    obj->path, obj->strtab + symnum, verndx);
 			return (false);
 		}
 		if (obj->vertab[verndx].hash != req->ventry->hash ||
 		    strcmp(obj->vertab[verndx].name, req->ventry->name)) {
 			/*
 			 * Version does not match. Look if this is a
 			 * global symbol and if it is not hidden. If
 			 * global symbol (verndx < 2) is available,
 			 * use it. Do not return symbol if we are
 			 * called by dlvsym, because dlvsym looks for
 			 * a specific version and default one is not
 			 * what dlvsym wants.
 			 */
 			if ((req->flags & SYMLOOK_DLSYM) ||
 			    (verndx >= VER_NDX_GIVEN) ||
 			    (obj->versyms[symnum] & VER_NDX_HIDDEN))
 				return (false);
 		}
 	}
 	result->sym_out = symp;
 	return (true);
 }
 
 /*
  * Search for symbol using SysV hash function.
  * obj->buckets is known not to be NULL at this point; the test for this was
  * performed with the obj->valid_hash_sysv assignment.
  */
 static int
 symlook_obj1_sysv(SymLook *req, const Obj_Entry *obj)
 {
 	unsigned long symnum;
 	Sym_Match_Result matchres;
 
 	matchres.sym_out = NULL;
 	matchres.vsymp = NULL;
 	matchres.vcount = 0;
 
 	for (symnum = obj->buckets[req->hash % obj->nbuckets];
 	    symnum != STN_UNDEF; symnum = obj->chains[symnum]) {
 		if (symnum >= obj->nchains)
 			return (ESRCH);	/* Bad object */
 
 		if (matched_symbol(req, obj, &matchres, symnum)) {
 			req->sym_out = matchres.sym_out;
 			req->defobj_out = obj;
 			return (0);
 		}
 	}
 	if (matchres.vcount == 1) {
 		req->sym_out = matchres.vsymp;
 		req->defobj_out = obj;
 		return (0);
 	}
 	return (ESRCH);
 }
 
 /* Search for symbol using GNU hash function */
 static int
 symlook_obj1_gnu(SymLook *req, const Obj_Entry *obj)
 {
 	Elf_Addr bloom_word;
 	const Elf32_Word *hashval;
 	Elf32_Word bucket;
 	Sym_Match_Result matchres;
 	unsigned int h1, h2;
 	unsigned long symnum;
 
 	matchres.sym_out = NULL;
 	matchres.vsymp = NULL;
 	matchres.vcount = 0;
 
 	/* Pick right bitmask word from Bloom filter array */
 	bloom_word = obj->bloom_gnu[(req->hash_gnu / __ELF_WORD_SIZE) &
 	    obj->maskwords_bm_gnu];
 
 	/* Calculate modulus word size of gnu hash and its derivative */
 	h1 = req->hash_gnu & (__ELF_WORD_SIZE - 1);
 	h2 = ((req->hash_gnu >> obj->shift2_gnu) & (__ELF_WORD_SIZE - 1));
 
 	/* Filter out the "definitely not in set" queries */
 	if (((bloom_word >> h1) & (bloom_word >> h2) & 1) == 0)
 		return (ESRCH);
 
 	/* Locate hash chain and corresponding value element*/
 	bucket = obj->buckets_gnu[req->hash_gnu % obj->nbuckets_gnu];
 	if (bucket == 0)
 		return (ESRCH);
 	hashval = &obj->chain_zero_gnu[bucket];
 	do {
 		if (((*hashval ^ req->hash_gnu) >> 1) == 0) {
 			symnum = hashval - obj->chain_zero_gnu;
 			if (matched_symbol(req, obj, &matchres, symnum)) {
 				req->sym_out = matchres.sym_out;
 				req->defobj_out = obj;
 				return (0);
 			}
 		}
 	} while ((*hashval++ & 1) == 0);
 	if (matchres.vcount == 1) {
 		req->sym_out = matchres.vsymp;
 		req->defobj_out = obj;
 		return (0);
 	}
 	return (ESRCH);
 }
 
 static void
 trace_loaded_objects(Obj_Entry *obj)
 {
     const char *fmt1, *fmt2, *fmt, *main_local, *list_containers;
     int c;
 
     if ((main_local = getenv(_LD("TRACE_LOADED_OBJECTS_PROGNAME"))) == NULL)
 	main_local = "";
 
     if ((fmt1 = getenv(_LD("TRACE_LOADED_OBJECTS_FMT1"))) == NULL)
 	fmt1 = "\t%o => %p (%x)\n";
 
     if ((fmt2 = getenv(_LD("TRACE_LOADED_OBJECTS_FMT2"))) == NULL)
 	fmt2 = "\t%o (%x)\n";
 
     list_containers = getenv(_LD("TRACE_LOADED_OBJECTS_ALL"));
 
     for (; obj != NULL; obj = TAILQ_NEXT(obj, next)) {
 	Needed_Entry *needed;
 	const char *name, *path;
 	bool is_lib;
 
 	if (obj->marker)
 	    continue;
 	if (list_containers && obj->needed != NULL)
 	    rtld_printf("%s:\n", obj->path);
 	for (needed = obj->needed; needed; needed = needed->next) {
 	    if (needed->obj != NULL) {
 		if (needed->obj->traced && !list_containers)
 		    continue;
 		needed->obj->traced = true;
 		path = needed->obj->path;
 	    } else
 		path = "not found";
 
 	    name = obj->strtab + needed->name;
 	    is_lib = strncmp(name, "lib", 3) == 0;	/* XXX - bogus */
 
 	    fmt = is_lib ? fmt1 : fmt2;
 	    while ((c = *fmt++) != '\0') {
 		switch (c) {
 		default:
 		    rtld_putchar(c);
 		    continue;
 		case '\\':
 		    switch (c = *fmt) {
 		    case '\0':
 			continue;
 		    case 'n':
 			rtld_putchar('\n');
 			break;
 		    case 't':
 			rtld_putchar('\t');
 			break;
 		    }
 		    break;
 		case '%':
 		    switch (c = *fmt) {
 		    case '\0':
 			continue;
 		    case '%':
 		    default:
 			rtld_putchar(c);
 			break;
 		    case 'A':
 			rtld_putstr(main_local);
 			break;
 		    case 'a':
 			rtld_putstr(obj_main->path);
 			break;
 		    case 'o':
 			rtld_putstr(name);
 			break;
 #if 0
 		    case 'm':
 			rtld_printf("%d", sodp->sod_major);
 			break;
 		    case 'n':
 			rtld_printf("%d", sodp->sod_minor);
 			break;
 #endif
 		    case 'p':
 			rtld_putstr(path);
 			break;
 		    case 'x':
 			rtld_printf("%p", needed->obj ? needed->obj->mapbase :
 			  0);
 			break;
 		    }
 		    break;
 		}
 		++fmt;
 	    }
 	}
     }
 }
 
 /*
  * Unload a dlopened object and its dependencies from memory and from
  * our data structures.  It is assumed that the DAG rooted in the
  * object has already been unreferenced, and that the object has a
  * reference count of 0.
  */
 static void
 unload_object(Obj_Entry *root, RtldLockState *lockstate)
 {
 	Obj_Entry marker, *obj, *next;
 
 	assert(root->refcount == 0);
 
 	/*
 	 * Pass over the DAG removing unreferenced objects from
 	 * appropriate lists.
 	 */
 	unlink_object(root);
 
 	/* Unmap all objects that are no longer referenced. */
 	for (obj = TAILQ_FIRST(&obj_list); obj != NULL; obj = next) {
 		next = TAILQ_NEXT(obj, next);
 		if (obj->marker || obj->refcount != 0)
 			continue;
 		LD_UTRACE(UTRACE_UNLOAD_OBJECT, obj, obj->mapbase,
 		    obj->mapsize, 0, obj->path);
 		dbg("unloading \"%s\"", obj->path);
 		/*
 		 * Unlink the object now to prevent new references from
 		 * being acquired while the bind lock is dropped in
 		 * recursive dlclose() invocations.
 		 */
 		TAILQ_REMOVE(&obj_list, obj, next);
 		obj_count--;
 
 		if (obj->filtees_loaded) {
 			if (next != NULL) {
 				init_marker(&marker);
 				TAILQ_INSERT_BEFORE(next, &marker, next);
 				unload_filtees(obj, lockstate);
 				next = TAILQ_NEXT(&marker, next);
 				TAILQ_REMOVE(&obj_list, &marker, next);
 			} else
 				unload_filtees(obj, lockstate);
 		}
 		release_object(obj);
 	}
 }
 
 static void
 unlink_object(Obj_Entry *root)
 {
     Objlist_Entry *elm;
 
     if (root->refcount == 0) {
 	/* Remove the object from the RTLD_GLOBAL list. */
 	objlist_remove(&list_global, root);
 
     	/* Remove the object from all objects' DAG lists. */
     	STAILQ_FOREACH(elm, &root->dagmembers, link) {
 	    objlist_remove(&elm->obj->dldags, root);
 	    if (elm->obj != root)
 		unlink_object(elm->obj);
 	}
     }
 }
 
 static void
 ref_dag(Obj_Entry *root)
 {
     Objlist_Entry *elm;
 
     assert(root->dag_inited);
     STAILQ_FOREACH(elm, &root->dagmembers, link)
 	elm->obj->refcount++;
 }
 
 static void
 unref_dag(Obj_Entry *root)
 {
     Objlist_Entry *elm;
 
     assert(root->dag_inited);
     STAILQ_FOREACH(elm, &root->dagmembers, link)
 	elm->obj->refcount--;
 }
 
 /*
  * Common code for MD __tls_get_addr().
  */
 static void *tls_get_addr_slow(Elf_Addr **, int, size_t) __noinline;
 static void *
 tls_get_addr_slow(Elf_Addr **dtvp, int index, size_t offset)
 {
     Elf_Addr *newdtv, *dtv;
     RtldLockState lockstate;
     int to_copy;
 
     dtv = *dtvp;
     /* Check dtv generation in case new modules have arrived */
     if (dtv[0] != tls_dtv_generation) {
 	wlock_acquire(rtld_bind_lock, &lockstate);
 	newdtv = xcalloc(tls_max_index + 2, sizeof(Elf_Addr));
 	to_copy = dtv[1];
 	if (to_copy > tls_max_index)
 	    to_copy = tls_max_index;
 	memcpy(&newdtv[2], &dtv[2], to_copy * sizeof(Elf_Addr));
 	newdtv[0] = tls_dtv_generation;
 	newdtv[1] = tls_max_index;
 	free(dtv);
 	lock_release(rtld_bind_lock, &lockstate);
 	dtv = *dtvp = newdtv;
     }
 
     /* Dynamically allocate module TLS if necessary */
     if (dtv[index + 1] == 0) {
 	/* Signal safe, wlock will block out signals. */
 	wlock_acquire(rtld_bind_lock, &lockstate);
 	if (!dtv[index + 1])
 	    dtv[index + 1] = (Elf_Addr)allocate_module_tls(index);
 	lock_release(rtld_bind_lock, &lockstate);
     }
     return ((void *)(dtv[index + 1] + offset));
 }
 
 void *
 tls_get_addr_common(Elf_Addr **dtvp, int index, size_t offset)
 {
 	Elf_Addr *dtv;
 
 	dtv = *dtvp;
 	/* Check dtv generation in case new modules have arrived */
 	if (__predict_true(dtv[0] == tls_dtv_generation &&
 	    dtv[index + 1] != 0))
 		return ((void *)(dtv[index + 1] + offset));
 	return (tls_get_addr_slow(dtvp, index, offset));
 }
 
 #if defined(__aarch64__) || defined(__arm__) || defined(__mips__) || \
     defined(__powerpc__) || defined(__riscv)
 
 /*
  * Return pointer to allocated TLS block
  */
 static void *
 get_tls_block_ptr(void *tcb, size_t tcbsize)
 {
     size_t extra_size, post_size, pre_size, tls_block_size;
     size_t tls_init_align;
 
     tls_init_align = MAX(obj_main->tlsalign, 1);
 
     /* Compute fragments sizes. */
     extra_size = tcbsize - TLS_TCB_SIZE;
     post_size = calculate_tls_post_size(tls_init_align);
     tls_block_size = tcbsize + post_size;
     pre_size = roundup2(tls_block_size, tls_init_align) - tls_block_size;
 
     return ((char *)tcb - pre_size - extra_size);
 }
 
 /*
  * Allocate Static TLS using the Variant I method.
  *
  * For details on the layout, see lib/libc/gen/tls.c.
  *
  * NB: rtld's tls_static_space variable includes TLS_TCB_SIZE and post_size as
  *     it is based on tls_last_offset, and TLS offsets here are really TCB
  *     offsets, whereas libc's tls_static_space is just the executable's static
  *     TLS segment.
  */
 void *
 allocate_tls(Obj_Entry *objs, void *oldtcb, size_t tcbsize, size_t tcbalign)
 {
     Obj_Entry *obj;
     char *tls_block;
     Elf_Addr *dtv, **tcb;
     Elf_Addr addr;
     Elf_Addr i;
     size_t extra_size, maxalign, post_size, pre_size, tls_block_size;
     size_t tls_init_align;
 
     if (oldtcb != NULL && tcbsize == TLS_TCB_SIZE)
 	return (oldtcb);
 
     assert(tcbsize >= TLS_TCB_SIZE);
     maxalign = MAX(tcbalign, tls_static_max_align);
     tls_init_align = MAX(obj_main->tlsalign, 1);
 
     /* Compute fragmets sizes. */
     extra_size = tcbsize - TLS_TCB_SIZE;
     post_size = calculate_tls_post_size(tls_init_align);
     tls_block_size = tcbsize + post_size;
     pre_size = roundup2(tls_block_size, tls_init_align) - tls_block_size;
     tls_block_size += pre_size + tls_static_space - TLS_TCB_SIZE - post_size;
 
     /* Allocate whole TLS block */
     tls_block = malloc_aligned(tls_block_size, maxalign);
     tcb = (Elf_Addr **)(tls_block + pre_size + extra_size);
 
     if (oldtcb != NULL) {
 	memcpy(tls_block, get_tls_block_ptr(oldtcb, tcbsize),
 	    tls_static_space);
 	free_aligned(get_tls_block_ptr(oldtcb, tcbsize));
 
 	/* Adjust the DTV. */
 	dtv = tcb[0];
 	for (i = 0; i < dtv[1]; i++) {
 	    if (dtv[i+2] >= (Elf_Addr)oldtcb &&
 		dtv[i+2] < (Elf_Addr)oldtcb + tls_static_space) {
 		dtv[i+2] = dtv[i+2] - (Elf_Addr)oldtcb + (Elf_Addr)tcb;
 	    }
 	}
     } else {
 	dtv = xcalloc(tls_max_index + 2, sizeof(Elf_Addr));
 	tcb[0] = dtv;
 	dtv[0] = tls_dtv_generation;
 	dtv[1] = tls_max_index;
 
 	for (obj = globallist_curr(objs); obj != NULL;
 	  obj = globallist_next(obj)) {
 	    if (obj->tlsoffset > 0) {
 		addr = (Elf_Addr)tcb + obj->tlsoffset;
 		if (obj->tlsinitsize > 0)
 		    memcpy((void*) addr, obj->tlsinit, obj->tlsinitsize);
 		if (obj->tlssize > obj->tlsinitsize)
 		    memset((void*)(addr + obj->tlsinitsize), 0,
 			   obj->tlssize - obj->tlsinitsize);
 		dtv[obj->tlsindex + 1] = addr;
 	    }
 	}
     }
 
     return (tcb);
 }
 
 void
 free_tls(void *tcb, size_t tcbsize, size_t tcbalign __unused)
 {
     Elf_Addr *dtv;
     Elf_Addr tlsstart, tlsend;
     size_t post_size;
     size_t dtvsize, i, tls_init_align;
 
     assert(tcbsize >= TLS_TCB_SIZE);
     tls_init_align = MAX(obj_main->tlsalign, 1);
 
     /* Compute fragments sizes. */
     post_size = calculate_tls_post_size(tls_init_align);
 
     tlsstart = (Elf_Addr)tcb + TLS_TCB_SIZE + post_size;
     tlsend = (Elf_Addr)tcb + tls_static_space;
 
     dtv = *(Elf_Addr **)tcb;
     dtvsize = dtv[1];
     for (i = 0; i < dtvsize; i++) {
 	if (dtv[i+2] && (dtv[i+2] < tlsstart || dtv[i+2] >= tlsend)) {
 	    free((void*)dtv[i+2]);
 	}
     }
     free(dtv);
     free_aligned(get_tls_block_ptr(tcb, tcbsize));
 }
 
 #endif
 
 #if defined(__i386__) || defined(__amd64__) || defined(__sparc64__)
 
 /*
  * Allocate Static TLS using the Variant II method.
  */
 void *
 allocate_tls(Obj_Entry *objs, void *oldtls, size_t tcbsize, size_t tcbalign)
 {
     Obj_Entry *obj;
     size_t size, ralign;
     char *tls;
     Elf_Addr *dtv, *olddtv;
     Elf_Addr segbase, oldsegbase, addr;
     size_t i;
 
     ralign = tcbalign;
     if (tls_static_max_align > ralign)
 	    ralign = tls_static_max_align;
     size = round(tls_static_space, ralign) + round(tcbsize, ralign);
 
     assert(tcbsize >= 2*sizeof(Elf_Addr));
     tls = malloc_aligned(size, ralign);
     dtv = xcalloc(tls_max_index + 2, sizeof(Elf_Addr));
 
     segbase = (Elf_Addr)(tls + round(tls_static_space, ralign));
     ((Elf_Addr*)segbase)[0] = segbase;
     ((Elf_Addr*)segbase)[1] = (Elf_Addr) dtv;
 
     dtv[0] = tls_dtv_generation;
     dtv[1] = tls_max_index;
 
     if (oldtls) {
 	/*
 	 * Copy the static TLS block over whole.
 	 */
 	oldsegbase = (Elf_Addr) oldtls;
 	memcpy((void *)(segbase - tls_static_space),
 	       (const void *)(oldsegbase - tls_static_space),
 	       tls_static_space);
 
 	/*
 	 * If any dynamic TLS blocks have been created tls_get_addr(),
 	 * move them over.
 	 */
 	olddtv = ((Elf_Addr**)oldsegbase)[1];
 	for (i = 0; i < olddtv[1]; i++) {
 	    if (olddtv[i+2] < oldsegbase - size || olddtv[i+2] > oldsegbase) {
 		dtv[i+2] = olddtv[i+2];
 		olddtv[i+2] = 0;
 	    }
 	}
 
 	/*
 	 * We assume that this block was the one we created with
 	 * allocate_initial_tls().
 	 */
 	free_tls(oldtls, 2*sizeof(Elf_Addr), sizeof(Elf_Addr));
     } else {
 	for (obj = objs; obj != NULL; obj = TAILQ_NEXT(obj, next)) {
 		if (obj->marker || obj->tlsoffset == 0)
 			continue;
 		addr = segbase - obj->tlsoffset;
 		memset((void*)(addr + obj->tlsinitsize),
 		       0, obj->tlssize - obj->tlsinitsize);
-		if (obj->tlsinit)
+		if (obj->tlsinit) {
 		    memcpy((void*) addr, obj->tlsinit, obj->tlsinitsize);
+		    obj->static_tls_copied = true;
+		}
 		dtv[obj->tlsindex + 1] = addr;
 	}
     }
 
     return (void*) segbase;
 }
 
 void
 free_tls(void *tls, size_t tcbsize  __unused, size_t tcbalign)
 {
     Elf_Addr* dtv;
     size_t size, ralign;
     int dtvsize, i;
     Elf_Addr tlsstart, tlsend;
 
     /*
      * Figure out the size of the initial TLS block so that we can
      * find stuff which ___tls_get_addr() allocated dynamically.
      */
     ralign = tcbalign;
     if (tls_static_max_align > ralign)
 	    ralign = tls_static_max_align;
     size = round(tls_static_space, ralign);
 
     dtv = ((Elf_Addr**)tls)[1];
     dtvsize = dtv[1];
     tlsend = (Elf_Addr) tls;
     tlsstart = tlsend - size;
     for (i = 0; i < dtvsize; i++) {
 	if (dtv[i + 2] != 0 && (dtv[i + 2] < tlsstart || dtv[i + 2] > tlsend)) {
 		free_aligned((void *)dtv[i + 2]);
 	}
     }
 
     free_aligned((void *)tlsstart);
     free((void*) dtv);
 }
 
 #endif
 
 /*
  * Allocate TLS block for module with given index.
  */
 void *
 allocate_module_tls(int index)
 {
     Obj_Entry* obj;
     char* p;
 
     TAILQ_FOREACH(obj, &obj_list, next) {
 	if (obj->marker)
 	    continue;
 	if (obj->tlsindex == index)
 	    break;
     }
     if (!obj) {
 	_rtld_error("Can't find module with TLS index %d", index);
 	rtld_die();
     }
 
     p = malloc_aligned(obj->tlssize, obj->tlsalign);
     memcpy(p, obj->tlsinit, obj->tlsinitsize);
     memset(p + obj->tlsinitsize, 0, obj->tlssize - obj->tlsinitsize);
 
     return p;
 }
 
 bool
 allocate_tls_offset(Obj_Entry *obj)
 {
     size_t off;
 
     if (obj->tls_done)
 	return true;
 
     if (obj->tlssize == 0) {
 	obj->tls_done = true;
 	return true;
     }
 
     if (tls_last_offset == 0)
 	off = calculate_first_tls_offset(obj->tlssize, obj->tlsalign);
     else
 	off = calculate_tls_offset(tls_last_offset, tls_last_size,
 				   obj->tlssize, obj->tlsalign);
 
     /*
      * If we have already fixed the size of the static TLS block, we
      * must stay within that size. When allocating the static TLS, we
      * leave a small amount of space spare to be used for dynamically
      * loading modules which use static TLS.
      */
     if (tls_static_space != 0) {
 	if (calculate_tls_end(off, obj->tlssize) > tls_static_space)
 	    return false;
     } else if (obj->tlsalign > tls_static_max_align) {
 	    tls_static_max_align = obj->tlsalign;
     }
 
     tls_last_offset = obj->tlsoffset = off;
     tls_last_size = obj->tlssize;
     obj->tls_done = true;
 
     return true;
 }
 
 void
 free_tls_offset(Obj_Entry *obj)
 {
 
     /*
      * If we were the last thing to allocate out of the static TLS
      * block, we give our space back to the 'allocator'. This is a
      * simplistic workaround to allow libGL.so.1 to be loaded and
      * unloaded multiple times.
      */
     if (calculate_tls_end(obj->tlsoffset, obj->tlssize)
 	== calculate_tls_end(tls_last_offset, tls_last_size)) {
 	tls_last_offset -= obj->tlssize;
 	tls_last_size = 0;
     }
 }
 
 void *
 _rtld_allocate_tls(void *oldtls, size_t tcbsize, size_t tcbalign)
 {
     void *ret;
     RtldLockState lockstate;
 
     wlock_acquire(rtld_bind_lock, &lockstate);
     ret = allocate_tls(globallist_curr(TAILQ_FIRST(&obj_list)), oldtls,
       tcbsize, tcbalign);
     lock_release(rtld_bind_lock, &lockstate);
     return (ret);
 }
 
 void
 _rtld_free_tls(void *tcb, size_t tcbsize, size_t tcbalign)
 {
     RtldLockState lockstate;
 
     wlock_acquire(rtld_bind_lock, &lockstate);
     free_tls(tcb, tcbsize, tcbalign);
     lock_release(rtld_bind_lock, &lockstate);
 }
 
 static void
 object_add_name(Obj_Entry *obj, const char *name)
 {
     Name_Entry *entry;
     size_t len;
 
     len = strlen(name);
     entry = malloc(sizeof(Name_Entry) + len);
 
     if (entry != NULL) {
 	strcpy(entry->name, name);
 	STAILQ_INSERT_TAIL(&obj->names, entry, link);
     }
 }
 
 static int
 object_match_name(const Obj_Entry *obj, const char *name)
 {
     Name_Entry *entry;
 
     STAILQ_FOREACH(entry, &obj->names, link) {
 	if (strcmp(name, entry->name) == 0)
 	    return (1);
     }
     return (0);
 }
 
 static Obj_Entry *
 locate_dependency(const Obj_Entry *obj, const char *name)
 {
     const Objlist_Entry *entry;
     const Needed_Entry *needed;
 
     STAILQ_FOREACH(entry, &list_main, link) {
 	if (object_match_name(entry->obj, name))
 	    return entry->obj;
     }
 
     for (needed = obj->needed;  needed != NULL;  needed = needed->next) {
 	if (strcmp(obj->strtab + needed->name, name) == 0 ||
 	  (needed->obj != NULL && object_match_name(needed->obj, name))) {
 	    /*
 	     * If there is DT_NEEDED for the name we are looking for,
 	     * we are all set.  Note that object might not be found if
 	     * dependency was not loaded yet, so the function can
 	     * return NULL here.  This is expected and handled
 	     * properly by the caller.
 	     */
 	    return (needed->obj);
 	}
     }
     _rtld_error("%s: Unexpected inconsistency: dependency %s not found",
 	obj->path, name);
     rtld_die();
 }
 
 static int
 check_object_provided_version(Obj_Entry *refobj, const Obj_Entry *depobj,
     const Elf_Vernaux *vna)
 {
     const Elf_Verdef *vd;
     const char *vername;
 
     vername = refobj->strtab + vna->vna_name;
     vd = depobj->verdef;
     if (vd == NULL) {
 	_rtld_error("%s: version %s required by %s not defined",
 	    depobj->path, vername, refobj->path);
 	return (-1);
     }
     for (;;) {
 	if (vd->vd_version != VER_DEF_CURRENT) {
 	    _rtld_error("%s: Unsupported version %d of Elf_Verdef entry",
 		depobj->path, vd->vd_version);
 	    return (-1);
 	}
 	if (vna->vna_hash == vd->vd_hash) {
 	    const Elf_Verdaux *aux = (const Elf_Verdaux *)
 		((const char *)vd + vd->vd_aux);
 	    if (strcmp(vername, depobj->strtab + aux->vda_name) == 0)
 		return (0);
 	}
 	if (vd->vd_next == 0)
 	    break;
 	vd = (const Elf_Verdef *)((const char *)vd + vd->vd_next);
     }
     if (vna->vna_flags & VER_FLG_WEAK)
 	return (0);
     _rtld_error("%s: version %s required by %s not found",
 	depobj->path, vername, refobj->path);
     return (-1);
 }
 
 static int
 rtld_verify_object_versions(Obj_Entry *obj)
 {
     const Elf_Verneed *vn;
     const Elf_Verdef  *vd;
     const Elf_Verdaux *vda;
     const Elf_Vernaux *vna;
     const Obj_Entry *depobj;
     int maxvernum, vernum;
 
     if (obj->ver_checked)
 	return (0);
     obj->ver_checked = true;
 
     maxvernum = 0;
     /*
      * Walk over defined and required version records and figure out
      * max index used by any of them. Do very basic sanity checking
      * while there.
      */
     vn = obj->verneed;
     while (vn != NULL) {
 	if (vn->vn_version != VER_NEED_CURRENT) {
 	    _rtld_error("%s: Unsupported version %d of Elf_Verneed entry",
 		obj->path, vn->vn_version);
 	    return (-1);
 	}
 	vna = (const Elf_Vernaux *)((const char *)vn + vn->vn_aux);
 	for (;;) {
 	    vernum = VER_NEED_IDX(vna->vna_other);
 	    if (vernum > maxvernum)
 		maxvernum = vernum;
 	    if (vna->vna_next == 0)
 		 break;
 	    vna = (const Elf_Vernaux *)((const char *)vna + vna->vna_next);
 	}
 	if (vn->vn_next == 0)
 	    break;
 	vn = (const Elf_Verneed *)((const char *)vn + vn->vn_next);
     }
 
     vd = obj->verdef;
     while (vd != NULL) {
 	if (vd->vd_version != VER_DEF_CURRENT) {
 	    _rtld_error("%s: Unsupported version %d of Elf_Verdef entry",
 		obj->path, vd->vd_version);
 	    return (-1);
 	}
 	vernum = VER_DEF_IDX(vd->vd_ndx);
 	if (vernum > maxvernum)
 		maxvernum = vernum;
 	if (vd->vd_next == 0)
 	    break;
 	vd = (const Elf_Verdef *)((const char *)vd + vd->vd_next);
     }
 
     if (maxvernum == 0)
 	return (0);
 
     /*
      * Store version information in array indexable by version index.
      * Verify that object version requirements are satisfied along the
      * way.
      */
     obj->vernum = maxvernum + 1;
     obj->vertab = xcalloc(obj->vernum, sizeof(Ver_Entry));
 
     vd = obj->verdef;
     while (vd != NULL) {
 	if ((vd->vd_flags & VER_FLG_BASE) == 0) {
 	    vernum = VER_DEF_IDX(vd->vd_ndx);
 	    assert(vernum <= maxvernum);
 	    vda = (const Elf_Verdaux *)((const char *)vd + vd->vd_aux);
 	    obj->vertab[vernum].hash = vd->vd_hash;
 	    obj->vertab[vernum].name = obj->strtab + vda->vda_name;
 	    obj->vertab[vernum].file = NULL;
 	    obj->vertab[vernum].flags = 0;
 	}
 	if (vd->vd_next == 0)
 	    break;
 	vd = (const Elf_Verdef *)((const char *)vd + vd->vd_next);
     }
 
     vn = obj->verneed;
     while (vn != NULL) {
 	depobj = locate_dependency(obj, obj->strtab + vn->vn_file);
 	if (depobj == NULL)
 	    return (-1);
 	vna = (const Elf_Vernaux *)((const char *)vn + vn->vn_aux);
 	for (;;) {
 	    if (check_object_provided_version(obj, depobj, vna))
 		return (-1);
 	    vernum = VER_NEED_IDX(vna->vna_other);
 	    assert(vernum <= maxvernum);
 	    obj->vertab[vernum].hash = vna->vna_hash;
 	    obj->vertab[vernum].name = obj->strtab + vna->vna_name;
 	    obj->vertab[vernum].file = obj->strtab + vn->vn_file;
 	    obj->vertab[vernum].flags = (vna->vna_other & VER_NEED_HIDDEN) ?
 		VER_INFO_HIDDEN : 0;
 	    if (vna->vna_next == 0)
 		 break;
 	    vna = (const Elf_Vernaux *)((const char *)vna + vna->vna_next);
 	}
 	if (vn->vn_next == 0)
 	    break;
 	vn = (const Elf_Verneed *)((const char *)vn + vn->vn_next);
     }
     return 0;
 }
 
 static int
 rtld_verify_versions(const Objlist *objlist)
 {
     Objlist_Entry *entry;
     int rc;
 
     rc = 0;
     STAILQ_FOREACH(entry, objlist, link) {
 	/*
 	 * Skip dummy objects or objects that have their version requirements
 	 * already checked.
 	 */
 	if (entry->obj->strtab == NULL || entry->obj->vertab != NULL)
 	    continue;
 	if (rtld_verify_object_versions(entry->obj) == -1) {
 	    rc = -1;
 	    if (ld_tracing == NULL)
 		break;
 	}
     }
     if (rc == 0 || ld_tracing != NULL)
     	rc = rtld_verify_object_versions(&obj_rtld);
     return rc;
 }
 
 const Ver_Entry *
 fetch_ventry(const Obj_Entry *obj, unsigned long symnum)
 {
     Elf_Versym vernum;
 
     if (obj->vertab) {
 	vernum = VER_NDX(obj->versyms[symnum]);
 	if (vernum >= obj->vernum) {
 	    _rtld_error("%s: symbol %s has wrong verneed value %d",
 		obj->path, obj->strtab + symnum, vernum);
 	} else if (obj->vertab[vernum].hash != 0) {
 	    return &obj->vertab[vernum];
 	}
     }
     return NULL;
 }
 
 int
 _rtld_get_stack_prot(void)
 {
 
 	return (stack_prot);
 }
 
 int
 _rtld_is_dlopened(void *arg)
 {
 	Obj_Entry *obj;
 	RtldLockState lockstate;
 	int res;
 
 	rlock_acquire(rtld_bind_lock, &lockstate);
 	obj = dlcheck(arg);
 	if (obj == NULL)
 		obj = obj_from_addr(arg);
 	if (obj == NULL) {
 		_rtld_error("No shared object contains address");
 		lock_release(rtld_bind_lock, &lockstate);
 		return (-1);
 	}
 	res = obj->dlopened ? 1 : 0;
 	lock_release(rtld_bind_lock, &lockstate);
 	return (res);
 }
 
 static int
 obj_remap_relro(Obj_Entry *obj, int prot)
 {
 
 	if (obj->relro_size > 0 && mprotect(obj->relro_page, obj->relro_size,
 	    prot) == -1) {
 		_rtld_error("%s: Cannot set relro protection to %#x: %s",
 		    obj->path, prot, rtld_strerror(errno));
 		return (-1);
 	}
 	return (0);
 }
 
 static int
 obj_disable_relro(Obj_Entry *obj)
 {
 
 	return (obj_remap_relro(obj, PROT_READ | PROT_WRITE));
 }
 
 static int
 obj_enforce_relro(Obj_Entry *obj)
 {
 
 	return (obj_remap_relro(obj, PROT_READ));
 }
 
 static void
 map_stacks_exec(RtldLockState *lockstate)
 {
 	void (*thr_map_stacks_exec)(void);
 
 	if ((max_stack_flags & PF_X) == 0 || (stack_prot & PROT_EXEC) != 0)
 		return;
 	thr_map_stacks_exec = (void (*)(void))(uintptr_t)
 	    get_program_var_addr("__pthread_map_stacks_exec", lockstate);
 	if (thr_map_stacks_exec != NULL) {
 		stack_prot |= PROT_EXEC;
 		thr_map_stacks_exec();
+	}
+}
+
+static void
+distribute_static_tls(Objlist *list, RtldLockState *lockstate)
+{
+	Objlist_Entry *elm;
+	Obj_Entry *obj;
+	void (*distrib)(size_t, void *, size_t, size_t);
+
+	distrib = (void (*)(size_t, void *, size_t, size_t))(uintptr_t)
+	    get_program_var_addr("__pthread_distribute_static_tls", lockstate);
+	if (distrib == NULL)
+		return;
+	STAILQ_FOREACH(elm, list, link) {
+		obj = elm->obj;
+		if (obj->marker || !obj->tls_done || obj->static_tls_copied)
+			continue;
+		distrib(obj->tlsoffset, obj->tlsinit, obj->tlsinitsize,
+		    obj->tlssize);
+		obj->static_tls_copied = true;
 	}
 }
 
 void
 symlook_init(SymLook *dst, const char *name)
 {
 
 	bzero(dst, sizeof(*dst));
 	dst->name = name;
 	dst->hash = elf_hash(name);
 	dst->hash_gnu = gnu_hash(name);
 }
 
 static void
 symlook_init_from_req(SymLook *dst, const SymLook *src)
 {
 
 	dst->name = src->name;
 	dst->hash = src->hash;
 	dst->hash_gnu = src->hash_gnu;
 	dst->ventry = src->ventry;
 	dst->flags = src->flags;
 	dst->defobj_out = NULL;
 	dst->sym_out = NULL;
 	dst->lockstate = src->lockstate;
 }
 
 static int
 open_binary_fd(const char *argv0, bool search_in_path)
 {
 	char *pathenv, *pe, binpath[PATH_MAX];
 	int fd;
 
 	if (search_in_path && strchr(argv0, '/') == NULL) {
 		pathenv = getenv("PATH");
 		if (pathenv == NULL) {
 			_rtld_error("-p and no PATH environment variable");
 			rtld_die();
 		}
 		pathenv = strdup(pathenv);
 		if (pathenv == NULL) {
 			_rtld_error("Cannot allocate memory");
 			rtld_die();
 		}
 		fd = -1;
 		errno = ENOENT;
 		while ((pe = strsep(&pathenv, ":")) != NULL) {
 			if (strlcpy(binpath, pe, sizeof(binpath)) >=
 			    sizeof(binpath))
 				continue;
 			if (binpath[0] != '\0' &&
 			    strlcat(binpath, "/", sizeof(binpath)) >=
 			    sizeof(binpath))
 				continue;
 			if (strlcat(binpath, argv0, sizeof(binpath)) >=
 			    sizeof(binpath))
 				continue;
 			fd = open(binpath, O_RDONLY | O_CLOEXEC | O_VERIFY);
 			if (fd != -1 || errno != ENOENT)
 				break;
 		}
 		free(pathenv);
 	} else {
 		fd = open(argv0, O_RDONLY | O_CLOEXEC | O_VERIFY);
 	}
 
 	if (fd == -1) {
 		_rtld_error("Cannot open %s: %s", argv0, rtld_strerror(errno));
 		rtld_die();
 	}
 	return (fd);
 }
 
 /*
  * Parse a set of command-line arguments.
  */
 static int
 parse_args(char* argv[], int argc, bool *use_pathp, int *fdp)
 {
 	const char *arg;
 	int fd, i, j, arglen;
 	char opt;
 
 	dbg("Parsing command-line arguments");
 	*use_pathp = false;
 	*fdp = -1;
 
 	for (i = 1; i < argc; i++ ) {
 		arg = argv[i];
 		dbg("argv[%d]: '%s'", i, arg);
 
 		/*
 		 * rtld arguments end with an explicit "--" or with the first
 		 * non-prefixed argument.
 		 */
 		if (strcmp(arg, "--") == 0) {
 			i++;
 			break;
 		}
 		if (arg[0] != '-')
 			break;
 
 		/*
 		 * All other arguments are single-character options that can
 		 * be combined, so we need to search through `arg` for them.
 		 */
 		arglen = strlen(arg);
 		for (j = 1; j < arglen; j++) {
 			opt = arg[j];
 			if (opt == 'h') {
 				print_usage(argv[0]);
 				_exit(0);
 			} else if (opt == 'f') {
 			/*
 			 * -f XX can be used to specify a descriptor for the
 			 * binary named at the command line (i.e., the later
 			 * argument will specify the process name but the
 			 * descriptor is what will actually be executed)
 			 */
 			if (j != arglen - 1) {
 				/* -f must be the last option in, e.g., -abcf */
 				_rtld_error("Invalid options: %s", arg);
 				rtld_die();
 			}
 			i++;
 			fd = parse_integer(argv[i]);
 			if (fd == -1) {
 				_rtld_error("Invalid file descriptor: '%s'",
 				    argv[i]);
 				rtld_die();
 			}
 			*fdp = fd;
 			break;
 			} else if (opt == 'p') {
 				*use_pathp = true;
 			} else {
 				_rtld_error("Invalid argument: '%s'", arg);
 				print_usage(argv[0]);
 				rtld_die();
 			}
 		}
 	}
 
 	return (i);
 }
 
 /*
  * Parse a file descriptor number without pulling in more of libc (e.g. atoi).
  */
 static int
 parse_integer(const char *str)
 {
 	static const int RADIX = 10;  /* XXXJA: possibly support hex? */
 	const char *orig;
 	int n;
 	char c;
 
 	orig = str;
 	n = 0;
 	for (c = *str; c != '\0'; c = *++str) {
 		if (c < '0' || c > '9')
 			return (-1);
 
 		n *= RADIX;
 		n += c - '0';
 	}
 
 	/* Make sure we actually parsed something. */
 	if (str == orig)
 		return (-1);
 	return (n);
 }
 
 static void
 print_usage(const char *argv0)
 {
 
 	rtld_printf("Usage: %s [-h] [-f <FD>] [--] <binary> [<args>]\n"
 		"\n"
 		"Options:\n"
 		"  -h        Display this help message\n"
 		"  -p        Search in PATH for named binary\n"
 		"  -f <FD>   Execute <FD> instead of searching for <binary>\n"
 		"  --        End of RTLD options\n"
 		"  <binary>  Name of process to execute\n"
 		"  <args>    Arguments to the executed process\n", argv0);
 }
 
 /*
  * Overrides for libc_pic-provided functions.
  */
 
 int
 __getosreldate(void)
 {
 	size_t len;
 	int oid[2];
 	int error, osrel;
 
 	if (osreldate != 0)
 		return (osreldate);
 
 	oid[0] = CTL_KERN;
 	oid[1] = KERN_OSRELDATE;
 	osrel = 0;
 	len = sizeof(osrel);
 	error = sysctl(oid, 2, &osrel, &len, NULL, 0);
 	if (error == 0 && osrel > 0 && len == sizeof(osrel))
 		osreldate = osrel;
 	return (osreldate);
 }
 
 void
 exit(int status)
 {
 
 	_exit(status);
 }
 
 void (*__cleanup)(void);
 int __isthreaded = 0;
 int _thread_autoinit_dummy_decl = 1;
 
 /*
  * No unresolved symbols for rtld.
  */
 void
 __pthread_cxa_finalize(struct dl_phdr_info *a __unused)
 {
 }
 
 const char *
 rtld_strerror(int errnum)
 {
 
 	if (errnum < 0 || errnum >= sys_nerr)
 		return ("Unknown error");
 	return (sys_errlist[errnum]);
 }
 
 /*
  * No ifunc relocations.
  */
 void *
 memset(void *dest, int c, size_t len)
 {
 	size_t i;
 
 	for (i = 0; i < len; i++)
 		((char *)dest)[i] = c;
 	return (dest);
 }
 
 void
 bzero(void *dest, size_t len)
 {
 	size_t i;
 
 	for (i = 0; i < len; i++)
 		((char *)dest)[i] = 0;
 }
 
 /* malloc */
 void *
 malloc(size_t nbytes)
 {
 
 	return (__crt_malloc(nbytes));
 }
 
 void *
 calloc(size_t num, size_t size)
 {
 
 	return (__crt_calloc(num, size));
 }
 
 void
 free(void *cp)
 {
 
 	__crt_free(cp);
 }
 
 void *
 realloc(void *cp, size_t nbytes)
 {
 
 	return (__crt_realloc(cp, nbytes));
 }
Index: projects/capsicum-test/libexec/rtld-elf/rtld.h
===================================================================
--- projects/capsicum-test/libexec/rtld-elf/rtld.h	(revision 345709)
+++ projects/capsicum-test/libexec/rtld-elf/rtld.h	(revision 345710)
@@ -1,417 +1,419 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright 1996, 1997, 1998, 1999, 2000 John D. Polstra.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef RTLD_H /* { */
 #define RTLD_H 1
 
 #include <machine/elf.h>
 #include <sys/types.h>
 #include <sys/queue.h>
 
 #include <elf-hints.h>
 #include <link.h>
 #include <stdarg.h>
 #include <setjmp.h>
 #include <stddef.h>
 
 #include "rtld_lock.h"
 #include "rtld_machdep.h"
 
 #define NEW(type)	((type *) xmalloc(sizeof(type)))
 #define CNEW(type)	((type *) xcalloc(1, sizeof(type)))
 
 /* We might as well do booleans like C++. */
 typedef unsigned char bool;
 #define false	0
 #define true	1
 
 extern size_t tls_last_offset;
 extern size_t tls_last_size;
 extern size_t tls_static_space;
 extern Elf_Addr tls_dtv_generation;
 extern int tls_max_index;
 
 extern int npagesizes;
 extern size_t *pagesizes;
 
 extern int main_argc;
 extern char **main_argv;
 extern char **environ;
 
 struct stat;
 struct Struct_Obj_Entry;
 
 /* Lists of shared objects */
 typedef struct Struct_Objlist_Entry {
     STAILQ_ENTRY(Struct_Objlist_Entry) link;
     struct Struct_Obj_Entry *obj;
 } Objlist_Entry;
 
 typedef STAILQ_HEAD(Struct_Objlist, Struct_Objlist_Entry) Objlist;
 
 /* Types of init and fini functions */
 typedef void (*InitFunc)(void);
 typedef void (*InitArrFunc)(int, char **, char **);
 
 /* Lists of shared object dependencies */
 typedef struct Struct_Needed_Entry {
     struct Struct_Needed_Entry *next;
     struct Struct_Obj_Entry *obj;
     unsigned long name;		/* Offset of name in string table */
 } Needed_Entry;
 
 typedef struct Struct_Name_Entry {
     STAILQ_ENTRY(Struct_Name_Entry) link;
     char   name[1];
 } Name_Entry;
 
 /* Lock object */
 typedef struct Struct_LockInfo {
     void *context;		/* Client context for creating locks */
     void *thelock;		/* The one big lock */
     /* Debugging aids. */
     volatile int rcount;	/* Number of readers holding lock */
     volatile int wcount;	/* Number of writers holding lock */
     /* Methods */
     void *(*lock_create)(void *context);
     void (*rlock_acquire)(void *lock);
     void (*wlock_acquire)(void *lock);
     void (*rlock_release)(void *lock);
     void (*wlock_release)(void *lock);
     void (*lock_destroy)(void *lock);
     void (*context_destroy)(void *context);
 } LockInfo;
 
 typedef struct Struct_Ver_Entry {
 	Elf_Word     hash;
 	unsigned int flags;
 	const char  *name;
 	const char  *file;
 } Ver_Entry;
 
 typedef struct Struct_Sym_Match_Result {
     const Elf_Sym *sym_out;
     const Elf_Sym *vsymp;
     int vcount;
 } Sym_Match_Result;
 
 #define VER_INFO_HIDDEN	0x01
 
 /*
  * Shared object descriptor.
  *
  * Items marked with "(%)" are dynamically allocated, and must be freed
  * when the structure is destroyed.
  *
  * CAUTION: It appears that the JDK port peeks into these structures.
  * It looks at "next" and "mapbase" at least.  Don't add new members
  * near the front, until this can be straightened out.
  */
 typedef struct Struct_Obj_Entry {
     /*
      * These two items have to be set right for compatibility with the
      * original ElfKit crt1.o.
      */
     Elf_Size magic;		/* Magic number (sanity check) */
     Elf_Size version;		/* Version number of struct format */
 
     TAILQ_ENTRY(Struct_Obj_Entry) next;
     char *path;			/* Pathname of underlying file (%) */
     char *origin_path;		/* Directory path of origin file */
     int refcount;		/* DAG references */
     int holdcount;		/* Count of transient references */
     int dl_refcount;		/* Number of times loaded by dlopen */
 
     /* These items are computed by map_object() or by digest_phdr(). */
     caddr_t mapbase;		/* Base address of mapped region */
     size_t mapsize;		/* Size of mapped region in bytes */
     Elf_Addr vaddrbase;		/* Base address in shared object file */
     caddr_t relocbase;		/* Relocation constant = mapbase - vaddrbase */
     const Elf_Dyn *dynamic;	/* Dynamic section */
     caddr_t entry;		/* Entry point */
     const Elf_Phdr *phdr;	/* Program header if it is mapped, else NULL */
     size_t phsize;		/* Size of program header in bytes */
     const char *interp;		/* Pathname of the interpreter, if any */
     Elf_Word stack_flags;
 
     /* TLS information */
     int tlsindex;		/* Index in DTV for this module */
     void *tlsinit;		/* Base address of TLS init block */
     size_t tlsinitsize;		/* Size of TLS init block for this module */
     size_t tlssize;		/* Size of TLS block for this module */
     size_t tlsoffset;		/* Offset of static TLS block for this module */
     size_t tlsalign;		/* Alignment of static TLS block */
 
     caddr_t relro_page;
     size_t relro_size;
 
     /* Items from the dynamic section. */
     Elf_Addr *pltgot;		/* PLT or GOT, depending on architecture */
     const Elf_Rel *rel;		/* Relocation entries */
     unsigned long relsize;	/* Size in bytes of relocation info */
     const Elf_Rela *rela;	/* Relocation entries with addend */
     unsigned long relasize;	/* Size in bytes of addend relocation info */
     const Elf_Rel *pltrel;	/* PLT relocation entries */
     unsigned long pltrelsize;	/* Size in bytes of PLT relocation info */
     const Elf_Rela *pltrela;	/* PLT relocation entries with addend */
     unsigned long pltrelasize;	/* Size in bytes of PLT addend reloc info */
     const Elf_Sym *symtab;	/* Symbol table */
     const char *strtab;		/* String table */
     unsigned long strsize;	/* Size in bytes of string table */
 #ifdef __mips__
     Elf_Word local_gotno;	/* Number of local GOT entries */
     Elf_Word symtabno;		/* Number of dynamic symbols */
     Elf_Word gotsym;		/* First dynamic symbol in GOT */
     Elf_Addr *mips_pltgot;	/* Second PLT GOT */
 #endif
 #ifdef __powerpc64__
     Elf_Addr glink;		/* GLINK PLT call stub section */
 #endif
 
     const Elf_Verneed *verneed; /* Required versions. */
     Elf_Word verneednum;	/* Number of entries in verneed table */
     const Elf_Verdef  *verdef;	/* Provided versions. */
     Elf_Word verdefnum;		/* Number of entries in verdef table */
     const Elf_Versym *versyms;  /* Symbol versions table */
 
     const Elf_Hashelt *buckets;	/* Hash table buckets array */
     unsigned long nbuckets;	/* Number of buckets */
     const Elf_Hashelt *chains;	/* Hash table chain array */
     unsigned long nchains;	/* Number of entries in chain array */
 
     Elf32_Word nbuckets_gnu;		/* Number of GNU hash buckets*/
     Elf32_Word symndx_gnu;		/* 1st accessible symbol on dynsym table */
     Elf32_Word maskwords_bm_gnu;  	/* Bloom filter words - 1 (bitmask) */
     Elf32_Word shift2_gnu;		/* Bloom filter shift count */
     Elf32_Word dynsymcount;		/* Total entries in dynsym table */
     const Elf_Addr *bloom_gnu;		/* Bloom filter used by GNU hash func */
     const Elf_Hashelt *buckets_gnu;	/* GNU hash table bucket array */
     const Elf_Hashelt *chain_zero_gnu;	/* GNU hash table value array (Zeroed) */
 
     const char *rpath;		/* Search path specified in object */
     const char *runpath;	/* Search path with different priority */
     Needed_Entry *needed;	/* Shared objects needed by this one (%) */
     Needed_Entry *needed_filtees;
     Needed_Entry *needed_aux_filtees;
 
     STAILQ_HEAD(, Struct_Name_Entry) names; /* List of names for this object we
 					       know about. */
     Ver_Entry *vertab;		/* Versions required /defined by this object */
     int vernum;			/* Number of entries in vertab */
 
     Elf_Addr init;		/* Initialization function to call */
     Elf_Addr fini;		/* Termination function to call */
     Elf_Addr preinit_array;	/* Pre-initialization array of functions */
     Elf_Addr init_array;	/* Initialization array of functions */
     Elf_Addr fini_array;	/* Termination array of functions */
     int preinit_array_num;	/* Number of entries in preinit_array */
     int init_array_num; 	/* Number of entries in init_array */
     int fini_array_num; 	/* Number of entries in fini_array */
 
     int32_t osrel;		/* OSREL note value */
     uint32_t fctl0;		/* FEATURE_CONTROL note desc[0] value */
 
     bool mainprog : 1;		/* True if this is the main program */
     bool rtld : 1;		/* True if this is the dynamic linker */
     bool relocated : 1;		/* True if processed by relocate_objects() */
     bool ver_checked : 1;	/* True if processed by rtld_verify_object_versions */
     bool textrel : 1;		/* True if there are relocations to text seg */
     bool symbolic : 1;		/* True if generated with "-Bsymbolic" */
     bool bind_now : 1;		/* True if all relocations should be made first */
     bool traced : 1;		/* Already printed in ldd trace output */
     bool jmpslots_done : 1;	/* Already have relocated the jump slots */
     bool init_done : 1;		/* Already have added object to init list */
     bool tls_done : 1;		/* Already allocated offset for static TLS */
     bool phdr_alloc : 1;	/* Phdr is allocated and needs to be freed. */
     bool z_origin : 1;		/* Process rpath and soname tokens */
     bool z_nodelete : 1;	/* Do not unload the object and dependencies */
     bool z_noopen : 1;		/* Do not load on dlopen */
     bool z_loadfltr : 1;	/* Immediately load filtees */
     bool z_interpose : 1;	/* Interpose all objects but main */
     bool z_nodeflib : 1;	/* Don't search default library path */
     bool z_global : 1;		/* Make the object global */
+    bool static_tls : 1;	/* Needs static TLS allocation */
+    bool static_tls_copied : 1;	/* Needs static TLS copying */
     bool ref_nodel : 1;		/* Refcount increased to prevent dlclose */
     bool init_scanned: 1;	/* Object is already on init list. */
     bool on_fini_list: 1;	/* Object is already on fini list. */
     bool dag_inited : 1;	/* Object has its DAG initialized. */
     bool filtees_loaded : 1;	/* Filtees loaded */
     bool irelative : 1;		/* Object has R_MACHDEP_IRELATIVE relocs */
     bool gnu_ifunc : 1;		/* Object has references to STT_GNU_IFUNC */
     bool non_plt_gnu_ifunc : 1;	/* Object has non-plt IFUNC references */
     bool ifuncs_resolved : 1;	/* Object ifuncs were already resolved */
     bool crt_no_init : 1;	/* Object' crt does not call _init/_fini */
     bool valid_hash_sysv : 1;	/* A valid System V hash hash tag is available */
     bool valid_hash_gnu : 1;	/* A valid GNU hash tag is available */
     bool dlopened : 1;		/* dlopen()-ed (vs. load statically) */
     bool marker : 1;		/* marker on the global obj list */
     bool unholdfree : 1;	/* unmap upon last unhold */
     bool doomed : 1;		/* Object cannot be referenced */
 
     struct link_map linkmap;	/* For GDB and dlinfo() */
     Objlist dldags;		/* Object belongs to these dlopened DAGs (%) */
     Objlist dagmembers;		/* DAG has these members (%) */
     dev_t dev;			/* Object's filesystem's device */
     ino_t ino;			/* Object's inode number */
     void *priv;			/* Platform-dependent */
 } Obj_Entry;
 
 #define RTLD_MAGIC	0xd550b87a
 #define RTLD_VERSION	1
 
 TAILQ_HEAD(obj_entry_q, Struct_Obj_Entry);
 
 #define RTLD_STATIC_TLS_EXTRA	128
 
 /* Flags to be passed into symlook_ family of functions. */
 #define SYMLOOK_IN_PLT	0x01	/* Lookup for PLT symbol */
 #define SYMLOOK_DLSYM	0x02	/* Return newest versioned symbol. Used by
 				   dlsym. */
 #define	SYMLOOK_EARLY	0x04	/* Symlook is done during initialization. */
 #define	SYMLOOK_IFUNC	0x08	/* Allow IFUNC processing in
 				   reloc_non_plt(). */
 
 /* Flags for load_object(). */
 #define	RTLD_LO_NOLOAD	0x01	/* dlopen() specified RTLD_NOLOAD. */
 #define	RTLD_LO_DLOPEN	0x02	/* Load_object() called from dlopen(). */
 #define	RTLD_LO_TRACE	0x04	/* Only tracing. */
 #define	RTLD_LO_NODELETE 0x08	/* Loaded object cannot be closed. */
 #define	RTLD_LO_FILTEES 0x10	/* Loading filtee. */
 #define	RTLD_LO_EARLY	0x20	/* Do not call ctors, postpone it to the
 				   initialization during the image start. */
 
 /*
  * Symbol cache entry used during relocation to avoid multiple lookups
  * of the same symbol.
  */
 typedef struct Struct_SymCache {
     const Elf_Sym *sym;		/* Symbol table entry */
     const Obj_Entry *obj;	/* Shared object which defines it */
 } SymCache;
 
 /*
  * This structure provides a reentrant way to keep a list of objects and
  * check which ones have already been processed in some way.
  */
 typedef struct Struct_DoneList {
     const Obj_Entry **objs;		/* Array of object pointers */
     unsigned int num_alloc;		/* Allocated size of the array */
     unsigned int num_used;		/* Number of array slots used */
 } DoneList;
 
 struct Struct_RtldLockState {
 	int lockstate;
 	sigjmp_buf env;
 };
 
 struct fill_search_info_args {
 	int request;
 	unsigned int flags;
 	struct dl_serinfo *serinfo;
 	struct dl_serpath *serpath;
 	char *strspace;
 };
 
 /*
  * The pack of arguments and results for the symbol lookup functions.
  */
 typedef struct Struct_SymLook {
     const char *name;
     unsigned long hash;
     uint32_t hash_gnu;
     const Ver_Entry *ventry;
     int flags;
     const Obj_Entry *defobj_out;
     const Elf_Sym *sym_out;
     struct Struct_RtldLockState *lockstate;
 } SymLook;
 
 void _rtld_error(const char *, ...) __printflike(1, 2) __exported;
 void rtld_die(void) __dead2;
 const char *rtld_strerror(int);
 Obj_Entry *map_object(int, const char *, const struct stat *);
 void *xcalloc(size_t, size_t);
 void *xmalloc(size_t);
 char *xstrdup(const char *);
 void *malloc_aligned(size_t size, size_t align);
 void free_aligned(void *ptr);
 extern Elf_Addr _GLOBAL_OFFSET_TABLE_[];
 extern Elf_Sym sym_zero;	/* For resolving undefined weak refs. */
 extern bool ld_bind_not;
 
 void dump_relocations(Obj_Entry *);
 void dump_obj_relocations(Obj_Entry *);
 void dump_Elf_Rel(Obj_Entry *, const Elf_Rel *, u_long);
 void dump_Elf_Rela(Obj_Entry *, const Elf_Rela *, u_long);
 
 /*
  * Function declarations.
  */
 unsigned long elf_hash(const char *);
 const Elf_Sym *find_symdef(unsigned long, const Obj_Entry *,
   const Obj_Entry **, int, SymCache *, struct Struct_RtldLockState *);
 void lockdflt_init(void);
 void digest_notes(Obj_Entry *, Elf_Addr, Elf_Addr);
 Obj_Entry *globallist_curr(const Obj_Entry *obj);
 Obj_Entry *globallist_next(const Obj_Entry *obj);
 void obj_free(Obj_Entry *);
 Obj_Entry *obj_new(void);
 void _rtld_bind_start(void);
 void *rtld_resolve_ifunc(const Obj_Entry *obj, const Elf_Sym *def);
 void symlook_init(SymLook *, const char *);
 int symlook_obj(SymLook *, const Obj_Entry *);
 void *tls_get_addr_common(Elf_Addr** dtvp, int index, size_t offset);
 void *allocate_tls(Obj_Entry *, void *, size_t, size_t);
 void free_tls(void *, size_t, size_t);
 void *allocate_module_tls(int index);
 bool allocate_tls_offset(Obj_Entry *obj);
 void free_tls_offset(Obj_Entry *obj);
 const Ver_Entry *fetch_ventry(const Obj_Entry *obj, unsigned long);
 int convert_prot(int elfflags);
 
 /*
  * MD function declarations.
  */
 int do_copy_relocations(Obj_Entry *);
 int reloc_non_plt(Obj_Entry *, Obj_Entry *, int flags,
     struct Struct_RtldLockState *);
 int reloc_plt(Obj_Entry *, int flags, struct Struct_RtldLockState *);
 int reloc_jmpslots(Obj_Entry *, int flags, struct Struct_RtldLockState *);
 int reloc_iresolve(Obj_Entry *, struct Struct_RtldLockState *);
 int reloc_gnu_ifunc(Obj_Entry *, int flags, struct Struct_RtldLockState *);
 void ifunc_init(Elf_Auxinfo[__min_size(AT_COUNT)]);
 void pre_init(void);
 void init_pltgot(Obj_Entry *);
 void allocate_initial_tls(Obj_Entry *);
 
 void *__crt_calloc(size_t num, size_t size);
 void __crt_free(void *cp);
 void *__crt_malloc(size_t nbytes);
 void *__crt_realloc(void *cp, size_t nbytes);
 
 #endif /* } */
Index: projects/capsicum-test/share/man/man9/Makefile
===================================================================
--- projects/capsicum-test/share/man/man9/Makefile	(revision 345709)
+++ projects/capsicum-test/share/man/man9/Makefile	(revision 345710)
@@ -1,2267 +1,2268 @@
 # $FreeBSD$
 
 .include <src.opts.mk>
 
 PACKAGE=runtime-manuals
 
 MAN=	accept_filter.9 \
 	accf_data.9 \
 	accf_dns.9 \
 	accf_http.9 \
 	acl.9 \
 	alq.9 \
 	altq.9 \
 	atomic.9 \
 	bhnd.9 \
 	bhnd_erom.9 \
 	bios.9 \
 	bitset.9 \
 	boot.9 \
 	bpf.9 \
 	buf.9 \
 	buf_ring.9 \
 	BUF_ISLOCKED.9 \
 	BUF_LOCK.9 \
 	BUF_LOCKFREE.9 \
 	BUF_LOCKINIT.9 \
 	BUF_RECURSED.9 \
 	BUF_TIMELOCK.9 \
 	BUF_UNLOCK.9 \
 	bus_activate_resource.9 \
 	BUS_ADD_CHILD.9 \
 	bus_adjust_resource.9 \
 	bus_alloc_resource.9 \
 	BUS_BIND_INTR.9 \
 	bus_child_present.9 \
 	BUS_CHILD_DELETED.9 \
 	BUS_CHILD_DETACHED.9 \
 	BUS_CONFIG_INTR.9 \
 	BUS_DESCRIBE_INTR.9 \
 	bus_dma.9 \
 	bus_generic_attach.9 \
 	bus_generic_detach.9 \
 	bus_generic_new_pass.9 \
 	bus_generic_print_child.9 \
 	bus_generic_read_ivar.9 \
 	bus_generic_shutdown.9 \
 	BUS_GET_CPUS.9 \
 	bus_get_resource.9 \
 	bus_map_resource.9 \
 	BUS_NEW_PASS.9 \
 	BUS_PRINT_CHILD.9 \
 	BUS_READ_IVAR.9 \
 	BUS_RESCAN.9 \
 	bus_release_resource.9 \
 	bus_set_pass.9 \
 	bus_set_resource.9 \
 	BUS_SETUP_INTR.9 \
 	bus_space.9 \
 	byteorder.9 \
 	casuword.9 \
 	cd.9 \
 	cnv.9 \
 	condvar.9 \
 	config_intrhook.9 \
 	contigmalloc.9 \
 	copy.9 \
 	counter.9 \
 	cpuset.9 \
 	cr_cansee.9 \
 	critical_enter.9 \
 	cr_seeothergids.9 \
 	cr_seeotheruids.9 \
 	crypto.9 \
 	CTASSERT.9 \
 	DB_COMMAND.9 \
 	DECLARE_GEOM_CLASS.9 \
 	DECLARE_MODULE.9 \
 	DELAY.9 \
 	devclass.9 \
 	devclass_find.9 \
 	devclass_get_device.9 \
 	devclass_get_devices.9 \
 	devclass_get_drivers.9 \
 	devclass_get_maxunit.9 \
 	devclass_get_name.9 \
 	devclass_get_softc.9 \
 	dev_clone.9 \
 	devfs_set_cdevpriv.9 \
 	device.9 \
 	device_add_child.9 \
 	DEVICE_ATTACH.9 \
 	device_delete_child.9 \
 	device_delete_children.9 \
 	DEVICE_DETACH.9 \
 	device_enable.9 \
 	device_find_child.9 \
 	device_get_children.9 \
 	device_get_devclass.9 \
 	device_get_driver.9 \
 	device_get_ivars.9 \
 	device_get_name.9 \
 	device_get_parent.9 \
 	device_get_softc.9 \
 	device_get_state.9 \
 	device_get_sysctl.9 \
 	device_get_unit.9 \
 	DEVICE_IDENTIFY.9 \
 	device_printf.9 \
 	DEVICE_PROBE.9 \
 	device_probe_and_attach.9 \
 	device_quiet.9 \
 	device_set_desc.9 \
 	device_set_driver.9 \
 	device_set_flags.9 \
 	DEVICE_SHUTDOWN.9 \
 	DEV_MODULE.9 \
 	dev_refthread.9 \
 	devstat.9 \
 	devtoname.9 \
 	disk.9 \
 	dnv.9 \
 	domain.9 \
 	domainset.9 \
 	dpcpu.9 \
 	drbr.9 \
 	driver.9 \
 	DRIVER_MODULE.9 \
 	efirt.9 \
 	epoch.9 \
 	EVENTHANDLER.9 \
 	eventtimers.9 \
 	extattr.9 \
 	fail.9 \
 	fdt_pinctrl.9 \
 	fetch.9 \
 	firmware.9 \
 	fpu_kern.9 \
 	g_access.9 \
 	g_attach.9 \
 	g_bio.9 \
 	g_consumer.9 \
 	g_data.9 \
 	get_cyclecount.9 \
 	getenv.9 \
 	getnewvnode.9 \
 	g_event.9 \
 	g_geom.9 \
 	g_provider.9 \
 	g_provider_by_name.9 \
 	groupmember.9 \
 	g_wither_geom.9 \
 	hash.9 \
 	hashinit.9 \
 	hexdump.9 \
 	hhook.9 \
 	ieee80211.9 \
 	ieee80211_amrr.9 \
 	ieee80211_beacon.9 \
 	ieee80211_bmiss.9 \
 	ieee80211_crypto.9 \
 	ieee80211_ddb.9 \
 	ieee80211_input.9 \
 	ieee80211_node.9 \
 	ieee80211_output.9 \
 	ieee80211_proto.9 \
 	ieee80211_radiotap.9 \
 	ieee80211_regdomain.9 \
 	ieee80211_scan.9 \
 	ieee80211_vap.9 \
 	iflib.9 \
 	iflibdd.9 \
 	iflibdi.9 \
 	iflibtxrx.9 \
 	ifnet.9 \
 	inittodr.9 \
 	insmntque.9 \
 	intro.9 \
 	ithread.9 \
 	KASSERT.9 \
 	kern_testfrwk.9 \
 	kernacc.9 \
 	kernel_mount.9 \
 	khelp.9 \
 	kobj.9 \
 	kproc.9 \
 	kqueue.9 \
 	kthread.9 \
 	ktr.9 \
 	lock.9 \
 	locking.9 \
 	LOCK_PROFILING.9 \
 	mac.9 \
 	make_dev.9 \
 	malloc.9 \
 	mbchain.9 \
 	mbuf.9 \
 	mbuf_tags.9 \
 	MD5.9 \
 	mdchain.9 \
 	memcchr.9 \
 	memguard.9 \
 	microseq.9 \
 	microtime.9 \
 	microuptime.9 \
 	mi_switch.9 \
 	mod_cc.9 \
 	module.9 \
 	MODULE_DEPEND.9 \
 	MODULE_PNP_INFO.9 \
 	MODULE_VERSION.9 \
 	mtx_pool.9 \
 	mutex.9 \
 	namei.9 \
 	netisr.9 \
 	nv.9 \
 	OF_child.9 \
 	OF_device_from_xref.9 \
 	OF_finddevice.9 \
 	OF_getprop.9 \
 	OF_node_from_xref.9 \
 	OF_package_to_path.9 \
 	ofw_bus_is_compatible.9 \
 	ofw_bus_status_okay.9 \
 	osd.9 \
 	owll.9 \
 	own.9 \
 	panic.9 \
 	PCBGROUP.9 \
 	p_candebug.9 \
 	p_cansee.9 \
 	pci.9 \
 	PCI_IOV_ADD_VF.9 \
 	PCI_IOV_INIT.9 \
 	pci_iov_schema.9 \
 	PCI_IOV_UNINIT.9 \
 	pfil.9 \
 	pfind.9 \
 	pget.9 \
 	pgfind.9 \
 	PHOLD.9 \
 	physio.9 \
 	pmap.9 \
 	pmap_activate.9 \
 	pmap_clear_modify.9 \
 	pmap_copy.9 \
 	pmap_enter.9 \
 	pmap_extract.9 \
 	pmap_growkernel.9 \
 	pmap_init.9 \
 	pmap_is_modified.9 \
 	pmap_is_prefaultable.9 \
 	pmap_map.9 \
 	pmap_mincore.9 \
 	pmap_object_init_pt.9 \
 	pmap_page_exists_quick.9 \
 	pmap_page_init.9 \
 	pmap_pinit.9 \
 	pmap_protect.9 \
 	pmap_qenter.9 \
 	pmap_quick_enter_page.9 \
 	pmap_release.9 \
 	pmap_remove.9 \
 	pmap_resident_count.9 \
 	pmap_unwire.9 \
 	pmap_zero_page.9 \
 	printf.9 \
 	prison_check.9 \
 	priv.9 \
 	proc_rwmem.9 \
 	pseudofs.9 \
 	psignal.9 \
 	pwm.9 \
 	pwmbus.9 \
 	random.9 \
 	random_harvest.9 \
 	ratecheck.9 \
 	redzone.9 \
 	refcount.9 \
 	resettodr.9 \
 	resource_int_value.9 \
 	rijndael.9 \
 	rman.9 \
 	rmlock.9 \
 	rtalloc.9 \
 	rtentry.9 \
 	runqueue.9 \
 	rwlock.9 \
 	sbuf.9 \
 	scheduler.9 \
 	SDT.9 \
 	securelevel_gt.9 \
 	selrecord.9 \
 	sema.9 \
 	sf_buf.9 \
 	sglist.9 \
 	shm_map.9 \
 	signal.9 \
 	sleep.9 \
 	sleepqueue.9 \
 	socket.9 \
 	stack.9 \
 	store.9 \
 	style.9 \
 	style.lua.9 \
 	swi.9 \
 	sx.9 \
 	syscall_helper_register.9 \
 	SYSCALL_MODULE.9 \
 	sysctl.9 \
 	sysctl_add_oid.9 \
 	sysctl_ctx_init.9 \
 	SYSINIT.9 \
 	taskqueue.9 \
 	tcp_functions.9 \
 	thread_exit.9 \
 	time.9 \
 	timeout.9 \
 	tvtohz.9 \
 	ucred.9 \
 	uidinfo.9 \
 	uio.9 \
 	unr.9 \
 	vaccess.9 \
 	vaccess_acl_nfs4.9 \
 	vaccess_acl_posix1e.9 \
 	vcount.9 \
 	vflush.9 \
 	VFS.9 \
 	vfs_busy.9 \
 	VFS_CHECKEXP.9 \
 	vfsconf.9 \
 	VFS_FHTOVP.9 \
 	vfs_getnewfsid.9 \
 	vfs_getopt.9 \
 	vfs_getvfs.9 \
 	VFS_MOUNT.9 \
 	vfs_mountedfrom.9 \
 	VFS_QUOTACTL.9 \
 	VFS_ROOT.9 \
 	vfs_rootmountalloc.9 \
 	VFS_SET.9 \
 	VFS_STATFS.9 \
 	vfs_suser.9 \
 	VFS_SYNC.9 \
 	vfs_timestamp.9 \
 	vfs_unbusy.9 \
 	VFS_UNMOUNT.9 \
 	vfs_unmountall.9 \
 	VFS_VGET.9 \
 	vget.9 \
 	vgone.9 \
 	vhold.9 \
 	vinvalbuf.9 \
 	vm_fault_prefault.9 \
 	vm_map.9 \
 	vm_map_check_protection.9 \
 	vm_map_create.9 \
 	vm_map_delete.9 \
 	vm_map_entry_resize_free.9 \
 	vm_map_find.9 \
 	vm_map_findspace.9 \
 	vm_map_inherit.9 \
 	vm_map_init.9 \
 	vm_map_insert.9 \
 	vm_map_lock.9 \
 	vm_map_lookup.9 \
 	vm_map_madvise.9 \
 	vm_map_max.9 \
 	vm_map_protect.9 \
 	vm_map_remove.9 \
 	vm_map_simplify_entry.9 \
 	vm_map_stack.9 \
 	vm_map_submap.9 \
 	vm_map_sync.9 \
 	vm_map_wire.9 \
 	vm_page_alloc.9 \
 	vm_page_bits.9 \
 	vm_page_busy.9 \
 	vm_page_deactivate.9 \
 	vm_page_dontneed.9 \
 	vm_page_aflag.9 \
 	vm_page_free.9 \
 	vm_page_grab.9 \
 	vm_page_hold.9 \
 	vm_page_insert.9 \
 	vm_page_lookup.9 \
 	vm_page_rename.9 \
 	vm_page_wire.9 \
 	vm_set_page_size.9 \
 	vmem.9 \
 	vn_fullpath.9 \
 	vn_isdisk.9 \
 	vnet.9 \
 	vnode.9 \
 	VOP_ACCESS.9 \
 	VOP_ACLCHECK.9 \
 	VOP_ADVISE.9 \
 	VOP_ADVLOCK.9 \
 	VOP_ALLOCATE.9 \
 	VOP_ATTRIB.9 \
 	VOP_BWRITE.9 \
 	VOP_CREATE.9 \
 	VOP_FSYNC.9 \
 	VOP_GETACL.9 \
 	VOP_GETEXTATTR.9 \
 	VOP_GETPAGES.9 \
 	VOP_INACTIVE.9 \
 	VOP_IOCTL.9 \
 	VOP_LINK.9 \
 	VOP_LISTEXTATTR.9 \
 	VOP_LOCK.9 \
 	VOP_LOOKUP.9 \
 	VOP_OPENCLOSE.9 \
 	VOP_PATHCONF.9 \
 	VOP_PRINT.9 \
 	VOP_RDWR.9 \
 	VOP_READDIR.9 \
 	VOP_READLINK.9 \
 	VOP_REALLOCBLKS.9 \
 	VOP_REMOVE.9 \
 	VOP_RENAME.9 \
 	VOP_REVOKE.9 \
 	VOP_SETACL.9 \
 	VOP_SETEXTATTR.9 \
 	VOP_STRATEGY.9 \
 	VOP_VPTOCNP.9 \
 	VOP_VPTOFH.9 \
 	vref.9 \
 	vrefcnt.9 \
 	vrele.9 \
 	vslock.9 \
 	watchdog.9 \
 	zone.9
 
 MLINKS=	unr.9 alloc_unr.9 \
 	unr.9 alloc_unrl.9 \
 	unr.9 alloc_unr_specific.9 \
 	unr.9 clear_unrhdr.9 \
 	unr.9 delete_unrhdr.9 \
 	unr.9 free_unr.9 \
 	unr.9 new_unrhdr.9
 MLINKS+=accept_filter.9 accept_filt_add.9 \
 	accept_filter.9 accept_filt_del.9 \
 	accept_filter.9 accept_filt_generic_mod_event.9 \
 	accept_filter.9 accept_filt_get.9
 MLINKS+=alq.9 ALQ.9 \
 	alq.9 alq_close.9 \
 	alq.9 alq_flush.9 \
 	alq.9 alq_get.9 \
 	alq.9 alq_getn.9 \
 	alq.9 alq_open.9 \
 	alq.9 alq_open_flags.9 \
 	alq.9 alq_post.9 \
 	alq.9 alq_post_flags.9 \
 	alq.9 alq_write.9 \
 	alq.9 alq_writen.9
 MLINKS+=altq.9 ALTQ.9
 MLINKS+=atomic.9 atomic_add.9 \
 	atomic.9 atomic_clear.9 \
 	atomic.9 atomic_cmpset.9 \
 	atomic.9 atomic_fcmpset.9 \
 	atomic.9 atomic_fetchadd.9 \
 	atomic.9 atomic_load.9 \
 	atomic.9 atomic_readandclear.9 \
 	atomic.9 atomic_set.9 \
 	atomic.9 atomic_store.9 \
 	atomic.9 atomic_subtract.9 \
 	atomic.9 atomic_swap.9 \
 	atomic.9 atomic_testandclear.9 \
 	atomic.9 atomic_testandset.9 \
 	atomic.9 atomic_thread_fence.9
 MLINKS+=bhnd.9 BHND_MATCH_BOARD_TYPE.9 \
 	bhnd.9 BHND_MATCH_BOARD_VENDOR.9 \
 	bhnd.9 BHND_MATCH_CHIP_ID.9 \
 	bhnd.9 BHND_MATCH_CHIP_PKG.9 \
 	bhnd.9 BHND_MATCH_CHIP_REV.9 \
 	bhnd.9 BHND_MATCH_CORE_ID.9 \
 	bhnd.9 BHND_MATCH_CORE_VENDOR.9 \
 	bhnd.9 bhnd_activate_resource.9 \
 	bhnd.9 bhnd_alloc_pmu.9 \
 	bhnd.9 bhnd_alloc_resource.9 \
 	bhnd.9 bhnd_alloc_resource_any.9 \
 	bhnd.9 bhnd_alloc_resources.9 \
 	bhnd.9 bhnd_board_matches.9 \
 	bhnd.9 bhnd_bus_match_child.9 \
 	bhnd.9 bhnd_bus_read_1.9 \
 	bhnd.9 bhnd_bus_read_2.9 \
 	bhnd.9 bhnd_bus_read_4.9 \
 	bhnd.9 bhnd_bus_read_stream_1.9 \
 	bhnd.9 bhnd_bus_read_stream_2.9 \
 	bhnd.9 bhnd_bus_read_stream_4.9 \
 	bhnd.9 bhnd_bus_write_1.9 \
 	bhnd.9 bhnd_bus_write_2.9 \
 	bhnd.9 bhnd_bus_write_4.9 \
 	bhnd.9 bhnd_bus_write_stream_1.9 \
 	bhnd.9 bhnd_bus_write_stream_2.9 \
 	bhnd.9 bhnd_bus_write_stream_4.9 \
 	bhnd.9 bhnd_chip_matches.9 \
 	bhnd.9 bhnd_core_class.9 \
 	bhnd.9 bhnd_core_get_match_desc.9 \
 	bhnd.9 bhnd_core_matches.9 \
 	bhnd.9 bhnd_core_name.9 \
 	bhnd.9 bhnd_cores_equal.9 \
 	bhnd.9 bhnd_deactivate_resource.9 \
 	bhnd.9 bhnd_decode_port_rid.9 \
 	bhnd.9 bhnd_deregister_provider.9 \
 	bhnd.9 bhnd_device_lookup.9 \
 	bhnd.9 bhnd_device_matches.9 \
 	bhnd.9 bhnd_device_quirks.9 \
 	bhnd.9 bhnd_driver_get_erom_class.9 \
 	bhnd.9 bhnd_enable_clocks.9 \
 	bhnd.9 bhnd_find_core_class.9 \
 	bhnd.9 bhnd_find_core_name.9 \
 	bhnd.9 bhnd_format_chip_id.9 \
 	bhnd.9 bhnd_get_attach_type.9 \
 	bhnd.9 bhnd_get_chipid.9 \
 	bhnd.9 bhnd_get_class.9 \
 	bhnd.9 bhnd_get_clock_freq.9 \
 	bhnd.9 bhnd_get_clock_latency.9 \
 	bhnd.9 bhnd_get_core_index.9 \
 	bhnd.9 bhnd_get_core_info.9 \
 	bhnd.9 bhnd_get_core_unit.9 \
 	bhnd.9 bhnd_get_device.9 \
 	bhnd.9 bhnd_get_device_name.9 \
 	bhnd.9 bhnd_get_dma_translation.9 \
 	bhnd.9 bhnd_get_hwrev.9 \
 	bhnd.9 bhnd_get_intr_count.9 \
 	bhnd.9 bhnd_get_intr_ivec.9 \
 	bhnd.9 bhnd_get_port_count.9 \
 	bhnd.9 bhnd_get_port_rid.9 \
 	bhnd.9 bhnd_get_region_addr.9 \
 	bhnd.9 bhnd_get_region_count.9 \
 	bhnd.9 bhnd_get_vendor.9 \
 	bhnd.9 bhnd_get_vendor_name.9 \
 	bhnd.9 bhnd_hwrev_matches.9 \
 	bhnd.9 bhnd_is_hw_suspended.9 \
 	bhnd.9 bhnd_is_region_valid.9 \
 	bhnd.9 bhnd_map_intr.9 \
 	bhnd.9 bhnd_match_core.9 \
 	bhnd.9 bhnd_nvram_getvar.9 \
 	bhnd.9 bhnd_nvram_getvar_array.9 \
 	bhnd.9 bhnd_nvram_getvar_int.9 \
 	bhnd.9 bhnd_nvram_getvar_int16.9 \
 	bhnd.9 bhnd_nvram_getvar_int32.9 \
 	bhnd.9 bhnd_nvram_getvar_int8.9 \
 	bhnd.9 bhnd_nvram_getvar_str.9 \
 	bhnd.9 bhnd_nvram_getvar_uint.9 \
 	bhnd.9 bhnd_nvram_getvar_uint16.9 \
 	bhnd.9 bhnd_nvram_getvar_uint32.9 \
 	bhnd.9 bhnd_nvram_getvar_uint8.9 \
 	bhnd.9 bhnd_nvram_string_array_next.9 \
 	bhnd.9 bhnd_read_board_info.9 \
 	bhnd.9 bhnd_read_config.9 \
 	bhnd.9 bhnd_read_ioctl.9 \
 	bhnd.9 bhnd_read_iost.9 \
 	bhnd.9 bhnd_register_provider.9 \
 	bhnd.9 bhnd_release_ext_rsrc.9 \
 	bhnd.9 bhnd_release_pmu.9 \
 	bhnd.9 bhnd_release_provider.9 \
 	bhnd.9 bhnd_release_resource.9 \
 	bhnd.9 bhnd_release_resources.9 \
 	bhnd.9 bhnd_request_clock.9 \
 	bhnd.9 bhnd_request_ext_rsrc.9 \
 	bhnd.9 bhnd_reset_hw.9 \
 	bhnd.9 bhnd_retain_provider.9 \
 	bhnd.9 bhnd_set_custom_core_desc.9 \
 	bhnd.9 bhnd_set_default_core_desc.9 \
 	bhnd.9 bhnd_suspend_hw.9 \
 	bhnd.9 bhnd_unmap_intr.9 \
 	bhnd.9 bhnd_vendor_name.9 \
 	bhnd.9 bhnd_write_config.9 \
 	bhnd.9 bhnd_write_ioctl.9
 MLINKS+=bhnd_erom.9 bhnd_erom_alloc.9 \
 	bhnd_erom.9 bhnd_erom_dump.9 \
 	bhnd_erom.9 bhnd_erom_fini_static.9 \
 	bhnd_erom.9 bhnd_erom_free.9 \
 	bhnd_erom.9 bhnd_erom_free_core_table.9 \
 	bhnd_erom.9 bhnd_erom_get_core_table.9 \
 	bhnd_erom.9 bhnd_erom_init_static.9 \
 	bhnd_erom.9 bhnd_erom_io.9 \
 	bhnd_erom.9 bhnd_erom_io_fini.9 \
 	bhnd_erom.9 bhnd_erom_io_map.9 \
 	bhnd_erom.9 bhnd_erom_io_read.9 \
 	bhnd_erom.9 bhnd_erom_iobus_init.9 \
 	bhnd_erom.9 bhnd_erom_iores_new.9 \
 	bhnd_erom.9 bhnd_erom_lookup_core.9 \
 	bhnd_erom.9 bhnd_erom_lookup_core_addr.9 \
 	bhnd_erom.9 bhnd_erom_probe.9 \
 	bhnd_erom.9 bhnd_erom_probe_driver_classes.9
 MLINKS+=bitset.9 BITSET_DEFINE.9 \
 	bitset.9 BITSET_T_INITIALIZER.9 \
 	bitset.9 BITSET_FSET.9 \
 	bitset.9 BIT_CLR.9 \
 	bitset.9 BIT_COPY.9 \
 	bitset.9 BIT_ISSET.9 \
 	bitset.9 BIT_SET.9 \
 	bitset.9 BIT_ZERO.9 \
 	bitset.9 BIT_FILL.9 \
 	bitset.9 BIT_SETOF.9 \
 	bitset.9 BIT_EMPTY.9 \
 	bitset.9 BIT_ISFULLSET.9 \
 	bitset.9 BIT_FFS.9 \
 	bitset.9 BIT_COUNT.9 \
 	bitset.9 BIT_SUBSET.9 \
 	bitset.9 BIT_OVERLAP.9 \
 	bitset.9 BIT_CMP.9 \
 	bitset.9 BIT_OR.9 \
 	bitset.9 BIT_AND.9 \
 	bitset.9 BIT_NAND.9 \
 	bitset.9 BIT_CLR_ATOMIC.9 \
 	bitset.9 BIT_SET_ATOMIC.9 \
 	bitset.9 BIT_SET_ATOMIC_ACQ.9 \
 	bitset.9 BIT_AND_ATOMIC.9 \
 	bitset.9 BIT_OR_ATOMIC.9 \
 	bitset.9 BIT_COPY_STORE_REL.9
 MLINKS+=bpf.9 bpfattach.9 \
 	bpf.9 bpfattach2.9 \
 	bpf.9 bpfdetach.9 \
 	bpf.9 bpf_filter.9 \
 	bpf.9 bpf_mtap.9 \
 	bpf.9 bpf_mtap2.9 \
 	bpf.9 bpf_tap.9 \
 	bpf.9 bpf_validate.9
 MLINKS+=buf.9 bp.9
 MLINKS+=buf_ring.9 buf_ring_alloc.9 \
 	buf_ring.9 buf_ring_free.9 \
 	buf_ring.9 buf_ring_enqueue.9 \
 	buf_ring.9 buf_ring_enqueue_bytes.9 \
 	buf_ring.9 buf_ring_dequeue_mc.9 \
 	buf_ring.9 buf_ring_dequeue_sc.9 \
 	buf_ring.9 buf_ring_count.9 \
 	buf_ring.9 buf_ring_empty.9 \
 	buf_ring.9 buf_ring_full.9 \
 	buf_ring.9 buf_ring_peek.9
 MLINKS+=bus_activate_resource.9 bus_deactivate_resource.9
 MLINKS+=bus_alloc_resource.9 bus_alloc_resource_any.9
 MLINKS+=BUS_BIND_INTR.9 bus_bind_intr.9
 MLINKS+=BUS_DESCRIBE_INTR.9 bus_describe_intr.9
 MLINKS+=bus_dma.9 busdma.9 \
 	bus_dma.9 bus_dmamap_create.9 \
 	bus_dma.9 bus_dmamap_destroy.9 \
 	bus_dma.9 bus_dmamap_load.9 \
 	bus_dma.9 bus_dmamap_load_bio.9 \
 	bus_dma.9 bus_dmamap_load_ccb.9 \
 	bus_dma.9 bus_dmamap_load_mbuf.9 \
 	bus_dma.9 bus_dmamap_load_mbuf_sg.9 \
 	bus_dma.9 bus_dmamap_load_uio.9 \
 	bus_dma.9 bus_dmamap_sync.9 \
 	bus_dma.9 bus_dmamap_unload.9 \
 	bus_dma.9 bus_dmamem_alloc.9 \
 	bus_dma.9 bus_dmamem_free.9 \
 	bus_dma.9 bus_dma_tag_create.9 \
 	bus_dma.9 bus_dma_tag_destroy.9
 MLINKS+=bus_generic_read_ivar.9 bus_generic_write_ivar.9
 MLINKS+=BUS_GET_CPUS.9 bus_get_cpus.9
 MLINKS+=bus_map_resource.9 bus_unmap_resource.9 \
 	bus_map_resource.9 resource_init_map_request.9
 MLINKS+=BUS_READ_IVAR.9 BUS_WRITE_IVAR.9
 MLINKS+=BUS_SETUP_INTR.9 bus_setup_intr.9 \
 	BUS_SETUP_INTR.9 BUS_TEARDOWN_INTR.9 \
 	BUS_SETUP_INTR.9 bus_teardown_intr.9
 MLINKS+=bus_space.9 bus_space_alloc.9 \
 	bus_space.9 bus_space_barrier.9 \
 	bus_space.9 bus_space_copy_region_1.9 \
 	bus_space.9 bus_space_copy_region_2.9 \
 	bus_space.9 bus_space_copy_region_4.9 \
 	bus_space.9 bus_space_copy_region_8.9 \
 	bus_space.9 bus_space_copy_region_stream_1.9 \
 	bus_space.9 bus_space_copy_region_stream_2.9 \
 	bus_space.9 bus_space_copy_region_stream_4.9 \
 	bus_space.9 bus_space_copy_region_stream_8.9 \
 	bus_space.9 bus_space_free.9 \
 	bus_space.9 bus_space_map.9 \
 	bus_space.9 bus_space_read_1.9 \
 	bus_space.9 bus_space_read_2.9 \
 	bus_space.9 bus_space_read_4.9 \
 	bus_space.9 bus_space_read_8.9 \
 	bus_space.9 bus_space_read_multi_1.9 \
 	bus_space.9 bus_space_read_multi_2.9 \
 	bus_space.9 bus_space_read_multi_4.9 \
 	bus_space.9 bus_space_read_multi_8.9 \
 	bus_space.9 bus_space_read_multi_stream_1.9 \
 	bus_space.9 bus_space_read_multi_stream_2.9 \
 	bus_space.9 bus_space_read_multi_stream_4.9 \
 	bus_space.9 bus_space_read_multi_stream_8.9 \
 	bus_space.9 bus_space_read_region_1.9 \
 	bus_space.9 bus_space_read_region_2.9 \
 	bus_space.9 bus_space_read_region_4.9 \
 	bus_space.9 bus_space_read_region_8.9 \
 	bus_space.9 bus_space_read_region_stream_1.9 \
 	bus_space.9 bus_space_read_region_stream_2.9 \
 	bus_space.9 bus_space_read_region_stream_4.9 \
 	bus_space.9 bus_space_read_region_stream_8.9 \
 	bus_space.9 bus_space_read_stream_1.9 \
 	bus_space.9 bus_space_read_stream_2.9 \
 	bus_space.9 bus_space_read_stream_4.9 \
 	bus_space.9 bus_space_read_stream_8.9 \
 	bus_space.9 bus_space_set_multi_1.9 \
 	bus_space.9 bus_space_set_multi_2.9 \
 	bus_space.9 bus_space_set_multi_4.9 \
 	bus_space.9 bus_space_set_multi_8.9 \
 	bus_space.9 bus_space_set_multi_stream_1.9 \
 	bus_space.9 bus_space_set_multi_stream_2.9 \
 	bus_space.9 bus_space_set_multi_stream_4.9 \
 	bus_space.9 bus_space_set_multi_stream_8.9 \
 	bus_space.9 bus_space_set_region_1.9 \
 	bus_space.9 bus_space_set_region_2.9 \
 	bus_space.9 bus_space_set_region_4.9 \
 	bus_space.9 bus_space_set_region_8.9 \
 	bus_space.9 bus_space_set_region_stream_1.9 \
 	bus_space.9 bus_space_set_region_stream_2.9 \
 	bus_space.9 bus_space_set_region_stream_4.9 \
 	bus_space.9 bus_space_set_region_stream_8.9 \
 	bus_space.9 bus_space_subregion.9 \
 	bus_space.9 bus_space_unmap.9 \
 	bus_space.9 bus_space_write_1.9 \
 	bus_space.9 bus_space_write_2.9 \
 	bus_space.9 bus_space_write_4.9 \
 	bus_space.9 bus_space_write_8.9 \
 	bus_space.9 bus_space_write_multi_1.9 \
 	bus_space.9 bus_space_write_multi_2.9 \
 	bus_space.9 bus_space_write_multi_4.9 \
 	bus_space.9 bus_space_write_multi_8.9 \
 	bus_space.9 bus_space_write_multi_stream_1.9 \
 	bus_space.9 bus_space_write_multi_stream_2.9 \
 	bus_space.9 bus_space_write_multi_stream_4.9 \
 	bus_space.9 bus_space_write_multi_stream_8.9 \
 	bus_space.9 bus_space_write_region_1.9 \
 	bus_space.9 bus_space_write_region_2.9 \
 	bus_space.9 bus_space_write_region_4.9 \
 	bus_space.9 bus_space_write_region_8.9 \
 	bus_space.9 bus_space_write_region_stream_1.9 \
 	bus_space.9 bus_space_write_region_stream_2.9 \
 	bus_space.9 bus_space_write_region_stream_4.9 \
 	bus_space.9 bus_space_write_region_stream_8.9 \
 	bus_space.9 bus_space_write_stream_1.9 \
 	bus_space.9 bus_space_write_stream_2.9 \
 	bus_space.9 bus_space_write_stream_4.9 \
 	bus_space.9 bus_space_write_stream_8.9
 MLINKS+=byteorder.9 be16dec.9 \
 	byteorder.9 be16enc.9 \
 	byteorder.9 be16toh.9 \
 	byteorder.9 be32dec.9 \
 	byteorder.9 be32enc.9 \
 	byteorder.9 be32toh.9 \
 	byteorder.9 be64dec.9 \
 	byteorder.9 be64enc.9 \
 	byteorder.9 be64toh.9 \
 	byteorder.9 bswap16.9 \
 	byteorder.9 bswap32.9 \
 	byteorder.9 bswap64.9 \
 	byteorder.9 htobe16.9 \
 	byteorder.9 htobe32.9 \
 	byteorder.9 htobe64.9 \
 	byteorder.9 htole16.9 \
 	byteorder.9 htole32.9 \
 	byteorder.9 htole64.9 \
 	byteorder.9 le16dec.9 \
 	byteorder.9 le16enc.9 \
 	byteorder.9 le16toh.9 \
 	byteorder.9 le32dec.9 \
 	byteorder.9 le32enc.9 \
 	byteorder.9 le32toh.9 \
 	byteorder.9 le64dec.9 \
 	byteorder.9 le64enc.9 \
 	byteorder.9 le64toh.9
 MLINKS+=cnv.9 cnvlist.9 \
 	cnv.9 cnvlist_free_binary.9 \
 	cnv.9 cnvlist_free_bool.9 \
 	cnv.9 cnvlist_free_bool_array.9 \
 	cnv.9 cnvlist_free_descriptor.9 \
 	cnv.9 cnvlist_free_descriptor_array.9 \
 	cnv.9 cnvlist_free_null.9 \
 	cnv.9 cnvlist_free_number.9 \
 	cnv.9 cnvlist_free_number_array.9 \
 	cnv.9 cnvlist_free_nvlist.9 \
 	cnv.9 cnvlist_free_nvlist_array.9 \
 	cnv.9 cnvlist_free_string.9 \
 	cnv.9 cnvlist_free_string_array.9 \
 	cnv.9 cnvlist_get_binary.9 \
 	cnv.9 cnvlist_get_bool.9 \
 	cnv.9 cnvlist_get_bool_array.9 \
 	cnv.9 cnvlist_get_descriptor.9 \
 	cnv.9 cnvlist_get_descriptor_array.9 \
 	cnv.9 cnvlist_get_number.9 \
 	cnv.9 cnvlist_get_number_array.9 \
 	cnv.9 cnvlist_get_nvlist.9 \
 	cnv.9 cnvlist_get_nvlist_array.9 \
 	cnv.9 cnvlist_get_string.9 \
 	cnv.9 cnvlist_get_string_array.9 \
 	cnv.9 cnvlist_take_binary.9 \
 	cnv.9 cnvlist_take_bool.9 \
 	cnv.9 cnvlist_take_bool_array.9 \
 	cnv.9 cnvlist_take_descriptor.9 \
 	cnv.9 cnvlist_take_descriptor_array.9 \
 	cnv.9 cnvlist_take_number.9 \
 	cnv.9 cnvlist_take_number_array.9 \
 	cnv.9 cnvlist_take_nvlist.9 \
 	cnv.9 cnvlist_take_nvlist_array.9 \
 	cnv.9 cnvlist_take_string.9 \
 	cnv.9 cnvlist_take_string_array.9
 MLINKS+=condvar.9 cv_broadcast.9 \
 	condvar.9 cv_broadcastpri.9 \
 	condvar.9 cv_destroy.9 \
 	condvar.9 cv_init.9 \
 	condvar.9 cv_signal.9 \
 	condvar.9 cv_timedwait.9 \
 	condvar.9 cv_timedwait_sig.9 \
 	condvar.9 cv_timedwait_sig_sbt.9 \
 	condvar.9 cv_wait.9 \
 	condvar.9 cv_wait_sig.9 \
 	condvar.9 cv_wait_unlock.9 \
 	condvar.9 cv_wmesg.9
 MLINKS+=config_intrhook.9 config_intrhook_disestablish.9 \
 	config_intrhook.9 config_intrhook_establish.9 \
 	config_intrhook.9 config_intrhook_oneshot.9
 MLINKS+=contigmalloc.9 contigmalloc_domainset.9 \
 	contigmalloc.9 contigfree.9
 MLINKS+=casuword.9 casueword.9 \
 	casuword.9 casueword32.9 \
 	casuword.9 casuword32.9
 MLINKS+=copy.9 copyin.9 \
 	copy.9 copyin_nofault.9 \
 	copy.9 copyinstr.9 \
 	copy.9 copyout.9 \
 	copy.9 copyout_nofault.9 \
 	copy.9 copystr.9
 MLINKS+=counter.9 counter_u64_alloc.9 \
 	counter.9 counter_u64_free.9 \
 	counter.9 counter_u64_add.9 \
 	counter.9 counter_enter.9 \
 	counter.9 counter_exit.9 \
 	counter.9 counter_u64_add_protected.9 \
 	counter.9 counter_u64_fetch.9 \
 	counter.9 counter_u64_zero.9 \
 	counter.9 SYSCTL_COUNTER_U64.9 \
 	counter.9 SYSCTL_ADD_COUNTER_U64.9 \
 	counter.9 SYSCTL_COUNTER_U64_ARRAY.9 \
 	counter.9 SYSCTL_ADD_COUNTER_U64_ARRAY.9
 MLINKS+=cpuset.9 CPUSET_T_INITIALIZER.9 \
 	cpuset.9 CPUSET_FSET.9 \
 	cpuset.9 CPU_CLR.9 \
 	cpuset.9 CPU_COPY.9 \
 	cpuset.9 CPU_ISSET.9 \
 	cpuset.9 CPU_SET.9 \
 	cpuset.9 CPU_ZERO.9 \
 	cpuset.9 CPU_FILL.9 \
 	cpuset.9 CPU_SETOF.9 \
 	cpuset.9 CPU_EMPTY.9 \
 	cpuset.9 CPU_ISFULLSET.9 \
 	cpuset.9 CPU_FFS.9 \
 	cpuset.9 CPU_COUNT.9 \
 	cpuset.9 CPU_SUBSET.9 \
 	cpuset.9 CPU_OVERLAP.9 \
 	cpuset.9 CPU_CMP.9 \
 	cpuset.9 CPU_OR.9 \
 	cpuset.9 CPU_AND.9 \
 	cpuset.9 CPU_NAND.9 \
 	cpuset.9 CPU_CLR_ATOMIC.9 \
 	cpuset.9 CPU_SET_ATOMIC.9 \
 	cpuset.9 CPU_SET_ATOMIC_ACQ.9 \
 	cpuset.9 CPU_AND_ATOMIC.9 \
 	cpuset.9 CPU_OR_ATOMIC.9 \
 	cpuset.9 CPU_COPY_STORE_REL.9
 MLINKS+=critical_enter.9 critical.9 \
 	critical_enter.9 critical_exit.9
 MLINKS+=crypto.9 crypto_dispatch.9 \
 	crypto.9 crypto_done.9 \
 	crypto.9 crypto_freereq.9 \
 	crypto.9 crypto_freesession.9 \
 	crypto.9 crypto_get_driverid.9 \
 	crypto.9 crypto_getreq.9 \
 	crypto.9 crypto_kdispatch.9 \
 	crypto.9 crypto_kdone.9 \
 	crypto.9 crypto_kregister.9 \
 	crypto.9 crypto_newsession.9 \
 	crypto.9 crypto_register.9 \
 	crypto.9 crypto_unblock.9 \
 	crypto.9 crypto_unregister.9 \
 	crypto.9 crypto_unregister_all.9
 MLINKS+=DB_COMMAND.9 DB_SHOW_ALL_COMMAND.9 \
 	DB_COMMAND.9 DB_SHOW_COMMAND.9
 MLINKS+=DECLARE_MODULE.9 DECLARE_MODULE_TIED.9
 MLINKS+=dev_clone.9 drain_dev_clone_events.9
 MLINKS+=dev_refthread.9 devvn_refthread.9 \
 	dev_refthread.9 dev_relthread.9
 MLINKS+=devfs_set_cdevpriv.9 devfs_clear_cdevpriv.9 \
 	devfs_set_cdevpriv.9 devfs_get_cdevpriv.9
 MLINKS+=device_add_child.9 device_add_child_ordered.9
 MLINKS+=device_enable.9 device_disable.9 \
 	device_enable.9 device_is_enabled.9
 MLINKS+=device_get_ivars.9 device_set_ivars.9
 MLINKS+=device_get_name.9 device_get_nameunit.9
 MLINKS+=device_get_state.9 device_busy.9 \
 	device_get_state.9 device_is_alive.9 \
 	device_get_state.9 device_is_attached.9 \
 	device_get_state.9 device_unbusy.9
 MLINKS+=device_get_sysctl.9 device_get_sysctl_ctx.9 \
 	device_get_sysctl.9 device_get_sysctl_tree.9
 MLINKS+=device_quiet.9 device_is_quiet.9 \
 	device_quiet.9 device_verbose.9
 MLINKS+=device_set_desc.9 device_get_desc.9 \
 	device_set_desc.9 device_set_desc_copy.9
 MLINKS+=device_set_flags.9 device_get_flags.9
 MLINKS+=devstat.9 devicestat.9 \
 	devstat.9 devstat_add_entry.9 \
 	devstat.9 devstat_end_transaction.9 \
 	devstat.9 devstat_remove_entry.9 \
 	devstat.9 devstat_start_transaction.9
 MLINKS+=disk.9 disk_add_alias.9 \
 	disk.9 disk_alloc.9 \
 	disk.9 disk_create.9 \
 	disk.9 disk_destroy.9 \
 	disk.9 disk_gone.9 \
 	disk.9 disk_resize.9
 MLINKS+=dnv.9 dnvlist.9 \
 	dnv.9 dnvlist_get_binary.9 \
 	dnv.9 dnvlist_get_bool.9 \
 	dnv.9 dnvlist_get_descriptor.9 \
 	dnv.9 dnvlist_get_number.9 \
 	dnv.9 dnvlist_get_nvlist.9 \
 	dnv.9 dnvlist_get_string.9 \
 	dnv.9 dnvlist_take_binary.9 \
 	dnv.9 dnvlist_take_bool.9 \
 	dnv.9 dnvlist_take_descriptor.9 \
 	dnv.9 dnvlist_take_number.9 \
 	dnv.9 dnvlist_take_nvlist.9 \
 	dnv.9 dnvlist_take_string.9
 MLINKS+=domain.9 DOMAIN_SET.9 \
 	domain.9 domain_add.9 \
 	domain.9 pfctlinput.9 \
 	domain.9 pfctlinput2.9 \
 	domain.9 pffinddomain.9 \
 	domain.9 pffindproto.9 \
 	domain.9 pffindtype.9
 MLINKS+=drbr.9 drbr_free.9 \
 	drbr.9 drbr_enqueue.9 \
 	drbr.9 drbr_dequeue.9 \
 	drbr.9 drbr_dequeue_cond.9 \
 	drbr.9 drbr_flush.9 \
 	drbr.9 drbr_empty.9 \
 	drbr.9 drbr_inuse.9 \
 	drbr.9 drbr_stats_update.9
 MLINKS+=DRIVER_MODULE.9 DRIVER_MODULE_ORDERED.9 \
 	DRIVER_MODULE.9 EARLY_DRIVER_MODULE.9 \
 	DRIVER_MODULE.9 EARLY_DRIVER_MODULE_ORDERED.9
 MLINKS+=EVENTHANDLER.9 EVENTHANDLER_DECLARE.9 \
 	EVENTHANDLER.9 EVENTHANDLER_DEFINE.9 \
 	EVENTHANDLER.9 EVENTHANDLER_DEREGISTER.9 \
 	EVENTHANDLER.9 eventhandler_deregister.9 \
 	EVENTHANDLER.9 eventhandler_find_list.9 \
 	EVENTHANDLER.9 EVENTHANDLER_INVOKE.9 \
 	EVENTHANDLER.9 eventhandler_prune_list.9 \
 	EVENTHANDLER.9 EVENTHANDLER_REGISTER.9 \
 	EVENTHANDLER.9 eventhandler_register.9
 MLINKS+=eventtimers.9 et_register.9 \
 	eventtimers.9 et_deregister.9 \
 	eventtimers.9 et_ban.9 \
 	eventtimers.9 et_find.9 \
 	eventtimers.9 et_free.9 \
 	eventtimers.9 et_init.9 \
 	eventtimers.9 ET_LOCK.9 \
 	eventtimers.9 ET_UNLOCK.9 \
 	eventtimers.9 et_start.9 \
 	eventtimers.9 et_stop.9
 MLINKS+=fail.9 KFAIL_POINT_CODE.9 \
 	fail.9 KFAIL_POINT_ERROR.9 \
 	fail.9 KFAIL_POINT_GOTO.9 \
 	fail.9 KFAIL_POINT_RETURN.9 \
 	fail.9 KFAIL_POINT_RETURN_VOID.9
 MLINKS+=fdt_pinctrl.9 fdt_pinctrl_configure.9 \
 	fdt_pinctrl.9 fdt_pinctrl_configure_by_name.9 \
 	fdt_pinctrl.9 fdt_pinctrl_configure_tree.9 \
 	fdt_pinctrl.9 fdt_pinctrl_register.9
 MLINKS+=fetch.9 fubyte.9 \
 	fetch.9 fuword.9 \
 	fetch.9 fuword16.9 \
 	fetch.9 fuword32.9 \
 	fetch.9 fuword64.9 \
 	fetch.9 fueword.9 \
 	fetch.9 fueword32.9 \
 	fetch.9 fueword64.9
 MLINKS+=firmware.9 firmware_get.9 \
 	firmware.9 firmware_put.9 \
 	firmware.9 firmware_register.9 \
 	firmware.9 firmware_unregister.9
 MLINKS+=fpu_kern.9 fpu_kern_alloc_ctx.9 \
 	fpu_kern.9 fpu_kern_free_ctx.9 \
 	fpu_kern.9 fpu_kern_enter.9 \
 	fpu_kern.9 fpu_kern_leave.9 \
 	fpu_kern.9 fpu_kern_thread.9 \
 	fpu_kern.9 is_fpu_kern_thread.9
 MLINKS+=g_attach.9 g_detach.9
 MLINKS+=g_bio.9 g_alloc_bio.9 \
 	g_bio.9 g_clone_bio.9 \
 	g_bio.9 g_destroy_bio.9 \
 	g_bio.9 g_duplicate_bio.9 \
 	g_bio.9 g_new_bio.9 \
 	g_bio.9 g_print_bio.9 \
 	g_bio.9 g_reset_bio.9
 MLINKS+=g_consumer.9 g_destroy_consumer.9 \
 	g_consumer.9 g_new_consumer.9
 MLINKS+=g_data.9 g_read_data.9 \
 	g_data.9 g_write_data.9
 MLINKS+=getenv.9 freeenv.9 \
 	getenv.9 getenv_int.9 \
 	getenv.9 getenv_long.9 \
 	getenv.9 getenv_string.9 \
 	getenv.9 getenv_quad.9 \
 	getenv.9 getenv_uint.9 \
 	getenv.9 getenv_ulong.9 \
 	getenv.9 kern_getenv.9 \
 	getenv.9 kern_setenv.9 \
 	getenv.9 kern_unsetenv.9 \
 	getenv.9 setenv.9 \
 	getenv.9 testenv.9 \
 	getenv.9 unsetenv.9
 MLINKS+=g_event.9 g_cancel_event.9 \
 	g_event.9 g_post_event.9 \
 	g_event.9 g_waitfor_event.9
 MLINKS+=g_geom.9 g_destroy_geom.9 \
 	g_geom.9 g_new_geomf.9
 MLINKS+=g_provider.9 g_destroy_provider.9 \
 	g_provider.9 g_error_provider.9 \
 	g_provider.9 g_new_providerf.9
 MLINKS+=hash.9 hash32.9 \
 	hash.9 hash32_buf.9 \
 	hash.9 hash32_str.9 \
 	hash.9 hash32_stre.9 \
 	hash.9 hash32_strn.9 \
 	hash.9 hash32_strne.9 \
 	hash.9 jenkins_hash.9 \
 	hash.9 jenkins_hash32.9
 MLINKS+=hashinit.9 hashdestroy.9 \
 	hashinit.9 hashinit_flags.9 \
 	hashinit.9 phashinit.9
 MLINKS+=hhook.9 hhook_head_register.9 \
 	hhook.9 hhook_head_deregister.9 \
 	hhook.9 hhook_head_deregister_lookup.9 \
 	hhook.9 hhook_run_hooks.9 \
 	hhook.9 HHOOKS_RUN_IF.9 \
 	hhook.9 HHOOKS_RUN_LOOKUP_IF.9
 MLINKS+=ieee80211.9 ieee80211_ifattach.9 \
 	ieee80211.9 ieee80211_ifdetach.9
 MLINKS+=ieee80211_amrr.9 ieee80211_amrr_choose.9 \
 	ieee80211_amrr.9 ieee80211_amrr_cleanup.9 \
 	ieee80211_amrr.9 ieee80211_amrr_init.9 \
 	ieee80211_amrr.9 ieee80211_amrr_node_init.9 \
 	ieee80211_amrr.9 ieee80211_amrr_setinterval.9 \
 	ieee80211_amrr.9 ieee80211_amrr_tx_complete.9 \
 	ieee80211_amrr.9 ieee80211_amrr_tx_update.9
 MLINKS+=ieee80211_beacon.9 ieee80211_beacon_alloc.9 \
 	ieee80211_beacon.9 ieee80211_beacon_notify.9 \
 	ieee80211_beacon.9 ieee80211_beacon_update.9
 MLINKS+=ieee80211_bmiss.9 ieee80211_beacon_miss.9
 MLINKS+=ieee80211_crypto.9 ieee80211_crypto_available.9 \
 	ieee80211_crypto.9 ieee80211_crypto_decap.9 \
 	ieee80211_crypto.9 ieee80211_crypto_delglobalkeys.9 \
 	ieee80211_crypto.9 ieee80211_crypto_delkey.9 \
 	ieee80211_crypto.9 ieee80211_crypto_demic.9 \
 	ieee80211_crypto.9 ieee80211_crypto_encap.9 \
 	ieee80211_crypto.9 ieee80211_crypto_enmic.9 \
 	ieee80211_crypto.9 ieee80211_crypto_newkey.9 \
 	ieee80211_crypto.9 ieee80211_crypto_register.9 \
 	ieee80211_crypto.9 ieee80211_crypto_reload_keys.9 \
 	ieee80211_crypto.9 ieee80211_crypto_setkey.9 \
 	ieee80211_crypto.9 ieee80211_crypto_unregister.9 \
 	ieee80211_crypto.9 ieee80211_key_update_begin.9 \
 	ieee80211_crypto.9 ieee80211_key_update_end.9 \
 	ieee80211_crypto.9 ieee80211_notify_michael_failure.9 \
 	ieee80211_crypto.9 ieee80211_notify_replay_failure.9
 MLINKS+=ieee80211_input.9 ieee80211_input_all.9
 MLINKS+=ieee80211_node.9 ieee80211_dump_node.9 \
 	ieee80211_node.9 ieee80211_dump_nodes.9 \
 	ieee80211_node.9 ieee80211_find_rxnode.9 \
 	ieee80211_node.9 ieee80211_find_rxnode_withkey.9 \
 	ieee80211_node.9 ieee80211_free_node.9 \
 	ieee80211_node.9 ieee80211_iterate_nodes.9 \
 	ieee80211_node.9 ieee80211_ref_node.9 \
 	ieee80211_node.9 ieee80211_unref_node.9
 MLINKS+=ieee80211_output.9 ieee80211_process_callback.9 \
 	ieee80211_output.9 M_SEQNO_GET.9 \
 	ieee80211_output.9 M_WME_GETAC.9
 MLINKS+=ieee80211_proto.9 ieee80211_new_state.9 \
 	ieee80211_proto.9 ieee80211_resume_all.9 \
 	ieee80211_proto.9 ieee80211_start_all.9 \
 	ieee80211_proto.9 ieee80211_stop_all.9 \
 	ieee80211_proto.9 ieee80211_suspend_all.9 \
 	ieee80211_proto.9 ieee80211_waitfor_parent.9
 MLINKS+=ieee80211_radiotap.9 ieee80211_radiotap_active.9 \
 	ieee80211_radiotap.9 ieee80211_radiotap_active_vap.9 \
 	ieee80211_radiotap.9 ieee80211_radiotap_attach.9 \
 	ieee80211_radiotap.9 ieee80211_radiotap_tx.9 \
 	ieee80211_radiotap.9 radiotap.9
 MLINKS+=ieee80211_regdomain.9 ieee80211_alloc_countryie.9 \
 	ieee80211_regdomain.9 ieee80211_init_channels.9 \
 	ieee80211_regdomain.9 ieee80211_sort_channels.9
 MLINKS+=ieee80211_scan.9 ieee80211_add_scan.9 \
 	ieee80211_scan.9 ieee80211_bg_scan.9 \
 	ieee80211_scan.9 ieee80211_cancel_scan.9 \
 	ieee80211_scan.9 ieee80211_cancel_scan_any.9 \
 	ieee80211_scan.9 ieee80211_check_scan.9 \
 	ieee80211_scan.9 ieee80211_check_scan_current.9 \
 	ieee80211_scan.9 ieee80211_flush.9 \
 	ieee80211_scan.9 ieee80211_probe_curchan.9 \
 	ieee80211_scan.9 ieee80211_scan_assoc_fail.9 \
 	ieee80211_scan.9 ieee80211_scan_done.9 \
 	ieee80211_scan.9 ieee80211_scan_dump_channels.9 \
 	ieee80211_scan.9 ieee80211_scan_flush.9 \
 	ieee80211_scan.9 ieee80211_scan_iterate.9 \
 	ieee80211_scan.9 ieee80211_scan_next.9 \
 	ieee80211_scan.9 ieee80211_scan_timeout.9 \
 	ieee80211_scan.9 ieee80211_scanner_get.9 \
 	ieee80211_scan.9 ieee80211_scanner_register.9 \
 	ieee80211_scan.9 ieee80211_scanner_unregister.9 \
 	ieee80211_scan.9 ieee80211_scanner_unregister_all.9 \
 	ieee80211_scan.9 ieee80211_start_scan.9
 MLINKS+=ieee80211_vap.9 ieee80211_vap_attach.9 \
 	ieee80211_vap.9 ieee80211_vap_detach.9 \
 	ieee80211_vap.9 ieee80211_vap_setup.9
 MLINKS+=iflibdd.9 ifdi_attach_pre.9 \
 	iflibdd.9 ifdi_attach_post.9 \
 	iflibdd.9 ifdi_detach.9 \
 	iflibdd.9 ifdi_get_counter.9 \
 	iflibdd.9 ifdi_i2c_req.9 \
 	iflibdd.9 ifdi_init.9 \
 	iflibdd.9 ifdi_intr_enable.9 \
 	iflibdd.9 ifdi_intr_disable.9 \
 	iflibdd.9 ifdi_led_func.9 \
 	iflibdd.9 ifdi_link_intr_enable.9 \
 	iflibdd.9 ifdi_media_set.9 \
 	iflibdd.9 ifdi_media_status.9 \
 	iflibdd.9 ifdi_media_change.9 \
 	iflibdd.9 ifdi_mtu_set.9 \
 	iflibdd.9 ifdi_multi_set.9 \
 	iflibdd.9 ifdi_promisc_set.9 \
 	iflibdd.9 ifdi_queues_alloc.9 \
 	iflibdd.9 ifdi_queues_free.9 \
 	iflibdd.9 ifdi_queue_intr_enable.9 \
 	iflibdd.9 ifdi_resume.9 \
 	iflibdd.9 ifdi_rxq_setup.9 \
 	iflibdd.9 ifdi_stop.9 \
 	iflibdd.9 ifdi_suspend.9 \
 	iflibdd.9 ifdi_sysctl_int_delay.9 \
 	iflibdd.9 ifdi_timer.9 \
 	iflibdd.9 ifdi_txq_setup.9 \
 	iflibdd.9 ifdi_update_admin_status.9 \
 	iflibdd.9 ifdi_vf_add.9 \
 	iflibdd.9 ifdi_vflr_handle.9 \
 	iflibdd.9 ifdi_vlan_register.9 \
 	iflibdd.9 ifdi_vlan_unregister.9 \
 	iflibdd.9 ifdi_watchdog_reset.9 \
 	iflibdd.9 iov_init.9 \
 	iflibdd.9 iov_uinit.9
 MLINKS+=iflibdi.9 iflib_add_int_delay_sysctl.9 \
 	iflibdi.9 iflib_device_attach.9 \
 	iflibdi.9 iflib_device_deregister.9 \
 	iflibdi.9 iflib_device_detach.9 \
 	iflibdi.9 iflib_device_suspend.9 \
 	iflibdi.9 iflib_device_register.9 \
 	iflibdi.9 iflib_device_resume.9 \
 	iflibdi.9 iflib_led_create.9 \
 	iflibdi.9 iflib_irq_alloc.9 \
 	iflibdi.9 iflib_irq_alloc_generic.9 \
 	iflibdi.9 iflib_link_intr_deferred.9 \
 	iflibdi.9 iflib_link_state_change.9 \
 	iflibdi.9 iflib_rx_intr_deferred.9 \
 	iflibdi.9 iflib_tx_intr_deferred.9
 MLINKS+=iflibtxrx.9 isc_rxd_available.9 \
 	iflibtxrx.9 isc_rxd_refill.9 \
 	iflibtxrx.9 isc_rxd_flush.9 \
 	iflibtxrx.9 isc_rxd_pkt_get.9 \
 	iflibtxrx.9 isc_txd_credits_update.9 \
 	iflibtxrx.9 isc_txd_encap.9 \
 	iflibtxrx.9 isc_txd_flush.9
 MLINKS+=ifnet.9 if_addmulti.9 \
 	ifnet.9 if_alloc.9 \
 	ifnet.9 if_allmulti.9 \
 	ifnet.9 if_attach.9 \
 	ifnet.9 if_data.9 \
 	ifnet.9 IF_DEQUEUE.9 \
 	ifnet.9 if_delmulti.9 \
 	ifnet.9 if_detach.9 \
 	ifnet.9 if_down.9 \
 	ifnet.9 if_findmulti.9 \
 	ifnet.9 if_free.9 \
 	ifnet.9 if_free_type.9 \
 	ifnet.9 if_up.9 \
 	ifnet.9 ifa_free.9 \
 	ifnet.9 ifa_ifwithaddr.9 \
 	ifnet.9 ifa_ifwithdstaddr.9 \
 	ifnet.9 ifa_ifwithnet.9 \
 	ifnet.9 ifa_ref.9 \
 	ifnet.9 ifaddr.9 \
 	ifnet.9 ifaddr_byindex.9 \
 	ifnet.9 ifaof_ifpforaddr.9 \
 	ifnet.9 ifioctl.9 \
 	ifnet.9 ifpromisc.9 \
 	ifnet.9 ifqueue.9 \
 	ifnet.9 ifunit.9 \
 	ifnet.9 ifunit_ref.9
 MLINKS+=insmntque.9 insmntque1.9
 MLINKS+=ithread.9 ithread_add_handler.9 \
 	ithread.9 ithread_create.9 \
 	ithread.9 ithread_destroy.9 \
 	ithread.9 ithread_priority.9 \
 	ithread.9 ithread_remove_handler.9 \
 	ithread.9 ithread_schedule.9
 MLINKS+=kernacc.9 useracc.9
 MLINKS+=kernel_mount.9 free_mntarg.9 \
 	kernel_mount.9 kernel_vmount.9 \
 	kernel_mount.9 mount_arg.9 \
 	kernel_mount.9 mount_argb.9 \
 	kernel_mount.9 mount_argf.9 \
 	kernel_mount.9 mount_argsu.9
 MLINKS+=khelp.9 khelp_add_hhook.9 \
 	khelp.9 KHELP_DECLARE_MOD.9 \
 	khelp.9 KHELP_DECLARE_MOD_UMA.9 \
 	khelp.9 khelp_destroy_osd.9 \
 	khelp.9 khelp_get_id.9 \
 	khelp.9 khelp_get_osd.9 \
 	khelp.9 khelp_init_osd.9 \
 	khelp.9 khelp_remove_hhook.9
 MLINKS+=kobj.9 DEFINE_CLASS.9 \
 	kobj.9 kobj_class_compile.9 \
 	kobj.9 kobj_class_compile_static.9 \
 	kobj.9 kobj_class_free.9 \
 	kobj.9 kobj_create.9 \
 	kobj.9 kobj_delete.9 \
 	kobj.9 kobj_init.9 \
 	kobj.9 kobj_init_static.9
 MLINKS+=kproc.9 kproc_create.9 \
 	kproc.9 kproc_exit.9 \
 	kproc.9 kproc_kthread_add.9 \
 	kproc.9 kproc_resume.9 \
 	kproc.9 kproc_shutdown.9 \
 	kproc.9 kproc_start.9 \
 	kproc.9 kproc_suspend.9 \
 	kproc.9 kproc_suspend_check.9 \
 	kproc.9 kthread_create.9
 MLINKS+=kqueue.9 knlist_add.9 \
 	kqueue.9 knlist_clear.9 \
 	kqueue.9 knlist_delete.9 \
 	kqueue.9 knlist_destroy.9 \
 	kqueue.9 knlist_empty.9 \
 	kqueue.9 knlist_init.9 \
 	kqueue.9 knlist_init_mtx.9 \
 	kqueue.9 knlist_init_rw_reader.9 \
 	kqueue.9 knlist_remove.9 \
 	kqueue.9 knlist_remove_inevent.9 \
 	kqueue.9 knote_fdclose.9 \
 	kqueue.9 KNOTE_LOCKED.9 \
 	kqueue.9 KNOTE_UNLOCKED.9 \
 	kqueue.9 kqfd_register.9 \
 	kqueue.9 kqueue_add_filteropts.9 \
 	kqueue.9 kqueue_del_filteropts.9
 MLINKS+=kthread.9 kthread_add.9 \
 	kthread.9 kthread_exit.9 \
 	kthread.9 kthread_resume.9 \
 	kthread.9 kthread_shutdown.9 \
 	kthread.9 kthread_start.9 \
 	kthread.9 kthread_suspend.9 \
 	kthread.9 kthread_suspend_check.9
 MLINKS+=ktr.9 CTR0.9 \
 	ktr.9 CTR1.9 \
 	ktr.9 CTR2.9 \
 	ktr.9 CTR3.9 \
 	ktr.9 CTR4.9 \
 	ktr.9 CTR5.9 \
 	ktr.9 CTR6.9
 MLINKS+=lock.9 lockdestroy.9 \
 	lock.9 lockinit.9 \
 	lock.9 lockmgr.9 \
 	lock.9 lockmgr_args.9 \
 	lock.9 lockmgr_args_rw.9 \
 	lock.9 lockmgr_assert.9 \
 	lock.9 lockmgr_disown.9 \
 	lock.9 lockmgr_printinfo.9 \
 	lock.9 lockmgr_recursed.9 \
 	lock.9 lockmgr_rw.9 \
 	lock.9 lockstatus.9
 MLINKS+=LOCK_PROFILING.9 MUTEX_PROFILING.9
 MLINKS+=make_dev.9 destroy_dev.9 \
 	make_dev.9 destroy_dev_drain.9 \
 	make_dev.9 destroy_dev_sched.9 \
 	make_dev.9 destroy_dev_sched_cb.9 \
 	make_dev.9 dev_depends.9 \
 	make_dev.9 make_dev_alias.9 \
 	make_dev.9 make_dev_alias_p.9 \
 	make_dev.9 make_dev_cred.9 \
 	make_dev.9 make_dev_credf.9 \
 	make_dev.9 make_dev_p.9 \
 	make_dev.9 make_dev_s.9
 MLINKS+=malloc.9 free.9 \
 	malloc.9 malloc_domainset.9 \
 	malloc.9 free_domain.9 \
 	malloc.9 mallocarray.9 \
 	malloc.9 MALLOC_DECLARE.9 \
 	malloc.9 MALLOC_DEFINE.9 \
 	malloc.9 realloc.9 \
 	malloc.9 reallocf.9
 MLINKS+=mbchain.9 mb_detach.9 \
 	mbchain.9 mb_done.9 \
 	mbchain.9 mb_fixhdr.9 \
 	mbchain.9 mb_init.9 \
 	mbchain.9 mb_initm.9 \
 	mbchain.9 mb_put_int64be.9 \
 	mbchain.9 mb_put_int64le.9 \
 	mbchain.9 mb_put_mbuf.9 \
 	mbchain.9 mb_put_mem.9 \
 	mbchain.9 mb_put_uint16be.9 \
 	mbchain.9 mb_put_uint16le.9 \
 	mbchain.9 mb_put_uint32be.9 \
 	mbchain.9 mb_put_uint32le.9 \
 	mbchain.9 mb_put_uint8.9 \
 	mbchain.9 mb_put_uio.9 \
 	mbchain.9 mb_reserve.9
 MLINKS+=\
 	mbuf.9 m_adj.9 \
 	mbuf.9 m_align.9 \
 	mbuf.9 M_ALIGN.9 \
 	mbuf.9 m_append.9 \
 	mbuf.9 m_apply.9 \
 	mbuf.9 m_cat.9 \
 	mbuf.9 m_catpkt.9 \
 	mbuf.9 MCHTYPE.9 \
 	mbuf.9 MCLGET.9 \
 	mbuf.9 m_collapse.9 \
 	mbuf.9 m_copyback.9 \
 	mbuf.9 m_copydata.9 \
 	mbuf.9 m_copym.9 \
 	mbuf.9 m_copypacket.9 \
 	mbuf.9 m_copyup.9 \
 	mbuf.9 m_defrag.9 \
 	mbuf.9 m_devget.9 \
 	mbuf.9 m_dup.9 \
 	mbuf.9 m_dup_pkthdr.9 \
 	mbuf.9 MEXTADD.9 \
 	mbuf.9 m_fixhdr.9 \
 	mbuf.9 m_free.9 \
 	mbuf.9 m_freem.9 \
 	mbuf.9 MGET.9 \
 	mbuf.9 m_get.9 \
 	mbuf.9 m_get2.9 \
 	mbuf.9 m_getjcl.9 \
 	mbuf.9 m_getcl.9 \
 	mbuf.9 MGETHDR.9 \
 	mbuf.9 m_gethdr.9 \
 	mbuf.9 m_getm.9 \
 	mbuf.9 m_getptr.9 \
 	mbuf.9 MH_ALIGN.9 \
 	mbuf.9 M_LEADINGSPACE.9 \
 	mbuf.9 m_length.9 \
 	mbuf.9 M_MOVE_PKTHDR.9 \
 	mbuf.9 m_move_pkthdr.9 \
 	mbuf.9 M_PREPEND.9 \
 	mbuf.9 m_prepend.9 \
 	mbuf.9 m_pulldown.9 \
 	mbuf.9 m_pullup.9 \
 	mbuf.9 m_split.9 \
 	mbuf.9 mtod.9 \
 	mbuf.9 M_TRAILINGSPACE.9 \
 	mbuf.9 m_unshare.9 \
 	mbuf.9 M_WRITABLE.9
 MLINKS+=\
 	mbuf_tags.9 m_tag_alloc.9 \
 	mbuf_tags.9 m_tag_copy.9 \
 	mbuf_tags.9 m_tag_copy_chain.9 \
 	mbuf_tags.9 m_tag_delete.9 \
 	mbuf_tags.9 m_tag_delete_chain.9 \
 	mbuf_tags.9 m_tag_delete_nonpersistent.9 \
 	mbuf_tags.9 m_tag_find.9 \
 	mbuf_tags.9 m_tag_first.9 \
 	mbuf_tags.9 m_tag_free.9 \
 	mbuf_tags.9 m_tag_get.9 \
 	mbuf_tags.9 m_tag_init.9 \
 	mbuf_tags.9 m_tag_locate.9 \
 	mbuf_tags.9 m_tag_next.9 \
 	mbuf_tags.9 m_tag_prepend.9 \
 	mbuf_tags.9 m_tag_unlink.9
 MLINKS+=MD5.9 MD5Init.9 \
 	MD5.9 MD5Transform.9
 MLINKS+=mdchain.9 md_append_record.9 \
 	mdchain.9 md_done.9 \
 	mdchain.9 md_get_int64.9 \
 	mdchain.9 md_get_int64be.9 \
 	mdchain.9 md_get_int64le.9 \
 	mdchain.9 md_get_mbuf.9 \
 	mdchain.9 md_get_mem.9 \
 	mdchain.9 md_get_uint16.9 \
 	mdchain.9 md_get_uint16be.9 \
 	mdchain.9 md_get_uint16le.9 \
 	mdchain.9 md_get_uint32.9 \
 	mdchain.9 md_get_uint32be.9 \
 	mdchain.9 md_get_uint32le.9 \
 	mdchain.9 md_get_uint8.9 \
 	mdchain.9 md_get_uio.9 \
 	mdchain.9 md_initm.9 \
 	mdchain.9 md_next_record.9
 MLINKS+=microtime.9 bintime.9 \
 	microtime.9 getbintime.9 \
 	microtime.9 getmicrotime.9 \
 	microtime.9 getnanotime.9 \
 	microtime.9 nanotime.9
 MLINKS+=microuptime.9 binuptime.9 \
 	microuptime.9 getbinuptime.9 \
 	microuptime.9 getmicrouptime.9 \
 	microuptime.9 getnanouptime.9 \
 	microuptime.9 getsbinuptime.9 \
 	microuptime.9 nanouptime.9 \
 	microuptime.9 sbinuptime.9
 MLINKS+=mi_switch.9 cpu_switch.9 \
 	mi_switch.9 cpu_throw.9
 MLINKS+=mod_cc.9 CCV.9 \
 	mod_cc.9 DECLARE_CC_MODULE.9
 MLINKS+=mtx_pool.9 mtx_pool_alloc.9 \
 	mtx_pool.9 mtx_pool_create.9 \
 	mtx_pool.9 mtx_pool_destroy.9 \
 	mtx_pool.9 mtx_pool_find.9 \
 	mtx_pool.9 mtx_pool_lock.9 \
 	mtx_pool.9 mtx_pool_lock_spin.9 \
 	mtx_pool.9 mtx_pool_unlock.9 \
 	mtx_pool.9 mtx_pool_unlock_spin.9
 MLINKS+=mutex.9 mtx_assert.9 \
 	mutex.9 mtx_destroy.9 \
 	mutex.9 mtx_init.9 \
 	mutex.9 mtx_initialized.9 \
 	mutex.9 mtx_lock.9 \
 	mutex.9 mtx_lock_flags.9 \
 	mutex.9 mtx_lock_spin.9 \
 	mutex.9 mtx_lock_spin_flags.9 \
 	mutex.9 mtx_owned.9 \
 	mutex.9 mtx_recursed.9 \
 	mutex.9 mtx_sleep.9 \
 	mutex.9 MTX_SYSINIT.9 \
 	mutex.9 mtx_trylock.9 \
 	mutex.9 mtx_trylock_flags.9 \
 	mutex.9 mtx_trylock_spin.9 \
 	mutex.9 mtx_trylock_spin_flags.9 \
 	mutex.9 mtx_unlock.9 \
 	mutex.9 mtx_unlock_flags.9 \
 	mutex.9 mtx_unlock_spin.9 \
 	mutex.9 mtx_unlock_spin_flags.9
 MLINKS+=namei.9 NDFREE.9 \
 	namei.9 NDINIT.9
 MLINKS+=netisr.9 netisr_clearqdrops.9 \
 	netisr.9 netisr_default_flow2cpu.9 \
 	netisr.9 netisr_dispatch.9 \
 	netisr.9 netisr_dispatch_src.9 \
 	netisr.9 netisr_get_cpucount.9 \
 	netisr.9 netisr_get_cpuid.9 \
 	netisr.9 netisr_getqdrops.9 \
 	netisr.9 netisr_getqlimit.9 \
 	netisr.9 netisr_queue.9 \
 	netisr.9 netisr_queue_src.9 \
 	netisr.9 netisr_register.9 \
 	netisr.9 netisr_setqlimit.9 \
 	netisr.9 netisr_unregister.9
 MLINKS+=nv.9 libnv.9 \
 	nv.9 nvlist.9 \
 	nv.9 nvlist_add_binary.9 \
 	nv.9 nvlist_add_bool.9 \
 	nv.9 nvlist_add_bool_array.9 \
 	nv.9 nvlist_add_descriptor.9 \
 	nv.9 nvlist_add_descriptor_array.9 \
 	nv.9 nvlist_add_null.9 \
 	nv.9 nvlist_add_number.9 \
 	nv.9 nvlist_add_number_array.9 \
 	nv.9 nvlist_add_nvlist.9 \
 	nv.9 nvlist_add_nvlist_array.9 \
 	nv.9 nvlist_add_string.9 \
 	nv.9 nvlist_add_stringf.9 \
 	nv.9 nvlist_add_stringv.9 \
 	nv.9 nvlist_add_string_array.9 \
 	nv.9 nvlist_clone.9 \
 	nv.9 nvlist_create.9 \
 	nv.9 nvlist_destroy.9 \
 	nv.9 nvlist_dump.9 \
 	nv.9 nvlist_empty.9 \
 	nv.9 nvlist_error.9 \
 	nv.9 nvlist_exists.9 \
 	nv.9 nvlist_exists_binary.9 \
 	nv.9 nvlist_exists_bool.9 \
 	nv.9 nvlist_exists_bool_array.9 \
 	nv.9 nvlist_exists_descriptor.9 \
 	nv.9 nvlist_exists_descriptor_array.9 \
 	nv.9 nvlist_exists_null.9 \
 	nv.9 nvlist_exists_number.9 \
 	nv.9 nvlist_exists_number_array.9 \
 	nv.9 nvlist_exists_nvlist.9 \
 	nv.9 nvlist_exists_nvlist_array.9 \
 	nv.9 nvlist_exists_string.9 \
 	nv.9 nvlist_exists_type.9 \
 	nv.9 nvlist_fdump.9 \
 	nv.9 nvlist_flags.9 \
 	nv.9 nvlist_free.9 \
 	nv.9 nvlist_free_binary.9 \
 	nv.9 nvlist_free_bool.9 \
 	nv.9 nvlist_free_bool_array.9 \
 	nv.9 nvlist_free_descriptor.9 \
 	nv.9 nvlist_free_descriptor_array.9 \
 	nv.9 nvlist_free_null.9 \
 	nv.9 nvlist_free_number.9 \
 	nv.9 nvlist_free_number_array.9 \
 	nv.9 nvlist_free_nvlist.9 \
 	nv.9 nvlist_free_nvlist_array.9 \
 	nv.9 nvlist_free_string.9 \
 	nv.9 nvlist_free_string_array.9 \
 	nv.9 nvlist_free_type.9 \
 	nv.9 nvlist_get_binary.9 \
 	nv.9 nvlist_get_bool.9 \
 	nv.9 nvlist_get_bool_array.9 \
 	nv.9 nvlist_get_descriptor.9 \
 	nv.9 nvlist_get_descriptor_array.9 \
 	nv.9 nvlist_get_number.9 \
 	nv.9 nvlist_get_number_array.9 \
 	nv.9 nvlist_get_nvlist.9 \
 	nv.9 nvlist_get_nvlist_array.9 \
 	nv.9 nvlist_get_parent.9 \
 	nv.9 nvlist_get_string.9 \
 	nv.9 nvlist_get_string_array.9 \
 	nv.9 nvlist_move_binary.9 \
 	nv.9 nvlist_move_descriptor.9 \
 	nv.9 nvlist_move_descriptor_array.9 \
 	nv.9 nvlist_move_nvlist.9 \
 	nv.9 nvlist_move_nvlist_array.9 \
 	nv.9 nvlist_move_string.9 \
 	nv.9 nvlist_move_string_array.9 \
 	nv.9 nvlist_next.9 \
 	nv.9 nvlist_pack.9 \
 	nv.9 nvlist_recv.9 \
 	nv.9 nvlist_send.9 \
 	nv.9 nvlist_set_error.9 \
 	nv.9 nvlist_size.9 \
 	nv.9 nvlist_take_binary.9 \
 	nv.9 nvlist_take_bool.9 \
 	nv.9 nvlist_take_bool_array.9 \
 	nv.9 nvlist_take_descriptor.9 \
 	nv.9 nvlist_take_descriptor_array.9 \
 	nv.9 nvlist_take_number.9 \
 	nv.9 nvlist_take_number_array.9 \
 	nv.9 nvlist_take_nvlist.9 \
 	nv.9 nvlist_take_nvlist_array.9 \
 	nv.9 nvlist_take_string.9 \
 	nv.9 nvlist_take_string_array.9 \
 	nv.9 nvlist_unpack.9 \
 	nv.9 nvlist_xfer.9
 MLINKS+=OF_child.9 OF_parent.9 \
 	OF_child.9 OF_peer.9
 MLINKS+=OF_device_from_xref.9 OF_device_register_xref.9 \
 	OF_device_from_xref.9 OF_xref_from_device.9
 MLINKS+=OF_getprop.9 OF_getencprop.9 \
 	OF_getprop.9 OF_getencprop_alloc.9 \
 	OF_getprop.9 OF_getencprop_alloc_multi.9 \
 	OF_getprop.9 OF_getprop_alloc.9 \
 	OF_getprop.9 OF_getprop_alloc_multi.9 \
 	OF_getprop.9 OF_getproplen.9 \
 	OF_getprop.9 OF_hasprop.9 \
 	OF_getprop.9 OF_nextprop.9 \
 	OF_getprop.9 OF_prop_free.9 \
 	OF_getprop.9 OF_searchencprop.9 \
 	OF_getprop.9 OF_searchprop.9 \
 	OF_getprop.9 OF_setprop.9
 MLINKS+=OF_node_from_xref.9 OF_xref_from_node.9
 MLINKS+=ofw_bus_is_compatible.9 ofw_bus_is_compatible_strict.9 \
 	ofw_bus_is_compatible.9 ofw_bus_node_is_compatible.9 \
 	ofw_bus_is_compatible.9 ofw_bus_search_compatible.9
 MLINKS+= ofw_bus_status_okay.9 ofw_bus_get_status.9 \
 	ofw_bus_status_okay.9 ofw_bus_node_status_okay.9
 MLINKS+=osd.9 osd_call.9 \
 	osd.9 osd_del.9 \
 	osd.9 osd_deregister.9 \
 	osd.9 osd_exit.9 \
 	osd.9 osd_get.9 \
 	osd.9 osd_register.9 \
 	osd.9 osd_set.9
 MLINKS+=panic.9 vpanic.9
 MLINKS+=PCBGROUP.9 in_pcbgroup_byhash.9 \
 	PCBGROUP.9 in_pcbgroup_byinpcb.9 \
 	PCBGROUP.9 in_pcbgroup_destroy.9 \
 	PCBGROUP.9 in_pcbgroup_enabled.9 \
 	PCBGROUP.9 in_pcbgroup_init.9 \
 	PCBGROUP.9 in_pcbgroup_remove.9 \
 	PCBGROUP.9 in_pcbgroup_update.9 \
 	PCBGROUP.9 in_pcbgroup_update_mbuf.9 \
 	PCBGROUP.9 in6_pcbgroup_byhash.9
 MLINKS+=pci.9 pci_alloc_msi.9 \
 	pci.9 pci_alloc_msix.9 \
 	pci.9 pci_disable_busmaster.9 \
 	pci.9 pci_disable_io.9 \
 	pci.9 pci_enable_busmaster.9 \
 	pci.9 pci_enable_io.9 \
 	pci.9 pci_find_bsf.9 \
 	pci.9 pci_find_cap.9 \
 	pci.9 pci_find_dbsf.9 \
 	pci.9 pci_find_device.9 \
 	pci.9 pci_find_extcap.9 \
 	pci.9 pci_find_htcap.9 \
 	pci.9 pci_find_pcie_root_port.9 \
 	pci.9 pci_get_id.9 \
 	pci.9 pci_get_max_read_req.9 \
 	pci.9 pci_get_powerstate.9 \
 	pci.9 pci_get_vpd_ident.9 \
 	pci.9 pci_get_vpd_readonly.9 \
 	pci.9 pci_iov_attach.9 \
 	pci.9 pci_iov_attach_name.9 \
 	pci.9 pci_iov_detach.9 \
 	pci.9 pci_msi_count.9 \
 	pci.9 pci_msix_count.9 \
 	pci.9 pci_msix_pba_bar.9 \
 	pci.9 pci_msix_table_bar.9 \
 	pci.9 pci_pending_msix.9 \
 	pci.9 pci_read_config.9 \
 	pci.9 pci_release_msi.9 \
 	pci.9 pci_remap_msix.9 \
 	pci.9 pci_restore_state.9 \
 	pci.9 pci_save_state.9 \
 	pci.9 pci_set_powerstate.9 \
 	pci.9 pci_set_max_read_req.9 \
 	pci.9 pci_write_config.9 \
 	pci.9 pcie_adjust_config.9 \
 	pci.9 pcie_flr.9 \
 	pci.9 pcie_max_completion_timeout.9 \
 	pci.9 pcie_read_config.9 \
 	pci.9 pcie_wait_for_pending_transactions.9 \
 	pci.9 pcie_write_config.9
 MLINKS+=pci_iov_schema.9 pci_iov_schema_alloc_node.9 \
 	pci_iov_schema.9 pci_iov_schema_add_bool.9 \
 	pci_iov_schema.9 pci_iov_schema_add_string.9 \
 	pci_iov_schema.9 pci_iov_schema_add_uint8.9 \
 	pci_iov_schema.9 pci_iov_schema_add_uint16.9 \
 	pci_iov_schema.9 pci_iov_schema_add_uint32.9 \
 	pci_iov_schema.9 pci_iov_schema_add_uint64.9 \
 	pci_iov_schema.9 pci_iov_schema_add_unicast_mac.9
 MLINKS+=pfil.9 pfil_add_hook.9 \
 	pfil.9 pfil_head_register.9 \
 	pfil.9 pfil_head_unregister.9 \
 	pfil.9 pfil_remove_hook.9 \
 	pfil.9 pfil_run_hooks.9 \
 	pfil.9 pfil_link.9
 MLINKS+=pfind.9 zpfind.9
 MLINKS+=PHOLD.9 PRELE.9 \
 	PHOLD.9 _PHOLD.9 \
 	PHOLD.9 _PRELE.9 \
 	PHOLD.9 PROC_ASSERT_HELD.9 \
 	PHOLD.9 PROC_ASSERT_NOT_HELD.9
 MLINKS+=pmap_copy.9 pmap_copy_page.9
 MLINKS+=pmap_extract.9 pmap_extract_and_hold.9
 MLINKS+=pmap_init.9 pmap_init2.9
 MLINKS+=pmap_is_modified.9 pmap_ts_referenced.9
 MLINKS+=pmap_pinit.9 pmap_pinit0.9 \
 	pmap_pinit.9 pmap_pinit2.9
 MLINKS+=pmap_qenter.9 pmap_qremove.9
 MLINKS+=pmap_quick_enter_page.9 pmap_quick_remove_page.9
 MLINKS+=pmap_remove.9 pmap_remove_all.9 \
 	pmap_remove.9 pmap_remove_pages.9
 MLINKS+=pmap_resident_count.9 pmap_wired_count.9
 MLINKS+=pmap_zero_page.9 pmap_zero_area.9
 MLINKS+=printf.9 log.9 \
 	printf.9 tprintf.9 \
 	printf.9 uprintf.9
 MLINKS+=priv.9 priv_check.9 \
 	priv.9 priv_check_cred.9
 MLINKS+=proc_rwmem.9 proc_readmem.9 \
 	proc_rwmem.9 proc_writemem.9
 MLINKS+=psignal.9 gsignal.9 \
 	psignal.9 pgsignal.9 \
 	psignal.9 tdsignal.9
 MLINKS+=random.9 arc4rand.9 \
 	random.9 arc4random.9 \
 	random.9 read_random.9 \
 	random.9 read_random_uio.9 \
 	random.9 srandom.9
 MLINKS+=random_harvest.9 random_harvest_direct.9 \
 	random_harvest.9 random_harvest_fast.9 \
 	random_harvest.9 random_harvest_queue.9
 MLINKS+=ratecheck.9 ppsratecheck.9
 MLINKS+=refcount.9 refcount_acquire.9 \
 	refcount.9 refcount_init.9 \
 	refcount.9 refcount_release.9
 MLINKS+=resource_int_value.9 resource_long_value.9 \
 	resource_int_value.9 resource_string_value.9
 MLINKS+=rman.9 rman_activate_resource.9 \
 	rman.9 rman_adjust_resource.9 \
 	rman.9 rman_deactivate_resource.9 \
 	rman.9 rman_fini.9 \
 	rman.9 rman_first_free_region.9 \
 	rman.9 rman_get_bushandle.9 \
 	rman.9 rman_get_bustag.9 \
 	rman.9 rman_get_device.9 \
 	rman.9 rman_get_end.9 \
 	rman.9 rman_get_flags.9 \
 	rman.9 rman_get_mapping.9 \
 	rman.9 rman_get_rid.9 \
 	rman.9 rman_get_size.9 \
 	rman.9 rman_get_start.9 \
 	rman.9 rman_get_virtual.9 \
 	rman.9 rman_init.9 \
 	rman.9 rman_init_from_resource.9 \
 	rman.9 rman_is_region_manager.9 \
 	rman.9 rman_last_free_region.9 \
 	rman.9 rman_make_alignment_flags.9 \
 	rman.9 rman_manage_region.9 \
 	rman.9 rman_release_resource.9 \
 	rman.9 rman_reserve_resource.9 \
 	rman.9 rman_reserve_resource_bound.9 \
 	rman.9 rman_set_bushandle.9 \
 	rman.9 rman_set_bustag.9 \
 	rman.9 rman_set_mapping.9 \
 	rman.9 rman_set_rid.9 \
 	rman.9 rman_set_virtual.9
 MLINKS+=rmlock.9 rm_assert.9 \
 	rmlock.9 rm_destroy.9 \
 	rmlock.9 rm_init.9 \
 	rmlock.9 rm_init_flags.9 \
 	rmlock.9 rm_rlock.9 \
 	rmlock.9 rm_runlock.9 \
 	rmlock.9 rm_sleep.9 \
 	rmlock.9 RM_SYSINIT.9 \
 	rmlock.9 RM_SYSINIT_FLAGS.9 \
 	rmlock.9 rm_try_rlock.9 \
 	rmlock.9 rm_wlock.9 \
 	rmlock.9 rm_wowned.9 \
 	rmlock.9 rm_wunlock.9
 MLINKS+=rtalloc.9 rtalloc1.9 \
 	rtalloc.9 rtalloc_ign.9 \
 	rtalloc.9 RT_ADDREF.9 \
 	rtalloc.9 RT_LOCK.9 \
 	rtalloc.9 RT_REMREF.9 \
 	rtalloc.9 RT_RTFREE.9 \
 	rtalloc.9 RT_UNLOCK.9 \
 	rtalloc.9 RTFREE_LOCKED.9 \
 	rtalloc.9 RTFREE.9 \
 	rtalloc.9 rtfree.9 \
 	rtalloc.9 rtalloc1_fib.9 \
 	rtalloc.9 rtalloc_ign_fib.9 \
 	rtalloc.9 rtalloc_fib.9
 MLINKS+=runqueue.9 choosethread.9 \
 	runqueue.9 procrunnable.9 \
 	runqueue.9 remrunqueue.9 \
 	runqueue.9 setrunqueue.9
 MLINKS+=rwlock.9 rw_assert.9 \
 	rwlock.9 rw_destroy.9 \
 	rwlock.9 rw_downgrade.9 \
 	rwlock.9 rw_init.9 \
 	rwlock.9 rw_init_flags.9 \
 	rwlock.9 rw_initialized.9 \
 	rwlock.9 rw_rlock.9 \
 	rwlock.9 rw_runlock.9 \
 	rwlock.9 rw_unlock.9 \
 	rwlock.9 rw_sleep.9 \
 	rwlock.9 RW_SYSINIT.9 \
 	rwlock.9 RW_SYSINIT_FLAGS.9 \
 	rwlock.9 rw_try_rlock.9 \
 	rwlock.9 rw_try_upgrade.9 \
 	rwlock.9 rw_try_wlock.9 \
 	rwlock.9 rw_wlock.9 \
 	rwlock.9 rw_wowned.9 \
 	rwlock.9 rw_wunlock.9
 MLINKS+=sbuf.9 sbuf_bcat.9 \
 	sbuf.9 sbuf_bcopyin.9 \
 	sbuf.9 sbuf_bcpy.9 \
 	sbuf.9 sbuf_cat.9 \
 	sbuf.9 sbuf_clear.9 \
 	sbuf.9 sbuf_clear_flags.9 \
 	sbuf.9 sbuf_copyin.9 \
 	sbuf.9 sbuf_cpy.9 \
 	sbuf.9 sbuf_data.9 \
 	sbuf.9 sbuf_delete.9 \
 	sbuf.9 sbuf_done.9 \
 	sbuf.9 sbuf_error.9 \
 	sbuf.9 sbuf_finish.9 \
 	sbuf.9 sbuf_get_flags.9 \
 	sbuf.9 sbuf_hexdump.9 \
 	sbuf.9 sbuf_len.9 \
 	sbuf.9 sbuf_new.9 \
 	sbuf.9 sbuf_new_auto.9 \
 	sbuf.9 sbuf_new_for_sysctl.9 \
 	sbuf.9 sbuf_printf.9 \
 	sbuf.9 sbuf_putc.9 \
 	sbuf.9 sbuf_set_drain.9 \
 	sbuf.9 sbuf_set_flags.9 \
 	sbuf.9 sbuf_setpos.9 \
 	sbuf.9 sbuf_start_section.9 \
 	sbuf.9 sbuf_end_section.9  \
 	sbuf.9 sbuf_trim.9 \
 	sbuf.9 sbuf_vprintf.9
 MLINKS+=scheduler.9 curpriority_cmp.9 \
 	scheduler.9 maybe_resched.9 \
 	scheduler.9 propagate_priority.9 \
 	scheduler.9 resetpriority.9 \
 	scheduler.9 roundrobin.9 \
 	scheduler.9 roundrobin_interval.9 \
 	scheduler.9 schedclock.9 \
 	scheduler.9 schedcpu.9 \
 	scheduler.9 sched_setup.9 \
 	scheduler.9 setrunnable.9 \
 	scheduler.9 updatepri.9
 MLINKS+=SDT.9 SDT_PROVIDER_DECLARE.9 \
 	SDT.9 SDT_PROVIDER_DEFINE.9 \
 	SDT.9 SDT_PROBE_DECLARE.9 \
 	SDT.9 SDT_PROBE_DEFINE.9 \
 	SDT.9 SDT_PROBE.9
 MLINKS+=securelevel_gt.9 securelevel_ge.9
 MLINKS+=selrecord.9 seldrain.9 \
 	selrecord.9 selwakeup.9
 MLINKS+=sema.9 sema_destroy.9 \
 	sema.9 sema_init.9 \
 	sema.9 sema_post.9 \
 	sema.9 sema_timedwait.9 \
 	sema.9 sema_trywait.9 \
 	sema.9 sema_value.9 \
 	sema.9 sema_wait.9
 MLINKS+=sf_buf.9 sf_buf_alloc.9 \
 	sf_buf.9 sf_buf_free.9 \
 	sf_buf.9 sf_buf_kva.9 \
 	sf_buf.9 sf_buf_page.9
 MLINKS+=sglist.9 sglist_alloc.9 \
 	sglist.9 sglist_append.9 \
 	sglist.9 sglist_append_bio.9 \
 	sglist.9 sglist_append_mbuf.9 \
 	sglist.9 sglist_append_phys.9 \
 	sglist.9 sglist_append_sglist.9 \
 	sglist.9 sglist_append_uio.9 \
 	sglist.9 sglist_append_user.9 \
 	sglist.9 sglist_append_vmpages.9 \
 	sglist.9 sglist_build.9 \
 	sglist.9 sglist_clone.9 \
 	sglist.9 sglist_consume_uio.9 \
 	sglist.9 sglist_count.9 \
 	sglist.9 sglist_count_vmpages.9 \
 	sglist.9 sglist_free.9 \
 	sglist.9 sglist_hold.9 \
 	sglist.9 sglist_init.9 \
 	sglist.9 sglist_join.9 \
 	sglist.9 sglist_length.9 \
 	sglist.9 sglist_reset.9 \
 	sglist.9 sglist_slice.9 \
 	sglist.9 sglist_split.9
 MLINKS+=shm_map.9 shm_unmap.9
 MLINKS+=signal.9 cursig.9 \
 	signal.9 execsigs.9 \
 	signal.9 issignal.9 \
 	signal.9 killproc.9 \
 	signal.9 pgsigio.9 \
 	signal.9 postsig.9 \
 	signal.9 SETSETNEQ.9 \
 	signal.9 SETSETOR.9 \
 	signal.9 SIGADDSET.9 \
 	signal.9 SIG_CONTSIGMASK.9 \
 	signal.9 SIGDELSET.9 \
 	signal.9 SIGEMPTYSET.9 \
 	signal.9 sigexit.9 \
 	signal.9 SIGFILLSET.9 \
 	signal.9 siginit.9 \
 	signal.9 SIGISEMPTY.9 \
 	signal.9 SIGISMEMBER.9 \
 	signal.9 SIGNOTEMPTY.9 \
 	signal.9 signotify.9 \
 	signal.9 SIGPENDING.9 \
 	signal.9 SIGSETAND.9 \
 	signal.9 SIGSETCANTMASK.9 \
 	signal.9 SIGSETEQ.9 \
 	signal.9 SIGSETNAND.9 \
 	signal.9 SIG_STOPSIGMASK.9 \
 	signal.9 trapsignal.9
 MLINKS+=sleep.9 msleep.9 \
 	sleep.9 msleep_sbt.9 \
 	sleep.9 msleep_spin.9 \
 	sleep.9 msleep_spin_sbt.9 \
 	sleep.9 pause.9 \
 	sleep.9 pause_sig.9 \
 	sleep.9 pause_sbt.9 \
 	sleep.9 tsleep.9 \
 	sleep.9 tsleep_sbt.9 \
 	sleep.9 wakeup.9 \
 	sleep.9 wakeup_one.9
 MLINKS+=sleepqueue.9 init_sleepqueues.9 \
 	sleepqueue.9 sleepq_abort.9 \
 	sleepqueue.9 sleepq_add.9 \
 	sleepqueue.9 sleepq_alloc.9 \
 	sleepqueue.9 sleepq_broadcast.9 \
 	sleepqueue.9 sleepq_free.9 \
 	sleepqueue.9 sleepq_lookup.9 \
 	sleepqueue.9 sleepq_lock.9 \
 	sleepqueue.9 sleepq_release.9 \
 	sleepqueue.9 sleepq_remove.9 \
 	sleepqueue.9 sleepq_set_timeout.9 \
 	sleepqueue.9 sleepq_set_timeout_sbt.9 \
 	sleepqueue.9 sleepq_signal.9 \
 	sleepqueue.9 sleepq_sleepcnt.9 \
 	sleepqueue.9 sleepq_timedwait.9 \
 	sleepqueue.9 sleepq_timedwait_sig.9 \
 	sleepqueue.9 sleepq_type.9 \
 	sleepqueue.9 sleepq_wait.9 \
 	sleepqueue.9 sleepq_wait_sig.9
 MLINKS+=socket.9 soabort.9 \
 	socket.9 soaccept.9 \
 	socket.9 sobind.9 \
 	socket.9 socheckuid.9 \
 	socket.9 soclose.9 \
 	socket.9 soconnect.9 \
 	socket.9 socreate.9 \
 	socket.9 sodisconnect.9 \
 	socket.9 sodtor_set.9 \
 	socket.9 sodupsockaddr.9 \
 	socket.9 sofree.9 \
 	socket.9 sogetopt.9 \
 	socket.9 sohasoutofband.9 \
 	socket.9 solisten.9 \
 	socket.9 solisten_proto.9 \
 	socket.9 solisten_proto_check.9 \
 	socket.9 sonewconn.9 \
 	socket.9 sooptcopyin.9 \
 	socket.9 sooptcopyout.9 \
 	socket.9 sopoll.9 \
 	socket.9 sopoll_generic.9 \
 	socket.9 soreceive.9 \
 	socket.9 soreceive_dgram.9 \
 	socket.9 soreceive_generic.9 \
 	socket.9 soreceive_stream.9 \
 	socket.9 soreserve.9 \
 	socket.9 sorflush.9 \
 	socket.9 sosend.9 \
 	socket.9 sosend_dgram.9 \
 	socket.9 sosend_generic.9 \
 	socket.9 sosetopt.9 \
 	socket.9 soshutdown.9 \
 	socket.9 sotoxsocket.9 \
 	socket.9 soupcall_clear.9 \
 	socket.9 soupcall_set.9 \
 	socket.9 sowakeup.9
 MLINKS+=stack.9 stack_copy.9 \
 	stack.9 stack_create.9 \
 	stack.9 stack_destroy.9 \
 	stack.9 stack_print.9 \
 	stack.9 stack_print_ddb.9 \
 	stack.9 stack_print_short.9 \
 	stack.9 stack_print_short_ddb.9 \
 	stack.9 stack_put.9 \
 	stack.9 stack_save.9 \
 	stack.9 stack_sbuf_print.9 \
 	stack.9 stack_sbuf_print_ddb.9 \
 	stack.9 stack_zero.9
 MLINKS+=store.9 subyte.9 \
 	store.9 suword.9 \
 	store.9 suword16.9 \
 	store.9 suword32.9 \
 	store.9 suword64.9
 MLINKS+=swi.9 swi_add.9 \
 	swi.9 swi_remove.9 \
 	swi.9 swi_sched.9
 MLINKS+=sx.9 sx_assert.9 \
 	sx.9 sx_destroy.9 \
 	sx.9 sx_downgrade.9 \
 	sx.9 sx_init.9 \
 	sx.9 sx_init_flags.9 \
 	sx.9 sx_sleep.9 \
 	sx.9 sx_slock.9 \
 	sx.9 sx_slock_sig.9 \
 	sx.9 sx_sunlock.9 \
 	sx.9 SX_SYSINIT.9 \
 	sx.9 SX_SYSINIT_FLAGS.9 \
 	sx.9 sx_try_slock.9 \
 	sx.9 sx_try_upgrade.9 \
 	sx.9 sx_try_xlock.9 \
 	sx.9 sx_unlock.9 \
 	sx.9 sx_xholder.9 \
 	sx.9 sx_xlock.9 \
 	sx.9 sx_xlock_sig.9 \
 	sx.9 sx_xlocked.9 \
 	sx.9 sx_xunlock.9
 MLINKS+=syscall_helper_register.9 syscall_helper_unregister.9 \
 	syscall_helper_register.9 SYSCALL_INIT_HELPER.9 \
 	syscall_helper_register.9 SYSCALL_INIT_HELPER_COMPAT.9 \
 	syscall_helper_register.9 SYSCALL_INIT_HELPER_COMPAT_F.9 \
 	syscall_helper_register.9 SYSCALL_INIT_HELPER_F.9
 MLINKS+=sysctl.9 SYSCTL_DECL.9 \
 	sysctl.9 SYSCTL_ADD_INT.9 \
 	sysctl.9 SYSCTL_ADD_LONG.9 \
 	sysctl.9 SYSCTL_ADD_NODE.9 \
 	sysctl.9 SYSCTL_ADD_NODE_WITH_LABEL.9 \
 	sysctl.9 SYSCTL_ADD_OPAQUE.9 \
 	sysctl.9 SYSCTL_ADD_PROC.9 \
 	sysctl.9 SYSCTL_ADD_QUAD.9 \
 	sysctl.9 SYSCTL_ADD_ROOT_NODE.9 \
 	sysctl.9 SYSCTL_ADD_S8.9 \
 	sysctl.9 SYSCTL_ADD_S16.9 \
 	sysctl.9 SYSCTL_ADD_S32.9 \
 	sysctl.9 SYSCTL_ADD_S64.9 \
 	sysctl.9 SYSCTL_ADD_STRING.9 \
 	sysctl.9 SYSCTL_ADD_STRUCT.9 \
 	sysctl.9 SYSCTL_ADD_U8.9 \
 	sysctl.9 SYSCTL_ADD_U16.9 \
 	sysctl.9 SYSCTL_ADD_U32.9 \
 	sysctl.9 SYSCTL_ADD_U64.9 \
 	sysctl.9 SYSCTL_ADD_UAUTO.9 \
 	sysctl.9 SYSCTL_ADD_UINT.9 \
 	sysctl.9 SYSCTL_ADD_ULONG.9 \
 	sysctl.9 SYSCTL_ADD_UQUAD.9 \
 	sysctl.9 SYSCTL_CHILDREN.9 \
 	sysctl.9 SYSCTL_STATIC_CHILDREN.9 \
 	sysctl.9 SYSCTL_NODE_CHILDREN.9 \
 	sysctl.9 SYSCTL_PARENT.9 \
 	sysctl.9 SYSCTL_INT.9 \
 	sysctl.9 SYSCTL_INT_WITH_LABEL.9 \
 	sysctl.9 SYSCTL_LONG.9 \
 	sysctl.9 SYSCTL_NODE.9 \
 	sysctl.9 SYSCTL_NODE_WITH_LABEL.9 \
 	sysctl.9 SYSCTL_OPAQUE.9 \
 	sysctl.9 SYSCTL_PROC.9 \
 	sysctl.9 SYSCTL_QUAD.9 \
 	sysctl.9 SYSCTL_ROOT_NODE.9 \
 	sysctl.9 SYSCTL_S8.9 \
 	sysctl.9 SYSCTL_S16.9 \
 	sysctl.9 SYSCTL_S32.9 \
 	sysctl.9 SYSCTL_S64.9 \
 	sysctl.9 SYSCTL_STRING.9 \
 	sysctl.9 SYSCTL_STRUCT.9 \
 	sysctl.9 SYSCTL_U8.9 \
 	sysctl.9 SYSCTL_U16.9 \
 	sysctl.9 SYSCTL_U32.9 \
 	sysctl.9 SYSCTL_U64.9 \
 	sysctl.9 SYSCTL_UINT.9 \
 	sysctl.9 SYSCTL_ULONG.9 \
 	sysctl.9 SYSCTL_UQUAD.9
 MLINKS+=sysctl_add_oid.9 sysctl_move_oid.9 \
 	sysctl_add_oid.9 sysctl_remove_oid.9 \
 	sysctl_add_oid.9 sysctl_remove_name.9
 MLINKS+=sysctl_ctx_init.9 sysctl_ctx_entry_add.9 \
 	sysctl_ctx_init.9 sysctl_ctx_entry_del.9 \
 	sysctl_ctx_init.9 sysctl_ctx_entry_find.9 \
 	sysctl_ctx_init.9 sysctl_ctx_free.9
 MLINKS+=SYSINIT.9 SYSUNINIT.9
 MLINKS+=taskqueue.9 TASK_INIT.9 \
 	taskqueue.9 TASK_INITIALIZER.9 \
 	taskqueue.9 taskqueue_block.9 \
 	taskqueue.9 taskqueue_cancel.9 \
 	taskqueue.9 taskqueue_cancel_timeout.9 \
 	taskqueue.9 taskqueue_create.9 \
 	taskqueue.9 taskqueue_create_fast.9 \
 	taskqueue.9 TASKQUEUE_DECLARE.9 \
 	taskqueue.9 TASKQUEUE_DEFINE.9 \
 	taskqueue.9 TASKQUEUE_DEFINE_THREAD.9 \
 	taskqueue.9 taskqueue_drain.9 \
 	taskqueue.9 taskqueue_drain_all.9 \
 	taskqueue.9 taskqueue_drain_timeout.9 \
 	taskqueue.9 taskqueue_enqueue.9 \
 	taskqueue.9 taskqueue_enqueue_timeout.9 \
 	taskqueue.9 TASKQUEUE_FAST_DEFINE.9 \
 	taskqueue.9 TASKQUEUE_FAST_DEFINE_THREAD.9 \
 	taskqueue.9 taskqueue_free.9 \
 	taskqueue.9 taskqueue_member.9 \
 	taskqueue.9 taskqueue_quiesce.9 \
 	taskqueue.9 taskqueue_run.9 \
 	taskqueue.9 taskqueue_set_callback.9 \
 	taskqueue.9 taskqueue_start_threads.9 \
 	taskqueue.9 taskqueue_start_threads_pinned.9 \
 	taskqueue.9 taskqueue_unblock.9 \
 	taskqueue.9 TIMEOUT_TASK_INIT.9
 MLINKS+=tcp_functions.9 register_tcp_functions.9 \
 	tcp_functions.9 register_tcp_functions_as_name.9 \
 	tcp_functions.9 register_tcp_functions_as_names.9 \
 	tcp_functions.9 deregister_tcp_functions.9
 MLINKS+=time.9 boottime.9 \
 	time.9 time_second.9 \
 	time.9 time_uptime.9
 MLINKS+=timeout.9 callout.9 \
 	timeout.9 callout_active.9 \
 	timeout.9 callout_async_drain.9 \
 	timeout.9 callout_deactivate.9 \
 	timeout.9 callout_drain.9 \
 	timeout.9 callout_handle_init.9 \
 	timeout.9 callout_init.9 \
 	timeout.9 callout_init_mtx.9 \
 	timeout.9 callout_init_rm.9 \
 	timeout.9 callout_init_rw.9 \
 	timeout.9 callout_pending.9 \
 	timeout.9 callout_reset.9 \
 	timeout.9 callout_reset_curcpu.9 \
 	timeout.9 callout_reset_on.9 \
 	timeout.9 callout_reset_sbt.9 \
 	timeout.9 callout_reset_sbt_curcpu.9 \
 	timeout.9 callout_reset_sbt_on.9 \
 	timeout.9 callout_schedule.9 \
 	timeout.9 callout_schedule_curcpu.9 \
 	timeout.9 callout_schedule_on.9 \
 	timeout.9 callout_schedule_sbt.9 \
 	timeout.9 callout_schedule_sbt_curcpu.9 \
 	timeout.9 callout_schedule_sbt_on.9 \
 	timeout.9 callout_stop.9 \
 	timeout.9 callout_when.9 \
 	timeout.9 untimeout.9
 MLINKS+=ucred.9 crcopy.9 \
 	ucred.9 crcopysafe.9 \
 	ucred.9 crdup.9 \
 	ucred.9 crfree.9 \
 	ucred.9 crget.9 \
 	ucred.9 crhold.9 \
 	ucred.9 crsetgroups.9 \
 	ucred.9 cru2x.9
 MLINKS+=uidinfo.9 uifind.9 \
 	uidinfo.9 uifree.9 \
 	uidinfo.9 uihashinit.9 \
 	uidinfo.9 uihold.9
 MLINKS+=uio.9 uiomove.9 \
 	uio.9 uiomove_frombuf.9 \
 	uio.9 uiomove_nofault.9
 
 .if ${MK_USB} != "no"
 MAN+=	usbdi.9
 MLINKS+=usbdi.9 usbd_do_request.9 \
 	usbdi.9 usbd_do_request_flags.9 \
 	usbdi.9 usbd_errstr.9 \
 	usbdi.9 usbd_lookup_id_by_info.9 \
 	usbdi.9 usbd_lookup_id_by_uaa.9 \
 	usbdi.9 usbd_transfer_clear_stall.9 \
 	usbdi.9 usbd_transfer_drain.9 \
 	usbdi.9 usbd_transfer_pending.9 \
 	usbdi.9 usbd_transfer_poll.9 \
 	usbdi.9 usbd_transfer_setup.9 \
 	usbdi.9 usbd_transfer_start.9 \
 	usbdi.9 usbd_transfer_stop.9 \
 	usbdi.9 usbd_transfer_submit.9 \
 	usbdi.9 usbd_transfer_unsetup.9 \
 	usbdi.9 usbd_xfer_clr_flag.9 \
 	usbdi.9 usbd_xfer_frame_data.9 \
 	usbdi.9 usbd_xfer_frame_len.9 \
 	usbdi.9 usbd_xfer_get_frame.9 \
 	usbdi.9 usbd_xfer_get_priv.9 \
 	usbdi.9 usbd_xfer_is_stalled.9 \
 	usbdi.9 usbd_xfer_max_framelen.9 \
 	usbdi.9 usbd_xfer_max_frames.9 \
 	usbdi.9 usbd_xfer_max_len.9 \
 	usbdi.9 usbd_xfer_set_flag.9 \
 	usbdi.9 usbd_xfer_set_frame_data.9 \
 	usbdi.9 usbd_xfer_set_frame_len.9 \
 	usbdi.9 usbd_xfer_set_frame_offset.9 \
 	usbdi.9 usbd_xfer_set_frames.9 \
 	usbdi.9 usbd_xfer_set_interval.9 \
 	usbdi.9 usbd_xfer_set_priv.9 \
 	usbdi.9 usbd_xfer_set_stall.9 \
 	usbdi.9 usbd_xfer_set_timeout.9 \
 	usbdi.9 usbd_xfer_softc.9 \
 	usbdi.9 usbd_xfer_state.9 \
 	usbdi.9 usbd_xfer_status.9 \
 	usbdi.9 usb_fifo_alloc_buffer.9 \
 	usbdi.9 usb_fifo_attach.9 \
 	usbdi.9 usb_fifo_detach.9 \
 	usbdi.9 usb_fifo_free_buffer.9 \
 	usbdi.9 usb_fifo_get_data.9 \
 	usbdi.9 usb_fifo_get_data_buffer.9 \
 	usbdi.9 usb_fifo_get_data_error.9 \
 	usbdi.9 usb_fifo_get_data_linear.9 \
 	usbdi.9 usb_fifo_put_bytes_max.9 \
 	usbdi.9 usb_fifo_put_data.9 \
 	usbdi.9 usb_fifo_put_data_buffer.9 \
 	usbdi.9 usb_fifo_put_data_error.9 \
 	usbdi.9 usb_fifo_put_data_linear.9 \
 	usbdi.9 usb_fifo_reset.9 \
 	usbdi.9 usb_fifo_softc.9 \
 	usbdi.9 usb_fifo_wakeup.9
 .endif
 MLINKS+=vcount.9 count_dev.9
 MLINKS+=vfsconf.9 vfs_modevent.9 \
 	vfsconf.9 vfs_register.9 \
 	vfsconf.9 vfs_unregister.9
 MLINKS+=vfs_getopt.9 vfs_copyopt.9 \
 	vfs_getopt.9 vfs_filteropt.9 \
 	vfs_getopt.9 vfs_flagopt.9 \
 	vfs_getopt.9 vfs_getopts.9 \
 	vfs_getopt.9 vfs_scanopt.9 \
 	vfs_getopt.9 vfs_setopt.9 \
 	vfs_getopt.9 vfs_setopt_part.9 \
 	vfs_getopt.9 vfs_setopts.9
 MLINKS+=vhold.9 vdrop.9 \
 	vhold.9 vdropl.9 \
 	vhold.9 vholdl.9
 MLINKS+=vmem.9 vmem_add.9 \
 	vmem.9 vmem_alloc.9 \
 	vmem.9 vmem_create.9 \
 	vmem.9 vmem_destroy.9 \
 	vmem.9 vmem_free.9 \
 	vmem.9 vmem_xalloc.9 \
 	vmem.9 vmem_xfree.9  
 MLINKS+=vm_map_lock.9 vm_map_lock_downgrade.9 \
 	vm_map_lock.9 vm_map_lock_read.9 \
 	vm_map_lock.9 vm_map_lock_upgrade.9 \
 	vm_map_lock.9 vm_map_trylock.9 \
 	vm_map_lock.9 vm_map_trylock_read.9 \
 	vm_map_lock.9 vm_map_unlock.9 \
 	vm_map_lock.9 vm_map_unlock_read.9
 MLINKS+=vm_map_lookup.9 vm_map_lookup_done.9
 MLINKS+=vm_map_max.9 vm_map_min.9 \
 	vm_map_max.9 vm_map_pmap.9
 MLINKS+=vm_map_stack.9 vm_map_growstack.9
 MLINKS+=vm_map_wire.9 vm_map_unwire.9
 MLINKS+=vm_page_bits.9 vm_page_clear_dirty.9 \
 	vm_page_bits.9 vm_page_dirty.9 \
 	vm_page_bits.9 vm_page_is_valid.9 \
 	vm_page_bits.9 vm_page_set_invalid.9 \
 	vm_page_bits.9 vm_page_set_validclean.9 \
 	vm_page_bits.9 vm_page_test_dirty.9 \
 	vm_page_bits.9 vm_page_undirty.9 \
 	vm_page_bits.9 vm_page_zero_invalid.9
 MLINKS+=vm_page_busy.9 vm_page_busied.9 \
 	vm_page_busy.9 vm_page_busy_downgrade.9 \
 	vm_page_busy.9 vm_page_busy_sleep.9 \
 	vm_page_busy.9 vm_page_sbusied.9 \
 	vm_page_busy.9 vm_page_sbusy.9 \
 	vm_page_busy.9 vm_page_sleep_if_busy.9 \
 	vm_page_busy.9 vm_page_sunbusy.9 \
 	vm_page_busy.9 vm_page_trysbusy.9 \
 	vm_page_busy.9 vm_page_tryxbusy.9 \
 	vm_page_busy.9 vm_page_xbusied.9 \
 	vm_page_busy.9 vm_page_xbusy.9 \
 	vm_page_busy.9 vm_page_xunbusy.9 \
 	vm_page_busy.9 vm_page_assert_sbusied.9 \
 	vm_page_busy.9 vm_page_assert_unbusied.9 \
 	vm_page_busy.9 vm_page_assert_xbusied.9
 MLINKS+=vm_page_aflag.9 vm_page_aflag_clear.9 \
 	vm_page_aflag.9 vm_page_aflag_set.9 \
 	vm_page_aflag.9 vm_page_reference.9
 MLINKS+=vm_page_free.9 vm_page_free_toq.9 \
 	vm_page_free.9 vm_page_free_zero.9 \
 	vm_page_free.9 vm_page_try_to_free.9
 MLINKS+=vm_page_hold.9 vm_page_unhold.9
 MLINKS+=vm_page_insert.9 vm_page_remove.9
 MLINKS+=vm_page_wire.9 vm_page_unwire.9
 MLINKS+=VOP_ACCESS.9 VOP_ACCESSX.9
 MLINKS+=VOP_ATTRIB.9 VOP_GETATTR.9 \
 	VOP_ATTRIB.9 VOP_SETATTR.9
 MLINKS+=VOP_CREATE.9 VOP_MKDIR.9 \
 	VOP_CREATE.9 VOP_MKNOD.9 \
 	VOP_CREATE.9 VOP_SYMLINK.9
+MLINKS+=VOP_FSYNC.9 VOP_FDATASYNC.9
 MLINKS+=VOP_GETPAGES.9 VOP_PUTPAGES.9
 MLINKS+=VOP_INACTIVE.9 VOP_RECLAIM.9
 MLINKS+=VOP_LOCK.9 vn_lock.9 \
 	VOP_LOCK.9 VOP_ISLOCKED.9 \
 	VOP_LOCK.9 VOP_UNLOCK.9
 MLINKS+=VOP_OPENCLOSE.9 VOP_CLOSE.9 \
 	VOP_OPENCLOSE.9 VOP_OPEN.9
 MLINKS+=VOP_RDWR.9 VOP_READ.9 \
 	VOP_RDWR.9 VOP_WRITE.9
 MLINKS+=VOP_REMOVE.9 VOP_RMDIR.9
 MLINKS+=vnet.9 vimage.9
 MLINKS+=vref.9 VREF.9 \
 	vref.9 vrefl.9
 MLINKS+=vrele.9 vput.9 \
 	vrele.9 vunref.9
 MLINKS+=vslock.9 vsunlock.9
 MLINKS+=zone.9 uma.9 \
 	zone.9 uma_zalloc.9 \
 	zone.9 uma_zalloc_arg.9 \
 	zone.9 uma_zalloc_domain.9 \
 	zone.9 uma_zcreate.9 \
 	zone.9 uma_zdestroy.9 \
 	zone.9 uma_zfree.9 \
 	zone.9 uma_zfree_arg.9 \
 	zone.9 uma_zfree_domain.9 \
 	zone.9 uma_zone_get_cur.9 \
 	zone.9 uma_zone_get_max.9 \
 	zone.9 uma_zone_set_max.9 \
 	zone.9 uma_zone_set_warning.9 \
 	zone.9 uma_zone_set_maxaction.9
 
 .include <bsd.prog.mk>
Index: projects/capsicum-test/share/man/man9/VOP_FSYNC.9
===================================================================
--- projects/capsicum-test/share/man/man9/VOP_FSYNC.9	(revision 345709)
+++ projects/capsicum-test/share/man/man9/VOP_FSYNC.9	(revision 345710)
@@ -1,84 +1,103 @@
 .\" -*- nroff -*-
 .\"
 .\" Copyright (c) 1996 Doug Rabson
 .\"
 .\" All rights reserved.
 .\"
 .\" This program is free software.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY EXPRESS OR
 .\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 .\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 .\" IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT,
 .\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 .\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 .\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 .\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd July 24, 1996
+.Dd March 22, 2019
 .Dt VOP_FSYNC 9
 .Os
 .Sh NAME
+.Nm VOP_FDATASYNC ,
 .Nm VOP_FSYNC
 .Nd flush file system buffers for a file
 .Sh SYNOPSIS
 .In sys/param.h
 .In sys/vnode.h
 .Ft int
+.Fn VOP_FDATASYNC "struct vnode *vp" "struct thread *td"
+.Ft int
 .Fn VOP_FSYNC "struct vnode *vp" "int waitfor" "struct thread *td"
 .Sh DESCRIPTION
-This call flushes any dirty file system buffers for the file.
-It is used to implement the
+.Fn VOP_FSYNC
+ensures that a file can be recovered to its current state following a crash.
+That typically requires flushing the file's dirty buffers, its inode, and
+possibly other filesystem metadata to persistent media.
+.Fn VOP_FSYNC
+is used to implement the
 .Xr sync 2
 and
 .Xr fsync 2
 system calls.
 .Pp
 Its arguments are:
 .Bl -tag -width waitfor
 .It Fa vp
 The vnode of the file.
 .It Fa waitfor
 Whether the function should wait for I/O to complete.
 Possible values are:
 .Bl -tag -width MNT_NOWAIT
 .It Dv MNT_WAIT
 Synchronously wait for I/O to complete.
 .It Dv MNT_NOWAIT
 Start all I/O, but do not wait for it.
 .It Dv MNT_LAZY
 Push data not written by file system syncer.
 .El
 .It Fa td
 The calling thread.
 .El
+.Pp
+.Fn VOP_FDATASYNC
+is similar, but it does not require that all of the file's metadata be flushed.
+It only requires that the file's data be recoverable after a crash.
+That implies that the data itself must be flushed to disk, as well as some
+metadata such as the file's size but not necessarily its attributes.
+.Fn VOP_FDATASYNC
+should always wait for I/O to complete, as if called with
+.Dv MNT_WAIT .
+.Fn VOP_FDATASYNC
+is used to implement
+.Xr fdatasync 2 .
 .Sh LOCKS
-The file should be locked on entry.
+The vnode should be exclusively locked on entry, and stays locked on return.
 .Sh RETURN VALUES
 Zero is returned if the call is successful, otherwise an appropriate
 error code is returned.
 .Sh ERRORS
 .Bl -tag -width Er
 .It Bq Er ENOSPC
 The file system is full.
 .It Bq Er EDQUOT
 Quota exceeded.
 .El
 .Sh SEE ALSO
 .Xr vnode 9
 .Sh AUTHORS
 This manual page was written by
 .An Doug Rabson .
Index: projects/capsicum-test/share/mk/bsd.progs.mk
===================================================================
--- projects/capsicum-test/share/mk/bsd.progs.mk	(revision 345709)
+++ projects/capsicum-test/share/mk/bsd.progs.mk	(revision 345710)
@@ -1,169 +1,169 @@
 # $FreeBSD$
 # $Id: progs.mk,v 1.11 2012/11/06 17:18:54 sjg Exp $
 #
 #	@(#) Copyright (c) 2006, Simon J. Gerraty
 #
 #	This file is provided in the hope that it will
 #	be of use.  There is absolutely NO WARRANTY.
 #	Permission to copy, redistribute or otherwise
 #	use this file is hereby granted provided that 
 #	the above copyright notice and this notice are
 #	left intact. 
 #      
 #	Please send copies of changes and bug-fixes to:
 #	sjg@crufty.net
 #
 
 .MAIN: all
 
 .if defined(PROGS) || defined(PROGS_CXX)
 # we really only use PROGS below...
 PROGS += ${PROGS_CXX}
 
 .if defined(PROG)
 # just one of many
-PROG_OVERRIDE_VARS +=	BINDIR BINGRP BINOWN BINMODE DPSRCS MAN NO_SHARED \
-			NO_WERROR PROGNAME SRCS STRIP WARNS
+PROG_OVERRIDE_VARS +=	BINDIR BINGRP BINOWN BINMODE CSTD CXXSTD DPSRCS MAN \
+			NO_SHARED NO_WERROR PROGNAME SRCS STRIP WARNS
 PROG_VARS +=	CFLAGS CXXFLAGS DEBUG_FLAGS DPADD INTERNALPROG LDADD LIBADD \
 		LINKS LDFLAGS MLINKS ${PROG_OVERRIDE_VARS}
 .for v in ${PROG_VARS:O:u}
 .if empty(${PROG_OVERRIDE_VARS:M$v})
 .if defined(${v}.${PROG})
 $v += ${${v}.${PROG}}
 .elif defined(${v}_${PROG})
 $v += ${${v}_${PROG}}
 .endif
 .else
 .if defined(${v}.${PROG})
 $v = ${${v}.${PROG}}
 .elif defined(${v}_${PROG})
 $v = ${${v}_${PROG}}
 .endif
 $v ?=
 .endif
 .endfor
 
 .if ${MK_DIRDEPS_BUILD} == "yes"
 # Leave updating the Makefile.depend to the parent.
 UPDATE_DEPENDFILE = NO
 
 # Record our meta files for the parent to use.
 CLEANFILES+= ${PROG}.meta_files
 ${PROG}.meta_files: .NOMETA $${.MAKE.META.CREATED} ${_this}
 	@echo "Updating ${.TARGET}: ${.OODATE:T:[1..8]}"
 	@echo ${.MAKE.META.FILES} > ${.TARGET}
 
 .if !defined(_SKIP_BUILD)
 .END: ${PROG}.meta_files
 .endif
 .endif	# ${MK_DIRDEPS_BUILD} == "yes"
 
 # prog.mk will do the rest
 .else # !defined(PROG)
 .if !defined(_SKIP_BUILD)
 all: ${PROGS}
 .endif
 
 META_XTRAS+=	${cat ${PROGS:S/$/*.meta_files/} 2>/dev/null || true:L:sh}
 
 .if ${MK_STAGING} != "no" && !empty(PROGS)
 # Stage from parent while respecting PROGNAME and BINDIR overrides.
 .for _prog in ${PROGS}
 STAGE_DIR.prog.${_prog}= ${STAGE_OBJTOP}${BINDIR.${_prog}:UBINDIR_${_prog}:U${BINDIR}}
 STAGE_AS_SETS+=	prog.${_prog}
 STAGE_AS_prog.${_prog}=	${PROGNAME.${_prog}:UPROGNAME_${_prog}:U${_prog}}
 stage_as.prog.${_prog}: ${_prog}
 .endfor
 .endif	# ${MK_STAGING} != "no" && !empty(PROGS)
 .endif
 .endif	# PROGS || PROGS_CXX
 
 # These are handled by the main make process.
 .ifdef _RECURSING_PROGS
 MK_STAGING= no
 
 _PROGS_GLOBAL_VARS= CLEANFILES CLEANDIRS CONFGROUPS DIRS FILESGROUPS INCSGROUPS \
 		    SCRIPTS
 .for v in ${_PROGS_GLOBAL_VARS}
 $v =
 .endfor
 .endif
 
 # handle being called [bsd.]progs.mk
 .include <bsd.prog.mk>
 
 # Find common sources among the PROGS to depend on them before building
 # anything.  This allows parallelization without them each fighting over
 # the same objects.
 _PROGS_COMMON_SRCS=
 _PROGS_ALL_SRCS=
 .for p in ${PROGS}
 .for s in ${SRCS.${p}}
 .if ${_PROGS_ALL_SRCS:M${s}} && !${_PROGS_COMMON_SRCS:M${s}}
 _PROGS_COMMON_SRCS+=	${s}
 .else
 _PROGS_ALL_SRCS+=	${s}
 .endif
 .endfor
 .endfor
 .if !empty(_PROGS_COMMON_SRCS)
 _PROGS_COMMON_OBJS=	${_PROGS_COMMON_SRCS:M*.[dhly]}
 .if !empty(_PROGS_COMMON_SRCS:N*.[dhly])
 _PROGS_COMMON_OBJS+=	${_PROGS_COMMON_SRCS:N*.[dhly]:${OBJS_SRCS_FILTER:ts:}:S/$/.o/g}
 .endif
 .endif
 
 # When recursing, ensure common sources are not rebuilt in META_MODE.
 .if defined(_RECURSING_PROGS) && !empty(_PROGS_COMMON_OBJS) && \
     !empty(.MAKE.MODE:Mmeta)
 ${_PROGS_COMMON_OBJS}: .NOMETA
 .endif
 
 .if !empty(PROGS) && !defined(_RECURSING_PROGS) && !defined(PROG)
 # tell progs.mk we might want to install things
 PROGS_TARGETS+= checkdpadd clean depend install
 # Only handle removing depend files from the main process.
 _PROG_MK.cleandir=	CLEANDEPENDFILES= CLEANDEPENDDIRS=
 _PROG_MK.cleanobj=	CLEANDEPENDFILES= CLEANDEPENDDIRS=
 # Only recurse on these if there is no objdir, meaning a normal
 # 'clean' gets ran via the target defined in bsd.obj.mk.
 # Same check from cleanobj: in bsd.obj.mk
 .if ${CANONICALOBJDIR} == ${.CURDIR} || !exists(${CANONICALOBJDIR}/)
 PROGS_TARGETS+=	cleandir cleanobj
 .endif
 
 # Ensure common objects are built before recursing.
 .if !empty(_PROGS_COMMON_OBJS)
 ${PROGS}: ${_PROGS_COMMON_OBJS}
 .endif
 
 .for p in ${PROGS}
 .if defined(PROGS_CXX) && !empty(PROGS_CXX:M$p)
 # bsd.prog.mk may need to know this
 x.$p= PROG_CXX=$p
 .endif
 
 # Main PROG target
 $p ${p}_p: .PHONY .MAKE
 	(cd ${.CURDIR} && \
 	    DEPENDFILE=.depend.$p \
 	    NO_SUBDIR=1 ${MAKE} -f ${MAKEFILE} _RECURSING_PROGS=t \
 	    PROG=$p ${x.$p})
 
 # Pseudo targets for PROG, such as 'install'.
 .for t in ${PROGS_TARGETS:O:u}
 $p.$t: .PHONY .MAKE
 	(cd ${.CURDIR} && \
 	    DEPENDFILE=.depend.$p \
 	    NO_SUBDIR=1 ${MAKE} -f ${MAKEFILE} _RECURSING_PROGS=t \
 	    ${_PROG_MK.${t}} PROG=$p ${x.$p} ${@:E})
 .endfor
 .endfor
 
 # Depend main pseudo targets on all PROG.pseudo targets too.
 .for t in ${PROGS_TARGETS:O:u}
 .if make(${t})
 $t: ${PROGS:%=%.$t}
 .endif
 .endfor
 .endif	# !empty(PROGS) && !defined(_RECURSING_PROGS) && !defined(PROG)
Index: projects/capsicum-test/share/mk/bsd.sys.mk
===================================================================
--- projects/capsicum-test/share/mk/bsd.sys.mk	(revision 345709)
+++ projects/capsicum-test/share/mk/bsd.sys.mk	(revision 345710)
@@ -1,376 +1,389 @@
 # $FreeBSD$
 #
 # This file contains common settings used for building FreeBSD
 # sources.
 
 # Enable various levels of compiler warning checks.  These may be
 # overridden (e.g. if using a non-gcc compiler) by defining MK_WARNS=no.
 
 # for 4.2.1 GCC:   http://gcc.gnu.org/onlinedocs/gcc-4.2.1/gcc/Warning-Options.html
 # for current GCC: https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html
 # for clang: https://clang.llvm.org/docs/DiagnosticsReference.html
 
 .include <bsd.compiler.mk>
 
 # the default is gnu99 for now
 CSTD?=		gnu99
 
 .if ${CSTD} == "c89" || ${CSTD} == "c90"
 CFLAGS+=	-std=iso9899:1990
 .elif ${CSTD} == "c94" || ${CSTD} == "c95"
 CFLAGS+=	-std=iso9899:199409
 .elif ${CSTD} == "c99"
 CFLAGS+=	-std=iso9899:1999
 .else # CSTD
 CFLAGS+=	-std=${CSTD}
 .endif # CSTD
+
+.if ${COMPILER_FEATURES:Mc++11}
+CXXSTD?=	c++11
+.elif ${COMPILER_TYPE} == "gcc"
+# Prior versions of g++ support C++98 with GNU extensions by default.
+CXXSTD?=	gnu++98
+.else
+# Assume that the compiler supports at least C++98.
+CXXSTD?=	c++98
+.endif
+CXXFLAGS+=	-std=${CXXSTD}
+# CXXSTD
+
 # -pedantic is problematic because it also imposes namespace restrictions
 #CFLAGS+=	-pedantic
 .if defined(WARNS)
 .if ${WARNS} >= 1
 CWARNFLAGS+=	-Wsystem-headers
 .if !defined(NO_WERROR) && !defined(NO_WERROR.${COMPILER_TYPE})
 CWARNFLAGS+=	-Werror
 .endif # !NO_WERROR && !NO_WERROR.${COMPILER_TYPE}
 .endif # WARNS >= 1
 .if ${WARNS} >= 2
 CWARNFLAGS+=	-Wall -Wno-format-y2k
 .endif # WARNS >= 2
 .if ${WARNS} >= 3
 CWARNFLAGS+=	-W -Wno-unused-parameter -Wstrict-prototypes\
 		-Wmissing-prototypes -Wpointer-arith
 .endif # WARNS >= 3
 .if ${WARNS} >= 4
 CWARNFLAGS+=	-Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wshadow\
 		-Wunused-parameter
 .if !defined(NO_WCAST_ALIGN) && !defined(NO_WCAST_ALIGN.${COMPILER_TYPE})
 CWARNFLAGS+=	-Wcast-align
 .endif # !NO_WCAST_ALIGN !NO_WCAST_ALIGN.${COMPILER_TYPE}
 .endif # WARNS >= 4
 .if ${WARNS} >= 6
 CWARNFLAGS+=	-Wchar-subscripts -Winline -Wnested-externs -Wredundant-decls\
 		-Wold-style-definition
 .if !defined(NO_WMISSING_VARIABLE_DECLARATIONS)
 CWARNFLAGS.clang+=	-Wmissing-variable-declarations
 .endif
 .if !defined(NO_WTHREAD_SAFETY)
 CWARNFLAGS.clang+=	-Wthread-safety
 .endif
 .endif # WARNS >= 6
 .if ${WARNS} >= 2 && ${WARNS} <= 4
 # XXX Delete -Wuninitialized by default for now -- the compiler doesn't
 # XXX always get it right.
 CWARNFLAGS+=	-Wno-uninitialized
 .endif # WARNS >=2 && WARNS <= 4
 CWARNFLAGS+=	-Wno-pointer-sign
 # Clang has more warnings enabled by default, and when using -Wall, so if WARNS
 # is set to low values, these have to be disabled explicitly.
 .if ${WARNS} <= 6
 CWARNFLAGS.clang+=	-Wno-empty-body -Wno-string-plus-int
 .if ${COMPILER_TYPE} == "clang" && ${COMPILER_VERSION} >= 30400
 CWARNFLAGS.clang+=	-Wno-unused-const-variable
 .endif
 .endif # WARNS <= 6
 .if ${WARNS} <= 3
 CWARNFLAGS.clang+=	-Wno-tautological-compare -Wno-unused-value\
 		-Wno-parentheses-equality -Wno-unused-function -Wno-enum-conversion
 .if ${COMPILER_TYPE} == "clang" && ${COMPILER_VERSION} >= 30600
 CWARNFLAGS.clang+=	-Wno-unused-local-typedef
 .endif
 .if ${COMPILER_TYPE} == "clang" && ${COMPILER_VERSION} >= 40000
 CWARNFLAGS.clang+=	-Wno-address-of-packed-member
 .endif
 .if ${COMPILER_TYPE} == "clang" && ${COMPILER_VERSION} >= 70000 && \
     ${MACHINE_CPUARCH} == "arm" && !${MACHINE_ARCH:Marmv[67]*}
 CWARNFLAGS.clang+=	-Wno-atomic-alignment
 .endif
 .endif # WARNS <= 3
 .if ${WARNS} <= 2
 CWARNFLAGS.clang+=	-Wno-switch -Wno-switch-enum -Wno-knr-promoted-parameter
 .endif # WARNS <= 2
 .if ${WARNS} <= 1
 CWARNFLAGS.clang+=	-Wno-parentheses
 .endif # WARNS <= 1
 .if defined(NO_WARRAY_BOUNDS)
 CWARNFLAGS.clang+=	-Wno-array-bounds
 .endif # NO_WARRAY_BOUNDS
 .endif # WARNS
 
 .if defined(FORMAT_AUDIT)
 WFORMAT=	1
 .endif # FORMAT_AUDIT
 .if defined(WFORMAT)
 .if ${WFORMAT} > 0
 #CWARNFLAGS+=	-Wformat-nonliteral -Wformat-security -Wno-format-extra-args
 CWARNFLAGS+=	-Wformat=2 -Wno-format-extra-args
 .if ${WARNS} <= 3
 CWARNFLAGS.clang+=	-Wno-format-nonliteral
 .endif # WARNS <= 3
 .if !defined(NO_WERROR) && !defined(NO_WERROR.${COMPILER_TYPE})
 CWARNFLAGS+=	-Werror
 .endif # !NO_WERROR && !NO_WERROR.${COMPILER_TYPE}
 .endif # WFORMAT > 0
 .endif # WFORMAT
 .if defined(NO_WFORMAT) || defined(NO_WFORMAT.${COMPILER_TYPE})
 CWARNFLAGS+=	-Wno-format
 .endif # NO_WFORMAT || NO_WFORMAT.${COMPILER_TYPE}
 
 # GCC 5.2.0
 .if ${COMPILER_TYPE} == "gcc" && ${COMPILER_VERSION} >= 50200
 CWARNFLAGS+=	-Wno-error=address			\
 		-Wno-error=array-bounds			\
 		-Wno-error=attributes			\
 		-Wno-error=bool-compare			\
 		-Wno-error=cast-align			\
 		-Wno-error=clobbered			\
 		-Wno-error=enum-compare			\
 		-Wno-error=extra			\
 		-Wno-error=inline			\
 		-Wno-error=logical-not-parentheses	\
 		-Wno-error=strict-aliasing		\
 		-Wno-error=uninitialized		\
 		-Wno-error=unused-but-set-variable	\
 		-Wno-error=unused-function		\
 		-Wno-error=unused-value
 .endif
 
 # GCC 6.1.0
 .if ${COMPILER_TYPE} == "gcc" && ${COMPILER_VERSION} >= 60100
 CWARNFLAGS+=	-Wno-error=misleading-indentation	\
 		-Wno-error=nonnull-compare		\
 		-Wno-error=shift-negative-value		\
 		-Wno-error=tautological-compare		\
 		-Wno-error=unused-const-variable
 .endif
 
 # GCC 7.1.0
 .if ${COMPILER_TYPE} == "gcc" && ${COMPILER_VERSION} >= 70100
 CWARNFLAGS+=	-Wno-error=bool-operation		\
 		-Wno-error=deprecated			\
 		-Wno-error=expansion-to-defined		\
 		-Wno-error=format-overflow		\
 		-Wno-error=format-truncation		\
 		-Wno-error=implicit-fallthrough		\
 		-Wno-error=int-in-bool-context		\
 		-Wno-error=memset-elt-size		\
 		-Wno-error=noexcept-type		\
 		-Wno-error=nonnull			\
 		-Wno-error=pointer-compare		\
 		-Wno-error=stringop-overflow
 .endif
 
 # GCC 8.1.0
 .if ${COMPILER_TYPE} == "gcc" && ${COMPILER_VERSION} >= 80100
 CWARNFLAGS+=	-Wno-error=aggressive-loop-optimizations	\
 		-Wno-error=cast-function-type			\
 		-Wno-error=catch-value				\
 		-Wno-error=multistatement-macros		\
 		-Wno-error=restrict				\
 		-Wno-error=sizeof-pointer-memaccess		\
 		-Wno-error=stringop-truncation
 .endif
 
 # How to handle FreeBSD custom printf format specifiers.
 .if ${COMPILER_TYPE} == "clang" && ${COMPILER_VERSION} >= 30600
 FORMAT_EXTENSIONS=	-D__printf__=__freebsd_kprintf__
 .else
 FORMAT_EXTENSIONS=	-fformat-extensions
 .endif
 
 .if defined(IGNORE_PRAGMA)
 CWARNFLAGS+=	-Wno-unknown-pragmas
 .endif # IGNORE_PRAGMA
 
 # We need this conditional because many places that use it
 # only enable it for some files with CLFAGS.$FILE+=${CLANG_NO_IAS}.
 # unconditionally, and can't easily use the CFLAGS.clang=
 # mechanism.
 .if ${COMPILER_TYPE} == "clang"
 CLANG_NO_IAS=	 -no-integrated-as
 .endif
 CLANG_OPT_SMALL= -mstack-alignment=8 -mllvm -inline-threshold=3\
 		 -mllvm -simplifycfg-dup-ret
 .if ${COMPILER_VERSION} >= 30500 && ${COMPILER_VERSION} < 30700
 CLANG_OPT_SMALL+= -mllvm -enable-gvn=false
 .else
 CLANG_OPT_SMALL+= -mllvm -enable-load-pre=false
 .endif
 CFLAGS.clang+=	 -Qunused-arguments
 .if ${MACHINE_CPUARCH} == "sparc64"
 # Don't emit .cfi directives, since we must use GNU as on sparc64, for now.
 CFLAGS.clang+=	 -fno-dwarf2-cfi-asm
 .endif # SPARC64
 # The libc++ headers use c++11 extensions.  These are normally silenced because
 # they are treated as system headers, but we explicitly disable that warning
 # suppression when building the base system to catch bugs in our headers.
 # Eventually we'll want to start building the base system C++ code as C++11,
 # but not yet.
 CXXFLAGS.clang+=	 -Wno-c++11-extensions
 
 .if ${MK_SSP} != "no" && \
     ${MACHINE_CPUARCH} != "arm" && ${MACHINE_CPUARCH} != "mips"
 .if (${COMPILER_TYPE} == "clang" && ${COMPILER_VERSION} >= 30500) || \
     (${COMPILER_TYPE} == "gcc" && \
      (${COMPILER_VERSION} == 40201 || ${COMPILER_VERSION} >= 40900))
 # Don't use -Wstack-protector as it breaks world with -Werror.
 SSP_CFLAGS?=	-fstack-protector-strong
 .else
 SSP_CFLAGS?=	-fstack-protector
 .endif
 CFLAGS+=	${SSP_CFLAGS}
 .endif # SSP && !ARM && !MIPS
 
 # Additional flags passed in CFLAGS and CXXFLAGS when MK_DEBUG_FILES is
 # enabled.
 DEBUG_FILES_CFLAGS?= -g
 
 # Allow user-specified additional warning flags, plus compiler and file
 # specific flag overrides, unless we've overridden this...
 .if ${MK_WARNS} != "no"
 CFLAGS+=	${CWARNFLAGS:M*} ${CWARNFLAGS.${COMPILER_TYPE}}
 CFLAGS+=	${CWARNFLAGS.${.IMPSRC:T}}
 .endif
 
 CFLAGS+=	 ${CFLAGS.${COMPILER_TYPE}}
 CXXFLAGS+=	 ${CXXFLAGS.${COMPILER_TYPE}}
 
 AFLAGS+=	${AFLAGS.${.IMPSRC:T}}
 ACFLAGS+=	${ACFLAGS.${.IMPSRC:T}}
 CFLAGS+=	${CFLAGS.${.IMPSRC:T}}
 CXXFLAGS+=	${CXXFLAGS.${.IMPSRC:T}}
 
 LDFLAGS+=	${LDFLAGS.${LINKER_TYPE}}
 
 .if defined(SRCTOP)
 # Prevent rebuilding during install to support read-only objdirs.
 .if ${.TARGETS:M*install*} == ${.TARGETS} && empty(.MAKE.MODE:Mmeta)
 CFLAGS+=	ERROR-tried-to-rebuild-during-make-install
 .endif
 .endif
 
 # Tell bmake not to mistake standard targets for things to be searched for
 # or expect to ever be up-to-date.
 PHONY_NOTMAIN = analyze afterdepend afterinstall all beforedepend beforeinstall \
 		beforelinking build build-tools buildconfig buildfiles \
 		buildincludes check checkdpadd clean cleandepend cleandir \
 		cleanobj configure depend distclean distribute exe \
 		files html includes install installconfig installdirs \
 		installfiles installincludes lint obj objlink objs objwarn \
 		realinstall tags whereobj
 
 # we don't want ${PROG} to be PHONY
 .PHONY: ${PHONY_NOTMAIN:N${PROG:U}}
 .NOTMAIN: ${PHONY_NOTMAIN:Nall}
 
 .if ${MK_STAGING} != "no"
 .if defined(_SKIP_BUILD) || (!make(all) && !make(clean*))
 _SKIP_STAGING?= yes
 .endif
 .if ${_SKIP_STAGING:Uno} == "yes"
 staging stage_libs stage_files stage_as stage_links stage_symlinks:
 .else
 # allow targets like beforeinstall to be leveraged
 DESTDIR= ${STAGE_OBJTOP}
 .export DESTDIR
 
 .if target(beforeinstall)
 .if !empty(_LIBS) || (${MK_STAGING_PROG} != "no" && !defined(INTERNALPROG))
 staging: beforeinstall
 .endif
 .endif
 
 # normally only libs and includes are staged
 .if ${MK_STAGING_PROG} != "no" && !defined(INTERNALPROG)
 STAGE_DIR.prog= ${STAGE_OBJTOP}${BINDIR}
 
 .if !empty(PROG)
 .if defined(PROGNAME)
 STAGE_AS_SETS+= prog
 STAGE_AS_${PROG}= ${PROGNAME}
 stage_as.prog: ${PROG}
 .else
 STAGE_SETS+= prog
 stage_files.prog: ${PROG}
 STAGE_TARGETS+= stage_files
 .endif
 .endif
 .endif
 
 .if !empty(_LIBS) && !defined(INTERNALLIB)
 .if defined(SHLIBDIR) && ${SHLIBDIR} != ${LIBDIR} && ${_LIBS:Uno:M*.so.*} != ""
 STAGE_SETS+= shlib
 STAGE_DIR.shlib= ${STAGE_OBJTOP}${SHLIBDIR}
 STAGE_FILES.shlib+= ${_LIBS:M*.so.*}
 stage_files.shlib: ${_LIBS:M*.so.*}
 .endif
 
 .if defined(SHLIB_LINK) && commands(${SHLIB_LINK:R}.ld)
 STAGE_AS_SETS+= ldscript
 STAGE_AS.ldscript+= ${SHLIB_LINK:R}.ld
 stage_as.ldscript: ${SHLIB_LINK:R}.ld
 STAGE_DIR.ldscript = ${STAGE_LIBDIR}
 STAGE_AS_${SHLIB_LINK:R}.ld:= ${SHLIB_LINK}
 NO_SHLIB_LINKS=
 .endif
 
 .if target(stage_files.shlib)
 stage_libs: ${_LIBS}
 .if defined(DEBUG_FLAGS) && target(${SHLIB_NAME}.symbols)
 stage_files.shlib: ${SHLIB_NAME}.symbols
 .endif
 .else
 stage_libs: ${_LIBS}
 .endif
 .if defined(SHLIB_NAME) && defined(DEBUG_FLAGS) && target(${SHLIB_NAME}.symbols)
 stage_libs: ${SHLIB_NAME}.symbols
 .endif
 
 .endif
 
 .if !empty(INCS) || !empty(INCSGROUPS) && target(buildincludes)
 .if !defined(NO_BEFOREBUILD_INCLUDES)
 stage_includes: buildincludes
 beforebuild: stage_includes
 .endif
 .endif
 
 .for t in stage_libs stage_files stage_as
 .if target($t)
 STAGE_TARGETS+= $t
 .endif
 .endfor
 
 .if !empty(STAGE_AS_SETS)
 STAGE_TARGETS+= stage_as
 .endif
 
 .if !empty(STAGE_TARGETS) || (${MK_STAGING_PROG} != "no" && !defined(INTERNALPROG))
 
 .if !empty(LINKS)
 STAGE_TARGETS+= stage_links
 .if ${MAKE_VERSION} < 20131001
 stage_links.links: ${_LIBS} ${PROG}
 .endif
 STAGE_SETS+= links
 STAGE_LINKS.links= ${LINKS}
 .endif
 
 .if !empty(SYMLINKS)
 STAGE_TARGETS+= stage_symlinks
 STAGE_SETS+= links
 STAGE_SYMLINKS.links= ${SYMLINKS}
 .endif
 
 .endif
 
 .include <meta.stage.mk>
 .endif
 .endif
 
 .if defined(META_TARGETS)
 .for _tgt in ${META_TARGETS}
 .if target(${_tgt})
 ${_tgt}: ${META_DEPS}
 .endif
 .endfor
 .endif
Index: projects/capsicum-test/share/mk/googletest.test.inc.mk
===================================================================
--- projects/capsicum-test/share/mk/googletest.test.inc.mk	(revision 345709)
+++ projects/capsicum-test/share/mk/googletest.test.inc.mk	(revision 345710)
@@ -1,16 +1,12 @@
 # $FreeBSD$
 
-# XXX: this should be defined in bsd.sys.mk
-CXXSTD?=	c++11
-
 GTESTS_CXXFLAGS+= -DGTEST_HAS_POSIX_RE=1
 GTESTS_CXXFLAGS+= -DGTEST_HAS_PTHREAD=1
 GTESTS_CXXFLAGS+= -DGTEST_HAS_STREAM_REDIRECTION=1
 GTESTS_CXXFLAGS+= -frtti
-GTESTS_CXXFLAGS+= -std=${CXXSTD}
 
 # XXX: src.libnames.mk should handle adding this directory for libgtest's,
 # libgmock's, etc, headers.
 CXXFLAGS+=	-I${DESTDIR}${INCLUDEDIR}/private
 
 NO_WTHREAD_SAFETY=
Index: projects/capsicum-test/sys/cam/cam_periph.c
===================================================================
--- projects/capsicum-test/sys/cam/cam_periph.c	(revision 345709)
+++ projects/capsicum-test/sys/cam/cam_periph.c	(revision 345710)
@@ -1,2108 +1,2167 @@
 /*-
  * Common functions for CAM "type" (peripheral) drivers.
  *
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1997, 1998 Justin T. Gibbs.
  * Copyright (c) 1997, 1998, 1999, 2000 Kenneth D. Merry.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/types.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/devicestat.h>
 #include <sys/bus.h>
 #include <sys/sbuf.h>
+#include <sys/sysctl.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_queue.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_sim.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_message.h>
 #include <cam/scsi/scsi_pass.h>
 
 static	u_int		camperiphnextunit(struct periph_driver *p_drv,
 					  u_int newunit, int wired,
 					  path_id_t pathid, target_id_t target,
 					  lun_id_t lun);
 static	u_int		camperiphunit(struct periph_driver *p_drv,
 				      path_id_t pathid, target_id_t target,
 				      lun_id_t lun); 
 static	void		camperiphdone(struct cam_periph *periph, 
 					union ccb *done_ccb);
 static  void		camperiphfree(struct cam_periph *periph);
 static int		camperiphscsistatuserror(union ccb *ccb,
 					        union ccb **orig_ccb,
 						 cam_flags camflags,
 						 u_int32_t sense_flags,
 						 int *openings,
 						 u_int32_t *relsim_flags,
 						 u_int32_t *timeout,
 						 u_int32_t  *action,
 						 const char **action_string);
 static	int		camperiphscsisenseerror(union ccb *ccb,
 					        union ccb **orig_ccb,
 					        cam_flags camflags,
 					        u_int32_t sense_flags,
 					        int *openings,
 					        u_int32_t *relsim_flags,
 					        u_int32_t *timeout,
 					        u_int32_t *action,
 					        const char **action_string);
 static void		cam_periph_devctl_notify(union ccb *ccb);
 
 static int nperiph_drivers;
 static int initialized = 0;
 struct periph_driver **periph_drivers;
 
 static MALLOC_DEFINE(M_CAMPERIPH, "CAM periph", "CAM peripheral buffers");
 
 static int periph_selto_delay = 1000;
 TUNABLE_INT("kern.cam.periph_selto_delay", &periph_selto_delay);
 static int periph_noresrc_delay = 500;
 TUNABLE_INT("kern.cam.periph_noresrc_delay", &periph_noresrc_delay);
 static int periph_busy_delay = 500;
 TUNABLE_INT("kern.cam.periph_busy_delay", &periph_busy_delay);
 
+static u_int periph_mapmem_thresh = 65536;
+SYSCTL_UINT(_kern_cam, OID_AUTO, mapmem_thresh, CTLFLAG_RWTUN,
+    &periph_mapmem_thresh, 0, "Threshold for user-space buffer mapping");
 
 void
 periphdriver_register(void *data)
 {
 	struct periph_driver *drv = (struct periph_driver *)data;
 	struct periph_driver **newdrivers, **old;
 	int ndrivers;
 
 again:
 	ndrivers = nperiph_drivers + 2;
 	newdrivers = malloc(sizeof(*newdrivers) * ndrivers, M_CAMPERIPH,
 			    M_WAITOK);
 	xpt_lock_buses();
 	if (ndrivers != nperiph_drivers + 2) {
 		/*
 		 * Lost race against itself; go around.
 		 */
 		xpt_unlock_buses();
 		free(newdrivers, M_CAMPERIPH);
 		goto again;
 	}
 	if (periph_drivers)
 		bcopy(periph_drivers, newdrivers,
 		      sizeof(*newdrivers) * nperiph_drivers);
 	newdrivers[nperiph_drivers] = drv;
 	newdrivers[nperiph_drivers + 1] = NULL;
 	old = periph_drivers;
 	periph_drivers = newdrivers;
 	nperiph_drivers++;
 	xpt_unlock_buses();
 	if (old)
 		free(old, M_CAMPERIPH);
 	/* If driver marked as early or it is late now, initialize it. */
 	if (((drv->flags & CAM_PERIPH_DRV_EARLY) != 0 && initialized > 0) ||
 	    initialized > 1)
 		(*drv->init)();
 }
 
 int
 periphdriver_unregister(void *data)
 {
 	struct periph_driver *drv = (struct periph_driver *)data;
 	int error, n;
 
 	/* If driver marked as early or it is late now, deinitialize it. */
 	if (((drv->flags & CAM_PERIPH_DRV_EARLY) != 0 && initialized > 0) ||
 	    initialized > 1) {
 		if (drv->deinit == NULL) {
 			printf("CAM periph driver '%s' doesn't have deinit.\n",
 			    drv->driver_name);
 			return (EOPNOTSUPP);
 		}
 		error = drv->deinit();
 		if (error != 0)
 			return (error);
 	}
 
 	xpt_lock_buses();
 	for (n = 0; n < nperiph_drivers && periph_drivers[n] != drv; n++)
 		;
 	KASSERT(n < nperiph_drivers,
 	    ("Periph driver '%s' was not registered", drv->driver_name));
 	for (; n + 1 < nperiph_drivers; n++)
 		periph_drivers[n] = periph_drivers[n + 1];
 	periph_drivers[n + 1] = NULL;
 	nperiph_drivers--;
 	xpt_unlock_buses();
 	return (0);
 }
 
 void
 periphdriver_init(int level)
 {
 	int	i, early;
 
 	initialized = max(initialized, level);
 	for (i = 0; periph_drivers[i] != NULL; i++) {
 		early = (periph_drivers[i]->flags & CAM_PERIPH_DRV_EARLY) ? 1 : 2;
 		if (early == initialized)
 			(*periph_drivers[i]->init)();
 	}
 }
 
 cam_status
 cam_periph_alloc(periph_ctor_t *periph_ctor,
 		 periph_oninv_t *periph_oninvalidate,
 		 periph_dtor_t *periph_dtor, periph_start_t *periph_start,
 		 char *name, cam_periph_type type, struct cam_path *path,
 		 ac_callback_t *ac_callback, ac_code code, void *arg)
 {
 	struct		periph_driver **p_drv;
 	struct		cam_sim *sim;
 	struct		cam_periph *periph;
 	struct		cam_periph *cur_periph;
 	path_id_t	path_id;
 	target_id_t	target_id;
 	lun_id_t	lun_id;
 	cam_status	status;
 	u_int		init_level;
 
 	init_level = 0;
 	/*
 	 * Handle Hot-Plug scenarios.  If there is already a peripheral
 	 * of our type assigned to this path, we are likely waiting for
 	 * final close on an old, invalidated, peripheral.  If this is
 	 * the case, queue up a deferred call to the peripheral's async
 	 * handler.  If it looks like a mistaken re-allocation, complain.
 	 */
 	if ((periph = cam_periph_find(path, name)) != NULL) {
 
 		if ((periph->flags & CAM_PERIPH_INVALID) != 0
 		 && (periph->flags & CAM_PERIPH_NEW_DEV_FOUND) == 0) {
 			periph->flags |= CAM_PERIPH_NEW_DEV_FOUND;
 			periph->deferred_callback = ac_callback;
 			periph->deferred_ac = code;
 			return (CAM_REQ_INPROG);
 		} else {
 			printf("cam_periph_alloc: attempt to re-allocate "
 			       "valid device %s%d rejected flags %#x "
 			       "refcount %d\n", periph->periph_name,
 			       periph->unit_number, periph->flags,
 			       periph->refcount);
 		}
 		return (CAM_REQ_INVALID);
 	}
 	
 	periph = (struct cam_periph *)malloc(sizeof(*periph), M_CAMPERIPH,
 					     M_NOWAIT|M_ZERO);
 
 	if (periph == NULL)
 		return (CAM_RESRC_UNAVAIL);
 	
 	init_level++;
 
 
 	sim = xpt_path_sim(path);
 	path_id = xpt_path_path_id(path);
 	target_id = xpt_path_target_id(path);
 	lun_id = xpt_path_lun_id(path);
 	periph->periph_start = periph_start;
 	periph->periph_dtor = periph_dtor;
 	periph->periph_oninval = periph_oninvalidate;
 	periph->type = type;
 	periph->periph_name = name;
 	periph->scheduled_priority = CAM_PRIORITY_NONE;
 	periph->immediate_priority = CAM_PRIORITY_NONE;
 	periph->refcount = 1;		/* Dropped by invalidation. */
 	periph->sim = sim;
 	SLIST_INIT(&periph->ccb_list);
 	status = xpt_create_path(&path, periph, path_id, target_id, lun_id);
 	if (status != CAM_REQ_CMP)
 		goto failure;
 	periph->path = path;
 
 	xpt_lock_buses();
 	for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
 		if (strcmp((*p_drv)->driver_name, name) == 0)
 			break;
 	}
 	if (*p_drv == NULL) {
 		printf("cam_periph_alloc: invalid periph name '%s'\n", name);
 		xpt_unlock_buses();
 		xpt_free_path(periph->path);
 		free(periph, M_CAMPERIPH);
 		return (CAM_REQ_INVALID);
 	}
 	periph->unit_number = camperiphunit(*p_drv, path_id, target_id, lun_id);
 	cur_periph = TAILQ_FIRST(&(*p_drv)->units);
 	while (cur_periph != NULL
 	    && cur_periph->unit_number < periph->unit_number)
 		cur_periph = TAILQ_NEXT(cur_periph, unit_links);
 	if (cur_periph != NULL) {
 		KASSERT(cur_periph->unit_number != periph->unit_number, ("duplicate units on periph list"));
 		TAILQ_INSERT_BEFORE(cur_periph, periph, unit_links);
 	} else {
 		TAILQ_INSERT_TAIL(&(*p_drv)->units, periph, unit_links);
 		(*p_drv)->generation++;
 	}
 	xpt_unlock_buses();
 
 	init_level++;
 
 	status = xpt_add_periph(periph);
 	if (status != CAM_REQ_CMP)
 		goto failure;
 
 	init_level++;
 	CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph created\n"));
 
 	status = periph_ctor(periph, arg);
 
 	if (status == CAM_REQ_CMP)
 		init_level++;
 
 failure:
 	switch (init_level) {
 	case 4:
 		/* Initialized successfully */
 		break;
 	case 3:
 		CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n"));
 		xpt_remove_periph(periph);
 		/* FALLTHROUGH */
 	case 2:
 		xpt_lock_buses();
 		TAILQ_REMOVE(&(*p_drv)->units, periph, unit_links);
 		xpt_unlock_buses();
 		xpt_free_path(periph->path);
 		/* FALLTHROUGH */
 	case 1:
 		free(periph, M_CAMPERIPH);
 		/* FALLTHROUGH */
 	case 0:
 		/* No cleanup to perform. */
 		break;
 	default:
 		panic("%s: Unknown init level", __func__);
 	}
 	return(status);
 }
 
 /*
  * Find a peripheral structure with the specified path, target, lun, 
  * and (optionally) type.  If the name is NULL, this function will return
  * the first peripheral driver that matches the specified path.
  */
 struct cam_periph *
 cam_periph_find(struct cam_path *path, char *name)
 {
 	struct periph_driver **p_drv;
 	struct cam_periph *periph;
 
 	xpt_lock_buses();
 	for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
 
 		if (name != NULL && (strcmp((*p_drv)->driver_name, name) != 0))
 			continue;
 
 		TAILQ_FOREACH(periph, &(*p_drv)->units, unit_links) {
 			if (xpt_path_comp(periph->path, path) == 0) {
 				xpt_unlock_buses();
 				cam_periph_assert(periph, MA_OWNED);
 				return(periph);
 			}
 		}
 		if (name != NULL) {
 			xpt_unlock_buses();
 			return(NULL);
 		}
 	}
 	xpt_unlock_buses();
 	return(NULL);
 }
 
 /*
  * Find peripheral driver instances attached to the specified path.
  */
 int
 cam_periph_list(struct cam_path *path, struct sbuf *sb)
 {
 	struct sbuf local_sb;
 	struct periph_driver **p_drv;
 	struct cam_periph *periph;
 	int count;
 	int sbuf_alloc_len;
 
 	sbuf_alloc_len = 16;
 retry:
 	sbuf_new(&local_sb, NULL, sbuf_alloc_len, SBUF_FIXEDLEN);
 	count = 0;
 	xpt_lock_buses();
 	for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
 
 		TAILQ_FOREACH(periph, &(*p_drv)->units, unit_links) {
 			if (xpt_path_comp(periph->path, path) != 0)
 				continue;
 
 			if (sbuf_len(&local_sb) != 0)
 				sbuf_cat(&local_sb, ",");
 
 			sbuf_printf(&local_sb, "%s%d", periph->periph_name,
 				    periph->unit_number);
 
 			if (sbuf_error(&local_sb) == ENOMEM) {
 				sbuf_alloc_len *= 2;
 				xpt_unlock_buses();
 				sbuf_delete(&local_sb);
 				goto retry;
 			}
 			count++;
 		}
 	}
 	xpt_unlock_buses();
 	sbuf_finish(&local_sb);
 	sbuf_cpy(sb, sbuf_data(&local_sb));
 	sbuf_delete(&local_sb);
 	return (count);
 }
 
 int
 cam_periph_acquire(struct cam_periph *periph)
 {
 	int status;
 
 	if (periph == NULL)
 		return (EINVAL);
 
 	status = ENOENT;
 	xpt_lock_buses();
 	if ((periph->flags & CAM_PERIPH_INVALID) == 0) {
 		periph->refcount++;
 		status = 0;
 	}
 	xpt_unlock_buses();
 
 	return (status);
 }
 
 void
 cam_periph_doacquire(struct cam_periph *periph)
 {
 
 	xpt_lock_buses();
 	KASSERT(periph->refcount >= 1,
 	    ("cam_periph_doacquire() with refcount == %d", periph->refcount));
 	periph->refcount++;
 	xpt_unlock_buses();
 }
 
 void
 cam_periph_release_locked_buses(struct cam_periph *periph)
 {
 
 	cam_periph_assert(periph, MA_OWNED);
 	KASSERT(periph->refcount >= 1, ("periph->refcount >= 1"));
 	if (--periph->refcount == 0)
 		camperiphfree(periph);
 }
 
 void
 cam_periph_release_locked(struct cam_periph *periph)
 {
 
 	if (periph == NULL)
 		return;
 
 	xpt_lock_buses();
 	cam_periph_release_locked_buses(periph);
 	xpt_unlock_buses();
 }
 
 void
 cam_periph_release(struct cam_periph *periph)
 {
 	struct mtx *mtx;
 
 	if (periph == NULL)
 		return;
 	
 	cam_periph_assert(periph, MA_NOTOWNED);
 	mtx = cam_periph_mtx(periph);
 	mtx_lock(mtx);
 	cam_periph_release_locked(periph);
 	mtx_unlock(mtx);
 }
 
 /*
  * hold/unhold act as mutual exclusion for sections of the code that
  * need to sleep and want to make sure that other sections that
  * will interfere are held off. This only protects exclusive sections
  * from each other.
  */
 int
 cam_periph_hold(struct cam_periph *periph, int priority)
 {
 	int error;
 
 	/*
 	 * Increment the reference count on the peripheral
 	 * while we wait for our lock attempt to succeed
 	 * to ensure the peripheral doesn't disappear out
 	 * from user us while we sleep.
 	 */
 
 	if (cam_periph_acquire(periph) != 0)
 		return (ENXIO);
 
 	cam_periph_assert(periph, MA_OWNED);
 	while ((periph->flags & CAM_PERIPH_LOCKED) != 0) {
 		periph->flags |= CAM_PERIPH_LOCK_WANTED;
 		if ((error = cam_periph_sleep(periph, periph, priority,
 		    "caplck", 0)) != 0) {
 			cam_periph_release_locked(periph);
 			return (error);
 		}
 		if (periph->flags & CAM_PERIPH_INVALID) {
 			cam_periph_release_locked(periph);
 			return (ENXIO);
 		}
 	}
 
 	periph->flags |= CAM_PERIPH_LOCKED;
 	return (0);
 }
 
 void
 cam_periph_unhold(struct cam_periph *periph)
 {
 
 	cam_periph_assert(periph, MA_OWNED);
 
 	periph->flags &= ~CAM_PERIPH_LOCKED;
 	if ((periph->flags & CAM_PERIPH_LOCK_WANTED) != 0) {
 		periph->flags &= ~CAM_PERIPH_LOCK_WANTED;
 		wakeup(periph);
 	}
 
 	cam_periph_release_locked(periph);
 }
 
 /*
  * Look for the next unit number that is not currently in use for this
  * peripheral type starting at "newunit".  Also exclude unit numbers that
  * are reserved by for future "hardwiring" unless we already know that this
  * is a potential wired device.  Only assume that the device is "wired" the
  * first time through the loop since after that we'll be looking at unit
  * numbers that did not match a wiring entry.
  */
 static u_int
 camperiphnextunit(struct periph_driver *p_drv, u_int newunit, int wired,
 		  path_id_t pathid, target_id_t target, lun_id_t lun)
 {
 	struct	cam_periph *periph;
 	char	*periph_name;
 	int	i, val, dunit, r;
 	const char *dname, *strval;
 
 	periph_name = p_drv->driver_name;
 	for (;;newunit++) {
 
 		for (periph = TAILQ_FIRST(&p_drv->units);
 		     periph != NULL && periph->unit_number != newunit;
 		     periph = TAILQ_NEXT(periph, unit_links))
 			;
 
 		if (periph != NULL && periph->unit_number == newunit) {
 			if (wired != 0) {
 				xpt_print(periph->path, "Duplicate Wired "
 				    "Device entry!\n");
 				xpt_print(periph->path, "Second device (%s "
 				    "device at scbus%d target %d lun %d) will "
 				    "not be wired\n", periph_name, pathid,
 				    target, lun);
 				wired = 0;
 			}
 			continue;
 		}
 		if (wired)
 			break;
 
 		/*
 		 * Don't match entries like "da 4" as a wired down
 		 * device, but do match entries like "da 4 target 5"
 		 * or even "da 4 scbus 1". 
 		 */
 		i = 0;
 		dname = periph_name;
 		for (;;) {
 			r = resource_find_dev(&i, dname, &dunit, NULL, NULL);
 			if (r != 0)
 				break;
 			/* if no "target" and no specific scbus, skip */
 			if (resource_int_value(dname, dunit, "target", &val) &&
 			    (resource_string_value(dname, dunit, "at",&strval)||
 			     strcmp(strval, "scbus") == 0))
 				continue;
 			if (newunit == dunit)
 				break;
 		}
 		if (r != 0)
 			break;
 	}
 	return (newunit);
 }
 
 static u_int
 camperiphunit(struct periph_driver *p_drv, path_id_t pathid,
 	      target_id_t target, lun_id_t lun)
 {
 	u_int	unit;
 	int	wired, i, val, dunit;
 	const char *dname, *strval;
 	char	pathbuf[32], *periph_name;
 
 	periph_name = p_drv->driver_name;
 	snprintf(pathbuf, sizeof(pathbuf), "scbus%d", pathid);
 	unit = 0;
 	i = 0;
 	dname = periph_name;
 	for (wired = 0; resource_find_dev(&i, dname, &dunit, NULL, NULL) == 0;
 	     wired = 0) {
 		if (resource_string_value(dname, dunit, "at", &strval) == 0) {
 			if (strcmp(strval, pathbuf) != 0)
 				continue;
 			wired++;
 		}
 		if (resource_int_value(dname, dunit, "target", &val) == 0) {
 			if (val != target)
 				continue;
 			wired++;
 		}
 		if (resource_int_value(dname, dunit, "lun", &val) == 0) {
 			if (val != lun)
 				continue;
 			wired++;
 		}
 		if (wired != 0) {
 			unit = dunit;
 			break;
 		}
 	}
 
 	/*
 	 * Either start from 0 looking for the next unit or from
 	 * the unit number given in the resource config.  This way,
 	 * if we have wildcard matches, we don't return the same
 	 * unit number twice.
 	 */
 	unit = camperiphnextunit(p_drv, unit, wired, pathid, target, lun);
 
 	return (unit);
 }
 
 void
 cam_periph_invalidate(struct cam_periph *periph)
 {
 
 	cam_periph_assert(periph, MA_OWNED);
 	/*
 	 * We only call this routine the first time a peripheral is
 	 * invalidated.
 	 */
 	if ((periph->flags & CAM_PERIPH_INVALID) != 0)
 		return;
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph invalidated\n"));
 	if ((periph->flags & CAM_PERIPH_ANNOUNCED) && !rebooting) {
 		struct sbuf sb;
 		char buffer[160];
 
 		sbuf_new(&sb, buffer, 160, SBUF_FIXEDLEN);
 		xpt_denounce_periph_sbuf(periph, &sb);
 		sbuf_finish(&sb);
 		sbuf_putbuf(&sb);
 	}
 	periph->flags |= CAM_PERIPH_INVALID;
 	periph->flags &= ~CAM_PERIPH_NEW_DEV_FOUND;
 	if (periph->periph_oninval != NULL)
 		periph->periph_oninval(periph);
 	cam_periph_release_locked(periph);
 }
 
 static void
 camperiphfree(struct cam_periph *periph)
 {
 	struct periph_driver **p_drv;
 	struct periph_driver *drv;
 
 	cam_periph_assert(periph, MA_OWNED);
 	KASSERT(periph->periph_allocating == 0, ("%s%d: freed while allocating",
 	    periph->periph_name, periph->unit_number));
 	for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
 		if (strcmp((*p_drv)->driver_name, periph->periph_name) == 0)
 			break;
 	}
 	if (*p_drv == NULL) {
 		printf("camperiphfree: attempt to free non-existant periph\n");
 		return;
 	}
 	/*
 	 * Cache a pointer to the periph_driver structure.  If a
 	 * periph_driver is added or removed from the array (see
 	 * periphdriver_register()) while we drop the toplogy lock
 	 * below, p_drv may change.  This doesn't protect against this
 	 * particular periph_driver going away.  That will require full
 	 * reference counting in the periph_driver infrastructure.
 	 */
 	drv = *p_drv;
 
 	/*
 	 * We need to set this flag before dropping the topology lock, to
 	 * let anyone who is traversing the list that this peripheral is
 	 * about to be freed, and there will be no more reference count
 	 * checks.
 	 */
 	periph->flags |= CAM_PERIPH_FREE;
 
 	/*
 	 * The peripheral destructor semantics dictate calling with only the
 	 * SIM mutex held.  Since it might sleep, it should not be called
 	 * with the topology lock held.
 	 */
 	xpt_unlock_buses();
 
 	/*
 	 * We need to call the peripheral destructor prior to removing the
 	 * peripheral from the list.  Otherwise, we risk running into a
 	 * scenario where the peripheral unit number may get reused
 	 * (because it has been removed from the list), but some resources
 	 * used by the peripheral are still hanging around.  In particular,
 	 * the devfs nodes used by some peripherals like the pass(4) driver
 	 * aren't fully cleaned up until the destructor is run.  If the
 	 * unit number is reused before the devfs instance is fully gone,
 	 * devfs will panic.
 	 */
 	if (periph->periph_dtor != NULL)
 		periph->periph_dtor(periph);
 
 	/*
 	 * The peripheral list is protected by the topology lock.
 	 */
 	xpt_lock_buses();
 
 	TAILQ_REMOVE(&drv->units, periph, unit_links);
 	drv->generation++;
 
 	xpt_remove_periph(periph);
 
 	xpt_unlock_buses();
 	if ((periph->flags & CAM_PERIPH_ANNOUNCED) && !rebooting)
 		xpt_print(periph->path, "Periph destroyed\n");
 	else
 		CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n"));
 
 	if (periph->flags & CAM_PERIPH_NEW_DEV_FOUND) {
 		union ccb ccb;
 		void *arg;
 
 		switch (periph->deferred_ac) {
 		case AC_FOUND_DEVICE:
 			ccb.ccb_h.func_code = XPT_GDEV_TYPE;
 			xpt_setup_ccb(&ccb.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 			xpt_action(&ccb);
 			arg = &ccb;
 			break;
 		case AC_PATH_REGISTERED:
 			xpt_path_inq(&ccb.cpi, periph->path);
 			arg = &ccb;
 			break;
 		default:
 			arg = NULL;
 			break;
 		}
 		periph->deferred_callback(NULL, periph->deferred_ac,
 					  periph->path, arg);
 	}
 	xpt_free_path(periph->path);
 	free(periph, M_CAMPERIPH);
 	xpt_lock_buses();
 }
 
 /*
  * Map user virtual pointers into kernel virtual address space, so we can
  * access the memory.  This is now a generic function that centralizes most
  * of the sanity checks on the data flags, if any.
  * This also only works for up to MAXPHYS memory.  Since we use
  * buffers to map stuff in and out, we're limited to the buffer size.
  */
 int
 cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo,
     u_int maxmap)
 {
-	int numbufs, i, j;
-	int flags[CAM_PERIPH_MAXMAPS];
+	int numbufs, i;
 	u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS];
 	u_int32_t lengths[CAM_PERIPH_MAXMAPS];
 	u_int32_t dirs[CAM_PERIPH_MAXMAPS];
 
+	bzero(mapinfo, sizeof(*mapinfo));
 	if (maxmap == 0)
 		maxmap = DFLTPHYS;	/* traditional default */
 	else if (maxmap > MAXPHYS)
 		maxmap = MAXPHYS;	/* for safety */
 	switch(ccb->ccb_h.func_code) {
 	case XPT_DEV_MATCH:
 		if (ccb->cdm.match_buf_len == 0) {
 			printf("cam_periph_mapmem: invalid match buffer "
 			       "length 0\n");
 			return(EINVAL);
 		}
 		if (ccb->cdm.pattern_buf_len > 0) {
 			data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns;
 			lengths[0] = ccb->cdm.pattern_buf_len;
 			dirs[0] = CAM_DIR_OUT;
 			data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches;
 			lengths[1] = ccb->cdm.match_buf_len;
 			dirs[1] = CAM_DIR_IN;
 			numbufs = 2;
 		} else {
 			data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches;
 			lengths[0] = ccb->cdm.match_buf_len;
 			dirs[0] = CAM_DIR_IN;
 			numbufs = 1;
 		}
 		/*
 		 * This request will not go to the hardware, no reason
 		 * to be so strict. vmapbuf() is able to map up to MAXPHYS.
 		 */
 		maxmap = MAXPHYS;
 		break;
 	case XPT_SCSI_IO:
 	case XPT_CONT_TARGET_IO:
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
 			return(0);
 		if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR)
 			return (EINVAL);
 		data_ptrs[0] = &ccb->csio.data_ptr;
 		lengths[0] = ccb->csio.dxfer_len;
 		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		numbufs = 1;
 		break;
 	case XPT_ATA_IO:
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
 			return(0);
 		if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR)
 			return (EINVAL);
 		data_ptrs[0] = &ccb->ataio.data_ptr;
 		lengths[0] = ccb->ataio.dxfer_len;
 		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		numbufs = 1;
 		break;
 	case XPT_MMC_IO:
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
 			return(0);
 		/* Two mappings: one for cmd->data and one for cmd->data->data */
 		data_ptrs[0] = (unsigned char **)&ccb->mmcio.cmd.data;
 		lengths[0] = sizeof(struct mmc_data *);
 		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		data_ptrs[1] = (unsigned char **)&ccb->mmcio.cmd.data->data;
 		lengths[1] = ccb->mmcio.cmd.data->len;
 		dirs[1] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		numbufs = 2;
 		break;
 	case XPT_SMP_IO:
 		data_ptrs[0] = &ccb->smpio.smp_request;
 		lengths[0] = ccb->smpio.smp_request_len;
 		dirs[0] = CAM_DIR_OUT;
 		data_ptrs[1] = &ccb->smpio.smp_response;
 		lengths[1] = ccb->smpio.smp_response_len;
 		dirs[1] = CAM_DIR_IN;
 		numbufs = 2;
 		break;
 	case XPT_NVME_IO:
 	case XPT_NVME_ADMIN:
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE)
 			return (0);
 		if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR)
 			return (EINVAL);
 		data_ptrs[0] = &ccb->nvmeio.data_ptr;
 		lengths[0] = ccb->nvmeio.dxfer_len;
 		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
 		numbufs = 1;
 		break;
 	case XPT_DEV_ADVINFO:
 		if (ccb->cdai.bufsiz == 0)
 			return (0);
 
 		data_ptrs[0] = (uint8_t **)&ccb->cdai.buf;
 		lengths[0] = ccb->cdai.bufsiz;
 		dirs[0] = CAM_DIR_IN;
 		numbufs = 1;
 
 		/*
 		 * This request will not go to the hardware, no reason
 		 * to be so strict. vmapbuf() is able to map up to MAXPHYS.
 		 */
 		maxmap = MAXPHYS;
 		break;
 	default:
 		return(EINVAL);
 		break; /* NOTREACHED */
 	}
 
 	/*
 	 * Check the transfer length and permissions first, so we don't
 	 * have to unmap any previously mapped buffers.
 	 */
 	for (i = 0; i < numbufs; i++) {
 
-		flags[i] = 0;
-
 		/*
 		 * The userland data pointer passed in may not be page
 		 * aligned.  vmapbuf() truncates the address to a page
 		 * boundary, so if the address isn't page aligned, we'll
 		 * need enough space for the given transfer length, plus
 		 * whatever extra space is necessary to make it to the page
 		 * boundary.
 		 */
 		if ((lengths[i] +
 		    (((vm_offset_t)(*data_ptrs[i])) & PAGE_MASK)) > maxmap){
 			printf("cam_periph_mapmem: attempt to map %lu bytes, "
 			       "which is greater than %lu\n",
 			       (long)(lengths[i] +
 			       (((vm_offset_t)(*data_ptrs[i])) & PAGE_MASK)),
 			       (u_long)maxmap);
 			return(E2BIG);
 		}
-
-		if (dirs[i] & CAM_DIR_OUT) {
-			flags[i] = BIO_WRITE;
-		}
-
-		if (dirs[i] & CAM_DIR_IN) {
-			flags[i] = BIO_READ;
-		}
-
 	}
 
 	/*
 	 * This keeps the kernel stack of current thread from getting
 	 * swapped.  In low-memory situations where the kernel stack might
 	 * otherwise get swapped out, this holds it and allows the thread
 	 * to make progress and release the kernel mapped pages sooner.
 	 *
 	 * XXX KDM should I use P_NOSWAP instead?
 	 */
 	PHOLD(curproc);
 
 	for (i = 0; i < numbufs; i++) {
+
+		/* Save the user's data address. */
+		mapinfo->orig[i] = *data_ptrs[i];
+
 		/*
+		 * For small buffers use malloc+copyin/copyout instead of
+		 * mapping to KVA to avoid expensive TLB shootdowns.  For
+		 * small allocations malloc is backed by UMA, and so much
+		 * cheaper on SMP systems.
+		 */
+		if (lengths[i] <= periph_mapmem_thresh &&
+		    ccb->ccb_h.func_code != XPT_MMC_IO) {
+			*data_ptrs[i] = malloc(lengths[i], M_CAMPERIPH,
+			    M_WAITOK);
+			if (dirs[i] != CAM_DIR_IN) {
+				if (copyin(mapinfo->orig[i], *data_ptrs[i],
+				    lengths[i]) != 0) {
+					free(*data_ptrs[i], M_CAMPERIPH);
+					*data_ptrs[i] = mapinfo->orig[i];
+					goto fail;
+				}
+			} else
+				bzero(*data_ptrs[i], lengths[i]);
+			continue;
+		}
+
+		/*
 		 * Get the buffer.
 		 */
 		mapinfo->bp[i] = uma_zalloc(pbuf_zone, M_WAITOK);
 
 		/* put our pointer in the data slot */
 		mapinfo->bp[i]->b_data = *data_ptrs[i];
 
-		/* save the user's data address */
-		mapinfo->bp[i]->b_caller1 = *data_ptrs[i];
-
 		/* set the transfer length, we know it's < MAXPHYS */
 		mapinfo->bp[i]->b_bufsize = lengths[i];
 
 		/* set the direction */
-		mapinfo->bp[i]->b_iocmd = flags[i];
+		mapinfo->bp[i]->b_iocmd = (dirs[i] == CAM_DIR_OUT) ?
+		    BIO_WRITE : BIO_READ;
 
 		/*
 		 * Map the buffer into kernel memory.
 		 *
 		 * Note that useracc() alone is not a  sufficient test.
 		 * vmapbuf() can still fail due to a smaller file mapped
 		 * into a larger area of VM, or if userland races against
 		 * vmapbuf() after the useracc() check.
 		 */
 		if (vmapbuf(mapinfo->bp[i], 1) < 0) {
-			for (j = 0; j < i; ++j) {
-				*data_ptrs[j] = mapinfo->bp[j]->b_caller1;
-				vunmapbuf(mapinfo->bp[j]);
-				uma_zfree(pbuf_zone, mapinfo->bp[j]);
-			}
 			uma_zfree(pbuf_zone, mapinfo->bp[i]);
-			PRELE(curproc);
-			return(EACCES);
+			goto fail;
 		}
 
 		/* set our pointer to the new mapped area */
 		*data_ptrs[i] = mapinfo->bp[i]->b_data;
-
-		mapinfo->num_bufs_used++;
 	}
 
 	/*
 	 * Now that we've gotten this far, change ownership to the kernel
 	 * of the buffers so that we don't run afoul of returning to user
 	 * space with locks (on the buffer) held.
 	 */
 	for (i = 0; i < numbufs; i++) {
-		BUF_KERNPROC(mapinfo->bp[i]);
+		if (mapinfo->bp[i])
+			BUF_KERNPROC(mapinfo->bp[i]);
 	}
 
-
+	mapinfo->num_bufs_used = numbufs;
 	return(0);
+
+fail:
+	for (i--; i >= 0; i--) {
+		if (mapinfo->bp[i]) {
+			vunmapbuf(mapinfo->bp[i]);
+			uma_zfree(pbuf_zone, mapinfo->bp[i]);
+		} else
+			free(*data_ptrs[i], M_CAMPERIPH);
+		*data_ptrs[i] = mapinfo->orig[i];
+	}
+	PRELE(curproc);
+	return(EACCES);
 }
 
 /*
  * Unmap memory segments mapped into kernel virtual address space by
  * cam_periph_mapmem().
  */
 void
 cam_periph_unmapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo)
 {
 	int numbufs, i;
 	u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS];
+	u_int32_t lengths[CAM_PERIPH_MAXMAPS];
+	u_int32_t dirs[CAM_PERIPH_MAXMAPS];
 
 	if (mapinfo->num_bufs_used <= 0) {
 		/* nothing to free and the process wasn't held. */
 		return;
 	}
 
 	switch (ccb->ccb_h.func_code) {
 	case XPT_DEV_MATCH:
-		numbufs = min(mapinfo->num_bufs_used, 2);
-
-		if (numbufs == 1) {
-			data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches;
-		} else {
+		if (ccb->cdm.pattern_buf_len > 0) {
 			data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns;
+			lengths[0] = ccb->cdm.pattern_buf_len;
+			dirs[0] = CAM_DIR_OUT;
 			data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches;
+			lengths[1] = ccb->cdm.match_buf_len;
+			dirs[1] = CAM_DIR_IN;
+			numbufs = 2;
+		} else {
+			data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches;
+			lengths[0] = ccb->cdm.match_buf_len;
+			dirs[0] = CAM_DIR_IN;
+			numbufs = 1;
 		}
 		break;
 	case XPT_SCSI_IO:
 	case XPT_CONT_TARGET_IO:
 		data_ptrs[0] = &ccb->csio.data_ptr;
-		numbufs = min(mapinfo->num_bufs_used, 1);
+		lengths[0] = ccb->csio.dxfer_len;
+		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
+		numbufs = 1;
 		break;
 	case XPT_ATA_IO:
 		data_ptrs[0] = &ccb->ataio.data_ptr;
-		numbufs = min(mapinfo->num_bufs_used, 1);
+		lengths[0] = ccb->ataio.dxfer_len;
+		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
+		numbufs = 1;
 		break;
+	case XPT_MMC_IO:
+		data_ptrs[0] = (u_int8_t **)&ccb->mmcio.cmd.data;
+		lengths[0] = sizeof(struct mmc_data *);
+		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
+		data_ptrs[1] = (u_int8_t **)&ccb->mmcio.cmd.data->data;
+		lengths[1] = ccb->mmcio.cmd.data->len;
+		dirs[1] = ccb->ccb_h.flags & CAM_DIR_MASK;
+		numbufs = 2;
+		break;
 	case XPT_SMP_IO:
-		numbufs = min(mapinfo->num_bufs_used, 2);
 		data_ptrs[0] = &ccb->smpio.smp_request;
+		lengths[0] = ccb->smpio.smp_request_len;
+		dirs[0] = CAM_DIR_OUT;
 		data_ptrs[1] = &ccb->smpio.smp_response;
+		lengths[1] = ccb->smpio.smp_response_len;
+		dirs[1] = CAM_DIR_IN;
+		numbufs = 2;
 		break;
-	case XPT_DEV_ADVINFO:
-		numbufs = min(mapinfo->num_bufs_used, 1);
-		data_ptrs[0] = (uint8_t **)&ccb->cdai.buf;
-		break;
 	case XPT_NVME_IO:
 	case XPT_NVME_ADMIN:
 		data_ptrs[0] = &ccb->nvmeio.data_ptr;
-		numbufs = min(mapinfo->num_bufs_used, 1);
+		lengths[0] = ccb->nvmeio.dxfer_len;
+		dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK;
+		numbufs = 1;
 		break;
+	case XPT_DEV_ADVINFO:
+		data_ptrs[0] = (uint8_t **)&ccb->cdai.buf;
+		lengths[0] = ccb->cdai.bufsiz;
+		dirs[0] = CAM_DIR_IN;
+		numbufs = 1;
+		break;
 	default:
 		/* allow ourselves to be swapped once again */
 		PRELE(curproc);
 		return;
 		break; /* NOTREACHED */ 
 	}
 
 	for (i = 0; i < numbufs; i++) {
-		/* Set the user's pointer back to the original value */
-		*data_ptrs[i] = mapinfo->bp[i]->b_caller1;
+		if (mapinfo->bp[i]) {
+			/* unmap the buffer */
+			vunmapbuf(mapinfo->bp[i]);
 
-		/* unmap the buffer */
-		vunmapbuf(mapinfo->bp[i]);
+			/* release the buffer */
+			uma_zfree(pbuf_zone, mapinfo->bp[i]);
+		} else {
+			if (dirs[i] != CAM_DIR_OUT) {
+				copyout(*data_ptrs[i], mapinfo->orig[i],
+				    lengths[i]);
+			}
+			free(*data_ptrs[i], M_CAMPERIPH);
+		}
 
-		/* release the buffer */
-		uma_zfree(pbuf_zone, mapinfo->bp[i]);
+		/* Set the user's pointer back to the original value */
+		*data_ptrs[i] = mapinfo->orig[i];
 	}
 
 	/* allow ourselves to be swapped once again */
 	PRELE(curproc);
 }
 
 int
 cam_periph_ioctl(struct cam_periph *periph, u_long cmd, caddr_t addr,
 		 int (*error_routine)(union ccb *ccb, 
 				      cam_flags camflags,
 				      u_int32_t sense_flags))
 {
 	union ccb 	     *ccb;
 	int 		     error;
 	int		     found;
 
 	error = found = 0;
 
 	switch(cmd){
 	case CAMGETPASSTHRU:
 		ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 		xpt_setup_ccb(&ccb->ccb_h,
 			      ccb->ccb_h.path,
 			      CAM_PRIORITY_NORMAL);
 		ccb->ccb_h.func_code = XPT_GDEVLIST;
 
 		/*
 		 * Basically, the point of this is that we go through
 		 * getting the list of devices, until we find a passthrough
 		 * device.  In the current version of the CAM code, the
 		 * only way to determine what type of device we're dealing
 		 * with is by its name.
 		 */
 		while (found == 0) {
 			ccb->cgdl.index = 0;
 			ccb->cgdl.status = CAM_GDEVLIST_MORE_DEVS;
 			while (ccb->cgdl.status == CAM_GDEVLIST_MORE_DEVS) {
 
 				/* we want the next device in the list */
 				xpt_action(ccb);
 				if (strncmp(ccb->cgdl.periph_name, 
 				    "pass", 4) == 0){
 					found = 1;
 					break;
 				}
 			}
 			if ((ccb->cgdl.status == CAM_GDEVLIST_LAST_DEVICE) &&
 			    (found == 0)) {
 				ccb->cgdl.periph_name[0] = '\0';
 				ccb->cgdl.unit_number = 0;
 				break;
 			}
 		}
 
 		/* copy the result back out */	
 		bcopy(ccb, addr, sizeof(union ccb));
 
 		/* and release the ccb */
 		xpt_release_ccb(ccb);
 
 		break;
 	default:
 		error = ENOTTY;
 		break;
 	}
 	return(error);
 }
 
 static void
 cam_periph_done_panic(struct cam_periph *periph, union ccb *done_ccb)
 {
 
 	panic("%s: already done with ccb %p", __func__, done_ccb);
 }
 
 static void
 cam_periph_done(struct cam_periph *periph, union ccb *done_ccb)
 {
 
 	/* Caller will release the CCB */
 	xpt_path_assert(done_ccb->ccb_h.path, MA_OWNED);
 	done_ccb->ccb_h.cbfcnp = cam_periph_done_panic;
 	wakeup(&done_ccb->ccb_h.cbfcnp);
 }
 
 static void
 cam_periph_ccbwait(union ccb *ccb)
 {
 
 	if ((ccb->ccb_h.func_code & XPT_FC_QUEUED) != 0) {
 		while (ccb->ccb_h.cbfcnp != cam_periph_done_panic)
 			xpt_path_sleep(ccb->ccb_h.path, &ccb->ccb_h.cbfcnp,
 			    PRIBIO, "cbwait", 0);
 	}
 	KASSERT(ccb->ccb_h.pinfo.index == CAM_UNQUEUED_INDEX &&
 	    (ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_INPROG,
 	    ("%s: proceeding with incomplete ccb: ccb=%p, func_code=%#x, "
 	     "status=%#x, index=%d", __func__, ccb, ccb->ccb_h.func_code,
 	     ccb->ccb_h.status, ccb->ccb_h.pinfo.index));
 }
 
 /*
  * Dispatch a CCB and wait for it to complete.  If the CCB has set a
  * callback function (ccb->ccb_h.cbfcnp), it will be overwritten and lost.
  */
 int
 cam_periph_runccb(union ccb *ccb,
 		  int (*error_routine)(union ccb *ccb,
 				       cam_flags camflags,
 				       u_int32_t sense_flags),
 		  cam_flags camflags, u_int32_t sense_flags,
 		  struct devstat *ds)
 {
 	struct bintime *starttime;
 	struct bintime ltime;
 	int error;
 	bool must_poll;
 	uint32_t timeout = 1;
 
 	starttime = NULL;
 	xpt_path_assert(ccb->ccb_h.path, MA_OWNED);
 	KASSERT((ccb->ccb_h.flags & CAM_UNLOCKED) == 0,
 	    ("%s: ccb=%p, func_code=%#x, flags=%#x", __func__, ccb,
 	     ccb->ccb_h.func_code, ccb->ccb_h.flags));
 
 	/*
 	 * If the user has supplied a stats structure, and if we understand
 	 * this particular type of ccb, record the transaction start.
 	 */
 	if (ds != NULL &&
 	    (ccb->ccb_h.func_code == XPT_SCSI_IO ||
 	    ccb->ccb_h.func_code == XPT_ATA_IO ||
 	    ccb->ccb_h.func_code == XPT_NVME_IO)) {
 		starttime = &ltime;
 		binuptime(starttime);
 		devstat_start_transaction(ds, starttime);
 	}
 
 	/*
 	 * We must poll the I/O while we're dumping. The scheduler is normally
 	 * stopped for dumping, except when we call doadump from ddb. While the
 	 * scheduler is running in this case, we still need to poll the I/O to
 	 * avoid sleeping waiting for the ccb to complete.
 	 *
 	 * A panic triggered dump stops the scheduler, any callback from the
 	 * shutdown_post_sync event will run with the scheduler stopped, but
 	 * before we're officially dumping. To avoid hanging in adashutdown
 	 * initiated commands (or other similar situations), we have to test for
 	 * either SCHEDULER_STOPPED() here as well.
 	 *
 	 * To avoid locking problems, dumping/polling callers must call
 	 * without a periph lock held.
 	 */
 	must_poll = dumping || SCHEDULER_STOPPED();
 	ccb->ccb_h.cbfcnp = cam_periph_done;
 
 	/*
 	 * If we're polling, then we need to ensure that we have ample resources
 	 * in the periph.  cam_periph_error can reschedule the ccb by calling
 	 * xpt_action and returning ERESTART, so we have to effect the polling
 	 * in the do loop below.
 	 */
 	if (must_poll) {
 		timeout = xpt_poll_setup(ccb);
 	}
 
 	if (timeout == 0) {
 		ccb->ccb_h.status = CAM_RESRC_UNAVAIL;
 		error = EBUSY;
 	} else {
 		xpt_action(ccb);
 		do {
 			if (must_poll) {
 				xpt_pollwait(ccb, timeout);
 				timeout = ccb->ccb_h.timeout * 10;
 			} else {
 				cam_periph_ccbwait(ccb);
 			}
 			if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP)
 				error = 0;
 			else if (error_routine != NULL) {
 				ccb->ccb_h.cbfcnp = cam_periph_done;
 				error = (*error_routine)(ccb, camflags, sense_flags);
 			} else
 				error = 0;
 		} while (error == ERESTART);
 	}
 
 	if ((ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
 		cam_release_devq(ccb->ccb_h.path,
 				 /* relsim_flags */0,
 				 /* openings */0,
 				 /* timeout */0,
 				 /* getcount_only */ FALSE);
 		ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
 	}
 
 	if (ds != NULL) {
 		uint32_t bytes;
 		devstat_tag_type tag;
 		bool valid = true;
 
 		if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
 			bytes = ccb->csio.dxfer_len - ccb->csio.resid;
 			tag = (devstat_tag_type)(ccb->csio.tag_action & 0x3);
 		} else if (ccb->ccb_h.func_code == XPT_ATA_IO) {
 			bytes = ccb->ataio.dxfer_len - ccb->ataio.resid;
 			tag = (devstat_tag_type)0;
 		} else if (ccb->ccb_h.func_code == XPT_NVME_IO) {
 			bytes = ccb->nvmeio.dxfer_len; /* NB: resid no possible */
 			tag = (devstat_tag_type)0;
 		} else {
 			valid = false;
 		}
 		if (valid)
 			devstat_end_transaction(ds, bytes, tag,
 			    ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) ?
 			    DEVSTAT_NO_DATA : (ccb->ccb_h.flags & CAM_DIR_OUT) ?
 			    DEVSTAT_WRITE : DEVSTAT_READ, NULL, starttime);
 	}
 
 	return(error);
 }
 
 void
 cam_freeze_devq(struct cam_path *path)
 {
 	struct ccb_hdr ccb_h;
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("cam_freeze_devq\n"));
 	xpt_setup_ccb(&ccb_h, path, /*priority*/1);
 	ccb_h.func_code = XPT_NOOP;
 	ccb_h.flags = CAM_DEV_QFREEZE;
 	xpt_action((union ccb *)&ccb_h);
 }
 
 u_int32_t
 cam_release_devq(struct cam_path *path, u_int32_t relsim_flags,
 		 u_int32_t openings, u_int32_t arg,
 		 int getcount_only)
 {
 	struct ccb_relsim crs;
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("cam_release_devq(%u, %u, %u, %d)\n",
 	    relsim_flags, openings, arg, getcount_only));
 	xpt_setup_ccb(&crs.ccb_h, path, CAM_PRIORITY_NORMAL);
 	crs.ccb_h.func_code = XPT_REL_SIMQ;
 	crs.ccb_h.flags = getcount_only ? CAM_DEV_QFREEZE : 0;
 	crs.release_flags = relsim_flags;
 	crs.openings = openings;
 	crs.release_timeout = arg;
 	xpt_action((union ccb *)&crs);
 	return (crs.qfrozen_cnt);
 }
 
 #define saved_ccb_ptr ppriv_ptr0
 static void
 camperiphdone(struct cam_periph *periph, union ccb *done_ccb)
 {
 	union ccb      *saved_ccb;
 	cam_status	status;
 	struct scsi_start_stop_unit *scsi_cmd;
 	int		error = 0, error_code, sense_key, asc, ascq;
 
 	scsi_cmd = (struct scsi_start_stop_unit *)
 	    &done_ccb->csio.cdb_io.cdb_bytes;
 	status = done_ccb->ccb_h.status;
 
 	if ((status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		if (scsi_extract_sense_ccb(done_ccb,
 		    &error_code, &sense_key, &asc, &ascq)) {
 			/*
 			 * If the error is "invalid field in CDB",
 			 * and the load/eject flag is set, turn the
 			 * flag off and try again.  This is just in
 			 * case the drive in question barfs on the
 			 * load eject flag.  The CAM code should set
 			 * the load/eject flag by default for
 			 * removable media.
 			 */
 			if ((scsi_cmd->opcode == START_STOP_UNIT) &&
 			    ((scsi_cmd->how & SSS_LOEJ) != 0) &&
 			     (asc == 0x24) && (ascq == 0x00)) {
 				scsi_cmd->how &= ~SSS_LOEJ;
 				if (status & CAM_DEV_QFRZN) {
 					cam_release_devq(done_ccb->ccb_h.path,
 					    0, 0, 0, 0);
 					done_ccb->ccb_h.status &=
 					    ~CAM_DEV_QFRZN;
 				}
 				xpt_action(done_ccb);
 				goto out;
 			}
 		}
 		error = cam_periph_error(done_ccb, 0,
 		    SF_RETRY_UA | SF_NO_PRINT);
 		if (error == ERESTART)
 			goto out;
 		if (done_ccb->ccb_h.status & CAM_DEV_QFRZN) {
 			cam_release_devq(done_ccb->ccb_h.path, 0, 0, 0, 0);
 			done_ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
 		}
 	} else {
 		/*
 		 * If we have successfully taken a device from the not
 		 * ready to ready state, re-scan the device and re-get
 		 * the inquiry information.  Many devices (mostly disks)
 		 * don't properly report their inquiry information unless
 		 * they are spun up.
 		 */
 		if (scsi_cmd->opcode == START_STOP_UNIT)
 			xpt_async(AC_INQ_CHANGED, done_ccb->ccb_h.path, NULL);
 	}
 
 	/*
 	 * After recovery action(s) completed, return to the original CCB.
 	 * If the recovery CCB has failed, considering its own possible
 	 * retries and recovery, assume we are back in state where we have
 	 * been originally, but without recovery hopes left.  In such case,
 	 * after the final attempt below, we cancel any further retries,
 	 * blocking by that also any new recovery attempts for this CCB,
 	 * and the result will be the final one returned to the CCB owher.
 	 */
 	saved_ccb = (union ccb *)done_ccb->ccb_h.saved_ccb_ptr;
 	bcopy(saved_ccb, done_ccb, sizeof(*done_ccb));
 	xpt_free_ccb(saved_ccb);
 	if (done_ccb->ccb_h.cbfcnp != camperiphdone)
 		periph->flags &= ~CAM_PERIPH_RECOVERY_INPROG;
 	if (error != 0)
 		done_ccb->ccb_h.retry_count = 0;
 	xpt_action(done_ccb);
 
 out:
 	/* Drop freeze taken due to CAM_DEV_QFREEZE flag set. */
 	cam_release_devq(done_ccb->ccb_h.path, 0, 0, 0, 0);
 }
 
 /*
  * Generic Async Event handler.  Peripheral drivers usually
  * filter out the events that require personal attention,
  * and leave the rest to this function.
  */
 void
 cam_periph_async(struct cam_periph *periph, u_int32_t code,
 		 struct cam_path *path, void *arg)
 {
 	switch (code) {
 	case AC_LOST_DEVICE:
 		cam_periph_invalidate(periph);
 		break; 
 	default:
 		break;
 	}
 }
 
 void
 cam_periph_bus_settle(struct cam_periph *periph, u_int bus_settle)
 {
 	struct ccb_getdevstats cgds;
 
 	xpt_setup_ccb(&cgds.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 	cgds.ccb_h.func_code = XPT_GDEV_STATS;
 	xpt_action((union ccb *)&cgds);
 	cam_periph_freeze_after_event(periph, &cgds.last_reset, bus_settle);
 }
 
 void
 cam_periph_freeze_after_event(struct cam_periph *periph,
 			      struct timeval* event_time, u_int duration_ms)
 {
 	struct timeval delta;
 	struct timeval duration_tv;
 
 	if (!timevalisset(event_time))
 		return;
 
 	microtime(&delta);
 	timevalsub(&delta, event_time);
 	duration_tv.tv_sec = duration_ms / 1000;
 	duration_tv.tv_usec = (duration_ms % 1000) * 1000;
 	if (timevalcmp(&delta, &duration_tv, <)) {
 		timevalsub(&duration_tv, &delta);
 
 		duration_ms = duration_tv.tv_sec * 1000;
 		duration_ms += duration_tv.tv_usec / 1000;
 		cam_freeze_devq(periph->path); 
 		cam_release_devq(periph->path,
 				RELSIM_RELEASE_AFTER_TIMEOUT,
 				/*reduction*/0,
 				/*timeout*/duration_ms,
 				/*getcount_only*/0);
 	}
 
 }
 
 static int
 camperiphscsistatuserror(union ccb *ccb, union ccb **orig_ccb,
     cam_flags camflags, u_int32_t sense_flags,
     int *openings, u_int32_t *relsim_flags,
     u_int32_t *timeout, u_int32_t *action, const char **action_string)
 {
 	int error;
 
 	switch (ccb->csio.scsi_status) {
 	case SCSI_STATUS_OK:
 	case SCSI_STATUS_COND_MET:
 	case SCSI_STATUS_INTERMED:
 	case SCSI_STATUS_INTERMED_COND_MET:
 		error = 0;
 		break;
 	case SCSI_STATUS_CMD_TERMINATED:
 	case SCSI_STATUS_CHECK_COND:
 		error = camperiphscsisenseerror(ccb, orig_ccb,
 					        camflags,
 					        sense_flags,
 					        openings,
 					        relsim_flags,
 					        timeout,
 					        action,
 					        action_string);
 		break;
 	case SCSI_STATUS_QUEUE_FULL:
 	{
 		/* no decrement */
 		struct ccb_getdevstats cgds;
 
 		/*
 		 * First off, find out what the current
 		 * transaction counts are.
 		 */
 		xpt_setup_ccb(&cgds.ccb_h,
 			      ccb->ccb_h.path,
 			      CAM_PRIORITY_NORMAL);
 		cgds.ccb_h.func_code = XPT_GDEV_STATS;
 		xpt_action((union ccb *)&cgds);
 
 		/*
 		 * If we were the only transaction active, treat
 		 * the QUEUE FULL as if it were a BUSY condition.
 		 */
 		if (cgds.dev_active != 0) {
 			int total_openings;
 
 			/*
 		 	 * Reduce the number of openings to
 			 * be 1 less than the amount it took
 			 * to get a queue full bounded by the
 			 * minimum allowed tag count for this
 			 * device.
 		 	 */
 			total_openings = cgds.dev_active + cgds.dev_openings;
 			*openings = cgds.dev_active;
 			if (*openings < cgds.mintags)
 				*openings = cgds.mintags;
 			if (*openings < total_openings)
 				*relsim_flags = RELSIM_ADJUST_OPENINGS;
 			else {
 				/*
 				 * Some devices report queue full for
 				 * temporary resource shortages.  For
 				 * this reason, we allow a minimum
 				 * tag count to be entered via a
 				 * quirk entry to prevent the queue
 				 * count on these devices from falling
 				 * to a pessimisticly low value.  We
 				 * still wait for the next successful
 				 * completion, however, before queueing
 				 * more transactions to the device.
 				 */
 				*relsim_flags = RELSIM_RELEASE_AFTER_CMDCMPLT;
 			}
 			*timeout = 0;
 			error = ERESTART;
 			*action &= ~SSQ_PRINT_SENSE;
 			break;
 		}
 		/* FALLTHROUGH */
 	}
 	case SCSI_STATUS_BUSY:
 		/*
 		 * Restart the queue after either another
 		 * command completes or a 1 second timeout.
 		 */
 		if ((sense_flags & SF_RETRY_BUSY) != 0 ||
 		    (ccb->ccb_h.retry_count--) > 0) {
 			error = ERESTART;
 			*relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT
 				      | RELSIM_RELEASE_AFTER_CMDCMPLT;
 			*timeout = 1000;
 		} else {
 			error = EIO;
 		}
 		break;
 	case SCSI_STATUS_RESERV_CONFLICT:
 	default:
 		error = EIO;
 		break;
 	}
 	return (error);
 }
 
 static int
 camperiphscsisenseerror(union ccb *ccb, union ccb **orig,
     cam_flags camflags, u_int32_t sense_flags,
     int *openings, u_int32_t *relsim_flags,
     u_int32_t *timeout, u_int32_t *action, const char **action_string)
 {
 	struct cam_periph *periph;
 	union ccb *orig_ccb = ccb;
 	int error, recoveryccb;
 
 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
 	if (ccb->ccb_h.func_code == XPT_SCSI_IO && ccb->csio.bio != NULL)
 		biotrack(ccb->csio.bio, __func__);
 #endif
 
 	periph = xpt_path_periph(ccb->ccb_h.path);
 	recoveryccb = (ccb->ccb_h.cbfcnp == camperiphdone);
 	if ((periph->flags & CAM_PERIPH_RECOVERY_INPROG) && !recoveryccb) {
 		/*
 		 * If error recovery is already in progress, don't attempt
 		 * to process this error, but requeue it unconditionally
 		 * and attempt to process it once error recovery has
 		 * completed.  This failed command is probably related to
 		 * the error that caused the currently active error recovery
 		 * action so our  current recovery efforts should also
 		 * address this command.  Be aware that the error recovery
 		 * code assumes that only one recovery action is in progress
 		 * on a particular peripheral instance at any given time
 		 * (e.g. only one saved CCB for error recovery) so it is
 		 * imperitive that we don't violate this assumption.
 		 */
 		error = ERESTART;
 		*action &= ~SSQ_PRINT_SENSE;
 	} else {
 		scsi_sense_action err_action;
 		struct ccb_getdev cgd;
 
 		/*
 		 * Grab the inquiry data for this device.
 		 */
 		xpt_setup_ccb(&cgd.ccb_h, ccb->ccb_h.path, CAM_PRIORITY_NORMAL);
 		cgd.ccb_h.func_code = XPT_GDEV_TYPE;
 		xpt_action((union ccb *)&cgd);
 
 		err_action = scsi_error_action(&ccb->csio, &cgd.inq_data,
 		    sense_flags);
 		error = err_action & SS_ERRMASK;
 
 		/*
 		 * Do not autostart sequential access devices
 		 * to avoid unexpected tape loading.
 		 */
 		if ((err_action & SS_MASK) == SS_START &&
 		    SID_TYPE(&cgd.inq_data) == T_SEQUENTIAL) {
 			*action_string = "Will not autostart a "
 			    "sequential access device";
 			goto sense_error_done;
 		}
 
 		/*
 		 * Avoid recovery recursion if recovery action is the same.
 		 */
 		if ((err_action & SS_MASK) >= SS_START && recoveryccb) {
 			if (((err_action & SS_MASK) == SS_START &&
 			     ccb->csio.cdb_io.cdb_bytes[0] == START_STOP_UNIT) ||
 			    ((err_action & SS_MASK) == SS_TUR &&
 			     (ccb->csio.cdb_io.cdb_bytes[0] == TEST_UNIT_READY))) {
 				err_action = SS_RETRY|SSQ_DECREMENT_COUNT|EIO;
 				*relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
 				*timeout = 500;
 			}
 		}
 
 		/*
 		 * If the recovery action will consume a retry,
 		 * make sure we actually have retries available.
 		 */
 		if ((err_action & SSQ_DECREMENT_COUNT) != 0) {
 		 	if (ccb->ccb_h.retry_count > 0 &&
 			    (periph->flags & CAM_PERIPH_INVALID) == 0)
 		 		ccb->ccb_h.retry_count--;
 			else {
 				*action_string = "Retries exhausted";
 				goto sense_error_done;
 			}
 		}
 
 		if ((err_action & SS_MASK) >= SS_START) {
 			/*
 			 * Do common portions of commands that
 			 * use recovery CCBs.
 			 */
 			orig_ccb = xpt_alloc_ccb_nowait();
 			if (orig_ccb == NULL) {
 				*action_string = "Can't allocate recovery CCB";
 				goto sense_error_done;
 			}
 			/*
 			 * Clear freeze flag for original request here, as
 			 * this freeze will be dropped as part of ERESTART.
 			 */
 			ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
 			bcopy(ccb, orig_ccb, sizeof(*orig_ccb));
 		}
 
 		switch (err_action & SS_MASK) {
 		case SS_NOP:
 			*action_string = "No recovery action needed";
 			error = 0;
 			break;
 		case SS_RETRY:
 			*action_string = "Retrying command (per sense data)";
 			error = ERESTART;
 			break;
 		case SS_FAIL:
 			*action_string = "Unretryable error";
 			break;
 		case SS_START:
 		{
 			int le;
 
 			/*
 			 * Send a start unit command to the device, and
 			 * then retry the command.
 			 */
 			*action_string = "Attempting to start unit";
 			periph->flags |= CAM_PERIPH_RECOVERY_INPROG;
 
 			/*
 			 * Check for removable media and set
 			 * load/eject flag appropriately.
 			 */
 			if (SID_IS_REMOVABLE(&cgd.inq_data))
 				le = TRUE;
 			else
 				le = FALSE;
 
 			scsi_start_stop(&ccb->csio,
 					/*retries*/1,
 					camperiphdone,
 					MSG_SIMPLE_Q_TAG,
 					/*start*/TRUE,
 					/*load/eject*/le,
 					/*immediate*/FALSE,
 					SSD_FULL_SIZE,
 					/*timeout*/50000);
 			break;
 		}
 		case SS_TUR:
 		{
 			/*
 			 * Send a Test Unit Ready to the device.
 			 * If the 'many' flag is set, we send 120
 			 * test unit ready commands, one every half 
 			 * second.  Otherwise, we just send one TUR.
 			 * We only want to do this if the retry 
 			 * count has not been exhausted.
 			 */
 			int retries;
 
 			if ((err_action & SSQ_MANY) != 0) {
 				*action_string = "Polling device for readiness";
 				retries = 120;
 			} else {
 				*action_string = "Testing device for readiness";
 				retries = 1;
 			}
 			periph->flags |= CAM_PERIPH_RECOVERY_INPROG;
 			scsi_test_unit_ready(&ccb->csio,
 					     retries,
 					     camperiphdone,
 					     MSG_SIMPLE_Q_TAG,
 					     SSD_FULL_SIZE,
 					     /*timeout*/5000);
 
 			/*
 			 * Accomplish our 500ms delay by deferring
 			 * the release of our device queue appropriately.
 			 */
 			*relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
 			*timeout = 500;
 			break;
 		}
 		default:
 			panic("Unhandled error action %x", err_action);
 		}
 		
 		if ((err_action & SS_MASK) >= SS_START) {
 			/*
 			 * Drop the priority, so that the recovery
 			 * CCB is the first to execute.  Freeze the queue
 			 * after this command is sent so that we can
 			 * restore the old csio and have it queued in
 			 * the proper order before we release normal 
 			 * transactions to the device.
 			 */
 			ccb->ccb_h.pinfo.priority--;
 			ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
 			ccb->ccb_h.saved_ccb_ptr = orig_ccb;
 			error = ERESTART;
 			*orig = orig_ccb;
 		}
 
 sense_error_done:
 		*action = err_action;
 	}
 	return (error);
 }
 
 /*
  * Generic error handler.  Peripheral drivers usually filter
  * out the errors that they handle in a unique manner, then
  * call this function.
  */
 int
 cam_periph_error(union ccb *ccb, cam_flags camflags,
 		 u_int32_t sense_flags)
 {
 	struct cam_path *newpath;
 	union ccb  *orig_ccb, *scan_ccb;
 	struct cam_periph *periph;
 	const char *action_string;
 	cam_status  status;
 	int	    frozen, error, openings, devctl_err;
 	u_int32_t   action, relsim_flags, timeout;
 
 	action = SSQ_PRINT_SENSE;
 	periph = xpt_path_periph(ccb->ccb_h.path);
 	action_string = NULL;
 	status = ccb->ccb_h.status;
 	frozen = (status & CAM_DEV_QFRZN) != 0;
 	status &= CAM_STATUS_MASK;
 	devctl_err = openings = relsim_flags = timeout = 0;
 	orig_ccb = ccb;
 
 	/* Filter the errors that should be reported via devctl */
 	switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
 	case CAM_CMD_TIMEOUT:
 	case CAM_REQ_ABORTED:
 	case CAM_REQ_CMP_ERR:
 	case CAM_REQ_TERMIO:
 	case CAM_UNREC_HBA_ERROR:
 	case CAM_DATA_RUN_ERR:
 	case CAM_SCSI_STATUS_ERROR:
 	case CAM_ATA_STATUS_ERROR:
 	case CAM_SMP_STATUS_ERROR:
 		devctl_err++;
 		break;
 	default:
 		break;
 	}
 
 	switch (status) {
 	case CAM_REQ_CMP:
 		error = 0;
 		action &= ~SSQ_PRINT_SENSE;
 		break;
 	case CAM_SCSI_STATUS_ERROR:
 		error = camperiphscsistatuserror(ccb, &orig_ccb,
 		    camflags, sense_flags, &openings, &relsim_flags,
 		    &timeout, &action, &action_string);
 		break;
 	case CAM_AUTOSENSE_FAIL:
 		error = EIO;	/* we have to kill the command */
 		break;
 	case CAM_UA_ABORT:
 	case CAM_UA_TERMIO:
 	case CAM_MSG_REJECT_REC:
 		/* XXX Don't know that these are correct */
 		error = EIO;
 		break;
 	case CAM_SEL_TIMEOUT:
 		if ((camflags & CAM_RETRY_SELTO) != 0) {
 			if (ccb->ccb_h.retry_count > 0 &&
 			    (periph->flags & CAM_PERIPH_INVALID) == 0) {
 				ccb->ccb_h.retry_count--;
 				error = ERESTART;
 
 				/*
 				 * Wait a bit to give the device
 				 * time to recover before we try again.
 				 */
 				relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
 				timeout = periph_selto_delay;
 				break;
 			}
 			action_string = "Retries exhausted";
 		}
 		/* FALLTHROUGH */
 	case CAM_DEV_NOT_THERE:
 		error = ENXIO;
 		action = SSQ_LOST;
 		break;
 	case CAM_REQ_INVALID:
 	case CAM_PATH_INVALID:
 	case CAM_NO_HBA:
 	case CAM_PROVIDE_FAIL:
 	case CAM_REQ_TOO_BIG:
 	case CAM_LUN_INVALID:
 	case CAM_TID_INVALID:
 	case CAM_FUNC_NOTAVAIL:
 		error = EINVAL;
 		break;
 	case CAM_SCSI_BUS_RESET:
 	case CAM_BDR_SENT:
 		/*
 		 * Commands that repeatedly timeout and cause these
 		 * kinds of error recovery actions, should return
 		 * CAM_CMD_TIMEOUT, which allows us to safely assume
 		 * that this command was an innocent bystander to
 		 * these events and should be unconditionally
 		 * retried.
 		 */
 	case CAM_REQUEUE_REQ:
 		/* Unconditional requeue if device is still there */
 		if (periph->flags & CAM_PERIPH_INVALID) {
 			action_string = "Periph was invalidated";
 			error = EIO;
 		} else if (sense_flags & SF_NO_RETRY) {
 			error = EIO;
 			action_string = "Retry was blocked";
 		} else {
 			error = ERESTART;
 			action &= ~SSQ_PRINT_SENSE;
 		}
 		break;
 	case CAM_RESRC_UNAVAIL:
 		/* Wait a bit for the resource shortage to abate. */
 		timeout = periph_noresrc_delay;
 		/* FALLTHROUGH */
 	case CAM_BUSY:
 		if (timeout == 0) {
 			/* Wait a bit for the busy condition to abate. */
 			timeout = periph_busy_delay;
 		}
 		relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
 		/* FALLTHROUGH */
 	case CAM_ATA_STATUS_ERROR:
 	case CAM_REQ_CMP_ERR:
 	case CAM_CMD_TIMEOUT:
 	case CAM_UNEXP_BUSFREE:
 	case CAM_UNCOR_PARITY:
 	case CAM_DATA_RUN_ERR:
 	default:
 		if (periph->flags & CAM_PERIPH_INVALID) {
 			error = EIO;
 			action_string = "Periph was invalidated";
 		} else if (ccb->ccb_h.retry_count == 0) {
 			error = EIO;
 			action_string = "Retries exhausted";
 		} else if (sense_flags & SF_NO_RETRY) {
 			error = EIO;
 			action_string = "Retry was blocked";
 		} else {
 			ccb->ccb_h.retry_count--;
 			error = ERESTART;
 		}
 		break;
 	}
 
 	if ((sense_flags & SF_PRINT_ALWAYS) ||
 	    CAM_DEBUGGED(ccb->ccb_h.path, CAM_DEBUG_INFO))
 		action |= SSQ_PRINT_SENSE;
 	else if (sense_flags & SF_NO_PRINT)
 		action &= ~SSQ_PRINT_SENSE;
 	if ((action & SSQ_PRINT_SENSE) != 0)
 		cam_error_print(orig_ccb, CAM_ESF_ALL, CAM_EPF_ALL);
 	if (error != 0 && (action & SSQ_PRINT_SENSE) != 0) {
 		if (error != ERESTART) {
 			if (action_string == NULL)
 				action_string = "Unretryable error";
 			xpt_print(ccb->ccb_h.path, "Error %d, %s\n",
 			    error, action_string);
 		} else if (action_string != NULL)
 			xpt_print(ccb->ccb_h.path, "%s\n", action_string);
 		else {
 			xpt_print(ccb->ccb_h.path,
 			    "Retrying command, %d more tries remain\n",
 			    ccb->ccb_h.retry_count);
 		}
 	}
 
 	if (devctl_err && (error != 0 || (action & SSQ_PRINT_SENSE) != 0))
 		cam_periph_devctl_notify(orig_ccb);
 
 	if ((action & SSQ_LOST) != 0) {
 		lun_id_t lun_id;
 
 		/*
 		 * For a selection timeout, we consider all of the LUNs on
 		 * the target to be gone.  If the status is CAM_DEV_NOT_THERE,
 		 * then we only get rid of the device(s) specified by the
 		 * path in the original CCB.
 		 */
 		if (status == CAM_SEL_TIMEOUT)
 			lun_id = CAM_LUN_WILDCARD;
 		else
 			lun_id = xpt_path_lun_id(ccb->ccb_h.path);
 
 		/* Should we do more if we can't create the path?? */
 		if (xpt_create_path(&newpath, periph,
 				    xpt_path_path_id(ccb->ccb_h.path),
 				    xpt_path_target_id(ccb->ccb_h.path),
 				    lun_id) == CAM_REQ_CMP) {
 
 			/*
 			 * Let peripheral drivers know that this
 			 * device has gone away.
 			 */
 			xpt_async(AC_LOST_DEVICE, newpath, NULL);
 			xpt_free_path(newpath);
 		}
 	}
 
 	/* Broadcast UNIT ATTENTIONs to all periphs. */
 	if ((action & SSQ_UA) != 0)
 		xpt_async(AC_UNIT_ATTENTION, orig_ccb->ccb_h.path, orig_ccb);
 
 	/* Rescan target on "Reported LUNs data has changed" */
 	if ((action & SSQ_RESCAN) != 0) {
 		if (xpt_create_path(&newpath, NULL,
 				    xpt_path_path_id(ccb->ccb_h.path),
 				    xpt_path_target_id(ccb->ccb_h.path),
 				    CAM_LUN_WILDCARD) == CAM_REQ_CMP) {
 
 			scan_ccb = xpt_alloc_ccb_nowait();
 			if (scan_ccb != NULL) {
 				scan_ccb->ccb_h.path = newpath;
 				scan_ccb->ccb_h.func_code = XPT_SCAN_TGT;
 				scan_ccb->crcn.flags = 0;
 				xpt_rescan(scan_ccb);
 			} else {
 				xpt_print(newpath,
 				    "Can't allocate CCB to rescan target\n");
 				xpt_free_path(newpath);
 			}
 		}
 	}
 
 	/* Attempt a retry */
 	if (error == ERESTART || error == 0) {
 		if (frozen != 0)
 			ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
 		if (error == ERESTART)
 			xpt_action(ccb);
 		if (frozen != 0)
 			cam_release_devq(ccb->ccb_h.path,
 					 relsim_flags,
 					 openings,
 					 timeout,
 					 /*getcount_only*/0);
 	}
 
 	return (error);
 }
 
 #define CAM_PERIPH_DEVD_MSG_SIZE	256
 
 static void
 cam_periph_devctl_notify(union ccb *ccb)
 {
 	struct cam_periph *periph;
 	struct ccb_getdev *cgd;
 	struct sbuf sb;
 	int serr, sk, asc, ascq;
 	char *sbmsg, *type;
 
 	sbmsg = malloc(CAM_PERIPH_DEVD_MSG_SIZE, M_CAMPERIPH, M_NOWAIT);
 	if (sbmsg == NULL)
 		return;
 
 	sbuf_new(&sb, sbmsg, CAM_PERIPH_DEVD_MSG_SIZE, SBUF_FIXEDLEN);
 
 	periph = xpt_path_periph(ccb->ccb_h.path);
 	sbuf_printf(&sb, "device=%s%d ", periph->periph_name,
 	    periph->unit_number);
 
 	sbuf_printf(&sb, "serial=\"");
 	if ((cgd = (struct ccb_getdev *)xpt_alloc_ccb_nowait()) != NULL) {
 		xpt_setup_ccb(&cgd->ccb_h, ccb->ccb_h.path,
 		    CAM_PRIORITY_NORMAL);
 		cgd->ccb_h.func_code = XPT_GDEV_TYPE;
 		xpt_action((union ccb *)cgd);
 
 		if (cgd->ccb_h.status == CAM_REQ_CMP)
 			sbuf_bcat(&sb, cgd->serial_num, cgd->serial_num_len);
 		xpt_free_ccb((union ccb *)cgd);
 	}
 	sbuf_printf(&sb, "\" ");
 	sbuf_printf(&sb, "cam_status=\"0x%x\" ", ccb->ccb_h.status);
 
 	switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
 	case CAM_CMD_TIMEOUT:
 		sbuf_printf(&sb, "timeout=%d ", ccb->ccb_h.timeout);
 		type = "timeout";
 		break;
 	case CAM_SCSI_STATUS_ERROR:
 		sbuf_printf(&sb, "scsi_status=%d ", ccb->csio.scsi_status);
 		if (scsi_extract_sense_ccb(ccb, &serr, &sk, &asc, &ascq))
 			sbuf_printf(&sb, "scsi_sense=\"%02x %02x %02x %02x\" ",
 			    serr, sk, asc, ascq);
 		type = "error";
 		break;
 	case CAM_ATA_STATUS_ERROR:
 		sbuf_printf(&sb, "RES=\"");
 		ata_res_sbuf(&ccb->ataio.res, &sb);
 		sbuf_printf(&sb, "\" ");
 		type = "error";
 		break;
 	default:
 		type = "error";
 		break;
 	}
 
 	if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
 		sbuf_printf(&sb, "CDB=\"");
 		scsi_cdb_sbuf(scsiio_cdb_ptr(&ccb->csio), &sb);
 		sbuf_printf(&sb, "\" ");
 	} else if (ccb->ccb_h.func_code == XPT_ATA_IO) {
 		sbuf_printf(&sb, "ACB=\"");
 		ata_cmd_sbuf(&ccb->ataio.cmd, &sb);
 		sbuf_printf(&sb, "\" ");
 	}
 
 	if (sbuf_finish(&sb) == 0)
 		devctl_notify("CAM", "periph", type, sbuf_data(&sb));
 	sbuf_delete(&sb);
 	free(sbmsg, M_CAMPERIPH);
 }
 
 /*
  * Sysctl to force an invalidation of the drive right now. Can be
  * called with CTLFLAG_MPSAFE since we take periph lock.
  */
 int
 cam_periph_invalidate_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct cam_periph *periph;
 	int error, value;
 
 	periph = arg1;
 	value = 0;
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL || value != 1)
 		return (error);
 
 	cam_periph_lock(periph);
 	cam_periph_invalidate(periph);
 	cam_periph_unlock(periph);
 
 	return (0);
 }
Index: projects/capsicum-test/sys/cam/cam_periph.h
===================================================================
--- projects/capsicum-test/sys/cam/cam_periph.h	(revision 345709)
+++ projects/capsicum-test/sys/cam/cam_periph.h	(revision 345710)
@@ -1,269 +1,270 @@
 /*-
  * Data structures and definitions for CAM peripheral ("type") drivers.
  *
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1997, 1998 Justin T. Gibbs.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _CAM_CAM_PERIPH_H
 #define _CAM_CAM_PERIPH_H 1
 
 #include <sys/queue.h>
 #include <cam/cam_sim.h>
 
 #ifdef _KERNEL
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 
 #include <cam/cam_xpt.h>
 
 struct devstat;
 
 extern struct cam_periph *xpt_periph;
 
 extern struct periph_driver **periph_drivers;
 void periphdriver_register(void *);
 int periphdriver_unregister(void *);
 void periphdriver_init(int level);
 
 #include <sys/module.h>
 #define PERIPHDRIVER_DECLARE(name, driver) \
 	static int name ## _modevent(module_t mod, int type, void *data) \
 	{ \
 		switch (type) { \
 		case MOD_LOAD: \
 			periphdriver_register(data); \
 			break; \
 		case MOD_UNLOAD: \
 			return (periphdriver_unregister(data)); \
 		default: \
 			return EOPNOTSUPP; \
 		} \
 		return 0; \
 	} \
 	static moduledata_t name ## _mod = { \
 		#name, \
 		name ## _modevent, \
 		(void *)&driver \
 	}; \
 	DECLARE_MODULE(name, name ## _mod, SI_SUB_DRIVERS, SI_ORDER_ANY); \
 	MODULE_DEPEND(name, cam, 1, 1, 1)
 
 /*
  * Callback informing the peripheral driver it can perform it's
  * initialization since the XPT is now fully initialized.
  */
 typedef void (periph_init_t)(void);
 
 /*
  * Callback requesting the peripheral driver to remove its instances
  * and shutdown, if possible.
  */
 typedef int (periph_deinit_t)(void);
 
 struct periph_driver {
 	periph_init_t		*init;
 	char			*driver_name;
 	TAILQ_HEAD(,cam_periph)	 units;
 	u_int			 generation;
 	u_int			 flags;
 #define CAM_PERIPH_DRV_EARLY		0x01
 	periph_deinit_t		*deinit;
 };
 
 typedef enum {
 	CAM_PERIPH_BIO
 } cam_periph_type;
 
 /* Generically useful offsets into the peripheral private area */
 #define ppriv_ptr0 periph_priv.entries[0].ptr
 #define ppriv_ptr1 periph_priv.entries[1].ptr
 #define ppriv_field0 periph_priv.entries[0].field
 #define ppriv_field1 periph_priv.entries[1].field
 
 typedef void		periph_start_t (struct cam_periph *periph,
 					union ccb *start_ccb);
 typedef cam_status	periph_ctor_t (struct cam_periph *periph,
 				       void *arg);
 typedef void		periph_oninv_t (struct cam_periph *periph);
 typedef void		periph_dtor_t (struct cam_periph *periph);
 struct cam_periph {
 	periph_start_t		*periph_start;
 	periph_oninv_t		*periph_oninval;
 	periph_dtor_t		*periph_dtor;
 	char			*periph_name;
 	struct cam_path		*path;	/* Compiled path to device */
 	void			*softc;
 	struct cam_sim		*sim;
 	u_int32_t		 unit_number;
 	cam_periph_type		 type;
 	u_int32_t		 flags;
 #define CAM_PERIPH_RUNNING		0x01
 #define CAM_PERIPH_LOCKED		0x02
 #define CAM_PERIPH_LOCK_WANTED		0x04
 #define CAM_PERIPH_INVALID		0x08
 #define CAM_PERIPH_NEW_DEV_FOUND	0x10
 #define CAM_PERIPH_RECOVERY_INPROG	0x20
 #define CAM_PERIPH_RUN_TASK		0x40
 #define CAM_PERIPH_FREE			0x80
 #define CAM_PERIPH_ANNOUNCED		0x100
 	uint32_t		 scheduled_priority;
 	uint32_t		 immediate_priority;
 	int			 periph_allocating;
 	int			 periph_allocated;
 	u_int32_t		 refcount;
 	SLIST_HEAD(, ccb_hdr)	 ccb_list;	/* For "immediate" requests */
 	SLIST_ENTRY(cam_periph)  periph_links;
 	TAILQ_ENTRY(cam_periph)  unit_links;
 	ac_callback_t		*deferred_callback; 
 	ac_code			 deferred_ac;
 	struct task		 periph_run_task;
 };
 
 #define CAM_PERIPH_MAXMAPS	2
 
 struct cam_periph_map_info {
 	int		num_bufs_used;
+	void		*orig[CAM_PERIPH_MAXMAPS];
 	struct buf	*bp[CAM_PERIPH_MAXMAPS];
 };
 
 cam_status cam_periph_alloc(periph_ctor_t *periph_ctor,
 			    periph_oninv_t *periph_oninvalidate,
 			    periph_dtor_t *periph_dtor,
 			    periph_start_t *periph_start,
 			    char *name, cam_periph_type type, struct cam_path *,
 			    ac_callback_t *, ac_code, void *arg);
 struct cam_periph *cam_periph_find(struct cam_path *path, char *name);
 int		cam_periph_list(struct cam_path *, struct sbuf *);
 int		cam_periph_acquire(struct cam_periph *periph);
 void		cam_periph_doacquire(struct cam_periph *periph);
 void		cam_periph_release(struct cam_periph *periph);
 void		cam_periph_release_locked(struct cam_periph *periph);
 void		cam_periph_release_locked_buses(struct cam_periph *periph);
 int		cam_periph_hold(struct cam_periph *periph, int priority);
 void		cam_periph_unhold(struct cam_periph *periph);
 void		cam_periph_invalidate(struct cam_periph *periph);
 int		cam_periph_mapmem(union ccb *ccb,
 				  struct cam_periph_map_info *mapinfo,
 				  u_int maxmap);
 void		cam_periph_unmapmem(union ccb *ccb,
 				    struct cam_periph_map_info *mapinfo);
 union ccb	*cam_periph_getccb(struct cam_periph *periph,
 				   u_int32_t priority);
 int		cam_periph_runccb(union ccb *ccb,
 				  int (*error_routine)(union ccb *ccb,
 						       cam_flags camflags,
 						       u_int32_t sense_flags),
 				  cam_flags camflags, u_int32_t sense_flags,
 				  struct devstat *ds);
 int		cam_periph_ioctl(struct cam_periph *periph, u_long cmd, 
 				 caddr_t addr,
 				 int (*error_routine)(union ccb *ccb,
 						      cam_flags camflags,
 						      u_int32_t sense_flags));
 void		cam_freeze_devq(struct cam_path *path);
 u_int32_t	cam_release_devq(struct cam_path *path, u_int32_t relsim_flags,
 				 u_int32_t opening_reduction, u_int32_t arg,
 				 int getcount_only);
 void		cam_periph_async(struct cam_periph *periph, u_int32_t code,
 		 		 struct cam_path *path, void *arg);
 void		cam_periph_bus_settle(struct cam_periph *periph,
 				      u_int bus_settle_ms);
 void		cam_periph_freeze_after_event(struct cam_periph *periph,
 					      struct timeval* event_time,
 					      u_int duration_ms);
 int		cam_periph_error(union ccb *ccb, cam_flags camflags,
 				 u_int32_t sense_flags);
 int		cam_periph_invalidate_sysctl(SYSCTL_HANDLER_ARGS);
 
 static __inline struct mtx *
 cam_periph_mtx(struct cam_periph *periph)
 {
 	if (periph != NULL)
 		return (xpt_path_mtx(periph->path));
 	else
 		return (NULL);
 }
 
 #define cam_periph_owned(periph)					\
 	mtx_owned(xpt_path_mtx((periph)->path))
 
 #define cam_periph_lock(periph)						\
 	mtx_lock(xpt_path_mtx((periph)->path))
 
 #define cam_periph_unlock(periph)					\
 	mtx_unlock(xpt_path_mtx((periph)->path))
 
 #define cam_periph_assert(periph, what)					\
 	mtx_assert(xpt_path_mtx((periph)->path), (what))
 
 #define cam_periph_sleep(periph, chan, priority, wmesg, timo)		\
 	xpt_path_sleep((periph)->path, (chan), (priority), (wmesg), (timo))
 
 static inline struct cam_periph *
 cam_periph_acquire_first(struct periph_driver *driver)
 {
 	struct cam_periph *periph;
 
 	xpt_lock_buses();
 	periph = TAILQ_FIRST(&driver->units);
 	while (periph != NULL && (periph->flags & CAM_PERIPH_INVALID) != 0)
 		periph = TAILQ_NEXT(periph, unit_links);
 	if (periph != NULL)
 		periph->refcount++;
 	xpt_unlock_buses();
 	return (periph);
 }
 
 static inline struct cam_periph *
 cam_periph_acquire_next(struct cam_periph *pperiph)
 {
 	struct cam_periph *periph = pperiph;
 
 	cam_periph_assert(pperiph, MA_NOTOWNED);
 	xpt_lock_buses();
 	do {
 		periph = TAILQ_NEXT(periph, unit_links);
 	} while (periph != NULL && (periph->flags & CAM_PERIPH_INVALID) != 0);
 	if (periph != NULL)
 		periph->refcount++;
 	xpt_unlock_buses();
 	cam_periph_release(pperiph);
 	return (periph);
 }
 
 #define CAM_PERIPH_FOREACH(periph, driver)				\
 	for ((periph) = cam_periph_acquire_first(driver);		\
 	    (periph) != NULL;						\
 	    (periph) = cam_periph_acquire_next(periph))
 
 #define CAM_PERIPH_PRINT(p, msg, args...)				\
     printf("%s%d:" msg, (periph)->periph_name, (periph)->unit_number, ##args)
 
 #endif /* _KERNEL */
 #endif /* _CAM_CAM_PERIPH_H */
Index: projects/capsicum-test/sys/conf/NOTES
===================================================================
--- projects/capsicum-test/sys/conf/NOTES	(revision 345709)
+++ projects/capsicum-test/sys/conf/NOTES	(revision 345710)
@@ -1,3008 +1,3008 @@
 # $FreeBSD$
 #
 # NOTES -- Lines that can be cut/pasted into kernel and hints configs.
 #
 # Lines that begin with 'device', 'options', 'machine', 'ident', 'maxusers',
 # 'makeoptions', 'hints', etc. go into the kernel configuration that you
 # run config(8) with.
 #
 # Lines that begin with 'hint.' are NOT for config(8), they go into your
 # hints file.  See /boot/device.hints and/or the 'hints' config(8) directive.
 #
 # Please use ``make LINT'' to create an old-style LINT file if you want to
 # do kernel test-builds.
 #
 # This file contains machine independent kernel configuration notes.  For
 # machine dependent notes, look in /sys/<arch>/conf/NOTES.
 #
 
 #
 # NOTES conventions and style guide:
 #
 # Large block comments should begin and end with a line containing only a
 # comment character.
 #
 # To describe a particular object, a block comment (if it exists) should
 # come first.  Next should come device, options, and hints lines in that
 # order.  All device and option lines must be described by a comment that
 # doesn't just expand the device or option name.  Use only a concise
 # comment on the same line if possible.  Very detailed descriptions of
 # devices and subsystems belong in man pages.
 #
 # A space followed by a tab separates 'options' from an option name.  Two
 # spaces followed by a tab separate 'device' from a device name.  Comments
 # after an option or device should use one space after the comment character.
 # To comment out a negative option that disables code and thus should not be
 # enabled for LINT builds, precede 'options' with "#!".
 #
 
 #
 # This is the ``identification'' of the kernel.  Usually this should
 # be the same as the name of your kernel.
 #
 ident		LINT
 
 #
 # The `maxusers' parameter controls the static sizing of a number of
 # internal system tables by a formula defined in subr_param.c.
 # Omitting this parameter or setting it to 0 will cause the system to
 # auto-size based on physical memory.
 #
 maxusers	10
 
 # To statically compile in device wiring instead of /boot/device.hints
 #hints		"LINT.hints"		# Default places to look for devices.
 
 # Use the following to compile in values accessible to the kernel
 # through getenv() (or kenv(1) in userland). The format of the file
 # is 'variable=value', see kenv(1)
 #
 #env		"LINT.env"
 
 #
 # The `makeoptions' parameter allows variables to be passed to the
 # generated Makefile in the build area.
 #
 # CONF_CFLAGS gives some extra compiler flags that are added to ${CFLAGS}
 # after most other flags.  Here we use it to inhibit use of non-optimal
 # gcc built-in functions (e.g., memcmp).
 #
 # DEBUG happens to be magic.
 # The following is equivalent to 'config -g KERNELNAME' and creates
 # 'kernel.debug' compiled with -g debugging as well as a normal
 # 'kernel'.  Use 'make install.debug' to install the debug kernel
 # but that isn't normally necessary as the debug symbols are not loaded
 # by the kernel and are not useful there anyway.
 #
 # KERNEL can be overridden so that you can change the default name of your
 # kernel.
 #
 # MODULES_OVERRIDE can be used to limit modules built to a specific list.
 #
 makeoptions	CONF_CFLAGS=-fno-builtin  #Don't allow use of memcmp, etc.
 #makeoptions	DEBUG=-g		#Build kernel with gdb(1) debug symbols
 #makeoptions	KERNEL=foo		#Build kernel "foo" and install "/foo"
 # Only build ext2fs module plus those parts of the sound system I need.
 #makeoptions	MODULES_OVERRIDE="ext2fs sound/sound sound/driver/maestro3"
 makeoptions	DESTDIR=/tmp
 
 #
 # FreeBSD processes are subject to certain limits to their consumption
 # of system resources.  See getrlimit(2) for more details.  Each
 # resource limit has two values, a "soft" limit and a "hard" limit.
 # The soft limits can be modified during normal system operation, but
 # the hard limits are set at boot time.  Their default values are
 # in sys/<arch>/include/vmparam.h.  There are two ways to change them:
 # 
 # 1.  Set the values at kernel build time.  The options below are one
 #     way to allow that limit to grow to 1GB.  They can be increased
 #     further by changing the parameters:
 #	
 # 2.  In /boot/loader.conf, set the tunables kern.maxswzone,
 #     kern.maxbcache, kern.maxtsiz, kern.dfldsiz, kern.maxdsiz,
 #     kern.dflssiz, kern.maxssiz and kern.sgrowsiz.
 #
 # The options in /boot/loader.conf override anything in the kernel
 # configuration file.  See the function init_param1 in
 # sys/kern/subr_param.c for more details.
 #
 
 options 	MAXDSIZ=(1024UL*1024*1024)
 options 	MAXSSIZ=(128UL*1024*1024)
 options 	DFLDSIZ=(1024UL*1024*1024)
 
 #
 # BLKDEV_IOSIZE sets the default block size used in user block
 # device I/O.  Note that this value will be overridden by the label
 # when specifying a block device from a label with a non-0
 # partition blocksize.  The default is PAGE_SIZE.
 #
 options 	BLKDEV_IOSIZE=8192
 
 #
 # MAXPHYS and DFLTPHYS
 #
 # These are the maximal and safe 'raw' I/O block device access sizes.
 # Reads and writes will be split into MAXPHYS chunks for known good
 # devices and DFLTPHYS for the rest. Some applications have better
 # performance with larger raw I/O access sizes. Note that certain VM
 # parameters are derived from these values and making them too large
 # can make an unbootable kernel.
 #
 # The defaults are 64K and 128K respectively.
 options 	DFLTPHYS=(64*1024)
 options 	MAXPHYS=(128*1024)
 
 
 # This allows you to actually store this configuration file into
 # the kernel binary itself. See config(8) for more details.
 #
 options 	INCLUDE_CONFIG_FILE     # Include this file in kernel
 
 #
 # Compile-time defaults for various boot parameters
 #
 options 	BOOTVERBOSE=1
 options 	BOOTHOWTO=RB_MULTIPLE
 
 #
 # Compile-time defaults for dmesg boot tagging
 #
 # Default boot tag; may use 'kern.boot_tag' loader tunable to override.  The
 # current boot's tag is also exposed via the 'kern.boot_tag' sysctl.
-options 	BOOT_TAG=\"---<<BOOT>>---\"
+options 	BOOT_TAG=\"\"
 # Maximum boot tag size the kernel's static buffer should accomodate.  Maximum
 # size for both BOOT_TAG and the assocated tunable.
 options 	BOOT_TAG_SZ=32
 
 options 	GEOM_BDE		# Disk encryption.
 options 	GEOM_BSD		# BSD disklabels (obsolete, gone in 12)
 options 	GEOM_CACHE		# Disk cache.
 options 	GEOM_CONCAT		# Disk concatenation.
 options 	GEOM_ELI		# Disk encryption.
 options 	GEOM_FOX		# Redundant path mitigation (obsolete, gone in 12)
 options 	GEOM_GATE		# Userland services.
 options 	GEOM_JOURNAL		# Journaling.
 options 	GEOM_LABEL		# Providers labelization.
 options 	GEOM_LINUX_LVM		# Linux LVM2 volumes
 options 	GEOM_MAP		# Map based partitioning
 options 	GEOM_MBR		# DOS/MBR partitioning (obsolete, gone in 12)
 options 	GEOM_MIRROR		# Disk mirroring.
 options 	GEOM_MULTIPATH		# Disk multipath
 options 	GEOM_NOP		# Test class.
 options 	GEOM_PART_APM		# Apple partitioning
 options 	GEOM_PART_BSD		# BSD disklabel
 options 	GEOM_PART_BSD64		# BSD disklabel64
 options 	GEOM_PART_EBR		# Extended Boot Records
 options 	GEOM_PART_EBR_COMPAT	# Backward compatible partition names
 options 	GEOM_PART_GPT		# GPT partitioning
 options 	GEOM_PART_LDM		# Logical Disk Manager
 options 	GEOM_PART_MBR		# MBR partitioning
 options 	GEOM_PART_VTOC8		# SMI VTOC8 disk label
 options 	GEOM_RAID		# Soft RAID functionality.
 options 	GEOM_RAID3		# RAID3 functionality.
 options 	GEOM_SHSEC		# Shared secret.
 options 	GEOM_STRIPE		# Disk striping.
 options 	GEOM_SUNLABEL		# Sun/Solaris partitioning (obsolete, gone in 12)
 options 	GEOM_UZIP		# Read-only compressed disks
 options 	GEOM_VINUM		# Vinum logical volume manager
 options 	GEOM_VIRSTOR		# Virtual storage.
 options 	GEOM_VOL		# Volume names from UFS superblock (obsolete, gone in 12)
 options 	GEOM_ZERO		# Performance testing helper.
 
 #
 # The root device and filesystem type can be compiled in;
 # this provides a fallback option if the root device cannot
 # be correctly guessed by the bootstrap code, or an override if
 # the RB_DFLTROOT flag (-r) is specified when booting the kernel.
 #
 options 	ROOTDEVNAME=\"ufs:da0s2e\"
 
 
 #####################################################################
 # Scheduler options:
 #
 # Specifying one of SCHED_4BSD or SCHED_ULE is mandatory.  These options
 # select which scheduler is compiled in.
 #
 # SCHED_4BSD is the historical, proven, BSD scheduler.  It has a global run
 # queue and no CPU affinity which makes it suboptimal for SMP.  It has very
 # good interactivity and priority selection.
 #
 # SCHED_ULE provides significant performance advantages over 4BSD on many
 # workloads on SMP machines.  It supports cpu-affinity, per-cpu runqueues
 # and scheduler locks.  It also has a stronger notion of interactivity 
 # which leads to better responsiveness even on uniprocessor machines.  This
 # is the default scheduler.
 #
 # SCHED_STATS is a debugging option which keeps some stats in the sysctl
 # tree at 'kern.sched.stats' and is useful for debugging scheduling decisions.
 #
 options 	SCHED_4BSD
 options 	SCHED_STATS
 #options 	SCHED_ULE
 
 #####################################################################
 # SMP OPTIONS:
 #
 # SMP enables building of a Symmetric MultiProcessor Kernel.
 
 # Mandatory:
 options 	SMP			# Symmetric MultiProcessor Kernel
 
 # EARLY_AP_STARTUP releases the Application Processors earlier in the
 # kernel startup process (before devices are probed) rather than at the
 # end.  This is a temporary option for use during the transition from
 # late to early AP startup.
 options		EARLY_AP_STARTUP
 
 # MAXCPU defines the maximum number of CPUs that can boot in the system.
 # A default value should be already present, for every architecture.
 options 	MAXCPU=32
 
 # NUMA enables use of Non-Uniform Memory Access policies in various kernel
 # subsystems.
 options 	NUMA
 
 # MAXMEMDOM defines the maximum number of memory domains that can boot in the
 # system.  A default value should already be defined by every architecture.
 options 	MAXMEMDOM=2
 
 # ADAPTIVE_MUTEXES changes the behavior of blocking mutexes to spin
 # if the thread that currently owns the mutex is executing on another
 # CPU.  This behavior is enabled by default, so this option can be used
 # to disable it.
 options 	NO_ADAPTIVE_MUTEXES
 
 # ADAPTIVE_RWLOCKS changes the behavior of reader/writer locks to spin
 # if the thread that currently owns the rwlock is executing on another
 # CPU.  This behavior is enabled by default, so this option can be used
 # to disable it.
 options 	NO_ADAPTIVE_RWLOCKS
 
 # ADAPTIVE_SX changes the behavior of sx locks to spin if the thread that
 # currently owns the sx lock is executing on another CPU.
 # This behavior is enabled by default, so this option can be used to
 # disable it.
 options 	NO_ADAPTIVE_SX
 
 # MUTEX_NOINLINE forces mutex operations to call functions to perform each
 # operation rather than inlining the simple cases.  This can be used to
 # shrink the size of the kernel text segment.  Note that this behavior is
 # already implied by the INVARIANT_SUPPORT, INVARIANTS, KTR, LOCK_PROFILING,
 # and WITNESS options.
 options 	MUTEX_NOINLINE
 
 # RWLOCK_NOINLINE forces rwlock operations to call functions to perform each
 # operation rather than inlining the simple cases.  This can be used to
 # shrink the size of the kernel text segment.  Note that this behavior is
 # already implied by the INVARIANT_SUPPORT, INVARIANTS, KTR, LOCK_PROFILING,
 # and WITNESS options.
 options 	RWLOCK_NOINLINE
 
 # SX_NOINLINE forces sx lock operations to call functions to perform each
 # operation rather than inlining the simple cases.  This can be used to
 # shrink the size of the kernel text segment.  Note that this behavior is
 # already implied by the INVARIANT_SUPPORT, INVARIANTS, KTR, LOCK_PROFILING,
 # and WITNESS options.
 options 	SX_NOINLINE
 
 # SMP Debugging Options:
 #
 # CALLOUT_PROFILING enables rudimentary profiling of the callwheel data
 #	  structure used as backend in callout(9).
 # PREEMPTION allows the threads that are in the kernel to be preempted by
 #	  higher priority [interrupt] threads.  It helps with interactivity
 #	  and allows interrupt threads to run sooner rather than waiting.
 #	  WARNING! Only tested on amd64 and i386.
 # FULL_PREEMPTION instructs the kernel to preempt non-realtime kernel
 #	  threads.  Its sole use is to expose race conditions and other
 #	  bugs during development.  Enabling this option will reduce
 #	  performance and increase the frequency of kernel panics by
 #	  design.  If you aren't sure that you need it then you don't.
 #	  Relies on the PREEMPTION option.  DON'T TURN THIS ON.
 # SLEEPQUEUE_PROFILING enables rudimentary profiling of the hash table
 #	  used to hold active sleep queues as well as sleep wait message
 #	  frequency.
 # TURNSTILE_PROFILING enables rudimentary profiling of the hash table
 #	  used to hold active lock queues.
 # UMTX_PROFILING enables rudimentary profiling of the hash table used 
 #	  to hold active lock queues.
 # WITNESS enables the witness code which detects deadlocks and cycles
 #         during locking operations.
 # WITNESS_KDB causes the witness code to drop into the kernel debugger if
 #	  a lock hierarchy violation occurs or if locks are held when going to
 #	  sleep.
 # WITNESS_SKIPSPIN disables the witness checks on spin mutexes.
 options 	PREEMPTION
 options 	FULL_PREEMPTION
 options 	WITNESS
 options 	WITNESS_KDB
 options 	WITNESS_SKIPSPIN
 
 # LOCK_PROFILING - Profiling locks.  See LOCK_PROFILING(9) for details.
 options 	LOCK_PROFILING
 # Set the number of buffers and the hash size.  The hash size MUST be larger
 # than the number of buffers.  Hash size should be prime.
 options 	MPROF_BUFFERS="1536"
 options 	MPROF_HASH_SIZE="1543"
 
 # Profiling for the callout(9) backend.
 options 	CALLOUT_PROFILING
 
 # Profiling for internal hash tables.
 options 	SLEEPQUEUE_PROFILING
 options 	TURNSTILE_PROFILING
 options 	UMTX_PROFILING
 
 
 #####################################################################
 # COMPATIBILITY OPTIONS
 
 #
 # Implement system calls compatible with 4.3BSD and older versions of
 # FreeBSD.  You probably do NOT want to remove this as much current code
 # still relies on the 4.3 emulation.  Note that some architectures that
 # are supported by FreeBSD do not include support for certain important
 # aspects of this compatibility option, namely those related to the
 # signal delivery mechanism.
 #
 options 	COMPAT_43
 
 # Old tty interface.
 options 	COMPAT_43TTY
 
 # Note that as a general rule, COMPAT_FREEBSD<n> depends on
 # COMPAT_FREEBSD<n+1>, COMPAT_FREEBSD<n+2>, etc.
 
 # Enable FreeBSD4 compatibility syscalls
 options 	COMPAT_FREEBSD4
 
 # Enable FreeBSD5 compatibility syscalls
 options 	COMPAT_FREEBSD5
 
 # Enable FreeBSD6 compatibility syscalls
 options 	COMPAT_FREEBSD6
 
 # Enable FreeBSD7 compatibility syscalls
 options 	COMPAT_FREEBSD7
 
 # Enable FreeBSD9 compatibility syscalls
 options 	COMPAT_FREEBSD9
 
 # Enable FreeBSD10 compatibility syscalls
 options 	COMPAT_FREEBSD10
 
 # Enable FreeBSD11 compatibility syscalls
 options 	COMPAT_FREEBSD11
 
 # Enable Linux Kernel Programming Interface
 options 	COMPAT_LINUXKPI
 
 #
 # These three options provide support for System V Interface
 # Definition-style interprocess communication, in the form of shared
 # memory, semaphores, and message queues, respectively.
 #
 options 	SYSVSHM
 options 	SYSVSEM
 options 	SYSVMSG
 
 
 #####################################################################
 # DEBUGGING OPTIONS
 
 #
 # Compile with kernel debugger related code.
 #
 options 	KDB
 
 #
 # Print a stack trace of the current thread on the console for a panic.
 #
 options 	KDB_TRACE
 
 #
 # Don't enter the debugger for a panic. Intended for unattended operation
 # where you may want to enter the debugger from the console, but still want
 # the machine to recover from a panic.
 #
 options 	KDB_UNATTENDED
 
 #
 # Enable the ddb debugger backend.
 #
 options 	DDB
 
 #
 # Print the numerical value of symbols in addition to the symbolic
 # representation.
 #
 options 	DDB_NUMSYM
 
 #
 # Enable the remote gdb debugger backend.
 #
 options 	GDB
 
 #
 # SYSCTL_DEBUG enables a 'sysctl' debug tree that can be used to dump the
 # contents of the registered sysctl nodes on the console.  It is disabled by
 # default because it generates excessively verbose console output that can
 # interfere with serial console operation.
 #
 options 	SYSCTL_DEBUG
 
 #
 # Enable textdump by default, this disables kernel core dumps.
 #
 options		TEXTDUMP_PREFERRED
 
 #
 # Enable extra debug messages while performing textdumps.
 #
 options		TEXTDUMP_VERBOSE
 
 #
 # NO_SYSCTL_DESCR omits the sysctl node descriptions to save space in the
 # resulting kernel.
 options		NO_SYSCTL_DESCR
 
 #
 # MALLOC_DEBUG_MAXZONES enables multiple uma zones for malloc(9)
 # allocations that are smaller than a page.  The purpose is to isolate
 # different malloc types into hash classes, so that any buffer
 # overruns or use-after-free will usually only affect memory from
 # malloc types in that hash class.  This is purely a debugging tool;
 # by varying the hash function and tracking which hash class was
 # corrupted, the intersection of the hash classes from each instance
 # will point to a single malloc type that is being misused.  At this
 # point inspection or memguard(9) can be used to catch the offending
 # code.
 #
 options 	MALLOC_DEBUG_MAXZONES=8
 
 #
 # DEBUG_MEMGUARD builds and enables memguard(9), a replacement allocator
 # for the kernel used to detect modify-after-free scenarios.  See the
 # memguard(9) man page for more information on usage.
 #
 options 	DEBUG_MEMGUARD
 
 #
 # DEBUG_REDZONE enables buffer underflows and buffer overflows detection for
 # malloc(9).
 #
 options 	DEBUG_REDZONE
 
 #
 # EARLY_PRINTF enables support for calling a special printf (eprintf)
 # very early in the kernel (before cn_init() has been called).  This
 # should only be used for debugging purposes early in boot.  Normally,
 # it is not defined.  It is commented out here because this feature
 # isn't generally available. And the required eputc() isn't defined.
 #
 #options	EARLY_PRINTF
 
 #
 # KTRACE enables the system-call tracing facility ktrace(2).  To be more
 # SMP-friendly, KTRACE uses a worker thread to process most trace events
 # asynchronously to the thread generating the event.  This requires a
 # pre-allocated store of objects representing trace events.  The
 # KTRACE_REQUEST_POOL option specifies the initial size of this store.
 # The size of the pool can be adjusted both at boottime and runtime via
 # the kern.ktrace_request_pool tunable and sysctl.
 #
 options 	KTRACE			#kernel tracing
 options 	KTRACE_REQUEST_POOL=101
 
 #
 # KTR is a kernel tracing facility imported from BSD/OS.  It is
 # enabled with the KTR option.  KTR_ENTRIES defines the number of
 # entries in the circular trace buffer; it may be an arbitrary number.
 # KTR_BOOT_ENTRIES defines the number of entries during the early boot,
 # before malloc(9) is functional.
 # KTR_COMPILE defines the mask of events to compile into the kernel as
 # defined by the KTR_* constants in <sys/ktr.h>.  KTR_MASK defines the
 # initial value of the ktr_mask variable which determines at runtime
 # what events to trace.  KTR_CPUMASK determines which CPU's log
 # events, with bit X corresponding to CPU X.  The layout of the string
 # passed as KTR_CPUMASK must match a series of bitmasks each of them
 # separated by the "," character (ie:
 # KTR_CPUMASK=0xAF,0xFFFFFFFFFFFFFFFF).  KTR_VERBOSE enables
 # dumping of KTR events to the console by default.  This functionality
 # can be toggled via the debug.ktr_verbose sysctl and defaults to off
 # if KTR_VERBOSE is not defined.  See ktr(4) and ktrdump(8) for details.
 #
 options 	KTR
 options 	KTR_BOOT_ENTRIES=1024
 options 	KTR_ENTRIES=(128*1024)
 options 	KTR_COMPILE=(KTR_ALL)
 options 	KTR_MASK=KTR_INTR
 options 	KTR_CPUMASK=0x3
 options 	KTR_VERBOSE
 
 #
 # ALQ(9) is a facility for the asynchronous queuing of records from the kernel
 # to a vnode, and is employed by services such as ktr(4) to produce trace
 # files based on a kernel event stream.  Records are written asynchronously
 # in a worker thread.
 #
 options 	ALQ
 options 	KTR_ALQ
 
 #
 # The INVARIANTS option is used in a number of source files to enable
 # extra sanity checking of internal structures.  This support is not
 # enabled by default because of the extra time it would take to check
 # for these conditions, which can only occur as a result of
 # programming errors.
 #
 options 	INVARIANTS
 
 #
 # The INVARIANT_SUPPORT option makes us compile in support for
 # verifying some of the internal structures.  It is a prerequisite for
 # 'INVARIANTS', as enabling 'INVARIANTS' will make these functions be
 # called.  The intent is that you can set 'INVARIANTS' for single
 # source files (by changing the source file or specifying it on the
 # command line) if you have 'INVARIANT_SUPPORT' enabled.  Also, if you
 # wish to build a kernel module with 'INVARIANTS', then adding
 # 'INVARIANT_SUPPORT' to your kernel will provide all the necessary
 # infrastructure without the added overhead.
 #
 options 	INVARIANT_SUPPORT
 
 #
 # The KASSERT_PANIC_OPTIONAL option allows kasserts to fire without
 # necessarily inducing a panic.  Panic is the default behavior, but
 # runtime options can configure it either entirely off, or off with a
 # limit.
 #
 options 	KASSERT_PANIC_OPTIONAL
 
 #
 # The DIAGNOSTIC option is used to enable extra debugging information
 # from some parts of the kernel.  As this makes everything more noisy,
 # it is disabled by default.
 #
 options 	DIAGNOSTIC
 
 #
 # REGRESSION causes optional kernel interfaces necessary only for regression
 # testing to be enabled.  These interfaces may constitute security risks
 # when enabled, as they permit processes to easily modify aspects of the
 # run-time environment to reproduce unlikely or unusual (possibly normally
 # impossible) scenarios.
 #
 options 	REGRESSION
 
 #
 # This option lets some drivers co-exist that can't co-exist in a running
 # system.  This is used to be able to compile all kernel code in one go for
 # quality assurance purposes (like this file, which the option takes it name
 # from.)
 #
 options 	COMPILING_LINT
 
 #
 # STACK enables the stack(9) facility, allowing the capture of kernel stack
 # for the purpose of procinfo(1), etc.  stack(9) will also be compiled in
 # automatically if DDB(4) is compiled into the kernel.
 #
 options 	STACK
 
 #
 # The NUM_CORE_FILES option specifies the limit for the number of core
 # files generated by a particular process, when the core file format
 # specifier includes the %I pattern. Since we only have 1 character for
 # the core count in the format string, meaning the range will be 0-9, the
 # maximum value allowed for this option is 10.
 # This core file limit can be adjusted at runtime via the debug.ncores
 # sysctl.
 #
 options 	NUM_CORE_FILES=5
 
 #
 # The TSLOG option enables timestamped logging of events, especially
 # function entries/exits, in order to track the time spent by the kernel.
 # In particular, this is useful when investigating the early boot process,
 # before it is possible to use more sophisticated tools like DTrace.
 # The TSLOGSIZE option controls the size of the (preallocated, fixed
 # length) buffer used for storing these events (default: 262144 records).
 #
 # For security reasons the TSLOG option should not be enabled on systems
 # used in production.
 #
 options 	TSLOG
 options 	TSLOGSIZE=262144
 
 
 #####################################################################
 # PERFORMANCE MONITORING OPTIONS
 
 #
 # The hwpmc driver that allows the use of in-CPU performance monitoring
 # counters for performance monitoring.  The base kernel needs to be configured
 # with the 'options' line, while the hwpmc device can be either compiled
 # in or loaded as a loadable kernel module.
 #
 # Additional configuration options may be required on specific architectures,
 # please see hwpmc(4).
 
 device		hwpmc			# Driver (also a loadable module)
 options 	HWPMC_DEBUG
 options 	HWPMC_HOOKS		# Other necessary kernel hooks
 
 
 #####################################################################
 # NETWORKING OPTIONS
 
 #
 # Protocol families
 #
 options 	INET			#Internet communications protocols
 options 	INET6			#IPv6 communications protocols
 
 options		RATELIMIT		# TX rate limiting support
 
 options 	ROUTETABLES=2		# allocated fibs up to 65536. default is 1.
 					# but that would be a bad idea as they are large.
 
 options 	TCP_OFFLOAD		# TCP offload support.
 
 options		TCPHPTS
 
 # In order to enable IPSEC you MUST also add device crypto to 
 # your kernel configuration
 options 	IPSEC			#IP security (requires device crypto)
 
 # Option IPSEC_SUPPORT does not enable IPsec, but makes it possible to 
 # load it as a kernel module. You still MUST add device crypto to your kernel
 # configuration.
 options		IPSEC_SUPPORT
 #options 	IPSEC_DEBUG		#debug for IP security
 
 #
 # SMB/CIFS requester
 # NETSMB enables support for SMB protocol, it requires LIBMCHAIN and LIBICONV
 # options.
 options 	NETSMB			#SMB/CIFS requester
 
 # mchain library. It can be either loaded as KLD or compiled into kernel
 options 	LIBMCHAIN
 
 # libalias library, performing NAT
 options 	LIBALIAS
 
 #
 # SCTP is a NEW transport protocol defined by
 # RFC2960 updated by RFC3309 and RFC3758.. and
 # soon to have a new base RFC and many many more
 # extensions. This release supports all the extensions
 # including many drafts (most about to become RFC's).
 # It is the reference implementation of SCTP
 # and is quite well tested.
 #
 # Note YOU MUST have both INET and INET6 defined.
 # You don't have to enable V6, but SCTP is 
 # dual stacked and so far we have not torn apart
 # the V6 and V4.. since an association can span
 # both a V6 and V4 address at the SAME time :-)
 #
 options 	SCTP
 # There are bunches of options:
 # this one turns on all sorts of
 # nastily printing that you can
 # do. It's all controlled by a
 # bit mask (settable by socket opt and
 # by sysctl). Including will not cause
 # logging until you set the bits.. but it
 # can be quite verbose.. so without this
 # option we don't do any of the tests for
 # bits and prints.. which makes the code run
 # faster.. if you are not debugging don't use.
 options 	SCTP_DEBUG
 #
 # All that options after that turn on specific types of
 # logging. You can monitor CWND growth, flight size
 # and all sorts of things. Go look at the code and
 # see. I have used this to produce interesting 
 # charts and graphs as well :->
 # 
 # I have not yet committed the tools to get and print
 # the logs, I will do that eventually .. before then
 # if you want them send me an email rrs@freebsd.org
 # You basically must have ktr(4) enabled for these
 # and you then set the sysctl to turn on/off various
 # logging bits. Use ktrdump(8) to pull the log and run
 # it through a display program.. and graphs and other
 # things too.
 #
 options 	SCTP_LOCK_LOGGING
 options 	SCTP_MBUF_LOGGING
 options 	SCTP_MBCNT_LOGGING
 options 	SCTP_PACKET_LOGGING
 options 	SCTP_LTRACE_CHUNKS
 options 	SCTP_LTRACE_ERRORS
 
 
 # altq(9). Enable the base part of the hooks with the ALTQ option.
 # Individual disciplines must be built into the base system and can not be
 # loaded as modules at this point. ALTQ requires a stable TSC so if yours is
 # broken or changes with CPU throttling then you must also have the ALTQ_NOPCC
 # option.
 options 	ALTQ
 options 	ALTQ_CBQ	# Class Based Queueing
 options 	ALTQ_RED	# Random Early Detection
 options 	ALTQ_RIO	# RED In/Out
 options 	ALTQ_CODEL	# CoDel Active Queueing
 options 	ALTQ_HFSC	# Hierarchical Packet Scheduler
 options 	ALTQ_FAIRQ	# Fair Packet Scheduler
 options 	ALTQ_CDNR	# Traffic conditioner
 options 	ALTQ_PRIQ	# Priority Queueing
 options 	ALTQ_NOPCC	# Required if the TSC is unusable
 options 	ALTQ_DEBUG
 
 # netgraph(4). Enable the base netgraph code with the NETGRAPH option.
 # Individual node types can be enabled with the corresponding option
 # listed below; however, this is not strictly necessary as netgraph
 # will automatically load the corresponding KLD module if the node type
 # is not already compiled into the kernel. Each type below has a
 # corresponding man page, e.g., ng_async(8).
 options 	NETGRAPH		# netgraph(4) system
 options 	NETGRAPH_DEBUG		# enable extra debugging, this
 					# affects netgraph(4) and nodes
 # Node types
 options 	NETGRAPH_ASYNC
 options 	NETGRAPH_ATMLLC
 options 	NETGRAPH_ATM_ATMPIF
 options 	NETGRAPH_BLUETOOTH		# ng_bluetooth(4)
 options 	NETGRAPH_BLUETOOTH_BT3C		# ng_bt3c(4)
 options 	NETGRAPH_BLUETOOTH_HCI		# ng_hci(4)
 options 	NETGRAPH_BLUETOOTH_L2CAP	# ng_l2cap(4)
 options 	NETGRAPH_BLUETOOTH_SOCKET	# ng_btsocket(4)
 options 	NETGRAPH_BLUETOOTH_UBT		# ng_ubt(4)
 options 	NETGRAPH_BLUETOOTH_UBTBCMFW	# ubtbcmfw(4)
 options 	NETGRAPH_BPF
 options 	NETGRAPH_BRIDGE
 options 	NETGRAPH_CAR
 options 	NETGRAPH_CHECKSUM
 options 	NETGRAPH_CISCO
 options 	NETGRAPH_DEFLATE
 options 	NETGRAPH_DEVICE
 options 	NETGRAPH_ECHO
 options 	NETGRAPH_EIFACE
 options 	NETGRAPH_ETHER
 options 	NETGRAPH_FRAME_RELAY
 options 	NETGRAPH_GIF
 options 	NETGRAPH_GIF_DEMUX
 options 	NETGRAPH_HOLE
 options 	NETGRAPH_IFACE
 options 	NETGRAPH_IP_INPUT
 options 	NETGRAPH_IPFW
 options 	NETGRAPH_KSOCKET
 options 	NETGRAPH_L2TP
 options 	NETGRAPH_LMI
 options 	NETGRAPH_MPPC_COMPRESSION
 options 	NETGRAPH_MPPC_ENCRYPTION
 options 	NETGRAPH_NETFLOW
 options 	NETGRAPH_NAT
 options 	NETGRAPH_ONE2MANY
 options 	NETGRAPH_PATCH
 options 	NETGRAPH_PIPE
 options 	NETGRAPH_PPP
 options 	NETGRAPH_PPPOE
 options 	NETGRAPH_PPTPGRE
 options 	NETGRAPH_PRED1
 options 	NETGRAPH_RFC1490
 options 	NETGRAPH_SOCKET
 options 	NETGRAPH_SPLIT
 options 	NETGRAPH_SPPP
 options 	NETGRAPH_TAG
 options 	NETGRAPH_TCPMSS
 options 	NETGRAPH_TEE
 options 	NETGRAPH_UI
 options 	NETGRAPH_VJC
 options 	NETGRAPH_VLAN
 
 # NgATM - Netgraph ATM
 options 	NGATM_ATM
 options 	NGATM_ATMBASE
 options 	NGATM_SSCOP
 options 	NGATM_SSCFU
 options 	NGATM_UNI
 options 	NGATM_CCATM
 
 device		mn	# Munich32x/Falc54 Nx64kbit/sec cards.
 
 # Network stack virtualization.
 options	VIMAGE
 options	VNET_DEBUG	# debug for VIMAGE
 
 #
 # Network interfaces:
 #  The `loop' device is MANDATORY when networking is enabled.
 device		loop
 
 #  The `ether' device provides generic code to handle
 #  Ethernets; it is MANDATORY when an Ethernet device driver is
 #  configured.
 device		ether
 
 #  The `vlan' device implements the VLAN tagging of Ethernet frames
 #  according to IEEE 802.1Q.
 device		vlan
 
 # The `vxlan' device implements the VXLAN encapsulation of Ethernet
 # frames in UDP packets according to RFC7348.
 device		vxlan
 
 #  The `wlan' device provides generic code to support 802.11
 #  drivers, including host AP mode; it is MANDATORY for the wi,
 #  and ath drivers and will eventually be required by all 802.11 drivers.
 device		wlan
 options 	IEEE80211_DEBUG		#enable debugging msgs
 options 	IEEE80211_SUPPORT_MESH	#enable 802.11s D3.0 support
 options 	IEEE80211_SUPPORT_TDMA	#enable TDMA support
 
 #  The `wlan_wep', `wlan_tkip', and `wlan_ccmp' devices provide
 #  support for WEP, TKIP, and AES-CCMP crypto protocols optionally
 #  used with 802.11 devices that depend on the `wlan' module.
 device		wlan_wep
 device		wlan_ccmp
 device		wlan_tkip
 
 #  The `wlan_xauth' device provides support for external (i.e. user-mode)
 #  authenticators for use with 802.11 drivers that use the `wlan'
 #  module and support 802.1x and/or WPA security protocols.
 device		wlan_xauth
 
 #  The `wlan_acl' device provides a MAC-based access control mechanism
 #  for use with 802.11 drivers operating in ap mode and using the
 #  `wlan' module.
 #  The 'wlan_amrr' device provides AMRR transmit rate control algorithm
 device		wlan_acl
 device		wlan_amrr
 
 #  The `sppp' device serves a similar role for certain types
 #  of synchronous PPP links (like `cx', `ar').
 device		sppp
 
 #  The `bpf' device enables the Berkeley Packet Filter.  Be
 #  aware of the legal and administrative consequences of enabling this
 #  option.  DHCP requires bpf.
 device		bpf
 
 #  The `netmap' device implements memory-mapped access to network
 #  devices from userspace, enabling wire-speed packet capture and
 #  generation even at 10Gbit/s. Requires support in the device
 #  driver. Supported drivers are ixgbe, e1000, re.
 device		netmap
 
 #  The `disc' device implements a minimal network interface,
 #  which throws away all packets sent and never receives any.  It is
 #  included for testing and benchmarking purposes.
 device		disc
 
 # The `epair' device implements a virtual back-to-back connected Ethernet
 # like interface pair.
 device		epair
 
 #  The `edsc' device implements a minimal Ethernet interface,
 #  which discards all packets sent and receives none.
 device		edsc
 
 #  The `tap' device is a pty-like virtual Ethernet interface
 device		tap
 
 #  The `tun' device implements (user-)ppp and nos-tun(8)
 device		tun
 
 #  The `gif' device implements IPv6 over IP4 tunneling,
 #  IPv4 over IPv6 tunneling, IPv4 over IPv4 tunneling and
 #  IPv6 over IPv6 tunneling.
 #  The `gre' device implements GRE (Generic Routing Encapsulation) tunneling,
 #  as specified in the RFC 2784 and RFC 2890.
 #  The `me' device implements Minimal Encapsulation within IPv4 as
 #  specified in the RFC 2004.
 #  The XBONEHACK option allows the same pair of addresses to be configured on
 #  multiple gif interfaces.
 device		gif
 device		gre
 device		me
 options 	XBONEHACK
 
 #  The `stf' device implements 6to4 encapsulation.
 device		stf
 
 # The pf packet filter consists of three devices:
 #  The `pf' device provides /dev/pf and the firewall code itself.
 #  The `pflog' device provides the pflog0 interface which logs packets.
 #  The `pfsync' device provides the pfsync0 interface used for
 #   synchronization of firewall state tables (over the net).
 device		pf
 device		pflog
 device		pfsync
 
 # Bridge interface.
 device		if_bridge
 
 # Common Address Redundancy Protocol. See carp(4) for more details.
 device		carp
 
 # IPsec interface.
 device		enc
 
 # Link aggregation interface.
 device		lagg
 
 #
 # Internet family options:
 #
 # MROUTING enables the kernel multicast packet forwarder, which works
 # with mrouted and XORP.
 #
 # IPFIREWALL enables support for IP firewall construction, in
 # conjunction with the `ipfw' program.  IPFIREWALL_VERBOSE sends
 # logged packets to the system logger.  IPFIREWALL_VERBOSE_LIMIT
 # limits the number of times a matching entry can be logged.
 #
 # WARNING:  IPFIREWALL defaults to a policy of "deny ip from any to any"
 # and if you do not add other rules during startup to allow access,
 # YOU WILL LOCK YOURSELF OUT.  It is suggested that you set firewall_type=open
 # in /etc/rc.conf when first enabling this feature, then refining the
 # firewall rules in /etc/rc.firewall after you've tested that the new kernel
 # feature works properly.
 #
 # IPFIREWALL_DEFAULT_TO_ACCEPT causes the default rule (at boot) to
 # allow everything.  Use with care, if a cracker can crash your
 # firewall machine, they can get to your protected machines.  However,
 # if you are using it as an as-needed filter for specific problems as
 # they arise, then this may be for you.  Changing the default to 'allow'
 # means that you won't get stuck if the kernel and /sbin/ipfw binary get
 # out of sync.
 #
 # IPDIVERT enables the divert IP sockets, used by ``ipfw divert''.  It
 # depends on IPFIREWALL if compiled into the kernel.
 #
 # IPFIREWALL_NAT adds support for in kernel nat in ipfw, and it requires
 # LIBALIAS.
 #
 # IPFIREWALL_NAT64 adds support for in kernel NAT64 in ipfw.
 #
 # IPFIREWALL_NPTV6 adds support for in kernel NPTv6 in ipfw.
 #
 # IPFIREWALL_PMOD adds support for protocols modification module. Currently
 # it supports only TCP MSS modification.
 #
 # IPSTEALTH enables code to support stealth forwarding (i.e., forwarding
 # packets without touching the TTL).  This can be useful to hide firewalls
 # from traceroute and similar tools.
 #
 # PF_DEFAULT_TO_DROP causes the default pf(4) rule to deny everything.
 #
 # TCPDEBUG enables code which keeps traces of the TCP state machine
 # for sockets with the SO_DEBUG option set, which can then be examined
 # using the trpt(8) utility.
 #
 # TCPPCAP enables code which keeps the last n packets sent and received
 # on a TCP socket.
 #
 # TCP_BLACKBOX enables enhanced TCP event logging.
 #
 # TCP_HHOOK enables the hhook(9) framework hooks for the TCP stack.
 #
 # RADIX_MPATH provides support for equal-cost multi-path routing.
 #
 options 	MROUTING		# Multicast routing
 options 	IPFIREWALL		#firewall
 options 	IPFIREWALL_VERBOSE	#enable logging to syslogd(8)
 options 	IPFIREWALL_VERBOSE_LIMIT=100	#limit verbosity
 options 	IPFIREWALL_DEFAULT_TO_ACCEPT	#allow everything by default
 options 	IPFIREWALL_NAT		#ipfw kernel nat support
 options 	IPFIREWALL_NAT64	#ipfw kernel NAT64 support
 options 	IPFIREWALL_NPTV6	#ipfw kernel IPv6 NPT support
 options 	IPDIVERT		#divert sockets
 options 	IPFILTER		#ipfilter support
 options 	IPFILTER_LOG		#ipfilter logging
 options 	IPFILTER_LOOKUP		#ipfilter pools
 options 	IPFILTER_DEFAULT_BLOCK	#block all packets by default
 options 	IPSTEALTH		#support for stealth forwarding
 options 	PF_DEFAULT_TO_DROP	#drop everything by default
 options 	TCPDEBUG
 options 	TCPPCAP
 options 	TCP_BLACKBOX
 options 	TCP_HHOOK
 options 	RADIX_MPATH
 
 # The MBUF_STRESS_TEST option enables options which create
 # various random failures / extreme cases related to mbuf
 # functions.  See mbuf(9) for a list of available test cases.
 # MBUF_PROFILING enables code to profile the mbuf chains
 # exiting the system (via participating interfaces) and
 # return a logarithmic histogram of monitored parameters
 # (e.g. packet size, wasted space, number of mbufs in chain).
 options 	MBUF_STRESS_TEST
 options 	MBUF_PROFILING
 
 # Statically link in accept filters
 options 	ACCEPT_FILTER_DATA
 options 	ACCEPT_FILTER_DNS
 options 	ACCEPT_FILTER_HTTP
 
 # TCP_SIGNATURE adds support for RFC 2385 (TCP-MD5) digests. These are
 # carried in TCP option 19. This option is commonly used to protect
 # TCP sessions (e.g. BGP) where IPSEC is not available nor desirable.
 # This is enabled on a per-socket basis using the TCP_MD5SIG socket option.
 # This requires the use of 'device crypto' and either 'options IPSEC' or
 # 'options IPSEC_SUPPORT'.
 options 	TCP_SIGNATURE		#include support for RFC 2385
 
 # DUMMYNET enables the "dummynet" bandwidth limiter.  You need IPFIREWALL
 # as well.  See dummynet(4) and ipfw(8) for more info.  When you run
 # DUMMYNET it is advisable to also have at least "options HZ=1000" to achieve
 # a smooth scheduling of the traffic.
 options 	DUMMYNET
 
 # The NETDUMP option enables netdump(4) client support in the kernel.
 # This allows a panicking kernel to transmit a kernel dump to a remote host.
 options 	NETDUMP
 
 #####################################################################
 # FILESYSTEM OPTIONS
 
 #
 # Only the root filesystem needs to be statically compiled or preloaded
 # as module; everything else will be automatically loaded at mount
 # time.  Some people still prefer to statically compile other
 # filesystems as well.
 #
 # NB: The UNION filesystem was known to be buggy in the past.  It is now
 # being actively maintained, although there are still some issues being
 # resolved.
 #
 
 # One of these is mandatory:
 options 	FFS			#Fast filesystem
 options 	NFSCL			#Network File System client
 
 # The rest are optional:
 options 	AUTOFS			#Automounter filesystem
 options 	CD9660			#ISO 9660 filesystem
 options 	FDESCFS			#File descriptor filesystem
 options 	FUSEFS			#FUSEFS support module
 options 	MSDOSFS			#MS DOS File System (FAT, FAT32)
 options 	NFSLOCKD		#Network Lock Manager
 options 	NFSD			#Network Filesystem Server
 options 	KGSSAPI			#Kernel GSSAPI implementation
 
 options 	NULLFS			#NULL filesystem
 options 	PROCFS			#Process filesystem (requires PSEUDOFS)
 options 	PSEUDOFS		#Pseudo-filesystem framework
 options 	PSEUDOFS_TRACE		#Debugging support for PSEUDOFS
 options 	SMBFS			#SMB/CIFS filesystem
 options 	TMPFS			#Efficient memory filesystem
 options 	UDF			#Universal Disk Format
 options 	UNIONFS			#Union filesystem
 # The xFS_ROOT options REQUIRE the associated ``options xFS''
 options 	NFS_ROOT		#NFS usable as root device
 
 # Soft updates is a technique for improving filesystem speed and
 # making abrupt shutdown less risky.
 #
 options 	SOFTUPDATES
 
 # Extended attributes allow additional data to be associated with files,
 # and is used for ACLs, Capabilities, and MAC labels.
 # See src/sys/ufs/ufs/README.extattr for more information.
 options 	UFS_EXTATTR
 options 	UFS_EXTATTR_AUTOSTART
 
 # Access Control List support for UFS filesystems.  The current ACL
 # implementation requires extended attribute support, UFS_EXTATTR,
 # for the underlying filesystem.
 # See src/sys/ufs/ufs/README.acls for more information.
 options 	UFS_ACL
 
 # Directory hashing improves the speed of operations on very large
 # directories at the expense of some memory.
 options 	UFS_DIRHASH
 
 # Gjournal-based UFS journaling support.
 options 	UFS_GJOURNAL
 
 # Make space in the kernel for a root filesystem on a md device.
 # Define to the number of kilobytes to reserve for the filesystem.
 # This is now optional.
 # If not defined, the root filesystem passed in as the MFS_IMAGE makeoption
 # will be automatically embedded in the kernel during linking. Its exact size
 # will be consumed within the kernel.
 # If defined, the old way of embedding the filesystem in the kernel will be
 # used. That is to say MD_ROOT_SIZE KB will be allocated in the kernel and
 # later, the filesystem image passed in as the MFS_IMAGE makeoption will be
 # dd'd into the reserved space if it fits.
 options 	MD_ROOT_SIZE=10
 
 # Make the md device a potential root device, either with preloaded
 # images of type mfs_root or md_root.
 options 	MD_ROOT
 
 # Write-protect the md root device so that it may not be mounted writeable.
 options 	MD_ROOT_READONLY
 
 # Allow to read MD image from external memory regions
 options		MD_ROOT_MEM
 
 # Disk quotas are supported when this option is enabled.
 options 	QUOTA			#enable disk quotas
 
 # If you are running a machine just as a fileserver for PC and MAC
 # users, using SAMBA, you may consider setting this option
 # and keeping all those users' directories on a filesystem that is
 # mounted with the suiddir option. This gives new files the same
 # ownership as the directory (similar to group). It's a security hole
 # if you let these users run programs, so confine it to file-servers
 # (but it'll save you lots of headaches in those cases). Root owned
 # directories are exempt and X bits are cleared. The suid bit must be
 # set on the directory as well; see chmod(1). PC owners can't see/set
 # ownerships so they keep getting their toes trodden on. This saves
 # you all the support calls as the filesystem it's used on will act as
 # they expect: "It's my dir so it must be my file".
 #
 options 	SUIDDIR
 
 # NFS options:
 options 	NFS_MINATTRTIMO=3	# VREG attrib cache timeout in sec
 options 	NFS_MAXATTRTIMO=60
 options 	NFS_MINDIRATTRTIMO=30	# VDIR attrib cache timeout in sec
 options 	NFS_MAXDIRATTRTIMO=60
 options 	NFS_DEBUG		# Enable NFS Debugging
 
 #
 # Add support for the EXT2FS filesystem of Linux fame.  Be a bit
 # careful with this - the ext2fs code has a tendency to lag behind
 # changes and not be exercised very much, so mounting read/write could
 # be dangerous (and even mounting read only could result in panics.)
 #
 options 	EXT2FS
 
 # Cryptographically secure random number generator; /dev/random
 device		random
 
 # The system memory devices; /dev/mem, /dev/kmem
 device		mem
 
 # The kernel symbol table device; /dev/ksyms
 device		ksyms
 
 # Optional character code conversion support with LIBICONV.
 # Each option requires their base file system and LIBICONV.
 options 	CD9660_ICONV
 options 	MSDOSFS_ICONV
 options 	UDF_ICONV
 
 
 #####################################################################
 # POSIX P1003.1B
 
 # Real time extensions added in the 1993 POSIX
 # _KPOSIX_PRIORITY_SCHEDULING: Build in _POSIX_PRIORITY_SCHEDULING
 
 options 	_KPOSIX_PRIORITY_SCHEDULING
 # p1003_1b_semaphores are very experimental,
 # user should be ready to assist in debugging if problems arise.
 options 	P1003_1B_SEMAPHORES
 
 # POSIX message queue
 options 	P1003_1B_MQUEUE
 
 #####################################################################
 # SECURITY POLICY PARAMETERS
 
 # Support for BSM audit
 options 	AUDIT
 
 # Support for Mandatory Access Control (MAC):
 options 	MAC
 options 	MAC_BIBA
 options 	MAC_BSDEXTENDED
 options 	MAC_IFOFF
 options 	MAC_LOMAC
 options 	MAC_MLS
 options 	MAC_NONE
 options 	MAC_NTPD
 options 	MAC_PARTITION
 options 	MAC_PORTACL
 options 	MAC_SEEOTHERUIDS
 options 	MAC_STUB
 options 	MAC_TEST
 
 # Support for Capsicum
 options 	CAPABILITIES	# fine-grained rights on file descriptors
 options 	CAPABILITY_MODE	# sandboxes with no global namespace access
 
 
 #####################################################################
 # CLOCK OPTIONS
 
 # The granularity of operation is controlled by the kernel option HZ whose
 # default value (1000 on most architectures) means a granularity of 1ms
 # (1s/HZ).  Historically, the default was 100, but finer granularity is
 # required for DUMMYNET and other systems on modern hardware.  There are
 # reasonable arguments that HZ should, in fact, be 100 still; consider,
 # that reducing the granularity too much might cause excessive overhead in
 # clock interrupt processing, potentially causing ticks to be missed and thus
 # actually reducing the accuracy of operation.
 
 options 	HZ=100
 
 # Enable support for the kernel PLL to use an external PPS signal,
 # under supervision of [x]ntpd(8)
 # More info in ntpd documentation: http://www.eecis.udel.edu/~ntp
 
 options 	PPS_SYNC
 
 # Enable support for generic feed-forward clocks in the kernel.
 # The feed-forward clock support is an alternative to the feedback oriented
 # ntpd/system clock approach, and is to be used with a feed-forward
 # synchronization algorithm such as the RADclock:
 # More info here: http://www.synclab.org/radclock
 
 options 	FFCLOCK
 
 
 #####################################################################
 # SCSI DEVICES
 
 # SCSI DEVICE CONFIGURATION
 
 # The SCSI subsystem consists of the `base' SCSI code, a number of
 # high-level SCSI device `type' drivers, and the low-level host-adapter
 # device drivers.  The host adapters are listed in the ISA and PCI
 # device configuration sections below.
 #
 # It is possible to wire down your SCSI devices so that a given bus,
 # target, and LUN always come on line as the same device unit.  In
 # earlier versions the unit numbers were assigned in the order that
 # the devices were probed on the SCSI bus.  This means that if you
 # removed a disk drive, you may have had to rewrite your /etc/fstab
 # file, and also that you had to be careful when adding a new disk
 # as it may have been probed earlier and moved your device configuration
 # around.  (See also option GEOM_VOL for a different solution to this
 # problem.)
 
 # This old behavior is maintained as the default behavior.  The unit
 # assignment begins with the first non-wired down unit for a device
 # type.  For example, if you wire a disk as "da3" then the first
 # non-wired disk will be assigned da4.
 
 # The syntax for wiring down devices is:
 
 hint.scbus.0.at="ahc0"
 hint.scbus.1.at="ahc1"
 hint.scbus.1.bus="0"
 hint.scbus.3.at="ahc2"
 hint.scbus.3.bus="0"
 hint.scbus.2.at="ahc2"
 hint.scbus.2.bus="1"
 hint.da.0.at="scbus0"
 hint.da.0.target="0"
 hint.da.0.unit="0"
 hint.da.1.at="scbus3"
 hint.da.1.target="1"
 hint.da.2.at="scbus2"
 hint.da.2.target="3"
 hint.sa.1.at="scbus1"
 hint.sa.1.target="6"
 
 # "units" (SCSI logical unit number) that are not specified are
 # treated as if specified as LUN 0.
 
 # All SCSI devices allocate as many units as are required.
 
 # The ch driver drives SCSI Media Changer ("jukebox") devices.
 #
 # The da driver drives SCSI Direct Access ("disk") and Optical Media
 # ("WORM") devices.
 #
 # The sa driver drives SCSI Sequential Access ("tape") devices.
 #
 # The cd driver drives SCSI Read Only Direct Access ("cd") devices.
 #
 # The ses driver drives SCSI Environment Services ("ses") and
 # SAF-TE ("SCSI Accessible Fault-Tolerant Enclosure") devices.
 #
 # The pt driver drives SCSI Processor devices.
 #
 # The sg driver provides a passthrough API that is compatible with the
 # Linux SG driver.  It will work in conjunction with the COMPAT_LINUX
 # option to run linux SG apps.  It can also stand on its own and provide
 # source level API compatibility for porting apps to FreeBSD.
 #
 # Target Mode support is provided here but also requires that a SIM
 # (SCSI Host Adapter Driver) provide support as well.
 #
 # The targ driver provides target mode support as a Processor type device.
 # It exists to give the minimal context necessary to respond to Inquiry
 # commands. There is a sample user application that shows how the rest
 # of the command support might be done in /usr/share/examples/scsi_target.
 #
 # The targbh driver provides target mode support and exists to respond
 # to incoming commands that do not otherwise have a logical unit assigned
 # to them.
 #
 # The pass driver provides a passthrough API to access the CAM subsystem.
 
 device		scbus		#base SCSI code
 device		ch		#SCSI media changers
 device		da		#SCSI direct access devices (aka disks)
 device		sa		#SCSI tapes
 device		cd		#SCSI CD-ROMs
 device		ses		#Enclosure Services (SES and SAF-TE)
 device		pt		#SCSI processor
 device		targ		#SCSI Target Mode Code
 device		targbh		#SCSI Target Mode Blackhole Device
 device		pass		#CAM passthrough driver
 device		sg		#Linux SCSI passthrough
 device		ctl		#CAM Target Layer
 
 # CAM OPTIONS:
 # debugging options:
 # CAMDEBUG		Compile in all possible debugging.
 # CAM_DEBUG_COMPILE	Debug levels to compile in.
 # CAM_DEBUG_FLAGS	Debug levels to enable on boot.
 # CAM_DEBUG_BUS		Limit debugging to the given bus.
 # CAM_DEBUG_TARGET	Limit debugging to the given target.
 # CAM_DEBUG_LUN		Limit debugging to the given lun.
 # CAM_DEBUG_DELAY	Delay in us after printing each debug line.
 #
 # CAM_MAX_HIGHPOWER: Maximum number of concurrent high power (start unit) cmds
 # SCSI_NO_SENSE_STRINGS: When defined disables sense descriptions
 # SCSI_NO_OP_STRINGS: When defined disables opcode descriptions
 # SCSI_DELAY: The number of MILLISECONDS to freeze the SIM (scsi adapter)
 #             queue after a bus reset, and the number of milliseconds to
 #             freeze the device queue after a bus device reset.  This
 #             can be changed at boot and runtime with the
 #             kern.cam.scsi_delay tunable/sysctl.
 options 	CAMDEBUG
 options 	CAM_DEBUG_COMPILE=-1
 options 	CAM_DEBUG_FLAGS=(CAM_DEBUG_INFO|CAM_DEBUG_PROBE|CAM_DEBUG_PERIPH)
 options 	CAM_DEBUG_BUS=-1
 options 	CAM_DEBUG_TARGET=-1
 options 	CAM_DEBUG_LUN=-1
 options 	CAM_DEBUG_DELAY=1
 options 	CAM_MAX_HIGHPOWER=4
 options 	SCSI_NO_SENSE_STRINGS
 options 	SCSI_NO_OP_STRINGS
 options 	SCSI_DELAY=5000	# Be pessimistic about Joe SCSI device
 options 	CAM_IOSCHED_DYNAMIC
 options		CAM_TEST_FAILURE
 
 # Options for the CAM CDROM driver:
 # CHANGER_MIN_BUSY_SECONDS: Guaranteed minimum time quantum for a changer LUN
 # CHANGER_MAX_BUSY_SECONDS: Maximum time quantum per changer LUN, only
 #                           enforced if there is I/O waiting for another LUN
 # The compiled in defaults for these variables are 2 and 10 seconds,
 # respectively.
 #
 # These can also be changed on the fly with the following sysctl variables:
 # kern.cam.cd.changer.min_busy_seconds
 # kern.cam.cd.changer.max_busy_seconds
 #
 options 	CHANGER_MIN_BUSY_SECONDS=2
 options 	CHANGER_MAX_BUSY_SECONDS=10
 
 # Options for the CAM sequential access driver:
 # SA_IO_TIMEOUT: Timeout for read/write/wfm  operations, in minutes
 # SA_SPACE_TIMEOUT: Timeout for space operations, in minutes
 # SA_REWIND_TIMEOUT: Timeout for rewind operations, in minutes
 # SA_ERASE_TIMEOUT: Timeout for erase operations, in minutes
 # SA_1FM_AT_EOD: Default to model which only has a default one filemark at EOT.
 options 	SA_IO_TIMEOUT=4
 options 	SA_SPACE_TIMEOUT=60
 options 	SA_REWIND_TIMEOUT=(2*60)
 options 	SA_ERASE_TIMEOUT=(4*60)
 options 	SA_1FM_AT_EOD
 
 # Optional timeout for the CAM processor target (pt) device
 # This is specified in seconds.  The default is 60 seconds.
 options 	SCSI_PT_DEFAULT_TIMEOUT=60
 
 # Optional enable of doing SES passthrough on other devices (e.g., disks)
 #
 # Normally disabled because a lot of newer SCSI disks report themselves
 # as having SES capabilities, but this can then clot up attempts to build
 # a topology with the SES device that's on the box these drives are in....
 options 	SES_ENABLE_PASSTHROUGH
 
 
 #####################################################################
 # MISCELLANEOUS DEVICES AND OPTIONS
 
 device		pty		#BSD-style compatibility pseudo ttys
 device		nmdm		#back-to-back tty devices
 device		md		#Memory/malloc disk
 device		snp		#Snoop device - to look at pty/vty/etc..
 device		ccd		#Concatenated disk driver
 device		firmware	#firmware(9) support
 
 # Kernel side iconv library
 options 	LIBICONV
 
 # Size of the kernel message buffer.  Should be N * pagesize.
 options 	MSGBUF_SIZE=40960
 
 
 #####################################################################
 # HARDWARE BUS CONFIGURATION
 
 #
 # PCI bus & PCI options:
 #
 device		pci
 options 	PCI_HP			# PCI-Express native HotPlug
 options 	PCI_IOV			# PCI SR-IOV support
 
 
 #####################################################################
 # HARDWARE DEVICE CONFIGURATION
 
 # For ISA the required hints are listed.
 # PCI, CardBus, SD/MMC and pccard are self identifying buses, so
 # no hints are needed.
 
 #
 # Mandatory devices:
 #
 
 # These options are valid for other keyboard drivers as well.
 options 	KBD_DISABLE_KEYMAP_LOAD	# refuse to load a keymap
 options 	KBD_INSTALL_CDEV	# install a CDEV entry in /dev
 
 device		kbdmux			# keyboard multiplexer
 options		KBDMUX_DFLT_KEYMAP	# specify the built-in keymap
 makeoptions	KBDMUX_DFLT_KEYMAP=it.iso
 
 options 	FB_DEBUG		# Frame buffer debugging
 
 device		splash			# Splash screen and screen saver support
 
 # Various screen savers.
 device		blank_saver
 device		daemon_saver
 device		dragon_saver
 device		fade_saver
 device		fire_saver
 device		green_saver
 device		logo_saver
 device		rain_saver
 device		snake_saver
 device		star_saver
 device		warp_saver
 
 # The syscons console driver (SCO color console compatible).
 device		sc
 hint.sc.0.at="isa"
 options 	MAXCONS=16		# number of virtual consoles
 options 	SC_ALT_MOUSE_IMAGE	# simplified mouse cursor in text mode
 options 	SC_DFLT_FONT		# compile font in
 makeoptions	SC_DFLT_FONT=cp850
 options 	SC_DFLT_TERM=\"sc\"	# default terminal emulator
 options 	SC_DISABLE_KDBKEY	# disable `debug' key
 options 	SC_DISABLE_REBOOT	# disable reboot key sequence
 options 	SC_HISTORY_SIZE=200	# number of history buffer lines
 options 	SC_MOUSE_CHAR=0x3	# char code for text mode mouse cursor
 options 	SC_PIXEL_MODE		# add support for the raster text mode
 
 # The following options will let you change the default colors of syscons.
 options 	SC_NORM_ATTR=(FG_GREEN|BG_BLACK)
 options 	SC_NORM_REV_ATTR=(FG_YELLOW|BG_GREEN)
 options 	SC_KERNEL_CONS_ATTR=(FG_RED|BG_BLACK)
 options 	SC_KERNEL_CONS_ATTRS=\"\x0c\x0d\x0e\x0f\x02\x09\x0a\x0b\"
 options 	SC_KERNEL_CONS_REV_ATTR=(FG_BLACK|BG_RED)
 
 # The following options will let you change the default behavior of
 # cut-n-paste feature
 options 	SC_CUT_SPACES2TABS	# convert leading spaces into tabs
 options 	SC_CUT_SEPCHARS=\"x09\"	# set of characters that delimit words
 					# (default is single space - \"x20\")
 
 # If you have a two button mouse, you may want to add the following option
 # to use the right button of the mouse to paste text.
 options 	SC_TWOBUTTON_MOUSE
 
 # You can selectively disable features in syscons.
 options 	SC_NO_CUTPASTE
 options 	SC_NO_FONT_LOADING
 options 	SC_NO_HISTORY
 options 	SC_NO_MODE_CHANGE
 options 	SC_NO_SYSMOUSE
 options 	SC_NO_SUSPEND_VTYSWITCH
 #!options 	SC_NO_TERM_DUMB
 #!options 	SC_NO_TERM_SC
 #!options 	SC_NO_TERM_SCTEKEN
 
 # `flags' for sc
 #	0x80	Put the video card in the VESA 800x600 dots, 16 color mode
 #	0x100	Probe for a keyboard device periodically if one is not present
 
 # Enable experimental features of the syscons terminal emulator (teken).
 options 	TEKEN_CONS25		# cons25-style terminal emulation
 options 	TEKEN_UTF8		# UTF-8 output handling
 
 # The vt video console driver.
 device		vt
 options		VT_ALT_TO_ESC_HACK=1	# Prepend ESC sequence to ALT keys
 options		VT_MAXWINDOWS=16	# Number of virtual consoles
 options		VT_TWOBUTTON_MOUSE	# Use right mouse button to paste
 
 # The following options set the default framebuffer size.
 options		VT_FB_DEFAULT_HEIGHT=480
 options		VT_FB_DEFAULT_WIDTH=640
 
 # The following options will let you change the default vt terminal colors.
 options		TERMINAL_NORM_ATTR=(FG_GREEN|BG_BLACK)
 options		TERMINAL_KERN_ATTR=(FG_LIGHTRED|BG_BLACK)
 
 #
 # Optional devices:
 #
 
 #
 # SCSI host adapters:
 #
 # ahc: Adaptec 274x/284x/2910/293x/294x/394x/3950x/3960x/398X/4944/
 #      19160x/29160x, aic7770/aic78xx
 # ahd: Adaptec 29320/39320 Controllers.
 # esp: Emulex ESP, NCR 53C9x and QLogic FAS families based controllers
 #      including the AMD Am53C974 (found on devices such as the Tekram
 #      DC-390(T)) and the Sun ESP and FAS families of controllers
 # isp: Qlogic ISP 1020, 1040 and 1040B PCI SCSI host adapters,
 #      ISP 1240 Dual Ultra SCSI, ISP 1080 and 1280 (Dual) Ultra2,
 #      ISP 12160 Ultra3 SCSI,
 #      Qlogic ISP 2100 and ISP 2200 1Gb Fibre Channel host adapters.
 #      Qlogic ISP 2300 and ISP 2312 2Gb Fibre Channel host adapters.
 #      Qlogic ISP 2322 and ISP 6322 2Gb Fibre Channel host adapters.
 # ispfw: Firmware module for Qlogic host adapters
 # mpt: LSI-Logic MPT/Fusion 53c1020 or 53c1030 Ultra4
 #      or FC9x9 Fibre Channel host adapters.
 # sym: Symbios/Logic 53C8XX family of PCI-SCSI I/O processors:
 #      53C810, 53C810A, 53C815, 53C825,  53C825A, 53C860, 53C875,
 #      53C876, 53C885,  53C895, 53C895A, 53C896,  53C897, 53C1510D,
 #      53C1010-33, 53C1010-66.
 # trm: Tekram DC395U/UW/F DC315U adapters.
 
 device		ahc
 device		ahd
 device		esp
 device		iscsi_initiator
 device		isp
 hint.isp.0.disable="1"
 hint.isp.0.role="3"
 hint.isp.0.prefer_iomap="1"
 hint.isp.0.prefer_memmap="1"
 hint.isp.0.fwload_disable="1"
 hint.isp.0.ignore_nvram="1"
 hint.isp.0.fullduplex="1"
 hint.isp.0.topology="lport"
 hint.isp.0.topology="nport"
 hint.isp.0.topology="lport-only"
 hint.isp.0.topology="nport-only"
 # we can't get u_int64_t types, nor can we get strings if it's got
 # a leading 0x, hence this silly dodge.
 hint.isp.0.portwnn="w50000000aaaa0000"
 hint.isp.0.nodewnn="w50000000aaaa0001"
 device		ispfw
 device		mpt
 device		sym
 device		trm
 
 # The aic7xxx driver will attempt to use memory mapped I/O for all PCI
 # controllers that have it configured only if this option is set. Unfortunately,
 # this doesn't work on some motherboards, which prevents it from being the
 # default.
 options 	AHC_ALLOW_MEMIO
 
 # Dump the contents of the ahc controller configuration PROM.
 options 	AHC_DUMP_EEPROM
 
 # Bitmap of units to enable targetmode operations.
 options 	AHC_TMODE_ENABLE
 
 # Compile in Aic7xxx Debugging code.
 options 	AHC_DEBUG
 
 # Aic7xxx driver debugging options. See sys/dev/aic7xxx/aic7xxx.h
 options 	AHC_DEBUG_OPTS
 
 # Print register bitfields in debug output.  Adds ~128k to driver
 # See ahc(4).
 options 	AHC_REG_PRETTY_PRINT
 
 # Compile in aic79xx debugging code.
 options 	AHD_DEBUG
 
 # Aic79xx driver debugging options.  Adds ~215k to driver.  See ahd(4).
 options 	AHD_DEBUG_OPTS=0xFFFFFFFF
 
 # Print human-readable register definitions when debugging
 options 	AHD_REG_PRETTY_PRINT
 
 # Bitmap of units to enable targetmode operations.
 options 	AHD_TMODE_ENABLE
 
 # Options used in dev/iscsi (Software iSCSI stack)
 #
 options 	ISCSI_INITIATOR_DEBUG=9
 
 # Options used in dev/isp/ (Qlogic SCSI/FC driver).
 #
 #	ISP_TARGET_MODE		-	enable target mode operation
 #
 options 	ISP_TARGET_MODE=1
 #
 #	ISP_DEFAULT_ROLES	-	default role
 #		none=0
 #		target=1
 #		initiator=2
 #		both=3			(not supported currently)
 #
 #	ISP_INTERNAL_TARGET		(trivial internal disk target, for testing)
 #
 options 	ISP_DEFAULT_ROLES=0
 
 #options 	SYM_SETUP_SCSI_DIFF	#-HVD support for 825a, 875, 885
 					# disabled:0 (default), enabled:1
 #options 	SYM_SETUP_PCI_PARITY	#-PCI parity checking
 					# disabled:0, enabled:1 (default)
 #options 	SYM_SETUP_MAX_LUN	#-Number of LUNs supported
 					# default:8, range:[1..64]
 
 #
 # Compaq "CISS" RAID controllers (SmartRAID 5* series)
 # These controllers have a SCSI-like interface, and require the
 # CAM infrastructure.
 #
 device		ciss
 
 #
 # Intel Integrated RAID controllers.
 # This driver was developed and is maintained by Intel.  Contacts
 # at Intel for this driver are
 # "Kannanthanam, Boji T" <boji.t.kannanthanam@intel.com> and
 # "Leubner, Achim" <achim.leubner@intel.com>.
 #
 device		iir
 
 #
 # Mylex AcceleRAID and eXtremeRAID controllers with v6 and later
 # firmware.  These controllers have a SCSI-like interface, and require
 # the CAM infrastructure.
 #
 device		mly
 
 #
 # Compaq Smart RAID, Mylex DAC960 and AMI MegaRAID controllers.  Only
 # one entry is needed; the code will find and configure all supported
 # controllers.
 #
 device		ida		# Compaq Smart RAID
 device		mlx		# Mylex DAC960
 device		amr		# AMI MegaRAID
 device		amrp		# SCSI Passthrough interface (optional, CAM req.)
 device		mfi		# LSI MegaRAID SAS
 device		mfip		# LSI MegaRAID SAS passthrough, requires CAM
 options 	MFI_DEBUG
 device		mrsas		# LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s
 
 #
 # 3ware ATA RAID
 #
 device		twe		# 3ware ATA RAID
 
 #
 # Serial ATA host controllers:
 #
 # ahci: Advanced Host Controller Interface (AHCI) compatible
 # mvs:  Marvell 88SX50XX/88SX60XX/88SX70XX/SoC controllers
 # siis: SiliconImage SiI3124/SiI3132/SiI3531 controllers
 #
 # These drivers are part of cam(4) subsystem. They supersede less featured
 # ata(4) subsystem drivers, supporting same hardware.
 
 device		ahci
 device		mvs
 device		siis
 
 #
 # The 'ATA' driver supports all legacy ATA/ATAPI controllers, including
 # PC Card devices. You only need one "device ata" for it to find all
 # PCI and PC Card ATA/ATAPI devices on modern machines.
 # Alternatively, individual bus and chipset drivers may be chosen by using
 # the 'atacore' driver then selecting the drivers on a per vendor basis.
 # For example to build a system which only supports a VIA chipset,
 # omit 'ata' and include the 'atacore', 'atapci' and 'atavia' drivers.
 device		ata
 
 # Modular ATA
 #device		atacore		# Core ATA functionality
 #device		atapccard	# CARDBUS support
 #device		ataisa		# ISA bus support
 #device		atapci		# PCI bus support; only generic chipset support
 
 # PCI ATA chipsets
 #device		ataacard	# ACARD
 #device		ataacerlabs	# Acer Labs Inc. (ALI)
 #device		ataamd		# American Micro Devices (AMD)
 #device		ataati		# ATI
 #device		atacenatek	# Cenatek
 #device		atacypress	# Cypress
 #device		atacyrix	# Cyrix
 #device		atahighpoint	# HighPoint
 #device		ataintel	# Intel
 #device		ataite		# Integrated Technology Inc. (ITE)
 #device		atajmicron	# JMicron
 #device		atamarvell	# Marvell
 #device		atamicron	# Micron
 #device		atanational	# National
 #device		atanetcell	# NetCell
 #device		atanvidia	# nVidia
 #device		atapromise	# Promise
 #device		ataserverworks	# ServerWorks
 #device		atasiliconimage	# Silicon Image Inc. (SiI) (formerly CMD)
 #device		atasis		# Silicon Integrated Systems Corp.(SiS)
 #device		atavia		# VIA Technologies Inc.
 
 #
 # For older non-PCI, non-PnPBIOS systems, these are the hints lines to add:
 hint.ata.0.at="isa"
 hint.ata.0.port="0x1f0"
 hint.ata.0.irq="14"
 hint.ata.1.at="isa"
 hint.ata.1.port="0x170"
 hint.ata.1.irq="15"
 
 #
 # Standard floppy disk controllers and floppy tapes, supports
 # the Y-E DATA External FDD (PC Card)
 #
 device		fdc
 hint.fdc.0.at="isa"
 hint.fdc.0.port="0x3F0"
 hint.fdc.0.irq="6"
 hint.fdc.0.drq="2"
 #
 # FDC_DEBUG enables floppy debugging.  Since the debug output is huge, you
 # gotta turn it actually on by setting the variable fd_debug with DDB,
 # however.
 options 	FDC_DEBUG
 #
 # Activate this line if you happen to have an Insight floppy tape.
 # Probing them proved to be dangerous for people with floppy disks only,
 # so it's "hidden" behind a flag:
 #hint.fdc.0.flags="1"
 
 # Specify floppy devices
 hint.fd.0.at="fdc0"
 hint.fd.0.drive="0"
 hint.fd.1.at="fdc0"
 hint.fd.1.drive="1"
 
 #
 # uart: newbusified driver for serial interfaces.  It consolidates the sio(4),
 #	sab(4) and zs(4) drivers.
 #
 device		uart
 
 # Options for uart(4)
 options 	UART_PPS_ON_CTS		# Do time pulse capturing using CTS
 					# instead of DCD.
 options 	UART_POLL_FREQ		# Set polling rate, used when hw has
 					# no interrupt support (50 Hz default).
 
 # The following hint should only be used for pure ISA devices.  It is not
 # needed otherwise.  Use of hints is strongly discouraged.
 hint.uart.0.at="isa"
 
 # The following 3 hints are used when the UART is a system device (i.e., a
 # console or debug port), but only on platforms that don't have any other
 # means to pass the information to the kernel.  The unit number of the hint
 # is only used to bundle the hints together.  There is no relation to the
 # unit number of the probed UART.
 hint.uart.0.port="0x3f8"
 hint.uart.0.flags="0x10"
 hint.uart.0.baud="115200"
 
 # `flags' for serial drivers that support consoles like sio(4) and uart(4):
 #	0x10	enable console support for this unit.  Other console flags
 #		(if applicable) are ignored unless this is set.  Enabling
 #		console support does not make the unit the preferred console.
 #		Boot with -h or set boot_serial=YES in the loader.  For sio(4)
 #		specifically, the 0x20 flag can also be set (see above).
 #		Currently, at most one unit can have console support; the
 #		first one (in config file order) with this flag set is
 #		preferred.  Setting this flag for sio0 gives the old behavior.
 #	0x80	use this port for serial line gdb support in ddb.  Also known
 #		as debug port.
 #
 
 # Options for serial drivers that support consoles:
 options 	BREAK_TO_DEBUGGER	# A BREAK/DBG on the console goes to
 					# ddb, if available.
 
 # Solaris implements a new BREAK which is initiated by a character
 # sequence CR ~ ^b which is similar to a familiar pattern used on
 # Sun servers by the Remote Console.  There are FreeBSD extensions:
 # CR ~ ^p requests force panic and CR ~ ^r requests a clean reboot.
 options 	ALT_BREAK_TO_DEBUGGER
 
 # Serial Communications Controller
 # Supports the Siemens SAB 82532 and Zilog Z8530 multi-channel
 # communications controllers.
 device		scc
 
 # PCI Universal Communications driver
 # Supports various multi port PCI I/O cards.
 device		puc
 
 #
 # Network interfaces:
 #
 # MII bus support is required for many PCI Ethernet NICs,
 # namely those which use MII-compliant transceivers or implement
 # transceiver control interfaces that operate like an MII.  Adding
 # "device miibus" to the kernel config pulls in support for the generic
 # miibus API, the common support for for bit-bang'ing the MII and all
 # of the PHY drivers, including a generic one for PHYs that aren't
 # specifically handled by an individual driver.  Support for specific
 # PHYs may be built by adding "device mii", "device mii_bitbang" if
 # needed by the NIC driver and then adding the appropriate PHY driver.
 device  	mii		# Minimal MII support
 device  	mii_bitbang	# Common module for bit-bang'ing the MII
 device  	miibus		# MII support w/ bit-bang'ing and all PHYs
 
 device  	acphy		# Altima Communications AC101
 device  	amphy		# AMD AM79c873 / Davicom DM910{1,2}
 device  	atphy		# Attansic/Atheros F1
 device  	axphy		# Asix Semiconductor AX88x9x
 device  	bmtphy		# Broadcom BCM5201/BCM5202 and 3Com 3c905C
 device		bnxt		# Broadcom NetXtreme-C/NetXtreme-E
 device  	brgphy		# Broadcom BCM54xx/57xx 1000baseTX
 device  	ciphy		# Cicada/Vitesse CS/VSC8xxx
 device  	e1000phy	# Marvell 88E1000 1000/100/10-BT
 device  	gentbi		# Generic 10-bit 1000BASE-{LX,SX} fiber ifaces
 device  	icsphy		# ICS ICS1889-1893
 device  	ip1000phy	# IC Plus IP1000A/IP1001
 device  	jmphy		# JMicron JMP211/JMP202
 device  	lxtphy		# Level One LXT-970
 device  	mlphy		# Micro Linear 6692
 device  	nsgphy		# NatSemi DP8361/DP83865/DP83891
 device  	nsphy		# NatSemi DP83840A
 device  	nsphyter	# NatSemi DP83843/DP83815
 device  	pnaphy		# HomePNA
 device  	qsphy		# Quality Semiconductor QS6612
 device  	rdcphy		# RDC Semiconductor R6040
 device  	rgephy		# RealTek 8169S/8110S/8211B/8211C
 device  	rlphy		# RealTek 8139
 device  	rlswitch	# RealTek 8305
 device  	smcphy		# SMSC LAN91C111
 device  	tdkphy		# TDK 89Q2120
 device  	tlphy		# Texas Instruments ThunderLAN
 device  	truephy		# LSI TruePHY
 device		xmphy		# XaQti XMAC II
 
 # an:   Aironet 4500/4800 802.11 wireless adapters. Supports the PCMCIA,
 #       PCI and ISA varieties.
 # ae:   Support for gigabit ethernet adapters based on the Attansic/Atheros
 #       L2 PCI-Express FastEthernet controllers.
 # age:  Support for gigabit ethernet adapters based on the Attansic/Atheros
 #       L1 PCI express gigabit ethernet controllers.
 # alc:  Support for Atheros AR8131/AR8132 PCIe ethernet controllers.
 # ale:  Support for Atheros AR8121/AR8113/AR8114 PCIe ethernet controllers.
 # ath:  Atheros a/b/g WiFi adapters (requires ath_hal and wlan)
 # bce:	Broadcom NetXtreme II (BCM5706/BCM5708) PCI/PCIe Gigabit Ethernet
 #       adapters.
 # bfe:	Broadcom BCM4401 Ethernet adapter.
 # bge:	Support for gigabit ethernet adapters based on the Broadcom
 #	BCM570x family of controllers, including the 3Com 3c996-T,
 #	the Netgear GA302T, the SysKonnect SK-9D21 and SK-9D41, and
 #	the embedded gigE NICs on Dell PowerEdge 2550 servers.
 # bnxt:	Broadcom NetXtreme-C and NetXtreme-E PCIe 10/25/50G Ethernet adapters.
 # bxe:	Broadcom NetXtreme II (BCM5771X/BCM578XX) PCIe 10Gb Ethernet
 #       adapters.
 # bwi:	Broadcom BCM430* and BCM431* family of wireless adapters.
 # bwn:	Broadcom BCM43xx family of wireless adapters.
 # cas:	Sun Cassini/Cassini+ and National Semiconductor DP83065 Saturn
 # cxgb: Chelsio T3 based 1GbE/10GbE PCIe Ethernet adapters.
 # cxgbe:Chelsio T4, T5, and T6-based 1/10/25/40/100GbE PCIe Ethernet
 #	adapters.
 # cxgbev: Chelsio T4, T5, and T6-based PCIe Virtual Functions.
 # dc:   Support for PCI fast ethernet adapters based on the DEC/Intel 21143
 #       and various workalikes including:
 #       the ADMtek AL981 Comet and AN985 Centaur, the ASIX Electronics
 #       AX88140A and AX88141, the Davicom DM9100 and DM9102, the Lite-On
 #       82c168 and 82c169 PNIC, the Lite-On/Macronix LC82C115 PNIC II
 #       and the Macronix 98713/98713A/98715/98715A/98725 PMAC. This driver
 #       replaces the old al, ax, dm, pn and mx drivers.  List of brands:
 #       Digital DE500-BA, Kingston KNE100TX, D-Link DFE-570TX, SOHOware SFA110,
 #       SVEC PN102-TX, CNet Pro110B, 120A, and 120B, Compex RL100-TX,
 #       LinkSys LNE100TX, LNE100TX V2.0, Jaton XpressNet, Alfa Inc GFC2204,
 #       KNE110TX.
 # de:   Digital Equipment DC21040
 # em:   Intel Pro/1000 Gigabit Ethernet 82542, 82543, 82544 based adapters.
 # ep:   3Com 3C509, 3C529, 3C556, 3C562D, 3C563D, 3C572, 3C574X, 3C579, 3C589
 #       and PC Card devices using these chipsets.
 # ex:   Intel EtherExpress Pro/10 and other i82595-based adapters,
 #       Olicom Ethernet PC Card devices.
 # fe:   Fujitsu MB86960A/MB86965A Ethernet
 # fxp:  Intel EtherExpress Pro/100B
 #	(hint of prefer_iomap can be done to prefer I/O instead of Mem mapping)
 # gem:  Apple GMAC/Sun ERI/Sun GEM
 # hme:  Sun HME (Happy Meal Ethernet)
 # jme:  JMicron JMC260 Fast Ethernet/JMC250 Gigabit Ethernet based adapters.
 # le:   AMD Am7900 LANCE and Am79C9xx PCnet
 # lge:	Support for PCI gigabit ethernet adapters based on the Level 1
 #	LXT1001 NetCellerator chipset. This includes the D-Link DGE-500SX,
 #	SMC TigerCard 1000 (SMC9462SX), and some Addtron cards.
 # lio:  Support for Cavium 23XX Ethernet adapters
 # malo: Marvell Libertas wireless NICs.
 # mwl:  Marvell 88W8363 802.11n wireless NICs.
 #	Requires the mwl firmware module
 # mwlfw: Marvell 88W8363 firmware
 # msk:	Support for gigabit ethernet adapters based on the Marvell/SysKonnect
 #	Yukon II Gigabit controllers, including 88E8021, 88E8022, 88E8061,
 #	88E8062, 88E8035, 88E8036, 88E8038, 88E8050, 88E8052, 88E8053,
 #	88E8055, 88E8056 and D-Link 560T/550SX.
 # mlx5:	Mellanox ConnectX-4 and ConnectX-4 LX IB and Eth shared code module.
 # mlx5en:Mellanox ConnectX-4 and ConnectX-4 LX PCIe Ethernet adapters.
 # my:	Myson Fast Ethernet (MTD80X, MTD89X)
 # nge:	Support for PCI gigabit ethernet adapters based on the National
 #	Semiconductor DP83820 and DP83821 chipset. This includes the
 #	SMC EZ Card 1000 (SMC9462TX), D-Link DGE-500T, Asante FriendlyNet
 #	GigaNIX 1000TA and 1000TPC, the Addtron AEG320T, the Surecom
 #	EP-320G-TX and the Netgear GA622T.
 # oce:	Emulex 10 Gbit adapters (OneConnect Ethernet)
 # pcn:	Support for PCI fast ethernet adapters based on the AMD Am79c97x
 #	PCnet-FAST, PCnet-FAST+, PCnet-FAST III, PCnet-PRO and PCnet-Home
 #	chipsets. These can also be handled by the le(4) driver if the
 #	pcn(4) driver is left out of the kernel. The le(4) driver does not
 #	support the additional features like the MII bus and burst mode of
 #	the PCnet-FAST and greater chipsets though.
 # ral:	Ralink Technology IEEE 802.11 wireless adapter
 # re:   RealTek 8139C+/8169/816xS/811xS/8101E PCI/PCIe Ethernet adapter
 # rl:   Support for PCI fast ethernet adapters based on the RealTek 8129/8139
 #       chipset.  Note that the RealTek driver defaults to using programmed
 #       I/O to do register accesses because memory mapped mode seems to cause
 #       severe lockups on SMP hardware.  This driver also supports the
 #       Accton EN1207D `Cheetah' adapter, which uses a chip called
 #       the MPX 5030/5038, which is either a RealTek in disguise or a
 #       RealTek workalike.  Note that the D-Link DFE-530TX+ uses the RealTek
 #       chipset and is supported by this driver, not the 'vr' driver.
 # rtwn: RealTek wireless adapters.
 # rtwnfw: RealTek wireless firmware.
 # sf:   Support for Adaptec Duralink PCI fast ethernet adapters based on the
 #       Adaptec AIC-6915 "starfire" controller.
 #       This includes dual and quad port cards, as well as one 100baseFX card.
 #       Most of these are 64-bit PCI devices, except for one single port
 #       card which is 32-bit.
 # sge:  Silicon Integrated Systems SiS190/191 Fast/Gigabit Ethernet adapter
 # sis:  Support for NICs based on the Silicon Integrated Systems SiS 900,
 #       SiS 7016 and NS DP83815 PCI fast ethernet controller chips.
 # sk:   Support for the SysKonnect SK-984x series PCI gigabit ethernet NICs.
 #       This includes the SK-9841 and SK-9842 single port cards (single mode
 #       and multimode fiber) and the SK-9843 and SK-9844 dual port cards
 #       (also single mode and multimode).
 #       The driver will autodetect the number of ports on the card and
 #       attach each one as a separate network interface.
 # sn:   Support for ISA and PC Card Ethernet devices using the
 #       SMC91C90/92/94/95 chips.
 # ste:  Sundance Technologies ST201 PCI fast ethernet controller, includes
 #       the D-Link DFE-550TX.
 # stge: Support for gigabit ethernet adapters based on the Sundance/Tamarack
 #       TC9021 family of controllers, including the Sundance ST2021/ST2023,
 #       the Sundance/Tamarack TC9021, the D-Link DL-4000 and ASUS NX1101.
 # ti:   Support for PCI gigabit ethernet NICs based on the Alteon Networks
 #       Tigon 1 and Tigon 2 chipsets.  This includes the Alteon AceNIC, the
 #       3Com 3c985, the Netgear GA620 and various others.  Note that you will
 #       probably want to bump up kern.ipc.nmbclusters a lot to use this driver.
 # tl:   Support for the Texas Instruments TNETE100 series 'ThunderLAN'
 #       cards and integrated ethernet controllers.  This includes several
 #       Compaq Netelligent 10/100 cards and the built-in ethernet controllers
 #       in several Compaq Prosignia, Proliant and Deskpro systems.  It also
 #       supports several Olicom 10Mbps and 10/100 boards.
 # tx:   SMC 9432 TX, BTX and FTX cards. (SMC EtherPower II series)
 # txp:	Support for 3Com 3cR990 cards with the "Typhoon" chipset
 # vr:   Support for various fast ethernet adapters based on the VIA
 #       Technologies VT3043 `Rhine I' and VT86C100A `Rhine II' chips,
 #       including the D-Link DFE520TX and D-Link DFE530TX (see 'rl' for
 #       DFE530TX+), the Hawking Technologies PN102TX, and the AOpen/Acer ALN-320.
 # vte:  DM&P Vortex86 RDC R6040 Fast Ethernet
 # vx:   3Com 3C590 and 3C595
 # wb:   Support for fast ethernet adapters based on the Winbond W89C840F chip.
 #       Note: this is not the same as the Winbond W89C940F, which is a
 #       NE2000 clone.
 # wi:   Lucent WaveLAN/IEEE 802.11 PCMCIA adapters. Note: this supports both
 #       the PCMCIA and ISA cards: the ISA card is really a PCMCIA to ISA
 #       bridge with a PCMCIA adapter plugged into it.
 # xe:   Xircom/Intel EtherExpress Pro100/16 PC Card ethernet controller,
 #       Accton Fast EtherCard-16, Compaq Netelligent 10/100 PC Card,
 #       Toshiba 10/100 Ethernet PC Card, Xircom 16-bit Ethernet + Modem 56
 # xl:   Support for the 3Com 3c900, 3c905, 3c905B and 3c905C (Fast)
 #       Etherlink XL cards and integrated controllers.  This includes the
 #       integrated 3c905B-TX chips in certain Dell Optiplex and Dell
 #       Precision desktop machines and the integrated 3c905-TX chips
 #       in Dell Latitude laptop docking stations.
 #       Also supported: 3Com 3c980(C)-TX, 3Com 3cSOHO100-TX, 3Com 3c450-TX
 
 # Order for ISA devices is important here
 
 device		ep
 device		ex
 device		fe
 hint.fe.0.at="isa"
 hint.fe.0.port="0x300"
 device		sn
 hint.sn.0.at="isa"
 hint.sn.0.port="0x300"
 hint.sn.0.irq="10"
 device		an
 device		wi
 device		xe
 
 # PCI Ethernet NICs that use the common MII bus controller code.
 device		ae		# Attansic/Atheros L2 FastEthernet
 device		age		# Attansic/Atheros L1 Gigabit Ethernet
 device		alc		# Atheros AR8131/AR8132 Ethernet
 device		ale		# Atheros AR8121/AR8113/AR8114 Ethernet
 device		bce		# Broadcom BCM5706/BCM5708 Gigabit Ethernet
 device		bfe		# Broadcom BCM440x 10/100 Ethernet
 device		bge		# Broadcom BCM570xx Gigabit Ethernet
 device		cas		# Sun Cassini/Cassini+ and NS DP83065 Saturn
 device		dc		# DEC/Intel 21143 and various workalikes
 device		et		# Agere ET1310 10/100/Gigabit Ethernet
 device		fxp		# Intel EtherExpress PRO/100B (82557, 82558)
 hint.fxp.0.prefer_iomap="0"
 device		gem		# Apple GMAC/Sun ERI/Sun GEM
 device		hme		# Sun HME (Happy Meal Ethernet)
 device		jme		# JMicron JMC250 Gigabit/JMC260 Fast Ethernet
 device		lge		# Level 1 LXT1001 gigabit Ethernet
 device		mlx5		# Shared code module between IB and Ethernet
 device		mlx5en		# Mellanox ConnectX-4 and ConnectX-4 LX
 device		msk		# Marvell/SysKonnect Yukon II Gigabit Ethernet
 device		my		# Myson Fast Ethernet (MTD80X, MTD89X)
 device		nge		# NatSemi DP83820 gigabit Ethernet
 device		re		# RealTek 8139C+/8169/8169S/8110S
 device		rl		# RealTek 8129/8139
 device		pcn		# AMD Am79C97x PCI 10/100 NICs
 device		sf		# Adaptec AIC-6915 (``Starfire'')
 device		sge		# Silicon Integrated Systems SiS190/191
 device		sis		# Silicon Integrated Systems SiS 900/SiS 7016
 device		sk		# SysKonnect SK-984x & SK-982x gigabit Ethernet
 device		ste		# Sundance ST201 (D-Link DFE-550TX)
 device		stge		# Sundance/Tamarack TC9021 gigabit Ethernet
 device		tl		# Texas Instruments ThunderLAN
 device		tx		# SMC EtherPower II (83c170 ``EPIC'')
 device		vr		# VIA Rhine, Rhine II
 device		vte		# DM&P Vortex86 RDC R6040 Fast Ethernet
 device		wb		# Winbond W89C840F
 device		xl		# 3Com 3c90x (``Boomerang'', ``Cyclone'')
 
 # PCI/PCI-X/PCIe Ethernet NICs that use iflib infrastructure
 device		iflib
 device		em		# Intel Pro/1000 Gigabit Ethernet
 device		ix		# Intel Pro/10Gbe PCIE Ethernet
 device		ixv		# Intel Pro/10Gbe PCIE Ethernet VF
 
 # PCI Ethernet NICs.
 device		cxgb		# Chelsio T3 10 Gigabit Ethernet
 device		cxgb_t3fw	# Chelsio T3 10 Gigabit Ethernet firmware
 device		cxgbe		# Chelsio T4-T6 1/10/25/40/100 Gigabit Ethernet
 device		cxgbev		# Chelsio T4-T6 Virtual Functions
 device		de		# DEC/Intel DC21x4x (``Tulip'')
 device		le		# AMD Am7900 LANCE and Am79C9xx PCnet
 device		mxge		# Myricom Myri-10G 10GbE NIC
 device		oce		# Emulex 10 GbE (OneConnect Ethernet)
 device		ti		# Alteon Networks Tigon I/II gigabit Ethernet
 device		txp		# 3Com 3cR990 (``Typhoon'')
 device		vx		# 3Com 3c590, 3c595 (``Vortex'')
 
 # PCI IEEE 802.11 Wireless NICs
 device		ath		# Atheros pci/cardbus NIC's
 device		ath_hal		# pci/cardbus chip support
 #device		ath_ar5210	# AR5210 chips
 #device		ath_ar5211	# AR5211 chips
 #device		ath_ar5212	# AR5212 chips
 #device		ath_rf2413
 #device		ath_rf2417
 #device		ath_rf2425
 #device		ath_rf5111
 #device		ath_rf5112
 #device		ath_rf5413
 #device		ath_ar5416	# AR5416 chips
 # All of the AR5212 parts have a problem when paired with the AR71xx
 # CPUS.  These parts have a bug that triggers a fatal bus error on the AR71xx
 # only.  Details of the exact nature of the bug are sketchy, but some can be
 # found at https://forum.openwrt.org/viewtopic.php?pid=70060 on pages 4, 5 and
 # 6.  This option enables this workaround.  There is a performance penalty
 # for this work around, but without it things don't work at all.  The DMA
 # from the card usually bursts 128 bytes, but on the affected CPUs, only
 # 4 are safe.
 options	   	AH_RXCFG_SDMAMW_4BYTES
 #device		ath_ar9160	# AR9160 chips
 #device		ath_ar9280	# AR9280 chips
 #device		ath_ar9285	# AR9285 chips
 device		ath_rate_sample	# SampleRate tx rate control for ath
 device		bwi		# Broadcom BCM430* BCM431*
 device		bwn		# Broadcom BCM43xx
 device		malo		# Marvell Libertas wireless NICs.
 device		mwl		# Marvell 88W8363 802.11n wireless NICs.
 device		mwlfw
 device		ral		# Ralink Technology RT2500 wireless NICs.
 device		rtwn		# Realtek wireless NICs
 device		rtwnfw
 
 # Use sf_buf(9) interface for jumbo buffers on ti(4) controllers.
 #options 	TI_SF_BUF_JUMBO
 # Turn on the header splitting option for the ti(4) driver firmware.  This
 # only works for Tigon II chips, and has no effect for Tigon I chips.
 # This option requires the TI_SF_BUF_JUMBO option above.
 #options 	TI_JUMBO_HDRSPLIT
 
 # These two options allow manipulating the mbuf cluster size and mbuf size,
 # respectively.  Be very careful with NIC driver modules when changing
 # these from their default values, because that can potentially cause a
 # mismatch between the mbuf size assumed by the kernel and the mbuf size
 # assumed by a module.  The only driver that currently has the ability to
 # detect a mismatch is ti(4).
 options 	MCLSHIFT=12	# mbuf cluster shift in bits, 12 == 4KB
 options 	MSIZE=512	# mbuf size in bytes
 
 #
 # Sound drivers
 #
 # sound: The generic sound driver.
 #
 
 device		sound
 
 #
 # snd_*: Device-specific drivers.
 #
 # The flags of the device tell the device a bit more info about the
 # device that normally is obtained through the PnP interface.
 #	bit  2..0   secondary DMA channel;
 #	bit  4      set if the board uses two dma channels;
 #	bit 15..8   board type, overrides autodetection; leave it
 #		    zero if don't know what to put in (and you don't,
 #		    since this is unsupported at the moment...).
 #
 # snd_ad1816:		Analog Devices AD1816 ISA PnP/non-PnP.
 # snd_als4000:		Avance Logic ALS4000 PCI.
 # snd_atiixp:		ATI IXP 200/300/400 PCI.
 # snd_audiocs:		Crystal Semiconductor CS4231 SBus/EBus. Only
 #			for sparc64.
 # snd_cmi:		CMedia CMI8338/CMI8738 PCI.
 # snd_cs4281:		Crystal Semiconductor CS4281 PCI.
 # snd_csa:		Crystal Semiconductor CS461x/428x PCI. (except
 #			4281)
 # snd_ds1:		Yamaha DS-1 PCI.
 # snd_emu10k1:		Creative EMU10K1 PCI and EMU10K2 (Audigy) PCI.
 # snd_emu10kx:		Creative SoundBlaster Live! and Audigy
 # snd_envy24:		VIA Envy24 and compatible, needs snd_spicds.
 # snd_envy24ht:		VIA Envy24HT and compatible, needs snd_spicds.
 # snd_es137x:		Ensoniq AudioPCI ES137x PCI.
 # snd_ess:		Ensoniq ESS ISA PnP/non-PnP, to be used in
 #			conjunction with snd_sbc.
 # snd_fm801:		Forte Media FM801 PCI.
 # snd_gusc:		Gravis UltraSound ISA PnP/non-PnP.
 # snd_hda:		Intel High Definition Audio (Controller) and
 #			compatible.
 # snd_hdspe:		RME HDSPe AIO and RayDAT.
 # snd_ich:		Intel ICH AC'97 and some more audio controllers
 #			embedded in a chipset, for example nVidia
 #			nForce controllers.
 # snd_maestro:		ESS Technology Maestro-1/2x PCI.
 # snd_maestro3:		ESS Technology Maestro-3/Allegro PCI.
 # snd_mss:		Microsoft Sound System ISA PnP/non-PnP.
 # snd_neomagic:		Neomagic 256 AV/ZX PCI.
 # snd_sb16:		Creative SoundBlaster16, to be used in
 #			conjunction with snd_sbc.
 # snd_sb8:		Creative SoundBlaster (pre-16), to be used in
 #			conjunction with snd_sbc.
 # snd_sbc:		Creative SoundBlaster ISA PnP/non-PnP.
 #			Supports ESS and Avance ISA chips as well.
 # snd_solo:		ESS Solo-1x PCI.
 # snd_spicds:		SPI codec driver, needed by Envy24/Envy24HT drivers.
 # snd_t4dwave:		Trident 4DWave DX/NX PCI, Sis 7018 PCI and Acer Labs
 #			M5451 PCI.
 # snd_uaudio:		USB audio.
 # snd_via8233:		VIA VT8233x PCI.
 # snd_via82c686:	VIA VT82C686A PCI.
 # snd_vibes:		S3 Sonicvibes PCI.
 
 device		snd_ad1816
 device		snd_als4000
 device		snd_atiixp
 #device		snd_audiocs
 device		snd_cmi
 device		snd_cs4281
 device		snd_csa
 device		snd_ds1
 device		snd_emu10k1
 device		snd_emu10kx
 device		snd_envy24
 device		snd_envy24ht
 device		snd_es137x
 device		snd_ess
 device		snd_fm801
 device		snd_gusc
 device		snd_hda
 device		snd_hdspe
 device		snd_ich
 device		snd_maestro
 device		snd_maestro3
 device		snd_mss
 device		snd_neomagic
 device		snd_sb16
 device		snd_sb8
 device		snd_sbc
 device		snd_solo
 device		snd_spicds
 device		snd_t4dwave
 device		snd_uaudio
 device		snd_via8233
 device		snd_via82c686
 device		snd_vibes
 
 # For non-PnP sound cards:
 hint.pcm.0.at="isa"
 hint.pcm.0.irq="10"
 hint.pcm.0.drq="1"
 hint.pcm.0.flags="0x0"
 hint.sbc.0.at="isa"
 hint.sbc.0.port="0x220"
 hint.sbc.0.irq="5"
 hint.sbc.0.drq="1"
 hint.sbc.0.flags="0x15"
 hint.gusc.0.at="isa"
 hint.gusc.0.port="0x220"
 hint.gusc.0.irq="5"
 hint.gusc.0.drq="1"
 hint.gusc.0.flags="0x13"
 
 #
 # Following options are intended for debugging/testing purposes:
 #
 # SND_DEBUG                    Enable extra debugging code that includes
 #                              sanity checking and possible increase of
 #                              verbosity.
 #
 # SND_DIAGNOSTIC               Similar in a spirit of INVARIANTS/DIAGNOSTIC,
 #                              zero tolerance against inconsistencies.
 #
 # SND_FEEDER_MULTIFORMAT       By default, only 16/32 bit feeders are compiled
 #                              in. This options enable most feeder converters
 #                              except for 8bit. WARNING: May bloat the kernel.
 #
 # SND_FEEDER_FULL_MULTIFORMAT  Ditto, but includes 8bit feeders as well.
 #
 # SND_FEEDER_RATE_HP           (feeder_rate) High precision 64bit arithmetic
 #                              as much as possible (the default trying to
 #                              avoid it). Possible slowdown.
 #
 # SND_PCM_64                   (Only applicable for i386/32bit arch)
 #                              Process 32bit samples through 64bit
 #                              integer/arithmetic. Slight increase of dynamic
 #                              range at a cost of possible slowdown.
 #
 # SND_OLDSTEREO                Only 2 channels are allowed, effectively
 #                              disabling multichannel processing.
 #
 options		SND_DEBUG
 options		SND_DIAGNOSTIC
 options		SND_FEEDER_MULTIFORMAT
 options		SND_FEEDER_FULL_MULTIFORMAT
 options		SND_FEEDER_RATE_HP
 options		SND_PCM_64
 options		SND_OLDSTEREO
 
 #
 # Miscellaneous hardware:
 #
 # bktr: Brooktree bt848/848a/849a/878/879 video capture and TV Tuner board
 # cmx: OmniKey CardMan 4040 pccard smartcard reader
 
 device		cmx
 
 #
 # The 'bktr' device is a PCI video capture device using the Brooktree
 # bt848/bt848a/bt849a/bt878/bt879 chipset. When used with a TV Tuner it forms a
 # TV card, e.g. Miro PC/TV, Hauppauge WinCast/TV WinTV, VideoLogic Captivator,
 # Intel Smart Video III, AverMedia, IMS Turbo, FlyVideo.
 #
 # options 	OVERRIDE_CARD=xxx
 # options 	OVERRIDE_TUNER=xxx
 # options 	OVERRIDE_MSP=1
 # options 	OVERRIDE_DBX=1
 # These options can be used to override the auto detection
 # The current values for xxx are found in src/sys/dev/bktr/bktr_card.h
 # Using sysctl(8) run-time overrides on a per-card basis can be made
 #
 # options 	BROOKTREE_SYSTEM_DEFAULT=BROOKTREE_PAL
 # or
 # options 	BROOKTREE_SYSTEM_DEFAULT=BROOKTREE_NTSC
 # Specifies the default video capture mode.
 # This is required for Dual Crystal (28&35MHz) boards where PAL is used
 # to prevent hangs during initialization, e.g. VideoLogic Captivator PCI.
 #
 # options 	BKTR_USE_PLL
 # This is required for PAL or SECAM boards with a 28MHz crystal and no 35MHz
 # crystal, e.g. some new Bt878 cards.
 #
 # options 	BKTR_GPIO_ACCESS
 # This enables IOCTLs which give user level access to the GPIO port.
 #
 # options 	BKTR_NO_MSP_RESET
 # Prevents the MSP34xx reset. Good if you initialize the MSP in another OS first
 #
 # options 	BKTR_430_FX_MODE
 # Switch Bt878/879 cards into Intel 430FX chipset compatibility mode.
 #
 # options 	BKTR_SIS_VIA_MODE
 # Switch Bt878/879 cards into SIS/VIA chipset compatibility mode which is
 # needed for some old SiS and VIA chipset motherboards.
 # This also allows Bt878/879 chips to work on old OPTi (<1997) chipset
 # motherboards and motherboards with bad or incomplete PCI 2.1 support.
 # As a rough guess, old = before 1998
 #
 # options 	BKTR_NEW_MSP34XX_DRIVER
 # Use new, more complete initialization scheme for the msp34* soundchip.
 # Should fix stereo autodetection if the old driver does only output
 # mono sound.
 
 #
 # options 	BKTR_USE_FREEBSD_SMBUS
 # Compile with FreeBSD SMBus implementation
 #
 # Brooktree driver has been ported to the new I2C framework. Thus,
 # you'll need to have the following 3 lines in the kernel config.
 #     device smbus
 #     device iicbus
 #     device iicbb
 #     device iicsmb
 # The iic and smb devices are only needed if you want to control other
 # I2C slaves connected to the external connector of some cards.
 #
 device		bktr
  
 #
 # PC Card/PCMCIA and Cardbus
 #
 # cbb: pci/cardbus bridge implementing YENTA interface
 # pccard: pccard slots
 # cardbus: cardbus slots
 device		cbb
 device		pccard
 device		cardbus
 
 #
 # MMC/SD
 #
 # mmc 		MMC/SD bus
 # mmcsd		MMC/SD memory card
 # sdhci		Generic PCI SD Host Controller
 #
 device		mmc
 device		mmcsd
 device		sdhci
 
 #
 # SMB bus
 #
 # System Management Bus support is provided by the 'smbus' device.
 # Access to the SMBus device is via the 'smb' device (/dev/smb*),
 # which is a child of the 'smbus' device.
 #
 # Supported devices:
 # smb		standard I/O through /dev/smb*
 #
 # Supported SMB interfaces:
 # iicsmb	I2C to SMB bridge with any iicbus interface
 # bktr		brooktree848 I2C hardware interface
 # intpm		Intel PIIX4 (82371AB, 82443MX) Power Management Unit
 # alpm		Acer Aladdin-IV/V/Pro2 Power Management Unit
 # ichsmb	Intel ICH SMBus controller chips (82801AA, 82801AB, 82801BA)
 # viapm		VIA VT82C586B/596B/686A and VT8233 Power Management Unit
 # amdpm		AMD 756 Power Management Unit
 # amdsmb	AMD 8111 SMBus 2.0 Controller
 # nfpm		NVIDIA nForce Power Management Unit
 # nfsmb		NVIDIA nForce2/3/4 MCP SMBus 2.0 Controller
 # ismt		Intel SMBus 2.0 controller chips (on Atom S1200, C2000)
 #
 device		smbus		# Bus support, required for smb below.
 
 device		intpm
 device		alpm
 device		ichsmb
 device		viapm
 device		amdpm
 device		amdsmb
 device		nfpm
 device		nfsmb
 device		ismt
 
 device		smb
 
 # SMBus peripheral devices
 #
 # jedec_dimm	Asset and temperature reporting for DDR3 and DDR4 DIMMs
 #
 device		jedec_dimm
 
 # I2C Bus
 #
 # Philips i2c bus support is provided by the `iicbus' device.
 #
 # Supported devices:
 # ic	i2c network interface
 # iic	i2c standard io
 # iicsmb i2c to smb bridge. Allow i2c i/o with smb commands.
 # iicoc simple polling driver for OpenCores I2C controller
 #
 # Supported interfaces:
 # bktr	brooktree848 I2C software interface
 #
 # Other:
 # iicbb	generic I2C bit-banging code (needed by lpbb, bktr)
 #
 device		iicbus		# Bus support, required for ic/iic/iicsmb below.
 device		iicbb
 
 device		ic
 device		iic
 device		iicsmb		# smb over i2c bridge
 device		iicoc		# OpenCores I2C controller support
 
 # I2C peripheral devices
 #
 device		ds1307		# Dallas DS1307 RTC and compatible
 device		ds13rtc		# All Dallas/Maxim ds13xx chips
 device		ds1672		# Dallas DS1672 RTC
 device		ds3231		# Dallas DS3231 RTC + temperature
 device		icee		# AT24Cxxx and compatible EEPROMs
 device		lm75		# LM75 compatible temperature sensor
 device		nxprtc		# NXP RTCs: PCA/PFC212x PCA/PCF85xx
 device		s35390a		# Seiko Instruments S-35390A RTC
 
 # Parallel-Port Bus
 #
 # Parallel port bus support is provided by the `ppbus' device.
 # Multiple devices may be attached to the parallel port, devices
 # are automatically probed and attached when found.
 #
 # Supported devices:
 # vpo	Iomega Zip Drive
 #	Requires SCSI disk support ('scbus' and 'da'), best
 #	performance is achieved with ports in EPP 1.9 mode.
 # lpt	Parallel Printer
 # plip	Parallel network interface
 # ppi	General-purpose I/O ("Geek Port") + IEEE1284 I/O
 # pps	Pulse per second Timing Interface
 # lpbb	Philips official parallel port I2C bit-banging interface
 # pcfclock Parallel port clock driver.
 #
 # Supported interfaces:
 # ppc	ISA-bus parallel port interfaces.
 #
 
 options 	PPC_PROBE_CHIPSET # Enable chipset specific detection
 				  # (see flags in ppc(4))
 options 	DEBUG_1284	# IEEE1284 signaling protocol debug
 options 	PERIPH_1284	# Makes your computer act as an IEEE1284
 				# compliant peripheral
 options 	DONTPROBE_1284	# Avoid boot detection of PnP parallel devices
 options 	VP0_DEBUG	# ZIP/ZIP+ debug
 options 	LPT_DEBUG	# Printer driver debug
 options 	PPC_DEBUG	# Parallel chipset level debug
 options 	PLIP_DEBUG	# Parallel network IP interface debug
 options 	PCFCLOCK_VERBOSE         # Verbose pcfclock driver
 options 	PCFCLOCK_MAX_RETRIES=5   # Maximum read tries (default 10)
 
 device		ppc
 hint.ppc.0.at="isa"
 hint.ppc.0.irq="7"
 device		ppbus
 device		vpo
 device		lpt
 device		plip
 device		ppi
 device		pps
 device		lpbb
 device		pcfclock
 
 #
 # Etherswitch framework and drivers
 #
 # etherswitch	The etherswitch(4) framework
 # miiproxy	Proxy device for miibus(4) functionality
 # 
 # Switch hardware support:
 # arswitch	Atheros switches
 # ip17x 	IC+ 17x family switches
 # rtl8366r	Realtek RTL8366 switches
 # ukswitch	Multi-PHY switches
 #
 device		etherswitch
 device		miiproxy
 device		arswitch
 device		ip17x
 device		rtl8366rb
 device		ukswitch
 
 # Kernel BOOTP support
 
 options 	BOOTP		# Use BOOTP to obtain IP address/hostname
 				# Requires NFSCL and NFS_ROOT
 options 	BOOTP_NFSROOT	# NFS mount root filesystem using BOOTP info
 options 	BOOTP_NFSV3	# Use NFS v3 to NFS mount root
 options 	BOOTP_COMPAT	# Workaround for broken bootp daemons.
 options 	BOOTP_WIRED_TO=fxp0 # Use interface fxp0 for BOOTP
 options 	BOOTP_BLOCKSIZE=8192 # Override NFS block size
 
 #
 # Enable software watchdog routines, even if hardware watchdog is present.
 # By default, software watchdog timer is enabled only if no hardware watchdog
 # is present.
 #
 options 	SW_WATCHDOG
 
 #
 # Add the software deadlock resolver thread.
 #
 options 	DEADLKRES
 
 #
 # Disable swapping of stack pages.  This option removes all
 # code which actually performs swapping, so it's not possible to turn
 # it back on at run-time.
 #
 # This is sometimes usable for systems which don't have any swap space
 # (see also sysctl "vm.disable_swapspace_pageouts")
 #
 #options 	NO_SWAPPING
 
 # Set the number of sf_bufs to allocate. sf_bufs are virtual buffers
 # for sendfile(2) that are used to map file VM pages, and normally
 # default to a quantity that is roughly 16*MAXUSERS+512. You would
 # typically want about 4 of these for each simultaneous file send.
 #
 options 	NSFBUFS=1024
 
 #
 # Enable extra debugging code for locks.  This stores the filename and
 # line of whatever acquired the lock in the lock itself, and changes a
 # number of function calls to pass around the relevant data.  This is
 # not at all useful unless you are debugging lock code.  Note that
 # modules should be recompiled as this option modifies KBI.
 #
 options 	DEBUG_LOCKS
 
 
 #####################################################################
 # USB support
 # UHCI controller
 device		uhci
 # OHCI controller
 device		ohci
 # EHCI controller
 device		ehci
 # XHCI controller
 device		xhci
 # SL811 Controller
 #device		slhci
 # General USB code (mandatory for USB)
 device		usb
 #
 # USB Double Bulk Pipe devices
 device		udbp
 # USB Fm Radio
 device		ufm
 # USB temperature meter
 device		ugold
 # USB LED
 device		uled
 # Human Interface Device (anything with buttons and dials)
 device		uhid
 # USB keyboard
 device		ukbd
 # USB printer
 device		ulpt
 # USB mass storage driver (Requires scbus and da)
 device		umass
 # USB mass storage driver for device-side mode
 device		usfs
 # USB support for Belkin F5U109 and Magic Control Technology serial adapters
 device		umct
 # USB modem support
 device		umodem
 # USB mouse
 device		ums
 # USB touchpad(s)
 device		atp
 device		wsp
 # eGalax USB touch screen
 device		uep
 # Diamond Rio 500 MP3 player
 device		urio
 #
 # USB serial support
 device		ucom
 # USB support for 3G modem cards by Option, Novatel, Huawei and Sierra
 device		u3g
 # USB support for Technologies ARK3116 based serial adapters
 device		uark
 # USB support for Belkin F5U103 and compatible serial adapters
 device		ubsa
 # USB support for serial adapters based on the FT8U100AX and FT8U232AM
 device		uftdi
 # USB support for some Windows CE based serial communication.
 device		uipaq
 # USB support for Prolific PL-2303 serial adapters
 device		uplcom
 # USB support for Silicon Laboratories CP2101/CP2102 based USB serial adapters
 device		uslcom
 # USB Visor and Palm devices
 device		uvisor
 # USB serial support for DDI pocket's PHS
 device		uvscom
 #
 # USB ethernet support
 device		uether
 # ADMtek USB ethernet. Supports the LinkSys USB100TX,
 # the Billionton USB100, the Melco LU-ATX, the D-Link DSB-650TX
 # and the SMC 2202USB. Also works with the ADMtek AN986 Pegasus
 # eval board.
 device		aue
 
 # ASIX Electronics AX88172 USB 2.0 ethernet driver. Used in the
 # LinkSys USB200M and various other adapters.
 device		axe
 # ASIX Electronics AX88178A/AX88179 USB 2.0/3.0 gigabit ethernet driver.
 device		axge
 
 #
 # Devices which communicate using Ethernet over USB, particularly
 # Communication Device Class (CDC) Ethernet specification. Supports
 # Sharp Zaurus PDAs, some DOCSIS cable modems and so on.
 device		cdce
 #
 # CATC USB-EL1201A USB ethernet. Supports the CATC Netmate
 # and Netmate II, and the Belkin F5U111.
 device		cue
 #
 # Kawasaki LSI ethernet. Supports the LinkSys USB10T,
 # Entrega USB-NET-E45, Peracom Ethernet Adapter, the
 # 3Com 3c19250, the ADS Technologies USB-10BT, the ATen UC10T,
 # the Netgear EA101, the D-Link DSB-650, the SMC 2102USB
 # and 2104USB, and the Corega USB-T.
 device		kue
 #
 # RealTek RTL8150 USB to fast ethernet. Supports the Melco LUA-KTX
 # and the GREEN HOUSE GH-USB100B.
 device		rue
 #
 # Davicom DM9601E USB to fast ethernet. Supports the Corega FEther USB-TXC.
 device		udav
 #
 # RealTek RTL8152/RTL8153 USB Ethernet driver
 device		ure
 #
 # Moschip MCS7730/MCS7840 USB to fast ethernet. Supports the Sitecom LN030.
 device		mos
 #
 # HSxPA devices from Option N.V
 device		uhso
 
 # Realtek RTL8188SU/RTL8191SU/RTL8192SU wireless driver
 device		rsu
 #
 # Ralink Technology RT2501USB/RT2601USB wireless driver
 device		rum
 # Ralink Technology RT2700U/RT2800U/RT3000U wireless driver
 device		run
 #
 # Atheros AR5523 wireless driver
 device		uath
 #
 # Conexant/Intersil PrismGT wireless driver
 device		upgt
 #
 # Ralink Technology RT2500USB wireless driver
 device		ural
 #
 # RNDIS USB ethernet driver
 device		urndis
 # Realtek RTL8187B/L wireless driver
 device		urtw
 #
 # ZyDas ZD1211/ZD1211B wireless driver
 device		zyd
 #
 # Sierra USB wireless driver
 device		usie
 
 # 
 # debugging options for the USB subsystem
 #
 options 	USB_DEBUG
 options 	U3G_DEBUG
 
 # options for ukbd:
 options 	UKBD_DFLT_KEYMAP	# specify the built-in keymap
 makeoptions	UKBD_DFLT_KEYMAP=jp.106
 
 # options for uplcom:
 options 	UPLCOM_INTR_INTERVAL=100	# interrupt pipe interval
 						# in milliseconds
 
 # options for uvscom:
 options 	UVSCOM_DEFAULT_OPKTSIZE=8	# default output packet size
 options 	UVSCOM_INTR_INTERVAL=100	# interrupt pipe interval
 						# in milliseconds
 
 #####################################################################
 # FireWire support
 
 device		firewire	# FireWire bus code
 device		sbp		# SCSI over Firewire (Requires scbus and da)
 device		sbp_targ	# SBP-2 Target mode  (Requires scbus and targ)
 device		fwe		# Ethernet over FireWire (non-standard!)
 device		fwip		# IP over FireWire (RFC2734 and RFC3146)
 
 #####################################################################
 # dcons support (Dumb Console Device)
 
 device		dcons			# dumb console driver
 device		dcons_crom		# FireWire attachment
 options 	DCONS_BUF_SIZE=16384	# buffer size
 options 	DCONS_POLL_HZ=100	# polling rate
 options 	DCONS_FORCE_CONSOLE=0	# force to be the primary console
 options 	DCONS_FORCE_GDB=1	# force to be the gdb device
 
 #####################################################################
 # crypto subsystem
 #
 # This is a port of the OpenBSD crypto framework.  Include this when
 # configuring IPSEC and when you have a h/w crypto device to accelerate
 # user applications that link to OpenSSL.
 #
 # Drivers are ports from OpenBSD with some simple enhancements that have
 # been fed back to OpenBSD.
 
 device		crypto		# core crypto support
 
 # Only install the cryptodev device if you are running tests, or know
 # specifically why you need it.  In most cases, it is not needed and
 # will make things slower.
 device		cryptodev	# /dev/crypto for access to h/w
 
 device		rndtest		# FIPS 140-2 entropy tester
 
 device		ccr		# Chelsio T6
 
 device		hifn		# Hifn 7951, 7781, etc.
 options 	HIFN_DEBUG	# enable debugging support: hw.hifn.debug
 options 	HIFN_RNDTEST	# enable rndtest support
 
 device		ubsec		# Broadcom 5501, 5601, 58xx
 options 	UBSEC_DEBUG	# enable debugging support: hw.ubsec.debug
 options 	UBSEC_RNDTEST	# enable rndtest support
 
 #####################################################################
 
 
 #
 # Embedded system options:
 #
 # An embedded system might want to run something other than init.
 options 	INIT_PATH=/sbin/init:/rescue/init
 
 # Debug options
 options 	BUS_DEBUG	# enable newbus debugging
 options 	DEBUG_VFS_LOCKS	# enable VFS lock debugging
 options 	SOCKBUF_DEBUG	# enable sockbuf last record/mb tail checking
 options 	IFMEDIA_DEBUG	# enable debugging in net/if_media.c
 
 #
 # Verbose SYSINIT
 #
 # Make the SYSINIT process performed by mi_startup() verbose.  This is very
 # useful when porting to a new architecture.  If DDB is also enabled, this
 # will print function names instead of addresses.  If defined with a value
 # of zero, the verbose code is compiled-in but disabled by default, and can
 # be enabled with the debug.verbose_sysinit=1 tunable.
 options 	VERBOSE_SYSINIT
 
 #####################################################################
 # SYSV IPC KERNEL PARAMETERS
 #
 # Maximum number of System V semaphores that can be used on the system at
 # one time.
 options 	SEMMNI=11
 
 # Total number of semaphores system wide
 options 	SEMMNS=61
 
 # Total number of undo structures in system
 options 	SEMMNU=31
 
 # Maximum number of System V semaphores that can be used by a single process
 # at one time.
 options 	SEMMSL=61
 
 # Maximum number of operations that can be outstanding on a single System V
 # semaphore at one time.
 options 	SEMOPM=101
 
 # Maximum number of undo operations that can be outstanding on a single
 # System V semaphore at one time.
 options 	SEMUME=11
 
 # Maximum number of shared memory pages system wide.
 options 	SHMALL=1025
 
 # Maximum size, in bytes, of a single System V shared memory region.
 options 	SHMMAX=(SHMMAXPGS*PAGE_SIZE+1)
 options 	SHMMAXPGS=1025
 
 # Minimum size, in bytes, of a single System V shared memory region.
 options 	SHMMIN=2
 
 # Maximum number of shared memory regions that can be used on the system
 # at one time.
 options 	SHMMNI=33
 
 # Maximum number of System V shared memory regions that can be attached to
 # a single process at one time.
 options 	SHMSEG=9
 
 # Set the amount of time (in seconds) the system will wait before
 # rebooting automatically when a kernel panic occurs.  If set to (-1),
 # the system will wait indefinitely until a key is pressed on the
 # console.
 options 	PANIC_REBOOT_WAIT_TIME=16
 
 # Attempt to bypass the buffer cache and put data directly into the
 # userland buffer for read operation when O_DIRECT flag is set on the
 # file.  Both offset and length of the read operation must be
 # multiples of the physical media sector size.
 #
 options 	DIRECTIO
 
 # Specify a lower limit for the number of swap I/O buffers.  They are
 # (among other things) used when bypassing the buffer cache due to
 # DIRECTIO kernel option enabled and O_DIRECT flag set on file.
 #
 options 	NSWBUF_MIN=120
 
 #####################################################################
 
 # More undocumented options for linting.
 # Note that documenting these is not considered an affront.
 
 options 	CAM_DEBUG_DELAY
 
 # VFS cluster debugging.
 options 	CLUSTERDEBUG
 
 options 	DEBUG
 
 # Kernel filelock debugging.
 options 	LOCKF_DEBUG
 
 # System V compatible message queues
 # Please note that the values provided here are used to test kernel
 # building.  The defaults in the sources provide almost the same numbers.
 # MSGSSZ must be a power of 2 between 8 and 1024.
 options 	MSGMNB=2049	# Max number of chars in queue
 options 	MSGMNI=41	# Max number of message queue identifiers
 options 	MSGSEG=2049	# Max number of message segments
 options 	MSGSSZ=16	# Size of a message segment
 options 	MSGTQL=41	# Max number of messages in system
 
 options 	NBUF=512	# Number of buffer headers
 
 options 	SC_DEBUG_LEVEL=5	# Syscons debug level
 options 	SC_RENDER_DEBUG	# syscons rendering debugging
 
 options 	VFS_BIO_DEBUG	# VFS buffer I/O debugging
 
 options 	KSTACK_MAX_PAGES=32 # Maximum pages to give the kernel stack
 options 	KSTACK_USAGE_PROF
 
 # Adaptec Array Controller driver options
 options 	AAC_DEBUG	# Debugging levels:
 				# 0 - quiet, only emit warnings
 				# 1 - noisy, emit major function
 				#     points and things done
 				# 2 - extremely noisy, emit trace
 				#     items in loops, etc.
 
 # Resource Accounting
 options 	RACCT
 
 # Resource Limits
 options 	RCTL
 
 # Yet more undocumented options for linting.
 # BKTR_ALLOC_PAGES has no effect except to cause warnings, and
 # BROOKTREE_ALLOC_PAGES hasn't actually been superseded by it, since the
 # driver still mostly spells this option BROOKTREE_ALLOC_PAGES.
 ##options 	BKTR_ALLOC_PAGES=(217*4+1)
 options 	BROOKTREE_ALLOC_PAGES=(217*4+1)
 options 	MAXFILES=999
 
 # Random number generator
 # Allow the CSPRNG algorithm to be loaded as a module.
 #options 	RANDOM_LOADABLE
 # Select this to allow high-rate but potentially expensive
 # harvesting of Slab-Allocator entropy. In very high-rate
 # situations the value of doing this is dubious at best.
 options 	RANDOM_ENABLE_UMA	# slab allocator
 
 # Select this to allow high-rate but potentially expensive
 # harvesting of of the m_next pointer in the mbuf. Note that
 # the m_next pointer is NULL except when receiving > 4K
 # jumbo frames or sustained bursts by way of LRO. Thus in
 # the common case it is stirring zero in to the entropy
 # pool. In cases where it is not NULL it is pointing to one
 # of a small (in the thousands to 10s of thousands) number
 # of 256 byte aligned mbufs. Hence it is, even in the best
 # case, a poor source of entropy. And in the absence of actual
 # runtime analysis of entropy collection may mislead the user in
 # to believe that substantially more entropy is being collected
 # than in fact is - leading to a different class of security
 # risk. In high packet rate situations ethernet entropy
 # collection is also very expensive, possibly leading to as
 # much as a 50% drop in packets received.
 # This option is present to maintain backwards compatibility
 # if desired, however it cannot be recommended for use in any
 # environment.
 options 	RANDOM_ENABLE_ETHER	# ether_input
 
 # Module to enable execution of application via emulators like QEMU
 options         IMAGACT_BINMISC
 
 # zlib I/O stream support
 # This enables support for compressed core dumps.
 options 	GZIO
 
 # zstd I/O stream support
 # This enables support for Zstd compressed core dumps.
 options 	ZSTDIO
 
 # BHND(4) drivers
 options		BHND_LOGLEVEL	# Logging threshold level
 
 # evdev interface 
 device		evdev		# input event device support
 options 	EVDEV_SUPPORT	# evdev support in legacy drivers
 options 	EVDEV_DEBUG	# enable event debug msgs
 device		uinput		# install /dev/uinput cdev
 options 	UINPUT_DEBUG	# enable uinput debug msgs
 
 # Encrypted kernel crash dumps.
 options 	EKCD
 
 # Serial Peripheral Interface (SPI) support.
 device		spibus		# Bus support.
 device		at45d		# DataFlash driver
 device		cqspi		# 
 device		mx25l		# SPIFlash driver
 device		n25q		# 
 device		spigen		# Generic access to SPI devices from userland.
 # Enable legacy /dev/spigenN name aliases for /dev/spigenX.Y devices.
 options 	SPIGEN_LEGACY_CDEVNAME # legacy device names for spigen
 
 device		xz		# xz_embedded LZMA de-compression library
Index: projects/capsicum-test/sys/conf/ldscript.powerpc64
===================================================================
--- projects/capsicum-test/sys/conf/ldscript.powerpc64	(revision 345709)
+++ projects/capsicum-test/sys/conf/ldscript.powerpc64	(revision 345710)
@@ -1,168 +1,173 @@
 /* $FreeBSD$ */
 
 OUTPUT_FORMAT("elf64-powerpc-freebsd", "elf64-powerpc-freebsd",
 	      "elf64-powerpc-freebsd")
 OUTPUT_ARCH(powerpc:common64)
 ENTRY(__start)
 SEARCH_DIR(/usr/lib);
 PROVIDE (__stack = 0);
+PHDRS
+{
+	text PT_LOAD ;
+	dynamic PT_DYNAMIC ;
+}
 SECTIONS
 {
 
   /* Low-address wrapper for bootloaders (kexec/kboot) that can't parse ELF */
   . = kernbase - 0x100;
-  .kboot : { *(.text.kboot) }
+  .kboot : { *(.text.kboot) } :text
 
   /* Read-only sections, merged into text segment: */
   . = kernbase;
   PROVIDE (begin = .);
 
   .text      :
   {
+    *(.glink)
     *(.text)
     *(.stub)
     /* .gnu.warning sections are handled specially by elf32.em.  */
     *(.gnu.warning)
     *(.gnu.linkonce.t*)
   } =0
   _etext = .;
   PROVIDE (etext = .);
 
   /* Do not emit PT_INTERP section, which confuses some loaders (kexec-lite) */
-  .interpX    : { *(.interp) 	} : NONE
   /DISCARD/   : { *(.interp)	} 
 
   /* Also delete notes */
   /DISCARD/   : { *(.note.*)	} 
 
   .hash          : { *(.hash)		}
   .dynsym        : { *(.dynsym)		}
   .dynstr        : { *(.dynstr)		}
   .gnu.version   : { *(.gnu.version)	}
   .gnu.version_d   : { *(.gnu.version_d)	}
   .gnu.version_r   : { *(.gnu.version_r)	}
   .rela.text     :
     { *(.rela.text) *(.rela.gnu.linkonce.t*) }
   .rela.data     :
     { *(.rela.data) *(.rela.gnu.linkonce.d*) }
   .rela.rodata   :
     { *(.rela.rodata) *(.rela.gnu.linkonce.r*) }
   .rela.got      : { *(.rela.got)		}
   .rela.got1     : { *(.rela.got1)		}
   .rela.got2     : { *(.rela.got2)		}
   .rela.ctors    : { *(.rela.ctors)	}
   .rela.dtors    : { *(.rela.dtors)	}
   .rela.init     : { *(.rela.init)	}
   .rela.fini     : { *(.rela.fini)	}
   .rela.bss      : { *(.rela.bss)		}
   .rela.plt      : { *(.rela.plt)		}
   .rela.sdata    : { *(.rela.sdata)		}
   .rela.sbss     : { *(.rela.sbss)		}
   .rela.sdata2   : { *(.rela.sdata2)		}
   .rela.sbss2    : { *(.rela.sbss2)		}
 
   .init      : { *(.init)    } =0
   .fini      : { *(.fini)    } =0
   .rodata    : { *(.rodata) *(.gnu.linkonce.r*) }
   .rodata1   : { *(.rodata1) }
   .sdata2    : { *(.sdata2)  }
   .sbss2     : { *(.sbss2)   }
   /* Adjust the address for the data segment to the next page up. */
   . = ALIGN(4096);
   .data.read_frequently :
   {
     *(SORT_BY_ALIGNMENT(.data.read_frequently))
   }
   .data.read_mostly :
   {
     *(.data.read_mostly)
   }
   . = ALIGN(128);
   .data.exclusive_cache_line :
   {
     *(.data.exclusive_cache_line)
   }
   . = ALIGN(128);
   .data    :
   {
     *(.data)
     *(.gnu.linkonce.d*)
   }
   .data1     : { *(.data1) }
   .toc1      : ALIGN(8) { *(.toc1) }
   .opd       : ALIGN(8) { KEEP (*(.opd)) }
   .branch_lt : ALIGN(8) { *(.branch_lt) }
   . = ALIGN(4096);
   .got       : ALIGN(8) { __tocbase = .; *(.got .toc) }
 
-  .dynamic        : { *(.dynamic) }
+  .dynamic        : { *(.dynamic) } :text :dynamic
   /* Put .ctors and .dtors next to the .got2 section, so that the pointers
      get relocated with -mrelocatable. Also put in the .fixup pointers.
      The current compiler no longer needs this, but keep it around for 2.7.2  */
                 PROVIDE (_GOT2_START_ = .);
   .got2           :  { *(.got2) }
                 PROVIDE (__CTOR_LIST__ = .);
   .ctors          : { *(.ctors) }
                 PROVIDE (__CTOR_END__ = .);
                 PROVIDE (__DTOR_LIST__ = .);
   .dtors          : { *(.dtors) }
                 PROVIDE (__DTOR_END__ = .);
                 PROVIDE (_FIXUP_START_ = .);
   .fixup          : { *(.fixup) }
                 PROVIDE (_FIXUP_END_ = .);
                 PROVIDE (_GOT2_END_ = .);
   /* We want the small data sections together, so single-instruction offsets
      can access them all, and initialized data all before uninitialized, so
      we can shorten the on-disk segment size.  */
   .sdata     : { *(.sdata) }
   _edata  =  .;
   PROVIDE (edata = .);
   .sbss      :
   {
     PROVIDE (__sbss_start = .);
     *(.sbss)
     *(.scommon)
     *(.dynsbss)
     PROVIDE (__sbss_end = .);
   }
   .plt   : { *(.plt) }
   .bss       :
   {
    PROVIDE (__bss_start = .);
    *(.dynbss)
    *(.bss)
    *(COMMON)
   }
   _end = . ;
   PROVIDE (end = .);
   /* Stabs debugging sections.  */
   .stab 0 : { *(.stab) }
   .stabstr 0 : { *(.stabstr) }
   /* DWARF debug sections.
      Symbols in the DWARF debugging sections are relative to the beginning
      of the section so we begin them at 0.  */
   /* DWARF 1 */
   .debug          0 : { *(.debug) }
   .line           0 : { *(.line) }
   /* GNU DWARF 1 extensions */
   .debug_srcinfo  0 : { *(.debug_srcinfo) }
   .debug_sfnames  0 : { *(.debug_sfnames) }
   /* DWARF 1.1 and DWARF 2 */
   .debug_aranges  0 : { *(.debug_aranges) }
   .debug_pubnames 0 : { *(.debug_pubnames) }
   /* DWARF 2 */
   .debug_info     0 : { *(.debug_info) }
   .debug_abbrev   0 : { *(.debug_abbrev) }
   .debug_line     0 : { *(.debug_line) }
   .debug_frame    0 : { *(.debug_frame) }
   .debug_str      0 : { *(.debug_str) }
   .debug_loc      0 : { *(.debug_loc) }
   .debug_macinfo  0 : { *(.debug_macinfo) }
   /* SGI/MIPS DWARF 2 extensions */
   .debug_weaknames 0 : { *(.debug_weaknames) }
   .debug_funcnames 0 : { *(.debug_funcnames) }
   .debug_typenames 0 : { *(.debug_typenames) }
   .debug_varnames  0 : { *(.debug_varnames) }
   /* These must appear regardless of  .  */
 }
 
Index: projects/capsicum-test/sys/dev/cxgbe/adapter.h
===================================================================
--- projects/capsicum-test/sys/dev/cxgbe/adapter.h	(revision 345709)
+++ projects/capsicum-test/sys/dev/cxgbe/adapter.h	(revision 345710)
@@ -1,1313 +1,1315 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 #ifndef __T4_ADAPTER_H__
 #define __T4_ADAPTER_H__
 
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/rman.h>
 #include <sys/types.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/vmem.h>
 #include <vm/uma.h>
 
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pcireg.h>
 #include <machine/bus.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <netinet/in.h>
 #include <netinet/tcp_lro.h>
 
 #include "offload.h"
 #include "t4_ioctl.h"
 #include "common/t4_msg.h"
 #include "firmware/t4fw_interface.h"
 
 #define KTR_CXGBE	KTR_SPARE3
 MALLOC_DECLARE(M_CXGBE);
 #define CXGBE_UNIMPLEMENTED(s) \
     panic("%s (%s, line %d) not implemented yet.", s, __FILE__, __LINE__)
 
 #if defined(__i386__) || defined(__amd64__)
 static __inline void
 prefetch(void *x)
 {
 	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
 }
 #else
 #define prefetch(x) __builtin_prefetch(x)
 #endif
 
 #ifndef SYSCTL_ADD_UQUAD
 #define SYSCTL_ADD_UQUAD SYSCTL_ADD_QUAD
 #define sysctl_handle_64 sysctl_handle_quad
 #define CTLTYPE_U64 CTLTYPE_QUAD
 #endif
 
 SYSCTL_DECL(_hw_cxgbe);
 
 struct adapter;
 typedef struct adapter adapter_t;
 
 enum {
 	/*
 	 * All ingress queues use this entry size.  Note that the firmware event
 	 * queue and any iq expecting CPL_RX_PKT in the descriptor needs this to
 	 * be at least 64.
 	 */
 	IQ_ESIZE = 64,
 
 	/* Default queue sizes for all kinds of ingress queues */
 	FW_IQ_QSIZE = 256,
 	RX_IQ_QSIZE = 1024,
 
 	/* All egress queues use this entry size */
 	EQ_ESIZE = 64,
 
 	/* Default queue sizes for all kinds of egress queues */
 	CTRL_EQ_QSIZE = 1024,
 	TX_EQ_QSIZE = 1024,
 
 #if MJUMPAGESIZE != MCLBYTES
 	SW_ZONE_SIZES = 4,	/* cluster, jumbop, jumbo9k, jumbo16k */
 #else
 	SW_ZONE_SIZES = 3,	/* cluster, jumbo9k, jumbo16k */
 #endif
 	CL_METADATA_SIZE = CACHE_LINE_SIZE,
 
 	SGE_MAX_WR_NDESC = SGE_MAX_WR_LEN / EQ_ESIZE, /* max WR size in desc */
 	TX_SGL_SEGS = 39,
 	TX_SGL_SEGS_TSO = 38,
 	TX_SGL_SEGS_EO_TSO = 30,	/* XXX: lower for IPv6. */
 	TX_WR_FLITS = SGE_MAX_WR_LEN / 8
 };
 
 enum {
 	/* adapter intr_type */
 	INTR_INTX	= (1 << 0),
 	INTR_MSI 	= (1 << 1),
 	INTR_MSIX	= (1 << 2)
 };
 
 enum {
 	XGMAC_MTU	= (1 << 0),
 	XGMAC_PROMISC	= (1 << 1),
 	XGMAC_ALLMULTI	= (1 << 2),
 	XGMAC_VLANEX	= (1 << 3),
 	XGMAC_UCADDR	= (1 << 4),
 	XGMAC_MCADDRS	= (1 << 5),
 
 	XGMAC_ALL	= 0xffff
 };
 
 enum {
 	/* flags understood by begin_synchronized_op */
 	HOLD_LOCK	= (1 << 0),
 	SLEEP_OK	= (1 << 1),
 	INTR_OK		= (1 << 2),
 
 	/* flags understood by end_synchronized_op */
 	LOCK_HELD	= HOLD_LOCK,
 };
 
 enum {
 	/* adapter flags */
 	FULL_INIT_DONE	= (1 << 0),
 	FW_OK		= (1 << 1),
 	CHK_MBOX_ACCESS	= (1 << 2),
 	MASTER_PF	= (1 << 3),
 	ADAP_SYSCTL_CTX	= (1 << 4),
 	ADAP_ERR	= (1 << 5),
 	BUF_PACKING_OK	= (1 << 6),
 	IS_VF		= (1 << 7),
 
 	CXGBE_BUSY	= (1 << 9),
 
 	/* port flags */
 	HAS_TRACEQ	= (1 << 3),
 	FIXED_IFMEDIA	= (1 << 4),	/* ifmedia list doesn't change. */
 
 	/* VI flags */
 	DOOMED		= (1 << 0),
 	VI_INIT_DONE	= (1 << 1),
 	VI_SYSCTL_CTX	= (1 << 2),
 
 	/* adapter debug_flags */
 	DF_DUMP_MBOX		= (1 << 0),	/* Log all mbox cmd/rpl. */
 	DF_LOAD_FW_ANYTIME	= (1 << 1),	/* Allow LOAD_FW after init */
 	DF_DISABLE_TCB_CACHE	= (1 << 2),	/* Disable TCB cache (T6+) */
 	DF_DISABLE_CFG_RETRY	= (1 << 3),	/* Disable fallback config */
 	DF_VERBOSE_SLOWINTR	= (1 << 4),	/* Chatty slow intr handler */
 };
 
 #define IS_DOOMED(vi)	((vi)->flags & DOOMED)
 #define SET_DOOMED(vi)	do {(vi)->flags |= DOOMED;} while (0)
 #define IS_BUSY(sc)	((sc)->flags & CXGBE_BUSY)
 #define SET_BUSY(sc)	do {(sc)->flags |= CXGBE_BUSY;} while (0)
 #define CLR_BUSY(sc)	do {(sc)->flags &= ~CXGBE_BUSY;} while (0)
 
 struct vi_info {
 	device_t dev;
 	struct port_info *pi;
 
 	struct ifnet *ifp;
 
 	unsigned long flags;
 	int if_flags;
 
 	uint16_t *rss, *nm_rss;
 	uint16_t viid;		/* opaque VI identifier */
 	uint16_t smt_idx;
 	uint16_t vin;
 	uint8_t vfvld;
 	int16_t  xact_addr_filt;/* index of exact MAC address filter */
 	uint16_t rss_size;	/* size of VI's RSS table slice */
 	uint16_t rss_base;	/* start of VI's RSS table slice */
 	int hashen;
 
 	int nintr;
 	int first_intr;
 
 	/* These need to be int as they are used in sysctl */
 	int ntxq;		/* # of tx queues */
 	int first_txq;		/* index of first tx queue */
 	int rsrv_noflowq; 	/* Reserve queue 0 for non-flowid packets */
 	int nrxq;		/* # of rx queues */
 	int first_rxq;		/* index of first rx queue */
 	int nofldtxq;		/* # of offload tx queues */
 	int first_ofld_txq;	/* index of first offload tx queue */
 	int nofldrxq;		/* # of offload rx queues */
 	int first_ofld_rxq;	/* index of first offload rx queue */
 	int nnmtxq;
 	int first_nm_txq;
 	int nnmrxq;
 	int first_nm_rxq;
 	int tmr_idx;
 	int ofld_tmr_idx;
 	int pktc_idx;
 	int ofld_pktc_idx;
 	int qsize_rxq;
 	int qsize_txq;
 
 	struct timeval last_refreshed;
 	struct fw_vi_stats_vf stats;
 
 	struct callout tick;
 	struct sysctl_ctx_list ctx;	/* from ifconfig up to driver detach */
 
 	uint8_t hw_addr[ETHER_ADDR_LEN]; /* factory MAC address, won't change */
 };
 
 struct tx_ch_rl_params {
 	enum fw_sched_params_rate ratemode;	/* %port (REL) or kbps (ABS) */
 	uint32_t maxrate;
 };
 
 enum {
 	CLRL_USER	= (1 << 0),	/* allocated manually. */
 	CLRL_SYNC	= (1 << 1),	/* sync hw update in progress. */
 	CLRL_ASYNC	= (1 << 2),	/* async hw update requested. */
 	CLRL_ERR	= (1 << 3),	/* last hw setup ended in error. */
 };
 
 struct tx_cl_rl_params {
 	int refcount;
 	uint8_t flags;
 	enum fw_sched_params_rate ratemode;	/* %port REL or ABS value */
 	enum fw_sched_params_unit rateunit;	/* kbps or pps (when ABS) */
 	enum fw_sched_params_mode mode;		/* aggr or per-flow */
 	uint32_t maxrate;
 	uint16_t pktsize;
 	uint16_t burstsize;
 };
 
 /* Tx scheduler parameters for a channel/port */
 struct tx_sched_params {
 	/* Channel Rate Limiter */
 	struct tx_ch_rl_params ch_rl;
 
 	/* Class WRR */
 	/* XXX */
 
 	/* Class Rate Limiter (including the default pktsize and burstsize). */
 	int pktsize;
 	int burstsize;
 	struct tx_cl_rl_params cl_rl[];
 };
 
 struct port_info {
 	device_t dev;
 	struct adapter *adapter;
 
 	struct vi_info *vi;
 	int nvi;
 	int up_vis;
 	int uld_vis;
 
 	struct tx_sched_params *sched_params;
 
 	struct mtx pi_lock;
 	char lockname[16];
 	unsigned long flags;
 
 	uint8_t  lport;		/* associated offload logical port */
 	int8_t   mdio_addr;
 	uint8_t  port_type;
 	uint8_t  mod_type;
 	uint8_t  port_id;
 	uint8_t  tx_chan;
 	uint8_t  mps_bg_map;	/* rx MPS buffer group bitmap */
 	uint8_t  rx_e_chan_map;	/* rx TP e-channel bitmap */
 
 	struct link_config link_cfg;
 	struct ifmedia media;
 
 	struct timeval last_refreshed;
  	struct port_stats stats;
 	u_int tnl_cong_drops;
 	u_int tx_parse_error;
 	u_long	tx_tls_records;
 	u_long	tx_tls_octets;
 	u_long	rx_tls_records;
 	u_long	rx_tls_octets;
 
 	struct callout tick;
 };
 
 #define	IS_MAIN_VI(vi)		((vi) == &((vi)->pi->vi[0]))
 
 /* Where the cluster came from, how it has been carved up. */
 struct cluster_layout {
 	int8_t zidx;
 	int8_t hwidx;
 	uint16_t region1;	/* mbufs laid out within this region */
 				/* region2 is the DMA region */
 	uint16_t region3;	/* cluster_metadata within this region */
 };
 
 struct cluster_metadata {
 	u_int refcount;
 	struct fl_sdesc *sd;	/* For debug only.  Could easily be stale */
 };
 
 struct fl_sdesc {
 	caddr_t cl;
 	uint16_t nmbuf;	/* # of driver originated mbufs with ref on cluster */
 	struct cluster_layout cll;
 };
 
 struct tx_desc {
 	__be64 flit[8];
 };
 
 struct tx_sdesc {
 	struct mbuf *m;		/* m_nextpkt linked chain of frames */
 	uint8_t desc_used;	/* # of hardware descriptors used by the WR */
 };
 
 
 #define IQ_PAD (IQ_ESIZE - sizeof(struct rsp_ctrl) - sizeof(struct rss_header))
 struct iq_desc {
 	struct rss_header rss;
 	uint8_t cpl[IQ_PAD];
 	struct rsp_ctrl rsp;
 };
 #undef IQ_PAD
 CTASSERT(sizeof(struct iq_desc) == IQ_ESIZE);
 
 enum {
 	/* iq flags */
 	IQ_ALLOCATED	= (1 << 0),	/* firmware resources allocated */
 	IQ_HAS_FL	= (1 << 1),	/* iq associated with a freelist */
 	IQ_RX_TIMESTAMP	= (1 << 2),	/* provide the SGE rx timestamp */
 	IQ_LRO_ENABLED	= (1 << 3),	/* iq is an eth rxq with LRO enabled */
 	IQ_ADJ_CREDIT	= (1 << 4),	/* hw is off by 1 credit for this iq */
 
 	/* iq state */
 	IQS_DISABLED	= 0,
 	IQS_BUSY	= 1,
 	IQS_IDLE	= 2,
 
 	/* netmap related flags */
 	NM_OFF	= 0,
 	NM_ON	= 1,
 	NM_BUSY	= 2,
 };
 
 enum {
 	CPL_COOKIE_RESERVED = 0,
 	CPL_COOKIE_FILTER,
 	CPL_COOKIE_DDP0,
 	CPL_COOKIE_DDP1,
 	CPL_COOKIE_TOM,
 	CPL_COOKIE_HASHFILTER,
 	CPL_COOKIE_ETHOFLD,
 	CPL_COOKIE_AVAILABLE3,
 
 	NUM_CPL_COOKIES = 8	/* Limited by M_COOKIE.  Do not increase. */
 };
 
 struct sge_iq;
 struct rss_header;
 typedef int (*cpl_handler_t)(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 typedef int (*an_handler_t)(struct sge_iq *, const struct rsp_ctrl *);
 typedef int (*fw_msg_handler_t)(struct adapter *, const __be64 *);
 
 /*
  * Ingress Queue: T4 is producer, driver is consumer.
  */
 struct sge_iq {
 	uint32_t flags;
 	volatile int state;
 	struct adapter *adapter;
 	struct iq_desc  *desc;	/* KVA of descriptor ring */
 	int8_t   intr_pktc_idx;	/* packet count threshold index */
 	uint8_t  gen;		/* generation bit */
 	uint8_t  intr_params;	/* interrupt holdoff parameters */
 	uint8_t  intr_next;	/* XXX: holdoff for next interrupt */
 	uint16_t qsize;		/* size (# of entries) of the queue */
 	uint16_t sidx;		/* index of the entry with the status page */
 	uint16_t cidx;		/* consumer index */
 	uint16_t cntxt_id;	/* SGE context id for the iq */
 	uint16_t abs_id;	/* absolute SGE id for the iq */
 
 	STAILQ_ENTRY(sge_iq) link;
 
 	bus_dma_tag_t desc_tag;
 	bus_dmamap_t desc_map;
 	bus_addr_t ba;		/* bus address of descriptor ring */
 };
 
 enum {
 	EQ_CTRL		= 1,
 	EQ_ETH		= 2,
 	EQ_OFLD		= 3,
 
 	/* eq flags */
 	EQ_TYPEMASK	= 0x3,		/* 2 lsbits hold the type (see above) */
 	EQ_ALLOCATED	= (1 << 2),	/* firmware resources allocated */
 	EQ_ENABLED	= (1 << 3),	/* open for business */
 	EQ_QFLUSH	= (1 << 4),	/* if_qflush in progress */
 };
 
 /* Listed in order of preference.  Update t4_sysctls too if you change these */
 enum {DOORBELL_UDB, DOORBELL_WCWR, DOORBELL_UDBWC, DOORBELL_KDB};
 
 /*
  * Egress Queue: driver is producer, T4 is consumer.
  *
  * Note: A free list is an egress queue (driver produces the buffers and T4
  * consumes them) but it's special enough to have its own struct (see sge_fl).
  */
 struct sge_eq {
 	unsigned int flags;	/* MUST be first */
 	unsigned int cntxt_id;	/* SGE context id for the eq */
 	unsigned int abs_id;	/* absolute SGE id for the eq */
 	struct mtx eq_lock;
 
 	struct tx_desc *desc;	/* KVA of descriptor ring */
 	uint8_t doorbells;
 	volatile uint32_t *udb;	/* KVA of doorbell (lies within BAR2) */
 	u_int udb_qid;		/* relative qid within the doorbell page */
 	uint16_t sidx;		/* index of the entry with the status page */
 	uint16_t cidx;		/* consumer idx (desc idx) */
 	uint16_t pidx;		/* producer idx (desc idx) */
 	uint16_t equeqidx;	/* EQUEQ last requested at this pidx */
 	uint16_t dbidx;		/* pidx of the most recent doorbell */
 	uint16_t iqid;		/* iq that gets egr_update for the eq */
 	uint8_t tx_chan;	/* tx channel used by the eq */
 	volatile u_int equiq;	/* EQUIQ outstanding */
 
 	bus_dma_tag_t desc_tag;
 	bus_dmamap_t desc_map;
 	bus_addr_t ba;		/* bus address of descriptor ring */
 	char lockname[16];
 };
 
 struct sw_zone_info {
 	uma_zone_t zone;	/* zone that this cluster comes from */
 	int size;		/* size of cluster: 2K, 4K, 9K, 16K, etc. */
 	int type;		/* EXT_xxx type of the cluster */
 	int8_t head_hwidx;
 	int8_t tail_hwidx;
 };
 
 struct hw_buf_info {
 	int8_t zidx;		/* backpointer to zone; -ve means unused */
 	int8_t next;		/* next hwidx for this zone; -1 means no more */
 	int size;
 };
 
 enum {
 	NUM_MEMWIN = 3,
 
 	MEMWIN0_APERTURE = 2048,
 	MEMWIN0_BASE     = 0x1b800,
 
 	MEMWIN1_APERTURE = 32768,
 	MEMWIN1_BASE     = 0x28000,
 
 	MEMWIN2_APERTURE_T4 = 65536,
 	MEMWIN2_BASE_T4     = 0x30000,
 
 	MEMWIN2_APERTURE_T5 = 128 * 1024,
 	MEMWIN2_BASE_T5     = 0x60000,
 };
 
 struct memwin {
 	struct rwlock mw_lock __aligned(CACHE_LINE_SIZE);
 	uint32_t mw_base;	/* constant after setup_memwin */
 	uint32_t mw_aperture;	/* ditto */
 	uint32_t mw_curpos;	/* protected by mw_lock */
 };
 
 enum {
 	FL_STARVING	= (1 << 0), /* on the adapter's list of starving fl's */
 	FL_DOOMED	= (1 << 1), /* about to be destroyed */
 	FL_BUF_PACKING	= (1 << 2), /* buffer packing enabled */
 	FL_BUF_RESUME	= (1 << 3), /* resume from the middle of the frame */
 };
 
 #define FL_RUNNING_LOW(fl) \
     (IDXDIFF(fl->dbidx * 8, fl->cidx, fl->sidx * 8) <= fl->lowat)
 #define FL_NOT_RUNNING_LOW(fl) \
     (IDXDIFF(fl->dbidx * 8, fl->cidx, fl->sidx * 8) >= 2 * fl->lowat)
 
 struct sge_fl {
 	struct mtx fl_lock;
 	__be64 *desc;		/* KVA of descriptor ring, ptr to addresses */
 	struct fl_sdesc *sdesc;	/* KVA of software descriptor ring */
 	struct cluster_layout cll_def;	/* default refill zone, layout */
 	uint16_t lowat;		/* # of buffers <= this means fl needs help */
 	int flags;
 	uint16_t buf_boundary;
 
 	/* The 16b idx all deal with hw descriptors */
 	uint16_t dbidx;		/* hw pidx after last doorbell */
 	uint16_t sidx;		/* index of status page */
 	volatile uint16_t hw_cidx;
 
 	/* The 32b idx are all buffer idx, not hardware descriptor idx */
 	uint32_t cidx;		/* consumer index */
 	uint32_t pidx;		/* producer index */
 
 	uint32_t dbval;
 	u_int rx_offset;	/* offset in fl buf (when buffer packing) */
 	volatile uint32_t *udb;
 
 	uint64_t mbuf_allocated;/* # of mbuf allocated from zone_mbuf */
 	uint64_t mbuf_inlined;	/* # of mbuf created within clusters */
 	uint64_t cl_allocated;	/* # of clusters allocated */
 	uint64_t cl_recycled;	/* # of clusters recycled */
 	uint64_t cl_fast_recycled; /* # of clusters recycled (fast) */
 
 	/* These 3 are valid when FL_BUF_RESUME is set, stale otherwise. */
 	struct mbuf *m0;
 	struct mbuf **pnext;
 	u_int remaining;
 
 	uint16_t qsize;		/* # of hw descriptors (status page included) */
 	uint16_t cntxt_id;	/* SGE context id for the freelist */
 	TAILQ_ENTRY(sge_fl) link; /* All starving freelists */
 	bus_dma_tag_t desc_tag;
 	bus_dmamap_t desc_map;
 	char lockname[16];
 	bus_addr_t ba;		/* bus address of descriptor ring */
 	struct cluster_layout cll_alt;	/* alternate refill zone, layout */
 };
 
 struct mp_ring;
 
 /* txq: SGE egress queue + what's needed for Ethernet NIC */
 struct sge_txq {
 	struct sge_eq eq;	/* MUST be first */
 
 	struct ifnet *ifp;	/* the interface this txq belongs to */
 	struct mp_ring *r;	/* tx software ring */
 	struct tx_sdesc *sdesc;	/* KVA of software descriptor ring */
 	struct sglist *gl;
 	__be32 cpl_ctrl0;	/* for convenience */
 	int tc_idx;		/* traffic class */
 
 	struct task tx_reclaim_task;
 	/* stats for common events first */
 
 	uint64_t txcsum;	/* # of times hardware assisted with checksum */
 	uint64_t tso_wrs;	/* # of TSO work requests */
 	uint64_t vlan_insertion;/* # of times VLAN tag was inserted */
 	uint64_t imm_wrs;	/* # of work requests with immediate data */
 	uint64_t sgl_wrs;	/* # of work requests with direct SGL */
 	uint64_t txpkt_wrs;	/* # of txpkt work requests (not coalesced) */
 	uint64_t txpkts0_wrs;	/* # of type0 coalesced tx work requests */
 	uint64_t txpkts1_wrs;	/* # of type1 coalesced tx work requests */
 	uint64_t txpkts0_pkts;	/* # of frames in type0 coalesced tx WRs */
 	uint64_t txpkts1_pkts;	/* # of frames in type1 coalesced tx WRs */
 	uint64_t raw_wrs;	/* # of raw work requests (alloc_wr_mbuf) */
 
 	/* stats for not-that-common events */
 } __aligned(CACHE_LINE_SIZE);
 
 /* rxq: SGE ingress queue + SGE free list + miscellaneous items */
 struct sge_rxq {
 	struct sge_iq iq;	/* MUST be first */
 	struct sge_fl fl;	/* MUST follow iq */
 
 	struct ifnet *ifp;	/* the interface this rxq belongs to */
 #if defined(INET) || defined(INET6)
 	struct lro_ctrl lro;	/* LRO state */
 #endif
 
 	/* stats for common events first */
 
 	uint64_t rxcsum;	/* # of times hardware assisted with checksum */
 	uint64_t vlan_extraction;/* # of times VLAN tag was extracted */
 
 	/* stats for not-that-common events */
 
 } __aligned(CACHE_LINE_SIZE);
 
 static inline struct sge_rxq *
 iq_to_rxq(struct sge_iq *iq)
 {
 
 	return (__containerof(iq, struct sge_rxq, iq));
 }
 
 
 /* ofld_rxq: SGE ingress queue + SGE free list + miscellaneous items */
 struct sge_ofld_rxq {
 	struct sge_iq iq;	/* MUST be first */
 	struct sge_fl fl;	/* MUST follow iq */
 } __aligned(CACHE_LINE_SIZE);
 
 static inline struct sge_ofld_rxq *
 iq_to_ofld_rxq(struct sge_iq *iq)
 {
 
 	return (__containerof(iq, struct sge_ofld_rxq, iq));
 }
 
 struct wrqe {
 	STAILQ_ENTRY(wrqe) link;
 	struct sge_wrq *wrq;
 	int wr_len;
 	char wr[] __aligned(16);
 };
 
 struct wrq_cookie {
 	TAILQ_ENTRY(wrq_cookie) link;
 	int ndesc;
 	int pidx;
 };
 
 /*
  * wrq: SGE egress queue that is given prebuilt work requests.  Both the control
  * and offload tx queues are of this type.
  */
 struct sge_wrq {
 	struct sge_eq eq;	/* MUST be first */
 
 	struct adapter *adapter;
 	struct task wrq_tx_task;
 
 	/* Tx desc reserved but WR not "committed" yet. */
 	TAILQ_HEAD(wrq_incomplete_wrs , wrq_cookie) incomplete_wrs;
 
 	/* List of WRs ready to go out as soon as descriptors are available. */
 	STAILQ_HEAD(, wrqe) wr_list;
 	u_int nwr_pending;
 	u_int ndesc_needed;
 
 	/* stats for common events first */
 
 	uint64_t tx_wrs_direct;	/* # of WRs written directly to desc ring. */
 	uint64_t tx_wrs_ss;	/* # of WRs copied from scratch space. */
 	uint64_t tx_wrs_copied;	/* # of WRs queued and copied to desc ring. */
 
 	/* stats for not-that-common events */
 
 	/*
 	 * Scratch space for work requests that wrap around after reaching the
 	 * status page, and some information about the last WR that used it.
 	 */
 	uint16_t ss_pidx;
 	uint16_t ss_len;
 	uint8_t ss[SGE_MAX_WR_LEN];
 
 } __aligned(CACHE_LINE_SIZE);
 
 #define INVALID_NM_RXQ_CNTXT_ID ((uint16_t)(-1))
 struct sge_nm_rxq {
 	volatile int nm_state;	/* NM_OFF, NM_ON, or NM_BUSY */
 	struct vi_info *vi;
 
 	struct iq_desc *iq_desc;
 	uint16_t iq_abs_id;
 	uint16_t iq_cntxt_id;
 	uint16_t iq_cidx;
 	uint16_t iq_sidx;
 	uint8_t iq_gen;
 
 	__be64  *fl_desc;
 	uint16_t fl_cntxt_id;
 	uint32_t fl_cidx;
 	uint32_t fl_pidx;
 	uint32_t fl_sidx;
 	uint32_t fl_db_val;
 	u_int fl_hwidx:4;
 
 	u_int fl_db_saved;
 	u_int nid;		/* netmap ring # for this queue */
 
 	/* infrequently used items after this */
 
 	bus_dma_tag_t iq_desc_tag;
 	bus_dmamap_t iq_desc_map;
 	bus_addr_t iq_ba;
 	int intr_idx;
 
 	bus_dma_tag_t fl_desc_tag;
 	bus_dmamap_t fl_desc_map;
 	bus_addr_t fl_ba;
 } __aligned(CACHE_LINE_SIZE);
 
 #define INVALID_NM_TXQ_CNTXT_ID ((u_int)(-1))
 struct sge_nm_txq {
 	struct tx_desc *desc;
 	uint16_t cidx;
 	uint16_t pidx;
 	uint16_t sidx;
 	uint16_t equiqidx;	/* EQUIQ last requested at this pidx */
 	uint16_t equeqidx;	/* EQUEQ last requested at this pidx */
 	uint16_t dbidx;		/* pidx of the most recent doorbell */
 	uint8_t doorbells;
 	volatile uint32_t *udb;
 	u_int udb_qid;
 	u_int cntxt_id;
 	__be32 cpl_ctrl0;	/* for convenience */
 	u_int nid;		/* netmap ring # for this queue */
 
 	/* infrequently used items after this */
 
 	bus_dma_tag_t desc_tag;
 	bus_dmamap_t desc_map;
 	bus_addr_t ba;
 	int iqidx;
 } __aligned(CACHE_LINE_SIZE);
 
 struct sge {
 	int nrxq;	/* total # of Ethernet rx queues */
 	int ntxq;	/* total # of Ethernet tx queues */
 	int nofldrxq;	/* total # of TOE rx queues */
 	int nofldtxq;	/* total # of TOE tx queues */
 	int nnmrxq;	/* total # of netmap rx queues */
 	int nnmtxq;	/* total # of netmap tx queues */
 	int niq;	/* total # of ingress queues */
 	int neq;	/* total # of egress queues */
 
 	struct sge_iq fwq;	/* Firmware event queue */
 	struct sge_wrq *ctrlq;	/* Control queues */
 	struct sge_txq *txq;	/* NIC tx queues */
 	struct sge_rxq *rxq;	/* NIC rx queues */
 	struct sge_wrq *ofld_txq;	/* TOE tx queues */
 	struct sge_ofld_rxq *ofld_rxq;	/* TOE rx queues */
 	struct sge_nm_txq *nm_txq;	/* netmap tx queues */
 	struct sge_nm_rxq *nm_rxq;	/* netmap rx queues */
 
 	uint16_t iq_start;	/* first cntxt_id */
 	uint16_t iq_base;	/* first abs_id */
 	int eq_start;		/* first cntxt_id */
 	int eq_base;		/* first abs_id */
 	struct sge_iq **iqmap;	/* iq->cntxt_id to iq mapping */
 	struct sge_eq **eqmap;	/* eq->cntxt_id to eq mapping */
 
 	int8_t safe_hwidx1;	/* may not have room for metadata */
 	int8_t safe_hwidx2;	/* with room for metadata and maybe more */
 	struct sw_zone_info sw_zone_info[SW_ZONE_SIZES];
 	struct hw_buf_info hw_buf_info[SGE_FLBUF_SIZES];
 };
 
 struct devnames {
 	const char *nexus_name;
 	const char *ifnet_name;
 	const char *vi_ifnet_name;
 	const char *pf03_drv_name;
 	const char *vf_nexus_name;
 	const char *vf_ifnet_name;
 };
 
 struct clip_entry;
 
 struct adapter {
 	SLIST_ENTRY(adapter) link;
 	device_t dev;
 	struct cdev *cdev;
 	const struct devnames *names;
 
 	/* PCIe register resources */
 	int regs_rid;
 	struct resource *regs_res;
 	int msix_rid;
 	struct resource *msix_res;
 	bus_space_handle_t bh;
 	bus_space_tag_t bt;
 	bus_size_t mmio_len;
 	int udbs_rid;
 	struct resource *udbs_res;
 	volatile uint8_t *udbs_base;
 
 	unsigned int pf;
 	unsigned int mbox;
 	unsigned int vpd_busy;
 	unsigned int vpd_flag;
 
 	/* Interrupt information */
 	int intr_type;
 	int intr_count;
 	struct irq {
 		struct resource *res;
 		int rid;
 		void *tag;
 		struct sge_rxq *rxq;
 		struct sge_nm_rxq *nm_rxq;
 	} __aligned(CACHE_LINE_SIZE) *irq;
 	int sge_gts_reg;
 	int sge_kdoorbell_reg;
 
 	bus_dma_tag_t dmat;	/* Parent DMA tag */
 
 	struct sge sge;
 	int lro_timeout;
 	int sc_do_rxcopy;
 
 	struct taskqueue *tq[MAX_NCHAN];	/* General purpose taskqueues */
 	struct port_info *port[MAX_NPORTS];
 	uint8_t chan_map[MAX_NCHAN];		/* channel -> port */
 
 	struct mtx clip_table_lock;
 	TAILQ_HEAD(, clip_entry) clip_table;
 	int clip_gen;
 
 	void *tom_softc;	/* (struct tom_data *) */
 	struct tom_tunables tt;
 	struct t4_offload_policy *policy;
 	struct rwlock policy_lock;
 
 	void *iwarp_softc;	/* (struct c4iw_dev *) */
 	struct iw_tunables iwt;
 	void *iscsi_ulp_softc;	/* (struct cxgbei_data *) */
 	void *ccr_softc;	/* (struct ccr_softc *) */
 	struct l2t_data *l2t;	/* L2 table */
 	struct smt_data *smt;	/* Source MAC Table */
 	struct tid_info tids;
 	vmem_t *key_map;
 
 	uint8_t doorbells;
 	int offload_map;	/* ports with IFCAP_TOE enabled */
 	int active_ulds;	/* ULDs activated on this adapter */
 	int flags;
 	int debug_flags;
 
 	char ifp_lockname[16];
 	struct mtx ifp_lock;
 	struct ifnet *ifp;	/* tracer ifp */
 	struct ifmedia media;
 	int traceq;		/* iq used by all tracers, -1 if none */
 	int tracer_valid;	/* bitmap of valid tracers */
 	int tracer_enabled;	/* bitmap of enabled tracers */
 
 	char fw_version[16];
 	char tp_version[16];
 	char er_version[16];
 	char bs_version[16];
 	char cfg_file[32];
 	u_int cfcsum;
 	struct adapter_params params;
 	const struct chip_params *chip_params;
 	struct t4_virt_res vres;
 
 	uint16_t nbmcaps;
 	uint16_t linkcaps;
 	uint16_t switchcaps;
 	uint16_t niccaps;
 	uint16_t toecaps;
 	uint16_t rdmacaps;
 	uint16_t cryptocaps;
 	uint16_t iscsicaps;
 	uint16_t fcoecaps;
 
 	struct sysctl_ctx_list ctx; /* from adapter_full_init to full_uninit */
 
 	struct mtx sc_lock;
 	char lockname[16];
 
 	/* Starving free lists */
 	struct mtx sfl_lock;	/* same cache-line as sc_lock? but that's ok */
 	TAILQ_HEAD(, sge_fl) sfl;
 	struct callout sfl_callout;
 
 	struct mtx reg_lock;	/* for indirect register access */
 
 	struct memwin memwin[NUM_MEMWIN];	/* memory windows */
 
 	struct mtx tc_lock;
 	struct task tc_task;
 
 	const char *last_op;
 	const void *last_op_thr;
 	int last_op_flags;
+
+	int swintr;
 };
 
 #define ADAPTER_LOCK(sc)		mtx_lock(&(sc)->sc_lock)
 #define ADAPTER_UNLOCK(sc)		mtx_unlock(&(sc)->sc_lock)
 #define ADAPTER_LOCK_ASSERT_OWNED(sc)	mtx_assert(&(sc)->sc_lock, MA_OWNED)
 #define ADAPTER_LOCK_ASSERT_NOTOWNED(sc) mtx_assert(&(sc)->sc_lock, MA_NOTOWNED)
 
 #define ASSERT_SYNCHRONIZED_OP(sc)	\
     KASSERT(IS_BUSY(sc) && \
 	(mtx_owned(&(sc)->sc_lock) || sc->last_op_thr == curthread), \
 	("%s: operation not synchronized.", __func__))
 
 #define PORT_LOCK(pi)			mtx_lock(&(pi)->pi_lock)
 #define PORT_UNLOCK(pi)			mtx_unlock(&(pi)->pi_lock)
 #define PORT_LOCK_ASSERT_OWNED(pi)	mtx_assert(&(pi)->pi_lock, MA_OWNED)
 #define PORT_LOCK_ASSERT_NOTOWNED(pi)	mtx_assert(&(pi)->pi_lock, MA_NOTOWNED)
 
 #define FL_LOCK(fl)			mtx_lock(&(fl)->fl_lock)
 #define FL_TRYLOCK(fl)			mtx_trylock(&(fl)->fl_lock)
 #define FL_UNLOCK(fl)			mtx_unlock(&(fl)->fl_lock)
 #define FL_LOCK_ASSERT_OWNED(fl)	mtx_assert(&(fl)->fl_lock, MA_OWNED)
 #define FL_LOCK_ASSERT_NOTOWNED(fl)	mtx_assert(&(fl)->fl_lock, MA_NOTOWNED)
 
 #define RXQ_FL_LOCK(rxq)		FL_LOCK(&(rxq)->fl)
 #define RXQ_FL_UNLOCK(rxq)		FL_UNLOCK(&(rxq)->fl)
 #define RXQ_FL_LOCK_ASSERT_OWNED(rxq)	FL_LOCK_ASSERT_OWNED(&(rxq)->fl)
 #define RXQ_FL_LOCK_ASSERT_NOTOWNED(rxq) FL_LOCK_ASSERT_NOTOWNED(&(rxq)->fl)
 
 #define EQ_LOCK(eq)			mtx_lock(&(eq)->eq_lock)
 #define EQ_TRYLOCK(eq)			mtx_trylock(&(eq)->eq_lock)
 #define EQ_UNLOCK(eq)			mtx_unlock(&(eq)->eq_lock)
 #define EQ_LOCK_ASSERT_OWNED(eq)	mtx_assert(&(eq)->eq_lock, MA_OWNED)
 #define EQ_LOCK_ASSERT_NOTOWNED(eq)	mtx_assert(&(eq)->eq_lock, MA_NOTOWNED)
 
 #define TXQ_LOCK(txq)			EQ_LOCK(&(txq)->eq)
 #define TXQ_TRYLOCK(txq)		EQ_TRYLOCK(&(txq)->eq)
 #define TXQ_UNLOCK(txq)			EQ_UNLOCK(&(txq)->eq)
 #define TXQ_LOCK_ASSERT_OWNED(txq)	EQ_LOCK_ASSERT_OWNED(&(txq)->eq)
 #define TXQ_LOCK_ASSERT_NOTOWNED(txq)	EQ_LOCK_ASSERT_NOTOWNED(&(txq)->eq)
 
 #define for_each_txq(vi, iter, q) \
 	for (q = &vi->pi->adapter->sge.txq[vi->first_txq], iter = 0; \
 	    iter < vi->ntxq; ++iter, ++q)
 #define for_each_rxq(vi, iter, q) \
 	for (q = &vi->pi->adapter->sge.rxq[vi->first_rxq], iter = 0; \
 	    iter < vi->nrxq; ++iter, ++q)
 #define for_each_ofld_txq(vi, iter, q) \
 	for (q = &vi->pi->adapter->sge.ofld_txq[vi->first_ofld_txq], iter = 0; \
 	    iter < vi->nofldtxq; ++iter, ++q)
 #define for_each_ofld_rxq(vi, iter, q) \
 	for (q = &vi->pi->adapter->sge.ofld_rxq[vi->first_ofld_rxq], iter = 0; \
 	    iter < vi->nofldrxq; ++iter, ++q)
 #define for_each_nm_txq(vi, iter, q) \
 	for (q = &vi->pi->adapter->sge.nm_txq[vi->first_nm_txq], iter = 0; \
 	    iter < vi->nnmtxq; ++iter, ++q)
 #define for_each_nm_rxq(vi, iter, q) \
 	for (q = &vi->pi->adapter->sge.nm_rxq[vi->first_nm_rxq], iter = 0; \
 	    iter < vi->nnmrxq; ++iter, ++q)
 #define for_each_vi(_pi, _iter, _vi) \
 	for ((_vi) = (_pi)->vi, (_iter) = 0; (_iter) < (_pi)->nvi; \
 	     ++(_iter), ++(_vi))
 
 #define IDXINCR(idx, incr, wrap) do { \
 	idx = wrap - idx > incr ? idx + incr : incr - (wrap - idx); \
 } while (0)
 #define IDXDIFF(head, tail, wrap) \
 	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
 
 /* One for errors, one for firmware events */
 #define T4_EXTRA_INTR 2
 
 /* One for firmware events */
 #define T4VF_EXTRA_INTR 1
 
 static inline int
 forwarding_intr_to_fwq(struct adapter *sc)
 {
 
 	return (sc->intr_count == 1);
 }
 
 static inline uint32_t
 t4_read_reg(struct adapter *sc, uint32_t reg)
 {
 
 	return bus_space_read_4(sc->bt, sc->bh, reg);
 }
 
 static inline void
 t4_write_reg(struct adapter *sc, uint32_t reg, uint32_t val)
 {
 
 	bus_space_write_4(sc->bt, sc->bh, reg, val);
 }
 
 static inline uint64_t
 t4_read_reg64(struct adapter *sc, uint32_t reg)
 {
 
 #ifdef __LP64__
 	return bus_space_read_8(sc->bt, sc->bh, reg);
 #else
 	return (uint64_t)bus_space_read_4(sc->bt, sc->bh, reg) +
 	    ((uint64_t)bus_space_read_4(sc->bt, sc->bh, reg + 4) << 32);
 
 #endif
 }
 
 static inline void
 t4_write_reg64(struct adapter *sc, uint32_t reg, uint64_t val)
 {
 
 #ifdef __LP64__
 	bus_space_write_8(sc->bt, sc->bh, reg, val);
 #else
 	bus_space_write_4(sc->bt, sc->bh, reg, val);
 	bus_space_write_4(sc->bt, sc->bh, reg + 4, val>> 32);
 #endif
 }
 
 static inline void
 t4_os_pci_read_cfg1(struct adapter *sc, int reg, uint8_t *val)
 {
 
 	*val = pci_read_config(sc->dev, reg, 1);
 }
 
 static inline void
 t4_os_pci_write_cfg1(struct adapter *sc, int reg, uint8_t val)
 {
 
 	pci_write_config(sc->dev, reg, val, 1);
 }
 
 static inline void
 t4_os_pci_read_cfg2(struct adapter *sc, int reg, uint16_t *val)
 {
 
 	*val = pci_read_config(sc->dev, reg, 2);
 }
 
 static inline void
 t4_os_pci_write_cfg2(struct adapter *sc, int reg, uint16_t val)
 {
 
 	pci_write_config(sc->dev, reg, val, 2);
 }
 
 static inline void
 t4_os_pci_read_cfg4(struct adapter *sc, int reg, uint32_t *val)
 {
 
 	*val = pci_read_config(sc->dev, reg, 4);
 }
 
 static inline void
 t4_os_pci_write_cfg4(struct adapter *sc, int reg, uint32_t val)
 {
 
 	pci_write_config(sc->dev, reg, val, 4);
 }
 
 static inline struct port_info *
 adap2pinfo(struct adapter *sc, int idx)
 {
 
 	return (sc->port[idx]);
 }
 
 static inline void
 t4_os_set_hw_addr(struct port_info *pi, uint8_t hw_addr[])
 {
 
 	bcopy(hw_addr, pi->vi[0].hw_addr, ETHER_ADDR_LEN);
 }
 
 static inline int
 tx_resume_threshold(struct sge_eq *eq)
 {
 
 	/* not quite the same as qsize / 4, but this will do. */
 	return (eq->sidx / 4);
 }
 
 static inline int
 t4_use_ldst(struct adapter *sc)
 {
 
 #ifdef notyet
 	return (sc->flags & FW_OK || !sc->use_bd);
 #else
 	return (0);
 #endif
 }
 
 static inline void
 CH_DUMP_MBOX(struct adapter *sc, int mbox, const int reg,
     const char *msg, const __be64 *const p, const bool err)
 {
 
 	if (!(sc->debug_flags & DF_DUMP_MBOX) && !err)
 		return;
 	if (p != NULL) {
 		log(err ? LOG_ERR : LOG_DEBUG,
 		    "%s: mbox %u %s %016llx %016llx %016llx %016llx "
 		    "%016llx %016llx %016llx %016llx\n",
 		    device_get_nameunit(sc->dev), mbox, msg,
 		    (long long)be64_to_cpu(p[0]), (long long)be64_to_cpu(p[1]),
 		    (long long)be64_to_cpu(p[2]), (long long)be64_to_cpu(p[3]),
 		    (long long)be64_to_cpu(p[4]), (long long)be64_to_cpu(p[5]),
 		    (long long)be64_to_cpu(p[6]), (long long)be64_to_cpu(p[7]));
 	} else {
 		log(err ? LOG_ERR : LOG_DEBUG,
 		    "%s: mbox %u %s %016llx %016llx %016llx %016llx "
 		    "%016llx %016llx %016llx %016llx\n",
 		    device_get_nameunit(sc->dev), mbox, msg,
 		    (long long)t4_read_reg64(sc, reg),
 		    (long long)t4_read_reg64(sc, reg + 8),
 		    (long long)t4_read_reg64(sc, reg + 16),
 		    (long long)t4_read_reg64(sc, reg + 24),
 		    (long long)t4_read_reg64(sc, reg + 32),
 		    (long long)t4_read_reg64(sc, reg + 40),
 		    (long long)t4_read_reg64(sc, reg + 48),
 		    (long long)t4_read_reg64(sc, reg + 56));
 	}
 }
 
 /* t4_main.c */
 extern int t4_ntxq;
 extern int t4_nrxq;
 extern int t4_intr_types;
 extern int t4_tmr_idx;
 extern int t4_pktc_idx;
 extern unsigned int t4_qsize_rxq;
 extern unsigned int t4_qsize_txq;
 extern device_method_t cxgbe_methods[];
 
 int t4_os_find_pci_capability(struct adapter *, int);
 int t4_os_pci_save_state(struct adapter *);
 int t4_os_pci_restore_state(struct adapter *);
 void t4_os_portmod_changed(struct port_info *);
 void t4_os_link_changed(struct port_info *);
 void t4_iterate(void (*)(struct adapter *, void *), void *);
 void t4_init_devnames(struct adapter *);
 void t4_add_adapter(struct adapter *);
 void t4_aes_getdeckey(void *, const void *, unsigned int);
 int t4_detach_common(device_t);
 int t4_map_bars_0_and_4(struct adapter *);
 int t4_map_bar_2(struct adapter *);
 int t4_setup_intr_handlers(struct adapter *);
 void t4_sysctls(struct adapter *);
 int begin_synchronized_op(struct adapter *, struct vi_info *, int, char *);
 void doom_vi(struct adapter *, struct vi_info *);
 void end_synchronized_op(struct adapter *, int);
 int update_mac_settings(struct ifnet *, int);
 int adapter_full_init(struct adapter *);
 int adapter_full_uninit(struct adapter *);
 uint64_t cxgbe_get_counter(struct ifnet *, ift_counter);
 int vi_full_init(struct vi_info *);
 int vi_full_uninit(struct vi_info *);
 void vi_sysctls(struct vi_info *);
 void vi_tick(void *);
 int rw_via_memwin(struct adapter *, int, uint32_t, uint32_t *, int, int);
 int alloc_atid_tab(struct tid_info *, int);
 void free_atid_tab(struct tid_info *);
 int alloc_atid(struct adapter *, void *);
 void *lookup_atid(struct adapter *, int);
 void free_atid(struct adapter *, int);
 void release_tid(struct adapter *, int, struct sge_wrq *);
 int cxgbe_media_change(struct ifnet *);
 void cxgbe_media_status(struct ifnet *, struct ifmediareq *);
 bool t4_os_dump_cimla(struct adapter *, int, bool);
 void t4_os_dump_devlog(struct adapter *);
 
 #ifdef DEV_NETMAP
 /* t4_netmap.c */
 struct sge_nm_rxq;
 void cxgbe_nm_attach(struct vi_info *);
 void cxgbe_nm_detach(struct vi_info *);
 void service_nm_rxq(struct sge_nm_rxq *);
 #endif
 
 /* t4_sge.c */
 void t4_sge_modload(void);
 void t4_sge_modunload(void);
 uint64_t t4_sge_extfree_refs(void);
 void t4_tweak_chip_settings(struct adapter *);
 int t4_read_chip_settings(struct adapter *);
 int t4_create_dma_tag(struct adapter *);
 void t4_sge_sysctls(struct adapter *, struct sysctl_ctx_list *,
     struct sysctl_oid_list *);
 int t4_destroy_dma_tag(struct adapter *);
 int t4_setup_adapter_queues(struct adapter *);
 int t4_teardown_adapter_queues(struct adapter *);
 int t4_setup_vi_queues(struct vi_info *);
 int t4_teardown_vi_queues(struct vi_info *);
 void t4_intr_all(void *);
 void t4_intr(void *);
 #ifdef DEV_NETMAP
 void t4_nm_intr(void *);
 void t4_vi_intr(void *);
 #endif
 void t4_intr_err(void *);
 void t4_intr_evt(void *);
 void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *);
 void t4_update_fl_bufsize(struct ifnet *);
 struct mbuf *alloc_wr_mbuf(int, int);
 int parse_pkt(struct adapter *, struct mbuf **);
 void *start_wrq_wr(struct sge_wrq *, int, struct wrq_cookie *);
 void commit_wrq_wr(struct sge_wrq *, void *, struct wrq_cookie *);
 int tnl_cong(struct port_info *, int);
 void t4_register_an_handler(an_handler_t);
 void t4_register_fw_msg_handler(int, fw_msg_handler_t);
 void t4_register_cpl_handler(int, cpl_handler_t);
 void t4_register_shared_cpl_handler(int, cpl_handler_t, int);
 #ifdef RATELIMIT
 int ethofld_transmit(struct ifnet *, struct mbuf *);
 void send_etid_flush_wr(struct cxgbe_snd_tag *);
 #endif
 
 /* t4_tracer.c */
 struct t4_tracer;
 void t4_tracer_modload(void);
 void t4_tracer_modunload(void);
 void t4_tracer_port_detach(struct adapter *);
 int t4_get_tracer(struct adapter *, struct t4_tracer *);
 int t4_set_tracer(struct adapter *, struct t4_tracer *);
 int t4_trace_pkt(struct sge_iq *, const struct rss_header *, struct mbuf *);
 int t5_trace_pkt(struct sge_iq *, const struct rss_header *, struct mbuf *);
 
 /* t4_sched.c */
 int t4_set_sched_class(struct adapter *, struct t4_sched_params *);
 int t4_set_sched_queue(struct adapter *, struct t4_sched_queue *);
 int t4_init_tx_sched(struct adapter *);
 int t4_free_tx_sched(struct adapter *);
 void t4_update_tx_sched(struct adapter *);
 int t4_reserve_cl_rl_kbps(struct adapter *, int, u_int, int *);
 void t4_release_cl_rl(struct adapter *, int, int);
 int sysctl_tc(SYSCTL_HANDLER_ARGS);
 int sysctl_tc_params(SYSCTL_HANDLER_ARGS);
 #ifdef RATELIMIT
 void t4_init_etid_table(struct adapter *);
 void t4_free_etid_table(struct adapter *);
 struct cxgbe_snd_tag *lookup_etid(struct adapter *, int);
 int cxgbe_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *,
     struct m_snd_tag **);
 int cxgbe_snd_tag_modify(struct m_snd_tag *, union if_snd_tag_modify_params *);
 int cxgbe_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *);
 void cxgbe_snd_tag_free(struct m_snd_tag *);
 void cxgbe_snd_tag_free_locked(struct cxgbe_snd_tag *);
 #endif
 
 /* t4_filter.c */
 int get_filter_mode(struct adapter *, uint32_t *);
 int set_filter_mode(struct adapter *, uint32_t);
 int get_filter(struct adapter *, struct t4_filter *);
 int set_filter(struct adapter *, struct t4_filter *);
 int del_filter(struct adapter *, struct t4_filter *);
 int t4_filter_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *);
 int t4_hashfilter_ao_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *);
 int t4_hashfilter_tcb_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *);
 int t4_del_hashfilter_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *);
 void free_hftid_hash(struct tid_info *);
 
 static inline struct wrqe *
 alloc_wrqe(int wr_len, struct sge_wrq *wrq)
 {
 	int len = offsetof(struct wrqe, wr) + wr_len;
 	struct wrqe *wr;
 
 	wr = malloc(len, M_CXGBE, M_NOWAIT);
 	if (__predict_false(wr == NULL))
 		return (NULL);
 	wr->wr_len = wr_len;
 	wr->wrq = wrq;
 	return (wr);
 }
 
 static inline void *
 wrtod(struct wrqe *wr)
 {
 	return (&wr->wr[0]);
 }
 
 static inline void
 free_wrqe(struct wrqe *wr)
 {
 	free(wr, M_CXGBE);
 }
 
 static inline void
 t4_wrq_tx(struct adapter *sc, struct wrqe *wr)
 {
 	struct sge_wrq *wrq = wr->wrq;
 
 	TXQ_LOCK(wrq);
 	t4_wrq_tx_locked(sc, wrq, wr);
 	TXQ_UNLOCK(wrq);
 }
 
 static inline int
 read_via_memwin(struct adapter *sc, int idx, uint32_t addr, uint32_t *val,
     int len)
 {
 
 	return (rw_via_memwin(sc, idx, addr, val, len, 0));
 }
 
 static inline int
 write_via_memwin(struct adapter *sc, int idx, uint32_t addr,
     const uint32_t *val, int len)
 {
 
 	return (rw_via_memwin(sc, idx, addr, (void *)(uintptr_t)val, len, 1));
 }
 #endif
Index: projects/capsicum-test/sys/dev/cxgbe/offload.h
===================================================================
--- projects/capsicum-test/sys/dev/cxgbe/offload.h	(revision 345709)
+++ projects/capsicum-test/sys/dev/cxgbe/offload.h	(revision 345710)
@@ -1,245 +1,246 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2010 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 #ifndef __T4_OFFLOAD_H__
 #define __T4_OFFLOAD_H__
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/condvar.h>
 
 #define INIT_ULPTX_WRH(w, wrlen, atomic, tid) do { \
 	(w)->wr_hi = htonl(V_FW_WR_OP(FW_ULPTX_WR) | V_FW_WR_ATOMIC(atomic)); \
 	(w)->wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \
 			       V_FW_WR_FLOWID(tid)); \
 	(w)->wr_lo = cpu_to_be64(0); \
 } while (0)
 
 #define INIT_ULPTX_WR(w, wrlen, atomic, tid) \
     INIT_ULPTX_WRH(&((w)->wr), wrlen, atomic, tid)
 
 #define INIT_TP_WR(w, tid) do { \
 	(w)->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | \
                               V_FW_WR_IMMDLEN(sizeof(*w) - sizeof(w->wr))); \
 	(w)->wr.wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(sizeof(*w), 16)) | \
                                V_FW_WR_FLOWID(tid)); \
 	(w)->wr.wr_lo = cpu_to_be64(0); \
 } while (0)
 
 #define INIT_TP_WR_MIT_CPL(w, cpl, tid) do { \
 	INIT_TP_WR(w, tid); \
 	OPCODE_TID(w) = htonl(MK_OPCODE_TID(cpl, tid)); \
 } while (0)
 
 TAILQ_HEAD(stid_head, stid_region);
 struct listen_ctx;
 
 struct stid_region {
 	TAILQ_ENTRY(stid_region) link;
 	u_int used;	/* # of stids used by this region */
 	u_int free;	/* # of contiguous stids free right after this region */
 };
 
 /*
  * Max # of ATIDs.  The absolute HW max is 14b (enough for 16K) but we reserve
  * the upper 3b for use as a cookie to demux the reply.
  */
 #define MAX_ATIDS 2048U
 
 union aopen_entry {
 	void *data;
 	union aopen_entry *next;
 };
 
 /* cxgbe_snd_tag flags */
 enum {
 	EO_FLOWC_PENDING	= (1 << 0),	/* flowc needs to be sent */
 	EO_FLOWC_RPL_PENDING	= (1 << 1),	/* flowc credits due back */
 	EO_SND_TAG_REF		= (1 << 2),	/* kernel has a ref on us */
 	EO_FLUSH_RPL_PENDING	= (1 << 3),	/* credit flush rpl due back */
 };
 
 struct cxgbe_snd_tag {
 	struct m_snd_tag com;
 	struct adapter *adapter;
 	u_int flags;
 	struct mtx lock;
 	int port_id;
 	int etid;
 	struct mbufq pending_tx, pending_fwack;
 	int plen;
 	struct sge_wrq *eo_txq;
 	uint32_t ctrl0;
 	uint16_t iqid;
 	int8_t schedcl;
 	uint64_t max_rate;      /* in bytes/s */
 	uint8_t tx_total;	/* total tx WR credits (in 16B units) */
 	uint8_t tx_credits;	/* tx WR credits (in 16B units) available */
 	uint8_t tx_nocompl;	/* tx WR credits since last compl request */
 	uint8_t ncompl;		/* # of completions outstanding. */
 };
 
 static inline struct cxgbe_snd_tag *
 mst_to_cst(struct m_snd_tag *t)
 {
 
 	return (__containerof(t, struct cxgbe_snd_tag, com));
 }
 
 union etid_entry {
 	struct cxgbe_snd_tag *cst;
 	union etid_entry *next;
 };
 
 /*
  * Holds the size, base address, start, end, etc. of various types of TIDs.  The
  * tables themselves are allocated dynamically.
  */
 struct tid_info {
 	u_int nstids;
 	u_int stid_base;
 
 	u_int natids;
 
 	u_int nftids;
 	u_int ftid_base;
 	u_int ftid_end;
 
 	u_int nhpftids;
 	u_int hpftid_base;
 	u_int hpftid_end;
 
 	u_int ntids;
 	u_int tid_base;
 
 	u_int netids;
 	u_int etid_base;
 	u_int etid_end;
 
 	struct mtx stid_lock __aligned(CACHE_LINE_SIZE);
 	struct listen_ctx **stid_tab;
 	u_int stids_in_use;
 	u_int nstids_free_head;	/* # of available stids at the beginning */
 	struct stid_head stids;
 
 	struct mtx atid_lock __aligned(CACHE_LINE_SIZE);
 	union aopen_entry *atid_tab;
 	union aopen_entry *afree;
 	u_int atids_in_use;
 
 	/* High priority filters and normal filters share the lock and cv. */
 	struct mtx ftid_lock __aligned(CACHE_LINE_SIZE);
 	struct cv ftid_cv;
 	struct filter_entry *ftid_tab;
 	struct filter_entry *hpftid_tab;
 	u_int ftids_in_use;
 	u_int hpftids_in_use;
 
 	/*
 	 * hashfilter and TOE are mutually exclusive and both use ntids and
 	 * tids_in_use.  The lock and cv are used only by hashfilter.
 	 */
 	struct mtx hftid_lock __aligned(CACHE_LINE_SIZE);
 	struct cv hftid_cv;
 	void **tid_tab;
 	u_int tids_in_use;
 
 	void *hftid_hash_4t;	/* LIST_HEAD(, filter_entry) *hftid_hash_4t; */
 	u_long hftid_4t_mask;
 	void *hftid_hash_tid;	/* LIST_HEAD(, filter_entry) *hftid_hash_tid; */
 	u_long hftid_tid_mask;
 
 	struct mtx etid_lock __aligned(CACHE_LINE_SIZE);
 	union etid_entry *etid_tab;
 	union etid_entry *efree;
 	u_int etids_in_use;
 };
 
 struct t4_range {
 	u_int start;
 	u_int size;
 };
 
 struct t4_virt_res {                      /* virtualized HW resources */
 	struct t4_range ddp;
 	struct t4_range iscsi;
 	struct t4_range stag;
 	struct t4_range rq;
 	struct t4_range pbl;
 	struct t4_range qp;
 	struct t4_range cq;
 	struct t4_range srq;
 	struct t4_range ocq;
 	struct t4_range l2t;
 	struct t4_range key;
 };
 
 enum {
 	ULD_TOM = 0,
 	ULD_IWARP,
 	ULD_ISCSI,
 	ULD_MAX = ULD_ISCSI
 };
 
 struct adapter;
 struct port_info;
 struct uld_info {
 	SLIST_ENTRY(uld_info) link;
 	int refcount;
 	int uld_id;
 	int (*activate)(struct adapter *);
 	int (*deactivate)(struct adapter *);
 };
 
 struct tom_tunables {
 	int cong_algorithm;
 	int sndbuf;
 	int ddp;
 	int rx_coalesce;
 	int tls;
 	int *tls_rx_ports;
 	int num_tls_rx_ports;
 	int tx_align;
 	int tx_zcopy;
 	int cop_managed_offloading;
+	int autorcvbuf_inc;
 };
 /* iWARP driver tunables */
 struct iw_tunables {
 	int wc_en;
 };
 #ifdef TCP_OFFLOAD
 int t4_register_uld(struct uld_info *);
 int t4_unregister_uld(struct uld_info *);
 int t4_activate_uld(struct adapter *, int);
 int t4_deactivate_uld(struct adapter *, int);
 int uld_active(struct adapter *, int);
 #endif
 #endif
Index: projects/capsicum-test/sys/dev/cxgbe/t4_main.c
===================================================================
--- projects/capsicum-test/sys/dev/cxgbe/t4_main.c	(revision 345709)
+++ projects/capsicum-test/sys/dev/cxgbe/t4_main.c	(revision 345710)
@@ -1,10841 +1,10849 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/priv.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/taskqueue.h>
 #include <sys/pciio.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pci_private.h>
 #include <sys/firmware.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_dl.h>
 #include <net/if_vlan_var.h>
 #ifdef RSS
 #include <net/rss_config.h>
 #endif
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #if defined(__i386__) || defined(__amd64__)
 #include <machine/md_var.h>
 #include <machine/cputypes.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #endif
 #include <crypto/rijndael/rijndael.h>
 #ifdef DDB
 #include <ddb/ddb.h>
 #include <ddb/db_lex.h>
 #endif
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "cudbg/cudbg.h"
 #include "t4_clip.h"
 #include "t4_ioctl.h"
 #include "t4_l2t.h"
 #include "t4_mp_ring.h"
 #include "t4_if.h"
 #include "t4_smt.h"
 
 /* T4 bus driver interface */
 static int t4_probe(device_t);
 static int t4_attach(device_t);
 static int t4_detach(device_t);
 static int t4_child_location_str(device_t, device_t, char *, size_t);
 static int t4_ready(device_t);
 static int t4_read_port_device(device_t, int, device_t *);
 static device_method_t t4_methods[] = {
 	DEVMETHOD(device_probe,		t4_probe),
 	DEVMETHOD(device_attach,	t4_attach),
 	DEVMETHOD(device_detach,	t4_detach),
 
 	DEVMETHOD(bus_child_location_str, t4_child_location_str),
 
 	DEVMETHOD(t4_is_main_ready,	t4_ready),
 	DEVMETHOD(t4_read_port_device,	t4_read_port_device),
 
 	DEVMETHOD_END
 };
 static driver_t t4_driver = {
 	"t4nex",
 	t4_methods,
 	sizeof(struct adapter)
 };
 
 
 /* T4 port (cxgbe) interface */
 static int cxgbe_probe(device_t);
 static int cxgbe_attach(device_t);
 static int cxgbe_detach(device_t);
 device_method_t cxgbe_methods[] = {
 	DEVMETHOD(device_probe,		cxgbe_probe),
 	DEVMETHOD(device_attach,	cxgbe_attach),
 	DEVMETHOD(device_detach,	cxgbe_detach),
 	{ 0, 0 }
 };
 static driver_t cxgbe_driver = {
 	"cxgbe",
 	cxgbe_methods,
 	sizeof(struct port_info)
 };
 
 /* T4 VI (vcxgbe) interface */
 static int vcxgbe_probe(device_t);
 static int vcxgbe_attach(device_t);
 static int vcxgbe_detach(device_t);
 static device_method_t vcxgbe_methods[] = {
 	DEVMETHOD(device_probe,		vcxgbe_probe),
 	DEVMETHOD(device_attach,	vcxgbe_attach),
 	DEVMETHOD(device_detach,	vcxgbe_detach),
 	{ 0, 0 }
 };
 static driver_t vcxgbe_driver = {
 	"vcxgbe",
 	vcxgbe_methods,
 	sizeof(struct vi_info)
 };
 
 static d_ioctl_t t4_ioctl;
 
 static struct cdevsw t4_cdevsw = {
        .d_version = D_VERSION,
        .d_ioctl = t4_ioctl,
        .d_name = "t4nex",
 };
 
 /* T5 bus driver interface */
 static int t5_probe(device_t);
 static device_method_t t5_methods[] = {
 	DEVMETHOD(device_probe,		t5_probe),
 	DEVMETHOD(device_attach,	t4_attach),
 	DEVMETHOD(device_detach,	t4_detach),
 
 	DEVMETHOD(bus_child_location_str, t4_child_location_str),
 
 	DEVMETHOD(t4_is_main_ready,	t4_ready),
 	DEVMETHOD(t4_read_port_device,	t4_read_port_device),
 
 	DEVMETHOD_END
 };
 static driver_t t5_driver = {
 	"t5nex",
 	t5_methods,
 	sizeof(struct adapter)
 };
 
 
 /* T5 port (cxl) interface */
 static driver_t cxl_driver = {
 	"cxl",
 	cxgbe_methods,
 	sizeof(struct port_info)
 };
 
 /* T5 VI (vcxl) interface */
 static driver_t vcxl_driver = {
 	"vcxl",
 	vcxgbe_methods,
 	sizeof(struct vi_info)
 };
 
 /* T6 bus driver interface */
 static int t6_probe(device_t);
 static device_method_t t6_methods[] = {
 	DEVMETHOD(device_probe,		t6_probe),
 	DEVMETHOD(device_attach,	t4_attach),
 	DEVMETHOD(device_detach,	t4_detach),
 
 	DEVMETHOD(bus_child_location_str, t4_child_location_str),
 
 	DEVMETHOD(t4_is_main_ready,	t4_ready),
 	DEVMETHOD(t4_read_port_device,	t4_read_port_device),
 
 	DEVMETHOD_END
 };
 static driver_t t6_driver = {
 	"t6nex",
 	t6_methods,
 	sizeof(struct adapter)
 };
 
 
 /* T6 port (cc) interface */
 static driver_t cc_driver = {
 	"cc",
 	cxgbe_methods,
 	sizeof(struct port_info)
 };
 
 /* T6 VI (vcc) interface */
 static driver_t vcc_driver = {
 	"vcc",
 	vcxgbe_methods,
 	sizeof(struct vi_info)
 };
 
 /* ifnet interface */
 static void cxgbe_init(void *);
 static int cxgbe_ioctl(struct ifnet *, unsigned long, caddr_t);
 static int cxgbe_transmit(struct ifnet *, struct mbuf *);
 static void cxgbe_qflush(struct ifnet *);
 
 MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4/T5 Ethernet driver and services");
 
 /*
  * Correct lock order when you need to acquire multiple locks is t4_list_lock,
  * then ADAPTER_LOCK, then t4_uld_list_lock.
  */
 static struct sx t4_list_lock;
 SLIST_HEAD(, adapter) t4_list;
 #ifdef TCP_OFFLOAD
 static struct sx t4_uld_list_lock;
 SLIST_HEAD(, uld_info) t4_uld_list;
 #endif
 
 /*
  * Tunables.  See tweak_tunables() too.
  *
  * Each tunable is set to a default value here if it's known at compile-time.
  * Otherwise it is set to -n as an indication to tweak_tunables() that it should
  * provide a reasonable default (upto n) when the driver is loaded.
  *
  * Tunables applicable to both T4 and T5 are under hw.cxgbe.  Those specific to
  * T5 are under hw.cxl.
  */
 SYSCTL_NODE(_hw, OID_AUTO, cxgbe, CTLFLAG_RD, 0, "cxgbe(4) parameters");
 SYSCTL_NODE(_hw, OID_AUTO, cxl, CTLFLAG_RD, 0, "cxgbe(4) T5+ parameters");
 SYSCTL_NODE(_hw_cxgbe, OID_AUTO, toe, CTLFLAG_RD, 0, "cxgbe(4) TOE parameters");
 
 /*
  * Number of queues for tx and rx, NIC and offload.
  */
 #define NTXQ 16
 int t4_ntxq = -NTXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, ntxq, CTLFLAG_RDTUN, &t4_ntxq, 0,
     "Number of TX queues per port");
 TUNABLE_INT("hw.cxgbe.ntxq10g", &t4_ntxq);	/* Old name, undocumented */
 
 #define NRXQ 8
 int t4_nrxq = -NRXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nrxq, CTLFLAG_RDTUN, &t4_nrxq, 0,
     "Number of RX queues per port");
 TUNABLE_INT("hw.cxgbe.nrxq10g", &t4_nrxq);	/* Old name, undocumented */
 
 #define NTXQ_VI 1
 static int t4_ntxq_vi = -NTXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, ntxq_vi, CTLFLAG_RDTUN, &t4_ntxq_vi, 0,
     "Number of TX queues per VI");
 
 #define NRXQ_VI 1
 static int t4_nrxq_vi = -NRXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nrxq_vi, CTLFLAG_RDTUN, &t4_nrxq_vi, 0,
     "Number of RX queues per VI");
 
 static int t4_rsrv_noflowq = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, rsrv_noflowq, CTLFLAG_RDTUN, &t4_rsrv_noflowq,
     0, "Reserve TX queue 0 of each VI for non-flowid packets");
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 #define NOFLDTXQ 8
 static int t4_nofldtxq = -NOFLDTXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq, CTLFLAG_RDTUN, &t4_nofldtxq, 0,
     "Number of offload TX queues per port");
 
 #define NOFLDRXQ 2
 static int t4_nofldrxq = -NOFLDRXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq, CTLFLAG_RDTUN, &t4_nofldrxq, 0,
     "Number of offload RX queues per port");
 
 #define NOFLDTXQ_VI 1
 static int t4_nofldtxq_vi = -NOFLDTXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq_vi, CTLFLAG_RDTUN, &t4_nofldtxq_vi, 0,
     "Number of offload TX queues per VI");
 
 #define NOFLDRXQ_VI 1
 static int t4_nofldrxq_vi = -NOFLDRXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq_vi, CTLFLAG_RDTUN, &t4_nofldrxq_vi, 0,
     "Number of offload RX queues per VI");
 
 #define TMR_IDX_OFLD 1
 int t4_tmr_idx_ofld = TMR_IDX_OFLD;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_timer_idx_ofld, CTLFLAG_RDTUN,
     &t4_tmr_idx_ofld, 0, "Holdoff timer index for offload queues");
 
 #define PKTC_IDX_OFLD (-1)
 int t4_pktc_idx_ofld = PKTC_IDX_OFLD;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_pktc_idx_ofld, CTLFLAG_RDTUN,
     &t4_pktc_idx_ofld, 0, "holdoff packet counter index for offload queues");
 
 /* 0 means chip/fw default, non-zero number is value in microseconds */
 static u_long t4_toe_keepalive_idle = 0;
 SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, keepalive_idle, CTLFLAG_RDTUN,
     &t4_toe_keepalive_idle, 0, "TOE keepalive idle timer (us)");
 
 /* 0 means chip/fw default, non-zero number is value in microseconds */
 static u_long t4_toe_keepalive_interval = 0;
 SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, keepalive_interval, CTLFLAG_RDTUN,
     &t4_toe_keepalive_interval, 0, "TOE keepalive interval timer (us)");
 
 /* 0 means chip/fw default, non-zero number is # of keepalives before abort */
 static int t4_toe_keepalive_count = 0;
 SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, keepalive_count, CTLFLAG_RDTUN,
     &t4_toe_keepalive_count, 0, "Number of TOE keepalive probes before abort");
 
 /* 0 means chip/fw default, non-zero number is value in microseconds */
 static u_long t4_toe_rexmt_min = 0;
 SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, rexmt_min, CTLFLAG_RDTUN,
     &t4_toe_rexmt_min, 0, "Minimum TOE retransmit interval (us)");
 
 /* 0 means chip/fw default, non-zero number is value in microseconds */
 static u_long t4_toe_rexmt_max = 0;
 SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, rexmt_max, CTLFLAG_RDTUN,
     &t4_toe_rexmt_max, 0, "Maximum TOE retransmit interval (us)");
 
 /* 0 means chip/fw default, non-zero number is # of rexmt before abort */
 static int t4_toe_rexmt_count = 0;
 SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, rexmt_count, CTLFLAG_RDTUN,
     &t4_toe_rexmt_count, 0, "Number of TOE retransmissions before abort");
 
 /* -1 means chip/fw default, other values are raw backoff values to use */
 static int t4_toe_rexmt_backoff[16] = {
 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
 };
 SYSCTL_NODE(_hw_cxgbe_toe, OID_AUTO, rexmt_backoff, CTLFLAG_RD, 0,
     "cxgbe(4) TOE retransmit backoff values");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 0, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[0], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 1, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[1], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 2, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[2], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 3, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[3], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 4, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[4], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 5, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[5], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 6, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[6], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 7, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[7], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 8, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[8], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 9, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[9], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 10, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[10], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 11, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[11], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 12, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[12], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 13, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[13], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 14, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[14], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 15, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[15], 0, "");
 #endif
 
 #ifdef DEV_NETMAP
 #define NNMTXQ_VI 2
 static int t4_nnmtxq_vi = -NNMTXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmtxq_vi, CTLFLAG_RDTUN, &t4_nnmtxq_vi, 0,
     "Number of netmap TX queues per VI");
 
 #define NNMRXQ_VI 2
 static int t4_nnmrxq_vi = -NNMRXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmrxq_vi, CTLFLAG_RDTUN, &t4_nnmrxq_vi, 0,
     "Number of netmap RX queues per VI");
 #endif
 
 /*
  * Holdoff parameters for ports.
  */
 #define TMR_IDX 1
 int t4_tmr_idx = TMR_IDX;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_timer_idx, CTLFLAG_RDTUN, &t4_tmr_idx,
     0, "Holdoff timer index");
 TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_10G", &t4_tmr_idx);	/* Old name */
 
 #define PKTC_IDX (-1)
 int t4_pktc_idx = PKTC_IDX;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_pktc_idx, CTLFLAG_RDTUN, &t4_pktc_idx,
     0, "Holdoff packet counter index");
 TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_10G", &t4_pktc_idx);	/* Old name */
 
 /*
  * Size (# of entries) of each tx and rx queue.
  */
 unsigned int t4_qsize_txq = TX_EQ_QSIZE;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, qsize_txq, CTLFLAG_RDTUN, &t4_qsize_txq, 0,
     "Number of descriptors in each TX queue");
 
 unsigned int t4_qsize_rxq = RX_IQ_QSIZE;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, qsize_rxq, CTLFLAG_RDTUN, &t4_qsize_rxq, 0,
     "Number of descriptors in each RX queue");
 
 /*
  * Interrupt types allowed (bits 0, 1, 2 = INTx, MSI, MSI-X respectively).
  */
 int t4_intr_types = INTR_MSIX | INTR_MSI | INTR_INTX;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, interrupt_types, CTLFLAG_RDTUN, &t4_intr_types,
     0, "Interrupt types allowed (bit 0 = INTx, 1 = MSI, 2 = MSI-X)");
 
 /*
  * Configuration file.  All the _CF names here are special.
  */
 #define DEFAULT_CF	"default"
 #define BUILTIN_CF	"built-in"
 #define FLASH_CF	"flash"
 #define UWIRE_CF	"uwire"
 #define FPGA_CF		"fpga"
 static char t4_cfg_file[32] = DEFAULT_CF;
 SYSCTL_STRING(_hw_cxgbe, OID_AUTO, config_file, CTLFLAG_RDTUN, t4_cfg_file,
     sizeof(t4_cfg_file), "Firmware configuration file");
 
 /*
  * PAUSE settings (bit 0, 1, 2 = rx_pause, tx_pause, pause_autoneg respectively).
  * rx_pause = 1 to heed incoming PAUSE frames, 0 to ignore them.
  * tx_pause = 1 to emit PAUSE frames when the rx FIFO reaches its high water
  *            mark or when signalled to do so, 0 to never emit PAUSE.
  * pause_autoneg = 1 means PAUSE will be negotiated if possible and the
  *                 negotiated settings will override rx_pause/tx_pause.
  *                 Otherwise rx_pause/tx_pause are applied forcibly.
  */
 static int t4_pause_settings = PAUSE_RX | PAUSE_TX | PAUSE_AUTONEG;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, pause_settings, CTLFLAG_RDTUN,
     &t4_pause_settings, 0,
     "PAUSE settings (bit 0 = rx_pause, 1 = tx_pause, 2 = pause_autoneg)");
 
 /*
  * Forward Error Correction settings (bit 0, 1 = RS, BASER respectively).
  * -1 to run with the firmware default.  Same as FEC_AUTO (bit 5)
  *  0 to disable FEC.
  */
 static int t4_fec = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fec, CTLFLAG_RDTUN, &t4_fec, 0,
     "Forward Error Correction (bit 0 = RS, bit 1 = BASER_RS)");
 
 /*
  * Link autonegotiation.
  * -1 to run with the firmware default.
  *  0 to disable.
  *  1 to enable.
  */
 static int t4_autoneg = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, autoneg, CTLFLAG_RDTUN, &t4_autoneg, 0,
     "Link autonegotiation");
 
 /*
  * Firmware auto-install by driver during attach (0, 1, 2 = prohibited, allowed,
  * encouraged respectively).  '-n' is the same as 'n' except the firmware
  * version used in the checks is read from the firmware bundled with the driver.
  */
 static int t4_fw_install = 1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fw_install, CTLFLAG_RDTUN, &t4_fw_install, 0,
     "Firmware auto-install (0 = prohibited, 1 = allowed, 2 = encouraged)");
 
 /*
  * ASIC features that will be used.  Disable the ones you don't want so that the
  * chip resources aren't wasted on features that will not be used.
  */
 static int t4_nbmcaps_allowed = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nbmcaps_allowed, CTLFLAG_RDTUN,
     &t4_nbmcaps_allowed, 0, "Default NBM capabilities");
 
 static int t4_linkcaps_allowed = 0;	/* No DCBX, PPP, etc. by default */
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, linkcaps_allowed, CTLFLAG_RDTUN,
     &t4_linkcaps_allowed, 0, "Default link capabilities");
 
 static int t4_switchcaps_allowed = FW_CAPS_CONFIG_SWITCH_INGRESS |
     FW_CAPS_CONFIG_SWITCH_EGRESS;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, switchcaps_allowed, CTLFLAG_RDTUN,
     &t4_switchcaps_allowed, 0, "Default switch capabilities");
 
 #ifdef RATELIMIT
 static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC |
 	FW_CAPS_CONFIG_NIC_HASHFILTER | FW_CAPS_CONFIG_NIC_ETHOFLD;
 #else
 static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC |
 	FW_CAPS_CONFIG_NIC_HASHFILTER;
 #endif
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, niccaps_allowed, CTLFLAG_RDTUN,
     &t4_niccaps_allowed, 0, "Default NIC capabilities");
 
 static int t4_toecaps_allowed = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, toecaps_allowed, CTLFLAG_RDTUN,
     &t4_toecaps_allowed, 0, "Default TCP offload capabilities");
 
 static int t4_rdmacaps_allowed = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, rdmacaps_allowed, CTLFLAG_RDTUN,
     &t4_rdmacaps_allowed, 0, "Default RDMA capabilities");
 
 static int t4_cryptocaps_allowed = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, cryptocaps_allowed, CTLFLAG_RDTUN,
     &t4_cryptocaps_allowed, 0, "Default crypto capabilities");
 
 static int t4_iscsicaps_allowed = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, iscsicaps_allowed, CTLFLAG_RDTUN,
     &t4_iscsicaps_allowed, 0, "Default iSCSI capabilities");
 
 static int t4_fcoecaps_allowed = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fcoecaps_allowed, CTLFLAG_RDTUN,
     &t4_fcoecaps_allowed, 0, "Default FCoE capabilities");
 
 static int t5_write_combine = 0;
 SYSCTL_INT(_hw_cxl, OID_AUTO, write_combine, CTLFLAG_RDTUN, &t5_write_combine,
     0, "Use WC instead of UC for BAR2");
 
 static int t4_num_vis = 1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, num_vis, CTLFLAG_RDTUN, &t4_num_vis, 0,
     "Number of VIs per port");
 
 /*
  * PCIe Relaxed Ordering.
  * -1: driver should figure out a good value.
  * 0: disable RO.
  * 1: enable RO.
  * 2: leave RO alone.
  */
 static int pcie_relaxed_ordering = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, pcie_relaxed_ordering, CTLFLAG_RDTUN,
     &pcie_relaxed_ordering, 0,
     "PCIe Relaxed Ordering: 0 = disable, 1 = enable, 2 = leave alone");
 
 static int t4_panic_on_fatal_err = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, panic_on_fatal_err, CTLFLAG_RDTUN,
     &t4_panic_on_fatal_err, 0, "panic on fatal errors");
 
 #ifdef TCP_OFFLOAD
 /*
  * TOE tunables.
  */
 static int t4_cop_managed_offloading = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, cop_managed_offloading, CTLFLAG_RDTUN,
     &t4_cop_managed_offloading, 0,
     "COP (Connection Offload Policy) controls all TOE offload");
 #endif
 
 /* Functions used by VIs to obtain unique MAC addresses for each VI. */
 static int vi_mac_funcs[] = {
 	FW_VI_FUNC_ETH,
 	FW_VI_FUNC_OFLD,
 	FW_VI_FUNC_IWARP,
 	FW_VI_FUNC_OPENISCSI,
 	FW_VI_FUNC_OPENFCOE,
 	FW_VI_FUNC_FOISCSI,
 	FW_VI_FUNC_FOFCOE,
 };
 
 struct intrs_and_queues {
 	uint16_t intr_type;	/* INTx, MSI, or MSI-X */
 	uint16_t num_vis;	/* number of VIs for each port */
 	uint16_t nirq;		/* Total # of vectors */
 	uint16_t ntxq;		/* # of NIC txq's for each port */
 	uint16_t nrxq;		/* # of NIC rxq's for each port */
 	uint16_t nofldtxq;	/* # of TOE/ETHOFLD txq's for each port */
 	uint16_t nofldrxq;	/* # of TOE rxq's for each port */
 
 	/* The vcxgbe/vcxl interfaces use these and not the ones above. */
 	uint16_t ntxq_vi;	/* # of NIC txq's */
 	uint16_t nrxq_vi;	/* # of NIC rxq's */
 	uint16_t nofldtxq_vi;	/* # of TOE txq's */
 	uint16_t nofldrxq_vi;	/* # of TOE rxq's */
 	uint16_t nnmtxq_vi;	/* # of netmap txq's */
 	uint16_t nnmrxq_vi;	/* # of netmap rxq's */
 };
 
 static void setup_memwin(struct adapter *);
 static void position_memwin(struct adapter *, int, uint32_t);
 static int validate_mem_range(struct adapter *, uint32_t, uint32_t);
 static int fwmtype_to_hwmtype(int);
 static int validate_mt_off_len(struct adapter *, int, uint32_t, uint32_t,
     uint32_t *);
 static int fixup_devlog_params(struct adapter *);
 static int cfg_itype_and_nqueues(struct adapter *, struct intrs_and_queues *);
 static int contact_firmware(struct adapter *);
 static int partition_resources(struct adapter *);
 static int get_params__pre_init(struct adapter *);
 static int set_params__pre_init(struct adapter *);
 static int get_params__post_init(struct adapter *);
 static int set_params__post_init(struct adapter *);
 static void t4_set_desc(struct adapter *);
 static bool fixed_ifmedia(struct port_info *);
 static void build_medialist(struct port_info *);
 static void init_link_config(struct port_info *);
 static int fixup_link_config(struct port_info *);
 static int apply_link_config(struct port_info *);
 static int cxgbe_init_synchronized(struct vi_info *);
 static int cxgbe_uninit_synchronized(struct vi_info *);
 static void quiesce_txq(struct adapter *, struct sge_txq *);
 static void quiesce_wrq(struct adapter *, struct sge_wrq *);
 static void quiesce_iq(struct adapter *, struct sge_iq *);
 static void quiesce_fl(struct adapter *, struct sge_fl *);
 static int t4_alloc_irq(struct adapter *, struct irq *, int rid,
     driver_intr_t *, void *, char *);
 static int t4_free_irq(struct adapter *, struct irq *);
 static void get_regs(struct adapter *, struct t4_regdump *, uint8_t *);
 static void vi_refresh_stats(struct adapter *, struct vi_info *);
 static void cxgbe_refresh_stats(struct adapter *, struct port_info *);
 static void cxgbe_tick(void *);
 static void cxgbe_sysctls(struct port_info *);
 static int sysctl_int_array(SYSCTL_HANDLER_ARGS);
 static int sysctl_bitfield_8b(SYSCTL_HANDLER_ARGS);
 static int sysctl_bitfield_16b(SYSCTL_HANDLER_ARGS);
 static int sysctl_btphy(SYSCTL_HANDLER_ARGS);
 static int sysctl_noflowq(SYSCTL_HANDLER_ARGS);
 static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS);
 static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS);
 static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS);
 static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS);
 static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS);
 static int sysctl_fec(SYSCTL_HANDLER_ARGS);
 static int sysctl_autoneg(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS);
 static int sysctl_temperature(SYSCTL_HANDLER_ARGS);
 static int sysctl_loadavg(SYSCTL_HANDLER_ARGS);
 static int sysctl_cctrl(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS);
 static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_devlog(SYSCTL_HANDLER_ARGS);
 static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS);
 static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS);
 static int sysctl_meminfo(SYSCTL_HANDLER_ARGS);
 static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS);
 static int sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS);
 static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS);
 static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_tids(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS);
 static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_cpus(SYSCTL_HANDLER_ARGS);
 #ifdef TCP_OFFLOAD
 static int sysctl_tls_rx_ports(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_tick(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_timer(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_shift_cnt(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_backoff(SYSCTL_HANDLER_ARGS);
 static int sysctl_holdoff_tmr_idx_ofld(SYSCTL_HANDLER_ARGS);
 static int sysctl_holdoff_pktc_idx_ofld(SYSCTL_HANDLER_ARGS);
 #endif
 static int get_sge_context(struct adapter *, struct t4_sge_context *);
 static int load_fw(struct adapter *, struct t4_data *);
 static int load_cfg(struct adapter *, struct t4_data *);
 static int load_boot(struct adapter *, struct t4_bootrom *);
 static int load_bootcfg(struct adapter *, struct t4_data *);
 static int cudbg_dump(struct adapter *, struct t4_cudbg_dump *);
 static void free_offload_policy(struct t4_offload_policy *);
 static int set_offload_policy(struct adapter *, struct t4_offload_policy *);
 static int read_card_mem(struct adapter *, int, struct t4_mem_range *);
 static int read_i2c(struct adapter *, struct t4_i2c_data *);
 #ifdef TCP_OFFLOAD
 static int toe_capability(struct vi_info *, int);
 #endif
 static int mod_event(module_t, int, void *);
 static int notify_siblings(device_t, int);
 
 struct {
 	uint16_t device;
 	char *desc;
 } t4_pciids[] = {
 	{0xa000, "Chelsio Terminator 4 FPGA"},
 	{0x4400, "Chelsio T440-dbg"},
 	{0x4401, "Chelsio T420-CR"},
 	{0x4402, "Chelsio T422-CR"},
 	{0x4403, "Chelsio T440-CR"},
 	{0x4404, "Chelsio T420-BCH"},
 	{0x4405, "Chelsio T440-BCH"},
 	{0x4406, "Chelsio T440-CH"},
 	{0x4407, "Chelsio T420-SO"},
 	{0x4408, "Chelsio T420-CX"},
 	{0x4409, "Chelsio T420-BT"},
 	{0x440a, "Chelsio T404-BT"},
 	{0x440e, "Chelsio T440-LP-CR"},
 }, t5_pciids[] = {
 	{0xb000, "Chelsio Terminator 5 FPGA"},
 	{0x5400, "Chelsio T580-dbg"},
 	{0x5401,  "Chelsio T520-CR"},		/* 2 x 10G */
 	{0x5402,  "Chelsio T522-CR"},		/* 2 x 10G, 2 X 1G */
 	{0x5403,  "Chelsio T540-CR"},		/* 4 x 10G */
 	{0x5407,  "Chelsio T520-SO"},		/* 2 x 10G, nomem */
 	{0x5409,  "Chelsio T520-BT"},		/* 2 x 10GBaseT */
 	{0x540a,  "Chelsio T504-BT"},		/* 4 x 1G */
 	{0x540d,  "Chelsio T580-CR"},		/* 2 x 40G */
 	{0x540e,  "Chelsio T540-LP-CR"},	/* 4 x 10G */
 	{0x5410,  "Chelsio T580-LP-CR"},	/* 2 x 40G */
 	{0x5411,  "Chelsio T520-LL-CR"},	/* 2 x 10G */
 	{0x5412,  "Chelsio T560-CR"},		/* 1 x 40G, 2 x 10G */
 	{0x5414,  "Chelsio T580-LP-SO-CR"},	/* 2 x 40G, nomem */
 	{0x5415,  "Chelsio T502-BT"},		/* 2 x 1G */
 	{0x5418,  "Chelsio T540-BT"},		/* 4 x 10GBaseT */
 	{0x5419,  "Chelsio T540-LP-BT"},	/* 4 x 10GBaseT */
 	{0x541a,  "Chelsio T540-SO-BT"},	/* 4 x 10GBaseT, nomem */
 	{0x541b,  "Chelsio T540-SO-CR"},	/* 4 x 10G, nomem */
 
 	/* Custom */
 	{0x5483, "Custom T540-CR"},
 	{0x5484, "Custom T540-BT"},
 }, t6_pciids[] = {
 	{0xc006, "Chelsio Terminator 6 FPGA"},	/* T6 PE10K6 FPGA (PF0) */
 	{0x6400, "Chelsio T6-DBG-25"},		/* 2 x 10/25G, debug */
 	{0x6401, "Chelsio T6225-CR"},		/* 2 x 10/25G */
 	{0x6402, "Chelsio T6225-SO-CR"},	/* 2 x 10/25G, nomem */
 	{0x6403, "Chelsio T6425-CR"},		/* 4 x 10/25G */
 	{0x6404, "Chelsio T6425-SO-CR"},	/* 4 x 10/25G, nomem */
 	{0x6405, "Chelsio T6225-OCP-SO"},	/* 2 x 10/25G, nomem */
 	{0x6406, "Chelsio T62100-OCP-SO"},	/* 2 x 40/50/100G, nomem */
 	{0x6407, "Chelsio T62100-LP-CR"},	/* 2 x 40/50/100G */
 	{0x6408, "Chelsio T62100-SO-CR"},	/* 2 x 40/50/100G, nomem */
 	{0x6409, "Chelsio T6210-BT"},		/* 2 x 10GBASE-T */
 	{0x640d, "Chelsio T62100-CR"},		/* 2 x 40/50/100G */
 	{0x6410, "Chelsio T6-DBG-100"},		/* 2 x 40/50/100G, debug */
 	{0x6411, "Chelsio T6225-LL-CR"},	/* 2 x 10/25G */
 	{0x6414, "Chelsio T61100-OCP-SO"},	/* 1 x 40/50/100G, nomem */
 	{0x6415, "Chelsio T6201-BT"},		/* 2 x 1000BASE-T */
 
 	/* Custom */
 	{0x6480, "Custom T6225-CR"},
 	{0x6481, "Custom T62100-CR"},
 	{0x6482, "Custom T6225-CR"},
 	{0x6483, "Custom T62100-CR"},
 	{0x6484, "Custom T64100-CR"},
 	{0x6485, "Custom T6240-SO"},
 	{0x6486, "Custom T6225-SO-CR"},
 	{0x6487, "Custom T6225-CR"},
 };
 
 #ifdef TCP_OFFLOAD
 /*
  * service_iq_fl() has an iq and needs the fl.  Offset of fl from the iq should
  * be exactly the same for both rxq and ofld_rxq.
  */
 CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq));
 CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl));
 #endif
 CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE);
 
 static int
 t4_probe(device_t dev)
 {
 	int i;
 	uint16_t v = pci_get_vendor(dev);
 	uint16_t d = pci_get_device(dev);
 	uint8_t f = pci_get_function(dev);
 
 	if (v != PCI_VENDOR_ID_CHELSIO)
 		return (ENXIO);
 
 	/* Attach only to PF0 of the FPGA */
 	if (d == 0xa000 && f != 0)
 		return (ENXIO);
 
 	for (i = 0; i < nitems(t4_pciids); i++) {
 		if (d == t4_pciids[i].device) {
 			device_set_desc(dev, t4_pciids[i].desc);
 			return (BUS_PROBE_DEFAULT);
 		}
 	}
 
 	return (ENXIO);
 }
 
 static int
 t5_probe(device_t dev)
 {
 	int i;
 	uint16_t v = pci_get_vendor(dev);
 	uint16_t d = pci_get_device(dev);
 	uint8_t f = pci_get_function(dev);
 
 	if (v != PCI_VENDOR_ID_CHELSIO)
 		return (ENXIO);
 
 	/* Attach only to PF0 of the FPGA */
 	if (d == 0xb000 && f != 0)
 		return (ENXIO);
 
 	for (i = 0; i < nitems(t5_pciids); i++) {
 		if (d == t5_pciids[i].device) {
 			device_set_desc(dev, t5_pciids[i].desc);
 			return (BUS_PROBE_DEFAULT);
 		}
 	}
 
 	return (ENXIO);
 }
 
 static int
 t6_probe(device_t dev)
 {
 	int i;
 	uint16_t v = pci_get_vendor(dev);
 	uint16_t d = pci_get_device(dev);
 
 	if (v != PCI_VENDOR_ID_CHELSIO)
 		return (ENXIO);
 
 	for (i = 0; i < nitems(t6_pciids); i++) {
 		if (d == t6_pciids[i].device) {
 			device_set_desc(dev, t6_pciids[i].desc);
 			return (BUS_PROBE_DEFAULT);
 		}
 	}
 
 	return (ENXIO);
 }
 
 static void
 t5_attribute_workaround(device_t dev)
 {
 	device_t root_port;
 	uint32_t v;
 
 	/*
 	 * The T5 chips do not properly echo the No Snoop and Relaxed
 	 * Ordering attributes when replying to a TLP from a Root
 	 * Port.  As a workaround, find the parent Root Port and
 	 * disable No Snoop and Relaxed Ordering.  Note that this
 	 * affects all devices under this root port.
 	 */
 	root_port = pci_find_pcie_root_port(dev);
 	if (root_port == NULL) {
 		device_printf(dev, "Unable to find parent root port\n");
 		return;
 	}
 
 	v = pcie_adjust_config(root_port, PCIER_DEVICE_CTL,
 	    PCIEM_CTL_RELAXED_ORD_ENABLE | PCIEM_CTL_NOSNOOP_ENABLE, 0, 2);
 	if ((v & (PCIEM_CTL_RELAXED_ORD_ENABLE | PCIEM_CTL_NOSNOOP_ENABLE)) !=
 	    0)
 		device_printf(dev, "Disabled No Snoop/Relaxed Ordering on %s\n",
 		    device_get_nameunit(root_port));
 }
 
 static const struct devnames devnames[] = {
 	{
 		.nexus_name = "t4nex",
 		.ifnet_name = "cxgbe",
 		.vi_ifnet_name = "vcxgbe",
 		.pf03_drv_name = "t4iov",
 		.vf_nexus_name = "t4vf",
 		.vf_ifnet_name = "cxgbev"
 	}, {
 		.nexus_name = "t5nex",
 		.ifnet_name = "cxl",
 		.vi_ifnet_name = "vcxl",
 		.pf03_drv_name = "t5iov",
 		.vf_nexus_name = "t5vf",
 		.vf_ifnet_name = "cxlv"
 	}, {
 		.nexus_name = "t6nex",
 		.ifnet_name = "cc",
 		.vi_ifnet_name = "vcc",
 		.pf03_drv_name = "t6iov",
 		.vf_nexus_name = "t6vf",
 		.vf_ifnet_name = "ccv"
 	}
 };
 
 void
 t4_init_devnames(struct adapter *sc)
 {
 	int id;
 
 	id = chip_id(sc);
 	if (id >= CHELSIO_T4 && id - CHELSIO_T4 < nitems(devnames))
 		sc->names = &devnames[id - CHELSIO_T4];
 	else {
 		device_printf(sc->dev, "chip id %d is not supported.\n", id);
 		sc->names = NULL;
 	}
 }
 
 static int
 t4_ifnet_unit(struct adapter *sc, struct port_info *pi)
 {
 	const char *parent, *name;
 	long value;
 	int line, unit;
 
 	line = 0;
 	parent = device_get_nameunit(sc->dev);
 	name = sc->names->ifnet_name;
 	while (resource_find_dev(&line, name, &unit, "at", parent) == 0) {
 		if (resource_long_value(name, unit, "port", &value) == 0 &&
 		    value == pi->port_id)
 			return (unit);
 	}
 	return (-1);
 }
 
 static int
 t4_attach(device_t dev)
 {
 	struct adapter *sc;
 	int rc = 0, i, j, rqidx, tqidx, nports;
 	struct make_dev_args mda;
 	struct intrs_and_queues iaq;
 	struct sge *s;
 	uint32_t *buf;
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	int ofld_tqidx;
 #endif
 #ifdef TCP_OFFLOAD
 	int ofld_rqidx;
 #endif
 #ifdef DEV_NETMAP
 	int nm_rqidx, nm_tqidx;
 #endif
 	int num_vis;
 
 	sc = device_get_softc(dev);
 	sc->dev = dev;
 	TUNABLE_INT_FETCH("hw.cxgbe.dflags", &sc->debug_flags);
 
 	if ((pci_get_device(dev) & 0xff00) == 0x5400)
 		t5_attribute_workaround(dev);
 	pci_enable_busmaster(dev);
 	if (pci_find_cap(dev, PCIY_EXPRESS, &i) == 0) {
 		uint32_t v;
 
 		pci_set_max_read_req(dev, 4096);
 		v = pci_read_config(dev, i + PCIER_DEVICE_CTL, 2);
 		sc->params.pci.mps = 128 << ((v & PCIEM_CTL_MAX_PAYLOAD) >> 5);
 		if (pcie_relaxed_ordering == 0 &&
 		    (v & PCIEM_CTL_RELAXED_ORD_ENABLE) != 0) {
 			v &= ~PCIEM_CTL_RELAXED_ORD_ENABLE;
 			pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2);
 		} else if (pcie_relaxed_ordering == 1 &&
 		    (v & PCIEM_CTL_RELAXED_ORD_ENABLE) == 0) {
 			v |= PCIEM_CTL_RELAXED_ORD_ENABLE;
 			pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2);
 		}
 	}
 
 	sc->sge_gts_reg = MYPF_REG(A_SGE_PF_GTS);
 	sc->sge_kdoorbell_reg = MYPF_REG(A_SGE_PF_KDOORBELL);
 	sc->traceq = -1;
 	mtx_init(&sc->ifp_lock, sc->ifp_lockname, 0, MTX_DEF);
 	snprintf(sc->ifp_lockname, sizeof(sc->ifp_lockname), "%s tracer",
 	    device_get_nameunit(dev));
 
 	snprintf(sc->lockname, sizeof(sc->lockname), "%s",
 	    device_get_nameunit(dev));
 	mtx_init(&sc->sc_lock, sc->lockname, 0, MTX_DEF);
 	t4_add_adapter(sc);
 
 	mtx_init(&sc->sfl_lock, "starving freelists", 0, MTX_DEF);
 	TAILQ_INIT(&sc->sfl);
 	callout_init_mtx(&sc->sfl_callout, &sc->sfl_lock, 0);
 
 	mtx_init(&sc->reg_lock, "indirect register access", 0, MTX_DEF);
 
 	sc->policy = NULL;
 	rw_init(&sc->policy_lock, "connection offload policy");
 
 	rc = t4_map_bars_0_and_4(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	memset(sc->chan_map, 0xff, sizeof(sc->chan_map));
 
 	/* Prepare the adapter for operation. */
 	buf = malloc(PAGE_SIZE, M_CXGBE, M_ZERO | M_WAITOK);
 	rc = -t4_prep_adapter(sc, buf);
 	free(buf, M_CXGBE);
 	if (rc != 0) {
 		device_printf(dev, "failed to prepare adapter: %d.\n", rc);
 		goto done;
 	}
 
 	/*
 	 * This is the real PF# to which we're attaching.  Works from within PCI
 	 * passthrough environments too, where pci_get_function() could return a
 	 * different PF# depending on the passthrough configuration.  We need to
 	 * use the real PF# in all our communication with the firmware.
 	 */
 	j = t4_read_reg(sc, A_PL_WHOAMI);
 	sc->pf = chip_id(sc) <= CHELSIO_T5 ? G_SOURCEPF(j) : G_T6_SOURCEPF(j);
 	sc->mbox = sc->pf;
 
 	t4_init_devnames(sc);
 	if (sc->names == NULL) {
 		rc = ENOTSUP;
 		goto done; /* error message displayed already */
 	}
 
 	/*
 	 * Do this really early, with the memory windows set up even before the
 	 * character device.  The userland tool's register i/o and mem read
 	 * will work even in "recovery mode".
 	 */
 	setup_memwin(sc);
 	if (t4_init_devlog_params(sc, 0) == 0)
 		fixup_devlog_params(sc);
 	make_dev_args_init(&mda);
 	mda.mda_devsw = &t4_cdevsw;
 	mda.mda_uid = UID_ROOT;
 	mda.mda_gid = GID_WHEEL;
 	mda.mda_mode = 0600;
 	mda.mda_si_drv1 = sc;
 	rc = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
 	if (rc != 0)
 		device_printf(dev, "failed to create nexus char device: %d.\n",
 		    rc);
 
 	/* Go no further if recovery mode has been requested. */
 	if (TUNABLE_INT_FETCH("hw.cxgbe.sos", &i) && i != 0) {
 		device_printf(dev, "recovery mode.\n");
 		goto done;
 	}
 
 #if defined(__i386__)
 	if ((cpu_feature & CPUID_CX8) == 0) {
 		device_printf(dev, "64 bit atomics not available.\n");
 		rc = ENOTSUP;
 		goto done;
 	}
 #endif
 
 	/* Contact the firmware and try to become the master driver. */
 	rc = contact_firmware(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 	MPASS(sc->flags & FW_OK);
 
 	rc = get_params__pre_init(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	if (sc->flags & MASTER_PF) {
 		rc = partition_resources(sc);
 		if (rc != 0)
 			goto done; /* error message displayed already */
 		t4_intr_clear(sc);
 	}
 
 	rc = get_params__post_init(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	rc = set_params__post_init(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	rc = t4_map_bar_2(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	rc = t4_create_dma_tag(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	/*
 	 * First pass over all the ports - allocate VIs and initialize some
 	 * basic parameters like mac address, port type, etc.
 	 */
 	for_each_port(sc, i) {
 		struct port_info *pi;
 
 		pi = malloc(sizeof(*pi), M_CXGBE, M_ZERO | M_WAITOK);
 		sc->port[i] = pi;
 
 		/* These must be set before t4_port_init */
 		pi->adapter = sc;
 		pi->port_id = i;
 		/*
 		 * XXX: vi[0] is special so we can't delay this allocation until
 		 * pi->nvi's final value is known.
 		 */
 		pi->vi = malloc(sizeof(struct vi_info) * t4_num_vis, M_CXGBE,
 		    M_ZERO | M_WAITOK);
 
 		/*
 		 * Allocate the "main" VI and initialize parameters
 		 * like mac addr.
 		 */
 		rc = -t4_port_init(sc, sc->mbox, sc->pf, 0, i);
 		if (rc != 0) {
 			device_printf(dev, "unable to initialize port %d: %d\n",
 			    i, rc);
 			free(pi->vi, M_CXGBE);
 			free(pi, M_CXGBE);
 			sc->port[i] = NULL;
 			goto done;
 		}
 
 		snprintf(pi->lockname, sizeof(pi->lockname), "%sp%d",
 		    device_get_nameunit(dev), i);
 		mtx_init(&pi->pi_lock, pi->lockname, 0, MTX_DEF);
 		sc->chan_map[pi->tx_chan] = i;
 
 		/* All VIs on this port share this media. */
 		ifmedia_init(&pi->media, IFM_IMASK, cxgbe_media_change,
 		    cxgbe_media_status);
 
 		PORT_LOCK(pi);
 		init_link_config(pi);
 		fixup_link_config(pi);
 		build_medialist(pi);
 		if (fixed_ifmedia(pi))
 			pi->flags |= FIXED_IFMEDIA;
 		PORT_UNLOCK(pi);
 
 		pi->dev = device_add_child(dev, sc->names->ifnet_name,
 		    t4_ifnet_unit(sc, pi));
 		if (pi->dev == NULL) {
 			device_printf(dev,
 			    "failed to add device for port %d.\n", i);
 			rc = ENXIO;
 			goto done;
 		}
 		pi->vi[0].dev = pi->dev;
 		device_set_softc(pi->dev, pi);
 	}
 
 	/*
 	 * Interrupt type, # of interrupts, # of rx/tx queues, etc.
 	 */
 	nports = sc->params.nports;
 	rc = cfg_itype_and_nqueues(sc, &iaq);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	num_vis = iaq.num_vis;
 	sc->intr_type = iaq.intr_type;
 	sc->intr_count = iaq.nirq;
 
 	s = &sc->sge;
 	s->nrxq = nports * iaq.nrxq;
 	s->ntxq = nports * iaq.ntxq;
 	if (num_vis > 1) {
 		s->nrxq += nports * (num_vis - 1) * iaq.nrxq_vi;
 		s->ntxq += nports * (num_vis - 1) * iaq.ntxq_vi;
 	}
 	s->neq = s->ntxq + s->nrxq;	/* the free list in an rxq is an eq */
 	s->neq += nports;		/* ctrl queues: 1 per port */
 	s->niq = s->nrxq + 1;		/* 1 extra for firmware event queue */
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	if (is_offload(sc) || is_ethoffload(sc)) {
 		s->nofldtxq = nports * iaq.nofldtxq;
 		if (num_vis > 1)
 			s->nofldtxq += nports * (num_vis - 1) * iaq.nofldtxq_vi;
 		s->neq += s->nofldtxq;
 
 		s->ofld_txq = malloc(s->nofldtxq * sizeof(struct sge_wrq),
 		    M_CXGBE, M_ZERO | M_WAITOK);
 	}
 #endif
 #ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 		s->nofldrxq = nports * iaq.nofldrxq;
 		if (num_vis > 1)
 			s->nofldrxq += nports * (num_vis - 1) * iaq.nofldrxq_vi;
 		s->neq += s->nofldrxq;	/* free list */
 		s->niq += s->nofldrxq;
 
 		s->ofld_rxq = malloc(s->nofldrxq * sizeof(struct sge_ofld_rxq),
 		    M_CXGBE, M_ZERO | M_WAITOK);
 	}
 #endif
 #ifdef DEV_NETMAP
 	if (num_vis > 1) {
 		s->nnmrxq = nports * (num_vis - 1) * iaq.nnmrxq_vi;
 		s->nnmtxq = nports * (num_vis - 1) * iaq.nnmtxq_vi;
 	}
 	s->neq += s->nnmtxq + s->nnmrxq;
 	s->niq += s->nnmrxq;
 
 	s->nm_rxq = malloc(s->nnmrxq * sizeof(struct sge_nm_rxq),
 	    M_CXGBE, M_ZERO | M_WAITOK);
 	s->nm_txq = malloc(s->nnmtxq * sizeof(struct sge_nm_txq),
 	    M_CXGBE, M_ZERO | M_WAITOK);
 #endif
 
 	s->ctrlq = malloc(nports * sizeof(struct sge_wrq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->rxq = malloc(s->nrxq * sizeof(struct sge_rxq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->txq = malloc(s->ntxq * sizeof(struct sge_txq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->iqmap = malloc(s->niq * sizeof(struct sge_iq *), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->eqmap = malloc(s->neq * sizeof(struct sge_eq *), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	sc->irq = malloc(sc->intr_count * sizeof(struct irq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	t4_init_l2t(sc, M_WAITOK);
 	t4_init_smt(sc, M_WAITOK);
 	t4_init_tx_sched(sc);
 #ifdef RATELIMIT
 	t4_init_etid_table(sc);
 #endif
 #ifdef INET6
 	t4_init_clip_table(sc);
 #endif
 	if (sc->vres.key.size != 0)
 		sc->key_map = vmem_create("T4TLS key map", sc->vres.key.start,
 		    sc->vres.key.size, 32, 0, M_FIRSTFIT | M_WAITOK);
 
 	/*
 	 * Second pass over the ports.  This time we know the number of rx and
 	 * tx queues that each port should get.
 	 */
 	rqidx = tqidx = 0;
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	ofld_tqidx = 0;
 #endif
 #ifdef TCP_OFFLOAD
 	ofld_rqidx = 0;
 #endif
 #ifdef DEV_NETMAP
 	nm_rqidx = nm_tqidx = 0;
 #endif
 	for_each_port(sc, i) {
 		struct port_info *pi = sc->port[i];
 		struct vi_info *vi;
 
 		if (pi == NULL)
 			continue;
 
 		pi->nvi = num_vis;
 		for_each_vi(pi, j, vi) {
 			vi->pi = pi;
 			vi->qsize_rxq = t4_qsize_rxq;
 			vi->qsize_txq = t4_qsize_txq;
 
 			vi->first_rxq = rqidx;
 			vi->first_txq = tqidx;
 			vi->tmr_idx = t4_tmr_idx;
 			vi->pktc_idx = t4_pktc_idx;
 			vi->nrxq = j == 0 ? iaq.nrxq : iaq.nrxq_vi;
 			vi->ntxq = j == 0 ? iaq.ntxq : iaq.ntxq_vi;
 
 			rqidx += vi->nrxq;
 			tqidx += vi->ntxq;
 
 			if (j == 0 && vi->ntxq > 1)
 				vi->rsrv_noflowq = t4_rsrv_noflowq ? 1 : 0;
 			else
 				vi->rsrv_noflowq = 0;
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 			vi->first_ofld_txq = ofld_tqidx;
 			vi->nofldtxq = j == 0 ? iaq.nofldtxq : iaq.nofldtxq_vi;
 			ofld_tqidx += vi->nofldtxq;
 #endif
 #ifdef TCP_OFFLOAD
 			vi->ofld_tmr_idx = t4_tmr_idx_ofld;
 			vi->ofld_pktc_idx = t4_pktc_idx_ofld;
 			vi->first_ofld_rxq = ofld_rqidx;
 			vi->nofldrxq = j == 0 ? iaq.nofldrxq : iaq.nofldrxq_vi;
 
 			ofld_rqidx += vi->nofldrxq;
 #endif
 #ifdef DEV_NETMAP
 			if (j > 0) {
 				vi->first_nm_rxq = nm_rqidx;
 				vi->first_nm_txq = nm_tqidx;
 				vi->nnmrxq = iaq.nnmrxq_vi;
 				vi->nnmtxq = iaq.nnmtxq_vi;
 				nm_rqidx += vi->nnmrxq;
 				nm_tqidx += vi->nnmtxq;
 			}
 #endif
 		}
 	}
 
 	rc = t4_setup_intr_handlers(sc);
 	if (rc != 0) {
 		device_printf(dev,
 		    "failed to setup interrupt handlers: %d\n", rc);
 		goto done;
 	}
 
 	rc = bus_generic_probe(dev);
 	if (rc != 0) {
 		device_printf(dev, "failed to probe child drivers: %d\n", rc);
 		goto done;
 	}
 
 	/*
 	 * Ensure thread-safe mailbox access (in debug builds).
 	 *
 	 * So far this was the only thread accessing the mailbox but various
 	 * ifnets and sysctls are about to be created and their handlers/ioctls
 	 * will access the mailbox from different threads.
 	 */
 	sc->flags |= CHK_MBOX_ACCESS;
 
 	rc = bus_generic_attach(dev);
 	if (rc != 0) {
 		device_printf(dev,
 		    "failed to attach all child ports: %d\n", rc);
 		goto done;
 	}
 
 	device_printf(dev,
 	    "PCIe gen%d x%d, %d ports, %d %s interrupt%s, %d eq, %d iq\n",
 	    sc->params.pci.speed, sc->params.pci.width, sc->params.nports,
 	    sc->intr_count, sc->intr_type == INTR_MSIX ? "MSI-X" :
 	    (sc->intr_type == INTR_MSI ? "MSI" : "INTx"),
 	    sc->intr_count > 1 ? "s" : "", sc->sge.neq, sc->sge.niq);
 
 	t4_set_desc(sc);
 
 	notify_siblings(dev, 0);
 
 done:
 	if (rc != 0 && sc->cdev) {
 		/* cdev was created and so cxgbetool works; recover that way. */
 		device_printf(dev,
 		    "error during attach, adapter is now in recovery mode.\n");
 		rc = 0;
 	}
 
 	if (rc != 0)
 		t4_detach_common(dev);
 	else
 		t4_sysctls(sc);
 
 	return (rc);
 }
 
 static int
 t4_child_location_str(device_t bus, device_t dev, char *buf, size_t buflen)
 {
 	struct adapter *sc;
 	struct port_info *pi;
 	int i;
 
 	sc = device_get_softc(bus);
 	buf[0] = '\0';
 	for_each_port(sc, i) {
 		pi = sc->port[i];
 		if (pi != NULL && pi->dev == dev) {
 			snprintf(buf, buflen, "port=%d", pi->port_id);
 			break;
 		}
 	}
 	return (0);
 }
 
 static int
 t4_ready(device_t dev)
 {
 	struct adapter *sc;
 
 	sc = device_get_softc(dev);
 	if (sc->flags & FW_OK)
 		return (0);
 	return (ENXIO);
 }
 
 static int
 t4_read_port_device(device_t dev, int port, device_t *child)
 {
 	struct adapter *sc;
 	struct port_info *pi;
 
 	sc = device_get_softc(dev);
 	if (port < 0 || port >= MAX_NPORTS)
 		return (EINVAL);
 	pi = sc->port[port];
 	if (pi == NULL || pi->dev == NULL)
 		return (ENXIO);
 	*child = pi->dev;
 	return (0);
 }
 
 static int
 notify_siblings(device_t dev, int detaching)
 {
 	device_t sibling;
 	int error, i;
 
 	error = 0;
 	for (i = 0; i < PCI_FUNCMAX; i++) {
 		if (i == pci_get_function(dev))
 			continue;
 		sibling = pci_find_dbsf(pci_get_domain(dev), pci_get_bus(dev),
 		    pci_get_slot(dev), i);
 		if (sibling == NULL || !device_is_attached(sibling))
 			continue;
 		if (detaching)
 			error = T4_DETACH_CHILD(sibling);
 		else
 			(void)T4_ATTACH_CHILD(sibling);
 		if (error)
 			break;
 	}
 	return (error);
 }
 
 /*
  * Idempotent
  */
 static int
 t4_detach(device_t dev)
 {
 	struct adapter *sc;
 	int rc;
 
 	sc = device_get_softc(dev);
 
 	rc = notify_siblings(dev, 1);
 	if (rc) {
 		device_printf(dev,
 		    "failed to detach sibling devices: %d\n", rc);
 		return (rc);
 	}
 
 	return (t4_detach_common(dev));
 }
 
 int
 t4_detach_common(device_t dev)
 {
 	struct adapter *sc;
 	struct port_info *pi;
 	int i, rc;
 
 	sc = device_get_softc(dev);
 
 	if (sc->cdev) {
 		destroy_dev(sc->cdev);
 		sc->cdev = NULL;
 	}
 
 	sc->flags &= ~CHK_MBOX_ACCESS;
 	if (sc->flags & FULL_INIT_DONE) {
 		if (!(sc->flags & IS_VF))
 			t4_intr_disable(sc);
 	}
 
 	if (device_is_attached(dev)) {
 		rc = bus_generic_detach(dev);
 		if (rc) {
 			device_printf(dev,
 			    "failed to detach child devices: %d\n", rc);
 			return (rc);
 		}
 	}
 
 	for (i = 0; i < sc->intr_count; i++)
 		t4_free_irq(sc, &sc->irq[i]);
 
 	if ((sc->flags & (IS_VF | FW_OK)) == FW_OK)
 		t4_free_tx_sched(sc);
 
 	for (i = 0; i < MAX_NPORTS; i++) {
 		pi = sc->port[i];
 		if (pi) {
 			t4_free_vi(sc, sc->mbox, sc->pf, 0, pi->vi[0].viid);
 			if (pi->dev)
 				device_delete_child(dev, pi->dev);
 
 			mtx_destroy(&pi->pi_lock);
 			free(pi->vi, M_CXGBE);
 			free(pi, M_CXGBE);
 		}
 	}
 
 	device_delete_children(dev);
 
 	if (sc->flags & FULL_INIT_DONE)
 		adapter_full_uninit(sc);
 
 	if ((sc->flags & (IS_VF | FW_OK)) == FW_OK)
 		t4_fw_bye(sc, sc->mbox);
 
 	if (sc->intr_type == INTR_MSI || sc->intr_type == INTR_MSIX)
 		pci_release_msi(dev);
 
 	if (sc->regs_res)
 		bus_release_resource(dev, SYS_RES_MEMORY, sc->regs_rid,
 		    sc->regs_res);
 
 	if (sc->udbs_res)
 		bus_release_resource(dev, SYS_RES_MEMORY, sc->udbs_rid,
 		    sc->udbs_res);
 
 	if (sc->msix_res)
 		bus_release_resource(dev, SYS_RES_MEMORY, sc->msix_rid,
 		    sc->msix_res);
 
 	if (sc->l2t)
 		t4_free_l2t(sc->l2t);
 	if (sc->smt)
 		t4_free_smt(sc->smt);
 #ifdef RATELIMIT
 	t4_free_etid_table(sc);
 #endif
 	if (sc->key_map)
 		vmem_destroy(sc->key_map);
 #ifdef INET6
 	t4_destroy_clip_table(sc);
 #endif
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	free(sc->sge.ofld_txq, M_CXGBE);
 #endif
 #ifdef TCP_OFFLOAD
 	free(sc->sge.ofld_rxq, M_CXGBE);
 #endif
 #ifdef DEV_NETMAP
 	free(sc->sge.nm_rxq, M_CXGBE);
 	free(sc->sge.nm_txq, M_CXGBE);
 #endif
 	free(sc->irq, M_CXGBE);
 	free(sc->sge.rxq, M_CXGBE);
 	free(sc->sge.txq, M_CXGBE);
 	free(sc->sge.ctrlq, M_CXGBE);
 	free(sc->sge.iqmap, M_CXGBE);
 	free(sc->sge.eqmap, M_CXGBE);
 	free(sc->tids.ftid_tab, M_CXGBE);
 	free(sc->tids.hpftid_tab, M_CXGBE);
 	free_hftid_hash(&sc->tids);
 	free(sc->tids.atid_tab, M_CXGBE);
 	free(sc->tids.tid_tab, M_CXGBE);
 	free(sc->tt.tls_rx_ports, M_CXGBE);
 	t4_destroy_dma_tag(sc);
 	if (mtx_initialized(&sc->sc_lock)) {
 		sx_xlock(&t4_list_lock);
 		SLIST_REMOVE(&t4_list, sc, adapter, link);
 		sx_xunlock(&t4_list_lock);
 		mtx_destroy(&sc->sc_lock);
 	}
 
 	callout_drain(&sc->sfl_callout);
 	if (mtx_initialized(&sc->tids.ftid_lock)) {
 		mtx_destroy(&sc->tids.ftid_lock);
 		cv_destroy(&sc->tids.ftid_cv);
 	}
 	if (mtx_initialized(&sc->tids.atid_lock))
 		mtx_destroy(&sc->tids.atid_lock);
 	if (mtx_initialized(&sc->sfl_lock))
 		mtx_destroy(&sc->sfl_lock);
 	if (mtx_initialized(&sc->ifp_lock))
 		mtx_destroy(&sc->ifp_lock);
 	if (mtx_initialized(&sc->reg_lock))
 		mtx_destroy(&sc->reg_lock);
 
 	if (rw_initialized(&sc->policy_lock)) {
 		rw_destroy(&sc->policy_lock);
 #ifdef TCP_OFFLOAD
 		if (sc->policy != NULL)
 			free_offload_policy(sc->policy);
 #endif
 	}
 
 	for (i = 0; i < NUM_MEMWIN; i++) {
 		struct memwin *mw = &sc->memwin[i];
 
 		if (rw_initialized(&mw->mw_lock))
 			rw_destroy(&mw->mw_lock);
 	}
 
 	bzero(sc, sizeof(*sc));
 
 	return (0);
 }
 
 static int
 cxgbe_probe(device_t dev)
 {
 	char buf[128];
 	struct port_info *pi = device_get_softc(dev);
 
 	snprintf(buf, sizeof(buf), "port %d", pi->port_id);
 	device_set_desc_copy(dev, buf);
 
 	return (BUS_PROBE_DEFAULT);
 }
 
 #define T4_CAP (IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | \
     IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_JUMBO_MTU | IFCAP_LRO | \
     IFCAP_VLAN_HWTSO | IFCAP_LINKSTATE | IFCAP_HWCSUM_IPV6 | IFCAP_HWSTATS | \
     IFCAP_HWRXTSTMP)
 #define T4_CAP_ENABLE (T4_CAP)
 
 static int
 cxgbe_vi_attach(device_t dev, struct vi_info *vi)
 {
 	struct ifnet *ifp;
 	struct sbuf *sb;
 
 	vi->xact_addr_filt = -1;
 	callout_init(&vi->tick, 1);
 
 	/* Allocate an ifnet and set it up */
 	ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		device_printf(dev, "Cannot allocate ifnet\n");
 		return (ENOMEM);
 	}
 	vi->ifp = ifp;
 	ifp->if_softc = vi;
 
 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 
 	ifp->if_init = cxgbe_init;
 	ifp->if_ioctl = cxgbe_ioctl;
 	ifp->if_transmit = cxgbe_transmit;
 	ifp->if_qflush = cxgbe_qflush;
 	ifp->if_get_counter = cxgbe_get_counter;
 #ifdef RATELIMIT
 	ifp->if_snd_tag_alloc = cxgbe_snd_tag_alloc;
 	ifp->if_snd_tag_modify = cxgbe_snd_tag_modify;
 	ifp->if_snd_tag_query = cxgbe_snd_tag_query;
 	ifp->if_snd_tag_free = cxgbe_snd_tag_free;
 #endif
 
 	ifp->if_capabilities = T4_CAP;
 	ifp->if_capenable = T4_CAP_ENABLE;
 #ifdef TCP_OFFLOAD
 	if (vi->nofldrxq != 0)
 		ifp->if_capabilities |= IFCAP_TOE;
 #endif
 #ifdef RATELIMIT
 	if (is_ethoffload(vi->pi->adapter) && vi->nofldtxq != 0) {
 		ifp->if_capabilities |= IFCAP_TXRTLMT;
 		ifp->if_capenable |= IFCAP_TXRTLMT;
 	}
 #endif
 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO |
 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6;
 
 	ifp->if_hw_tsomax = IP_MAXPACKET;
 	ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO;
 #ifdef RATELIMIT
 	if (is_ethoffload(vi->pi->adapter) && vi->nofldtxq != 0)
 		ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_EO_TSO;
 #endif
 	ifp->if_hw_tsomaxsegsize = 65536;
 
 	ether_ifattach(ifp, vi->hw_addr);
 #ifdef DEV_NETMAP
 	if (vi->nnmrxq != 0)
 		cxgbe_nm_attach(vi);
 #endif
 	sb = sbuf_new_auto();
 	sbuf_printf(sb, "%d txq, %d rxq (NIC)", vi->ntxq, vi->nrxq);
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	switch (ifp->if_capabilities & (IFCAP_TOE | IFCAP_TXRTLMT)) {
 	case IFCAP_TOE:
 		sbuf_printf(sb, "; %d txq (TOE)", vi->nofldtxq);
 		break;
 	case IFCAP_TOE | IFCAP_TXRTLMT:
 		sbuf_printf(sb, "; %d txq (TOE/ETHOFLD)", vi->nofldtxq);
 		break;
 	case IFCAP_TXRTLMT:
 		sbuf_printf(sb, "; %d txq (ETHOFLD)", vi->nofldtxq);
 		break;
 	}
 #endif
 #ifdef TCP_OFFLOAD
 	if (ifp->if_capabilities & IFCAP_TOE)
 		sbuf_printf(sb, ", %d rxq (TOE)", vi->nofldrxq);
 #endif
 #ifdef DEV_NETMAP
 	if (ifp->if_capabilities & IFCAP_NETMAP)
 		sbuf_printf(sb, "; %d txq, %d rxq (netmap)",
 		    vi->nnmtxq, vi->nnmrxq);
 #endif
 	sbuf_finish(sb);
 	device_printf(dev, "%s\n", sbuf_data(sb));
 	sbuf_delete(sb);
 
 	vi_sysctls(vi);
 
 	return (0);
 }
 
 static int
 cxgbe_attach(device_t dev)
 {
 	struct port_info *pi = device_get_softc(dev);
 	struct adapter *sc = pi->adapter;
 	struct vi_info *vi;
 	int i, rc;
 
 	callout_init_mtx(&pi->tick, &pi->pi_lock, 0);
 
 	rc = cxgbe_vi_attach(dev, &pi->vi[0]);
 	if (rc)
 		return (rc);
 
 	for_each_vi(pi, i, vi) {
 		if (i == 0)
 			continue;
 		vi->dev = device_add_child(dev, sc->names->vi_ifnet_name, -1);
 		if (vi->dev == NULL) {
 			device_printf(dev, "failed to add VI %d\n", i);
 			continue;
 		}
 		device_set_softc(vi->dev, vi);
 	}
 
 	cxgbe_sysctls(pi);
 
 	bus_generic_attach(dev);
 
 	return (0);
 }
 
 static void
 cxgbe_vi_detach(struct vi_info *vi)
 {
 	struct ifnet *ifp = vi->ifp;
 
 	ether_ifdetach(ifp);
 
 	/* Let detach proceed even if these fail. */
 #ifdef DEV_NETMAP
 	if (ifp->if_capabilities & IFCAP_NETMAP)
 		cxgbe_nm_detach(vi);
 #endif
 	cxgbe_uninit_synchronized(vi);
 	callout_drain(&vi->tick);
 	vi_full_uninit(vi);
 
 	if_free(vi->ifp);
 	vi->ifp = NULL;
 }
 
 static int
 cxgbe_detach(device_t dev)
 {
 	struct port_info *pi = device_get_softc(dev);
 	struct adapter *sc = pi->adapter;
 	int rc;
 
 	/* Detach the extra VIs first. */
 	rc = bus_generic_detach(dev);
 	if (rc)
 		return (rc);
 	device_delete_children(dev);
 
 	doom_vi(sc, &pi->vi[0]);
 
 	if (pi->flags & HAS_TRACEQ) {
 		sc->traceq = -1;	/* cloner should not create ifnet */
 		t4_tracer_port_detach(sc);
 	}
 
 	cxgbe_vi_detach(&pi->vi[0]);
 	callout_drain(&pi->tick);
 	ifmedia_removeall(&pi->media);
 
 	end_synchronized_op(sc, 0);
 
 	return (0);
 }
 
 static void
 cxgbe_init(void *arg)
 {
 	struct vi_info *vi = arg;
 	struct adapter *sc = vi->pi->adapter;
 
 	if (begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4init") != 0)
 		return;
 	cxgbe_init_synchronized(vi);
 	end_synchronized_op(sc, 0);
 }
 
 static int
 cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data)
 {
 	int rc = 0, mtu, flags;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct ifreq *ifr = (struct ifreq *)data;
 	uint32_t mask;
 
 	switch (cmd) {
 	case SIOCSIFMTU:
 		mtu = ifr->ifr_mtu;
 		if (mtu < ETHERMIN || mtu > MAX_MTU)
 			return (EINVAL);
 
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4mtu");
 		if (rc)
 			return (rc);
 		ifp->if_mtu = mtu;
 		if (vi->flags & VI_INIT_DONE) {
 			t4_update_fl_bufsize(ifp);
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 				rc = update_mac_settings(ifp, XGMAC_MTU);
 		}
 		end_synchronized_op(sc, 0);
 		break;
 
 	case SIOCSIFFLAGS:
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4flg");
 		if (rc)
 			return (rc);
 
 		if (ifp->if_flags & IFF_UP) {
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 				flags = vi->if_flags;
 				if ((ifp->if_flags ^ flags) &
 				    (IFF_PROMISC | IFF_ALLMULTI)) {
 					rc = update_mac_settings(ifp,
 					    XGMAC_PROMISC | XGMAC_ALLMULTI);
 				}
 			} else {
 				rc = cxgbe_init_synchronized(vi);
 			}
 			vi->if_flags = ifp->if_flags;
 		} else if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 			rc = cxgbe_uninit_synchronized(vi);
 		}
 		end_synchronized_op(sc, 0);
 		break;
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4multi");
 		if (rc)
 			return (rc);
 		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 			rc = update_mac_settings(ifp, XGMAC_MCADDRS);
 		end_synchronized_op(sc, 0);
 		break;
 
 	case SIOCSIFCAP:
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4cap");
 		if (rc)
 			return (rc);
 
 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
 		if (mask & IFCAP_TXCSUM) {
 			ifp->if_capenable ^= IFCAP_TXCSUM;
 			ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP);
 
 			if (IFCAP_TSO4 & ifp->if_capenable &&
 			    !(IFCAP_TXCSUM & ifp->if_capenable)) {
 				ifp->if_capenable &= ~IFCAP_TSO4;
 				if_printf(ifp,
 				    "tso4 disabled due to -txcsum.\n");
 			}
 		}
 		if (mask & IFCAP_TXCSUM_IPV6) {
 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
 			ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6);
 
 			if (IFCAP_TSO6 & ifp->if_capenable &&
 			    !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) {
 				ifp->if_capenable &= ~IFCAP_TSO6;
 				if_printf(ifp,
 				    "tso6 disabled due to -txcsum6.\n");
 			}
 		}
 		if (mask & IFCAP_RXCSUM)
 			ifp->if_capenable ^= IFCAP_RXCSUM;
 		if (mask & IFCAP_RXCSUM_IPV6)
 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
 
 		/*
 		 * Note that we leave CSUM_TSO alone (it is always set).  The
 		 * kernel takes both IFCAP_TSOx and CSUM_TSO into account before
 		 * sending a TSO request our way, so it's sufficient to toggle
 		 * IFCAP_TSOx only.
 		 */
 		if (mask & IFCAP_TSO4) {
 			if (!(IFCAP_TSO4 & ifp->if_capenable) &&
 			    !(IFCAP_TXCSUM & ifp->if_capenable)) {
 				if_printf(ifp, "enable txcsum first.\n");
 				rc = EAGAIN;
 				goto fail;
 			}
 			ifp->if_capenable ^= IFCAP_TSO4;
 		}
 		if (mask & IFCAP_TSO6) {
 			if (!(IFCAP_TSO6 & ifp->if_capenable) &&
 			    !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) {
 				if_printf(ifp, "enable txcsum6 first.\n");
 				rc = EAGAIN;
 				goto fail;
 			}
 			ifp->if_capenable ^= IFCAP_TSO6;
 		}
 		if (mask & IFCAP_LRO) {
 #if defined(INET) || defined(INET6)
 			int i;
 			struct sge_rxq *rxq;
 
 			ifp->if_capenable ^= IFCAP_LRO;
 			for_each_rxq(vi, i, rxq) {
 				if (ifp->if_capenable & IFCAP_LRO)
 					rxq->iq.flags |= IQ_LRO_ENABLED;
 				else
 					rxq->iq.flags &= ~IQ_LRO_ENABLED;
 			}
 #endif
 		}
 #ifdef TCP_OFFLOAD
 		if (mask & IFCAP_TOE) {
 			int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE;
 
 			rc = toe_capability(vi, enable);
 			if (rc != 0)
 				goto fail;
 
 			ifp->if_capenable ^= mask;
 		}
 #endif
 		if (mask & IFCAP_VLAN_HWTAGGING) {
 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 				rc = update_mac_settings(ifp, XGMAC_VLANEX);
 		}
 		if (mask & IFCAP_VLAN_MTU) {
 			ifp->if_capenable ^= IFCAP_VLAN_MTU;
 
 			/* Need to find out how to disable auto-mtu-inflation */
 		}
 		if (mask & IFCAP_VLAN_HWTSO)
 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
 		if (mask & IFCAP_VLAN_HWCSUM)
 			ifp->if_capenable ^= IFCAP_VLAN_HWCSUM;
 #ifdef RATELIMIT
 		if (mask & IFCAP_TXRTLMT)
 			ifp->if_capenable ^= IFCAP_TXRTLMT;
 #endif
 		if (mask & IFCAP_HWRXTSTMP) {
 			int i;
 			struct sge_rxq *rxq;
 
 			ifp->if_capenable ^= IFCAP_HWRXTSTMP;
 			for_each_rxq(vi, i, rxq) {
 				if (ifp->if_capenable & IFCAP_HWRXTSTMP)
 					rxq->iq.flags |= IQ_RX_TIMESTAMP;
 				else
 					rxq->iq.flags &= ~IQ_RX_TIMESTAMP;
 			}
 		}
 
 #ifdef VLAN_CAPABILITIES
 		VLAN_CAPABILITIES(ifp);
 #endif
 fail:
 		end_synchronized_op(sc, 0);
 		break;
 
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 	case SIOCGIFXMEDIA:
 		ifmedia_ioctl(ifp, ifr, &pi->media, cmd);
 		break;
 
 	case SIOCGI2C: {
 		struct ifi2creq i2c;
 
 		rc = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
 		if (rc != 0)
 			break;
 		if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
 			rc = EPERM;
 			break;
 		}
 		if (i2c.len > sizeof(i2c.data)) {
 			rc = EINVAL;
 			break;
 		}
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4i2c");
 		if (rc)
 			return (rc);
 		rc = -t4_i2c_rd(sc, sc->mbox, pi->port_id, i2c.dev_addr,
 		    i2c.offset, i2c.len, &i2c.data[0]);
 		end_synchronized_op(sc, 0);
 		if (rc == 0)
 			rc = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c));
 		break;
 	}
 
 	default:
 		rc = ether_ioctl(ifp, cmd, data);
 	}
 
 	return (rc);
 }
 
 static int
 cxgbe_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct sge_txq *txq;
 	void *items[1];
 	int rc;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_nextpkt == NULL);	/* not quite ready for this yet */
 
 	if (__predict_false(pi->link_cfg.link_ok == false)) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	rc = parse_pkt(sc, &m);
 	if (__predict_false(rc != 0)) {
 		MPASS(m == NULL);			/* was freed already */
 		atomic_add_int(&pi->tx_parse_error, 1);	/* rare, atomic is ok */
 		return (rc);
 	}
 #ifdef RATELIMIT
 	if (m->m_pkthdr.snd_tag != NULL) {
 		/* EAGAIN tells the stack we are not the correct interface. */
 		if (__predict_false(ifp != m->m_pkthdr.snd_tag->ifp)) {
 			m_freem(m);
 			return (EAGAIN);
 		}
 
 		return (ethofld_transmit(ifp, m));
 	}
 #endif
 
 	/* Select a txq. */
 	txq = &sc->sge.txq[vi->first_txq];
 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
 		txq += ((m->m_pkthdr.flowid % (vi->ntxq - vi->rsrv_noflowq)) +
 		    vi->rsrv_noflowq);
 
 	items[0] = m;
 	rc = mp_ring_enqueue(txq->r, items, 1, 4096);
 	if (__predict_false(rc != 0))
 		m_freem(m);
 
 	return (rc);
 }
 
 static void
 cxgbe_qflush(struct ifnet *ifp)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct sge_txq *txq;
 	int i;
 
 	/* queues do not exist if !VI_INIT_DONE. */
 	if (vi->flags & VI_INIT_DONE) {
 		for_each_txq(vi, i, txq) {
 			TXQ_LOCK(txq);
 			txq->eq.flags |= EQ_QFLUSH;
 			TXQ_UNLOCK(txq);
 			while (!mp_ring_is_idle(txq->r)) {
 				mp_ring_check_drainage(txq->r, 0);
 				pause("qflush", 1);
 			}
 			TXQ_LOCK(txq);
 			txq->eq.flags &= ~EQ_QFLUSH;
 			TXQ_UNLOCK(txq);
 		}
 	}
 	if_qflush(ifp);
 }
 
 static uint64_t
 vi_get_counter(struct ifnet *ifp, ift_counter c)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct fw_vi_stats_vf *s = &vi->stats;
 
 	vi_refresh_stats(vi->pi->adapter, vi);
 
 	switch (c) {
 	case IFCOUNTER_IPACKETS:
 		return (s->rx_bcast_frames + s->rx_mcast_frames +
 		    s->rx_ucast_frames);
 	case IFCOUNTER_IERRORS:
 		return (s->rx_err_frames);
 	case IFCOUNTER_OPACKETS:
 		return (s->tx_bcast_frames + s->tx_mcast_frames +
 		    s->tx_ucast_frames + s->tx_offload_frames);
 	case IFCOUNTER_OERRORS:
 		return (s->tx_drop_frames);
 	case IFCOUNTER_IBYTES:
 		return (s->rx_bcast_bytes + s->rx_mcast_bytes +
 		    s->rx_ucast_bytes);
 	case IFCOUNTER_OBYTES:
 		return (s->tx_bcast_bytes + s->tx_mcast_bytes +
 		    s->tx_ucast_bytes + s->tx_offload_bytes);
 	case IFCOUNTER_IMCASTS:
 		return (s->rx_mcast_frames);
 	case IFCOUNTER_OMCASTS:
 		return (s->tx_mcast_frames);
 	case IFCOUNTER_OQDROPS: {
 		uint64_t drops;
 
 		drops = 0;
 		if (vi->flags & VI_INIT_DONE) {
 			int i;
 			struct sge_txq *txq;
 
 			for_each_txq(vi, i, txq)
 				drops += counter_u64_fetch(txq->r->drops);
 		}
 
 		return (drops);
 
 	}
 
 	default:
 		return (if_get_counter_default(ifp, c));
 	}
 }
 
 uint64_t
 cxgbe_get_counter(struct ifnet *ifp, ift_counter c)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct port_stats *s = &pi->stats;
 
 	if (pi->nvi > 1 || sc->flags & IS_VF)
 		return (vi_get_counter(ifp, c));
 
 	cxgbe_refresh_stats(sc, pi);
 
 	switch (c) {
 	case IFCOUNTER_IPACKETS:
 		return (s->rx_frames);
 
 	case IFCOUNTER_IERRORS:
 		return (s->rx_jabber + s->rx_runt + s->rx_too_long +
 		    s->rx_fcs_err + s->rx_len_err);
 
 	case IFCOUNTER_OPACKETS:
 		return (s->tx_frames);
 
 	case IFCOUNTER_OERRORS:
 		return (s->tx_error_frames);
 
 	case IFCOUNTER_IBYTES:
 		return (s->rx_octets);
 
 	case IFCOUNTER_OBYTES:
 		return (s->tx_octets);
 
 	case IFCOUNTER_IMCASTS:
 		return (s->rx_mcast_frames);
 
 	case IFCOUNTER_OMCASTS:
 		return (s->tx_mcast_frames);
 
 	case IFCOUNTER_IQDROPS:
 		return (s->rx_ovflow0 + s->rx_ovflow1 + s->rx_ovflow2 +
 		    s->rx_ovflow3 + s->rx_trunc0 + s->rx_trunc1 + s->rx_trunc2 +
 		    s->rx_trunc3 + pi->tnl_cong_drops);
 
 	case IFCOUNTER_OQDROPS: {
 		uint64_t drops;
 
 		drops = s->tx_drop;
 		if (vi->flags & VI_INIT_DONE) {
 			int i;
 			struct sge_txq *txq;
 
 			for_each_txq(vi, i, txq)
 				drops += counter_u64_fetch(txq->r->drops);
 		}
 
 		return (drops);
 
 	}
 
 	default:
 		return (if_get_counter_default(ifp, c));
 	}
 }
 
 /*
  * The kernel picks a media from the list we had provided but we still validate
  * the requeste.
  */
 int
 cxgbe_media_change(struct ifnet *ifp)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct ifmedia *ifm = &pi->media;
 	struct link_config *lc = &pi->link_cfg;
 	struct adapter *sc = pi->adapter;
 	int rc;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mec");
 	if (rc != 0)
 		return (rc);
 	PORT_LOCK(pi);
 	if (IFM_SUBTYPE(ifm->ifm_media) == IFM_AUTO) {
 		/* ifconfig .. media autoselect */
 		if (!(lc->supported & FW_PORT_CAP32_ANEG)) {
 			rc = ENOTSUP; /* AN not supported by transceiver */
 			goto done;
 		}
 		lc->requested_aneg = AUTONEG_ENABLE;
 		lc->requested_speed = 0;
 		lc->requested_fc |= PAUSE_AUTONEG;
 	} else {
 		lc->requested_aneg = AUTONEG_DISABLE;
 		lc->requested_speed =
 		    ifmedia_baudrate(ifm->ifm_media) / 1000000;
 		lc->requested_fc = 0;
 		if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_RXPAUSE)
 			lc->requested_fc |= PAUSE_RX;
 		if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_TXPAUSE)
 			lc->requested_fc |= PAUSE_TX;
 	}
 	if (pi->up_vis > 0) {
 		fixup_link_config(pi);
 		rc = apply_link_config(pi);
 	}
 done:
 	PORT_UNLOCK(pi);
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 /*
  * Base media word (without ETHER, pause, link active, etc.) for the port at the
  * given speed.
  */
 static int
 port_mword(struct port_info *pi, uint32_t speed)
 {
 
 	MPASS(speed & M_FW_PORT_CAP32_SPEED);
 	MPASS(powerof2(speed));
 
 	switch(pi->port_type) {
 	case FW_PORT_TYPE_BT_SGMII:
 	case FW_PORT_TYPE_BT_XFI:
 	case FW_PORT_TYPE_BT_XAUI:
 		/* BaseT */
 		switch (speed) {
 		case FW_PORT_CAP32_SPEED_100M:
 			return (IFM_100_T);
 		case FW_PORT_CAP32_SPEED_1G:
 			return (IFM_1000_T);
 		case FW_PORT_CAP32_SPEED_10G:
 			return (IFM_10G_T);
 		}
 		break;
 	case FW_PORT_TYPE_KX4:
 		if (speed == FW_PORT_CAP32_SPEED_10G)
 			return (IFM_10G_KX4);
 		break;
 	case FW_PORT_TYPE_CX4:
 		if (speed == FW_PORT_CAP32_SPEED_10G)
 			return (IFM_10G_CX4);
 		break;
 	case FW_PORT_TYPE_KX:
 		if (speed == FW_PORT_CAP32_SPEED_1G)
 			return (IFM_1000_KX);
 		break;
 	case FW_PORT_TYPE_KR:
 	case FW_PORT_TYPE_BP_AP:
 	case FW_PORT_TYPE_BP4_AP:
 	case FW_PORT_TYPE_BP40_BA:
 	case FW_PORT_TYPE_KR4_100G:
 	case FW_PORT_TYPE_KR_SFP28:
 	case FW_PORT_TYPE_KR_XLAUI:
 		switch (speed) {
 		case FW_PORT_CAP32_SPEED_1G:
 			return (IFM_1000_KX);
 		case FW_PORT_CAP32_SPEED_10G:
 			return (IFM_10G_KR);
 		case FW_PORT_CAP32_SPEED_25G:
 			return (IFM_25G_KR);
 		case FW_PORT_CAP32_SPEED_40G:
 			return (IFM_40G_KR4);
 		case FW_PORT_CAP32_SPEED_50G:
 			return (IFM_50G_KR2);
 		case FW_PORT_CAP32_SPEED_100G:
 			return (IFM_100G_KR4);
 		}
 		break;
 	case FW_PORT_TYPE_FIBER_XFI:
 	case FW_PORT_TYPE_FIBER_XAUI:
 	case FW_PORT_TYPE_SFP:
 	case FW_PORT_TYPE_QSFP_10G:
 	case FW_PORT_TYPE_QSA:
 	case FW_PORT_TYPE_QSFP:
 	case FW_PORT_TYPE_CR4_QSFP:
 	case FW_PORT_TYPE_CR_QSFP:
 	case FW_PORT_TYPE_CR2_QSFP:
 	case FW_PORT_TYPE_SFP28:
 		/* Pluggable transceiver */
 		switch (pi->mod_type) {
 		case FW_PORT_MOD_TYPE_LR:
 			switch (speed) {
 			case FW_PORT_CAP32_SPEED_1G:
 				return (IFM_1000_LX);
 			case FW_PORT_CAP32_SPEED_10G:
 				return (IFM_10G_LR);
 			case FW_PORT_CAP32_SPEED_25G:
 				return (IFM_25G_LR);
 			case FW_PORT_CAP32_SPEED_40G:
 				return (IFM_40G_LR4);
 			case FW_PORT_CAP32_SPEED_50G:
 				return (IFM_50G_LR2);
 			case FW_PORT_CAP32_SPEED_100G:
 				return (IFM_100G_LR4);
 			}
 			break;
 		case FW_PORT_MOD_TYPE_SR:
 			switch (speed) {
 			case FW_PORT_CAP32_SPEED_1G:
 				return (IFM_1000_SX);
 			case FW_PORT_CAP32_SPEED_10G:
 				return (IFM_10G_SR);
 			case FW_PORT_CAP32_SPEED_25G:
 				return (IFM_25G_SR);
 			case FW_PORT_CAP32_SPEED_40G:
 				return (IFM_40G_SR4);
 			case FW_PORT_CAP32_SPEED_50G:
 				return (IFM_50G_SR2);
 			case FW_PORT_CAP32_SPEED_100G:
 				return (IFM_100G_SR4);
 			}
 			break;
 		case FW_PORT_MOD_TYPE_ER:
 			if (speed == FW_PORT_CAP32_SPEED_10G)
 				return (IFM_10G_ER);
 			break;
 		case FW_PORT_MOD_TYPE_TWINAX_PASSIVE:
 		case FW_PORT_MOD_TYPE_TWINAX_ACTIVE:
 			switch (speed) {
 			case FW_PORT_CAP32_SPEED_1G:
 				return (IFM_1000_CX);
 			case FW_PORT_CAP32_SPEED_10G:
 				return (IFM_10G_TWINAX);
 			case FW_PORT_CAP32_SPEED_25G:
 				return (IFM_25G_CR);
 			case FW_PORT_CAP32_SPEED_40G:
 				return (IFM_40G_CR4);
 			case FW_PORT_CAP32_SPEED_50G:
 				return (IFM_50G_CR2);
 			case FW_PORT_CAP32_SPEED_100G:
 				return (IFM_100G_CR4);
 			}
 			break;
 		case FW_PORT_MOD_TYPE_LRM:
 			if (speed == FW_PORT_CAP32_SPEED_10G)
 				return (IFM_10G_LRM);
 			break;
 		case FW_PORT_MOD_TYPE_NA:
 			MPASS(0);	/* Not pluggable? */
 			/* fall throough */
 		case FW_PORT_MOD_TYPE_ERROR:
 		case FW_PORT_MOD_TYPE_UNKNOWN:
 		case FW_PORT_MOD_TYPE_NOTSUPPORTED:
 			break;
 		case FW_PORT_MOD_TYPE_NONE:
 			return (IFM_NONE);
 		}
 		break;
 	case FW_PORT_TYPE_NONE:
 		return (IFM_NONE);
 	}
 
 	return (IFM_UNKNOWN);
 }
 
 void
 cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 
 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4med") != 0)
 		return;
 	PORT_LOCK(pi);
 
 	if (pi->up_vis == 0) {
 		/*
 		 * If all the interfaces are administratively down the firmware
 		 * does not report transceiver changes.  Refresh port info here
 		 * so that ifconfig displays accurate ifmedia at all times.
 		 * This is the only reason we have a synchronized op in this
 		 * function.  Just PORT_LOCK would have been enough otherwise.
 		 */
 		t4_update_port_info(pi);
 		build_medialist(pi);
 	}
 
 	/* ifm_status */
 	ifmr->ifm_status = IFM_AVALID;
 	if (lc->link_ok == false)
 		goto done;
 	ifmr->ifm_status |= IFM_ACTIVE;
 
 	/* ifm_active */
 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
 	ifmr->ifm_active &= ~(IFM_ETH_TXPAUSE | IFM_ETH_RXPAUSE);
 	if (lc->fc & PAUSE_RX)
 		ifmr->ifm_active |= IFM_ETH_RXPAUSE;
 	if (lc->fc & PAUSE_TX)
 		ifmr->ifm_active |= IFM_ETH_TXPAUSE;
 	ifmr->ifm_active |= port_mword(pi, speed_to_fwcap(lc->speed));
 done:
 	PORT_UNLOCK(pi);
 	end_synchronized_op(sc, 0);
 }
 
 static int
 vcxgbe_probe(device_t dev)
 {
 	char buf[128];
 	struct vi_info *vi = device_get_softc(dev);
 
 	snprintf(buf, sizeof(buf), "port %d vi %td", vi->pi->port_id,
 	    vi - vi->pi->vi);
 	device_set_desc_copy(dev, buf);
 
 	return (BUS_PROBE_DEFAULT);
 }
 
 static int
 alloc_extra_vi(struct adapter *sc, struct port_info *pi, struct vi_info *vi)
 {
 	int func, index, rc;
 	uint32_t param, val;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	index = vi - pi->vi;
 	MPASS(index > 0);	/* This function deals with _extra_ VIs only */
 	KASSERT(index < nitems(vi_mac_funcs),
 	    ("%s: VI %s doesn't have a MAC func", __func__,
 	    device_get_nameunit(vi->dev)));
 	func = vi_mac_funcs[index];
 	rc = t4_alloc_vi_func(sc, sc->mbox, pi->tx_chan, sc->pf, 0, 1,
 	    vi->hw_addr, &vi->rss_size, &vi->vfvld, &vi->vin, func, 0);
 	if (rc < 0) {
 		device_printf(vi->dev, "failed to allocate virtual interface %d"
 		    "for port %d: %d\n", index, pi->port_id, -rc);
 		return (-rc);
 	}
 	vi->viid = rc;
 
 	if (vi->rss_size == 1) {
 		/*
 		 * This VI didn't get a slice of the RSS table.  Reduce the
 		 * number of VIs being created (hw.cxgbe.num_vis) or modify the
 		 * configuration file (nvi, rssnvi for this PF) if this is a
 		 * problem.
 		 */
 		device_printf(vi->dev, "RSS table not available.\n");
 		vi->rss_base = 0xffff;
 
 		return (0);
 	}
 
 	param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_RSSINFO) |
 	    V_FW_PARAMS_PARAM_YZ(vi->viid);
 	rc = t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 	if (rc)
 		vi->rss_base = 0xffff;
 	else {
 		MPASS((val >> 16) == vi->rss_size);
 		vi->rss_base = val & 0xffff;
 	}
 
 	return (0);
 }
 
 static int
 vcxgbe_attach(device_t dev)
 {
 	struct vi_info *vi;
 	struct port_info *pi;
 	struct adapter *sc;
 	int rc;
 
 	vi = device_get_softc(dev);
 	pi = vi->pi;
 	sc = pi->adapter;
 
 	rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4via");
 	if (rc)
 		return (rc);
 	rc = alloc_extra_vi(sc, pi, vi);
 	end_synchronized_op(sc, 0);
 	if (rc)
 		return (rc);
 
 	rc = cxgbe_vi_attach(dev, vi);
 	if (rc) {
 		t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid);
 		return (rc);
 	}
 	return (0);
 }
 
 static int
 vcxgbe_detach(device_t dev)
 {
 	struct vi_info *vi;
 	struct adapter *sc;
 
 	vi = device_get_softc(dev);
 	sc = vi->pi->adapter;
 
 	doom_vi(sc, vi);
 
 	cxgbe_vi_detach(vi);
 	t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid);
 
 	end_synchronized_op(sc, 0);
 
 	return (0);
 }
 
 static struct callout fatal_callout;
 
 static void
 delayed_panic(void *arg)
 {
 	struct adapter *sc = arg;
 
 	panic("%s: panic on fatal error", device_get_nameunit(sc->dev));
 }
 
 void
 t4_fatal_err(struct adapter *sc, bool fw_error)
 {
 
 	t4_shutdown_adapter(sc);
 	log(LOG_ALERT, "%s: encountered fatal error, adapter stopped.\n",
 	    device_get_nameunit(sc->dev));
 	if (fw_error) {
 		ASSERT_SYNCHRONIZED_OP(sc);
 		sc->flags |= ADAP_ERR;
 	} else {
 		ADAPTER_LOCK(sc);
 		sc->flags |= ADAP_ERR;
 		ADAPTER_UNLOCK(sc);
 	}
 
 	if (t4_panic_on_fatal_err) {
 		log(LOG_ALERT, "%s: panic on fatal error after 30s",
 		    device_get_nameunit(sc->dev));
 		callout_reset(&fatal_callout, hz * 30, delayed_panic, sc);
 	}
 }
 
 void
 t4_add_adapter(struct adapter *sc)
 {
 	sx_xlock(&t4_list_lock);
 	SLIST_INSERT_HEAD(&t4_list, sc, link);
 	sx_xunlock(&t4_list_lock);
 }
 
 int
 t4_map_bars_0_and_4(struct adapter *sc)
 {
 	sc->regs_rid = PCIR_BAR(0);
 	sc->regs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
 	    &sc->regs_rid, RF_ACTIVE);
 	if (sc->regs_res == NULL) {
 		device_printf(sc->dev, "cannot map registers.\n");
 		return (ENXIO);
 	}
 	sc->bt = rman_get_bustag(sc->regs_res);
 	sc->bh = rman_get_bushandle(sc->regs_res);
 	sc->mmio_len = rman_get_size(sc->regs_res);
 	setbit(&sc->doorbells, DOORBELL_KDB);
 
 	sc->msix_rid = PCIR_BAR(4);
 	sc->msix_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
 	    &sc->msix_rid, RF_ACTIVE);
 	if (sc->msix_res == NULL) {
 		device_printf(sc->dev, "cannot map MSI-X BAR.\n");
 		return (ENXIO);
 	}
 
 	return (0);
 }
 
 int
 t4_map_bar_2(struct adapter *sc)
 {
 
 	/*
 	 * T4: only iWARP driver uses the userspace doorbells.  There is no need
 	 * to map it if RDMA is disabled.
 	 */
 	if (is_t4(sc) && sc->rdmacaps == 0)
 		return (0);
 
 	sc->udbs_rid = PCIR_BAR(2);
 	sc->udbs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
 	    &sc->udbs_rid, RF_ACTIVE);
 	if (sc->udbs_res == NULL) {
 		device_printf(sc->dev, "cannot map doorbell BAR.\n");
 		return (ENXIO);
 	}
 	sc->udbs_base = rman_get_virtual(sc->udbs_res);
 
 	if (chip_id(sc) >= CHELSIO_T5) {
 		setbit(&sc->doorbells, DOORBELL_UDB);
 #if defined(__i386__) || defined(__amd64__)
 		if (t5_write_combine) {
 			int rc, mode;
 
 			/*
 			 * Enable write combining on BAR2.  This is the
 			 * userspace doorbell BAR and is split into 128B
 			 * (UDBS_SEG_SIZE) doorbell regions, each associated
 			 * with an egress queue.  The first 64B has the doorbell
 			 * and the second 64B can be used to submit a tx work
 			 * request with an implicit doorbell.
 			 */
 
 			rc = pmap_change_attr((vm_offset_t)sc->udbs_base,
 			    rman_get_size(sc->udbs_res), PAT_WRITE_COMBINING);
 			if (rc == 0) {
 				clrbit(&sc->doorbells, DOORBELL_UDB);
 				setbit(&sc->doorbells, DOORBELL_WCWR);
 				setbit(&sc->doorbells, DOORBELL_UDBWC);
 			} else {
 				device_printf(sc->dev,
 				    "couldn't enable write combining: %d\n",
 				    rc);
 			}
 
 			mode = is_t5(sc) ? V_STATMODE(0) : V_T6_STATMODE(0);
 			t4_write_reg(sc, A_SGE_STAT_CFG,
 			    V_STATSOURCE_T5(7) | mode);
 		}
 #endif
 	}
 	sc->iwt.wc_en = isset(&sc->doorbells, DOORBELL_UDBWC) ? 1 : 0;
 
 	return (0);
 }
 
 struct memwin_init {
 	uint32_t base;
 	uint32_t aperture;
 };
 
 static const struct memwin_init t4_memwin[NUM_MEMWIN] = {
 	{ MEMWIN0_BASE, MEMWIN0_APERTURE },
 	{ MEMWIN1_BASE, MEMWIN1_APERTURE },
 	{ MEMWIN2_BASE_T4, MEMWIN2_APERTURE_T4 }
 };
 
 static const struct memwin_init t5_memwin[NUM_MEMWIN] = {
 	{ MEMWIN0_BASE, MEMWIN0_APERTURE },
 	{ MEMWIN1_BASE, MEMWIN1_APERTURE },
 	{ MEMWIN2_BASE_T5, MEMWIN2_APERTURE_T5 },
 };
 
 static void
 setup_memwin(struct adapter *sc)
 {
 	const struct memwin_init *mw_init;
 	struct memwin *mw;
 	int i;
 	uint32_t bar0;
 
 	if (is_t4(sc)) {
 		/*
 		 * Read low 32b of bar0 indirectly via the hardware backdoor
 		 * mechanism.  Works from within PCI passthrough environments
 		 * too, where rman_get_start() can return a different value.  We
 		 * need to program the T4 memory window decoders with the actual
 		 * addresses that will be coming across the PCIe link.
 		 */
 		bar0 = t4_hw_pci_read_cfg4(sc, PCIR_BAR(0));
 		bar0 &= (uint32_t) PCIM_BAR_MEM_BASE;
 
 		mw_init = &t4_memwin[0];
 	} else {
 		/* T5+ use the relative offset inside the PCIe BAR */
 		bar0 = 0;
 
 		mw_init = &t5_memwin[0];
 	}
 
 	for (i = 0, mw = &sc->memwin[0]; i < NUM_MEMWIN; i++, mw_init++, mw++) {
 		rw_init(&mw->mw_lock, "memory window access");
 		mw->mw_base = mw_init->base;
 		mw->mw_aperture = mw_init->aperture;
 		mw->mw_curpos = 0;
 		t4_write_reg(sc,
 		    PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, i),
 		    (mw->mw_base + bar0) | V_BIR(0) |
 		    V_WINDOW(ilog2(mw->mw_aperture) - 10));
 		rw_wlock(&mw->mw_lock);
 		position_memwin(sc, i, 0);
 		rw_wunlock(&mw->mw_lock);
 	}
 
 	/* flush */
 	t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, 2));
 }
 
 /*
  * Positions the memory window at the given address in the card's address space.
  * There are some alignment requirements and the actual position may be at an
  * address prior to the requested address.  mw->mw_curpos always has the actual
  * position of the window.
  */
 static void
 position_memwin(struct adapter *sc, int idx, uint32_t addr)
 {
 	struct memwin *mw;
 	uint32_t pf;
 	uint32_t reg;
 
 	MPASS(idx >= 0 && idx < NUM_MEMWIN);
 	mw = &sc->memwin[idx];
 	rw_assert(&mw->mw_lock, RA_WLOCKED);
 
 	if (is_t4(sc)) {
 		pf = 0;
 		mw->mw_curpos = addr & ~0xf;	/* start must be 16B aligned */
 	} else {
 		pf = V_PFNUM(sc->pf);
 		mw->mw_curpos = addr & ~0x7f;	/* start must be 128B aligned */
 	}
 	reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, idx);
 	t4_write_reg(sc, reg, mw->mw_curpos | pf);
 	t4_read_reg(sc, reg);	/* flush */
 }
 
 int
 rw_via_memwin(struct adapter *sc, int idx, uint32_t addr, uint32_t *val,
     int len, int rw)
 {
 	struct memwin *mw;
 	uint32_t mw_end, v;
 
 	MPASS(idx >= 0 && idx < NUM_MEMWIN);
 
 	/* Memory can only be accessed in naturally aligned 4 byte units */
 	if (addr & 3 || len & 3 || len <= 0)
 		return (EINVAL);
 
 	mw = &sc->memwin[idx];
 	while (len > 0) {
 		rw_rlock(&mw->mw_lock);
 		mw_end = mw->mw_curpos + mw->mw_aperture;
 		if (addr >= mw_end || addr < mw->mw_curpos) {
 			/* Will need to reposition the window */
 			if (!rw_try_upgrade(&mw->mw_lock)) {
 				rw_runlock(&mw->mw_lock);
 				rw_wlock(&mw->mw_lock);
 			}
 			rw_assert(&mw->mw_lock, RA_WLOCKED);
 			position_memwin(sc, idx, addr);
 			rw_downgrade(&mw->mw_lock);
 			mw_end = mw->mw_curpos + mw->mw_aperture;
 		}
 		rw_assert(&mw->mw_lock, RA_RLOCKED);
 		while (addr < mw_end && len > 0) {
 			if (rw == 0) {
 				v = t4_read_reg(sc, mw->mw_base + addr -
 				    mw->mw_curpos);
 				*val++ = le32toh(v);
 			} else {
 				v = *val++;
 				t4_write_reg(sc, mw->mw_base + addr -
 				    mw->mw_curpos, htole32(v));
 			}
 			addr += 4;
 			len -= 4;
 		}
 		rw_runlock(&mw->mw_lock);
 	}
 
 	return (0);
 }
 
 int
 alloc_atid_tab(struct tid_info *t, int flags)
 {
 	int i;
 
 	MPASS(t->natids > 0);
 	MPASS(t->atid_tab == NULL);
 
 	t->atid_tab = malloc(t->natids * sizeof(*t->atid_tab), M_CXGBE,
 	    M_ZERO | flags);
 	if (t->atid_tab == NULL)
 		return (ENOMEM);
 	mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF);
 	t->afree = t->atid_tab;
 	t->atids_in_use = 0;
 	for (i = 1; i < t->natids; i++)
 		t->atid_tab[i - 1].next = &t->atid_tab[i];
 	t->atid_tab[t->natids - 1].next = NULL;
 
 	return (0);
 }
 
 void
 free_atid_tab(struct tid_info *t)
 {
 
 	KASSERT(t->atids_in_use == 0,
 	    ("%s: %d atids still in use.", __func__, t->atids_in_use));
 
 	if (mtx_initialized(&t->atid_lock))
 		mtx_destroy(&t->atid_lock);
 	free(t->atid_tab, M_CXGBE);
 	t->atid_tab = NULL;
 }
 
 int
 alloc_atid(struct adapter *sc, void *ctx)
 {
 	struct tid_info *t = &sc->tids;
 	int atid = -1;
 
 	mtx_lock(&t->atid_lock);
 	if (t->afree) {
 		union aopen_entry *p = t->afree;
 
 		atid = p - t->atid_tab;
 		MPASS(atid <= M_TID_TID);
 		t->afree = p->next;
 		p->data = ctx;
 		t->atids_in_use++;
 	}
 	mtx_unlock(&t->atid_lock);
 	return (atid);
 }
 
 void *
 lookup_atid(struct adapter *sc, int atid)
 {
 	struct tid_info *t = &sc->tids;
 
 	return (t->atid_tab[atid].data);
 }
 
 void
 free_atid(struct adapter *sc, int atid)
 {
 	struct tid_info *t = &sc->tids;
 	union aopen_entry *p = &t->atid_tab[atid];
 
 	mtx_lock(&t->atid_lock);
 	p->next = t->afree;
 	t->afree = p;
 	t->atids_in_use--;
 	mtx_unlock(&t->atid_lock);
 }
 
 static void
 queue_tid_release(struct adapter *sc, int tid)
 {
 
 	CXGBE_UNIMPLEMENTED("deferred tid release");
 }
 
 void
 release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq)
 {
 	struct wrqe *wr;
 	struct cpl_tid_release *req;
 
 	wr = alloc_wrqe(sizeof(*req), ctrlq);
 	if (wr == NULL) {
 		queue_tid_release(sc, tid);	/* defer */
 		return;
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid);
 
 	t4_wrq_tx(sc, wr);
 }
 
 static int
 t4_range_cmp(const void *a, const void *b)
 {
 	return ((const struct t4_range *)a)->start -
 	       ((const struct t4_range *)b)->start;
 }
 
 /*
  * Verify that the memory range specified by the addr/len pair is valid within
  * the card's address space.
  */
 static int
 validate_mem_range(struct adapter *sc, uint32_t addr, uint32_t len)
 {
 	struct t4_range mem_ranges[4], *r, *next;
 	uint32_t em, addr_len;
 	int i, n, remaining;
 
 	/* Memory can only be accessed in naturally aligned 4 byte units */
 	if (addr & 3 || len & 3 || len == 0)
 		return (EINVAL);
 
 	/* Enabled memories */
 	em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
 
 	r = &mem_ranges[0];
 	n = 0;
 	bzero(r, sizeof(mem_ranges));
 	if (em & F_EDRAM0_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR);
 		r->size = G_EDRAM0_SIZE(addr_len) << 20;
 		if (r->size > 0) {
 			r->start = G_EDRAM0_BASE(addr_len) << 20;
 			if (addr >= r->start &&
 			    addr + len <= r->start + r->size)
 				return (0);
 			r++;
 			n++;
 		}
 	}
 	if (em & F_EDRAM1_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR);
 		r->size = G_EDRAM1_SIZE(addr_len) << 20;
 		if (r->size > 0) {
 			r->start = G_EDRAM1_BASE(addr_len) << 20;
 			if (addr >= r->start &&
 			    addr + len <= r->start + r->size)
 				return (0);
 			r++;
 			n++;
 		}
 	}
 	if (em & F_EXT_MEM_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR);
 		r->size = G_EXT_MEM_SIZE(addr_len) << 20;
 		if (r->size > 0) {
 			r->start = G_EXT_MEM_BASE(addr_len) << 20;
 			if (addr >= r->start &&
 			    addr + len <= r->start + r->size)
 				return (0);
 			r++;
 			n++;
 		}
 	}
 	if (is_t5(sc) && em & F_EXT_MEM1_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR);
 		r->size = G_EXT_MEM1_SIZE(addr_len) << 20;
 		if (r->size > 0) {
 			r->start = G_EXT_MEM1_BASE(addr_len) << 20;
 			if (addr >= r->start &&
 			    addr + len <= r->start + r->size)
 				return (0);
 			r++;
 			n++;
 		}
 	}
 	MPASS(n <= nitems(mem_ranges));
 
 	if (n > 1) {
 		/* Sort and merge the ranges. */
 		qsort(mem_ranges, n, sizeof(struct t4_range), t4_range_cmp);
 
 		/* Start from index 0 and examine the next n - 1 entries. */
 		r = &mem_ranges[0];
 		for (remaining = n - 1; remaining > 0; remaining--, r++) {
 
 			MPASS(r->size > 0);	/* r is a valid entry. */
 			next = r + 1;
 			MPASS(next->size > 0);	/* and so is the next one. */
 
 			while (r->start + r->size >= next->start) {
 				/* Merge the next one into the current entry. */
 				r->size = max(r->start + r->size,
 				    next->start + next->size) - r->start;
 				n--;	/* One fewer entry in total. */
 				if (--remaining == 0)
 					goto done;	/* short circuit */
 				next++;
 			}
 			if (next != r + 1) {
 				/*
 				 * Some entries were merged into r and next
 				 * points to the first valid entry that couldn't
 				 * be merged.
 				 */
 				MPASS(next->size > 0);	/* must be valid */
 				memcpy(r + 1, next, remaining * sizeof(*r));
 #ifdef INVARIANTS
 				/*
 				 * This so that the foo->size assertion in the
 				 * next iteration of the loop do the right
 				 * thing for entries that were pulled up and are
 				 * no longer valid.
 				 */
 				MPASS(n < nitems(mem_ranges));
 				bzero(&mem_ranges[n], (nitems(mem_ranges) - n) *
 				    sizeof(struct t4_range));
 #endif
 			}
 		}
 done:
 		/* Done merging the ranges. */
 		MPASS(n > 0);
 		r = &mem_ranges[0];
 		for (i = 0; i < n; i++, r++) {
 			if (addr >= r->start &&
 			    addr + len <= r->start + r->size)
 				return (0);
 		}
 	}
 
 	return (EFAULT);
 }
 
 static int
 fwmtype_to_hwmtype(int mtype)
 {
 
 	switch (mtype) {
 	case FW_MEMTYPE_EDC0:
 		return (MEM_EDC0);
 	case FW_MEMTYPE_EDC1:
 		return (MEM_EDC1);
 	case FW_MEMTYPE_EXTMEM:
 		return (MEM_MC0);
 	case FW_MEMTYPE_EXTMEM1:
 		return (MEM_MC1);
 	default:
 		panic("%s: cannot translate fw mtype %d.", __func__, mtype);
 	}
 }
 
 /*
  * Verify that the memory range specified by the memtype/offset/len pair is
  * valid and lies entirely within the memtype specified.  The global address of
  * the start of the range is returned in addr.
  */
 static int
 validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, uint32_t len,
     uint32_t *addr)
 {
 	uint32_t em, addr_len, maddr;
 
 	/* Memory can only be accessed in naturally aligned 4 byte units */
 	if (off & 3 || len & 3 || len == 0)
 		return (EINVAL);
 
 	em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
 	switch (fwmtype_to_hwmtype(mtype)) {
 	case MEM_EDC0:
 		if (!(em & F_EDRAM0_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR);
 		maddr = G_EDRAM0_BASE(addr_len) << 20;
 		break;
 	case MEM_EDC1:
 		if (!(em & F_EDRAM1_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR);
 		maddr = G_EDRAM1_BASE(addr_len) << 20;
 		break;
 	case MEM_MC:
 		if (!(em & F_EXT_MEM_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR);
 		maddr = G_EXT_MEM_BASE(addr_len) << 20;
 		break;
 	case MEM_MC1:
 		if (!is_t5(sc) || !(em & F_EXT_MEM1_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR);
 		maddr = G_EXT_MEM1_BASE(addr_len) << 20;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	*addr = maddr + off;	/* global address */
 	return (validate_mem_range(sc, *addr, len));
 }
 
 static int
 fixup_devlog_params(struct adapter *sc)
 {
 	struct devlog_params *dparams = &sc->params.devlog;
 	int rc;
 
 	rc = validate_mt_off_len(sc, dparams->memtype, dparams->start,
 	    dparams->size, &dparams->addr);
 
 	return (rc);
 }
 
 static void
 update_nirq(struct intrs_and_queues *iaq, int nports)
 {
 	int extra = T4_EXTRA_INTR;
 
 	iaq->nirq = extra;
 	iaq->nirq += nports * (iaq->nrxq + iaq->nofldrxq);
 	iaq->nirq += nports * (iaq->num_vis - 1) *
 	    max(iaq->nrxq_vi, iaq->nnmrxq_vi);
 	iaq->nirq += nports * (iaq->num_vis - 1) * iaq->nofldrxq_vi;
 }
 
 /*
  * Adjust requirements to fit the number of interrupts available.
  */
 static void
 calculate_iaq(struct adapter *sc, struct intrs_and_queues *iaq, int itype,
     int navail)
 {
 	int old_nirq;
 	const int nports = sc->params.nports;
 
 	MPASS(nports > 0);
 	MPASS(navail > 0);
 
 	bzero(iaq, sizeof(*iaq));
 	iaq->intr_type = itype;
 	iaq->num_vis = t4_num_vis;
 	iaq->ntxq = t4_ntxq;
 	iaq->ntxq_vi = t4_ntxq_vi;
 	iaq->nrxq = t4_nrxq;
 	iaq->nrxq_vi = t4_nrxq_vi;
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	if (is_offload(sc) || is_ethoffload(sc)) {
 		iaq->nofldtxq = t4_nofldtxq;
 		iaq->nofldtxq_vi = t4_nofldtxq_vi;
 	}
 #endif
 #ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 		iaq->nofldrxq = t4_nofldrxq;
 		iaq->nofldrxq_vi = t4_nofldrxq_vi;
 	}
 #endif
 #ifdef DEV_NETMAP
 	iaq->nnmtxq_vi = t4_nnmtxq_vi;
 	iaq->nnmrxq_vi = t4_nnmrxq_vi;
 #endif
 
 	update_nirq(iaq, nports);
 	if (iaq->nirq <= navail &&
 	    (itype != INTR_MSI || powerof2(iaq->nirq))) {
 		/*
 		 * This is the normal case -- there are enough interrupts for
 		 * everything.
 		 */
 		goto done;
 	}
 
 	/*
 	 * If extra VIs have been configured try reducing their count and see if
 	 * that works.
 	 */
 	while (iaq->num_vis > 1) {
 		iaq->num_vis--;
 		update_nirq(iaq, nports);
 		if (iaq->nirq <= navail &&
 		    (itype != INTR_MSI || powerof2(iaq->nirq))) {
 			device_printf(sc->dev, "virtual interfaces per port "
 			    "reduced to %d from %d.  nrxq=%u, nofldrxq=%u, "
 			    "nrxq_vi=%u nofldrxq_vi=%u, nnmrxq_vi=%u.  "
 			    "itype %d, navail %u, nirq %d.\n",
 			    iaq->num_vis, t4_num_vis, iaq->nrxq, iaq->nofldrxq,
 			    iaq->nrxq_vi, iaq->nofldrxq_vi, iaq->nnmrxq_vi,
 			    itype, navail, iaq->nirq);
 			goto done;
 		}
 	}
 
 	/*
 	 * Extra VIs will not be created.  Log a message if they were requested.
 	 */
 	MPASS(iaq->num_vis == 1);
 	iaq->ntxq_vi = iaq->nrxq_vi = 0;
 	iaq->nofldtxq_vi = iaq->nofldrxq_vi = 0;
 	iaq->nnmtxq_vi = iaq->nnmrxq_vi = 0;
 	if (iaq->num_vis != t4_num_vis) {
 		device_printf(sc->dev, "extra virtual interfaces disabled.  "
 		    "nrxq=%u, nofldrxq=%u, nrxq_vi=%u nofldrxq_vi=%u, "
 		    "nnmrxq_vi=%u.  itype %d, navail %u, nirq %d.\n",
 		    iaq->nrxq, iaq->nofldrxq, iaq->nrxq_vi, iaq->nofldrxq_vi,
 		    iaq->nnmrxq_vi, itype, navail, iaq->nirq);
 	}
 
 	/*
 	 * Keep reducing the number of NIC rx queues to the next lower power of
 	 * 2 (for even RSS distribution) and halving the TOE rx queues and see
 	 * if that works.
 	 */
 	do {
 		if (iaq->nrxq > 1) {
 			do {
 				iaq->nrxq--;
 			} while (!powerof2(iaq->nrxq));
 		}
 		if (iaq->nofldrxq > 1)
 			iaq->nofldrxq >>= 1;
 
 		old_nirq = iaq->nirq;
 		update_nirq(iaq, nports);
 		if (iaq->nirq <= navail &&
 		    (itype != INTR_MSI || powerof2(iaq->nirq))) {
 			device_printf(sc->dev, "running with reduced number of "
 			    "rx queues because of shortage of interrupts.  "
 			    "nrxq=%u, nofldrxq=%u.  "
 			    "itype %d, navail %u, nirq %d.\n", iaq->nrxq,
 			    iaq->nofldrxq, itype, navail, iaq->nirq);
 			goto done;
 		}
 	} while (old_nirq != iaq->nirq);
 
 	/* One interrupt for everything.  Ugh. */
 	device_printf(sc->dev, "running with minimal number of queues.  "
 	    "itype %d, navail %u.\n", itype, navail);
 	iaq->nirq = 1;
 	MPASS(iaq->nrxq == 1);
 	iaq->ntxq = 1;
 	if (iaq->nofldrxq > 1)
 		iaq->nofldtxq = 1;
 done:
 	MPASS(iaq->num_vis > 0);
 	if (iaq->num_vis > 1) {
 		MPASS(iaq->nrxq_vi > 0);
 		MPASS(iaq->ntxq_vi > 0);
 	}
 	MPASS(iaq->nirq > 0);
 	MPASS(iaq->nrxq > 0);
 	MPASS(iaq->ntxq > 0);
 	if (itype == INTR_MSI) {
 		MPASS(powerof2(iaq->nirq));
 	}
 }
 
 static int
 cfg_itype_and_nqueues(struct adapter *sc, struct intrs_and_queues *iaq)
 {
 	int rc, itype, navail, nalloc;
 
 	for (itype = INTR_MSIX; itype; itype >>= 1) {
 
 		if ((itype & t4_intr_types) == 0)
 			continue;	/* not allowed */
 
 		if (itype == INTR_MSIX)
 			navail = pci_msix_count(sc->dev);
 		else if (itype == INTR_MSI)
 			navail = pci_msi_count(sc->dev);
 		else
 			navail = 1;
 restart:
 		if (navail == 0)
 			continue;
 
 		calculate_iaq(sc, iaq, itype, navail);
 		nalloc = iaq->nirq;
 		rc = 0;
 		if (itype == INTR_MSIX)
 			rc = pci_alloc_msix(sc->dev, &nalloc);
 		else if (itype == INTR_MSI)
 			rc = pci_alloc_msi(sc->dev, &nalloc);
 
 		if (rc == 0 && nalloc > 0) {
 			if (nalloc == iaq->nirq)
 				return (0);
 
 			/*
 			 * Didn't get the number requested.  Use whatever number
 			 * the kernel is willing to allocate.
 			 */
 			device_printf(sc->dev, "fewer vectors than requested, "
 			    "type=%d, req=%d, rcvd=%d; will downshift req.\n",
 			    itype, iaq->nirq, nalloc);
 			pci_release_msi(sc->dev);
 			navail = nalloc;
 			goto restart;
 		}
 
 		device_printf(sc->dev,
 		    "failed to allocate vectors:%d, type=%d, req=%d, rcvd=%d\n",
 		    itype, rc, iaq->nirq, nalloc);
 	}
 
 	device_printf(sc->dev,
 	    "failed to find a usable interrupt type.  "
 	    "allowed=%d, msi-x=%d, msi=%d, intx=1", t4_intr_types,
 	    pci_msix_count(sc->dev), pci_msi_count(sc->dev));
 
 	return (ENXIO);
 }
 
 #define FW_VERSION(chip) ( \
     V_FW_HDR_FW_VER_MAJOR(chip##FW_VERSION_MAJOR) | \
     V_FW_HDR_FW_VER_MINOR(chip##FW_VERSION_MINOR) | \
     V_FW_HDR_FW_VER_MICRO(chip##FW_VERSION_MICRO) | \
     V_FW_HDR_FW_VER_BUILD(chip##FW_VERSION_BUILD))
 #define FW_INTFVER(chip, intf) (chip##FW_HDR_INTFVER_##intf)
 
 /* Just enough of fw_hdr to cover all version info. */
 struct fw_h {
 	__u8	ver;
 	__u8	chip;
 	__be16	len512;
 	__be32	fw_ver;
 	__be32	tp_microcode_ver;
 	__u8	intfver_nic;
 	__u8	intfver_vnic;
 	__u8	intfver_ofld;
 	__u8	intfver_ri;
 	__u8	intfver_iscsipdu;
 	__u8	intfver_iscsi;
 	__u8	intfver_fcoepdu;
 	__u8	intfver_fcoe;
 };
 /* Spot check a couple of fields. */
 CTASSERT(offsetof(struct fw_h, fw_ver) == offsetof(struct fw_hdr, fw_ver));
 CTASSERT(offsetof(struct fw_h, intfver_nic) == offsetof(struct fw_hdr, intfver_nic));
 CTASSERT(offsetof(struct fw_h, intfver_fcoe) == offsetof(struct fw_hdr, intfver_fcoe));
 
 struct fw_info {
 	uint8_t chip;
 	char *kld_name;
 	char *fw_mod_name;
 	struct fw_h fw_h;
 } fw_info[] = {
 	{
 		.chip = CHELSIO_T4,
 		.kld_name = "t4fw_cfg",
 		.fw_mod_name = "t4fw",
 		.fw_h = {
 			.chip = FW_HDR_CHIP_T4,
 			.fw_ver = htobe32(FW_VERSION(T4)),
 			.intfver_nic = FW_INTFVER(T4, NIC),
 			.intfver_vnic = FW_INTFVER(T4, VNIC),
 			.intfver_ofld = FW_INTFVER(T4, OFLD),
 			.intfver_ri = FW_INTFVER(T4, RI),
 			.intfver_iscsipdu = FW_INTFVER(T4, ISCSIPDU),
 			.intfver_iscsi = FW_INTFVER(T4, ISCSI),
 			.intfver_fcoepdu = FW_INTFVER(T4, FCOEPDU),
 			.intfver_fcoe = FW_INTFVER(T4, FCOE),
 		},
 	}, {
 		.chip = CHELSIO_T5,
 		.kld_name = "t5fw_cfg",
 		.fw_mod_name = "t5fw",
 		.fw_h = {
 			.chip = FW_HDR_CHIP_T5,
 			.fw_ver = htobe32(FW_VERSION(T5)),
 			.intfver_nic = FW_INTFVER(T5, NIC),
 			.intfver_vnic = FW_INTFVER(T5, VNIC),
 			.intfver_ofld = FW_INTFVER(T5, OFLD),
 			.intfver_ri = FW_INTFVER(T5, RI),
 			.intfver_iscsipdu = FW_INTFVER(T5, ISCSIPDU),
 			.intfver_iscsi = FW_INTFVER(T5, ISCSI),
 			.intfver_fcoepdu = FW_INTFVER(T5, FCOEPDU),
 			.intfver_fcoe = FW_INTFVER(T5, FCOE),
 		},
 	}, {
 		.chip = CHELSIO_T6,
 		.kld_name = "t6fw_cfg",
 		.fw_mod_name = "t6fw",
 		.fw_h = {
 			.chip = FW_HDR_CHIP_T6,
 			.fw_ver = htobe32(FW_VERSION(T6)),
 			.intfver_nic = FW_INTFVER(T6, NIC),
 			.intfver_vnic = FW_INTFVER(T6, VNIC),
 			.intfver_ofld = FW_INTFVER(T6, OFLD),
 			.intfver_ri = FW_INTFVER(T6, RI),
 			.intfver_iscsipdu = FW_INTFVER(T6, ISCSIPDU),
 			.intfver_iscsi = FW_INTFVER(T6, ISCSI),
 			.intfver_fcoepdu = FW_INTFVER(T6, FCOEPDU),
 			.intfver_fcoe = FW_INTFVER(T6, FCOE),
 		},
 	}
 };
 
 static struct fw_info *
 find_fw_info(int chip)
 {
 	int i;
 
 	for (i = 0; i < nitems(fw_info); i++) {
 		if (fw_info[i].chip == chip)
 			return (&fw_info[i]);
 	}
 	return (NULL);
 }
 
 /*
  * Is the given firmware API compatible with the one the driver was compiled
  * with?
  */
 static int
 fw_compatible(const struct fw_h *hdr1, const struct fw_h *hdr2)
 {
 
 	/* short circuit if it's the exact same firmware version */
 	if (hdr1->chip == hdr2->chip && hdr1->fw_ver == hdr2->fw_ver)
 		return (1);
 
 	/*
 	 * XXX: Is this too conservative?  Perhaps I should limit this to the
 	 * features that are supported in the driver.
 	 */
 #define SAME_INTF(x) (hdr1->intfver_##x == hdr2->intfver_##x)
 	if (hdr1->chip == hdr2->chip && SAME_INTF(nic) && SAME_INTF(vnic) &&
 	    SAME_INTF(ofld) && SAME_INTF(ri) && SAME_INTF(iscsipdu) &&
 	    SAME_INTF(iscsi) && SAME_INTF(fcoepdu) && SAME_INTF(fcoe))
 		return (1);
 #undef SAME_INTF
 
 	return (0);
 }
 
 static int
 load_fw_module(struct adapter *sc, const struct firmware **dcfg,
     const struct firmware **fw)
 {
 	struct fw_info *fw_info;
 
 	*dcfg = NULL;
 	if (fw != NULL)
 		*fw = NULL;
 
 	fw_info = find_fw_info(chip_id(sc));
 	if (fw_info == NULL) {
 		device_printf(sc->dev,
 		    "unable to look up firmware information for chip %d.\n",
 		    chip_id(sc));
 		return (EINVAL);
 	}
 
 	*dcfg = firmware_get(fw_info->kld_name);
 	if (*dcfg != NULL) {
 		if (fw != NULL)
 			*fw = firmware_get(fw_info->fw_mod_name);
 		return (0);
 	}
 
 	return (ENOENT);
 }
 
 static void
 unload_fw_module(struct adapter *sc, const struct firmware *dcfg,
     const struct firmware *fw)
 {
 
 	if (fw != NULL)
 		firmware_put(fw, FIRMWARE_UNLOAD);
 	if (dcfg != NULL)
 		firmware_put(dcfg, FIRMWARE_UNLOAD);
 }
 
 /*
  * Return values:
  * 0 means no firmware install attempted.
  * ERESTART means a firmware install was attempted and was successful.
  * +ve errno means a firmware install was attempted but failed.
  */
 static int
 install_kld_firmware(struct adapter *sc, struct fw_h *card_fw,
     const struct fw_h *drv_fw, const char *reason, int *already)
 {
 	const struct firmware *cfg, *fw;
 	const uint32_t c = be32toh(card_fw->fw_ver);
 	uint32_t d, k;
 	int rc, fw_install;
 	struct fw_h bundled_fw;
 	bool load_attempted;
 
 	cfg = fw = NULL;
 	load_attempted = false;
 	fw_install = t4_fw_install < 0 ? -t4_fw_install : t4_fw_install;
 
 	if (reason != NULL)
 		goto install;
 
 	if ((sc->flags & FW_OK) == 0) {
 
 		if (c == 0xffffffff) {
 			reason = "missing";
 			goto install;
 		}
 
 		return (0);
 	}
 
 	memcpy(&bundled_fw, drv_fw, sizeof(bundled_fw));
 	if (t4_fw_install < 0) {
 		rc = load_fw_module(sc, &cfg, &fw);
 		if (rc != 0 || fw == NULL) {
 			device_printf(sc->dev,
 			    "failed to load firmware module: %d. cfg %p, fw %p;"
 			    " will use compiled-in firmware version for"
 			    "hw.cxgbe.fw_install checks.\n",
 			    rc, cfg, fw);
 		} else {
 			memcpy(&bundled_fw, fw->data, sizeof(bundled_fw));
 		}
 		load_attempted = true;
 	}
 	d = be32toh(bundled_fw.fw_ver);
 
 	if (!fw_compatible(card_fw, &bundled_fw)) {
 		reason = "incompatible or unusable";
 		goto install;
 	}
 
 	if (d > c) {
 		reason = "older than the version bundled with this driver";
 		goto install;
 	}
 
 	if (fw_install == 2 && d != c) {
 		reason = "different than the version bundled with this driver";
 		goto install;
 	}
 
 	/* No reason to do anything to the firmware already on the card. */
 	rc = 0;
 	goto done;
 
 install:
 	rc = 0;
 	if ((*already)++)
 		goto done;
 
 	if (fw_install == 0) {
 		device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, "
 		    "but the driver is prohibited from installing a firmware "
 		    "on the card.\n",
 		    G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c),
 		    G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason);
 
 		goto done;
 	}
 
 	/*
 	 * We'll attempt to install a firmware.  Load the module first (if it
 	 * hasn't been loaded already).
 	 */
 	if (!load_attempted) {
 		rc = load_fw_module(sc, &cfg, &fw);
 		if (rc != 0 || fw == NULL) {
 			device_printf(sc->dev,
 			    "failed to load firmware module: %d. cfg %p, fw %p\n",
 			    rc, cfg, fw);
 			/* carry on */
 		}
 	}
 	if (fw == NULL) {
 		device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, "
 		    "but the driver cannot take corrective action because it "
 		    "is unable to load the firmware module.\n",
 		    G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c),
 		    G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason);
 		rc = sc->flags & FW_OK ? 0 : ENOENT;
 		goto done;
 	}
 	k = be32toh(((const struct fw_hdr *)fw->data)->fw_ver);
 	if (k != d) {
 		MPASS(t4_fw_install > 0);
 		device_printf(sc->dev,
 		    "firmware in KLD (%u.%u.%u.%u) is not what the driver was "
 		    "expecting (%u.%u.%u.%u) and will not be used.\n",
 		    G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k),
 		    G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k),
 		    G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d),
 		    G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d));
 		rc = sc->flags & FW_OK ? 0 : EINVAL;
 		goto done;
 	}
 
 	device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, "
 	    "installing firmware %u.%u.%u.%u on card.\n",
 	    G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c),
 	    G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason,
 	    G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d),
 	    G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d));
 
 	rc = -t4_fw_upgrade(sc, sc->mbox, fw->data, fw->datasize, 0);
 	if (rc != 0) {
 		device_printf(sc->dev, "failed to install firmware: %d\n", rc);
 	} else {
 		/* Installed successfully, update the cached header too. */
 		rc = ERESTART;
 		memcpy(card_fw, fw->data, sizeof(*card_fw));
 	}
 done:
 	unload_fw_module(sc, cfg, fw);
 
 	return (rc);
 }
 
 /*
  * Establish contact with the firmware and attempt to become the master driver.
  *
  * A firmware will be installed to the card if needed (if the driver is allowed
  * to do so).
  */
 static int
 contact_firmware(struct adapter *sc)
 {
 	int rc, already = 0;
 	enum dev_state state;
 	struct fw_info *fw_info;
 	struct fw_hdr *card_fw;		/* fw on the card */
 	const struct fw_h *drv_fw;
 
 	fw_info = find_fw_info(chip_id(sc));
 	if (fw_info == NULL) {
 		device_printf(sc->dev,
 		    "unable to look up firmware information for chip %d.\n",
 		    chip_id(sc));
 		return (EINVAL);
 	}
 	drv_fw = &fw_info->fw_h;
 
 	/* Read the header of the firmware on the card */
 	card_fw = malloc(sizeof(*card_fw), M_CXGBE, M_ZERO | M_WAITOK);
 restart:
 	rc = -t4_get_fw_hdr(sc, card_fw);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "unable to read firmware header from card's flash: %d\n",
 		    rc);
 		goto done;
 	}
 
 	rc = install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw, NULL,
 	    &already);
 	if (rc == ERESTART)
 		goto restart;
 	if (rc != 0)
 		goto done;
 
 	rc = t4_fw_hello(sc, sc->mbox, sc->mbox, MASTER_MAY, &state);
 	if (rc < 0 || state == DEV_STATE_ERR) {
 		rc = -rc;
 		device_printf(sc->dev,
 		    "failed to connect to the firmware: %d, %d.  "
 		    "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW));
 #if 0
 		if (install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw,
 		    "not responding properly to HELLO", &already) == ERESTART)
 			goto restart;
 #endif
 		goto done;
 	}
 	MPASS(be32toh(card_fw->flags) & FW_HDR_FLAGS_RESET_HALT);
 	sc->flags |= FW_OK;	/* The firmware responded to the FW_HELLO. */
 
 	if (rc == sc->pf) {
 		sc->flags |= MASTER_PF;
 		rc = install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw,
 		    NULL, &already);
 		if (rc == ERESTART)
 			rc = 0;
 		else if (rc != 0)
 			goto done;
 	} else if (state == DEV_STATE_UNINIT) {
 		/*
 		 * We didn't get to be the master so we definitely won't be
 		 * configuring the chip.  It's a bug if someone else hasn't
 		 * configured it already.
 		 */
 		device_printf(sc->dev, "couldn't be master(%d), "
 		    "device not already initialized either(%d).  "
 		    "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW));
 		rc = EPROTO;
 		goto done;
 	} else {
 		/*
 		 * Some other PF is the master and has configured the chip.
 		 * This is allowed but untested.
 		 */
 		device_printf(sc->dev, "PF%d is master, device state %d.  "
 		    "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW));
 		snprintf(sc->cfg_file, sizeof(sc->cfg_file), "pf%d", rc);
 		sc->cfcsum = 0;
 		rc = 0;
 	}
 done:
 	if (rc != 0 && sc->flags & FW_OK) {
 		t4_fw_bye(sc, sc->mbox);
 		sc->flags &= ~FW_OK;
 	}
 	free(card_fw, M_CXGBE);
 	return (rc);
 }
 
 static int
 copy_cfg_file_to_card(struct adapter *sc, char *cfg_file,
     uint32_t mtype, uint32_t moff)
 {
 	struct fw_info *fw_info;
 	const struct firmware *dcfg, *rcfg = NULL;
 	const uint32_t *cfdata;
 	uint32_t cflen, addr;
 	int rc;
 
 	load_fw_module(sc, &dcfg, NULL);
 
 	/* Card specific interpretation of "default". */
 	if (strncmp(cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) {
 		if (pci_get_device(sc->dev) == 0x440a)
 			snprintf(cfg_file, sizeof(t4_cfg_file), UWIRE_CF);
 		if (is_fpga(sc))
 			snprintf(cfg_file, sizeof(t4_cfg_file), FPGA_CF);
 	}
 
 	if (strncmp(cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) {
 		if (dcfg == NULL) {
 			device_printf(sc->dev,
 			    "KLD with default config is not available.\n");
 			rc = ENOENT;
 			goto done;
 		}
 		cfdata = dcfg->data;
 		cflen = dcfg->datasize & ~3;
 	} else {
 		char s[32];
 
 		fw_info = find_fw_info(chip_id(sc));
 		if (fw_info == NULL) {
 			device_printf(sc->dev,
 			    "unable to look up firmware information for chip %d.\n",
 			    chip_id(sc));
 			rc = EINVAL;
 			goto done;
 		}
 		snprintf(s, sizeof(s), "%s_%s", fw_info->kld_name, cfg_file);
 
 		rcfg = firmware_get(s);
 		if (rcfg == NULL) {
 			device_printf(sc->dev,
 			    "unable to load module \"%s\" for configuration "
 			    "profile \"%s\".\n", s, cfg_file);
 			rc = ENOENT;
 			goto done;
 		}
 		cfdata = rcfg->data;
 		cflen = rcfg->datasize & ~3;
 	}
 
 	if (cflen > FLASH_CFG_MAX_SIZE) {
 		device_printf(sc->dev,
 		    "config file too long (%d, max allowed is %d).\n",
 		    cflen, FLASH_CFG_MAX_SIZE);
 		rc = EINVAL;
 		goto done;
 	}
 
 	rc = validate_mt_off_len(sc, mtype, moff, cflen, &addr);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "%s: addr (%d/0x%x) or len %d is not valid: %d.\n",
 		    __func__, mtype, moff, cflen, rc);
 		rc = EINVAL;
 		goto done;
 	}
 	write_via_memwin(sc, 2, addr, cfdata, cflen);
 done:
 	if (rcfg != NULL)
 		firmware_put(rcfg, FIRMWARE_UNLOAD);
 	unload_fw_module(sc, dcfg, NULL);
 	return (rc);
 }
 
 struct caps_allowed {
 	uint16_t nbmcaps;
 	uint16_t linkcaps;
 	uint16_t switchcaps;
 	uint16_t niccaps;
 	uint16_t toecaps;
 	uint16_t rdmacaps;
 	uint16_t cryptocaps;
 	uint16_t iscsicaps;
 	uint16_t fcoecaps;
 };
 
 #define FW_PARAM_DEV(param) \
 	(V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | \
 	 V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_##param))
 #define FW_PARAM_PFVF(param) \
 	(V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_PFVF) | \
 	 V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_PFVF_##param))
 
 /*
  * Provide a configuration profile to the firmware and have it initialize the
  * chip accordingly.  This may involve uploading a configuration file to the
  * card.
  */
 static int
 apply_cfg_and_initialize(struct adapter *sc, char *cfg_file,
     const struct caps_allowed *caps_allowed)
 {
 	int rc;
 	struct fw_caps_config_cmd caps;
 	uint32_t mtype, moff, finicsum, cfcsum, param, val;
 
 	rc = -t4_fw_reset(sc, sc->mbox, F_PIORSTMODE | F_PIORST);
 	if (rc != 0) {
 		device_printf(sc->dev, "firmware reset failed: %d.\n", rc);
 		return (rc);
 	}
 
 	bzero(&caps, sizeof(caps));
 	caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_READ);
 	if (strncmp(cfg_file, BUILTIN_CF, sizeof(t4_cfg_file)) == 0) {
 		mtype = 0;
 		moff = 0;
 		caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps));
 	} else if (strncmp(cfg_file, FLASH_CF, sizeof(t4_cfg_file)) == 0) {
 		mtype = FW_MEMTYPE_FLASH;
 		moff = t4_flash_cfg_addr(sc);
 		caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID |
 		    V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) |
 		    V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) |
 		    FW_LEN16(caps));
 	} else {
 		/*
 		 * Ask the firmware where it wants us to upload the config file.
 		 */
 		param = FW_PARAM_DEV(CF);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		if (rc != 0) {
 			/* No support for config file?  Shouldn't happen. */
 			device_printf(sc->dev,
 			    "failed to query config file location: %d.\n", rc);
 			goto done;
 		}
 		mtype = G_FW_PARAMS_PARAM_Y(val);
 		moff = G_FW_PARAMS_PARAM_Z(val) << 16;
 		caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID |
 		    V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) |
 		    V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) |
 		    FW_LEN16(caps));
 
 		rc = copy_cfg_file_to_card(sc, cfg_file, mtype, moff);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to upload config file to card: %d.\n", rc);
 			goto done;
 		}
 	}
 	rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps);
 	if (rc != 0) {
 		device_printf(sc->dev, "failed to pre-process config file: %d "
 		    "(mtype %d, moff 0x%x).\n", rc, mtype, moff);
 		goto done;
 	}
 
 	finicsum = be32toh(caps.finicsum);
 	cfcsum = be32toh(caps.cfcsum);	/* actual */
 	if (finicsum != cfcsum) {
 		device_printf(sc->dev,
 		    "WARNING: config file checksum mismatch: %08x %08x\n",
 		    finicsum, cfcsum);
 	}
 	sc->cfcsum = cfcsum;
 	snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", cfg_file);
 
 	/*
 	 * Let the firmware know what features will (not) be used so it can tune
 	 * things accordingly.
 	 */
 #define LIMIT_CAPS(x) do { \
 	caps.x##caps &= htobe16(caps_allowed->x##caps); \
 } while (0)
 	LIMIT_CAPS(nbm);
 	LIMIT_CAPS(link);
 	LIMIT_CAPS(switch);
 	LIMIT_CAPS(nic);
 	LIMIT_CAPS(toe);
 	LIMIT_CAPS(rdma);
 	LIMIT_CAPS(crypto);
 	LIMIT_CAPS(iscsi);
 	LIMIT_CAPS(fcoe);
 #undef LIMIT_CAPS
 	if (caps.niccaps & htobe16(FW_CAPS_CONFIG_NIC_HASHFILTER)) {
 		/*
 		 * TOE and hashfilters are mutually exclusive.  It is a config
 		 * file or firmware bug if both are reported as available.  Try
 		 * to cope with the situation in non-debug builds by disabling
 		 * TOE.
 		 */
 		MPASS(caps.toecaps == 0);
 
 		caps.toecaps = 0;
 		caps.rdmacaps = 0;
 		caps.iscsicaps = 0;
 	}
 
 	caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_WRITE);
 	caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps));
 	rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), NULL);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to process config file: %d.\n", rc);
 		goto done;
 	}
 
 	t4_tweak_chip_settings(sc);
 	set_params__pre_init(sc);
 
 	/* get basic stuff going */
 	rc = -t4_fw_initialize(sc, sc->mbox);
 	if (rc != 0) {
 		device_printf(sc->dev, "fw_initialize failed: %d.\n", rc);
 		goto done;
 	}
 done:
 	return (rc);
 }
 
 /*
  * Partition chip resources for use between various PFs, VFs, etc.
  */
 static int
 partition_resources(struct adapter *sc)
 {
 	char cfg_file[sizeof(t4_cfg_file)];
 	struct caps_allowed caps_allowed;
 	int rc;
 	bool fallback;
 
 	/* Only the master driver gets to configure the chip resources. */
 	MPASS(sc->flags & MASTER_PF);
 
 #define COPY_CAPS(x) do { \
 	caps_allowed.x##caps = t4_##x##caps_allowed; \
 } while (0)
 	bzero(&caps_allowed, sizeof(caps_allowed));
 	COPY_CAPS(nbm);
 	COPY_CAPS(link);
 	COPY_CAPS(switch);
 	COPY_CAPS(nic);
 	COPY_CAPS(toe);
 	COPY_CAPS(rdma);
 	COPY_CAPS(crypto);
 	COPY_CAPS(iscsi);
 	COPY_CAPS(fcoe);
 	fallback = sc->debug_flags & DF_DISABLE_CFG_RETRY ? false : true;
 	snprintf(cfg_file, sizeof(cfg_file), "%s", t4_cfg_file);
 retry:
 	rc = apply_cfg_and_initialize(sc, cfg_file, &caps_allowed);
 	if (rc != 0 && fallback) {
 		device_printf(sc->dev,
 		    "failed (%d) to configure card with \"%s\" profile, "
 		    "will fall back to a basic configuration and retry.\n",
 		    rc, cfg_file);
 		snprintf(cfg_file, sizeof(cfg_file), "%s", BUILTIN_CF);
 		bzero(&caps_allowed, sizeof(caps_allowed));
 		COPY_CAPS(nbm);
 		COPY_CAPS(link);
 		COPY_CAPS(switch);
 		COPY_CAPS(nic);
 		fallback = false;
 		goto retry;
 	}
 #undef COPY_CAPS
 	return (rc);
 }
 
 /*
  * Retrieve parameters that are needed (or nice to have) very early.
  */
 static int
 get_params__pre_init(struct adapter *sc)
 {
 	int rc;
 	uint32_t param[2], val[2];
 
 	t4_get_version_info(sc);
 
 	snprintf(sc->fw_version, sizeof(sc->fw_version), "%u.%u.%u.%u",
 	    G_FW_HDR_FW_VER_MAJOR(sc->params.fw_vers),
 	    G_FW_HDR_FW_VER_MINOR(sc->params.fw_vers),
 	    G_FW_HDR_FW_VER_MICRO(sc->params.fw_vers),
 	    G_FW_HDR_FW_VER_BUILD(sc->params.fw_vers));
 
 	snprintf(sc->bs_version, sizeof(sc->bs_version), "%u.%u.%u.%u",
 	    G_FW_HDR_FW_VER_MAJOR(sc->params.bs_vers),
 	    G_FW_HDR_FW_VER_MINOR(sc->params.bs_vers),
 	    G_FW_HDR_FW_VER_MICRO(sc->params.bs_vers),
 	    G_FW_HDR_FW_VER_BUILD(sc->params.bs_vers));
 
 	snprintf(sc->tp_version, sizeof(sc->tp_version), "%u.%u.%u.%u",
 	    G_FW_HDR_FW_VER_MAJOR(sc->params.tp_vers),
 	    G_FW_HDR_FW_VER_MINOR(sc->params.tp_vers),
 	    G_FW_HDR_FW_VER_MICRO(sc->params.tp_vers),
 	    G_FW_HDR_FW_VER_BUILD(sc->params.tp_vers));
 
 	snprintf(sc->er_version, sizeof(sc->er_version), "%u.%u.%u.%u",
 	    G_FW_HDR_FW_VER_MAJOR(sc->params.er_vers),
 	    G_FW_HDR_FW_VER_MINOR(sc->params.er_vers),
 	    G_FW_HDR_FW_VER_MICRO(sc->params.er_vers),
 	    G_FW_HDR_FW_VER_BUILD(sc->params.er_vers));
 
 	param[0] = FW_PARAM_DEV(PORTVEC);
 	param[1] = FW_PARAM_DEV(CCLK);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to query parameters (pre_init): %d.\n", rc);
 		return (rc);
 	}
 
 	sc->params.portvec = val[0];
 	sc->params.nports = bitcount32(val[0]);
 	sc->params.vpd.cclk = val[1];
 
 	/* Read device log parameters. */
 	rc = -t4_init_devlog_params(sc, 1);
 	if (rc == 0)
 		fixup_devlog_params(sc);
 	else {
 		device_printf(sc->dev,
 		    "failed to get devlog parameters: %d.\n", rc);
 		rc = 0;	/* devlog isn't critical for device operation */
 	}
 
 	return (rc);
 }
 
 /*
  * Any params that need to be set before FW_INITIALIZE.
  */
 static int
 set_params__pre_init(struct adapter *sc)
 {
 	int rc = 0;
 	uint32_t param, val;
 
 	if (chip_id(sc) >= CHELSIO_T6) {
 		param = FW_PARAM_DEV(HPFILTER_REGION_SUPPORT);
 		val = 1;
 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		/* firmwares < 1.20.1.0 do not have this param. */
 		if (rc == FW_EINVAL && sc->params.fw_vers <
 		    (V_FW_HDR_FW_VER_MAJOR(1) | V_FW_HDR_FW_VER_MINOR(20) |
 		    V_FW_HDR_FW_VER_MICRO(1) | V_FW_HDR_FW_VER_BUILD(0))) {
 			rc = 0;
 		}
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to enable high priority filters :%d.\n",
 			    rc);
 		}
 	}
 
 	/* Enable opaque VIIDs with firmwares that support it. */
 	param = FW_PARAM_DEV(OPAQUE_VIID_SMT_EXTN);
 	val = 1;
 	rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 	if (rc == 0 && val == 1)
 		sc->params.viid_smt_extn_support = true;
 	else
 		sc->params.viid_smt_extn_support = false;
 
 	return (rc);
 }
 
 /*
  * Retrieve various parameters that are of interest to the driver.  The device
  * has been initialized by the firmware at this point.
  */
 static int
 get_params__post_init(struct adapter *sc)
 {
 	int rc;
 	uint32_t param[7], val[7];
 	struct fw_caps_config_cmd caps;
 
 	param[0] = FW_PARAM_PFVF(IQFLINT_START);
 	param[1] = FW_PARAM_PFVF(EQ_START);
 	param[2] = FW_PARAM_PFVF(FILTER_START);
 	param[3] = FW_PARAM_PFVF(FILTER_END);
 	param[4] = FW_PARAM_PFVF(L2T_START);
 	param[5] = FW_PARAM_PFVF(L2T_END);
 	param[6] = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) |
 	    V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_VDD);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 7, param, val);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to query parameters (post_init): %d.\n", rc);
 		return (rc);
 	}
 
 	sc->sge.iq_start = val[0];
 	sc->sge.eq_start = val[1];
 	if ((int)val[3] > (int)val[2]) {
 		sc->tids.ftid_base = val[2];
 		sc->tids.ftid_end = val[3];
 		sc->tids.nftids = val[3] - val[2] + 1;
 	}
 	sc->vres.l2t.start = val[4];
 	sc->vres.l2t.size = val[5] - val[4] + 1;
 	KASSERT(sc->vres.l2t.size <= L2T_SIZE,
 	    ("%s: L2 table size (%u) larger than expected (%u)",
 	    __func__, sc->vres.l2t.size, L2T_SIZE));
 	sc->params.core_vdd = val[6];
 
 	if (chip_id(sc) >= CHELSIO_T6) {
 
 		sc->tids.tid_base = t4_read_reg(sc,
 		    A_LE_DB_ACTIVE_TABLE_START_INDEX);
 
 		param[0] = FW_PARAM_PFVF(HPFILTER_START);
 		param[1] = FW_PARAM_PFVF(HPFILTER_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			   "failed to query hpfilter parameters: %d.\n", rc);
 			return (rc);
 		}
 		if ((int)val[1] > (int)val[0]) {
 			sc->tids.hpftid_base = val[0];
 			sc->tids.hpftid_end = val[1];
 			sc->tids.nhpftids = val[1] - val[0] + 1;
 
 			/*
 			 * These should go off if the layout changes and the
 			 * driver needs to catch up.
 			 */
 			MPASS(sc->tids.hpftid_base == 0);
 			MPASS(sc->tids.tid_base == sc->tids.nhpftids);
 		}
 	}
 
 	/*
 	 * MPSBGMAP is queried separately because only recent firmwares support
 	 * it as a parameter and we don't want the compound query above to fail
 	 * on older firmwares.
 	 */
 	param[0] = FW_PARAM_DEV(MPSBGMAP);
 	val[0] = 0;
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 	if (rc == 0)
 		sc->params.mps_bg_map = val[0];
 	else
 		sc->params.mps_bg_map = 0;
 
 	/*
 	 * Determine whether the firmware supports the filter2 work request.
 	 * This is queried separately for the same reason as MPSBGMAP above.
 	 */
 	param[0] = FW_PARAM_DEV(FILTER2_WR);
 	val[0] = 0;
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 	if (rc == 0)
 		sc->params.filter2_wr_support = val[0] != 0;
 	else
 		sc->params.filter2_wr_support = 0;
 
 	/*
 	 * Find out whether we're allowed to use the ULPTX MEMWRITE DSGL.
 	 * This is queried separately for the same reason as other params above.
 	 */
 	param[0] = FW_PARAM_DEV(ULPTX_MEMWRITE_DSGL);
 	val[0] = 0;
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 	if (rc == 0)
 		sc->params.ulptx_memwrite_dsgl = val[0] != 0;
 	else
 		sc->params.ulptx_memwrite_dsgl = false;
 
 	/* get capabilites */
 	bzero(&caps, sizeof(caps));
 	caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_READ);
 	caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps));
 	rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to get card capabilities: %d.\n", rc);
 		return (rc);
 	}
 
 #define READ_CAPS(x) do { \
 	sc->x = htobe16(caps.x); \
 } while (0)
 	READ_CAPS(nbmcaps);
 	READ_CAPS(linkcaps);
 	READ_CAPS(switchcaps);
 	READ_CAPS(niccaps);
 	READ_CAPS(toecaps);
 	READ_CAPS(rdmacaps);
 	READ_CAPS(cryptocaps);
 	READ_CAPS(iscsicaps);
 	READ_CAPS(fcoecaps);
 
 	if (sc->niccaps & FW_CAPS_CONFIG_NIC_HASHFILTER) {
 		MPASS(chip_id(sc) > CHELSIO_T4);
 		MPASS(sc->toecaps == 0);
 		sc->toecaps = 0;
 
 		param[0] = FW_PARAM_DEV(NTID);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query HASHFILTER parameters: %d.\n", rc);
 			return (rc);
 		}
 		sc->tids.ntids = val[0];
 		if (sc->params.fw_vers <
 		    (V_FW_HDR_FW_VER_MAJOR(1) | V_FW_HDR_FW_VER_MINOR(20) |
 		    V_FW_HDR_FW_VER_MICRO(5) | V_FW_HDR_FW_VER_BUILD(0))) {
 			MPASS(sc->tids.ntids >= sc->tids.nhpftids);
 			sc->tids.ntids -= sc->tids.nhpftids;
 		}
 		sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS);
 		sc->params.hash_filter = 1;
 	}
 	if (sc->niccaps & FW_CAPS_CONFIG_NIC_ETHOFLD) {
 		param[0] = FW_PARAM_PFVF(ETHOFLD_START);
 		param[1] = FW_PARAM_PFVF(ETHOFLD_END);
 		param[2] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 3, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query NIC parameters: %d.\n", rc);
 			return (rc);
 		}
 		if ((int)val[1] > (int)val[0]) {
 			sc->tids.etid_base = val[0];
 			sc->tids.etid_end = val[1];
 			sc->tids.netids = val[1] - val[0] + 1;
 			sc->params.eo_wr_cred = val[2];
 			sc->params.ethoffload = 1;
 		}
 	}
 	if (sc->toecaps) {
 		/* query offload-related parameters */
 		param[0] = FW_PARAM_DEV(NTID);
 		param[1] = FW_PARAM_PFVF(SERVER_START);
 		param[2] = FW_PARAM_PFVF(SERVER_END);
 		param[3] = FW_PARAM_PFVF(TDDP_START);
 		param[4] = FW_PARAM_PFVF(TDDP_END);
 		param[5] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query TOE parameters: %d.\n", rc);
 			return (rc);
 		}
 		sc->tids.ntids = val[0];
 		if (sc->params.fw_vers <
 		    (V_FW_HDR_FW_VER_MAJOR(1) | V_FW_HDR_FW_VER_MINOR(20) |
 		    V_FW_HDR_FW_VER_MICRO(5) | V_FW_HDR_FW_VER_BUILD(0))) {
 			MPASS(sc->tids.ntids >= sc->tids.nhpftids);
 			sc->tids.ntids -= sc->tids.nhpftids;
 		}
 		sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS);
 		if ((int)val[2] > (int)val[1]) {
 			sc->tids.stid_base = val[1];
 			sc->tids.nstids = val[2] - val[1] + 1;
 		}
 		sc->vres.ddp.start = val[3];
 		sc->vres.ddp.size = val[4] - val[3] + 1;
 		sc->params.ofldq_wr_cred = val[5];
 		sc->params.offload = 1;
 	} else {
 		/*
 		 * The firmware attempts memfree TOE configuration for -SO cards
 		 * and will report toecaps=0 if it runs out of resources (this
 		 * depends on the config file).  It may not report 0 for other
 		 * capabilities dependent on the TOE in this case.  Set them to
 		 * 0 here so that the driver doesn't bother tracking resources
 		 * that will never be used.
 		 */
 		sc->iscsicaps = 0;
 		sc->rdmacaps = 0;
 	}
 	if (sc->rdmacaps) {
 		param[0] = FW_PARAM_PFVF(STAG_START);
 		param[1] = FW_PARAM_PFVF(STAG_END);
 		param[2] = FW_PARAM_PFVF(RQ_START);
 		param[3] = FW_PARAM_PFVF(RQ_END);
 		param[4] = FW_PARAM_PFVF(PBL_START);
 		param[5] = FW_PARAM_PFVF(PBL_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query RDMA parameters(1): %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.stag.start = val[0];
 		sc->vres.stag.size = val[1] - val[0] + 1;
 		sc->vres.rq.start = val[2];
 		sc->vres.rq.size = val[3] - val[2] + 1;
 		sc->vres.pbl.start = val[4];
 		sc->vres.pbl.size = val[5] - val[4] + 1;
 
 		param[0] = FW_PARAM_PFVF(SQRQ_START);
 		param[1] = FW_PARAM_PFVF(SQRQ_END);
 		param[2] = FW_PARAM_PFVF(CQ_START);
 		param[3] = FW_PARAM_PFVF(CQ_END);
 		param[4] = FW_PARAM_PFVF(OCQ_START);
 		param[5] = FW_PARAM_PFVF(OCQ_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query RDMA parameters(2): %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.qp.start = val[0];
 		sc->vres.qp.size = val[1] - val[0] + 1;
 		sc->vres.cq.start = val[2];
 		sc->vres.cq.size = val[3] - val[2] + 1;
 		sc->vres.ocq.start = val[4];
 		sc->vres.ocq.size = val[5] - val[4] + 1;
 
 		param[0] = FW_PARAM_PFVF(SRQ_START);
 		param[1] = FW_PARAM_PFVF(SRQ_END);
 		param[2] = FW_PARAM_DEV(MAXORDIRD_QP);
 		param[3] = FW_PARAM_DEV(MAXIRD_ADAPTER);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 4, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query RDMA parameters(3): %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.srq.start = val[0];
 		sc->vres.srq.size = val[1] - val[0] + 1;
 		sc->params.max_ordird_qp = val[2];
 		sc->params.max_ird_adapter = val[3];
 	}
 	if (sc->iscsicaps) {
 		param[0] = FW_PARAM_PFVF(ISCSI_START);
 		param[1] = FW_PARAM_PFVF(ISCSI_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query iSCSI parameters: %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.iscsi.start = val[0];
 		sc->vres.iscsi.size = val[1] - val[0] + 1;
 	}
 	if (sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS) {
 		param[0] = FW_PARAM_PFVF(TLS_START);
 		param[1] = FW_PARAM_PFVF(TLS_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query TLS parameters: %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.key.start = val[0];
 		sc->vres.key.size = val[1] - val[0] + 1;
 	}
 
 	t4_init_sge_params(sc);
 
 	/*
 	 * We've got the params we wanted to query via the firmware.  Now grab
 	 * some others directly from the chip.
 	 */
 	rc = t4_read_chip_settings(sc);
 
 	return (rc);
 }
 
 static int
 set_params__post_init(struct adapter *sc)
 {
 	uint32_t param, val;
 #ifdef TCP_OFFLOAD
 	int i, v, shift;
 #endif
 
 	/* ask for encapsulated CPLs */
 	param = FW_PARAM_PFVF(CPLFW4MSG_ENCAP);
 	val = 1;
 	(void)t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 
 	/* Enable 32b port caps if the firmware supports it. */
 	param = FW_PARAM_PFVF(PORT_CAPS32);
 	val = 1;
 	if (t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val) == 0)
 		sc->params.port_caps32 = 1;
 
 	/* Let filter + maskhash steer to a part of the VI's RSS region. */
 	val = 1 << (G_MASKSIZE(t4_read_reg(sc, A_TP_RSS_CONFIG_TNL)) - 1);
 	t4_set_reg_field(sc, A_TP_RSS_CONFIG_TNL, V_MASKFILTER(M_MASKFILTER),
 	    V_MASKFILTER(val - 1));
 
 #ifdef TCP_OFFLOAD
 	/*
 	 * Override the TOE timers with user provided tunables.  This is not the
 	 * recommended way to change the timers (the firmware config file is) so
 	 * these tunables are not documented.
 	 *
 	 * All the timer tunables are in microseconds.
 	 */
 	if (t4_toe_keepalive_idle != 0) {
 		v = us_to_tcp_ticks(sc, t4_toe_keepalive_idle);
 		v &= M_KEEPALIVEIDLE;
 		t4_set_reg_field(sc, A_TP_KEEP_IDLE,
 		    V_KEEPALIVEIDLE(M_KEEPALIVEIDLE), V_KEEPALIVEIDLE(v));
 	}
 	if (t4_toe_keepalive_interval != 0) {
 		v = us_to_tcp_ticks(sc, t4_toe_keepalive_interval);
 		v &= M_KEEPALIVEINTVL;
 		t4_set_reg_field(sc, A_TP_KEEP_INTVL,
 		    V_KEEPALIVEINTVL(M_KEEPALIVEINTVL), V_KEEPALIVEINTVL(v));
 	}
 	if (t4_toe_keepalive_count != 0) {
 		v = t4_toe_keepalive_count & M_KEEPALIVEMAXR2;
 		t4_set_reg_field(sc, A_TP_SHIFT_CNT,
 		    V_KEEPALIVEMAXR1(M_KEEPALIVEMAXR1) |
 		    V_KEEPALIVEMAXR2(M_KEEPALIVEMAXR2),
 		    V_KEEPALIVEMAXR1(1) | V_KEEPALIVEMAXR2(v));
 	}
 	if (t4_toe_rexmt_min != 0) {
 		v = us_to_tcp_ticks(sc, t4_toe_rexmt_min);
 		v &= M_RXTMIN;
 		t4_set_reg_field(sc, A_TP_RXT_MIN,
 		    V_RXTMIN(M_RXTMIN), V_RXTMIN(v));
 	}
 	if (t4_toe_rexmt_max != 0) {
 		v = us_to_tcp_ticks(sc, t4_toe_rexmt_max);
 		v &= M_RXTMAX;
 		t4_set_reg_field(sc, A_TP_RXT_MAX,
 		    V_RXTMAX(M_RXTMAX), V_RXTMAX(v));
 	}
 	if (t4_toe_rexmt_count != 0) {
 		v = t4_toe_rexmt_count & M_RXTSHIFTMAXR2;
 		t4_set_reg_field(sc, A_TP_SHIFT_CNT,
 		    V_RXTSHIFTMAXR1(M_RXTSHIFTMAXR1) |
 		    V_RXTSHIFTMAXR2(M_RXTSHIFTMAXR2),
 		    V_RXTSHIFTMAXR1(1) | V_RXTSHIFTMAXR2(v));
 	}
 	for (i = 0; i < nitems(t4_toe_rexmt_backoff); i++) {
 		if (t4_toe_rexmt_backoff[i] != -1) {
 			v = t4_toe_rexmt_backoff[i] & M_TIMERBACKOFFINDEX0;
 			shift = (i & 3) << 3;
 			t4_set_reg_field(sc, A_TP_TCP_BACKOFF_REG0 + (i & ~3),
 			    M_TIMERBACKOFFINDEX0 << shift, v << shift);
 		}
 	}
 #endif
 	return (0);
 }
 
 #undef FW_PARAM_PFVF
 #undef FW_PARAM_DEV
 
 static void
 t4_set_desc(struct adapter *sc)
 {
 	char buf[128];
 	struct adapter_params *p = &sc->params;
 
 	snprintf(buf, sizeof(buf), "Chelsio %s", p->vpd.id);
 
 	device_set_desc_copy(sc->dev, buf);
 }
 
 static inline void
 ifmedia_add4(struct ifmedia *ifm, int m)
 {
 
 	ifmedia_add(ifm, m, 0, NULL);
 	ifmedia_add(ifm, m | IFM_ETH_TXPAUSE, 0, NULL);
 	ifmedia_add(ifm, m | IFM_ETH_RXPAUSE, 0, NULL);
 	ifmedia_add(ifm, m | IFM_ETH_TXPAUSE | IFM_ETH_RXPAUSE, 0, NULL);
 }
 
 /*
  * This is the selected media, which is not quite the same as the active media.
  * The media line in ifconfig is "media: Ethernet selected (active)" if selected
  * and active are not the same, and "media: Ethernet selected" otherwise.
  */
 static void
 set_current_media(struct port_info *pi)
 {
 	struct link_config *lc;
 	struct ifmedia *ifm;
 	int mword;
 	u_int speed;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	/* Leave current media alone if it's already set to IFM_NONE. */
 	ifm = &pi->media;
 	if (ifm->ifm_cur != NULL &&
 	    IFM_SUBTYPE(ifm->ifm_cur->ifm_media) == IFM_NONE)
 		return;
 
 	lc = &pi->link_cfg;
 	if (lc->requested_aneg != AUTONEG_DISABLE &&
 	    lc->supported & FW_PORT_CAP32_ANEG) {
 		ifmedia_set(ifm, IFM_ETHER | IFM_AUTO);
 		return;
 	}
 	mword = IFM_ETHER | IFM_FDX;
 	if (lc->requested_fc & PAUSE_TX)
 		mword |= IFM_ETH_TXPAUSE;
 	if (lc->requested_fc & PAUSE_RX)
 		mword |= IFM_ETH_RXPAUSE;
 	if (lc->requested_speed == 0)
 		speed = port_top_speed(pi) * 1000;	/* Gbps -> Mbps */
 	else
 		speed = lc->requested_speed;
 	mword |= port_mword(pi, speed_to_fwcap(speed));
 	ifmedia_set(ifm, mword);
 }
 
 /*
  * Returns true if the ifmedia list for the port cannot change.
  */
 static bool
 fixed_ifmedia(struct port_info *pi)
 {
 
 	return (pi->port_type == FW_PORT_TYPE_BT_SGMII ||
 	    pi->port_type == FW_PORT_TYPE_BT_XFI ||
 	    pi->port_type == FW_PORT_TYPE_BT_XAUI ||
 	    pi->port_type == FW_PORT_TYPE_KX4 ||
 	    pi->port_type == FW_PORT_TYPE_KX ||
 	    pi->port_type == FW_PORT_TYPE_KR ||
 	    pi->port_type == FW_PORT_TYPE_BP_AP ||
 	    pi->port_type == FW_PORT_TYPE_BP4_AP ||
 	    pi->port_type == FW_PORT_TYPE_BP40_BA ||
 	    pi->port_type == FW_PORT_TYPE_KR4_100G ||
 	    pi->port_type == FW_PORT_TYPE_KR_SFP28 ||
 	    pi->port_type == FW_PORT_TYPE_KR_XLAUI);
 }
 
 static void
 build_medialist(struct port_info *pi)
 {
 	uint32_t ss, speed;
 	int unknown, mword, bit;
 	struct link_config *lc;
 	struct ifmedia *ifm;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	if (pi->flags & FIXED_IFMEDIA)
 		return;
 
 	/*
 	 * Rebuild the ifmedia list.
 	 */
 	ifm = &pi->media;
 	ifmedia_removeall(ifm);
 	lc = &pi->link_cfg;
 	ss = G_FW_PORT_CAP32_SPEED(lc->supported); /* Supported Speeds */
 	if (__predict_false(ss == 0)) {	/* not supposed to happen. */
 		MPASS(ss != 0);
 no_media:
 		MPASS(LIST_EMPTY(&ifm->ifm_list));
 		ifmedia_add(ifm, IFM_ETHER | IFM_NONE, 0, NULL);
 		ifmedia_set(ifm, IFM_ETHER | IFM_NONE);
 		return;
 	}
 
 	unknown = 0;
 	for (bit = S_FW_PORT_CAP32_SPEED; bit < fls(ss); bit++) {
 		speed = 1 << bit;
 		MPASS(speed & M_FW_PORT_CAP32_SPEED);
 		if (ss & speed) {
 			mword = port_mword(pi, speed);
 			if (mword == IFM_NONE) {
 				goto no_media;
 			} else if (mword == IFM_UNKNOWN)
 				unknown++;
 			else
 				ifmedia_add4(ifm, IFM_ETHER | IFM_FDX | mword);
 		}
 	}
 	if (unknown > 0) /* Add one unknown for all unknown media types. */
 		ifmedia_add4(ifm, IFM_ETHER | IFM_FDX | IFM_UNKNOWN);
 	if (lc->supported & FW_PORT_CAP32_ANEG)
 		ifmedia_add(ifm, IFM_ETHER | IFM_AUTO, 0, NULL);
 
 	set_current_media(pi);
 }
 
 /*
  * Initialize the requested fields in the link config based on driver tunables.
  */
 static void
 init_link_config(struct port_info *pi)
 {
 	struct link_config *lc = &pi->link_cfg;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	lc->requested_speed = 0;
 
 	if (t4_autoneg == 0)
 		lc->requested_aneg = AUTONEG_DISABLE;
 	else if (t4_autoneg == 1)
 		lc->requested_aneg = AUTONEG_ENABLE;
 	else
 		lc->requested_aneg = AUTONEG_AUTO;
 
 	lc->requested_fc = t4_pause_settings & (PAUSE_TX | PAUSE_RX |
 	    PAUSE_AUTONEG);
 
 	if (t4_fec == -1 || t4_fec & FEC_AUTO)
 		lc->requested_fec = FEC_AUTO;
 	else {
 		lc->requested_fec = FEC_NONE;
 		if (t4_fec & FEC_RS)
 			lc->requested_fec |= FEC_RS;
 		if (t4_fec & FEC_BASER_RS)
 			lc->requested_fec |= FEC_BASER_RS;
 	}
 }
 
 /*
  * Makes sure that all requested settings comply with what's supported by the
  * port.  Returns the number of settings that were invalid and had to be fixed.
  */
 static int
 fixup_link_config(struct port_info *pi)
 {
 	int n = 0;
 	struct link_config *lc = &pi->link_cfg;
 	uint32_t fwspeed;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	/* Speed (when not autonegotiating) */
 	if (lc->requested_speed != 0) {
 		fwspeed = speed_to_fwcap(lc->requested_speed);
 		if ((fwspeed & lc->supported) == 0) {
 			n++;
 			lc->requested_speed = 0;
 		}
 	}
 
 	/* Link autonegotiation */
 	MPASS(lc->requested_aneg == AUTONEG_ENABLE ||
 	    lc->requested_aneg == AUTONEG_DISABLE ||
 	    lc->requested_aneg == AUTONEG_AUTO);
 	if (lc->requested_aneg == AUTONEG_ENABLE &&
 	    !(lc->supported & FW_PORT_CAP32_ANEG)) {
 		n++;
 		lc->requested_aneg = AUTONEG_AUTO;
 	}
 
 	/* Flow control */
 	MPASS((lc->requested_fc & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)) == 0);
 	if (lc->requested_fc & PAUSE_TX &&
 	    !(lc->supported & FW_PORT_CAP32_FC_TX)) {
 		n++;
 		lc->requested_fc &= ~PAUSE_TX;
 	}
 	if (lc->requested_fc & PAUSE_RX &&
 	    !(lc->supported & FW_PORT_CAP32_FC_RX)) {
 		n++;
 		lc->requested_fc &= ~PAUSE_RX;
 	}
 	if (!(lc->requested_fc & PAUSE_AUTONEG) &&
 	    !(lc->supported & FW_PORT_CAP32_FORCE_PAUSE)) {
 		n++;
 		lc->requested_fc |= PAUSE_AUTONEG;
 	}
 
 	/* FEC */
 	if ((lc->requested_fec & FEC_RS &&
 	    !(lc->supported & FW_PORT_CAP32_FEC_RS)) ||
 	    (lc->requested_fec & FEC_BASER_RS &&
 	    !(lc->supported & FW_PORT_CAP32_FEC_BASER_RS))) {
 		n++;
 		lc->requested_fec = FEC_AUTO;
 	}
 
 	return (n);
 }
 
 /*
  * Apply the requested L1 settings, which are expected to be valid, to the
  * hardware.
  */
 static int
 apply_link_config(struct port_info *pi)
 {
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 	int rc;
 
 #ifdef INVARIANTS
 	ASSERT_SYNCHRONIZED_OP(sc);
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	if (lc->requested_aneg == AUTONEG_ENABLE)
 		MPASS(lc->supported & FW_PORT_CAP32_ANEG);
 	if (!(lc->requested_fc & PAUSE_AUTONEG))
 		MPASS(lc->supported & FW_PORT_CAP32_FORCE_PAUSE);
 	if (lc->requested_fc & PAUSE_TX)
 		MPASS(lc->supported & FW_PORT_CAP32_FC_TX);
 	if (lc->requested_fc & PAUSE_RX)
 		MPASS(lc->supported & FW_PORT_CAP32_FC_RX);
 	if (lc->requested_fec & FEC_RS)
 		MPASS(lc->supported & FW_PORT_CAP32_FEC_RS);
 	if (lc->requested_fec & FEC_BASER_RS)
 		MPASS(lc->supported & FW_PORT_CAP32_FEC_BASER_RS);
 #endif
 	rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc);
 	if (rc != 0) {
 		/* Don't complain if the VF driver gets back an EPERM. */
 		if (!(sc->flags & IS_VF) || rc != FW_EPERM)
 			device_printf(pi->dev, "l1cfg failed: %d\n", rc);
 	} else {
 		/*
 		 * An L1_CFG will almost always result in a link-change event if
 		 * the link is up, and the driver will refresh the actual
 		 * fec/fc/etc. when the notification is processed.  If the link
 		 * is down then the actual settings are meaningless.
 		 *
 		 * This takes care of the case where a change in the L1 settings
 		 * may not result in a notification.
 		 */
 		if (lc->link_ok && !(lc->requested_fc & PAUSE_AUTONEG))
 			lc->fc = lc->requested_fc & (PAUSE_TX | PAUSE_RX);
 	}
 	return (rc);
 }
 
 #define FW_MAC_EXACT_CHUNK	7
 
 /*
  * Program the port's XGMAC based on parameters in ifnet.  The caller also
  * indicates which parameters should be programmed (the rest are left alone).
  */
 int
 update_mac_settings(struct ifnet *ifp, int flags)
 {
 	int rc = 0;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	KASSERT(flags, ("%s: not told what to update.", __func__));
 
 	if (flags & XGMAC_MTU)
 		mtu = ifp->if_mtu;
 
 	if (flags & XGMAC_PROMISC)
 		promisc = ifp->if_flags & IFF_PROMISC ? 1 : 0;
 
 	if (flags & XGMAC_ALLMULTI)
 		allmulti = ifp->if_flags & IFF_ALLMULTI ? 1 : 0;
 
 	if (flags & XGMAC_VLANEX)
 		vlanex = ifp->if_capenable & IFCAP_VLAN_HWTAGGING ? 1 : 0;
 
 	if (flags & (XGMAC_MTU|XGMAC_PROMISC|XGMAC_ALLMULTI|XGMAC_VLANEX)) {
 		rc = -t4_set_rxmode(sc, sc->mbox, vi->viid, mtu, promisc,
 		    allmulti, 1, vlanex, false);
 		if (rc) {
 			if_printf(ifp, "set_rxmode (%x) failed: %d\n", flags,
 			    rc);
 			return (rc);
 		}
 	}
 
 	if (flags & XGMAC_UCADDR) {
 		uint8_t ucaddr[ETHER_ADDR_LEN];
 
 		bcopy(IF_LLADDR(ifp), ucaddr, sizeof(ucaddr));
 		rc = t4_change_mac(sc, sc->mbox, vi->viid, vi->xact_addr_filt,
 		    ucaddr, true, &vi->smt_idx);
 		if (rc < 0) {
 			rc = -rc;
 			if_printf(ifp, "change_mac failed: %d\n", rc);
 			return (rc);
 		} else {
 			vi->xact_addr_filt = rc;
 			rc = 0;
 		}
 	}
 
 	if (flags & XGMAC_MCADDRS) {
 		const uint8_t *mcaddr[FW_MAC_EXACT_CHUNK];
 		int del = 1;
 		uint64_t hash = 0;
 		struct ifmultiaddr *ifma;
 		int i = 0, j;
 
 		if_maddr_rlock(ifp);
 		CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_LINK)
 				continue;
 			mcaddr[i] =
 			    LLADDR((struct sockaddr_dl *)ifma->ifma_addr);
 			MPASS(ETHER_IS_MULTICAST(mcaddr[i]));
 			i++;
 
 			if (i == FW_MAC_EXACT_CHUNK) {
 				rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid,
 				    del, i, mcaddr, NULL, &hash, 0);
 				if (rc < 0) {
 					rc = -rc;
 					for (j = 0; j < i; j++) {
 						if_printf(ifp,
 						    "failed to add mc address"
 						    " %02x:%02x:%02x:"
 						    "%02x:%02x:%02x rc=%d\n",
 						    mcaddr[j][0], mcaddr[j][1],
 						    mcaddr[j][2], mcaddr[j][3],
 						    mcaddr[j][4], mcaddr[j][5],
 						    rc);
 					}
 					goto mcfail;
 				}
 				del = 0;
 				i = 0;
 			}
 		}
 		if (i > 0) {
 			rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid, del, i,
 			    mcaddr, NULL, &hash, 0);
 			if (rc < 0) {
 				rc = -rc;
 				for (j = 0; j < i; j++) {
 					if_printf(ifp,
 					    "failed to add mc address"
 					    " %02x:%02x:%02x:"
 					    "%02x:%02x:%02x rc=%d\n",
 					    mcaddr[j][0], mcaddr[j][1],
 					    mcaddr[j][2], mcaddr[j][3],
 					    mcaddr[j][4], mcaddr[j][5],
 					    rc);
 				}
 				goto mcfail;
 			}
 		}
 
 		rc = -t4_set_addr_hash(sc, sc->mbox, vi->viid, 0, hash, 0);
 		if (rc != 0)
 			if_printf(ifp, "failed to set mc address hash: %d", rc);
 mcfail:
 		if_maddr_runlock(ifp);
 	}
 
 	return (rc);
 }
 
 /*
  * {begin|end}_synchronized_op must be called from the same thread.
  */
 int
 begin_synchronized_op(struct adapter *sc, struct vi_info *vi, int flags,
     char *wmesg)
 {
 	int rc, pri;
 
 #ifdef WITNESS
 	/* the caller thinks it's ok to sleep, but is it really? */
 	if (flags & SLEEP_OK)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "begin_synchronized_op");
 #endif
 
 	if (INTR_OK)
 		pri = PCATCH;
 	else
 		pri = 0;
 
 	ADAPTER_LOCK(sc);
 	for (;;) {
 
 		if (vi && IS_DOOMED(vi)) {
 			rc = ENXIO;
 			goto done;
 		}
 
 		if (!IS_BUSY(sc)) {
 			rc = 0;
 			break;
 		}
 
 		if (!(flags & SLEEP_OK)) {
 			rc = EBUSY;
 			goto done;
 		}
 
 		if (mtx_sleep(&sc->flags, &sc->sc_lock, pri, wmesg, 0)) {
 			rc = EINTR;
 			goto done;
 		}
 	}
 
 	KASSERT(!IS_BUSY(sc), ("%s: controller busy.", __func__));
 	SET_BUSY(sc);
 #ifdef INVARIANTS
 	sc->last_op = wmesg;
 	sc->last_op_thr = curthread;
 	sc->last_op_flags = flags;
 #endif
 
 done:
 	if (!(flags & HOLD_LOCK) || rc)
 		ADAPTER_UNLOCK(sc);
 
 	return (rc);
 }
 
 /*
  * Tell if_ioctl and if_init that the VI is going away.  This is
  * special variant of begin_synchronized_op and must be paired with a
  * call to end_synchronized_op.
  */
 void
 doom_vi(struct adapter *sc, struct vi_info *vi)
 {
 
 	ADAPTER_LOCK(sc);
 	SET_DOOMED(vi);
 	wakeup(&sc->flags);
 	while (IS_BUSY(sc))
 		mtx_sleep(&sc->flags, &sc->sc_lock, 0, "t4detach", 0);
 	SET_BUSY(sc);
 #ifdef INVARIANTS
 	sc->last_op = "t4detach";
 	sc->last_op_thr = curthread;
 	sc->last_op_flags = 0;
 #endif
 	ADAPTER_UNLOCK(sc);
 }
 
 /*
  * {begin|end}_synchronized_op must be called from the same thread.
  */
 void
 end_synchronized_op(struct adapter *sc, int flags)
 {
 
 	if (flags & LOCK_HELD)
 		ADAPTER_LOCK_ASSERT_OWNED(sc);
 	else
 		ADAPTER_LOCK(sc);
 
 	KASSERT(IS_BUSY(sc), ("%s: controller not busy.", __func__));
 	CLR_BUSY(sc);
 	wakeup(&sc->flags);
 	ADAPTER_UNLOCK(sc);
 }
 
 static int
 cxgbe_init_synchronized(struct vi_info *vi)
 {
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	int rc = 0, i;
 	struct sge_txq *txq;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 		return (0);	/* already running */
 
 	if (!(sc->flags & FULL_INIT_DONE) &&
 	    ((rc = adapter_full_init(sc)) != 0))
 		return (rc);	/* error message displayed already */
 
 	if (!(vi->flags & VI_INIT_DONE) &&
 	    ((rc = vi_full_init(vi)) != 0))
 		return (rc); /* error message displayed already */
 
 	rc = update_mac_settings(ifp, XGMAC_ALL);
 	if (rc)
 		goto done;	/* error message displayed already */
 
 	PORT_LOCK(pi);
 	if (pi->up_vis == 0) {
 		t4_update_port_info(pi);
 		fixup_link_config(pi);
 		build_medialist(pi);
 		apply_link_config(pi);
 	}
 
 	rc = -t4_enable_vi(sc, sc->mbox, vi->viid, true, true);
 	if (rc != 0) {
 		if_printf(ifp, "enable_vi failed: %d\n", rc);
 		PORT_UNLOCK(pi);
 		goto done;
 	}
 
 	/*
 	 * Can't fail from this point onwards.  Review cxgbe_uninit_synchronized
 	 * if this changes.
 	 */
 
 	for_each_txq(vi, i, txq) {
 		TXQ_LOCK(txq);
 		txq->eq.flags |= EQ_ENABLED;
 		TXQ_UNLOCK(txq);
 	}
 
 	/*
 	 * The first iq of the first port to come up is used for tracing.
 	 */
 	if (sc->traceq < 0 && IS_MAIN_VI(vi)) {
 		sc->traceq = sc->sge.rxq[vi->first_rxq].iq.abs_id;
 		t4_write_reg(sc, is_t4(sc) ?  A_MPS_TRC_RSS_CONTROL :
 		    A_MPS_T5_TRC_RSS_CONTROL, V_RSSCONTROL(pi->tx_chan) |
 		    V_QUEUENUMBER(sc->traceq));
 		pi->flags |= HAS_TRACEQ;
 	}
 
 	/* all ok */
 	pi->up_vis++;
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 
 	if (pi->nvi > 1 || sc->flags & IS_VF)
 		callout_reset(&vi->tick, hz, vi_tick, vi);
 	else
 		callout_reset(&pi->tick, hz, cxgbe_tick, pi);
 	if (pi->link_cfg.link_ok)
 		t4_os_link_changed(pi);
 	PORT_UNLOCK(pi);
 done:
 	if (rc != 0)
 		cxgbe_uninit_synchronized(vi);
 
 	return (rc);
 }
 
 /*
  * Idempotent.
  */
 static int
 cxgbe_uninit_synchronized(struct vi_info *vi)
 {
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	int rc, i;
 	struct sge_txq *txq;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (!(vi->flags & VI_INIT_DONE)) {
 		if (__predict_false(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			KASSERT(0, ("uninited VI is running"));
 			if_printf(ifp, "uninited VI with running ifnet.  "
 			    "vi->flags 0x%016lx, if_flags 0x%08x, "
 			    "if_drv_flags 0x%08x\n", vi->flags, ifp->if_flags,
 			    ifp->if_drv_flags);
 		}
 		return (0);
 	}
 
 	/*
 	 * Disable the VI so that all its data in either direction is discarded
 	 * by the MPS.  Leave everything else (the queues, interrupts, and 1Hz
 	 * tick) intact as the TP can deliver negative advice or data that it's
 	 * holding in its RAM (for an offloaded connection) even after the VI is
 	 * disabled.
 	 */
 	rc = -t4_enable_vi(sc, sc->mbox, vi->viid, false, false);
 	if (rc) {
 		if_printf(ifp, "disable_vi failed: %d\n", rc);
 		return (rc);
 	}
 
 	for_each_txq(vi, i, txq) {
 		TXQ_LOCK(txq);
 		txq->eq.flags &= ~EQ_ENABLED;
 		TXQ_UNLOCK(txq);
 	}
 
 	PORT_LOCK(pi);
 	if (pi->nvi > 1 || sc->flags & IS_VF)
 		callout_stop(&vi->tick);
 	else
 		callout_stop(&pi->tick);
 	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 		PORT_UNLOCK(pi);
 		return (0);
 	}
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	pi->up_vis--;
 	if (pi->up_vis > 0) {
 		PORT_UNLOCK(pi);
 		return (0);
 	}
 
 	pi->link_cfg.link_ok = false;
 	pi->link_cfg.speed = 0;
 	pi->link_cfg.link_down_rc = 255;
 	t4_os_link_changed(pi);
 	PORT_UNLOCK(pi);
 
 	return (0);
 }
 
 /*
  * It is ok for this function to fail midway and return right away.  t4_detach
  * will walk the entire sc->irq list and clean up whatever is valid.
  */
 int
 t4_setup_intr_handlers(struct adapter *sc)
 {
 	int rc, rid, p, q, v;
 	char s[8];
 	struct irq *irq;
 	struct port_info *pi;
 	struct vi_info *vi;
 	struct sge *sge = &sc->sge;
 	struct sge_rxq *rxq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #ifdef DEV_NETMAP
 	struct sge_nm_rxq *nm_rxq;
 #endif
 #ifdef RSS
 	int nbuckets = rss_getnumbuckets();
 #endif
 
 	/*
 	 * Setup interrupts.
 	 */
 	irq = &sc->irq[0];
 	rid = sc->intr_type == INTR_INTX ? 0 : 1;
 	if (forwarding_intr_to_fwq(sc))
 		return (t4_alloc_irq(sc, irq, rid, t4_intr_all, sc, "all"));
 
 	/* Multiple interrupts. */
 	if (sc->flags & IS_VF)
 		KASSERT(sc->intr_count >= T4VF_EXTRA_INTR + sc->params.nports,
 		    ("%s: too few intr.", __func__));
 	else
 		KASSERT(sc->intr_count >= T4_EXTRA_INTR + sc->params.nports,
 		    ("%s: too few intr.", __func__));
 
 	/* The first one is always error intr on PFs */
 	if (!(sc->flags & IS_VF)) {
 		rc = t4_alloc_irq(sc, irq, rid, t4_intr_err, sc, "err");
 		if (rc != 0)
 			return (rc);
 		irq++;
 		rid++;
 	}
 
 	/* The second one is always the firmware event queue (first on VFs) */
 	rc = t4_alloc_irq(sc, irq, rid, t4_intr_evt, &sge->fwq, "evt");
 	if (rc != 0)
 		return (rc);
 	irq++;
 	rid++;
 
 	for_each_port(sc, p) {
 		pi = sc->port[p];
 		for_each_vi(pi, v, vi) {
 			vi->first_intr = rid - 1;
 
 			if (vi->nnmrxq > 0) {
 				int n = max(vi->nrxq, vi->nnmrxq);
 
 				rxq = &sge->rxq[vi->first_rxq];
 #ifdef DEV_NETMAP
 				nm_rxq = &sge->nm_rxq[vi->first_nm_rxq];
 #endif
 				for (q = 0; q < n; q++) {
 					snprintf(s, sizeof(s), "%x%c%x", p,
 					    'a' + v, q);
 					if (q < vi->nrxq)
 						irq->rxq = rxq++;
 #ifdef DEV_NETMAP
 					if (q < vi->nnmrxq)
 						irq->nm_rxq = nm_rxq++;
 
 					if (irq->nm_rxq != NULL &&
 					    irq->rxq == NULL) {
 						/* Netmap rx only */
 						rc = t4_alloc_irq(sc, irq, rid,
 						    t4_nm_intr, irq->nm_rxq, s);
 					}
 					if (irq->nm_rxq != NULL &&
 					    irq->rxq != NULL) {
 						/* NIC and Netmap rx */
 						rc = t4_alloc_irq(sc, irq, rid,
 						    t4_vi_intr, irq, s);
 					}
 #endif
 					if (irq->rxq != NULL &&
 					    irq->nm_rxq == NULL) {
 						/* NIC rx only */
 						rc = t4_alloc_irq(sc, irq, rid,
 						    t4_intr, irq->rxq, s);
 					}
 					if (rc != 0)
 						return (rc);
 #ifdef RSS
 					if (q < vi->nrxq) {
 						bus_bind_intr(sc->dev, irq->res,
 						    rss_getcpu(q % nbuckets));
 					}
 #endif
 					irq++;
 					rid++;
 					vi->nintr++;
 				}
 			} else {
 				for_each_rxq(vi, q, rxq) {
 					snprintf(s, sizeof(s), "%x%c%x", p,
 					    'a' + v, q);
 					rc = t4_alloc_irq(sc, irq, rid,
 					    t4_intr, rxq, s);
 					if (rc != 0)
 						return (rc);
 #ifdef RSS
 					bus_bind_intr(sc->dev, irq->res,
 					    rss_getcpu(q % nbuckets));
 #endif
 					irq++;
 					rid++;
 					vi->nintr++;
 				}
 			}
 #ifdef TCP_OFFLOAD
 			for_each_ofld_rxq(vi, q, ofld_rxq) {
 				snprintf(s, sizeof(s), "%x%c%x", p, 'A' + v, q);
 				rc = t4_alloc_irq(sc, irq, rid, t4_intr,
 				    ofld_rxq, s);
 				if (rc != 0)
 					return (rc);
 				irq++;
 				rid++;
 				vi->nintr++;
 			}
 #endif
 		}
 	}
 	MPASS(irq == &sc->irq[sc->intr_count]);
 
 	return (0);
 }
 
 int
 adapter_full_init(struct adapter *sc)
 {
 	int rc, i;
 #ifdef RSS
 	uint32_t raw_rss_key[RSS_KEYSIZE / sizeof(uint32_t)];
 	uint32_t rss_key[RSS_KEYSIZE / sizeof(uint32_t)];
 #endif
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 	KASSERT((sc->flags & FULL_INIT_DONE) == 0,
 	    ("%s: FULL_INIT_DONE already", __func__));
 
 	/*
 	 * queues that belong to the adapter (not any particular port).
 	 */
 	rc = t4_setup_adapter_queues(sc);
 	if (rc != 0)
 		goto done;
 
 	for (i = 0; i < nitems(sc->tq); i++) {
 		sc->tq[i] = taskqueue_create("t4 taskq", M_NOWAIT,
 		    taskqueue_thread_enqueue, &sc->tq[i]);
 		if (sc->tq[i] == NULL) {
 			device_printf(sc->dev,
 			    "failed to allocate task queue %d\n", i);
 			rc = ENOMEM;
 			goto done;
 		}
 		taskqueue_start_threads(&sc->tq[i], 1, PI_NET, "%s tq%d",
 		    device_get_nameunit(sc->dev), i);
 	}
 #ifdef RSS
 	MPASS(RSS_KEYSIZE == 40);
 	rss_getkey((void *)&raw_rss_key[0]);
 	for (i = 0; i < nitems(rss_key); i++) {
 		rss_key[i] = htobe32(raw_rss_key[nitems(rss_key) - 1 - i]);
 	}
 	t4_write_rss_key(sc, &rss_key[0], -1, 1);
 #endif
 
 	if (!(sc->flags & IS_VF))
 		t4_intr_enable(sc);
 	sc->flags |= FULL_INIT_DONE;
 done:
 	if (rc != 0)
 		adapter_full_uninit(sc);
 
 	return (rc);
 }
 
 int
 adapter_full_uninit(struct adapter *sc)
 {
 	int i;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	t4_teardown_adapter_queues(sc);
 
 	for (i = 0; i < nitems(sc->tq) && sc->tq[i]; i++) {
 		taskqueue_free(sc->tq[i]);
 		sc->tq[i] = NULL;
 	}
 
 	sc->flags &= ~FULL_INIT_DONE;
 
 	return (0);
 }
 
 #ifdef RSS
 #define SUPPORTED_RSS_HASHTYPES (RSS_HASHTYPE_RSS_IPV4 | \
     RSS_HASHTYPE_RSS_TCP_IPV4 | RSS_HASHTYPE_RSS_IPV6 | \
     RSS_HASHTYPE_RSS_TCP_IPV6 | RSS_HASHTYPE_RSS_UDP_IPV4 | \
     RSS_HASHTYPE_RSS_UDP_IPV6)
 
 /* Translates kernel hash types to hardware. */
 static int
 hashconfig_to_hashen(int hashconfig)
 {
 	int hashen = 0;
 
 	if (hashconfig & RSS_HASHTYPE_RSS_IPV4)
 		hashen |= F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN;
 	if (hashconfig & RSS_HASHTYPE_RSS_IPV6)
 		hashen |= F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN;
 	if (hashconfig & RSS_HASHTYPE_RSS_UDP_IPV4) {
 		hashen |= F_FW_RSS_VI_CONFIG_CMD_UDPEN |
 		    F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN;
 	}
 	if (hashconfig & RSS_HASHTYPE_RSS_UDP_IPV6) {
 		hashen |= F_FW_RSS_VI_CONFIG_CMD_UDPEN |
 		    F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN;
 	}
 	if (hashconfig & RSS_HASHTYPE_RSS_TCP_IPV4)
 		hashen |= F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN;
 	if (hashconfig & RSS_HASHTYPE_RSS_TCP_IPV6)
 		hashen |= F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN;
 
 	return (hashen);
 }
 
 /* Translates hardware hash types to kernel. */
 static int
 hashen_to_hashconfig(int hashen)
 {
 	int hashconfig = 0;
 
 	if (hashen & F_FW_RSS_VI_CONFIG_CMD_UDPEN) {
 		/*
 		 * If UDP hashing was enabled it must have been enabled for
 		 * either IPv4 or IPv6 (inclusive or).  Enabling UDP without
 		 * enabling any 4-tuple hash is nonsense configuration.
 		 */
 		MPASS(hashen & (F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN |
 		    F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN));
 
 		if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN)
 			hashconfig |= RSS_HASHTYPE_RSS_UDP_IPV4;
 		if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN)
 			hashconfig |= RSS_HASHTYPE_RSS_UDP_IPV6;
 	}
 	if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN)
 		hashconfig |= RSS_HASHTYPE_RSS_TCP_IPV4;
 	if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN)
 		hashconfig |= RSS_HASHTYPE_RSS_TCP_IPV6;
 	if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN)
 		hashconfig |= RSS_HASHTYPE_RSS_IPV4;
 	if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN)
 		hashconfig |= RSS_HASHTYPE_RSS_IPV6;
 
 	return (hashconfig);
 }
 #endif
 
 int
 vi_full_init(struct vi_info *vi)
 {
 	struct adapter *sc = vi->pi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	uint16_t *rss;
 	struct sge_rxq *rxq;
 	int rc, i, j;
 #ifdef RSS
 	int nbuckets = rss_getnumbuckets();
 	int hashconfig = rss_gethashconfig();
 	int extra;
 #endif
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	KASSERT((vi->flags & VI_INIT_DONE) == 0,
 	    ("%s: VI_INIT_DONE already", __func__));
 
 	sysctl_ctx_init(&vi->ctx);
 	vi->flags |= VI_SYSCTL_CTX;
 
 	/*
 	 * Allocate tx/rx/fl queues for this VI.
 	 */
 	rc = t4_setup_vi_queues(vi);
 	if (rc != 0)
 		goto done;	/* error message displayed already */
 
 	/*
 	 * Setup RSS for this VI.  Save a copy of the RSS table for later use.
 	 */
 	if (vi->nrxq > vi->rss_size) {
 		if_printf(ifp, "nrxq (%d) > hw RSS table size (%d); "
 		    "some queues will never receive traffic.\n", vi->nrxq,
 		    vi->rss_size);
 	} else if (vi->rss_size % vi->nrxq) {
 		if_printf(ifp, "nrxq (%d), hw RSS table size (%d); "
 		    "expect uneven traffic distribution.\n", vi->nrxq,
 		    vi->rss_size);
 	}
 #ifdef RSS
 	if (vi->nrxq != nbuckets) {
 		if_printf(ifp, "nrxq (%d) != kernel RSS buckets (%d);"
 		    "performance will be impacted.\n", vi->nrxq, nbuckets);
 	}
 #endif
 	rss = malloc(vi->rss_size * sizeof (*rss), M_CXGBE, M_ZERO | M_WAITOK);
 	for (i = 0; i < vi->rss_size;) {
 #ifdef RSS
 		j = rss_get_indirection_to_bucket(i);
 		j %= vi->nrxq;
 		rxq = &sc->sge.rxq[vi->first_rxq + j];
 		rss[i++] = rxq->iq.abs_id;
 #else
 		for_each_rxq(vi, j, rxq) {
 			rss[i++] = rxq->iq.abs_id;
 			if (i == vi->rss_size)
 				break;
 		}
 #endif
 	}
 
 	rc = -t4_config_rss_range(sc, sc->mbox, vi->viid, 0, vi->rss_size, rss,
 	    vi->rss_size);
 	if (rc != 0) {
 		free(rss, M_CXGBE);
 		if_printf(ifp, "rss_config failed: %d\n", rc);
 		goto done;
 	}
 
 #ifdef RSS
 	vi->hashen = hashconfig_to_hashen(hashconfig);
 
 	/*
 	 * We may have had to enable some hashes even though the global config
 	 * wants them disabled.  This is a potential problem that must be
 	 * reported to the user.
 	 */
 	extra = hashen_to_hashconfig(vi->hashen) ^ hashconfig;
 
 	/*
 	 * If we consider only the supported hash types, then the enabled hashes
 	 * are a superset of the requested hashes.  In other words, there cannot
 	 * be any supported hash that was requested but not enabled, but there
 	 * can be hashes that were not requested but had to be enabled.
 	 */
 	extra &= SUPPORTED_RSS_HASHTYPES;
 	MPASS((extra & hashconfig) == 0);
 
 	if (extra) {
 		if_printf(ifp,
 		    "global RSS config (0x%x) cannot be accommodated.\n",
 		    hashconfig);
 	}
 	if (extra & RSS_HASHTYPE_RSS_IPV4)
 		if_printf(ifp, "IPv4 2-tuple hashing forced on.\n");
 	if (extra & RSS_HASHTYPE_RSS_TCP_IPV4)
 		if_printf(ifp, "TCP/IPv4 4-tuple hashing forced on.\n");
 	if (extra & RSS_HASHTYPE_RSS_IPV6)
 		if_printf(ifp, "IPv6 2-tuple hashing forced on.\n");
 	if (extra & RSS_HASHTYPE_RSS_TCP_IPV6)
 		if_printf(ifp, "TCP/IPv6 4-tuple hashing forced on.\n");
 	if (extra & RSS_HASHTYPE_RSS_UDP_IPV4)
 		if_printf(ifp, "UDP/IPv4 4-tuple hashing forced on.\n");
 	if (extra & RSS_HASHTYPE_RSS_UDP_IPV6)
 		if_printf(ifp, "UDP/IPv6 4-tuple hashing forced on.\n");
 #else
 	vi->hashen = F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN |
 	    F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN |
 	    F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN |
 	    F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN | F_FW_RSS_VI_CONFIG_CMD_UDPEN;
 #endif
 	rc = -t4_config_vi_rss(sc, sc->mbox, vi->viid, vi->hashen, rss[0], 0, 0);
 	if (rc != 0) {
 		free(rss, M_CXGBE);
 		if_printf(ifp, "rss hash/defaultq config failed: %d\n", rc);
 		goto done;
 	}
 
 	vi->rss = rss;
 	vi->flags |= VI_INIT_DONE;
 done:
 	if (rc != 0)
 		vi_full_uninit(vi);
 
 	return (rc);
 }
 
 /*
  * Idempotent.
  */
 int
 vi_full_uninit(struct vi_info *vi)
 {
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	int i;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	struct sge_wrq *ofld_txq;
 #endif
 
 	if (vi->flags & VI_INIT_DONE) {
 
 		/* Need to quiesce queues.  */
 
 		/* XXX: Only for the first VI? */
 		if (IS_MAIN_VI(vi) && !(sc->flags & IS_VF))
 			quiesce_wrq(sc, &sc->sge.ctrlq[pi->port_id]);
 
 		for_each_txq(vi, i, txq) {
 			quiesce_txq(sc, txq);
 		}
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 		for_each_ofld_txq(vi, i, ofld_txq) {
 			quiesce_wrq(sc, ofld_txq);
 		}
 #endif
 
 		for_each_rxq(vi, i, rxq) {
 			quiesce_iq(sc, &rxq->iq);
 			quiesce_fl(sc, &rxq->fl);
 		}
 
 #ifdef TCP_OFFLOAD
 		for_each_ofld_rxq(vi, i, ofld_rxq) {
 			quiesce_iq(sc, &ofld_rxq->iq);
 			quiesce_fl(sc, &ofld_rxq->fl);
 		}
 #endif
 		free(vi->rss, M_CXGBE);
 		free(vi->nm_rss, M_CXGBE);
 	}
 
 	t4_teardown_vi_queues(vi);
 	vi->flags &= ~VI_INIT_DONE;
 
 	return (0);
 }
 
 static void
 quiesce_txq(struct adapter *sc, struct sge_txq *txq)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
 
 	(void) sc;	/* unused */
 
 #ifdef INVARIANTS
 	TXQ_LOCK(txq);
 	MPASS((eq->flags & EQ_ENABLED) == 0);
 	TXQ_UNLOCK(txq);
 #endif
 
 	/* Wait for the mp_ring to empty. */
 	while (!mp_ring_is_idle(txq->r)) {
 		mp_ring_check_drainage(txq->r, 0);
 		pause("rquiesce", 1);
 	}
 
 	/* Then wait for the hardware to finish. */
 	while (spg->cidx != htobe16(eq->pidx))
 		pause("equiesce", 1);
 
 	/* Finally, wait for the driver to reclaim all descriptors. */
 	while (eq->cidx != eq->pidx)
 		pause("dquiesce", 1);
 }
 
 static void
 quiesce_wrq(struct adapter *sc, struct sge_wrq *wrq)
 {
 
 	/* XXXTX */
 }
 
 static void
 quiesce_iq(struct adapter *sc, struct sge_iq *iq)
 {
 	(void) sc;	/* unused */
 
 	/* Synchronize with the interrupt handler */
 	while (!atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_DISABLED))
 		pause("iqfree", 1);
 }
 
 static void
 quiesce_fl(struct adapter *sc, struct sge_fl *fl)
 {
 	mtx_lock(&sc->sfl_lock);
 	FL_LOCK(fl);
 	fl->flags |= FL_DOOMED;
 	FL_UNLOCK(fl);
 	callout_stop(&sc->sfl_callout);
 	mtx_unlock(&sc->sfl_lock);
 
 	KASSERT((fl->flags & FL_STARVING) == 0,
 	    ("%s: still starving", __func__));
 }
 
 static int
 t4_alloc_irq(struct adapter *sc, struct irq *irq, int rid,
     driver_intr_t *handler, void *arg, char *name)
 {
 	int rc;
 
 	irq->rid = rid;
 	irq->res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &irq->rid,
 	    RF_SHAREABLE | RF_ACTIVE);
 	if (irq->res == NULL) {
 		device_printf(sc->dev,
 		    "failed to allocate IRQ for rid %d, name %s.\n", rid, name);
 		return (ENOMEM);
 	}
 
 	rc = bus_setup_intr(sc->dev, irq->res, INTR_MPSAFE | INTR_TYPE_NET,
 	    NULL, handler, arg, &irq->tag);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to setup interrupt for rid %d, name %s: %d\n",
 		    rid, name, rc);
 	} else if (name)
 		bus_describe_intr(sc->dev, irq->res, irq->tag, "%s", name);
 
 	return (rc);
 }
 
 static int
 t4_free_irq(struct adapter *sc, struct irq *irq)
 {
 	if (irq->tag)
 		bus_teardown_intr(sc->dev, irq->res, irq->tag);
 	if (irq->res)
 		bus_release_resource(sc->dev, SYS_RES_IRQ, irq->rid, irq->res);
 
 	bzero(irq, sizeof(*irq));
 
 	return (0);
 }
 
 static void
 get_regs(struct adapter *sc, struct t4_regdump *regs, uint8_t *buf)
 {
 
 	regs->version = chip_id(sc) | chip_rev(sc) << 10;
 	t4_get_regs(sc, buf, regs->len);
 }
 
 #define	A_PL_INDIR_CMD	0x1f8
 
 #define	S_PL_AUTOINC	31
 #define	M_PL_AUTOINC	0x1U
 #define	V_PL_AUTOINC(x)	((x) << S_PL_AUTOINC)
 #define	G_PL_AUTOINC(x)	(((x) >> S_PL_AUTOINC) & M_PL_AUTOINC)
 
 #define	S_PL_VFID	20
 #define	M_PL_VFID	0xffU
 #define	V_PL_VFID(x)	((x) << S_PL_VFID)
 #define	G_PL_VFID(x)	(((x) >> S_PL_VFID) & M_PL_VFID)
 
 #define	S_PL_ADDR	0
 #define	M_PL_ADDR	0xfffffU
 #define	V_PL_ADDR(x)	((x) << S_PL_ADDR)
 #define	G_PL_ADDR(x)	(((x) >> S_PL_ADDR) & M_PL_ADDR)
 
 #define	A_PL_INDIR_DATA	0x1fc
 
 static uint64_t
 read_vf_stat(struct adapter *sc, u_int vin, int reg)
 {
 	u32 stats[2];
 
 	mtx_assert(&sc->reg_lock, MA_OWNED);
 	if (sc->flags & IS_VF) {
 		stats[0] = t4_read_reg(sc, VF_MPS_REG(reg));
 		stats[1] = t4_read_reg(sc, VF_MPS_REG(reg + 4));
 	} else {
 		t4_write_reg(sc, A_PL_INDIR_CMD, V_PL_AUTOINC(1) |
 		    V_PL_VFID(vin) | V_PL_ADDR(VF_MPS_REG(reg)));
 		stats[0] = t4_read_reg(sc, A_PL_INDIR_DATA);
 		stats[1] = t4_read_reg(sc, A_PL_INDIR_DATA);
 	}
 	return (((uint64_t)stats[1]) << 32 | stats[0]);
 }
 
 static void
 t4_get_vi_stats(struct adapter *sc, u_int vin, struct fw_vi_stats_vf *stats)
 {
 
 #define GET_STAT(name) \
 	read_vf_stat(sc, vin, A_MPS_VF_STAT_##name##_L)
 
 	stats->tx_bcast_bytes    = GET_STAT(TX_VF_BCAST_BYTES);
 	stats->tx_bcast_frames   = GET_STAT(TX_VF_BCAST_FRAMES);
 	stats->tx_mcast_bytes    = GET_STAT(TX_VF_MCAST_BYTES);
 	stats->tx_mcast_frames   = GET_STAT(TX_VF_MCAST_FRAMES);
 	stats->tx_ucast_bytes    = GET_STAT(TX_VF_UCAST_BYTES);
 	stats->tx_ucast_frames   = GET_STAT(TX_VF_UCAST_FRAMES);
 	stats->tx_drop_frames    = GET_STAT(TX_VF_DROP_FRAMES);
 	stats->tx_offload_bytes  = GET_STAT(TX_VF_OFFLOAD_BYTES);
 	stats->tx_offload_frames = GET_STAT(TX_VF_OFFLOAD_FRAMES);
 	stats->rx_bcast_bytes    = GET_STAT(RX_VF_BCAST_BYTES);
 	stats->rx_bcast_frames   = GET_STAT(RX_VF_BCAST_FRAMES);
 	stats->rx_mcast_bytes    = GET_STAT(RX_VF_MCAST_BYTES);
 	stats->rx_mcast_frames   = GET_STAT(RX_VF_MCAST_FRAMES);
 	stats->rx_ucast_bytes    = GET_STAT(RX_VF_UCAST_BYTES);
 	stats->rx_ucast_frames   = GET_STAT(RX_VF_UCAST_FRAMES);
 	stats->rx_err_frames     = GET_STAT(RX_VF_ERR_FRAMES);
 
 #undef GET_STAT
 }
 
 static void
 t4_clr_vi_stats(struct adapter *sc, u_int vin)
 {
 	int reg;
 
 	t4_write_reg(sc, A_PL_INDIR_CMD, V_PL_AUTOINC(1) | V_PL_VFID(vin) |
 	    V_PL_ADDR(VF_MPS_REG(A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L)));
 	for (reg = A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L;
 	     reg <= A_MPS_VF_STAT_RX_VF_ERR_FRAMES_H; reg += 4)
 		t4_write_reg(sc, A_PL_INDIR_DATA, 0);
 }
 
 static void
 vi_refresh_stats(struct adapter *sc, struct vi_info *vi)
 {
 	struct timeval tv;
 	const struct timeval interval = {0, 250000};	/* 250ms */
 
 	if (!(vi->flags & VI_INIT_DONE))
 		return;
 
 	getmicrotime(&tv);
 	timevalsub(&tv, &interval);
 	if (timevalcmp(&tv, &vi->last_refreshed, <))
 		return;
 
 	mtx_lock(&sc->reg_lock);
 	t4_get_vi_stats(sc, vi->vin, &vi->stats);
 	getmicrotime(&vi->last_refreshed);
 	mtx_unlock(&sc->reg_lock);
 }
 
 static void
 cxgbe_refresh_stats(struct adapter *sc, struct port_info *pi)
 {
 	u_int i, v, tnl_cong_drops, bg_map;
 	struct timeval tv;
 	const struct timeval interval = {0, 250000};	/* 250ms */
 
 	getmicrotime(&tv);
 	timevalsub(&tv, &interval);
 	if (timevalcmp(&tv, &pi->last_refreshed, <))
 		return;
 
 	tnl_cong_drops = 0;
 	t4_get_port_stats(sc, pi->tx_chan, &pi->stats);
 	bg_map = pi->mps_bg_map;
 	while (bg_map) {
 		i = ffs(bg_map) - 1;
 		mtx_lock(&sc->reg_lock);
 		t4_read_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v, 1,
 		    A_TP_MIB_TNL_CNG_DROP_0 + i);
 		mtx_unlock(&sc->reg_lock);
 		tnl_cong_drops += v;
 		bg_map &= ~(1 << i);
 	}
 	pi->tnl_cong_drops = tnl_cong_drops;
 	getmicrotime(&pi->last_refreshed);
 }
 
 static void
 cxgbe_tick(void *arg)
 {
 	struct port_info *pi = arg;
 	struct adapter *sc = pi->adapter;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 	cxgbe_refresh_stats(sc, pi);
 
 	callout_schedule(&pi->tick, hz);
 }
 
 void
 vi_tick(void *arg)
 {
 	struct vi_info *vi = arg;
 	struct adapter *sc = vi->pi->adapter;
 
 	vi_refresh_stats(sc, vi);
 
 	callout_schedule(&vi->tick, hz);
 }
 
 /*
  * Should match fw_caps_config_<foo> enums in t4fw_interface.h
  */
 static char *caps_decoder[] = {
 	"\20\001IPMI\002NCSI",				/* 0: NBM */
 	"\20\001PPP\002QFC\003DCBX",			/* 1: link */
 	"\20\001INGRESS\002EGRESS",			/* 2: switch */
 	"\20\001NIC\002VM\003IDS\004UM\005UM_ISGL"	/* 3: NIC */
 	    "\006HASHFILTER\007ETHOFLD",
 	"\20\001TOE",					/* 4: TOE */
 	"\20\001RDDP\002RDMAC",				/* 5: RDMA */
 	"\20\001INITIATOR_PDU\002TARGET_PDU"		/* 6: iSCSI */
 	    "\003INITIATOR_CNXOFLD\004TARGET_CNXOFLD"
 	    "\005INITIATOR_SSNOFLD\006TARGET_SSNOFLD"
 	    "\007T10DIF"
 	    "\010INITIATOR_CMDOFLD\011TARGET_CMDOFLD",
 	"\20\001LOOKASIDE\002TLSKEYS",			/* 7: Crypto */
 	"\20\001INITIATOR\002TARGET\003CTRL_OFLD"	/* 8: FCoE */
 		    "\004PO_INITIATOR\005PO_TARGET",
 };
 
 void
 t4_sysctls(struct adapter *sc)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *children, *c0;
 	static char *doorbells = {"\20\1UDB\2WCWR\3UDBWC\4KDB"};
 
 	ctx = device_get_sysctl_ctx(sc->dev);
 
 	/*
 	 * dev.t4nex.X.
 	 */
 	oid = device_get_sysctl_tree(sc->dev);
 	c0 = children = SYSCTL_CHILDREN(oid);
 
 	sc->sc_do_rxcopy = 1;
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "do_rx_copy", CTLFLAG_RW,
 	    &sc->sc_do_rxcopy, 1, "Do RX copy of small frames");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nports", CTLFLAG_RD, NULL,
 	    sc->params.nports, "# of ports");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "doorbells",
 	    CTLTYPE_STRING | CTLFLAG_RD, doorbells, (uintptr_t)&sc->doorbells,
 	    sysctl_bitfield_8b, "A", "available doorbells");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "core_clock", CTLFLAG_RD, NULL,
 	    sc->params.vpd.cclk, "core clock frequency (in KHz)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_timers",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc->params.sge.timer_val,
 	    sizeof(sc->params.sge.timer_val), sysctl_int_array, "A",
 	    "interrupt holdoff timer values (us)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pkt_counts",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc->params.sge.counter_val,
 	    sizeof(sc->params.sge.counter_val), sysctl_int_array, "A",
 	    "interrupt holdoff packet counter values");
 
 	t4_sge_sysctls(sc, ctx, children);
 
 	sc->lro_timeout = 100;
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lro_timeout", CTLFLAG_RW,
 	    &sc->lro_timeout, 0, "lro inactive-flush timeout (in us)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dflags", CTLFLAG_RW,
 	    &sc->debug_flags, 0, "flags to enable runtime debugging");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "tp_version",
 	    CTLFLAG_RD, sc->tp_version, 0, "TP microcode version");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
 	    CTLFLAG_RD, sc->fw_version, 0, "firmware version");
 
 	if (sc->flags & IS_VF)
 		return;
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "hw_revision", CTLFLAG_RD,
 	    NULL, chip_rev(sc), "chip hardware revision");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "sn",
 	    CTLFLAG_RD, sc->params.vpd.sn, 0, "serial number");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "pn",
 	    CTLFLAG_RD, sc->params.vpd.pn, 0, "part number");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "ec",
 	    CTLFLAG_RD, sc->params.vpd.ec, 0, "engineering change");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "md_version",
 	    CTLFLAG_RD, sc->params.vpd.md, 0, "manufacturing diags version");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "na",
 	    CTLFLAG_RD, sc->params.vpd.na, 0, "network address");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "er_version", CTLFLAG_RD,
 	    sc->er_version, 0, "expansion ROM version");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "bs_version", CTLFLAG_RD,
 	    sc->bs_version, 0, "bootstrap firmware version");
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "scfg_version", CTLFLAG_RD,
 	    NULL, sc->params.scfg_vers, "serial config version");
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "vpd_version", CTLFLAG_RD,
 	    NULL, sc->params.vpd_vers, "VPD version");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "cf",
 	    CTLFLAG_RD, sc->cfg_file, 0, "configuration file");
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cfcsum", CTLFLAG_RD, NULL,
 	    sc->cfcsum, "config file checksum");
 
 #define SYSCTL_CAP(name, n, text) \
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, #name, \
 	    CTLTYPE_STRING | CTLFLAG_RD, caps_decoder[n], (uintptr_t)&sc->name, \
 	    sysctl_bitfield_16b, "A", "available " text " capabilities")
 
 	SYSCTL_CAP(nbmcaps, 0, "NBM");
 	SYSCTL_CAP(linkcaps, 1, "link");
 	SYSCTL_CAP(switchcaps, 2, "switch");
 	SYSCTL_CAP(niccaps, 3, "NIC");
 	SYSCTL_CAP(toecaps, 4, "TCP offload");
 	SYSCTL_CAP(rdmacaps, 5, "RDMA");
 	SYSCTL_CAP(iscsicaps, 6, "iSCSI");
 	SYSCTL_CAP(cryptocaps, 7, "crypto");
 	SYSCTL_CAP(fcoecaps, 8, "FCoE");
 #undef SYSCTL_CAP
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nfilters", CTLFLAG_RD,
 	    NULL, sc->tids.nftids, "number of filters");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT |
 	    CTLFLAG_RD, sc, 0, sysctl_temperature, "I",
 	    "chip temperature (in Celsius)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "loadavg", CTLTYPE_STRING |
 	    CTLFLAG_RD, sc, 0, sysctl_loadavg, "A",
 	    "microprocessor load averages (debug firmwares only)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "core_vdd", CTLFLAG_RD,
 	    &sc->params.core_vdd, 0, "core Vdd (in mV)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "local_cpus",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, LOCAL_CPUS,
 	    sysctl_cpus, "A", "local CPUs");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_cpus",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, INTR_CPUS,
 	    sysctl_cpus, "A", "preferred CPUs for interrupts");
 
+	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "swintr", CTLFLAG_RW,
+	    &sc->swintr, 0, "software triggered interrupts");
+
 	/*
 	 * dev.t4nex.X.misc.  Marked CTLFLAG_SKIP to avoid information overload.
 	 */
 	oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "misc",
 	    CTLFLAG_RD | CTLFLAG_SKIP, NULL,
 	    "logs and miscellaneous information");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cctrl",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cctrl, "A", "congestion control");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp0",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 0 (TP0)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp1",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 1,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 1 (TP1)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ulp",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 2,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 2 (ULP)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge0",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 3,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 3 (SGE0)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge1",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 4,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 4 (SGE1)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ncsi",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 5,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 5 (NCSI)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_la",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_la,
 	    "A", "CIM logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ma_la",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cim_ma_la, "A", "CIM MA logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp0",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 0 (ULP0)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp1",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 1 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 1 (ULP1)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp2",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 2 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 2 (ULP2)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp3",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 3 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 3 (ULP3)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 4 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 4 (SGE)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ncsi",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 5 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 5 (NCSI)");
 
 	if (chip_id(sc) > CHELSIO_T4) {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge0_rx",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, 6 + CIM_NUM_IBQ,
 		    sysctl_cim_ibq_obq, "A", "CIM OBQ 6 (SGE0-RX)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge1_rx",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, 7 + CIM_NUM_IBQ,
 		    sysctl_cim_ibq_obq, "A", "CIM OBQ 7 (SGE1-RX)");
 	}
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_pif_la",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cim_pif_la, "A", "CIM PIF logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_qcfg",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cim_qcfg, "A", "CIM queue configuration");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cpl_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cpl_stats, "A", "CPL statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ddp_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_ddp_stats, "A", "non-TCP DDP statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "devlog",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_devlog, "A", "firmware's device log");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fcoe_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_fcoe_stats, "A", "FCoE statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "hw_sched",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_hw_sched, "A", "hardware scheduler ");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "l2t",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_l2t, "A", "hardware L2 table");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "smt",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_smt, "A", "hardware source MAC table");
 
 #ifdef INET6
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "clip",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_clip, "A", "active CLIP table entries");
 #endif
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "lb_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_lb_stats, "A", "loopback statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "meminfo",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_meminfo, "A", "memory regions");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "mps_tcam",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    chip_id(sc) <= CHELSIO_T5 ? sysctl_mps_tcam : sysctl_mps_tcam_t6,
 	    "A", "MPS TCAM entries");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "path_mtus",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_path_mtus, "A", "path MTUs");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pm_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_pm_stats, "A", "PM statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_rdma_stats, "A", "RDMA statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tcp_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_tcp_stats, "A", "TCP statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tids",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_tids, "A", "TID information");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_err_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_tp_err_stats, "A", "TP error statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la_mask",
 	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, sysctl_tp_la_mask, "I",
 	    "TP logic analyzer event capture mask");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_tp_la, "A", "TP logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tx_rate",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_tx_rate, "A", "Tx rate");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ulprx_la",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_ulprx_la, "A", "ULPRX logic analyzer");
 
 	if (chip_id(sc) >= CHELSIO_T5) {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "wcwr_stats",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 		    sysctl_wcwr_stats, "A", "write combined work requests");
 	}
 
 #ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 		int i;
 		char s[4];
 
 		/*
 		 * dev.t4nex.X.toe.
 		 */
 		oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "toe", CTLFLAG_RD,
 		    NULL, "TOE parameters");
 		children = SYSCTL_CHILDREN(oid);
 
 		sc->tt.cong_algorithm = -1;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_algorithm",
 		    CTLFLAG_RW, &sc->tt.cong_algorithm, 0, "congestion control "
 		    "(-1 = default, 0 = reno, 1 = tahoe, 2 = newreno, "
 		    "3 = highspeed)");
 
 		sc->tt.sndbuf = 256 * 1024;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sndbuf", CTLFLAG_RW,
 		    &sc->tt.sndbuf, 0, "max hardware send buffer size");
 
 		sc->tt.ddp = 0;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp", CTLFLAG_RW,
 		    &sc->tt.ddp, 0, "DDP allowed");
 
 		sc->tt.rx_coalesce = 1;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_coalesce",
 		    CTLFLAG_RW, &sc->tt.rx_coalesce, 0, "receive coalescing");
 
 		sc->tt.tls = 0;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tls", CTLFLAG_RW,
 		    &sc->tt.tls, 0, "Inline TLS allowed");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tls_rx_ports",
 		    CTLTYPE_INT | CTLFLAG_RW, sc, 0, sysctl_tls_rx_ports,
 		    "I", "TCP ports that use inline TLS+TOE RX");
 
 		sc->tt.tx_align = 1;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_align",
 		    CTLFLAG_RW, &sc->tt.tx_align, 0, "chop and align payload");
 
 		sc->tt.tx_zcopy = 0;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_zcopy",
 		    CTLFLAG_RW, &sc->tt.tx_zcopy, 0,
 		    "Enable zero-copy aio_write(2)");
 
 		sc->tt.cop_managed_offloading = !!t4_cop_managed_offloading;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
 		    "cop_managed_offloading", CTLFLAG_RW,
 		    &sc->tt.cop_managed_offloading, 0,
 		    "COP (Connection Offload Policy) controls all TOE offload");
+
+		sc->tt.autorcvbuf_inc = 16 * 1024;
+		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "autorcvbuf_inc",
+		    CTLFLAG_RW, &sc->tt.autorcvbuf_inc, 0,
+		    "autorcvbuf increment");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timer_tick",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_tick, "A",
 		    "TP timer tick (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timestamp_tick",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, 1, sysctl_tp_tick, "A",
 		    "TCP timestamp tick (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_tick",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, 2, sysctl_tp_tick, "A",
 		    "DACK tick (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_timer",
 		    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, sysctl_tp_dack_timer,
 		    "IU", "DACK timer (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_min",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_RXT_MIN,
 		    sysctl_tp_timer, "LU", "Minimum retransmit interval (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_max",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_RXT_MAX,
 		    sysctl_tp_timer, "LU", "Maximum retransmit interval (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_min",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_PERS_MIN,
 		    sysctl_tp_timer, "LU", "Persist timer min (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_max",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_PERS_MAX,
 		    sysctl_tp_timer, "LU", "Persist timer max (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_idle",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_KEEP_IDLE,
 		    sysctl_tp_timer, "LU", "Keepalive idle timer (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_interval",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_KEEP_INTVL,
 		    sysctl_tp_timer, "LU", "Keepalive interval timer (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "initial_srtt",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_INIT_SRTT,
 		    sysctl_tp_timer, "LU", "Initial SRTT (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "finwait2_timer",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_FINWAIT2_TIMER,
 		    sysctl_tp_timer, "LU", "FINWAIT2 timer (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "syn_rexmt_count",
 		    CTLTYPE_UINT | CTLFLAG_RD, sc, S_SYNSHIFTMAX,
 		    sysctl_tp_shift_cnt, "IU",
 		    "Number of SYN retransmissions before abort");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_count",
 		    CTLTYPE_UINT | CTLFLAG_RD, sc, S_RXTSHIFTMAXR2,
 		    sysctl_tp_shift_cnt, "IU",
 		    "Number of retransmissions before abort");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_count",
 		    CTLTYPE_UINT | CTLFLAG_RD, sc, S_KEEPALIVEMAXR2,
 		    sysctl_tp_shift_cnt, "IU",
 		    "Number of keepalive probes before abort");
 
 		oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "rexmt_backoff",
 		    CTLFLAG_RD, NULL, "TOE retransmit backoffs");
 		children = SYSCTL_CHILDREN(oid);
 		for (i = 0; i < 16; i++) {
 			snprintf(s, sizeof(s), "%u", i);
 			SYSCTL_ADD_PROC(ctx, children, OID_AUTO, s,
 			    CTLTYPE_UINT | CTLFLAG_RD, sc, i, sysctl_tp_backoff,
 			    "IU", "TOE retransmit backoff");
 		}
 	}
 #endif
 }
 
 void
 vi_sysctls(struct vi_info *vi)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *children;
 
 	ctx = device_get_sysctl_ctx(vi->dev);
 
 	/*
 	 * dev.v?(cxgbe|cxl).X.
 	 */
 	oid = device_get_sysctl_tree(vi->dev);
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "viid", CTLFLAG_RD, NULL,
 	    vi->viid, "VI identifer");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nrxq", CTLFLAG_RD,
 	    &vi->nrxq, 0, "# of rx queues");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ntxq", CTLFLAG_RD,
 	    &vi->ntxq, 0, "# of tx queues");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_rxq", CTLFLAG_RD,
 	    &vi->first_rxq, 0, "index of first rx queue");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD,
 	    &vi->first_txq, 0, "index of first tx queue");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rss_base", CTLFLAG_RD, NULL,
 	    vi->rss_base, "start of RSS indirection table");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rss_size", CTLFLAG_RD, NULL,
 	    vi->rss_size, "size of RSS indirection table");
 
 	if (IS_MAIN_VI(vi)) {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rsrv_noflowq",
 		    CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_noflowq, "IU",
 		    "Reserve queue 0 for non-flowid packets");
 	}
 
 #ifdef TCP_OFFLOAD
 	if (vi->nofldrxq != 0) {
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD,
 		    &vi->nofldrxq, 0,
 		    "# of rx queues for offloaded TCP connections");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_rxq",
 		    CTLFLAG_RD, &vi->first_ofld_rxq, 0,
 		    "index of first TOE rx queue");
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx_ofld",
 		    CTLTYPE_INT | CTLFLAG_RW, vi, 0,
 		    sysctl_holdoff_tmr_idx_ofld, "I",
 		    "holdoff timer index for TOE queues");
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx_ofld",
 		    CTLTYPE_INT | CTLFLAG_RW, vi, 0,
 		    sysctl_holdoff_pktc_idx_ofld, "I",
 		    "holdoff packet counter index for TOE queues");
 	}
 #endif
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	if (vi->nofldtxq != 0) {
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldtxq", CTLFLAG_RD,
 		    &vi->nofldtxq, 0,
 		    "# of tx queues for TOE/ETHOFLD");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_txq",
 		    CTLFLAG_RD, &vi->first_ofld_txq, 0,
 		    "index of first TOE/ETHOFLD tx queue");
 	}
 #endif
 #ifdef DEV_NETMAP
 	if (vi->nnmrxq != 0) {
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmrxq", CTLFLAG_RD,
 		    &vi->nnmrxq, 0, "# of netmap rx queues");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmtxq", CTLFLAG_RD,
 		    &vi->nnmtxq, 0, "# of netmap tx queues");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_rxq",
 		    CTLFLAG_RD, &vi->first_nm_rxq, 0,
 		    "index of first netmap rx queue");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_txq",
 		    CTLFLAG_RD, &vi->first_nm_txq, 0,
 		    "index of first netmap tx queue");
 	}
 #endif
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx",
 	    CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_holdoff_tmr_idx, "I",
 	    "holdoff timer index");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx",
 	    CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_holdoff_pktc_idx, "I",
 	    "holdoff packet counter index");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_rxq",
 	    CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_qsize_rxq, "I",
 	    "rx queue size");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_txq",
 	    CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_qsize_txq, "I",
 	    "tx queue size");
 }
 
 static void
 cxgbe_sysctls(struct port_info *pi)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *children, *children2;
 	struct adapter *sc = pi->adapter;
 	int i;
 	char name[16];
 	static char *tc_flags = {"\20\1USER\2SYNC\3ASYNC\4ERR"};
 
 	ctx = device_get_sysctl_ctx(pi->dev);
 
 	/*
 	 * dev.cxgbe.X.
 	 */
 	oid = device_get_sysctl_tree(pi->dev);
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "linkdnrc", CTLTYPE_STRING |
 	   CTLFLAG_RD, pi, 0, sysctl_linkdnrc, "A", "reason why link is down");
 	if (pi->port_type == FW_PORT_TYPE_BT_XAUI) {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature",
 		    CTLTYPE_INT | CTLFLAG_RD, pi, 0, sysctl_btphy, "I",
 		    "PHY temperature (in Celsius)");
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fw_version",
 		    CTLTYPE_INT | CTLFLAG_RD, pi, 1, sysctl_btphy, "I",
 		    "PHY firmware version");
 	}
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pause_settings",
 	    CTLTYPE_STRING | CTLFLAG_RW, pi, 0, sysctl_pause_settings, "A",
     "PAUSE settings (bit 0 = rx_pause, 1 = tx_pause, 2 = pause_autoneg)");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fec",
 	    CTLTYPE_STRING | CTLFLAG_RW, pi, 0, sysctl_fec, "A",
 	    "Forward Error Correction (bit 0 = RS, bit 1 = BASER_RS)");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "autoneg",
 	    CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_autoneg, "I",
 	    "autonegotiation (-1 = not supported)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "max_speed", CTLFLAG_RD, NULL,
 	    port_top_speed(pi), "max speed (in Gbps)");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "mps_bg_map", CTLFLAG_RD, NULL,
 	    pi->mps_bg_map, "MPS buffer group map");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_e_chan_map", CTLFLAG_RD,
 	    NULL, pi->rx_e_chan_map, "TP rx e-channel map");
 
 	if (sc->flags & IS_VF)
 		return;
 
 	/*
 	 * dev.(cxgbe|cxl).X.tc.
 	 */
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "tc", CTLFLAG_RD, NULL,
 	    "Tx scheduler traffic classes (cl_rl)");
 	children2 = SYSCTL_CHILDREN(oid);
 	SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "pktsize",
 	    CTLFLAG_RW, &pi->sched_params->pktsize, 0,
 	    "pktsize for per-flow cl-rl (0 means up to the driver )");
 	SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "burstsize",
 	    CTLFLAG_RW, &pi->sched_params->burstsize, 0,
 	    "burstsize for per-flow cl-rl (0 means up to the driver)");
 	for (i = 0; i < sc->chip_params->nsched_cls; i++) {
 		struct tx_cl_rl_params *tc = &pi->sched_params->cl_rl[i];
 
 		snprintf(name, sizeof(name), "%d", i);
 		children2 = SYSCTL_CHILDREN(SYSCTL_ADD_NODE(ctx,
 		    SYSCTL_CHILDREN(oid), OID_AUTO, name, CTLFLAG_RD, NULL,
 		    "traffic class"));
 		SYSCTL_ADD_PROC(ctx, children2, OID_AUTO, "flags",
 		    CTLTYPE_STRING | CTLFLAG_RD, tc_flags, (uintptr_t)&tc->flags,
 		    sysctl_bitfield_8b, "A", "flags");
 		SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "refcount",
 		    CTLFLAG_RD, &tc->refcount, 0, "references to this class");
 		SYSCTL_ADD_PROC(ctx, children2, OID_AUTO, "params",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, (pi->port_id << 16) | i,
 		    sysctl_tc_params, "A", "traffic class parameters");
 	}
 
 	/*
 	 * dev.cxgbe.X.stats.
 	 */
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "stats", CTLFLAG_RD,
 	    NULL, "port statistics");
 	children = SYSCTL_CHILDREN(oid);
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_parse_error", CTLFLAG_RD,
 	    &pi->tx_parse_error, 0,
 	    "# of tx packets with invalid length or # of segments");
 
 #define SYSCTL_ADD_T4_REG64(pi, name, desc, reg) \
 	SYSCTL_ADD_OID(ctx, children, OID_AUTO, name, \
 	    CTLTYPE_U64 | CTLFLAG_RD, sc, reg, \
 	    sysctl_handle_t4_reg64, "QU", desc)
 
 	SYSCTL_ADD_T4_REG64(pi, "tx_octets", "# of octets in good frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BYTES_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames", "total # of good frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_FRAMES_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_bcast_frames", "# of broadcast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_mcast_frames", "# of multicast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_MCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ucast_frames", "# of unicast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_UCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_error_frames", "# of error frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_64",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_64B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_65_127",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_65B_127B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_128_255",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_128B_255B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_256_511",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_256B_511B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_512_1023",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_512B_1023B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_1024_1518",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1024B_1518B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_1519_max",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1519B_MAX_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_drop", "# of dropped tx frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_DROP_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_pause", "# of pause frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PAUSE_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp0", "# of PPP prio 0 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP0_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp1", "# of PPP prio 1 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP1_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp2", "# of PPP prio 2 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP2_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp3", "# of PPP prio 3 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP3_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp4", "# of PPP prio 4 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP4_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp5", "# of PPP prio 5 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP5_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp6", "# of PPP prio 6 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP6_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp7", "# of PPP prio 7 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP7_L));
 
 	SYSCTL_ADD_T4_REG64(pi, "rx_octets", "# of octets in good frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BYTES_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames", "total # of good frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_FRAMES_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_bcast_frames", "# of broadcast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_mcast_frames", "# of multicast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ucast_frames", "# of unicast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_UCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_too_long", "# of frames exceeding MTU",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_jabber", "# of jabber frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_fcs_err",
 	    "# of frames received with bad FCS",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_CRC_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_len_err",
 	    "# of frames received with length error",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LEN_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_symbol_err", "symbol errors",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_SYM_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_runt", "# of short frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LESS_64B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_64",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_64B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_65_127",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_65B_127B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_128_255",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_128B_255B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_256_511",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_256B_511B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_512_1023",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_512B_1023B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_1024_1518",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1024B_1518B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_1519_max",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1519B_MAX_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_pause", "# of pause frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PAUSE_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp0", "# of PPP prio 0 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP0_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp1", "# of PPP prio 1 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP1_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp2", "# of PPP prio 2 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP2_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp3", "# of PPP prio 3 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP3_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp4", "# of PPP prio 4 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP4_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp5", "# of PPP prio 5 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP5_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp6", "# of PPP prio 6 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP6_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp7", "# of PPP prio 7 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP7_L));
 
 #undef SYSCTL_ADD_T4_REG64
 
 #define SYSCTL_ADD_T4_PORTSTAT(name, desc) \
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, #name, CTLFLAG_RD, \
 	    &pi->stats.name, desc)
 
 	/* We get these from port_stats and they may be stale by up to 1s */
 	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow0,
 	    "# drops due to buffer-group 0 overflows");
 	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow1,
 	    "# drops due to buffer-group 1 overflows");
 	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow2,
 	    "# drops due to buffer-group 2 overflows");
 	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow3,
 	    "# drops due to buffer-group 3 overflows");
 	SYSCTL_ADD_T4_PORTSTAT(rx_trunc0,
 	    "# of buffer-group 0 truncated packets");
 	SYSCTL_ADD_T4_PORTSTAT(rx_trunc1,
 	    "# of buffer-group 1 truncated packets");
 	SYSCTL_ADD_T4_PORTSTAT(rx_trunc2,
 	    "# of buffer-group 2 truncated packets");
 	SYSCTL_ADD_T4_PORTSTAT(rx_trunc3,
 	    "# of buffer-group 3 truncated packets");
 
 #undef SYSCTL_ADD_T4_PORTSTAT
 
 	SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "tx_tls_records",
 	    CTLFLAG_RD, &pi->tx_tls_records,
 	    "# of TLS records transmitted");
 	SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "tx_tls_octets",
 	    CTLFLAG_RD, &pi->tx_tls_octets,
 	    "# of payload octets in transmitted TLS records");
 	SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "rx_tls_records",
 	    CTLFLAG_RD, &pi->rx_tls_records,
 	    "# of TLS records received");
 	SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "rx_tls_octets",
 	    CTLFLAG_RD, &pi->rx_tls_octets,
 	    "# of payload octets in received TLS records");
 }
 
 static int
 sysctl_int_array(SYSCTL_HANDLER_ARGS)
 {
 	int rc, *i, space = 0;
 	struct sbuf sb;
 
 	sbuf_new_for_sysctl(&sb, NULL, 64, req);
 	for (i = arg1; arg2; arg2 -= sizeof(int), i++) {
 		if (space)
 			sbuf_printf(&sb, " ");
 		sbuf_printf(&sb, "%d", *i);
 		space = 1;
 	}
 	rc = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (rc);
 }
 
 static int
 sysctl_bitfield_8b(SYSCTL_HANDLER_ARGS)
 {
 	int rc;
 	struct sbuf *sb;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return(rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb, "%b", *(uint8_t *)(uintptr_t)arg2, (char *)arg1);
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_bitfield_16b(SYSCTL_HANDLER_ARGS)
 {
 	int rc;
 	struct sbuf *sb;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return(rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb, "%b", *(uint16_t *)(uintptr_t)arg2, (char *)arg1);
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_btphy(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	int op = arg2;
 	struct adapter *sc = pi->adapter;
 	u_int v;
 	int rc;
 
 	rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4btt");
 	if (rc)
 		return (rc);
 	/* XXX: magic numbers */
 	rc = -t4_mdio_rd(sc, sc->mbox, pi->mdio_addr, 0x1e, op ? 0x20 : 0xc820,
 	    &v);
 	end_synchronized_op(sc, 0);
 	if (rc)
 		return (rc);
 	if (op == 0)
 		v /= 256;
 
 	rc = sysctl_handle_int(oidp, &v, 0, req);
 	return (rc);
 }
 
 static int
 sysctl_noflowq(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	int rc, val;
 
 	val = vi->rsrv_noflowq;
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if ((val >= 1) && (vi->ntxq > 1))
 		vi->rsrv_noflowq = 1;
 	else
 		vi->rsrv_noflowq = 0;
 
 	return (rc);
 }
 
 static int
 sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->pi->adapter;
 	int idx, rc, i;
 	struct sge_rxq *rxq;
 	uint8_t v;
 
 	idx = vi->tmr_idx;
 
 	rc = sysctl_handle_int(oidp, &idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (idx < 0 || idx >= SGE_NTIMERS)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4tmr");
 	if (rc)
 		return (rc);
 
 	v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(vi->pktc_idx != -1);
 	for_each_rxq(vi, i, rxq) {
 #ifdef atomic_store_rel_8
 		atomic_store_rel_8(&rxq->iq.intr_params, v);
 #else
 		rxq->iq.intr_params = v;
 #endif
 	}
 	vi->tmr_idx = idx;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (0);
 }
 
 static int
 sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->pi->adapter;
 	int idx, rc;
 
 	idx = vi->pktc_idx;
 
 	rc = sysctl_handle_int(oidp, &idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (idx < -1 || idx >= SGE_NCOUNTERS)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4pktc");
 	if (rc)
 		return (rc);
 
 	if (vi->flags & VI_INIT_DONE)
 		rc = EBUSY; /* cannot be changed once the queues are created */
 	else
 		vi->pktc_idx = idx;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (rc);
 }
 
 static int
 sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->pi->adapter;
 	int qsize, rc;
 
 	qsize = vi->qsize_rxq;
 
 	rc = sysctl_handle_int(oidp, &qsize, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (qsize < 128 || (qsize & 7))
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4rxqs");
 	if (rc)
 		return (rc);
 
 	if (vi->flags & VI_INIT_DONE)
 		rc = EBUSY; /* cannot be changed once the queues are created */
 	else
 		vi->qsize_rxq = qsize;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (rc);
 }
 
 static int
 sysctl_qsize_txq(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->pi->adapter;
 	int qsize, rc;
 
 	qsize = vi->qsize_txq;
 
 	rc = sysctl_handle_int(oidp, &qsize, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (qsize < 128 || qsize > 65536)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4txqs");
 	if (rc)
 		return (rc);
 
 	if (vi->flags & VI_INIT_DONE)
 		rc = EBUSY; /* cannot be changed once the queues are created */
 	else
 		vi->qsize_txq = qsize;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (rc);
 }
 
 static int
 sysctl_pause_settings(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 	int rc;
 
 	if (req->newptr == NULL) {
 		struct sbuf *sb;
 		static char *bits = "\20\1RX\2TX\3AUTO";
 
 		rc = sysctl_wire_old_buffer(req, 0);
 		if (rc != 0)
 			return(rc);
 
 		sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 		if (sb == NULL)
 			return (ENOMEM);
 
 		if (lc->link_ok) {
 			sbuf_printf(sb, "%b", (lc->fc & (PAUSE_TX | PAUSE_RX)) |
 			    (lc->requested_fc & PAUSE_AUTONEG), bits);
 		} else {
 			sbuf_printf(sb, "%b", lc->requested_fc & (PAUSE_TX |
 			    PAUSE_RX | PAUSE_AUTONEG), bits);
 		}
 		rc = sbuf_finish(sb);
 		sbuf_delete(sb);
 	} else {
 		char s[2];
 		int n;
 
 		s[0] = '0' + (lc->requested_fc & (PAUSE_TX | PAUSE_RX |
 		    PAUSE_AUTONEG));
 		s[1] = 0;
 
 		rc = sysctl_handle_string(oidp, s, sizeof(s), req);
 		if (rc != 0)
 			return(rc);
 
 		if (s[1] != 0)
 			return (EINVAL);
 		if (s[0] < '0' || s[0] > '9')
 			return (EINVAL);	/* not a number */
 		n = s[0] - '0';
 		if (n & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG))
 			return (EINVAL);	/* some other bit is set too */
 
 		rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK,
 		    "t4PAUSE");
 		if (rc)
 			return (rc);
 		PORT_LOCK(pi);
 		lc->requested_fc = n;
 		fixup_link_config(pi);
 		if (pi->up_vis > 0)
 			rc = apply_link_config(pi);
 		set_current_media(pi);
 		PORT_UNLOCK(pi);
 		end_synchronized_op(sc, 0);
 	}
 
 	return (rc);
 }
 
 static int
 sysctl_fec(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 	int rc;
 	int8_t old;
 
 	if (req->newptr == NULL) {
 		struct sbuf *sb;
 		static char *bits = "\20\1RS\2BASE-R\3RSVD1\4RSVD2\5RSVD3\6AUTO";
 
 		rc = sysctl_wire_old_buffer(req, 0);
 		if (rc != 0)
 			return(rc);
 
 		sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 		if (sb == NULL)
 			return (ENOMEM);
 
 		/*
 		 * Display the requested_fec when the link is down -- the actual
 		 * FEC makes sense only when the link is up.
 		 */
 		if (lc->link_ok) {
 			sbuf_printf(sb, "%b", (lc->fec & M_FW_PORT_CAP32_FEC) |
 			    (lc->requested_fec & FEC_AUTO), bits);
 		} else {
 			sbuf_printf(sb, "%b", lc->requested_fec, bits);
 		}
 		rc = sbuf_finish(sb);
 		sbuf_delete(sb);
 	} else {
 		char s[3];
 		int n;
 
 		snprintf(s, sizeof(s), "%d",
 		    lc->requested_fec == FEC_AUTO ? -1 :
 		    lc->requested_fec & M_FW_PORT_CAP32_FEC);
 
 		rc = sysctl_handle_string(oidp, s, sizeof(s), req);
 		if (rc != 0)
 			return(rc);
 
 		n = strtol(&s[0], NULL, 0);
 		if (n < 0 || n & FEC_AUTO)
 			n = FEC_AUTO;
 		else {
 			if (n & ~M_FW_PORT_CAP32_FEC)
 				return (EINVAL);/* some other bit is set too */
 			if (!powerof2(n))
 				return (EINVAL);/* one bit can be set at most */
 		}
 
 		rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK,
 		    "t4fec");
 		if (rc)
 			return (rc);
 		PORT_LOCK(pi);
 		old = lc->requested_fec;
 		if (n == FEC_AUTO)
 			lc->requested_fec = FEC_AUTO;
 		else if (n == 0)
 			lc->requested_fec = FEC_NONE;
 		else {
 			if ((lc->supported | V_FW_PORT_CAP32_FEC(n)) !=
 			    lc->supported) {
 				rc = ENOTSUP;
 				goto done;
 			}
 			lc->requested_fec = n;
 		}
 		fixup_link_config(pi);
 		if (pi->up_vis > 0) {
 			rc = apply_link_config(pi);
 			if (rc != 0) {
 				lc->requested_fec = old;
 				if (rc == FW_EPROTO)
 					rc = ENOTSUP;
 			}
 		}
 done:
 		PORT_UNLOCK(pi);
 		end_synchronized_op(sc, 0);
 	}
 
 	return (rc);
 }
 
 static int
 sysctl_autoneg(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 	int rc, val;
 
 	if (lc->supported & FW_PORT_CAP32_ANEG)
 		val = lc->requested_aneg == AUTONEG_DISABLE ? 0 : 1;
 	else
 		val = -1;
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 	if (val == 0)
 		val = AUTONEG_DISABLE;
 	else if (val == 1)
 		val = AUTONEG_ENABLE;
 	else
 		val = AUTONEG_AUTO;
 
 	rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK,
 	    "t4aneg");
 	if (rc)
 		return (rc);
 	PORT_LOCK(pi);
 	if (val == AUTONEG_ENABLE && !(lc->supported & FW_PORT_CAP32_ANEG)) {
 		rc = ENOTSUP;
 		goto done;
 	}
 	lc->requested_aneg = val;
 	fixup_link_config(pi);
 	if (pi->up_vis > 0)
 		rc = apply_link_config(pi);
 	set_current_media(pi);
 done:
 	PORT_UNLOCK(pi);
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int reg = arg2;
 	uint64_t val;
 
 	val = t4_read_reg64(sc, reg);
 
 	return (sysctl_handle_64(oidp, &val, 0, req));
 }
 
 static int
 sysctl_temperature(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int rc, t;
 	uint32_t param, val;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4temp");
 	if (rc)
 		return (rc);
 	param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) |
 	    V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_TMP);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 	end_synchronized_op(sc, 0);
 	if (rc)
 		return (rc);
 
 	/* unknown is returned as 0 but we display -1 in that case */
 	t = val == 0 ? -1 : val;
 
 	rc = sysctl_handle_int(oidp, &t, 0, req);
 	return (rc);
 }
 
 static int
 sysctl_loadavg(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	uint32_t param, val;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4lavg");
 	if (rc)
 		return (rc);
 	param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_LOAD);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 	end_synchronized_op(sc, 0);
 	if (rc)
 		return (rc);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	if (val == 0xffffffff) {
 		/* Only debug and custom firmwares report load averages. */
 		sbuf_printf(sb, "not available");
 	} else {
 		sbuf_printf(sb, "%d %d %d", val & 0xff, (val >> 8) & 0xff,
 		    (val >> 16) & 0xff);
 	}
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_cctrl(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 	uint16_t incr[NMTUS][NCCTRL_WIN];
 	static const char *dec_fac[] = {
 		"0.5", "0.5625", "0.625", "0.6875", "0.75", "0.8125", "0.875",
 		"0.9375"
 	};
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	t4_read_cong_tbl(sc, incr);
 
 	for (i = 0; i < NCCTRL_WIN; ++i) {
 		sbuf_printf(sb, "%2d: %4u %4u %4u %4u %4u %4u %4u %4u\n", i,
 		    incr[0][i], incr[1][i], incr[2][i], incr[3][i], incr[4][i],
 		    incr[5][i], incr[6][i], incr[7][i]);
 		sbuf_printf(sb, "%8u %4u %4u %4u %4u %4u %4u %4u %5u %s\n",
 		    incr[8][i], incr[9][i], incr[10][i], incr[11][i],
 		    incr[12][i], incr[13][i], incr[14][i], incr[15][i],
 		    sc->params.a_wnd[i], dec_fac[sc->params.b_wnd[i]]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static const char *qname[CIM_NUM_IBQ + CIM_NUM_OBQ_T5] = {
 	"TP0", "TP1", "ULP", "SGE0", "SGE1", "NC-SI",	/* ibq's */
 	"ULP0", "ULP1", "ULP2", "ULP3", "SGE", "NC-SI",	/* obq's */
 	"SGE0-RX", "SGE1-RX"	/* additional obq's (T5 onwards) */
 };
 
 static int
 sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i, n, qid = arg2;
 	uint32_t *buf, *p;
 	char *qtype;
 	u_int cim_num_obq = sc->chip_params->cim_num_obq;
 
 	KASSERT(qid >= 0 && qid < CIM_NUM_IBQ + cim_num_obq,
 	    ("%s: bad qid %d\n", __func__, qid));
 
 	if (qid < CIM_NUM_IBQ) {
 		/* inbound queue */
 		qtype = "IBQ";
 		n = 4 * CIM_IBQ_SIZE;
 		buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK);
 		rc = t4_read_cim_ibq(sc, qid, buf, n);
 	} else {
 		/* outbound queue */
 		qtype = "OBQ";
 		qid -= CIM_NUM_IBQ;
 		n = 4 * cim_num_obq * CIM_OBQ_SIZE;
 		buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK);
 		rc = t4_read_cim_obq(sc, qid, buf, n);
 	}
 
 	if (rc < 0) {
 		rc = -rc;
 		goto done;
 	}
 	n = rc * sizeof(uint32_t);	/* rc has # of words actually read */
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		goto done;
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req);
 	if (sb == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 
 	sbuf_printf(sb, "%s%d %s", qtype , qid, qname[arg2]);
 	for (i = 0, p = buf; i < n; i += 16, p += 4)
 		sbuf_printf(sb, "\n%#06x: %08x %08x %08x %08x", i, p[0], p[1],
 		    p[2], p[3]);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 done:
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static void
 sbuf_cim_la4(struct adapter *sc, struct sbuf *sb, uint32_t *buf, uint32_t cfg)
 {
 	uint32_t *p;
 
 	sbuf_printf(sb, "Status   Data      PC%s",
 	    cfg & F_UPDBGLACAPTPCONLY ? "" :
 	    "     LS0Stat  LS0Addr             LS0Data");
 
 	for (p = buf; p <= &buf[sc->params.cim_la_size - 8]; p += 8) {
 		if (cfg & F_UPDBGLACAPTPCONLY) {
 			sbuf_printf(sb, "\n  %02x   %08x %08x", p[5] & 0xff,
 			    p[6], p[7]);
 			sbuf_printf(sb, "\n  %02x   %02x%06x %02x%06x",
 			    (p[3] >> 8) & 0xff, p[3] & 0xff, p[4] >> 8,
 			    p[4] & 0xff, p[5] >> 8);
 			sbuf_printf(sb, "\n  %02x   %x%07x %x%07x",
 			    (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4,
 			    p[1] & 0xf, p[2] >> 4);
 		} else {
 			sbuf_printf(sb,
 			    "\n  %02x   %x%07x %x%07x %08x %08x "
 			    "%08x%08x%08x%08x",
 			    (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4,
 			    p[1] & 0xf, p[2] >> 4, p[2] & 0xf, p[3], p[4], p[5],
 			    p[6], p[7]);
 		}
 	}
 }
 
 static void
 sbuf_cim_la6(struct adapter *sc, struct sbuf *sb, uint32_t *buf, uint32_t cfg)
 {
 	uint32_t *p;
 
 	sbuf_printf(sb, "Status   Inst    Data      PC%s",
 	    cfg & F_UPDBGLACAPTPCONLY ? "" :
 	    "     LS0Stat  LS0Addr  LS0Data  LS1Stat  LS1Addr  LS1Data");
 
 	for (p = buf; p <= &buf[sc->params.cim_la_size - 10]; p += 10) {
 		if (cfg & F_UPDBGLACAPTPCONLY) {
 			sbuf_printf(sb, "\n  %02x   %08x %08x %08x",
 			    p[3] & 0xff, p[2], p[1], p[0]);
 			sbuf_printf(sb, "\n  %02x   %02x%06x %02x%06x %02x%06x",
 			    (p[6] >> 8) & 0xff, p[6] & 0xff, p[5] >> 8,
 			    p[5] & 0xff, p[4] >> 8, p[4] & 0xff, p[3] >> 8);
 			sbuf_printf(sb, "\n  %02x   %04x%04x %04x%04x %04x%04x",
 			    (p[9] >> 16) & 0xff, p[9] & 0xffff, p[8] >> 16,
 			    p[8] & 0xffff, p[7] >> 16, p[7] & 0xffff,
 			    p[6] >> 16);
 		} else {
 			sbuf_printf(sb, "\n  %02x   %04x%04x %04x%04x %04x%04x "
 			    "%08x %08x %08x %08x %08x %08x",
 			    (p[9] >> 16) & 0xff,
 			    p[9] & 0xffff, p[8] >> 16,
 			    p[8] & 0xffff, p[7] >> 16,
 			    p[7] & 0xffff, p[6] >> 16,
 			    p[2], p[1], p[0], p[5], p[4], p[3]);
 		}
 	}
 }
 
 static int
 sbuf_cim_la(struct adapter *sc, struct sbuf *sb, int flags)
 {
 	uint32_t cfg, *buf;
 	int rc;
 
 	rc = -t4_cim_read(sc, A_UP_UP_DBG_LA_CFG, 1, &cfg);
 	if (rc != 0)
 		return (rc);
 
 	MPASS(flags == M_WAITOK || flags == M_NOWAIT);
 	buf = malloc(sc->params.cim_la_size * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | flags);
 	if (buf == NULL)
 		return (ENOMEM);
 
 	rc = -t4_cim_read_la(sc, buf, NULL);
 	if (rc != 0)
 		goto done;
 	if (chip_id(sc) < CHELSIO_T6)
 		sbuf_cim_la4(sc, sb, buf, cfg);
 	else
 		sbuf_cim_la6(sc, sb, buf, cfg);
 
 done:
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 sysctl_cim_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	rc = sbuf_cim_la(sc, sb, M_WAITOK);
 	if (rc == 0)
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	return (rc);
 }
 
 bool
 t4_os_dump_cimla(struct adapter *sc, int arg, bool verbose)
 {
 	struct sbuf sb;
 	int rc;
 
 	if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb)
 		return (false);
 	rc = sbuf_cim_la(sc, &sb, M_NOWAIT);
 	if (rc == 0) {
 		rc = sbuf_finish(&sb);
 		if (rc == 0) {
 			log(LOG_DEBUG, "%s: CIM LA dump follows.\n%s",
 		    		device_get_nameunit(sc->dev), sbuf_data(&sb));
 		}
 	}
 	sbuf_delete(&sb);
 	return (false);
 }
 
 static int
 sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	u_int i;
 	struct sbuf *sb;
 	uint32_t *buf, *p;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(2 * CIM_MALA_SIZE * 5 * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	t4_cim_read_ma_la(sc, buf, buf + 5 * CIM_MALA_SIZE);
 	p = buf;
 
 	for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) {
 		sbuf_printf(sb, "\n%02x%08x%08x%08x%08x", p[4], p[3], p[2],
 		    p[1], p[0]);
 	}
 
 	sbuf_printf(sb, "\n\nCnt ID Tag UE       Data       RDY VLD");
 	for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) {
 		sbuf_printf(sb, "\n%3u %2u  %x   %u %08x%08x  %u   %u",
 		    (p[2] >> 10) & 0xff, (p[2] >> 7) & 7,
 		    (p[2] >> 3) & 0xf, (p[2] >> 2) & 1,
 		    (p[1] >> 2) | ((p[2] & 3) << 30),
 		    (p[0] >> 2) | ((p[1] & 3) << 30), (p[0] >> 1) & 1,
 		    p[0] & 1);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	u_int i;
 	struct sbuf *sb;
 	uint32_t *buf, *p;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(2 * CIM_PIFLA_SIZE * 6 * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	t4_cim_read_pif_la(sc, buf, buf + 6 * CIM_PIFLA_SIZE, NULL, NULL);
 	p = buf;
 
 	sbuf_printf(sb, "Cntl ID DataBE   Addr                 Data");
 	for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) {
 		sbuf_printf(sb, "\n %02x  %02x  %04x  %08x %08x%08x%08x%08x",
 		    (p[5] >> 22) & 0xff, (p[5] >> 16) & 0x3f, p[5] & 0xffff,
 		    p[4], p[3], p[2], p[1], p[0]);
 	}
 
 	sbuf_printf(sb, "\n\nCntl ID               Data");
 	for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) {
 		sbuf_printf(sb, "\n %02x  %02x %08x%08x%08x%08x",
 		    (p[4] >> 6) & 0xff, p[4] & 0x3f, p[3], p[2], p[1], p[0]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 	uint16_t base[CIM_NUM_IBQ + CIM_NUM_OBQ_T5];
 	uint16_t size[CIM_NUM_IBQ + CIM_NUM_OBQ_T5];
 	uint16_t thres[CIM_NUM_IBQ];
 	uint32_t obq_wr[2 * CIM_NUM_OBQ_T5], *wr = obq_wr;
 	uint32_t stat[4 * (CIM_NUM_IBQ + CIM_NUM_OBQ_T5)], *p = stat;
 	u_int cim_num_obq, ibq_rdaddr, obq_rdaddr, nq;
 
 	cim_num_obq = sc->chip_params->cim_num_obq;
 	if (is_t4(sc)) {
 		ibq_rdaddr = A_UP_IBQ_0_RDADDR;
 		obq_rdaddr = A_UP_OBQ_0_REALADDR;
 	} else {
 		ibq_rdaddr = A_UP_IBQ_0_SHADOW_RDADDR;
 		obq_rdaddr = A_UP_OBQ_0_SHADOW_REALADDR;
 	}
 	nq = CIM_NUM_IBQ + cim_num_obq;
 
 	rc = -t4_cim_read(sc, ibq_rdaddr, 4 * nq, stat);
 	if (rc == 0)
 		rc = -t4_cim_read(sc, obq_rdaddr, 2 * cim_num_obq, obq_wr);
 	if (rc != 0)
 		return (rc);
 
 	t4_read_cimq_cfg(sc, base, size, thres);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb,
 	    "  Queue  Base  Size Thres  RdPtr WrPtr  SOP  EOP Avail");
 
 	for (i = 0; i < CIM_NUM_IBQ; i++, p += 4)
 		sbuf_printf(sb, "\n%7s %5x %5u %5u %6x  %4x %4u %4u %5u",
 		    qname[i], base[i], size[i], thres[i], G_IBQRDADDR(p[0]),
 		    G_IBQWRADDR(p[1]), G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]),
 		    G_QUEREMFLITS(p[2]) * 16);
 	for ( ; i < nq; i++, p += 4, wr += 2)
 		sbuf_printf(sb, "\n%7s %5x %5u %12x  %4x %4u %4u %5u", qname[i],
 		    base[i], size[i], G_QUERDADDR(p[0]) & 0x3fff,
 		    wr[0] - base[i], G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]),
 		    G_QUEREMFLITS(p[2]) * 16);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_cpl_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_cpl_stats stats;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	mtx_lock(&sc->reg_lock);
 	t4_tp_get_cpl_stats(sc, &stats, 0);
 	mtx_unlock(&sc->reg_lock);
 
 	if (sc->chip_params->nchan > 2) {
 		sbuf_printf(sb, "                 channel 0  channel 1"
 		    "  channel 2  channel 3");
 		sbuf_printf(sb, "\nCPL requests:   %10u %10u %10u %10u",
 		    stats.req[0], stats.req[1], stats.req[2], stats.req[3]);
 		sbuf_printf(sb, "\nCPL responses:   %10u %10u %10u %10u",
 		    stats.rsp[0], stats.rsp[1], stats.rsp[2], stats.rsp[3]);
 	} else {
 		sbuf_printf(sb, "                 channel 0  channel 1");
 		sbuf_printf(sb, "\nCPL requests:   %10u %10u",
 		    stats.req[0], stats.req[1]);
 		sbuf_printf(sb, "\nCPL responses:   %10u %10u",
 		    stats.rsp[0], stats.rsp[1]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_ddp_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_usm_stats stats;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return(rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	t4_get_usm_stats(sc, &stats, 1);
 
 	sbuf_printf(sb, "Frames: %u\n", stats.frames);
 	sbuf_printf(sb, "Octets: %ju\n", stats.octets);
 	sbuf_printf(sb, "Drops:  %u", stats.drops);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static const char * const devlog_level_strings[] = {
 	[FW_DEVLOG_LEVEL_EMERG]		= "EMERG",
 	[FW_DEVLOG_LEVEL_CRIT]		= "CRIT",
 	[FW_DEVLOG_LEVEL_ERR]		= "ERR",
 	[FW_DEVLOG_LEVEL_NOTICE]	= "NOTICE",
 	[FW_DEVLOG_LEVEL_INFO]		= "INFO",
 	[FW_DEVLOG_LEVEL_DEBUG]		= "DEBUG"
 };
 
 static const char * const devlog_facility_strings[] = {
 	[FW_DEVLOG_FACILITY_CORE]	= "CORE",
 	[FW_DEVLOG_FACILITY_CF]		= "CF",
 	[FW_DEVLOG_FACILITY_SCHED]	= "SCHED",
 	[FW_DEVLOG_FACILITY_TIMER]	= "TIMER",
 	[FW_DEVLOG_FACILITY_RES]	= "RES",
 	[FW_DEVLOG_FACILITY_HW]		= "HW",
 	[FW_DEVLOG_FACILITY_FLR]	= "FLR",
 	[FW_DEVLOG_FACILITY_DMAQ]	= "DMAQ",
 	[FW_DEVLOG_FACILITY_PHY]	= "PHY",
 	[FW_DEVLOG_FACILITY_MAC]	= "MAC",
 	[FW_DEVLOG_FACILITY_PORT]	= "PORT",
 	[FW_DEVLOG_FACILITY_VI]		= "VI",
 	[FW_DEVLOG_FACILITY_FILTER]	= "FILTER",
 	[FW_DEVLOG_FACILITY_ACL]	= "ACL",
 	[FW_DEVLOG_FACILITY_TM]		= "TM",
 	[FW_DEVLOG_FACILITY_QFC]	= "QFC",
 	[FW_DEVLOG_FACILITY_DCB]	= "DCB",
 	[FW_DEVLOG_FACILITY_ETH]	= "ETH",
 	[FW_DEVLOG_FACILITY_OFLD]	= "OFLD",
 	[FW_DEVLOG_FACILITY_RI]		= "RI",
 	[FW_DEVLOG_FACILITY_ISCSI]	= "ISCSI",
 	[FW_DEVLOG_FACILITY_FCOE]	= "FCOE",
 	[FW_DEVLOG_FACILITY_FOISCSI]	= "FOISCSI",
 	[FW_DEVLOG_FACILITY_FOFCOE]	= "FOFCOE",
 	[FW_DEVLOG_FACILITY_CHNET]	= "CHNET",
 };
 
 static int
 sbuf_devlog(struct adapter *sc, struct sbuf *sb, int flags)
 {
 	int i, j, rc, nentries, first = 0;
 	struct devlog_params *dparams = &sc->params.devlog;
 	struct fw_devlog_e *buf, *e;
 	uint64_t ftstamp = UINT64_MAX;
 
 	if (dparams->addr == 0)
 		return (ENXIO);
 
 	MPASS(flags == M_WAITOK || flags == M_NOWAIT);
 	buf = malloc(dparams->size, M_CXGBE, M_ZERO | flags);
 	if (buf == NULL)
 		return (ENOMEM);
 
 	rc = read_via_memwin(sc, 1, dparams->addr, (void *)buf, dparams->size);
 	if (rc != 0)
 		goto done;
 
 	nentries = dparams->size / sizeof(struct fw_devlog_e);
 	for (i = 0; i < nentries; i++) {
 		e = &buf[i];
 
 		if (e->timestamp == 0)
 			break;	/* end */
 
 		e->timestamp = be64toh(e->timestamp);
 		e->seqno = be32toh(e->seqno);
 		for (j = 0; j < 8; j++)
 			e->params[j] = be32toh(e->params[j]);
 
 		if (e->timestamp < ftstamp) {
 			ftstamp = e->timestamp;
 			first = i;
 		}
 	}
 
 	if (buf[first].timestamp == 0)
 		goto done;	/* nothing in the log */
 
 	sbuf_printf(sb, "%10s  %15s  %8s  %8s  %s\n",
 	    "Seq#", "Tstamp", "Level", "Facility", "Message");
 
 	i = first;
 	do {
 		e = &buf[i];
 		if (e->timestamp == 0)
 			break;	/* end */
 
 		sbuf_printf(sb, "%10d  %15ju  %8s  %8s  ",
 		    e->seqno, e->timestamp,
 		    (e->level < nitems(devlog_level_strings) ?
 			devlog_level_strings[e->level] : "UNKNOWN"),
 		    (e->facility < nitems(devlog_facility_strings) ?
 			devlog_facility_strings[e->facility] : "UNKNOWN"));
 		sbuf_printf(sb, e->fmt, e->params[0], e->params[1],
 		    e->params[2], e->params[3], e->params[4],
 		    e->params[5], e->params[6], e->params[7]);
 
 		if (++i == nentries)
 			i = 0;
 	} while (i != first);
 done:
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 sysctl_devlog(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int rc;
 	struct sbuf *sb;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	rc = sbuf_devlog(sc, sb, M_WAITOK);
 	if (rc == 0)
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	return (rc);
 }
 
 void
 t4_os_dump_devlog(struct adapter *sc)
 {
 	int rc;
 	struct sbuf sb;
 
 	if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb)
 		return;
 	rc = sbuf_devlog(sc, &sb, M_NOWAIT);
 	if (rc == 0) {
 		rc = sbuf_finish(&sb);
 		if (rc == 0) {
 			log(LOG_DEBUG, "%s: device log follows.\n%s",
 		    		device_get_nameunit(sc->dev), sbuf_data(&sb));
 		}
 	}
 	sbuf_delete(&sb);
 }
 
 static int
 sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_fcoe_stats stats[MAX_NCHAN];
 	int i, nchan = sc->chip_params->nchan;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	for (i = 0; i < nchan; i++)
 		t4_get_fcoe_stats(sc, i, &stats[i], 1);
 
 	if (nchan > 2) {
 		sbuf_printf(sb, "                   channel 0        channel 1"
 		    "        channel 2        channel 3");
 		sbuf_printf(sb, "\noctetsDDP:  %16ju %16ju %16ju %16ju",
 		    stats[0].octets_ddp, stats[1].octets_ddp,
 		    stats[2].octets_ddp, stats[3].octets_ddp);
 		sbuf_printf(sb, "\nframesDDP:  %16u %16u %16u %16u",
 		    stats[0].frames_ddp, stats[1].frames_ddp,
 		    stats[2].frames_ddp, stats[3].frames_ddp);
 		sbuf_printf(sb, "\nframesDrop: %16u %16u %16u %16u",
 		    stats[0].frames_drop, stats[1].frames_drop,
 		    stats[2].frames_drop, stats[3].frames_drop);
 	} else {
 		sbuf_printf(sb, "                   channel 0        channel 1");
 		sbuf_printf(sb, "\noctetsDDP:  %16ju %16ju",
 		    stats[0].octets_ddp, stats[1].octets_ddp);
 		sbuf_printf(sb, "\nframesDDP:  %16u %16u",
 		    stats[0].frames_ddp, stats[1].frames_ddp);
 		sbuf_printf(sb, "\nframesDrop: %16u %16u",
 		    stats[0].frames_drop, stats[1].frames_drop);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_hw_sched(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 	unsigned int map, kbps, ipg, mode;
 	unsigned int pace_tab[NTX_SCHED];
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	map = t4_read_reg(sc, A_TP_TX_MOD_QUEUE_REQ_MAP);
 	mode = G_TIMERMODE(t4_read_reg(sc, A_TP_MOD_CONFIG));
 	t4_read_pace_tbl(sc, pace_tab);
 
 	sbuf_printf(sb, "Scheduler  Mode   Channel  Rate (Kbps)   "
 	    "Class IPG (0.1 ns)   Flow IPG (us)");
 
 	for (i = 0; i < NTX_SCHED; ++i, map >>= 2) {
 		t4_get_tx_sched(sc, i, &kbps, &ipg, 1);
 		sbuf_printf(sb, "\n    %u      %-5s     %u     ", i,
 		    (mode & (1 << i)) ? "flow" : "class", map & 3);
 		if (kbps)
 			sbuf_printf(sb, "%9u     ", kbps);
 		else
 			sbuf_printf(sb, " disabled     ");
 
 		if (ipg)
 			sbuf_printf(sb, "%13u        ", ipg);
 		else
 			sbuf_printf(sb, "     disabled        ");
 
 		if (pace_tab[i])
 			sbuf_printf(sb, "%10u", pace_tab[i]);
 		else
 			sbuf_printf(sb, "  disabled");
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_lb_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i, j;
 	uint64_t *p0, *p1;
 	struct lb_port_stats s[2];
 	static const char *stat_name[] = {
 		"OctetsOK:", "FramesOK:", "BcastFrames:", "McastFrames:",
 		"UcastFrames:", "ErrorFrames:", "Frames64:", "Frames65To127:",
 		"Frames128To255:", "Frames256To511:", "Frames512To1023:",
 		"Frames1024To1518:", "Frames1519ToMax:", "FramesDropped:",
 		"BG0FramesDropped:", "BG1FramesDropped:", "BG2FramesDropped:",
 		"BG3FramesDropped:", "BG0FramesTrunc:", "BG1FramesTrunc:",
 		"BG2FramesTrunc:", "BG3FramesTrunc:"
 	};
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	memset(s, 0, sizeof(s));
 
 	for (i = 0; i < sc->chip_params->nchan; i += 2) {
 		t4_get_lb_stats(sc, i, &s[0]);
 		t4_get_lb_stats(sc, i + 1, &s[1]);
 
 		p0 = &s[0].octets;
 		p1 = &s[1].octets;
 		sbuf_printf(sb, "%s                       Loopback %u"
 		    "           Loopback %u", i == 0 ? "" : "\n", i, i + 1);
 
 		for (j = 0; j < nitems(stat_name); j++)
 			sbuf_printf(sb, "\n%-17s %20ju %20ju", stat_name[j],
 				   *p0++, *p1++);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_linkdnrc(SYSCTL_HANDLER_ARGS)
 {
 	int rc = 0;
 	struct port_info *pi = arg1;
 	struct link_config *lc = &pi->link_cfg;
 	struct sbuf *sb;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return(rc);
 	sb = sbuf_new_for_sysctl(NULL, NULL, 64, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	if (lc->link_ok || lc->link_down_rc == 255)
 		sbuf_printf(sb, "n/a");
 	else
 		sbuf_printf(sb, "%s", t4_link_down_rc_str(lc->link_down_rc));
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 struct mem_desc {
 	unsigned int base;
 	unsigned int limit;
 	unsigned int idx;
 };
 
 static int
 mem_desc_cmp(const void *a, const void *b)
 {
 	return ((const struct mem_desc *)a)->base -
 	       ((const struct mem_desc *)b)->base;
 }
 
 static void
 mem_region_show(struct sbuf *sb, const char *name, unsigned int from,
     unsigned int to)
 {
 	unsigned int size;
 
 	if (from == to)
 		return;
 
 	size = to - from + 1;
 	if (size == 0)
 		return;
 
 	/* XXX: need humanize_number(3) in libkern for a more readable 'size' */
 	sbuf_printf(sb, "%-15s %#x-%#x [%u]\n", name, from, to, size);
 }
 
 static int
 sysctl_meminfo(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i, n;
 	uint32_t lo, hi, used, alloc;
 	static const char *memory[] = {"EDC0:", "EDC1:", "MC:", "MC0:", "MC1:"};
 	static const char *region[] = {
 		"DBQ contexts:", "IMSG contexts:", "FLM cache:", "TCBs:",
 		"Pstructs:", "Timers:", "Rx FL:", "Tx FL:", "Pstruct FL:",
 		"Tx payload:", "Rx payload:", "LE hash:", "iSCSI region:",
 		"TDDP region:", "TPT region:", "STAG region:", "RQ region:",
 		"RQUDP region:", "PBL region:", "TXPBL region:",
 		"DBVFIFO region:", "ULPRX state:", "ULPTX state:",
 		"On-chip queues:", "TLS keys:",
 	};
 	struct mem_desc avail[4];
 	struct mem_desc mem[nitems(region) + 3];	/* up to 3 holes */
 	struct mem_desc *md = mem;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	for (i = 0; i < nitems(mem); i++) {
 		mem[i].limit = 0;
 		mem[i].idx = i;
 	}
 
 	/* Find and sort the populated memory ranges */
 	i = 0;
 	lo = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
 	if (lo & F_EDRAM0_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EDRAM0_BAR);
 		avail[i].base = G_EDRAM0_BASE(hi) << 20;
 		avail[i].limit = avail[i].base + (G_EDRAM0_SIZE(hi) << 20);
 		avail[i].idx = 0;
 		i++;
 	}
 	if (lo & F_EDRAM1_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EDRAM1_BAR);
 		avail[i].base = G_EDRAM1_BASE(hi) << 20;
 		avail[i].limit = avail[i].base + (G_EDRAM1_SIZE(hi) << 20);
 		avail[i].idx = 1;
 		i++;
 	}
 	if (lo & F_EXT_MEM_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR);
 		avail[i].base = G_EXT_MEM_BASE(hi) << 20;
 		avail[i].limit = avail[i].base +
 		    (G_EXT_MEM_SIZE(hi) << 20);
 		avail[i].idx = is_t5(sc) ? 3 : 2;	/* Call it MC0 for T5 */
 		i++;
 	}
 	if (is_t5(sc) && lo & F_EXT_MEM1_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR);
 		avail[i].base = G_EXT_MEM1_BASE(hi) << 20;
 		avail[i].limit = avail[i].base +
 		    (G_EXT_MEM1_SIZE(hi) << 20);
 		avail[i].idx = 4;
 		i++;
 	}
 	if (!i)                                    /* no memory available */
 		return 0;
 	qsort(avail, i, sizeof(struct mem_desc), mem_desc_cmp);
 
 	(md++)->base = t4_read_reg(sc, A_SGE_DBQ_CTXT_BADDR);
 	(md++)->base = t4_read_reg(sc, A_SGE_IMSG_CTXT_BADDR);
 	(md++)->base = t4_read_reg(sc, A_SGE_FLM_CACHE_BADDR);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_TCB_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_TIMER_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_RX_FLST_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_TX_FLST_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_PS_FLST_BASE);
 
 	/* the next few have explicit upper bounds */
 	md->base = t4_read_reg(sc, A_TP_PMM_TX_BASE);
 	md->limit = md->base - 1 +
 		    t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE) *
 		    G_PMTXMAXPAGE(t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE));
 	md++;
 
 	md->base = t4_read_reg(sc, A_TP_PMM_RX_BASE);
 	md->limit = md->base - 1 +
 		    t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) *
 		    G_PMRXMAXPAGE(t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE));
 	md++;
 
 	if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) {
 		if (chip_id(sc) <= CHELSIO_T5)
 			md->base = t4_read_reg(sc, A_LE_DB_HASH_TID_BASE);
 		else
 			md->base = t4_read_reg(sc, A_LE_DB_HASH_TBL_BASE_ADDR);
 		md->limit = 0;
 	} else {
 		md->base = 0;
 		md->idx = nitems(region);  /* hide it */
 	}
 	md++;
 
 #define ulp_region(reg) \
 	md->base = t4_read_reg(sc, A_ULP_ ## reg ## _LLIMIT);\
 	(md++)->limit = t4_read_reg(sc, A_ULP_ ## reg ## _ULIMIT)
 
 	ulp_region(RX_ISCSI);
 	ulp_region(RX_TDDP);
 	ulp_region(TX_TPT);
 	ulp_region(RX_STAG);
 	ulp_region(RX_RQ);
 	ulp_region(RX_RQUDP);
 	ulp_region(RX_PBL);
 	ulp_region(TX_PBL);
 #undef ulp_region
 
 	md->base = 0;
 	md->idx = nitems(region);
 	if (!is_t4(sc)) {
 		uint32_t size = 0;
 		uint32_t sge_ctrl = t4_read_reg(sc, A_SGE_CONTROL2);
 		uint32_t fifo_size = t4_read_reg(sc, A_SGE_DBVFIFO_SIZE);
 
 		if (is_t5(sc)) {
 			if (sge_ctrl & F_VFIFO_ENABLE)
 				size = G_DBVFIFO_SIZE(fifo_size);
 		} else
 			size = G_T6_DBVFIFO_SIZE(fifo_size);
 
 		if (size) {
 			md->base = G_BASEADDR(t4_read_reg(sc,
 			    A_SGE_DBVFIFO_BADDR));
 			md->limit = md->base + (size << 2) - 1;
 		}
 	}
 	md++;
 
 	md->base = t4_read_reg(sc, A_ULP_RX_CTX_BASE);
 	md->limit = 0;
 	md++;
 	md->base = t4_read_reg(sc, A_ULP_TX_ERR_TABLE_BASE);
 	md->limit = 0;
 	md++;
 
 	md->base = sc->vres.ocq.start;
 	if (sc->vres.ocq.size)
 		md->limit = md->base + sc->vres.ocq.size - 1;
 	else
 		md->idx = nitems(region);  /* hide it */
 	md++;
 
 	md->base = sc->vres.key.start;
 	if (sc->vres.key.size)
 		md->limit = md->base + sc->vres.key.size - 1;
 	else
 		md->idx = nitems(region);  /* hide it */
 	md++;
 
 	/* add any address-space holes, there can be up to 3 */
 	for (n = 0; n < i - 1; n++)
 		if (avail[n].limit < avail[n + 1].base)
 			(md++)->base = avail[n].limit;
 	if (avail[n].limit)
 		(md++)->base = avail[n].limit;
 
 	n = md - mem;
 	qsort(mem, n, sizeof(struct mem_desc), mem_desc_cmp);
 
 	for (lo = 0; lo < i; lo++)
 		mem_region_show(sb, memory[avail[lo].idx], avail[lo].base,
 				avail[lo].limit - 1);
 
 	sbuf_printf(sb, "\n");
 	for (i = 0; i < n; i++) {
 		if (mem[i].idx >= nitems(region))
 			continue;                        /* skip holes */
 		if (!mem[i].limit)
 			mem[i].limit = i < n - 1 ? mem[i + 1].base - 1 : ~0;
 		mem_region_show(sb, region[mem[i].idx], mem[i].base,
 				mem[i].limit);
 	}
 
 	sbuf_printf(sb, "\n");
 	lo = t4_read_reg(sc, A_CIM_SDRAM_BASE_ADDR);
 	hi = t4_read_reg(sc, A_CIM_SDRAM_ADDR_SIZE) + lo - 1;
 	mem_region_show(sb, "uP RAM:", lo, hi);
 
 	lo = t4_read_reg(sc, A_CIM_EXTMEM2_BASE_ADDR);
 	hi = t4_read_reg(sc, A_CIM_EXTMEM2_ADDR_SIZE) + lo - 1;
 	mem_region_show(sb, "uP Extmem2:", lo, hi);
 
 	lo = t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE);
 	sbuf_printf(sb, "\n%u Rx pages of size %uKiB for %u channels\n",
 		   G_PMRXMAXPAGE(lo),
 		   t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) >> 10,
 		   (lo & F_PMRXNUMCHN) ? 2 : 1);
 
 	lo = t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE);
 	hi = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
 	sbuf_printf(sb, "%u Tx pages of size %u%ciB for %u channels\n",
 		   G_PMTXMAXPAGE(lo),
 		   hi >= (1 << 20) ? (hi >> 20) : (hi >> 10),
 		   hi >= (1 << 20) ? 'M' : 'K', 1 << G_PMTXNUMCHN(lo));
 	sbuf_printf(sb, "%u p-structs\n",
 		   t4_read_reg(sc, A_TP_CMM_MM_MAX_PSTRUCT));
 
 	for (i = 0; i < 4; i++) {
 		if (chip_id(sc) > CHELSIO_T5)
 			lo = t4_read_reg(sc, A_MPS_RX_MAC_BG_PG_CNT0 + i * 4);
 		else
 			lo = t4_read_reg(sc, A_MPS_RX_PG_RSV0 + i * 4);
 		if (is_t5(sc)) {
 			used = G_T5_USED(lo);
 			alloc = G_T5_ALLOC(lo);
 		} else {
 			used = G_USED(lo);
 			alloc = G_ALLOC(lo);
 		}
 		/* For T6 these are MAC buffer groups */
 		sbuf_printf(sb, "\nPort %d using %u pages out of %u allocated",
 		    i, used, alloc);
 	}
 	for (i = 0; i < sc->chip_params->nchan; i++) {
 		if (chip_id(sc) > CHELSIO_T5)
 			lo = t4_read_reg(sc, A_MPS_RX_LPBK_BG_PG_CNT0 + i * 4);
 		else
 			lo = t4_read_reg(sc, A_MPS_RX_PG_RSV4 + i * 4);
 		if (is_t5(sc)) {
 			used = G_T5_USED(lo);
 			alloc = G_T5_ALLOC(lo);
 		} else {
 			used = G_USED(lo);
 			alloc = G_ALLOC(lo);
 		}
 		/* For T6 these are MAC buffer groups */
 		sbuf_printf(sb,
 		    "\nLoopback %d using %u pages out of %u allocated",
 		    i, used, alloc);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static inline void
 tcamxy2valmask(uint64_t x, uint64_t y, uint8_t *addr, uint64_t *mask)
 {
 	*mask = x | y;
 	y = htobe64(y);
 	memcpy(addr, (char *)&y + 2, ETHER_ADDR_LEN);
 }
 
 static int
 sysctl_mps_tcam(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 
 	MPASS(chip_id(sc) <= CHELSIO_T5);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb,
 	    "Idx  Ethernet address     Mask     Vld Ports PF"
 	    "  VF              Replication             P0 P1 P2 P3  ML");
 	for (i = 0; i < sc->chip_params->mps_tcam_size; i++) {
 		uint64_t tcamx, tcamy, mask;
 		uint32_t cls_lo, cls_hi;
 		uint8_t addr[ETHER_ADDR_LEN];
 
 		tcamy = t4_read_reg64(sc, MPS_CLS_TCAM_Y_L(i));
 		tcamx = t4_read_reg64(sc, MPS_CLS_TCAM_X_L(i));
 		if (tcamx & tcamy)
 			continue;
 		tcamxy2valmask(tcamx, tcamy, addr, &mask);
 		cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i));
 		cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i));
 		sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x %012jx"
 			   "  %c   %#x%4u%4d", i, addr[0], addr[1], addr[2],
 			   addr[3], addr[4], addr[5], (uintmax_t)mask,
 			   (cls_lo & F_SRAM_VLD) ? 'Y' : 'N',
 			   G_PORTMAP(cls_hi), G_PF(cls_lo),
 			   (cls_lo & F_VF_VALID) ? G_VF(cls_lo) : -1);
 
 		if (cls_lo & F_REPLICATE) {
 			struct fw_ldst_cmd ldst_cmd;
 
 			memset(&ldst_cmd, 0, sizeof(ldst_cmd));
 			ldst_cmd.op_to_addrspace =
 			    htobe32(V_FW_CMD_OP(FW_LDST_CMD) |
 				F_FW_CMD_REQUEST | F_FW_CMD_READ |
 				V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS));
 			ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd));
 			ldst_cmd.u.mps.rplc.fid_idx =
 			    htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) |
 				V_FW_LDST_CMD_IDX(i));
 
 			rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
 			    "t4mps");
 			if (rc)
 				break;
 			rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd,
 			    sizeof(ldst_cmd), &ldst_cmd);
 			end_synchronized_op(sc, 0);
 
 			if (rc != 0) {
 				sbuf_printf(sb, "%36d", rc);
 				rc = 0;
 			} else {
 				sbuf_printf(sb, " %08x %08x %08x %08x",
 				    be32toh(ldst_cmd.u.mps.rplc.rplc127_96),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc95_64),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc63_32),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc31_0));
 			}
 		} else
 			sbuf_printf(sb, "%36s", "");
 
 		sbuf_printf(sb, "%4u%3u%3u%3u %#3x", G_SRAM_PRIO0(cls_lo),
 		    G_SRAM_PRIO1(cls_lo), G_SRAM_PRIO2(cls_lo),
 		    G_SRAM_PRIO3(cls_lo), (cls_lo >> S_MULTILISTEN0) & 0xf);
 	}
 
 	if (rc)
 		(void) sbuf_finish(sb);
 	else
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 
 	MPASS(chip_id(sc) > CHELSIO_T5);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb, "Idx  Ethernet address     Mask       VNI   Mask"
 	    "   IVLAN Vld DIP_Hit   Lookup  Port Vld Ports PF  VF"
 	    "                           Replication"
 	    "                                    P0 P1 P2 P3  ML\n");
 
 	for (i = 0; i < sc->chip_params->mps_tcam_size; i++) {
 		uint8_t dip_hit, vlan_vld, lookup_type, port_num;
 		uint16_t ivlan;
 		uint64_t tcamx, tcamy, val, mask;
 		uint32_t cls_lo, cls_hi, ctl, data2, vnix, vniy;
 		uint8_t addr[ETHER_ADDR_LEN];
 
 		ctl = V_CTLREQID(1) | V_CTLCMDTYPE(0) | V_CTLXYBITSEL(0);
 		if (i < 256)
 			ctl |= V_CTLTCAMINDEX(i) | V_CTLTCAMSEL(0);
 		else
 			ctl |= V_CTLTCAMINDEX(i - 256) | V_CTLTCAMSEL(1);
 		t4_write_reg(sc, A_MPS_CLS_TCAM_DATA2_CTL, ctl);
 		val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1);
 		tcamy = G_DMACH(val) << 32;
 		tcamy |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1);
 		data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1);
 		lookup_type = G_DATALKPTYPE(data2);
 		port_num = G_DATAPORTNUM(data2);
 		if (lookup_type && lookup_type != M_DATALKPTYPE) {
 			/* Inner header VNI */
 			vniy = ((data2 & F_DATAVIDH2) << 23) |
 				       (G_DATAVIDH1(data2) << 16) | G_VIDL(val);
 			dip_hit = data2 & F_DATADIPHIT;
 			vlan_vld = 0;
 		} else {
 			vniy = 0;
 			dip_hit = 0;
 			vlan_vld = data2 & F_DATAVIDH2;
 			ivlan = G_VIDL(val);
 		}
 
 		ctl |= V_CTLXYBITSEL(1);
 		t4_write_reg(sc, A_MPS_CLS_TCAM_DATA2_CTL, ctl);
 		val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1);
 		tcamx = G_DMACH(val) << 32;
 		tcamx |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1);
 		data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1);
 		if (lookup_type && lookup_type != M_DATALKPTYPE) {
 			/* Inner header VNI mask */
 			vnix = ((data2 & F_DATAVIDH2) << 23) |
 			       (G_DATAVIDH1(data2) << 16) | G_VIDL(val);
 		} else
 			vnix = 0;
 
 		if (tcamx & tcamy)
 			continue;
 		tcamxy2valmask(tcamx, tcamy, addr, &mask);
 
 		cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i));
 		cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i));
 
 		if (lookup_type && lookup_type != M_DATALKPTYPE) {
 			sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x "
 			    "%012jx %06x %06x    -    -   %3c"
 			    "      'I'  %4x   %3c   %#x%4u%4d", i, addr[0],
 			    addr[1], addr[2], addr[3], addr[4], addr[5],
 			    (uintmax_t)mask, vniy, vnix, dip_hit ? 'Y' : 'N',
 			    port_num, cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N',
 			    G_PORTMAP(cls_hi), G_T6_PF(cls_lo),
 			    cls_lo & F_T6_VF_VALID ? G_T6_VF(cls_lo) : -1);
 		} else {
 			sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x "
 			    "%012jx    -       -   ", i, addr[0], addr[1],
 			    addr[2], addr[3], addr[4], addr[5],
 			    (uintmax_t)mask);
 
 			if (vlan_vld)
 				sbuf_printf(sb, "%4u   Y     ", ivlan);
 			else
 				sbuf_printf(sb, "  -    N     ");
 
 			sbuf_printf(sb, "-      %3c  %4x   %3c   %#x%4u%4d",
 			    lookup_type ? 'I' : 'O', port_num,
 			    cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N',
 			    G_PORTMAP(cls_hi), G_T6_PF(cls_lo),
 			    cls_lo & F_T6_VF_VALID ? G_T6_VF(cls_lo) : -1);
 		}
 
 
 		if (cls_lo & F_T6_REPLICATE) {
 			struct fw_ldst_cmd ldst_cmd;
 
 			memset(&ldst_cmd, 0, sizeof(ldst_cmd));
 			ldst_cmd.op_to_addrspace =
 			    htobe32(V_FW_CMD_OP(FW_LDST_CMD) |
 				F_FW_CMD_REQUEST | F_FW_CMD_READ |
 				V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS));
 			ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd));
 			ldst_cmd.u.mps.rplc.fid_idx =
 			    htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) |
 				V_FW_LDST_CMD_IDX(i));
 
 			rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
 			    "t6mps");
 			if (rc)
 				break;
 			rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd,
 			    sizeof(ldst_cmd), &ldst_cmd);
 			end_synchronized_op(sc, 0);
 
 			if (rc != 0) {
 				sbuf_printf(sb, "%72d", rc);
 				rc = 0;
 			} else {
 				sbuf_printf(sb, " %08x %08x %08x %08x"
 				    " %08x %08x %08x %08x",
 				    be32toh(ldst_cmd.u.mps.rplc.rplc255_224),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc223_192),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc191_160),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc159_128),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc127_96),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc95_64),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc63_32),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc31_0));
 			}
 		} else
 			sbuf_printf(sb, "%72s", "");
 
 		sbuf_printf(sb, "%4u%3u%3u%3u %#x",
 		    G_T6_SRAM_PRIO0(cls_lo), G_T6_SRAM_PRIO1(cls_lo),
 		    G_T6_SRAM_PRIO2(cls_lo), G_T6_SRAM_PRIO3(cls_lo),
 		    (cls_lo >> S_T6_MULTILISTEN0) & 0xf);
 	}
 
 	if (rc)
 		(void) sbuf_finish(sb);
 	else
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_path_mtus(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	uint16_t mtus[NMTUS];
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	t4_read_mtu_tbl(sc, mtus, NULL);
 
 	sbuf_printf(sb, "%u %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u",
 	    mtus[0], mtus[1], mtus[2], mtus[3], mtus[4], mtus[5], mtus[6],
 	    mtus[7], mtus[8], mtus[9], mtus[10], mtus[11], mtus[12], mtus[13],
 	    mtus[14], mtus[15]);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_pm_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 	uint32_t tx_cnt[MAX_PM_NSTATS], rx_cnt[MAX_PM_NSTATS];
 	uint64_t tx_cyc[MAX_PM_NSTATS], rx_cyc[MAX_PM_NSTATS];
 	static const char *tx_stats[MAX_PM_NSTATS] = {
 		"Read:", "Write bypass:", "Write mem:", "Bypass + mem:",
 		"Tx FIFO wait", NULL, "Tx latency"
 	};
 	static const char *rx_stats[MAX_PM_NSTATS] = {
 		"Read:", "Write bypass:", "Write mem:", "Flush:",
 		"Rx FIFO wait", NULL, "Rx latency"
 	};
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	t4_pmtx_get_stats(sc, tx_cnt, tx_cyc);
 	t4_pmrx_get_stats(sc, rx_cnt, rx_cyc);
 
 	sbuf_printf(sb, "                Tx pcmds             Tx bytes");
 	for (i = 0; i < 4; i++) {
 		sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i],
 		    tx_cyc[i]);
 	}
 
 	sbuf_printf(sb, "\n                Rx pcmds             Rx bytes");
 	for (i = 0; i < 4; i++) {
 		sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i],
 		    rx_cyc[i]);
 	}
 
 	if (chip_id(sc) > CHELSIO_T5) {
 		sbuf_printf(sb,
 		    "\n              Total wait      Total occupancy");
 		sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i],
 		    tx_cyc[i]);
 		sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i],
 		    rx_cyc[i]);
 
 		i += 2;
 		MPASS(i < nitems(tx_stats));
 
 		sbuf_printf(sb,
 		    "\n                   Reads           Total wait");
 		sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i],
 		    tx_cyc[i]);
 		sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i],
 		    rx_cyc[i]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_rdma_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_rdma_stats stats;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	mtx_lock(&sc->reg_lock);
 	t4_tp_get_rdma_stats(sc, &stats, 0);
 	mtx_unlock(&sc->reg_lock);
 
 	sbuf_printf(sb, "NoRQEModDefferals: %u\n", stats.rqe_dfr_mod);
 	sbuf_printf(sb, "NoRQEPktDefferals: %u", stats.rqe_dfr_pkt);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_tcp_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_tcp_stats v4, v6;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	mtx_lock(&sc->reg_lock);
 	t4_tp_get_tcp_stats(sc, &v4, &v6, 0);
 	mtx_unlock(&sc->reg_lock);
 
 	sbuf_printf(sb,
 	    "                                IP                 IPv6\n");
 	sbuf_printf(sb, "OutRsts:      %20u %20u\n",
 	    v4.tcp_out_rsts, v6.tcp_out_rsts);
 	sbuf_printf(sb, "InSegs:       %20ju %20ju\n",
 	    v4.tcp_in_segs, v6.tcp_in_segs);
 	sbuf_printf(sb, "OutSegs:      %20ju %20ju\n",
 	    v4.tcp_out_segs, v6.tcp_out_segs);
 	sbuf_printf(sb, "RetransSegs:  %20ju %20ju",
 	    v4.tcp_retrans_segs, v6.tcp_retrans_segs);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_tids(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tid_info *t = &sc->tids;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	if (t->natids) {
 		sbuf_printf(sb, "ATID range: 0-%u, in use: %u\n", t->natids - 1,
 		    t->atids_in_use);
 	}
 
 	if (t->nhpftids) {
 		sbuf_printf(sb, "HPFTID range: %u-%u, in use: %u\n",
 		    t->hpftid_base, t->hpftid_end, t->hpftids_in_use);
 	}
 
 	if (t->ntids) {
 		sbuf_printf(sb, "TID range: ");
 		if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) {
 			uint32_t b, hb;
 
 			if (chip_id(sc) <= CHELSIO_T5) {
 				b = t4_read_reg(sc, A_LE_DB_SERVER_INDEX) / 4;
 				hb = t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4;
 			} else {
 				b = t4_read_reg(sc, A_LE_DB_SRVR_START_INDEX);
 				hb = t4_read_reg(sc, A_T6_LE_DB_HASH_TID_BASE);
 			}
 
 			if (b)
 				sbuf_printf(sb, "%u-%u, ", t->tid_base, b - 1);
 			sbuf_printf(sb, "%u-%u", hb, t->ntids - 1);
 		} else
 			sbuf_printf(sb, "%u-%u", t->tid_base, t->ntids - 1);
 		sbuf_printf(sb, ", in use: %u\n",
 		    atomic_load_acq_int(&t->tids_in_use));
 	}
 
 	if (t->nstids) {
 		sbuf_printf(sb, "STID range: %u-%u, in use: %u\n", t->stid_base,
 		    t->stid_base + t->nstids - 1, t->stids_in_use);
 	}
 
 	if (t->nftids) {
 		sbuf_printf(sb, "FTID range: %u-%u, in use: %u\n", t->ftid_base,
 		    t->ftid_end, t->ftids_in_use);
 	}
 
 	if (t->netids) {
 		sbuf_printf(sb, "ETID range: %u-%u, in use: %u\n", t->etid_base,
 		    t->etid_base + t->netids - 1, t->etids_in_use);
 	}
 
 	sbuf_printf(sb, "HW TID usage: %u IP users, %u IPv6 users",
 	    t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV4),
 	    t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV6));
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_err_stats stats;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	mtx_lock(&sc->reg_lock);
 	t4_tp_get_err_stats(sc, &stats, 0);
 	mtx_unlock(&sc->reg_lock);
 
 	if (sc->chip_params->nchan > 2) {
 		sbuf_printf(sb, "                 channel 0  channel 1"
 		    "  channel 2  channel 3\n");
 		sbuf_printf(sb, "macInErrs:      %10u %10u %10u %10u\n",
 		    stats.mac_in_errs[0], stats.mac_in_errs[1],
 		    stats.mac_in_errs[2], stats.mac_in_errs[3]);
 		sbuf_printf(sb, "hdrInErrs:      %10u %10u %10u %10u\n",
 		    stats.hdr_in_errs[0], stats.hdr_in_errs[1],
 		    stats.hdr_in_errs[2], stats.hdr_in_errs[3]);
 		sbuf_printf(sb, "tcpInErrs:      %10u %10u %10u %10u\n",
 		    stats.tcp_in_errs[0], stats.tcp_in_errs[1],
 		    stats.tcp_in_errs[2], stats.tcp_in_errs[3]);
 		sbuf_printf(sb, "tcp6InErrs:     %10u %10u %10u %10u\n",
 		    stats.tcp6_in_errs[0], stats.tcp6_in_errs[1],
 		    stats.tcp6_in_errs[2], stats.tcp6_in_errs[3]);
 		sbuf_printf(sb, "tnlCongDrops:   %10u %10u %10u %10u\n",
 		    stats.tnl_cong_drops[0], stats.tnl_cong_drops[1],
 		    stats.tnl_cong_drops[2], stats.tnl_cong_drops[3]);
 		sbuf_printf(sb, "tnlTxDrops:     %10u %10u %10u %10u\n",
 		    stats.tnl_tx_drops[0], stats.tnl_tx_drops[1],
 		    stats.tnl_tx_drops[2], stats.tnl_tx_drops[3]);
 		sbuf_printf(sb, "ofldVlanDrops:  %10u %10u %10u %10u\n",
 		    stats.ofld_vlan_drops[0], stats.ofld_vlan_drops[1],
 		    stats.ofld_vlan_drops[2], stats.ofld_vlan_drops[3]);
 		sbuf_printf(sb, "ofldChanDrops:  %10u %10u %10u %10u\n\n",
 		    stats.ofld_chan_drops[0], stats.ofld_chan_drops[1],
 		    stats.ofld_chan_drops[2], stats.ofld_chan_drops[3]);
 	} else {
 		sbuf_printf(sb, "                 channel 0  channel 1\n");
 		sbuf_printf(sb, "macInErrs:      %10u %10u\n",
 		    stats.mac_in_errs[0], stats.mac_in_errs[1]);
 		sbuf_printf(sb, "hdrInErrs:      %10u %10u\n",
 		    stats.hdr_in_errs[0], stats.hdr_in_errs[1]);
 		sbuf_printf(sb, "tcpInErrs:      %10u %10u\n",
 		    stats.tcp_in_errs[0], stats.tcp_in_errs[1]);
 		sbuf_printf(sb, "tcp6InErrs:     %10u %10u\n",
 		    stats.tcp6_in_errs[0], stats.tcp6_in_errs[1]);
 		sbuf_printf(sb, "tnlCongDrops:   %10u %10u\n",
 		    stats.tnl_cong_drops[0], stats.tnl_cong_drops[1]);
 		sbuf_printf(sb, "tnlTxDrops:     %10u %10u\n",
 		    stats.tnl_tx_drops[0], stats.tnl_tx_drops[1]);
 		sbuf_printf(sb, "ofldVlanDrops:  %10u %10u\n",
 		    stats.ofld_vlan_drops[0], stats.ofld_vlan_drops[1]);
 		sbuf_printf(sb, "ofldChanDrops:  %10u %10u\n\n",
 		    stats.ofld_chan_drops[0], stats.ofld_chan_drops[1]);
 	}
 
 	sbuf_printf(sb, "ofldNoNeigh:    %u\nofldCongDefer:  %u",
 	    stats.ofld_no_neigh, stats.ofld_cong_defer);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct tp_params *tpp = &sc->params.tp;
 	u_int mask;
 	int rc;
 
 	mask = tpp->la_mask >> 16;
 	rc = sysctl_handle_int(oidp, &mask, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 	if (mask > 0xffff)
 		return (EINVAL);
 	tpp->la_mask = mask << 16;
 	t4_set_reg_field(sc, A_TP_DBG_LA_CONFIG, 0xffff0000U, tpp->la_mask);
 
 	return (0);
 }
 
 struct field_desc {
 	const char *name;
 	u_int start;
 	u_int width;
 };
 
 static void
 field_desc_show(struct sbuf *sb, uint64_t v, const struct field_desc *f)
 {
 	char buf[32];
 	int line_size = 0;
 
 	while (f->name) {
 		uint64_t mask = (1ULL << f->width) - 1;
 		int len = snprintf(buf, sizeof(buf), "%s: %ju", f->name,
 		    ((uintmax_t)v >> f->start) & mask);
 
 		if (line_size + len >= 79) {
 			line_size = 8;
 			sbuf_printf(sb, "\n        ");
 		}
 		sbuf_printf(sb, "%s ", buf);
 		line_size += len + 1;
 		f++;
 	}
 	sbuf_printf(sb, "\n");
 }
 
 static const struct field_desc tp_la0[] = {
 	{ "RcfOpCodeOut", 60, 4 },
 	{ "State", 56, 4 },
 	{ "WcfState", 52, 4 },
 	{ "RcfOpcSrcOut", 50, 2 },
 	{ "CRxError", 49, 1 },
 	{ "ERxError", 48, 1 },
 	{ "SanityFailed", 47, 1 },
 	{ "SpuriousMsg", 46, 1 },
 	{ "FlushInputMsg", 45, 1 },
 	{ "FlushInputCpl", 44, 1 },
 	{ "RssUpBit", 43, 1 },
 	{ "RssFilterHit", 42, 1 },
 	{ "Tid", 32, 10 },
 	{ "InitTcb", 31, 1 },
 	{ "LineNumber", 24, 7 },
 	{ "Emsg", 23, 1 },
 	{ "EdataOut", 22, 1 },
 	{ "Cmsg", 21, 1 },
 	{ "CdataOut", 20, 1 },
 	{ "EreadPdu", 19, 1 },
 	{ "CreadPdu", 18, 1 },
 	{ "TunnelPkt", 17, 1 },
 	{ "RcfPeerFin", 16, 1 },
 	{ "RcfReasonOut", 12, 4 },
 	{ "TxCchannel", 10, 2 },
 	{ "RcfTxChannel", 8, 2 },
 	{ "RxEchannel", 6, 2 },
 	{ "RcfRxChannel", 5, 1 },
 	{ "RcfDataOutSrdy", 4, 1 },
 	{ "RxDvld", 3, 1 },
 	{ "RxOoDvld", 2, 1 },
 	{ "RxCongestion", 1, 1 },
 	{ "TxCongestion", 0, 1 },
 	{ NULL }
 };
 
 static const struct field_desc tp_la1[] = {
 	{ "CplCmdIn", 56, 8 },
 	{ "CplCmdOut", 48, 8 },
 	{ "ESynOut", 47, 1 },
 	{ "EAckOut", 46, 1 },
 	{ "EFinOut", 45, 1 },
 	{ "ERstOut", 44, 1 },
 	{ "SynIn", 43, 1 },
 	{ "AckIn", 42, 1 },
 	{ "FinIn", 41, 1 },
 	{ "RstIn", 40, 1 },
 	{ "DataIn", 39, 1 },
 	{ "DataInVld", 38, 1 },
 	{ "PadIn", 37, 1 },
 	{ "RxBufEmpty", 36, 1 },
 	{ "RxDdp", 35, 1 },
 	{ "RxFbCongestion", 34, 1 },
 	{ "TxFbCongestion", 33, 1 },
 	{ "TxPktSumSrdy", 32, 1 },
 	{ "RcfUlpType", 28, 4 },
 	{ "Eread", 27, 1 },
 	{ "Ebypass", 26, 1 },
 	{ "Esave", 25, 1 },
 	{ "Static0", 24, 1 },
 	{ "Cread", 23, 1 },
 	{ "Cbypass", 22, 1 },
 	{ "Csave", 21, 1 },
 	{ "CPktOut", 20, 1 },
 	{ "RxPagePoolFull", 18, 2 },
 	{ "RxLpbkPkt", 17, 1 },
 	{ "TxLpbkPkt", 16, 1 },
 	{ "RxVfValid", 15, 1 },
 	{ "SynLearned", 14, 1 },
 	{ "SetDelEntry", 13, 1 },
 	{ "SetInvEntry", 12, 1 },
 	{ "CpcmdDvld", 11, 1 },
 	{ "CpcmdSave", 10, 1 },
 	{ "RxPstructsFull", 8, 2 },
 	{ "EpcmdDvld", 7, 1 },
 	{ "EpcmdFlush", 6, 1 },
 	{ "EpcmdTrimPrefix", 5, 1 },
 	{ "EpcmdTrimPostfix", 4, 1 },
 	{ "ERssIp4Pkt", 3, 1 },
 	{ "ERssIp6Pkt", 2, 1 },
 	{ "ERssTcpUdpPkt", 1, 1 },
 	{ "ERssFceFipPkt", 0, 1 },
 	{ NULL }
 };
 
 static const struct field_desc tp_la2[] = {
 	{ "CplCmdIn", 56, 8 },
 	{ "MpsVfVld", 55, 1 },
 	{ "MpsPf", 52, 3 },
 	{ "MpsVf", 44, 8 },
 	{ "SynIn", 43, 1 },
 	{ "AckIn", 42, 1 },
 	{ "FinIn", 41, 1 },
 	{ "RstIn", 40, 1 },
 	{ "DataIn", 39, 1 },
 	{ "DataInVld", 38, 1 },
 	{ "PadIn", 37, 1 },
 	{ "RxBufEmpty", 36, 1 },
 	{ "RxDdp", 35, 1 },
 	{ "RxFbCongestion", 34, 1 },
 	{ "TxFbCongestion", 33, 1 },
 	{ "TxPktSumSrdy", 32, 1 },
 	{ "RcfUlpType", 28, 4 },
 	{ "Eread", 27, 1 },
 	{ "Ebypass", 26, 1 },
 	{ "Esave", 25, 1 },
 	{ "Static0", 24, 1 },
 	{ "Cread", 23, 1 },
 	{ "Cbypass", 22, 1 },
 	{ "Csave", 21, 1 },
 	{ "CPktOut", 20, 1 },
 	{ "RxPagePoolFull", 18, 2 },
 	{ "RxLpbkPkt", 17, 1 },
 	{ "TxLpbkPkt", 16, 1 },
 	{ "RxVfValid", 15, 1 },
 	{ "SynLearned", 14, 1 },
 	{ "SetDelEntry", 13, 1 },
 	{ "SetInvEntry", 12, 1 },
 	{ "CpcmdDvld", 11, 1 },
 	{ "CpcmdSave", 10, 1 },
 	{ "RxPstructsFull", 8, 2 },
 	{ "EpcmdDvld", 7, 1 },
 	{ "EpcmdFlush", 6, 1 },
 	{ "EpcmdTrimPrefix", 5, 1 },
 	{ "EpcmdTrimPostfix", 4, 1 },
 	{ "ERssIp4Pkt", 3, 1 },
 	{ "ERssIp6Pkt", 2, 1 },
 	{ "ERssTcpUdpPkt", 1, 1 },
 	{ "ERssFceFipPkt", 0, 1 },
 	{ NULL }
 };
 
 static void
 tp_la_show(struct sbuf *sb, uint64_t *p, int idx)
 {
 
 	field_desc_show(sb, *p, tp_la0);
 }
 
 static void
 tp_la_show2(struct sbuf *sb, uint64_t *p, int idx)
 {
 
 	if (idx)
 		sbuf_printf(sb, "\n");
 	field_desc_show(sb, p[0], tp_la0);
 	if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL)
 		field_desc_show(sb, p[1], tp_la0);
 }
 
 static void
 tp_la_show3(struct sbuf *sb, uint64_t *p, int idx)
 {
 
 	if (idx)
 		sbuf_printf(sb, "\n");
 	field_desc_show(sb, p[0], tp_la0);
 	if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL)
 		field_desc_show(sb, p[1], (p[0] & (1 << 17)) ? tp_la2 : tp_la1);
 }
 
 static int
 sysctl_tp_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	uint64_t *buf, *p;
 	int rc;
 	u_int i, inc;
 	void (*show_func)(struct sbuf *, uint64_t *, int);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(TPLA_SIZE * sizeof(uint64_t), M_CXGBE, M_ZERO | M_WAITOK);
 
 	t4_tp_read_la(sc, buf, NULL);
 	p = buf;
 
 	switch (G_DBGLAMODE(t4_read_reg(sc, A_TP_DBG_LA_CONFIG))) {
 	case 2:
 		inc = 2;
 		show_func = tp_la_show2;
 		break;
 	case 3:
 		inc = 2;
 		show_func = tp_la_show3;
 		break;
 	default:
 		inc = 1;
 		show_func = tp_la_show;
 	}
 
 	for (i = 0; i < TPLA_SIZE / inc; i++, p += inc)
 		(*show_func)(sb, p, i);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 sysctl_tx_rate(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	u64 nrate[MAX_NCHAN], orate[MAX_NCHAN];
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	t4_get_chan_txrate(sc, nrate, orate);
 
 	if (sc->chip_params->nchan > 2) {
 		sbuf_printf(sb, "              channel 0   channel 1"
 		    "   channel 2   channel 3\n");
 		sbuf_printf(sb, "NIC B/s:     %10ju  %10ju  %10ju  %10ju\n",
 		    nrate[0], nrate[1], nrate[2], nrate[3]);
 		sbuf_printf(sb, "Offload B/s: %10ju  %10ju  %10ju  %10ju",
 		    orate[0], orate[1], orate[2], orate[3]);
 	} else {
 		sbuf_printf(sb, "              channel 0   channel 1\n");
 		sbuf_printf(sb, "NIC B/s:     %10ju  %10ju\n",
 		    nrate[0], nrate[1]);
 		sbuf_printf(sb, "Offload B/s: %10ju  %10ju",
 		    orate[0], orate[1]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_ulprx_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	uint32_t *buf, *p;
 	int rc, i;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(ULPRX_LA_SIZE * 8 * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	t4_ulprx_read_la(sc, buf);
 	p = buf;
 
 	sbuf_printf(sb, "      Pcmd        Type   Message"
 	    "                Data");
 	for (i = 0; i < ULPRX_LA_SIZE; i++, p += 8) {
 		sbuf_printf(sb, "\n%08x%08x  %4x  %08x  %08x%08x%08x%08x",
 		    p[1], p[0], p[2], p[3], p[7], p[6], p[5], p[4]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, v;
 
 	MPASS(chip_id(sc) >= CHELSIO_T5);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	v = t4_read_reg(sc, A_SGE_STAT_CFG);
 	if (G_STATSOURCE_T5(v) == 7) {
 		int mode;
 
 		mode = is_t5(sc) ? G_STATMODE(v) : G_T6_STATMODE(v);
 		if (mode == 0) {
 			sbuf_printf(sb, "total %d, incomplete %d",
 			    t4_read_reg(sc, A_SGE_STAT_TOTAL),
 			    t4_read_reg(sc, A_SGE_STAT_MATCH));
 		} else if (mode == 1) {
 			sbuf_printf(sb, "total %d, data overflow %d",
 			    t4_read_reg(sc, A_SGE_STAT_TOTAL),
 			    t4_read_reg(sc, A_SGE_STAT_MATCH));
 		} else {
 			sbuf_printf(sb, "unknown mode %d", mode);
 		}
 	}
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_cpus(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	enum cpu_sets op = arg2;
 	cpuset_t cpuset;
 	struct sbuf *sb;
 	int i, rc;
 
 	MPASS(op == LOCAL_CPUS || op == INTR_CPUS);
 
 	CPU_ZERO(&cpuset);
 	rc = bus_get_cpus(sc->dev, op, sizeof(cpuset), &cpuset);
 	if (rc != 0)
 		return (rc);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	CPU_FOREACH(i)
 		sbuf_printf(sb, "%d ", i);
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 #ifdef TCP_OFFLOAD
 static int
 sysctl_tls_rx_ports(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int *old_ports, *new_ports;
 	int i, new_count, rc;
 
 	if (req->newptr == NULL && req->oldptr == NULL)
 		return (SYSCTL_OUT(req, NULL, imax(sc->tt.num_tls_rx_ports, 1) *
 		    sizeof(sc->tt.tls_rx_ports[0])));
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tlsrx");
 	if (rc)
 		return (rc);
 
 	if (sc->tt.num_tls_rx_ports == 0) {
 		i = -1;
 		rc = SYSCTL_OUT(req, &i, sizeof(i));
 	} else
 		rc = SYSCTL_OUT(req, sc->tt.tls_rx_ports,
 		    sc->tt.num_tls_rx_ports * sizeof(sc->tt.tls_rx_ports[0]));
 	if (rc == 0 && req->newptr != NULL) {
 		new_count = req->newlen / sizeof(new_ports[0]);
 		new_ports = malloc(new_count * sizeof(new_ports[0]), M_CXGBE,
 		    M_WAITOK);
 		rc = SYSCTL_IN(req, new_ports, new_count *
 		    sizeof(new_ports[0]));
 		if (rc)
 			goto err;
 
 		/* Allow setting to a single '-1' to clear the list. */
 		if (new_count == 1 && new_ports[0] == -1) {
 			ADAPTER_LOCK(sc);
 			old_ports = sc->tt.tls_rx_ports;
 			sc->tt.tls_rx_ports = NULL;
 			sc->tt.num_tls_rx_ports = 0;
 			ADAPTER_UNLOCK(sc);
 			free(old_ports, M_CXGBE);
 		} else {
 			for (i = 0; i < new_count; i++) {
 				if (new_ports[i] < 1 ||
 				    new_ports[i] > IPPORT_MAX) {
 					rc = EINVAL;
 					goto err;
 				}
 			}
 
 			ADAPTER_LOCK(sc);
 			old_ports = sc->tt.tls_rx_ports;
 			sc->tt.tls_rx_ports = new_ports;
 			sc->tt.num_tls_rx_ports = new_count;
 			ADAPTER_UNLOCK(sc);
 			free(old_ports, M_CXGBE);
 			new_ports = NULL;
 		}
 	err:
 		free(new_ports, M_CXGBE);
 	}
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static void
 unit_conv(char *buf, size_t len, u_int val, u_int factor)
 {
 	u_int rem = val % factor;
 
 	if (rem == 0)
 		snprintf(buf, len, "%u", val / factor);
 	else {
 		while (rem % 10 == 0)
 			rem /= 10;
 		snprintf(buf, len, "%u.%u", val / factor, rem);
 	}
 }
 
 static int
 sysctl_tp_tick(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	char buf[16];
 	u_int res, re;
 	u_int cclk_ps = 1000000000 / sc->params.vpd.cclk;
 
 	res = t4_read_reg(sc, A_TP_TIMER_RESOLUTION);
 	switch (arg2) {
 	case 0:
 		/* timer_tick */
 		re = G_TIMERRESOLUTION(res);
 		break;
 	case 1:
 		/* TCP timestamp tick */
 		re = G_TIMESTAMPRESOLUTION(res);
 		break;
 	case 2:
 		/* DACK tick */
 		re = G_DELAYEDACKRESOLUTION(res);
 		break;
 	default:
 		return (EDOOFUS);
 	}
 
 	unit_conv(buf, sizeof(buf), (cclk_ps << re), 1000000);
 
 	return (sysctl_handle_string(oidp, buf, sizeof(buf), req));
 }
 
 static int
 sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	u_int res, dack_re, v;
 	u_int cclk_ps = 1000000000 / sc->params.vpd.cclk;
 
 	res = t4_read_reg(sc, A_TP_TIMER_RESOLUTION);
 	dack_re = G_DELAYEDACKRESOLUTION(res);
 	v = ((cclk_ps << dack_re) / 1000000) * t4_read_reg(sc, A_TP_DACK_TIMER);
 
 	return (sysctl_handle_int(oidp, &v, 0, req));
 }
 
 static int
 sysctl_tp_timer(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int reg = arg2;
 	u_int tre;
 	u_long tp_tick_us, v;
 	u_int cclk_ps = 1000000000 / sc->params.vpd.cclk;
 
 	MPASS(reg == A_TP_RXT_MIN || reg == A_TP_RXT_MAX ||
 	    reg == A_TP_PERS_MIN  || reg == A_TP_PERS_MAX ||
 	    reg == A_TP_KEEP_IDLE || reg == A_TP_KEEP_INTVL ||
 	    reg == A_TP_INIT_SRTT || reg == A_TP_FINWAIT2_TIMER);
 
 	tre = G_TIMERRESOLUTION(t4_read_reg(sc, A_TP_TIMER_RESOLUTION));
 	tp_tick_us = (cclk_ps << tre) / 1000000;
 
 	if (reg == A_TP_INIT_SRTT)
 		v = tp_tick_us * G_INITSRTT(t4_read_reg(sc, reg));
 	else
 		v = tp_tick_us * t4_read_reg(sc, reg);
 
 	return (sysctl_handle_long(oidp, &v, 0, req));
 }
 
 /*
  * All fields in TP_SHIFT_CNT are 4b and the starting location of the field is
  * passed to this function.
  */
 static int
 sysctl_tp_shift_cnt(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int idx = arg2;
 	u_int v;
 
 	MPASS(idx >= 0 && idx <= 24);
 
 	v = (t4_read_reg(sc, A_TP_SHIFT_CNT) >> idx) & 0xf;
 
 	return (sysctl_handle_int(oidp, &v, 0, req));
 }
 
 static int
 sysctl_tp_backoff(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int idx = arg2;
 	u_int shift, v, r;
 
 	MPASS(idx >= 0 && idx < 16);
 
 	r = A_TP_TCP_BACKOFF_REG0 + (idx & ~3);
 	shift = (idx & 3) << 3;
 	v = (t4_read_reg(sc, r) >> shift) & M_TIMERBACKOFFINDEX0;
 
 	return (sysctl_handle_int(oidp, &v, 0, req));
 }
 
 static int
 sysctl_holdoff_tmr_idx_ofld(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->pi->adapter;
 	int idx, rc, i;
 	struct sge_ofld_rxq *ofld_rxq;
 	uint8_t v;
 
 	idx = vi->ofld_tmr_idx;
 
 	rc = sysctl_handle_int(oidp, &idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (idx < 0 || idx >= SGE_NTIMERS)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4otmr");
 	if (rc)
 		return (rc);
 
 	v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(vi->ofld_pktc_idx != -1);
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 #ifdef atomic_store_rel_8
 		atomic_store_rel_8(&ofld_rxq->iq.intr_params, v);
 #else
 		ofld_rxq->iq.intr_params = v;
 #endif
 	}
 	vi->ofld_tmr_idx = idx;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (0);
 }
 
 static int
 sysctl_holdoff_pktc_idx_ofld(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->pi->adapter;
 	int idx, rc;
 
 	idx = vi->ofld_pktc_idx;
 
 	rc = sysctl_handle_int(oidp, &idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (idx < -1 || idx >= SGE_NCOUNTERS)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4opktc");
 	if (rc)
 		return (rc);
 
 	if (vi->flags & VI_INIT_DONE)
 		rc = EBUSY; /* cannot be changed once the queues are created */
 	else
 		vi->ofld_pktc_idx = idx;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (rc);
 }
 #endif
 
 static int
 get_sge_context(struct adapter *sc, struct t4_sge_context *cntxt)
 {
 	int rc;
 
 	if (cntxt->cid > M_CTXTQID)
 		return (EINVAL);
 
 	if (cntxt->mem_id != CTXT_EGRESS && cntxt->mem_id != CTXT_INGRESS &&
 	    cntxt->mem_id != CTXT_FLM && cntxt->mem_id != CTXT_CNM)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ctxt");
 	if (rc)
 		return (rc);
 
 	if (sc->flags & FW_OK) {
 		rc = -t4_sge_ctxt_rd(sc, sc->mbox, cntxt->cid, cntxt->mem_id,
 		    &cntxt->data[0]);
 		if (rc == 0)
 			goto done;
 	}
 
 	/*
 	 * Read via firmware failed or wasn't even attempted.  Read directly via
 	 * the backdoor.
 	 */
 	rc = -t4_sge_ctxt_rd_bd(sc, cntxt->cid, cntxt->mem_id, &cntxt->data[0]);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 load_fw(struct adapter *sc, struct t4_data *fw)
 {
 	int rc;
 	uint8_t *fw_data;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldfw");
 	if (rc)
 		return (rc);
 
 	/*
 	 * The firmware, with the sole exception of the memory parity error
 	 * handler, runs from memory and not flash.  It is almost always safe to
 	 * install a new firmware on a running system.  Just set bit 1 in
 	 * hw.cxgbe.dflags or dev.<nexus>.<n>.dflags first.
 	 */
 	if (sc->flags & FULL_INIT_DONE &&
 	    (sc->debug_flags & DF_LOAD_FW_ANYTIME) == 0) {
 		rc = EBUSY;
 		goto done;
 	}
 
 	fw_data = malloc(fw->len, M_CXGBE, M_WAITOK);
 	if (fw_data == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 
 	rc = copyin(fw->data, fw_data, fw->len);
 	if (rc == 0)
 		rc = -t4_load_fw(sc, fw_data, fw->len);
 
 	free(fw_data, M_CXGBE);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 load_cfg(struct adapter *sc, struct t4_data *cfg)
 {
 	int rc;
 	uint8_t *cfg_data = NULL;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldcf");
 	if (rc)
 		return (rc);
 
 	if (cfg->len == 0) {
 		/* clear */
 		rc = -t4_load_cfg(sc, NULL, 0);
 		goto done;
 	}
 
 	cfg_data = malloc(cfg->len, M_CXGBE, M_WAITOK);
 	if (cfg_data == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 
 	rc = copyin(cfg->data, cfg_data, cfg->len);
 	if (rc == 0)
 		rc = -t4_load_cfg(sc, cfg_data, cfg->len);
 
 	free(cfg_data, M_CXGBE);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 load_boot(struct adapter *sc, struct t4_bootrom *br)
 {
 	int rc;
 	uint8_t *br_data = NULL;
 	u_int offset;
 
 	if (br->len > 1024 * 1024)
 		return (EFBIG);
 
 	if (br->pf_offset == 0) {
 		/* pfidx */
 		if (br->pfidx_addr > 7)
 			return (EINVAL);
 		offset = G_OFFSET(t4_read_reg(sc, PF_REG(br->pfidx_addr,
 		    A_PCIE_PF_EXPROM_OFST)));
 	} else if (br->pf_offset == 1) {
 		/* offset */
 		offset = G_OFFSET(br->pfidx_addr);
 	} else {
 		return (EINVAL);
 	}
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldbr");
 	if (rc)
 		return (rc);
 
 	if (br->len == 0) {
 		/* clear */
 		rc = -t4_load_boot(sc, NULL, offset, 0);
 		goto done;
 	}
 
 	br_data = malloc(br->len, M_CXGBE, M_WAITOK);
 	if (br_data == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 
 	rc = copyin(br->data, br_data, br->len);
 	if (rc == 0)
 		rc = -t4_load_boot(sc, br_data, offset, br->len);
 
 	free(br_data, M_CXGBE);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 load_bootcfg(struct adapter *sc, struct t4_data *bc)
 {
 	int rc;
 	uint8_t *bc_data = NULL;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldcf");
 	if (rc)
 		return (rc);
 
 	if (bc->len == 0) {
 		/* clear */
 		rc = -t4_load_bootcfg(sc, NULL, 0);
 		goto done;
 	}
 
 	bc_data = malloc(bc->len, M_CXGBE, M_WAITOK);
 	if (bc_data == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 
 	rc = copyin(bc->data, bc_data, bc->len);
 	if (rc == 0)
 		rc = -t4_load_bootcfg(sc, bc_data, bc->len);
 
 	free(bc_data, M_CXGBE);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 cudbg_dump(struct adapter *sc, struct t4_cudbg_dump *dump)
 {
 	int rc;
 	struct cudbg_init *cudbg;
 	void *handle, *buf;
 
 	/* buf is large, don't block if no memory is available */
 	buf = malloc(dump->len, M_CXGBE, M_NOWAIT | M_ZERO);
 	if (buf == NULL)
 		return (ENOMEM);
 
 	handle = cudbg_alloc_handle();
 	if (handle == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 
 	cudbg = cudbg_get_init(handle);
 	cudbg->adap = sc;
 	cudbg->print = (cudbg_print_cb)printf;
 
 #ifndef notyet
 	device_printf(sc->dev, "%s: wr_flash %u, len %u, data %p.\n",
 	    __func__, dump->wr_flash, dump->len, dump->data);
 #endif
 
 	if (dump->wr_flash)
 		cudbg->use_flash = 1;
 	MPASS(sizeof(cudbg->dbg_bitmap) == sizeof(dump->bitmap));
 	memcpy(cudbg->dbg_bitmap, dump->bitmap, sizeof(cudbg->dbg_bitmap));
 
 	rc = cudbg_collect(handle, buf, &dump->len);
 	if (rc != 0)
 		goto done;
 
 	rc = copyout(buf, dump->data, dump->len);
 done:
 	cudbg_free_handle(handle);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static void
 free_offload_policy(struct t4_offload_policy *op)
 {
 	struct offload_rule *r;
 	int i;
 
 	if (op == NULL)
 		return;
 
 	r = &op->rule[0];
 	for (i = 0; i < op->nrules; i++, r++) {
 		free(r->bpf_prog.bf_insns, M_CXGBE);
 	}
 	free(op->rule, M_CXGBE);
 	free(op, M_CXGBE);
 }
 
 static int
 set_offload_policy(struct adapter *sc, struct t4_offload_policy *uop)
 {
 	int i, rc, len;
 	struct t4_offload_policy *op, *old;
 	struct bpf_program *bf;
 	const struct offload_settings *s;
 	struct offload_rule *r;
 	void *u;
 
 	if (!is_offload(sc))
 		return (ENODEV);
 
 	if (uop->nrules == 0) {
 		/* Delete installed policies. */
 		op = NULL;
 		goto set_policy;
 	} if (uop->nrules > 256) { /* arbitrary */
 		return (E2BIG);
 	}
 
 	/* Copy userspace offload policy to kernel */
 	op = malloc(sizeof(*op), M_CXGBE, M_ZERO | M_WAITOK);
 	op->nrules = uop->nrules;
 	len = op->nrules * sizeof(struct offload_rule);
 	op->rule = malloc(len, M_CXGBE, M_ZERO | M_WAITOK);
 	rc = copyin(uop->rule, op->rule, len);
 	if (rc) {
 		free(op->rule, M_CXGBE);
 		free(op, M_CXGBE);
 		return (rc);
 	}
 
 	r = &op->rule[0];
 	for (i = 0; i < op->nrules; i++, r++) {
 
 		/* Validate open_type */
 		if (r->open_type != OPEN_TYPE_LISTEN &&
 		    r->open_type != OPEN_TYPE_ACTIVE &&
 		    r->open_type != OPEN_TYPE_PASSIVE &&
 		    r->open_type != OPEN_TYPE_DONTCARE) {
 error:
 			/*
 			 * Rules 0 to i have malloc'd filters that need to be
 			 * freed.  Rules i+1 to nrules have userspace pointers
 			 * and should be left alone.
 			 */
 			op->nrules = i;
 			free_offload_policy(op);
 			return (rc);
 		}
 
 		/* Validate settings */
 		s = &r->settings;
 		if ((s->offload != 0 && s->offload != 1) ||
 		    s->cong_algo < -1 || s->cong_algo > CONG_ALG_HIGHSPEED ||
 		    s->sched_class < -1 ||
 		    s->sched_class >= sc->chip_params->nsched_cls) {
 			rc = EINVAL;
 			goto error;
 		}
 
 		bf = &r->bpf_prog;
 		u = bf->bf_insns;	/* userspace ptr */
 		bf->bf_insns = NULL;
 		if (bf->bf_len == 0) {
 			/* legal, matches everything */
 			continue;
 		}
 		len = bf->bf_len * sizeof(*bf->bf_insns);
 		bf->bf_insns = malloc(len, M_CXGBE, M_ZERO | M_WAITOK);
 		rc = copyin(u, bf->bf_insns, len);
 		if (rc != 0)
 			goto error;
 
 		if (!bpf_validate(bf->bf_insns, bf->bf_len)) {
 			rc = EINVAL;
 			goto error;
 		}
 	}
 set_policy:
 	rw_wlock(&sc->policy_lock);
 	old = sc->policy;
 	sc->policy = op;
 	rw_wunlock(&sc->policy_lock);
 	free_offload_policy(old);
 
 	return (0);
 }
 
 #define MAX_READ_BUF_SIZE (128 * 1024)
 static int
 read_card_mem(struct adapter *sc, int win, struct t4_mem_range *mr)
 {
 	uint32_t addr, remaining, n;
 	uint32_t *buf;
 	int rc;
 	uint8_t *dst;
 
 	rc = validate_mem_range(sc, mr->addr, mr->len);
 	if (rc != 0)
 		return (rc);
 
 	buf = malloc(min(mr->len, MAX_READ_BUF_SIZE), M_CXGBE, M_WAITOK);
 	addr = mr->addr;
 	remaining = mr->len;
 	dst = (void *)mr->data;
 
 	while (remaining) {
 		n = min(remaining, MAX_READ_BUF_SIZE);
 		read_via_memwin(sc, 2, addr, buf, n);
 
 		rc = copyout(buf, dst, n);
 		if (rc != 0)
 			break;
 
 		dst += n;
 		remaining -= n;
 		addr += n;
 	}
 
 	free(buf, M_CXGBE);
 	return (rc);
 }
 #undef MAX_READ_BUF_SIZE
 
 static int
 read_i2c(struct adapter *sc, struct t4_i2c_data *i2cd)
 {
 	int rc;
 
 	if (i2cd->len == 0 || i2cd->port_id >= sc->params.nports)
 		return (EINVAL);
 
 	if (i2cd->len > sizeof(i2cd->data))
 		return (EFBIG);
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4i2crd");
 	if (rc)
 		return (rc);
 	rc = -t4_i2c_rd(sc, sc->mbox, i2cd->port_id, i2cd->dev_addr,
 	    i2cd->offset, i2cd->len, &i2cd->data[0]);
 	end_synchronized_op(sc, 0);
 
 	return (rc);
 }
 
 int
 t4_os_find_pci_capability(struct adapter *sc, int cap)
 {
 	int i;
 
 	return (pci_find_cap(sc->dev, cap, &i) == 0 ? i : 0);
 }
 
 int
 t4_os_pci_save_state(struct adapter *sc)
 {
 	device_t dev;
 	struct pci_devinfo *dinfo;
 
 	dev = sc->dev;
 	dinfo = device_get_ivars(dev);
 
 	pci_cfg_save(dev, dinfo, 0);
 	return (0);
 }
 
 int
 t4_os_pci_restore_state(struct adapter *sc)
 {
 	device_t dev;
 	struct pci_devinfo *dinfo;
 
 	dev = sc->dev;
 	dinfo = device_get_ivars(dev);
 
 	pci_cfg_restore(dev, dinfo);
 	return (0);
 }
 
 void
 t4_os_portmod_changed(struct port_info *pi)
 {
 	struct adapter *sc = pi->adapter;
 	struct vi_info *vi;
 	struct ifnet *ifp;
 	static const char *mod_str[] = {
 		NULL, "LR", "SR", "ER", "TWINAX", "active TWINAX", "LRM"
 	};
 
 	KASSERT((pi->flags & FIXED_IFMEDIA) == 0,
 	    ("%s: port_type %u", __func__, pi->port_type));
 
 	vi = &pi->vi[0];
 	if (begin_synchronized_op(sc, vi, HOLD_LOCK, "t4mod") == 0) {
 		PORT_LOCK(pi);
 		build_medialist(pi);
 		if (pi->mod_type != FW_PORT_MOD_TYPE_NONE) {
 			fixup_link_config(pi);
 			apply_link_config(pi);
 		}
 		PORT_UNLOCK(pi);
 		end_synchronized_op(sc, LOCK_HELD);
 	}
 
 	ifp = vi->ifp;
 	if (pi->mod_type == FW_PORT_MOD_TYPE_NONE)
 		if_printf(ifp, "transceiver unplugged.\n");
 	else if (pi->mod_type == FW_PORT_MOD_TYPE_UNKNOWN)
 		if_printf(ifp, "unknown transceiver inserted.\n");
 	else if (pi->mod_type == FW_PORT_MOD_TYPE_NOTSUPPORTED)
 		if_printf(ifp, "unsupported transceiver inserted.\n");
 	else if (pi->mod_type > 0 && pi->mod_type < nitems(mod_str)) {
 		if_printf(ifp, "%dGbps %s transceiver inserted.\n",
 		    port_top_speed(pi), mod_str[pi->mod_type]);
 	} else {
 		if_printf(ifp, "transceiver (type %d) inserted.\n",
 		    pi->mod_type);
 	}
 }
 
 void
 t4_os_link_changed(struct port_info *pi)
 {
 	struct vi_info *vi;
 	struct ifnet *ifp;
 	struct link_config *lc;
 	int v;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	for_each_vi(pi, v, vi) {
 		ifp = vi->ifp;
 		if (ifp == NULL)
 			continue;
 
 		lc = &pi->link_cfg;
 		if (lc->link_ok) {
 			ifp->if_baudrate = IF_Mbps(lc->speed);
 			if_link_state_change(ifp, LINK_STATE_UP);
 		} else {
 			if_link_state_change(ifp, LINK_STATE_DOWN);
 		}
 	}
 }
 
 void
 t4_iterate(void (*func)(struct adapter *, void *), void *arg)
 {
 	struct adapter *sc;
 
 	sx_slock(&t4_list_lock);
 	SLIST_FOREACH(sc, &t4_list, link) {
 		/*
 		 * func should not make any assumptions about what state sc is
 		 * in - the only guarantee is that sc->sc_lock is a valid lock.
 		 */
 		func(sc, arg);
 	}
 	sx_sunlock(&t4_list_lock);
 }
 
 static int
 t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag,
     struct thread *td)
 {
 	int rc;
 	struct adapter *sc = dev->si_drv1;
 
 	rc = priv_check(td, PRIV_DRIVER);
 	if (rc != 0)
 		return (rc);
 
 	switch (cmd) {
 	case CHELSIO_T4_GETREG: {
 		struct t4_reg *edata = (struct t4_reg *)data;
 
 		if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len)
 			return (EFAULT);
 
 		if (edata->size == 4)
 			edata->val = t4_read_reg(sc, edata->addr);
 		else if (edata->size == 8)
 			edata->val = t4_read_reg64(sc, edata->addr);
 		else
 			return (EINVAL);
 
 		break;
 	}
 	case CHELSIO_T4_SETREG: {
 		struct t4_reg *edata = (struct t4_reg *)data;
 
 		if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len)
 			return (EFAULT);
 
 		if (edata->size == 4) {
 			if (edata->val & 0xffffffff00000000)
 				return (EINVAL);
 			t4_write_reg(sc, edata->addr, (uint32_t) edata->val);
 		} else if (edata->size == 8)
 			t4_write_reg64(sc, edata->addr, edata->val);
 		else
 			return (EINVAL);
 		break;
 	}
 	case CHELSIO_T4_REGDUMP: {
 		struct t4_regdump *regs = (struct t4_regdump *)data;
 		int reglen = t4_get_regs_len(sc);
 		uint8_t *buf;
 
 		if (regs->len < reglen) {
 			regs->len = reglen; /* hint to the caller */
 			return (ENOBUFS);
 		}
 
 		regs->len = reglen;
 		buf = malloc(reglen, M_CXGBE, M_WAITOK | M_ZERO);
 		get_regs(sc, regs, buf);
 		rc = copyout(buf, regs->data, reglen);
 		free(buf, M_CXGBE);
 		break;
 	}
 	case CHELSIO_T4_GET_FILTER_MODE:
 		rc = get_filter_mode(sc, (uint32_t *)data);
 		break;
 	case CHELSIO_T4_SET_FILTER_MODE:
 		rc = set_filter_mode(sc, *(uint32_t *)data);
 		break;
 	case CHELSIO_T4_GET_FILTER:
 		rc = get_filter(sc, (struct t4_filter *)data);
 		break;
 	case CHELSIO_T4_SET_FILTER:
 		rc = set_filter(sc, (struct t4_filter *)data);
 		break;
 	case CHELSIO_T4_DEL_FILTER:
 		rc = del_filter(sc, (struct t4_filter *)data);
 		break;
 	case CHELSIO_T4_GET_SGE_CONTEXT:
 		rc = get_sge_context(sc, (struct t4_sge_context *)data);
 		break;
 	case CHELSIO_T4_LOAD_FW:
 		rc = load_fw(sc, (struct t4_data *)data);
 		break;
 	case CHELSIO_T4_GET_MEM:
 		rc = read_card_mem(sc, 2, (struct t4_mem_range *)data);
 		break;
 	case CHELSIO_T4_GET_I2C:
 		rc = read_i2c(sc, (struct t4_i2c_data *)data);
 		break;
 	case CHELSIO_T4_CLEAR_STATS: {
 		int i, v, bg_map;
 		u_int port_id = *(uint32_t *)data;
 		struct port_info *pi;
 		struct vi_info *vi;
 
 		if (port_id >= sc->params.nports)
 			return (EINVAL);
 		pi = sc->port[port_id];
 		if (pi == NULL)
 			return (EIO);
 
 		/* MAC stats */
 		t4_clr_port_stats(sc, pi->tx_chan);
 		pi->tx_parse_error = 0;
 		pi->tnl_cong_drops = 0;
 		mtx_lock(&sc->reg_lock);
 		for_each_vi(pi, v, vi) {
 			if (vi->flags & VI_INIT_DONE)
 				t4_clr_vi_stats(sc, vi->vin);
 		}
 		bg_map = pi->mps_bg_map;
 		v = 0;	/* reuse */
 		while (bg_map) {
 			i = ffs(bg_map) - 1;
 			t4_write_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v,
 			    1, A_TP_MIB_TNL_CNG_DROP_0 + i);
 			bg_map &= ~(1 << i);
 		}
 		mtx_unlock(&sc->reg_lock);
 
 		/*
 		 * Since this command accepts a port, clear stats for
 		 * all VIs on this port.
 		 */
 		for_each_vi(pi, v, vi) {
 			if (vi->flags & VI_INIT_DONE) {
 				struct sge_rxq *rxq;
 				struct sge_txq *txq;
 				struct sge_wrq *wrq;
 
 				for_each_rxq(vi, i, rxq) {
 #if defined(INET) || defined(INET6)
 					rxq->lro.lro_queued = 0;
 					rxq->lro.lro_flushed = 0;
 #endif
 					rxq->rxcsum = 0;
 					rxq->vlan_extraction = 0;
 				}
 
 				for_each_txq(vi, i, txq) {
 					txq->txcsum = 0;
 					txq->tso_wrs = 0;
 					txq->vlan_insertion = 0;
 					txq->imm_wrs = 0;
 					txq->sgl_wrs = 0;
 					txq->txpkt_wrs = 0;
 					txq->txpkts0_wrs = 0;
 					txq->txpkts1_wrs = 0;
 					txq->txpkts0_pkts = 0;
 					txq->txpkts1_pkts = 0;
 					txq->raw_wrs = 0;
 					mp_ring_reset_stats(txq->r);
 				}
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 				/* nothing to clear for each ofld_rxq */
 
 				for_each_ofld_txq(vi, i, wrq) {
 					wrq->tx_wrs_direct = 0;
 					wrq->tx_wrs_copied = 0;
 				}
 #endif
 
 				if (IS_MAIN_VI(vi)) {
 					wrq = &sc->sge.ctrlq[pi->port_id];
 					wrq->tx_wrs_direct = 0;
 					wrq->tx_wrs_copied = 0;
 				}
 			}
 		}
 		break;
 	}
 	case CHELSIO_T4_SCHED_CLASS:
 		rc = t4_set_sched_class(sc, (struct t4_sched_params *)data);
 		break;
 	case CHELSIO_T4_SCHED_QUEUE:
 		rc = t4_set_sched_queue(sc, (struct t4_sched_queue *)data);
 		break;
 	case CHELSIO_T4_GET_TRACER:
 		rc = t4_get_tracer(sc, (struct t4_tracer *)data);
 		break;
 	case CHELSIO_T4_SET_TRACER:
 		rc = t4_set_tracer(sc, (struct t4_tracer *)data);
 		break;
 	case CHELSIO_T4_LOAD_CFG:
 		rc = load_cfg(sc, (struct t4_data *)data);
 		break;
 	case CHELSIO_T4_LOAD_BOOT:
 		rc = load_boot(sc, (struct t4_bootrom *)data);
 		break;
 	case CHELSIO_T4_LOAD_BOOTCFG:
 		rc = load_bootcfg(sc, (struct t4_data *)data);
 		break;
 	case CHELSIO_T4_CUDBG_DUMP:
 		rc = cudbg_dump(sc, (struct t4_cudbg_dump *)data);
 		break;
 	case CHELSIO_T4_SET_OFLD_POLICY:
 		rc = set_offload_policy(sc, (struct t4_offload_policy *)data);
 		break;
 	default:
 		rc = ENOTTY;
 	}
 
 	return (rc);
 }
 
 #ifdef TCP_OFFLOAD
 static int
 toe_capability(struct vi_info *vi, int enable)
 {
 	int rc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (!is_offload(sc))
 		return (ENODEV);
 
 	if (enable) {
 		if ((vi->ifp->if_capenable & IFCAP_TOE) != 0) {
 			/* TOE is already enabled. */
 			return (0);
 		}
 
 		/*
 		 * We need the port's queues around so that we're able to send
 		 * and receive CPLs to/from the TOE even if the ifnet for this
 		 * port has never been UP'd administratively.
 		 */
 		if (!(vi->flags & VI_INIT_DONE)) {
 			rc = vi_full_init(vi);
 			if (rc)
 				return (rc);
 		}
 		if (!(pi->vi[0].flags & VI_INIT_DONE)) {
 			rc = vi_full_init(&pi->vi[0]);
 			if (rc)
 				return (rc);
 		}
 
 		if (isset(&sc->offload_map, pi->port_id)) {
 			/* TOE is enabled on another VI of this port. */
 			pi->uld_vis++;
 			return (0);
 		}
 
 		if (!uld_active(sc, ULD_TOM)) {
 			rc = t4_activate_uld(sc, ULD_TOM);
 			if (rc == EAGAIN) {
 				log(LOG_WARNING,
 				    "You must kldload t4_tom.ko before trying "
 				    "to enable TOE on a cxgbe interface.\n");
 			}
 			if (rc != 0)
 				return (rc);
 			KASSERT(sc->tom_softc != NULL,
 			    ("%s: TOM activated but softc NULL", __func__));
 			KASSERT(uld_active(sc, ULD_TOM),
 			    ("%s: TOM activated but flag not set", __func__));
 		}
 
 		/* Activate iWARP and iSCSI too, if the modules are loaded. */
 		if (!uld_active(sc, ULD_IWARP))
 			(void) t4_activate_uld(sc, ULD_IWARP);
 		if (!uld_active(sc, ULD_ISCSI))
 			(void) t4_activate_uld(sc, ULD_ISCSI);
 
 		pi->uld_vis++;
 		setbit(&sc->offload_map, pi->port_id);
 	} else {
 		pi->uld_vis--;
 
 		if (!isset(&sc->offload_map, pi->port_id) || pi->uld_vis > 0)
 			return (0);
 
 		KASSERT(uld_active(sc, ULD_TOM),
 		    ("%s: TOM never initialized?", __func__));
 		clrbit(&sc->offload_map, pi->port_id);
 	}
 
 	return (0);
 }
 
 /*
  * Add an upper layer driver to the global list.
  */
 int
 t4_register_uld(struct uld_info *ui)
 {
 	int rc = 0;
 	struct uld_info *u;
 
 	sx_xlock(&t4_uld_list_lock);
 	SLIST_FOREACH(u, &t4_uld_list, link) {
 	    if (u->uld_id == ui->uld_id) {
 		    rc = EEXIST;
 		    goto done;
 	    }
 	}
 
 	SLIST_INSERT_HEAD(&t4_uld_list, ui, link);
 	ui->refcount = 0;
 done:
 	sx_xunlock(&t4_uld_list_lock);
 	return (rc);
 }
 
 int
 t4_unregister_uld(struct uld_info *ui)
 {
 	int rc = EINVAL;
 	struct uld_info *u;
 
 	sx_xlock(&t4_uld_list_lock);
 
 	SLIST_FOREACH(u, &t4_uld_list, link) {
 	    if (u == ui) {
 		    if (ui->refcount > 0) {
 			    rc = EBUSY;
 			    goto done;
 		    }
 
 		    SLIST_REMOVE(&t4_uld_list, ui, uld_info, link);
 		    rc = 0;
 		    goto done;
 	    }
 	}
 done:
 	sx_xunlock(&t4_uld_list_lock);
 	return (rc);
 }
 
 int
 t4_activate_uld(struct adapter *sc, int id)
 {
 	int rc;
 	struct uld_info *ui;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (id < 0 || id > ULD_MAX)
 		return (EINVAL);
 	rc = EAGAIN;	/* kldoad the module with this ULD and try again. */
 
 	sx_slock(&t4_uld_list_lock);
 
 	SLIST_FOREACH(ui, &t4_uld_list, link) {
 		if (ui->uld_id == id) {
 			if (!(sc->flags & FULL_INIT_DONE)) {
 				rc = adapter_full_init(sc);
 				if (rc != 0)
 					break;
 			}
 
 			rc = ui->activate(sc);
 			if (rc == 0) {
 				setbit(&sc->active_ulds, id);
 				ui->refcount++;
 			}
 			break;
 		}
 	}
 
 	sx_sunlock(&t4_uld_list_lock);
 
 	return (rc);
 }
 
 int
 t4_deactivate_uld(struct adapter *sc, int id)
 {
 	int rc;
 	struct uld_info *ui;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (id < 0 || id > ULD_MAX)
 		return (EINVAL);
 	rc = ENXIO;
 
 	sx_slock(&t4_uld_list_lock);
 
 	SLIST_FOREACH(ui, &t4_uld_list, link) {
 		if (ui->uld_id == id) {
 			rc = ui->deactivate(sc);
 			if (rc == 0) {
 				clrbit(&sc->active_ulds, id);
 				ui->refcount--;
 			}
 			break;
 		}
 	}
 
 	sx_sunlock(&t4_uld_list_lock);
 
 	return (rc);
 }
 
 int
 uld_active(struct adapter *sc, int uld_id)
 {
 
 	MPASS(uld_id >= 0 && uld_id <= ULD_MAX);
 
 	return (isset(&sc->active_ulds, uld_id));
 }
 #endif
 
 /*
  * t  = ptr to tunable.
  * nc = number of CPUs.
  * c  = compiled in default for that tunable.
  */
 static void
 calculate_nqueues(int *t, int nc, const int c)
 {
 	int nq;
 
 	if (*t > 0)
 		return;
 	nq = *t < 0 ? -*t : c;
 	*t = min(nc, nq);
 }
 
 /*
  * Come up with reasonable defaults for some of the tunables, provided they're
  * not set by the user (in which case we'll use the values as is).
  */
 static void
 tweak_tunables(void)
 {
 	int nc = mp_ncpus;	/* our snapshot of the number of CPUs */
 
 	if (t4_ntxq < 1) {
 #ifdef RSS
 		t4_ntxq = rss_getnumbuckets();
 #else
 		calculate_nqueues(&t4_ntxq, nc, NTXQ);
 #endif
 	}
 
 	calculate_nqueues(&t4_ntxq_vi, nc, NTXQ_VI);
 
 	if (t4_nrxq < 1) {
 #ifdef RSS
 		t4_nrxq = rss_getnumbuckets();
 #else
 		calculate_nqueues(&t4_nrxq, nc, NRXQ);
 #endif
 	}
 
 	calculate_nqueues(&t4_nrxq_vi, nc, NRXQ_VI);
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	calculate_nqueues(&t4_nofldtxq, nc, NOFLDTXQ);
 	calculate_nqueues(&t4_nofldtxq_vi, nc, NOFLDTXQ_VI);
 #endif
 #ifdef TCP_OFFLOAD
 	calculate_nqueues(&t4_nofldrxq, nc, NOFLDRXQ);
 	calculate_nqueues(&t4_nofldrxq_vi, nc, NOFLDRXQ_VI);
 
 	if (t4_toecaps_allowed == -1)
 		t4_toecaps_allowed = FW_CAPS_CONFIG_TOE;
 
 	if (t4_rdmacaps_allowed == -1) {
 		t4_rdmacaps_allowed = FW_CAPS_CONFIG_RDMA_RDDP |
 		    FW_CAPS_CONFIG_RDMA_RDMAC;
 	}
 
 	if (t4_iscsicaps_allowed == -1) {
 		t4_iscsicaps_allowed = FW_CAPS_CONFIG_ISCSI_INITIATOR_PDU |
 		    FW_CAPS_CONFIG_ISCSI_TARGET_PDU |
 		    FW_CAPS_CONFIG_ISCSI_T10DIF;
 	}
 
 	if (t4_tmr_idx_ofld < 0 || t4_tmr_idx_ofld >= SGE_NTIMERS)
 		t4_tmr_idx_ofld = TMR_IDX_OFLD;
 
 	if (t4_pktc_idx_ofld < -1 || t4_pktc_idx_ofld >= SGE_NCOUNTERS)
 		t4_pktc_idx_ofld = PKTC_IDX_OFLD;
 #else
 	if (t4_toecaps_allowed == -1)
 		t4_toecaps_allowed = 0;
 
 	if (t4_rdmacaps_allowed == -1)
 		t4_rdmacaps_allowed = 0;
 
 	if (t4_iscsicaps_allowed == -1)
 		t4_iscsicaps_allowed = 0;
 #endif
 
 #ifdef DEV_NETMAP
 	calculate_nqueues(&t4_nnmtxq_vi, nc, NNMTXQ_VI);
 	calculate_nqueues(&t4_nnmrxq_vi, nc, NNMRXQ_VI);
 #endif
 
 	if (t4_tmr_idx < 0 || t4_tmr_idx >= SGE_NTIMERS)
 		t4_tmr_idx = TMR_IDX;
 
 	if (t4_pktc_idx < -1 || t4_pktc_idx >= SGE_NCOUNTERS)
 		t4_pktc_idx = PKTC_IDX;
 
 	if (t4_qsize_txq < 128)
 		t4_qsize_txq = 128;
 
 	if (t4_qsize_rxq < 128)
 		t4_qsize_rxq = 128;
 	while (t4_qsize_rxq & 7)
 		t4_qsize_rxq++;
 
 	t4_intr_types &= INTR_MSIX | INTR_MSI | INTR_INTX;
 
 	/*
 	 * Number of VIs to create per-port.  The first VI is the "main" regular
 	 * VI for the port.  The rest are additional virtual interfaces on the
 	 * same physical port.  Note that the main VI does not have native
 	 * netmap support but the extra VIs do.
 	 *
 	 * Limit the number of VIs per port to the number of available
 	 * MAC addresses per port.
 	 */
 	if (t4_num_vis < 1)
 		t4_num_vis = 1;
 	if (t4_num_vis > nitems(vi_mac_funcs)) {
 		t4_num_vis = nitems(vi_mac_funcs);
 		printf("cxgbe: number of VIs limited to %d\n", t4_num_vis);
 	}
 
 	if (pcie_relaxed_ordering < 0 || pcie_relaxed_ordering > 2) {
 		pcie_relaxed_ordering = 1;
 #if defined(__i386__) || defined(__amd64__)
 		if (cpu_vendor_id == CPU_VENDOR_INTEL)
 			pcie_relaxed_ordering = 0;
 #endif
 	}
 }
 
 #ifdef DDB
 static void
 t4_dump_tcb(struct adapter *sc, int tid)
 {
 	uint32_t base, i, j, off, pf, reg, save, tcb_addr, win_pos;
 
 	reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2);
 	save = t4_read_reg(sc, reg);
 	base = sc->memwin[2].mw_base;
 
 	/* Dump TCB for the tid */
 	tcb_addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE);
 	tcb_addr += tid * TCB_SIZE;
 
 	if (is_t4(sc)) {
 		pf = 0;
 		win_pos = tcb_addr & ~0xf;	/* start must be 16B aligned */
 	} else {
 		pf = V_PFNUM(sc->pf);
 		win_pos = tcb_addr & ~0x7f;	/* start must be 128B aligned */
 	}
 	t4_write_reg(sc, reg, win_pos | pf);
 	t4_read_reg(sc, reg);
 
 	off = tcb_addr - win_pos;
 	for (i = 0; i < 4; i++) {
 		uint32_t buf[8];
 		for (j = 0; j < 8; j++, off += 4)
 			buf[j] = htonl(t4_read_reg(sc, base + off));
 
 		db_printf("%08x %08x %08x %08x %08x %08x %08x %08x\n",
 		    buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6],
 		    buf[7]);
 	}
 
 	t4_write_reg(sc, reg, save);
 	t4_read_reg(sc, reg);
 }
 
 static void
 t4_dump_devlog(struct adapter *sc)
 {
 	struct devlog_params *dparams = &sc->params.devlog;
 	struct fw_devlog_e e;
 	int i, first, j, m, nentries, rc;
 	uint64_t ftstamp = UINT64_MAX;
 
 	if (dparams->start == 0) {
 		db_printf("devlog params not valid\n");
 		return;
 	}
 
 	nentries = dparams->size / sizeof(struct fw_devlog_e);
 	m = fwmtype_to_hwmtype(dparams->memtype);
 
 	/* Find the first entry. */
 	first = -1;
 	for (i = 0; i < nentries && !db_pager_quit; i++) {
 		rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e),
 		    sizeof(e), (void *)&e);
 		if (rc != 0)
 			break;
 
 		if (e.timestamp == 0)
 			break;
 
 		e.timestamp = be64toh(e.timestamp);
 		if (e.timestamp < ftstamp) {
 			ftstamp = e.timestamp;
 			first = i;
 		}
 	}
 
 	if (first == -1)
 		return;
 
 	i = first;
 	do {
 		rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e),
 		    sizeof(e), (void *)&e);
 		if (rc != 0)
 			return;
 
 		if (e.timestamp == 0)
 			return;
 
 		e.timestamp = be64toh(e.timestamp);
 		e.seqno = be32toh(e.seqno);
 		for (j = 0; j < 8; j++)
 			e.params[j] = be32toh(e.params[j]);
 
 		db_printf("%10d  %15ju  %8s  %8s  ",
 		    e.seqno, e.timestamp,
 		    (e.level < nitems(devlog_level_strings) ?
 			devlog_level_strings[e.level] : "UNKNOWN"),
 		    (e.facility < nitems(devlog_facility_strings) ?
 			devlog_facility_strings[e.facility] : "UNKNOWN"));
 		db_printf(e.fmt, e.params[0], e.params[1], e.params[2],
 		    e.params[3], e.params[4], e.params[5], e.params[6],
 		    e.params[7]);
 
 		if (++i == nentries)
 			i = 0;
 	} while (i != first && !db_pager_quit);
 }
 
 static struct command_table db_t4_table = LIST_HEAD_INITIALIZER(db_t4_table);
 _DB_SET(_show, t4, NULL, db_show_table, 0, &db_t4_table);
 
 DB_FUNC(devlog, db_show_devlog, db_t4_table, CS_OWN, NULL)
 {
 	device_t dev;
 	int t;
 	bool valid;
 
 	valid = false;
 	t = db_read_token();
 	if (t == tIDENT) {
 		dev = device_lookup_by_name(db_tok_string);
 		valid = true;
 	}
 	db_skip_to_eol();
 	if (!valid) {
 		db_printf("usage: show t4 devlog <nexus>\n");
 		return;
 	}
 
 	if (dev == NULL) {
 		db_printf("device not found\n");
 		return;
 	}
 
 	t4_dump_devlog(device_get_softc(dev));
 }
 
 DB_FUNC(tcb, db_show_t4tcb, db_t4_table, CS_OWN, NULL)
 {
 	device_t dev;
 	int radix, tid, t;
 	bool valid;
 
 	valid = false;
 	radix = db_radix;
 	db_radix = 10;
 	t = db_read_token();
 	if (t == tIDENT) {
 		dev = device_lookup_by_name(db_tok_string);
 		t = db_read_token();
 		if (t == tNUMBER) {
 			tid = db_tok_number;
 			valid = true;
 		}
 	}	
 	db_radix = radix;
 	db_skip_to_eol();
 	if (!valid) {
 		db_printf("usage: show t4 tcb <nexus> <tid>\n");
 		return;
 	}
 
 	if (dev == NULL) {
 		db_printf("device not found\n");
 		return;
 	}
 	if (tid < 0) {
 		db_printf("invalid tid\n");
 		return;
 	}
 
 	t4_dump_tcb(device_get_softc(dev), tid);
 }
 #endif
 
 /*
  * Borrowed from cesa_prep_aes_key().
  *
  * NB: The crypto engine wants the words in the decryption key in reverse
  * order.
  */
 void
 t4_aes_getdeckey(void *dec_key, const void *enc_key, unsigned int kbits)
 {
 	uint32_t ek[4 * (RIJNDAEL_MAXNR + 1)];
 	uint32_t *dkey;
 	int i;
 
 	rijndaelKeySetupEnc(ek, enc_key, kbits);
 	dkey = dec_key;
 	dkey += (kbits / 8) / 4;
 
 	switch (kbits) {
 	case 128:
 		for (i = 0; i < 4; i++)
 			*--dkey = htobe32(ek[4 * 10 + i]);
 		break;
 	case 192:
 		for (i = 0; i < 2; i++)
 			*--dkey = htobe32(ek[4 * 11 + 2 + i]);
 		for (i = 0; i < 4; i++)
 			*--dkey = htobe32(ek[4 * 12 + i]);
 		break;
 	case 256:
 		for (i = 0; i < 4; i++)
 			*--dkey = htobe32(ek[4 * 13 + i]);
 		for (i = 0; i < 4; i++)
 			*--dkey = htobe32(ek[4 * 14 + i]);
 		break;
 	}
 	MPASS(dkey == dec_key);
 }
 
 static struct sx mlu;	/* mod load unload */
 SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload");
 
 static int
 mod_event(module_t mod, int cmd, void *arg)
 {
 	int rc = 0;
 	static int loaded = 0;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		sx_xlock(&mlu);
 		if (loaded++ == 0) {
 			t4_sge_modload();
 			t4_register_shared_cpl_handler(CPL_SET_TCB_RPL,
 			    t4_filter_rpl, CPL_COOKIE_FILTER);
 			t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL,
 			    do_l2t_write_rpl, CPL_COOKIE_FILTER);
 			t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL,
 			    t4_hashfilter_ao_rpl, CPL_COOKIE_HASHFILTER);
 			t4_register_shared_cpl_handler(CPL_SET_TCB_RPL,
 			    t4_hashfilter_tcb_rpl, CPL_COOKIE_HASHFILTER);
 			t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS,
 			    t4_del_hashfilter_rpl, CPL_COOKIE_HASHFILTER);
 			t4_register_cpl_handler(CPL_TRACE_PKT, t4_trace_pkt);
 			t4_register_cpl_handler(CPL_T5_TRACE_PKT, t5_trace_pkt);
 			t4_register_cpl_handler(CPL_SMT_WRITE_RPL,
 			    do_smt_write_rpl);
 			sx_init(&t4_list_lock, "T4/T5 adapters");
 			SLIST_INIT(&t4_list);
 			callout_init(&fatal_callout, 1);
 #ifdef TCP_OFFLOAD
 			sx_init(&t4_uld_list_lock, "T4/T5 ULDs");
 			SLIST_INIT(&t4_uld_list);
 #endif
 #ifdef INET6
 			t4_clip_modload();
 #endif
 			t4_tracer_modload();
 			tweak_tunables();
 		}
 		sx_xunlock(&mlu);
 		break;
 
 	case MOD_UNLOAD:
 		sx_xlock(&mlu);
 		if (--loaded == 0) {
 			int tries;
 
 			sx_slock(&t4_list_lock);
 			if (!SLIST_EMPTY(&t4_list)) {
 				rc = EBUSY;
 				sx_sunlock(&t4_list_lock);
 				goto done_unload;
 			}
 #ifdef TCP_OFFLOAD
 			sx_slock(&t4_uld_list_lock);
 			if (!SLIST_EMPTY(&t4_uld_list)) {
 				rc = EBUSY;
 				sx_sunlock(&t4_uld_list_lock);
 				sx_sunlock(&t4_list_lock);
 				goto done_unload;
 			}
 #endif
 			tries = 0;
 			while (tries++ < 5 && t4_sge_extfree_refs() != 0) {
 				uprintf("%ju clusters with custom free routine "
 				    "still is use.\n", t4_sge_extfree_refs());
 				pause("t4unload", 2 * hz);
 			}
 #ifdef TCP_OFFLOAD
 			sx_sunlock(&t4_uld_list_lock);
 #endif
 			sx_sunlock(&t4_list_lock);
 
 			if (t4_sge_extfree_refs() == 0) {
 				t4_tracer_modunload();
 #ifdef INET6
 				t4_clip_modunload();
 #endif
 #ifdef TCP_OFFLOAD
 				sx_destroy(&t4_uld_list_lock);
 #endif
 				sx_destroy(&t4_list_lock);
 				t4_sge_modunload();
 				loaded = 0;
 			} else {
 				rc = EBUSY;
 				loaded++;	/* undo earlier decrement */
 			}
 		}
 done_unload:
 		sx_xunlock(&mlu);
 		break;
 	}
 
 	return (rc);
 }
 
 static devclass_t t4_devclass, t5_devclass, t6_devclass;
 static devclass_t cxgbe_devclass, cxl_devclass, cc_devclass;
 static devclass_t vcxgbe_devclass, vcxl_devclass, vcc_devclass;
 
 DRIVER_MODULE(t4nex, pci, t4_driver, t4_devclass, mod_event, 0);
 MODULE_VERSION(t4nex, 1);
 MODULE_DEPEND(t4nex, firmware, 1, 1, 1);
 #ifdef DEV_NETMAP
 MODULE_DEPEND(t4nex, netmap, 1, 1, 1);
 #endif /* DEV_NETMAP */
 
 DRIVER_MODULE(t5nex, pci, t5_driver, t5_devclass, mod_event, 0);
 MODULE_VERSION(t5nex, 1);
 MODULE_DEPEND(t5nex, firmware, 1, 1, 1);
 #ifdef DEV_NETMAP
 MODULE_DEPEND(t5nex, netmap, 1, 1, 1);
 #endif /* DEV_NETMAP */
 
 DRIVER_MODULE(t6nex, pci, t6_driver, t6_devclass, mod_event, 0);
 MODULE_VERSION(t6nex, 1);
 MODULE_DEPEND(t6nex, firmware, 1, 1, 1);
 #ifdef DEV_NETMAP
 MODULE_DEPEND(t6nex, netmap, 1, 1, 1);
 #endif /* DEV_NETMAP */
 
 DRIVER_MODULE(cxgbe, t4nex, cxgbe_driver, cxgbe_devclass, 0, 0);
 MODULE_VERSION(cxgbe, 1);
 
 DRIVER_MODULE(cxl, t5nex, cxl_driver, cxl_devclass, 0, 0);
 MODULE_VERSION(cxl, 1);
 
 DRIVER_MODULE(cc, t6nex, cc_driver, cc_devclass, 0, 0);
 MODULE_VERSION(cc, 1);
 
 DRIVER_MODULE(vcxgbe, cxgbe, vcxgbe_driver, vcxgbe_devclass, 0, 0);
 MODULE_VERSION(vcxgbe, 1);
 
 DRIVER_MODULE(vcxl, cxl, vcxl_driver, vcxl_devclass, 0, 0);
 MODULE_VERSION(vcxl, 1);
 
 DRIVER_MODULE(vcc, cc, vcc_driver, vcc_devclass, 0, 0);
 MODULE_VERSION(vcc, 1);
Index: projects/capsicum-test/sys/dev/cxgbe/t4_sge.c
===================================================================
--- projects/capsicum-test/sys/dev/cxgbe/t4_sge.c	(revision 345709)
+++ projects/capsicum-test/sys/dev/cxgbe/t4_sge.c	(revision 345710)
@@ -1,6044 +1,6051 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 
 #include <sys/types.h>
 #include <sys/eventhandler.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/taskqueue.h>
 #include <sys/time.h>
 #include <sys/sglist.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 #include <sys/counter.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_vlan_var.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <machine/in_cksum.h>
 #include <machine/md_var.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #ifdef DEV_NETMAP
 #include <machine/bus.h>
 #include <sys/selinfo.h>
 #include <net/if_var.h>
 #include <net/netmap.h>
 #include <dev/netmap/netmap_kern.h>
 #endif
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_msg.h"
 #include "t4_l2t.h"
 #include "t4_mp_ring.h"
 
 #ifdef T4_PKT_TIMESTAMP
 #define RX_COPY_THRESHOLD (MINCLSIZE - 8)
 #else
 #define RX_COPY_THRESHOLD MINCLSIZE
 #endif
 
 /* Internal mbuf flags stored in PH_loc.eight[1]. */
 #define	MC_RAW_WR		0x02
 
 /*
  * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
  * 0-7 are valid values.
  */
 static int fl_pktshift = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pktshift, CTLFLAG_RDTUN, &fl_pktshift, 0,
     "payload DMA offset in rx buffer (bytes)");
 
 /*
  * Pad ethernet payload up to this boundary.
  * -1: driver should figure out a good value.
  *  0: disable padding.
  *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
  */
 int fl_pad = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pad, CTLFLAG_RDTUN, &fl_pad, 0,
     "payload pad boundary (bytes)");
 
 /*
  * Status page length.
  * -1: driver should figure out a good value.
  *  64 or 128 are the only other valid values.
  */
 static int spg_len = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, spg_len, CTLFLAG_RDTUN, &spg_len, 0,
     "status page size (bytes)");
 
 /*
  * Congestion drops.
  * -1: no congestion feedback (not recommended).
  *  0: backpressure the channel instead of dropping packets right away.
  *  1: no backpressure, drop packets for the congested queue immediately.
  */
 static int cong_drop = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, cong_drop, CTLFLAG_RDTUN, &cong_drop, 0,
     "Congestion control for RX queues (0 = backpressure, 1 = drop");
 
 /*
  * Deliver multiple frames in the same free list buffer if they fit.
  * -1: let the driver decide whether to enable buffer packing or not.
  *  0: disable buffer packing.
  *  1: enable buffer packing.
  */
 static int buffer_packing = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, buffer_packing, CTLFLAG_RDTUN, &buffer_packing,
     0, "Enable buffer packing");
 
 /*
  * Start next frame in a packed buffer at this boundary.
  * -1: driver should figure out a good value.
  * T4: driver will ignore this and use the same value as fl_pad above.
  * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
  */
 static int fl_pack = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pack, CTLFLAG_RDTUN, &fl_pack, 0,
     "payload pack boundary (bytes)");
 
 /*
  * Allow the driver to create mbuf(s) in a cluster allocated for rx.
  * 0: never; always allocate mbufs from the zone_mbuf UMA zone.
  * 1: ok to create mbuf(s) within a cluster if there is room.
  */
 static int allow_mbufs_in_cluster = 1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, allow_mbufs_in_cluster, CTLFLAG_RDTUN,
     &allow_mbufs_in_cluster, 0,
     "Allow driver to create mbufs within a rx cluster");
 
 /*
  * Largest rx cluster size that the driver is allowed to allocate.
  */
 static int largest_rx_cluster = MJUM16BYTES;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, largest_rx_cluster, CTLFLAG_RDTUN,
     &largest_rx_cluster, 0, "Largest rx cluster (bytes)");
 
 /*
  * Size of cluster allocation that's most likely to succeed.  The driver will
  * fall back to this size if it fails to allocate clusters larger than this.
  */
 static int safest_rx_cluster = PAGE_SIZE;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, safest_rx_cluster, CTLFLAG_RDTUN,
     &safest_rx_cluster, 0, "Safe rx cluster (bytes)");
 
 #ifdef RATELIMIT
 /*
  * Knob to control TCP timestamp rewriting, and the granularity of the tick used
  * for rewriting.  -1 and 0-3 are all valid values.
  * -1: hardware should leave the TCP timestamps alone.
  * 0: 1ms
  * 1: 100us
  * 2: 10us
  * 3: 1us
  */
 static int tsclk = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tsclk, CTLFLAG_RDTUN, &tsclk, 0,
     "Control TCP timestamp rewriting when using pacing");
 
 static int eo_max_backlog = 1024 * 1024;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, eo_max_backlog, CTLFLAG_RDTUN, &eo_max_backlog,
     0, "Maximum backlog of ratelimited data per flow");
 #endif
 
 /*
  * The interrupt holdoff timers are multiplied by this value on T6+.
  * 1 and 3-17 (both inclusive) are legal values.
  */
 static int tscale = 1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tscale, CTLFLAG_RDTUN, &tscale, 0,
     "Interrupt holdoff timer scale on T6+");
 
 /*
  * Number of LRO entries in the lro_ctrl structure per rx queue.
  */
 static int lro_entries = TCP_LRO_ENTRIES;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_entries, CTLFLAG_RDTUN, &lro_entries, 0,
     "Number of LRO entries per RX queue");
 
 /*
  * This enables presorting of frames before they're fed into tcp_lro_rx.
  */
 static int lro_mbufs = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0,
     "Enable presorting of LRO frames");
 
 struct txpkts {
 	u_int wr_type;		/* type 0 or type 1 */
 	u_int npkt;		/* # of packets in this work request */
 	u_int plen;		/* total payload (sum of all packets) */
 	u_int len16;		/* # of 16B pieces used by this work request */
 };
 
 /* A packet's SGL.  This + m_pkthdr has all info needed for tx */
 struct sgl {
 	struct sglist sg;
 	struct sglist_seg seg[TX_SGL_SEGS];
 };
 
 static int service_iq(struct sge_iq *, int);
 static int service_iq_fl(struct sge_iq *, int);
 static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
 static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int);
 static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *);
 static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t,
     uint16_t, char *);
 static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
     bus_addr_t *, void **);
 static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
     void *);
 static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *,
     int, int);
 static int free_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *);
 static void add_iq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
     struct sge_iq *);
 static void add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *,
     struct sysctl_oid *, struct sge_fl *);
 static int alloc_fwq(struct adapter *);
 static int free_fwq(struct adapter *);
 static int alloc_ctrlq(struct adapter *, struct sge_wrq *, int,
     struct sysctl_oid *);
 static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int,
     struct sysctl_oid *);
 static int free_rxq(struct vi_info *, struct sge_rxq *);
 #ifdef TCP_OFFLOAD
 static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int,
     struct sysctl_oid *);
 static int free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *);
 #endif
 #ifdef DEV_NETMAP
 static int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int,
     struct sysctl_oid *);
 static int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *);
 static int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int,
     struct sysctl_oid *);
 static int free_nm_txq(struct vi_info *, struct sge_nm_txq *);
 #endif
 static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
 static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
 #endif
 static int alloc_eq(struct adapter *, struct vi_info *, struct sge_eq *);
 static int free_eq(struct adapter *, struct sge_eq *);
 static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *,
     struct sysctl_oid *);
 static int free_wrq(struct adapter *, struct sge_wrq *);
 static int alloc_txq(struct vi_info *, struct sge_txq *, int,
     struct sysctl_oid *);
 static int free_txq(struct vi_info *, struct sge_txq *);
 static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
 static inline void ring_fl_db(struct adapter *, struct sge_fl *);
 static int refill_fl(struct adapter *, struct sge_fl *, int);
 static void refill_sfl(void *);
 static int alloc_fl_sdesc(struct sge_fl *);
 static void free_fl_sdesc(struct adapter *, struct sge_fl *);
 static void find_best_refill_source(struct adapter *, struct sge_fl *, int);
 static void find_safe_refill_source(struct adapter *, struct sge_fl *);
 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
 
 static inline void get_pkt_gl(struct mbuf *, struct sglist *);
 static inline u_int txpkt_len16(u_int, u_int);
 static inline u_int txpkt_vm_len16(u_int, u_int);
 static inline u_int txpkts0_len16(u_int);
 static inline u_int txpkts1_len16(void);
 static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int);
 static u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *,
     struct mbuf *, u_int);
 static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *,
     struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int);
 static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int);
 static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int);
 static u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *,
     struct mbuf *, const struct txpkts *, u_int);
 static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
 static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
 static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
 static inline uint16_t read_hw_cidx(struct sge_eq *);
 static inline u_int reclaimable_tx_desc(struct sge_eq *);
 static inline u_int total_available_tx_desc(struct sge_eq *);
 static u_int reclaim_tx_descs(struct sge_txq *, u_int);
 static void tx_reclaim(void *, int);
 static __be64 get_flit(struct sglist_seg *, int, int);
 static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *);
 static void wrq_tx_drain(void *, int);
 static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *);
 
 static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
 static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
 #ifdef RATELIMIT
 static inline u_int txpkt_eo_len16(u_int, u_int, u_int);
 static int ethofld_fw4_ack(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 #endif
 
 static counter_u64_t extfree_refs;
 static counter_u64_t extfree_rels;
 
 an_handler_t t4_an_handler;
 fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES];
 cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS];
 cpl_handler_t set_tcb_rpl_handlers[NUM_CPL_COOKIES];
 cpl_handler_t l2t_write_rpl_handlers[NUM_CPL_COOKIES];
 cpl_handler_t act_open_rpl_handlers[NUM_CPL_COOKIES];
 cpl_handler_t abort_rpl_rss_handlers[NUM_CPL_COOKIES];
 cpl_handler_t fw4_ack_handlers[NUM_CPL_COOKIES];
 
 void
 t4_register_an_handler(an_handler_t h)
 {
 	uintptr_t *loc;
 
 	MPASS(h == NULL || t4_an_handler == NULL);
 
 	loc = (uintptr_t *)&t4_an_handler;
 	atomic_store_rel_ptr(loc, (uintptr_t)h);
 }
 
 void
 t4_register_fw_msg_handler(int type, fw_msg_handler_t h)
 {
 	uintptr_t *loc;
 
 	MPASS(type < nitems(t4_fw_msg_handler));
 	MPASS(h == NULL || t4_fw_msg_handler[type] == NULL);
 	/*
 	 * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL
 	 * handler dispatch table.  Reject any attempt to install a handler for
 	 * this subtype.
 	 */
 	MPASS(type != FW_TYPE_RSSCPL);
 	MPASS(type != FW6_TYPE_RSSCPL);
 
 	loc = (uintptr_t *)&t4_fw_msg_handler[type];
 	atomic_store_rel_ptr(loc, (uintptr_t)h);
 }
 
 void
 t4_register_cpl_handler(int opcode, cpl_handler_t h)
 {
 	uintptr_t *loc;
 
 	MPASS(opcode < nitems(t4_cpl_handler));
 	MPASS(h == NULL || t4_cpl_handler[opcode] == NULL);
 
 	loc = (uintptr_t *)&t4_cpl_handler[opcode];
 	atomic_store_rel_ptr(loc, (uintptr_t)h);
 }
 
 static int
 set_tcb_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
 	u_int tid;
 	int cookie;
 
 	MPASS(m == NULL);
 
 	tid = GET_TID(cpl);
 	if (is_hpftid(iq->adapter, tid) || is_ftid(iq->adapter, tid)) {
 		/*
 		 * The return code for filter-write is put in the CPL cookie so
 		 * we have to rely on the hardware tid (is_ftid) to determine
 		 * that this is a response to a filter.
 		 */
 		cookie = CPL_COOKIE_FILTER;
 	} else {
 		cookie = G_COOKIE(cpl->cookie);
 	}
 	MPASS(cookie > CPL_COOKIE_RESERVED);
 	MPASS(cookie < nitems(set_tcb_rpl_handlers));
 
 	return (set_tcb_rpl_handlers[cookie](iq, rss, m));
 }
 
 static int
 l2t_write_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1);
 	unsigned int cookie;
 
 	MPASS(m == NULL);
 
 	cookie = GET_TID(rpl) & F_SYNC_WR ? CPL_COOKIE_TOM : CPL_COOKIE_FILTER;
 	return (l2t_write_rpl_handlers[cookie](iq, rss, m));
 }
 
 static int
 act_open_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1);
 	u_int cookie = G_TID_COOKIE(G_AOPEN_ATID(be32toh(cpl->atid_status)));
 
 	MPASS(m == NULL);
 	MPASS(cookie != CPL_COOKIE_RESERVED);
 
 	return (act_open_rpl_handlers[cookie](iq, rss, m));
 }
 
 static int
 abort_rpl_rss_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	u_int cookie;
 
 	MPASS(m == NULL);
 	if (is_hashfilter(sc))
 		cookie = CPL_COOKIE_HASHFILTER;
 	else
 		cookie = CPL_COOKIE_TOM;
 
 	return (abort_rpl_rss_handlers[cookie](iq, rss, m));
 }
 
 static int
 fw4_ack_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 	u_int cookie;
 
 	MPASS(m == NULL);
 	if (is_etid(sc, tid))
 		cookie = CPL_COOKIE_ETHOFLD;
 	else
 		cookie = CPL_COOKIE_TOM;
 
 	return (fw4_ack_handlers[cookie](iq, rss, m));
 }
 
 static void
 t4_init_shared_cpl_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_SET_TCB_RPL, set_tcb_rpl_handler);
 	t4_register_cpl_handler(CPL_L2T_WRITE_RPL, l2t_write_rpl_handler);
 	t4_register_cpl_handler(CPL_ACT_OPEN_RPL, act_open_rpl_handler);
 	t4_register_cpl_handler(CPL_ABORT_RPL_RSS, abort_rpl_rss_handler);
 	t4_register_cpl_handler(CPL_FW4_ACK, fw4_ack_handler);
 }
 
 void
 t4_register_shared_cpl_handler(int opcode, cpl_handler_t h, int cookie)
 {
 	uintptr_t *loc;
 
 	MPASS(opcode < nitems(t4_cpl_handler));
 	MPASS(cookie > CPL_COOKIE_RESERVED);
 	MPASS(cookie < NUM_CPL_COOKIES);
 	MPASS(t4_cpl_handler[opcode] != NULL);
 
 	switch (opcode) {
 	case CPL_SET_TCB_RPL:
 		loc = (uintptr_t *)&set_tcb_rpl_handlers[cookie];
 		break;
 	case CPL_L2T_WRITE_RPL:
 		loc = (uintptr_t *)&l2t_write_rpl_handlers[cookie];
 		break;
 	case CPL_ACT_OPEN_RPL:
 		loc = (uintptr_t *)&act_open_rpl_handlers[cookie];
 		break;
 	case CPL_ABORT_RPL_RSS:
 		loc = (uintptr_t *)&abort_rpl_rss_handlers[cookie];
 		break;
 	case CPL_FW4_ACK:
 		loc = (uintptr_t *)&fw4_ack_handlers[cookie];
 		break;
 	default:
 		MPASS(0);
 		return;
 	}
 	MPASS(h == NULL || *loc == (uintptr_t)NULL);
 	atomic_store_rel_ptr(loc, (uintptr_t)h);
 }
 
 /*
  * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
  */
 void
 t4_sge_modload(void)
 {
 
 	if (fl_pktshift < 0 || fl_pktshift > 7) {
 		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
 		    " using 0 instead.\n", fl_pktshift);
 		fl_pktshift = 0;
 	}
 
 	if (spg_len != 64 && spg_len != 128) {
 		int len;
 
 #if defined(__i386__) || defined(__amd64__)
 		len = cpu_clflush_line_size > 64 ? 128 : 64;
 #else
 		len = 64;
 #endif
 		if (spg_len != -1) {
 			printf("Invalid hw.cxgbe.spg_len value (%d),"
 			    " using %d instead.\n", spg_len, len);
 		}
 		spg_len = len;
 	}
 
 	if (cong_drop < -1 || cong_drop > 1) {
 		printf("Invalid hw.cxgbe.cong_drop value (%d),"
 		    " using 0 instead.\n", cong_drop);
 		cong_drop = 0;
 	}
 
 	if (tscale != 1 && (tscale < 3 || tscale > 17)) {
 		printf("Invalid hw.cxgbe.tscale value (%d),"
 		    " using 1 instead.\n", tscale);
 		tscale = 1;
 	}
 
 	extfree_refs = counter_u64_alloc(M_WAITOK);
 	extfree_rels = counter_u64_alloc(M_WAITOK);
 	counter_u64_zero(extfree_refs);
 	counter_u64_zero(extfree_rels);
 
 	t4_init_shared_cpl_handlers();
 	t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg);
 	t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg);
 	t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
 	t4_register_cpl_handler(CPL_RX_PKT, t4_eth_rx);
 #ifdef RATELIMIT
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, ethofld_fw4_ack,
 	    CPL_COOKIE_ETHOFLD);
 #endif
 	t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
 	t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl);
 }
 
 void
 t4_sge_modunload(void)
 {
 
 	counter_u64_free(extfree_refs);
 	counter_u64_free(extfree_rels);
 }
 
 uint64_t
 t4_sge_extfree_refs(void)
 {
 	uint64_t refs, rels;
 
 	rels = counter_u64_fetch(extfree_rels);
 	refs = counter_u64_fetch(extfree_refs);
 
 	return (refs - rels);
 }
 
 static inline void
 setup_pad_and_pack_boundaries(struct adapter *sc)
 {
 	uint32_t v, m;
 	int pad, pack, pad_shift;
 
 	pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT :
 	    X_INGPADBOUNDARY_SHIFT;
 	pad = fl_pad;
 	if (fl_pad < (1 << pad_shift) ||
 	    fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) ||
 	    !powerof2(fl_pad)) {
 		/*
 		 * If there is any chance that we might use buffer packing and
 		 * the chip is a T4, then pick 64 as the pad/pack boundary.  Set
 		 * it to the minimum allowed in all other cases.
 		 */
 		pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift;
 
 		/*
 		 * For fl_pad = 0 we'll still write a reasonable value to the
 		 * register but all the freelists will opt out of padding.
 		 * We'll complain here only if the user tried to set it to a
 		 * value greater than 0 that was invalid.
 		 */
 		if (fl_pad > 0) {
 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
 			    " (%d), using %d instead.\n", fl_pad, pad);
 		}
 	}
 	m = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
 	v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift);
 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
 
 	if (is_t4(sc)) {
 		if (fl_pack != -1 && fl_pack != pad) {
 			/* Complain but carry on. */
 			device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored,"
 			    " using %d instead.\n", fl_pack, pad);
 		}
 		return;
 	}
 
 	pack = fl_pack;
 	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
 	    !powerof2(fl_pack)) {
 		pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
 		MPASS(powerof2(pack));
 		if (pack < 16)
 			pack = 16;
 		if (pack == 32)
 			pack = 64;
 		if (pack > 4096)
 			pack = 4096;
 		if (fl_pack != -1) {
 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
 			    " (%d), using %d instead.\n", fl_pack, pack);
 		}
 	}
 	m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
 	if (pack == 16)
 		v = V_INGPACKBOUNDARY(0);
 	else
 		v = V_INGPACKBOUNDARY(ilog2(pack) - 5);
 
 	MPASS(!is_t4(sc));	/* T4 doesn't have SGE_CONTROL2 */
 	t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
 }
 
 /*
  * adap->params.vpd.cclk must be set up before this is called.
  */
 void
 t4_tweak_chip_settings(struct adapter *sc)
 {
 	int i;
 	uint32_t v, m;
 	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
 	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
 	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
 	static int sge_flbuf_sizes[] = {
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
 		MJUMPAGESIZE - CL_METADATA_SIZE,
 		MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE,
 #endif
 		MJUM9BYTES,
 		MJUM16BYTES,
 		MCLBYTES - MSIZE - CL_METADATA_SIZE,
 		MJUM9BYTES - CL_METADATA_SIZE,
 		MJUM16BYTES - CL_METADATA_SIZE,
 	};
 
 	KASSERT(sc->flags & MASTER_PF,
 	    ("%s: trying to change chip settings when not master.", __func__));
 
 	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
 	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
 	    V_EGRSTATUSPAGESIZE(spg_len == 128);
 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
 
 	setup_pad_and_pack_boundaries(sc);
 
 	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
 	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
 
 	KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES,
 	    ("%s: hw buffer size table too big", __func__));
 	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0, 4096);
 	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE1, 65536);
 	for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) {
 		t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE15 - (4 * i),
 		    sge_flbuf_sizes[i]);
 	}
 
 	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
 	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
 	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);
 
 	KASSERT(intr_timer[0] <= timer_max,
 	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
 	    timer_max));
 	for (i = 1; i < nitems(intr_timer); i++) {
 		KASSERT(intr_timer[i] >= intr_timer[i - 1],
 		    ("%s: timers not listed in increasing order (%d)",
 		    __func__, i));
 
 		while (intr_timer[i] > timer_max) {
 			if (i == nitems(intr_timer) - 1) {
 				intr_timer[i] = timer_max;
 				break;
 			}
 			intr_timer[i] += intr_timer[i - 1];
 			intr_timer[i] /= 2;
 		}
 	}
 
 	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
 	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
 	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
 	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
 	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
 	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);
 
 	if (chip_id(sc) >= CHELSIO_T6) {
 		m = V_TSCALE(M_TSCALE);
 		if (tscale == 1)
 			v = 0;
 		else
 			v = V_TSCALE(tscale - 2);
 		t4_set_reg_field(sc, A_SGE_ITP_CONTROL, m, v);
 
 		if (sc->debug_flags & DF_DISABLE_TCB_CACHE) {
 			m = V_RDTHRESHOLD(M_RDTHRESHOLD) | F_WRTHRTHRESHEN |
 			    V_WRTHRTHRESH(M_WRTHRTHRESH);
 			t4_tp_pio_read(sc, &v, 1, A_TP_CMM_CONFIG, 1);
 			v &= ~m;
 			v |= V_RDTHRESHOLD(1) | F_WRTHRTHRESHEN |
 			    V_WRTHRTHRESH(16);
 			t4_tp_pio_write(sc, &v, 1, A_TP_CMM_CONFIG, 1);
 		}
 	}
 
 	/* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */
 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
 	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);
 
 	/*
 	 * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP.  These have been
 	 * chosen with MAXPHYS = 128K in mind.  The largest DDP buffer that we
 	 * may have to deal with is MAXPHYS + 1 page.
 	 */
 	v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4);
 	t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v);
 
 	/* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */
 	m = v = F_TDDPTAGTCB | F_ISCSITAGTCB;
 	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);
 
 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
 	    F_RESETDDPOFFSET;
 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
 	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
 }
 
 /*
  * SGE wants the buffer to be at least 64B and then a multiple of 16.  If
  * padding is in use, the buffer's start and end need to be aligned to the pad
  * boundary as well.  We'll just make sure that the size is a multiple of the
  * boundary here, it is up to the buffer allocation code to make sure the start
  * of the buffer is aligned as well.
  */
 static inline int
 hwsz_ok(struct adapter *sc, int hwsz)
 {
 	int mask = fl_pad ? sc->params.sge.pad_boundary - 1 : 16 - 1;
 
 	return (hwsz >= 64 && (hwsz & mask) == 0);
 }
 
 /*
  * XXX: driver really should be able to deal with unexpected settings.
  */
 int
 t4_read_chip_settings(struct adapter *sc)
 {
 	struct sge *s = &sc->sge;
 	struct sge_params *sp = &sc->params.sge;
 	int i, j, n, rc = 0;
 	uint32_t m, v, r;
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
 	static int sw_buf_sizes[] = {	/* Sorted by size */
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
 #endif
 		MJUM9BYTES,
 		MJUM16BYTES
 	};
 	struct sw_zone_info *swz, *safe_swz;
 	struct hw_buf_info *hwb;
 
 	m = F_RXPKTCPLMODE;
 	v = F_RXPKTCPLMODE;
 	r = sc->params.sge.sge_control;
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	/*
 	 * If this changes then every single use of PAGE_SHIFT in the driver
 	 * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift.
 	 */
 	if (sp->page_shift != PAGE_SHIFT) {
 		device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	/* Filter out unusable hw buffer sizes entirely (mark with -2). */
 	hwb = &s->hw_buf_info[0];
 	for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) {
 		r = sc->params.sge.sge_fl_buffer_size[i];
 		hwb->size = r;
 		hwb->zidx = hwsz_ok(sc, r) ? -1 : -2;
 		hwb->next = -1;
 	}
 
 	/*
 	 * Create a sorted list in decreasing order of hw buffer sizes (and so
 	 * increasing order of spare area) for each software zone.
 	 *
 	 * If padding is enabled then the start and end of the buffer must align
 	 * to the pad boundary; if packing is enabled then they must align with
 	 * the pack boundary as well.  Allocations from the cluster zones are
 	 * aligned to min(size, 4K), so the buffer starts at that alignment and
 	 * ends at hwb->size alignment.  If mbuf inlining is allowed the
 	 * starting alignment will be reduced to MSIZE and the driver will
 	 * exercise appropriate caution when deciding on the best buffer layout
 	 * to use.
 	 */
 	n = 0;	/* no usable buffer size to begin with */
 	swz = &s->sw_zone_info[0];
 	safe_swz = NULL;
 	for (i = 0; i < SW_ZONE_SIZES; i++, swz++) {
 		int8_t head = -1, tail = -1;
 
 		swz->size = sw_buf_sizes[i];
 		swz->zone = m_getzone(swz->size);
 		swz->type = m_gettype(swz->size);
 
 		if (swz->size < PAGE_SIZE) {
 			MPASS(powerof2(swz->size));
 			if (fl_pad && (swz->size % sp->pad_boundary != 0))
 				continue;
 		}
 
 		if (swz->size == safest_rx_cluster)
 			safe_swz = swz;
 
 		hwb = &s->hw_buf_info[0];
 		for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) {
 			if (hwb->zidx != -1 || hwb->size > swz->size)
 				continue;
 #ifdef INVARIANTS
 			if (fl_pad)
 				MPASS(hwb->size % sp->pad_boundary == 0);
 #endif
 			hwb->zidx = i;
 			if (head == -1)
 				head = tail = j;
 			else if (hwb->size < s->hw_buf_info[tail].size) {
 				s->hw_buf_info[tail].next = j;
 				tail = j;
 			} else {
 				int8_t *cur;
 				struct hw_buf_info *t;
 
 				for (cur = &head; *cur != -1; cur = &t->next) {
 					t = &s->hw_buf_info[*cur];
 					if (hwb->size == t->size) {
 						hwb->zidx = -2;
 						break;
 					}
 					if (hwb->size > t->size) {
 						hwb->next = *cur;
 						*cur = j;
 						break;
 					}
 				}
 			}
 		}
 		swz->head_hwidx = head;
 		swz->tail_hwidx = tail;
 
 		if (tail != -1) {
 			n++;
 			if (swz->size - s->hw_buf_info[tail].size >=
 			    CL_METADATA_SIZE)
 				sc->flags |= BUF_PACKING_OK;
 		}
 	}
 	if (n == 0) {
 		device_printf(sc->dev, "no usable SGE FL buffer size.\n");
 		rc = EINVAL;
 	}
 
 	s->safe_hwidx1 = -1;
 	s->safe_hwidx2 = -1;
 	if (safe_swz != NULL) {
 		s->safe_hwidx1 = safe_swz->head_hwidx;
 		for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) {
 			int spare;
 
 			hwb = &s->hw_buf_info[i];
 #ifdef INVARIANTS
 			if (fl_pad)
 				MPASS(hwb->size % sp->pad_boundary == 0);
 #endif
 			spare = safe_swz->size - hwb->size;
 			if (spare >= CL_METADATA_SIZE) {
 				s->safe_hwidx2 = i;
 				break;
 			}
 		}
 	}
 
 	if (sc->flags & IS_VF)
 		return (0);
 
 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
 	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
 	if (r != v) {
 		device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	m = v = F_TDDPTAGTCB;
 	r = t4_read_reg(sc, A_ULP_RX_CTL);
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
 	    F_RESETDDPOFFSET;
 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
 	r = t4_read_reg(sc, A_TP_PARA_REG5);
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	t4_init_tp_params(sc, 1);
 
 	t4_read_mtu_tbl(sc, sc->params.mtus, NULL);
 	t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd);
 
 	return (rc);
 }
 
 int
 t4_create_dma_tag(struct adapter *sc)
 {
 	int rc;
 
 	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
 	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
 	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
 	    NULL, &sc->dmat);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create main DMA tag: %d\n", rc);
 	}
 
 	return (rc);
 }
 
 void
 t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
     struct sysctl_oid_list *children)
 {
 	struct sge_params *sp = &sc->params.sge;
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
 	    CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A",
 	    "freelist buffer sizes");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
 	    NULL, sp->fl_pktshift, "payload DMA offset in rx buffer (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD,
 	    NULL, sp->pad_boundary, "payload pad boundary (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD,
 	    NULL, sp->spg_len, "status page size (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
 	    NULL, cong_drop, "congestion drop setting");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
 	    NULL, sp->pack_boundary, "payload pack boundary (bytes)");
 }
 
 int
 t4_destroy_dma_tag(struct adapter *sc)
 {
 	if (sc->dmat)
 		bus_dma_tag_destroy(sc->dmat);
 
 	return (0);
 }
 
 /*
  * Allocate and initialize the firmware event queue, control queues, and special
  * purpose rx queues owned by the adapter.
  *
  * Returns errno on failure.  Resources allocated up to that point may still be
  * allocated.  Caller is responsible for cleanup in case this function fails.
  */
 int
 t4_setup_adapter_queues(struct adapter *sc)
 {
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *children;
 	int rc, i;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	sysctl_ctx_init(&sc->ctx);
 	sc->flags |= ADAP_SYSCTL_CTX;
 
 	/*
 	 * Firmware event queue
 	 */
 	rc = alloc_fwq(sc);
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * That's all for the VF driver.
 	 */
 	if (sc->flags & IS_VF)
 		return (rc);
 
 	oid = device_get_sysctl_tree(sc->dev);
 	children = SYSCTL_CHILDREN(oid);
 
 	/*
 	 * XXX: General purpose rx queues, one per port.
 	 */
 
 	/*
 	 * Control queues, one per port.
 	 */
 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "ctrlq",
 	    CTLFLAG_RD, NULL, "control queues");
 	for_each_port(sc, i) {
 		struct sge_wrq *ctrlq = &sc->sge.ctrlq[i];
 
 		rc = alloc_ctrlq(sc, ctrlq, i, oid);
 		if (rc != 0)
 			return (rc);
 	}
 
 	return (rc);
 }
 
 /*
  * Idempotent
  */
 int
 t4_teardown_adapter_queues(struct adapter *sc)
 {
 	int i;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	/* Do this before freeing the queue */
 	if (sc->flags & ADAP_SYSCTL_CTX) {
 		sysctl_ctx_free(&sc->ctx);
 		sc->flags &= ~ADAP_SYSCTL_CTX;
 	}
 
 	if (!(sc->flags & IS_VF)) {
 		for_each_port(sc, i)
 			free_wrq(sc, &sc->sge.ctrlq[i]);
 	}
 	free_fwq(sc);
 
 	return (0);
 }
 
 /* Maximum payload that can be delivered with a single iq descriptor */
 static inline int
 mtu_to_max_payload(struct adapter *sc, int mtu, const int toe)
 {
 	int payload;
 
 #ifdef TCP_OFFLOAD
 	if (toe) {
 		int rxcs = G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2));
 
 		/* Note that COP can set rx_coalesce on/off per connection. */
 		payload = max(mtu, rxcs);
 	} else {
 #endif
 		/* large enough even when hw VLAN extraction is disabled */
 		payload = sc->params.sge.fl_pktshift + ETHER_HDR_LEN +
 		    ETHER_VLAN_ENCAP_LEN + mtu;
 #ifdef TCP_OFFLOAD
 	}
 #endif
 
 	return (payload);
 }
 
 int
 t4_setup_vi_queues(struct vi_info *vi)
 {
 	int rc = 0, i, intr_idx, iqidx;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	struct sge_wrq *ofld_txq;
 #endif
 #ifdef DEV_NETMAP
 	int saved_idx;
 	struct sge_nm_rxq *nm_rxq;
 	struct sge_nm_txq *nm_txq;
 #endif
 	char name[16];
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	struct sysctl_oid *oid = device_get_sysctl_tree(vi->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 	int maxp, mtu = ifp->if_mtu;
 
 	/* Interrupt vector to start from (when using multiple vectors) */
 	intr_idx = vi->first_intr;
 
 #ifdef DEV_NETMAP
 	saved_idx = intr_idx;
 	if (ifp->if_capabilities & IFCAP_NETMAP) {
 
 		/* netmap is supported with direct interrupts only. */
 		MPASS(!forwarding_intr_to_fwq(sc));
 
 		/*
 		 * We don't have buffers to back the netmap rx queues
 		 * right now so we create the queues in a way that
 		 * doesn't set off any congestion signal in the chip.
 		 */
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_rxq",
 		    CTLFLAG_RD, NULL, "rx queues");
 		for_each_nm_rxq(vi, i, nm_rxq) {
 			rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i, oid);
 			if (rc != 0)
 				goto done;
 			intr_idx++;
 		}
 
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_txq",
 		    CTLFLAG_RD, NULL, "tx queues");
 		for_each_nm_txq(vi, i, nm_txq) {
 			iqidx = vi->first_nm_rxq + (i % vi->nnmrxq);
 			rc = alloc_nm_txq(vi, nm_txq, iqidx, i, oid);
 			if (rc != 0)
 				goto done;
 		}
 	}
 
 	/* Normal rx queues and netmap rx queues share the same interrupts. */
 	intr_idx = saved_idx;
 #endif
 
 	/*
 	 * Allocate rx queues first because a default iqid is required when
 	 * creating a tx queue.
 	 */
 	maxp = mtu_to_max_payload(sc, mtu, 0);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
 	    CTLFLAG_RD, NULL, "rx queues");
 	for_each_rxq(vi, i, rxq) {
 
 		init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq);
 
 		snprintf(name, sizeof(name), "%s rxq%d-fl",
 		    device_get_nameunit(vi->dev), i);
 		init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name);
 
 		rc = alloc_rxq(vi, rxq,
 		    forwarding_intr_to_fwq(sc) ? -1 : intr_idx, i, oid);
 		if (rc != 0)
 			goto done;
 		intr_idx++;
 	}
 #ifdef DEV_NETMAP
 	if (ifp->if_capabilities & IFCAP_NETMAP)
 		intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq);
 #endif
 #ifdef TCP_OFFLOAD
 	maxp = mtu_to_max_payload(sc, mtu, 1);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
 	    CTLFLAG_RD, NULL, "rx queues for offloaded TCP connections");
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 
 		init_iq(&ofld_rxq->iq, sc, vi->ofld_tmr_idx, vi->ofld_pktc_idx,
 		    vi->qsize_rxq);
 
 		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
 		    device_get_nameunit(vi->dev), i);
 		init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name);
 
 		rc = alloc_ofld_rxq(vi, ofld_rxq,
 		    forwarding_intr_to_fwq(sc) ? -1 : intr_idx, i, oid);
 		if (rc != 0)
 			goto done;
 		intr_idx++;
 	}
 #endif
 
 	/*
 	 * Now the tx queues.
 	 */
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD,
 	    NULL, "tx queues");
 	for_each_txq(vi, i, txq) {
 		iqidx = vi->first_rxq + (i % vi->nrxq);
 		snprintf(name, sizeof(name), "%s txq%d",
 		    device_get_nameunit(vi->dev), i);
 		init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan,
 		    sc->sge.rxq[iqidx].iq.cntxt_id, name);
 
 		rc = alloc_txq(vi, txq, i, oid);
 		if (rc != 0)
 			goto done;
 	}
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_txq",
 	    CTLFLAG_RD, NULL, "tx queues for TOE/ETHOFLD");
 	for_each_ofld_txq(vi, i, ofld_txq) {
 		struct sysctl_oid *oid2;
 
 		snprintf(name, sizeof(name), "%s ofld_txq%d",
 		    device_get_nameunit(vi->dev), i);
 		if (vi->nofldrxq > 0) {
 			iqidx = vi->first_ofld_rxq + (i % vi->nofldrxq);
 			init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq,
 			    pi->tx_chan, sc->sge.ofld_rxq[iqidx].iq.cntxt_id,
 			    name);
 		} else {
 			iqidx = vi->first_rxq + (i % vi->nrxq);
 			init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq,
 			    pi->tx_chan, sc->sge.rxq[iqidx].iq.cntxt_id, name);
 		}
 
 		snprintf(name, sizeof(name), "%d", i);
 		oid2 = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    name, CTLFLAG_RD, NULL, "offload tx queue");
 
 		rc = alloc_wrq(sc, vi, ofld_txq, oid2);
 		if (rc != 0)
 			goto done;
 	}
 #endif
 done:
 	if (rc)
 		t4_teardown_vi_queues(vi);
 
 	return (rc);
 }
 
 /*
  * Idempotent
  */
 int
 t4_teardown_vi_queues(struct vi_info *vi)
 {
 	int i;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct sge_wrq *ofld_txq;
 #endif
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #ifdef DEV_NETMAP
 	struct sge_nm_rxq *nm_rxq;
 	struct sge_nm_txq *nm_txq;
 #endif
 
 	/* Do this before freeing the queues */
 	if (vi->flags & VI_SYSCTL_CTX) {
 		sysctl_ctx_free(&vi->ctx);
 		vi->flags &= ~VI_SYSCTL_CTX;
 	}
 
 #ifdef DEV_NETMAP
 	if (vi->ifp->if_capabilities & IFCAP_NETMAP) {
 		for_each_nm_txq(vi, i, nm_txq) {
 			free_nm_txq(vi, nm_txq);
 		}
 
 		for_each_nm_rxq(vi, i, nm_rxq) {
 			free_nm_rxq(vi, nm_rxq);
 		}
 	}
 #endif
 
 	/*
 	 * Take down all the tx queues first, as they reference the rx queues
 	 * (for egress updates, etc.).
 	 */
 
 	for_each_txq(vi, i, txq) {
 		free_txq(vi, txq);
 	}
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	for_each_ofld_txq(vi, i, ofld_txq) {
 		free_wrq(sc, ofld_txq);
 	}
 #endif
 
 	/*
 	 * Then take down the rx queues.
 	 */
 
 	for_each_rxq(vi, i, rxq) {
 		free_rxq(vi, rxq);
 	}
 #ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		free_ofld_rxq(vi, ofld_rxq);
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Interrupt handler when the driver is using only 1 interrupt.  This is a very
  * unusual scenario.
  *
  * a) Deals with errors, if any.
  * b) Services firmware event queue, which is taking interrupts for all other
  *    queues.
  */
 void
 t4_intr_all(void *arg)
 {
 	struct adapter *sc = arg;
 	struct sge_iq *fwq = &sc->sge.fwq;
 
 	MPASS(sc->intr_count == 1);
 
 	if (sc->intr_type == INTR_INTX)
 		t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
 
 	t4_intr_err(arg);
 	t4_intr_evt(fwq);
 }
 
 /*
  * Interrupt handler for errors (installed directly when multiple interrupts are
  * being used, or called by t4_intr_all).
  */
 void
 t4_intr_err(void *arg)
 {
 	struct adapter *sc = arg;
+	uint32_t v;
 	const bool verbose = (sc->debug_flags & DF_VERBOSE_SLOWINTR) != 0;
 
 	if (sc->flags & ADAP_ERR)
 		return;
+
+	v = t4_read_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE));
+	if (v & F_PFSW) {
+		sc->swintr++;
+		t4_write_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE), v);
+	}
 
 	t4_slow_intr_handler(sc, verbose);
 }
 
 /*
  * Interrupt handler for iq-only queues.  The firmware event queue is the only
  * such queue right now.
  */
 void
 t4_intr_evt(void *arg)
 {
 	struct sge_iq *iq = arg;
 
 	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
 		service_iq(iq, 0);
 		(void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
 	}
 }
 
 /*
  * Interrupt handler for iq+fl queues.
  */
 void
 t4_intr(void *arg)
 {
 	struct sge_iq *iq = arg;
 
 	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
 		service_iq_fl(iq, 0);
 		(void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
 	}
 }
 
 #ifdef DEV_NETMAP
 /*
  * Interrupt handler for netmap rx queues.
  */
 void
 t4_nm_intr(void *arg)
 {
 	struct sge_nm_rxq *nm_rxq = arg;
 
 	if (atomic_cmpset_int(&nm_rxq->nm_state, NM_ON, NM_BUSY)) {
 		service_nm_rxq(nm_rxq);
 		(void) atomic_cmpset_int(&nm_rxq->nm_state, NM_BUSY, NM_ON);
 	}
 }
 
 /*
  * Interrupt handler for vectors shared between NIC and netmap rx queues.
  */
 void
 t4_vi_intr(void *arg)
 {
 	struct irq *irq = arg;
 
 	MPASS(irq->nm_rxq != NULL);
 	t4_nm_intr(irq->nm_rxq);
 
 	MPASS(irq->rxq != NULL);
 	t4_intr(irq->rxq);
 }
 #endif
 
 /*
  * Deals with interrupts on an iq-only (no freelist) queue.
  */
 static int
 service_iq(struct sge_iq *iq, int budget)
 {
 	struct sge_iq *q;
 	struct adapter *sc = iq->adapter;
 	struct iq_desc *d = &iq->desc[iq->cidx];
 	int ndescs = 0, limit;
 	int rsp_type;
 	uint32_t lq;
 	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
 
 	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
 	KASSERT((iq->flags & IQ_HAS_FL) == 0,
 	    ("%s: called for iq %p with fl (iq->flags 0x%x)", __func__, iq,
 	    iq->flags));
 	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
 	MPASS((iq->flags & IQ_LRO_ENABLED) == 0);
 
 	limit = budget ? budget : iq->qsize / 16;
 
 	/*
 	 * We always come back and check the descriptor ring for new indirect
 	 * interrupts and other responses after running a single handler.
 	 */
 	for (;;) {
 		while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
 
 			rmb();
 
 			rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
 			lq = be32toh(d->rsp.pldbuflen_qid);
 
 			switch (rsp_type) {
 			case X_RSPD_TYPE_FLBUF:
 				panic("%s: data for an iq (%p) with no freelist",
 				    __func__, iq);
 
 				/* NOTREACHED */
 
 			case X_RSPD_TYPE_CPL:
 				KASSERT(d->rss.opcode < NUM_CPL_CMDS,
 				    ("%s: bad opcode %02x.", __func__,
 				    d->rss.opcode));
 				t4_cpl_handler[d->rss.opcode](iq, &d->rss, NULL);
 				break;
 
 			case X_RSPD_TYPE_INTR:
 				/*
 				 * There are 1K interrupt-capable queues (qids 0
 				 * through 1023).  A response type indicating a
 				 * forwarded interrupt with a qid >= 1K is an
 				 * iWARP async notification.
 				 */
 				if (__predict_true(lq >= 1024)) {
 					t4_an_handler(iq, &d->rsp);
 					break;
 				}
 
 				q = sc->sge.iqmap[lq - sc->sge.iq_start -
 				    sc->sge.iq_base];
 				if (atomic_cmpset_int(&q->state, IQS_IDLE,
 				    IQS_BUSY)) {
 					if (service_iq_fl(q, q->qsize / 16) == 0) {
 						(void) atomic_cmpset_int(&q->state,
 						    IQS_BUSY, IQS_IDLE);
 					} else {
 						STAILQ_INSERT_TAIL(&iql, q,
 						    link);
 					}
 				}
 				break;
 
 			default:
 				KASSERT(0,
 				    ("%s: illegal response type %d on iq %p",
 				    __func__, rsp_type, iq));
 				log(LOG_ERR,
 				    "%s: illegal response type %d on iq %p",
 				    device_get_nameunit(sc->dev), rsp_type, iq);
 				break;
 			}
 
 			d++;
 			if (__predict_false(++iq->cidx == iq->sidx)) {
 				iq->cidx = 0;
 				iq->gen ^= F_RSPD_GEN;
 				d = &iq->desc[0];
 			}
 			if (__predict_false(++ndescs == limit)) {
 				t4_write_reg(sc, sc->sge_gts_reg,
 				    V_CIDXINC(ndescs) |
 				    V_INGRESSQID(iq->cntxt_id) |
 				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
 				ndescs = 0;
 
 				if (budget) {
 					return (EINPROGRESS);
 				}
 			}
 		}
 
 		if (STAILQ_EMPTY(&iql))
 			break;
 
 		/*
 		 * Process the head only, and send it to the back of the list if
 		 * it's still not done.
 		 */
 		q = STAILQ_FIRST(&iql);
 		STAILQ_REMOVE_HEAD(&iql, link);
 		if (service_iq_fl(q, q->qsize / 8) == 0)
 			(void) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
 		else
 			STAILQ_INSERT_TAIL(&iql, q, link);
 	}
 
 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
 
 	return (0);
 }
 
 static inline int
 sort_before_lro(struct lro_ctrl *lro)
 {
 
 	return (lro->lro_mbuf_max != 0);
 }
 
 static inline uint64_t
 last_flit_to_ns(struct adapter *sc, uint64_t lf)
 {
 	uint64_t n = be64toh(lf) & 0xfffffffffffffff;	/* 60b, not 64b. */
 
 	if (n > UINT64_MAX / 1000000)
 		return (n / sc->params.vpd.cclk * 1000000);
 	else
 		return (n * 1000000 / sc->params.vpd.cclk);
 }
 
 /*
  * Deals with interrupts on an iq+fl queue.
  */
 static int
 service_iq_fl(struct sge_iq *iq, int budget)
 {
 	struct sge_rxq *rxq = iq_to_rxq(iq);
 	struct sge_fl *fl;
 	struct adapter *sc = iq->adapter;
 	struct iq_desc *d = &iq->desc[iq->cidx];
 	int ndescs = 0, limit;
 	int rsp_type, refill, starved;
 	uint32_t lq;
 	uint16_t fl_hw_cidx;
 	struct mbuf *m0;
 #if defined(INET) || defined(INET6)
 	const struct timeval lro_timeout = {0, sc->lro_timeout};
 	struct lro_ctrl *lro = &rxq->lro;
 #endif
 
 	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
 	MPASS(iq->flags & IQ_HAS_FL);
 
 	limit = budget ? budget : iq->qsize / 16;
 	fl = &rxq->fl;
 	fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
 
 #if defined(INET) || defined(INET6)
 	if (iq->flags & IQ_ADJ_CREDIT) {
 		MPASS(sort_before_lro(lro));
 		iq->flags &= ~IQ_ADJ_CREDIT;
 		if ((d->rsp.u.type_gen & F_RSPD_GEN) != iq->gen) {
 			tcp_lro_flush_all(lro);
 			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(1) |
 			    V_INGRESSQID((u32)iq->cntxt_id) |
 			    V_SEINTARM(iq->intr_params));
 			return (0);
 		}
 		ndescs = 1;
 	}
 #else
 	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
 #endif
 
 	while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
 
 		rmb();
 
 		refill = 0;
 		m0 = NULL;
 		rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
 		lq = be32toh(d->rsp.pldbuflen_qid);
 
 		switch (rsp_type) {
 		case X_RSPD_TYPE_FLBUF:
 
 			m0 = get_fl_payload(sc, fl, lq);
 			if (__predict_false(m0 == NULL))
 				goto out;
 			refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2;
 
 			if (iq->flags & IQ_RX_TIMESTAMP) {
 				/*
 				 * Fill up rcv_tstmp but do not set M_TSTMP.
 				 * rcv_tstmp is not in the format that the
 				 * kernel expects and we don't want to mislead
 				 * it.  For now this is only for custom code
 				 * that knows how to interpret cxgbe's stamp.
 				 */
 				m0->m_pkthdr.rcv_tstmp =
 				    last_flit_to_ns(sc, d->rsp.u.last_flit);
 #ifdef notyet
 				m0->m_flags |= M_TSTMP;
 #endif
 			}
 
 			/* fall through */
 
 		case X_RSPD_TYPE_CPL:
 			KASSERT(d->rss.opcode < NUM_CPL_CMDS,
 			    ("%s: bad opcode %02x.", __func__, d->rss.opcode));
 			t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0);
 			break;
 
 		case X_RSPD_TYPE_INTR:
 
 			/*
 			 * There are 1K interrupt-capable queues (qids 0
 			 * through 1023).  A response type indicating a
 			 * forwarded interrupt with a qid >= 1K is an
 			 * iWARP async notification.  That is the only
 			 * acceptable indirect interrupt on this queue.
 			 */
 			if (__predict_false(lq < 1024)) {
 				panic("%s: indirect interrupt on iq_fl %p "
 				    "with qid %u", __func__, iq, lq);
 			}
 
 			t4_an_handler(iq, &d->rsp);
 			break;
 
 		default:
 			KASSERT(0, ("%s: illegal response type %d on iq %p",
 			    __func__, rsp_type, iq));
 			log(LOG_ERR, "%s: illegal response type %d on iq %p",
 			    device_get_nameunit(sc->dev), rsp_type, iq);
 			break;
 		}
 
 		d++;
 		if (__predict_false(++iq->cidx == iq->sidx)) {
 			iq->cidx = 0;
 			iq->gen ^= F_RSPD_GEN;
 			d = &iq->desc[0];
 		}
 		if (__predict_false(++ndescs == limit)) {
 			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 			    V_INGRESSQID(iq->cntxt_id) |
 			    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
 			ndescs = 0;
 
 #if defined(INET) || defined(INET6)
 			if (iq->flags & IQ_LRO_ENABLED &&
 			    !sort_before_lro(lro) &&
 			    sc->lro_timeout != 0) {
 				tcp_lro_flush_inactive(lro, &lro_timeout);
 			}
 #endif
 			if (budget) {
 				FL_LOCK(fl);
 				refill_fl(sc, fl, 32);
 				FL_UNLOCK(fl);
 
 				return (EINPROGRESS);
 			}
 		}
 		if (refill) {
 			FL_LOCK(fl);
 			refill_fl(sc, fl, 32);
 			FL_UNLOCK(fl);
 			fl_hw_cidx = fl->hw_cidx;
 		}
 	}
 out:
 #if defined(INET) || defined(INET6)
 	if (iq->flags & IQ_LRO_ENABLED) {
 		if (ndescs > 0 && lro->lro_mbuf_count > 8) {
 			MPASS(sort_before_lro(lro));
 			/* hold back one credit and don't flush LRO state */
 			iq->flags |= IQ_ADJ_CREDIT;
 			ndescs--;
 		} else {
 			tcp_lro_flush_all(lro);
 		}
 	}
 #endif
 
 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
 
 	FL_LOCK(fl);
 	starved = refill_fl(sc, fl, 64);
 	FL_UNLOCK(fl);
 	if (__predict_false(starved != 0))
 		add_fl_to_sfl(sc, fl);
 
 	return (0);
 }
 
 static inline int
 cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll)
 {
 	int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0;
 
 	if (rc)
 		MPASS(cll->region3 >= CL_METADATA_SIZE);
 
 	return (rc);
 }
 
 static inline struct cluster_metadata *
 cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll,
     caddr_t cl)
 {
 
 	if (cl_has_metadata(fl, cll)) {
 		struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
 
 		return ((struct cluster_metadata *)(cl + swz->size) - 1);
 	}
 	return (NULL);
 }
 
 static void
 rxb_free(struct mbuf *m)
 {
 	uma_zone_t zone = m->m_ext.ext_arg1;
 	void *cl = m->m_ext.ext_arg2;
 
 	uma_zfree(zone, cl);
 	counter_u64_add(extfree_rels, 1);
 }
 
 /*
  * The mbuf returned by this function could be allocated from zone_mbuf or
  * constructed in spare room in the cluster.
  *
  * The mbuf carries the payload in one of these ways
  * a) frame inside the mbuf (mbuf from zone_mbuf)
  * b) m_cljset (for clusters without metadata) zone_mbuf
  * c) m_extaddref (cluster with metadata) inline mbuf
  * d) m_extaddref (cluster with metadata) zone_mbuf
  */
 static struct mbuf *
 get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
     int remaining)
 {
 	struct mbuf *m;
 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 	struct cluster_layout *cll = &sd->cll;
 	struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
 	struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx];
 	struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl);
 	int len, blen;
 	caddr_t payload;
 
 	blen = hwb->size - fl->rx_offset;	/* max possible in this buf */
 	len = min(remaining, blen);
 	payload = sd->cl + cll->region1 + fl->rx_offset;
 	if (fl->flags & FL_BUF_PACKING) {
 		const u_int l = fr_offset + len;
 		const u_int pad = roundup2(l, fl->buf_boundary) - l;
 
 		if (fl->rx_offset + len + pad < hwb->size)
 			blen = len + pad;
 		MPASS(fl->rx_offset + blen <= hwb->size);
 	} else {
 		MPASS(fl->rx_offset == 0);	/* not packing */
 	}
 
 
 	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
 
 		/*
 		 * Copy payload into a freshly allocated mbuf.
 		 */
 
 		m = fr_offset == 0 ?
 		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return (NULL);
 		fl->mbuf_allocated++;
 
 		/* copy data to mbuf */
 		bcopy(payload, mtod(m, caddr_t), len);
 
 	} else if (sd->nmbuf * MSIZE < cll->region1) {
 
 		/*
 		 * There's spare room in the cluster for an mbuf.  Create one
 		 * and associate it with the payload that's in the cluster.
 		 */
 
 		MPASS(clm != NULL);
 		m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE);
 		/* No bzero required */
 		if (m_init(m, M_NOWAIT, MT_DATA,
 		    fr_offset == 0 ? M_PKTHDR | M_NOFREE : M_NOFREE))
 			return (NULL);
 		fl->mbuf_inlined++;
 		m_extaddref(m, payload, blen, &clm->refcount, rxb_free,
 		    swz->zone, sd->cl);
 		if (sd->nmbuf++ == 0)
 			counter_u64_add(extfree_refs, 1);
 
 	} else {
 
 		/*
 		 * Grab an mbuf from zone_mbuf and associate it with the
 		 * payload in the cluster.
 		 */
 
 		m = fr_offset == 0 ?
 		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return (NULL);
 		fl->mbuf_allocated++;
 		if (clm != NULL) {
 			m_extaddref(m, payload, blen, &clm->refcount,
 			    rxb_free, swz->zone, sd->cl);
 			if (sd->nmbuf++ == 0)
 				counter_u64_add(extfree_refs, 1);
 		} else {
 			m_cljset(m, sd->cl, swz->type);
 			sd->cl = NULL;	/* consumed, not a recycle candidate */
 		}
 	}
 	if (fr_offset == 0)
 		m->m_pkthdr.len = remaining;
 	m->m_len = len;
 
 	if (fl->flags & FL_BUF_PACKING) {
 		fl->rx_offset += blen;
 		MPASS(fl->rx_offset <= hwb->size);
 		if (fl->rx_offset < hwb->size)
 			return (m);	/* without advancing the cidx */
 	}
 
 	if (__predict_false(++fl->cidx % 8 == 0)) {
 		uint16_t cidx = fl->cidx / 8;
 
 		if (__predict_false(cidx == fl->sidx))
 			fl->cidx = cidx = 0;
 		fl->hw_cidx = cidx;
 	}
 	fl->rx_offset = 0;
 
 	return (m);
 }
 
 static struct mbuf *
 get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf)
 {
 	struct mbuf *m0, *m, **pnext;
 	u_int remaining;
 	const u_int total = G_RSPD_LEN(len_newbuf);
 
 	if (__predict_false(fl->flags & FL_BUF_RESUME)) {
 		M_ASSERTPKTHDR(fl->m0);
 		MPASS(fl->m0->m_pkthdr.len == total);
 		MPASS(fl->remaining < total);
 
 		m0 = fl->m0;
 		pnext = fl->pnext;
 		remaining = fl->remaining;
 		fl->flags &= ~FL_BUF_RESUME;
 		goto get_segment;
 	}
 
 	if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) {
 		fl->rx_offset = 0;
 		if (__predict_false(++fl->cidx % 8 == 0)) {
 			uint16_t cidx = fl->cidx / 8;
 
 			if (__predict_false(cidx == fl->sidx))
 				fl->cidx = cidx = 0;
 			fl->hw_cidx = cidx;
 		}
 	}
 
 	/*
 	 * Payload starts at rx_offset in the current hw buffer.  Its length is
 	 * 'len' and it may span multiple hw buffers.
 	 */
 
 	m0 = get_scatter_segment(sc, fl, 0, total);
 	if (m0 == NULL)
 		return (NULL);
 	remaining = total - m0->m_len;
 	pnext = &m0->m_next;
 	while (remaining > 0) {
 get_segment:
 		MPASS(fl->rx_offset == 0);
 		m = get_scatter_segment(sc, fl, total - remaining, remaining);
 		if (__predict_false(m == NULL)) {
 			fl->m0 = m0;
 			fl->pnext = pnext;
 			fl->remaining = remaining;
 			fl->flags |= FL_BUF_RESUME;
 			return (NULL);
 		}
 		*pnext = m;
 		pnext = &m->m_next;
 		remaining -= m->m_len;
 	}
 	*pnext = NULL;
 
 	M_ASSERTPKTHDR(m0);
 	return (m0);
 }
 
 static int
 t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
 {
 	struct sge_rxq *rxq = iq_to_rxq(iq);
 	struct ifnet *ifp = rxq->ifp;
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_pkt *cpl = (const void *)(rss + 1);
 #if defined(INET) || defined(INET6)
 	struct lro_ctrl *lro = &rxq->lro;
 #endif
 	static const int sw_hashtype[4][2] = {
 		{M_HASHTYPE_NONE, M_HASHTYPE_NONE},
 		{M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
 		{M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
 		{M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
 	};
 
 	KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	m0->m_pkthdr.len -= sc->params.sge.fl_pktshift;
 	m0->m_len -= sc->params.sge.fl_pktshift;
 	m0->m_data += sc->params.sge.fl_pktshift;
 
 	m0->m_pkthdr.rcvif = ifp;
 	M_HASHTYPE_SET(m0, sw_hashtype[rss->hash_type][rss->ipv6]);
 	m0->m_pkthdr.flowid = be32toh(rss->hash_val);
 
 	if (cpl->csum_calc && !(cpl->err_vec & sc->params.tp.err_vec_mask)) {
 		if (ifp->if_capenable & IFCAP_RXCSUM &&
 		    cpl->l2info & htobe32(F_RXF_IP)) {
 			m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
 			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 			rxq->rxcsum++;
 		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
 		    cpl->l2info & htobe32(F_RXF_IP6)) {
 			m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
 			    CSUM_PSEUDO_HDR);
 			rxq->rxcsum++;
 		}
 
 		if (__predict_false(cpl->ip_frag))
 			m0->m_pkthdr.csum_data = be16toh(cpl->csum);
 		else
 			m0->m_pkthdr.csum_data = 0xffff;
 	}
 
 	if (cpl->vlan_ex) {
 		m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
 		m0->m_flags |= M_VLANTAG;
 		rxq->vlan_extraction++;
 	}
 
 #if defined(INET) || defined(INET6)
 	if (iq->flags & IQ_LRO_ENABLED) {
 		if (sort_before_lro(lro)) {
 			tcp_lro_queue_mbuf(lro, m0);
 			return (0); /* queued for sort, then LRO */
 		}
 		if (tcp_lro_rx(lro, m0, 0) == 0)
 			return (0); /* queued for LRO */
 	}
 #endif
 	ifp->if_input(ifp, m0);
 
 	return (0);
 }
 
 /*
  * Must drain the wrq or make sure that someone else will.
  */
 static void
 wrq_tx_drain(void *arg, int n)
 {
 	struct sge_wrq *wrq = arg;
 	struct sge_eq *eq = &wrq->eq;
 
 	EQ_LOCK(eq);
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(wrq->adapter, wrq);
 	EQ_UNLOCK(eq);
 }
 
 static void
 drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
 {
 	struct sge_eq *eq = &wrq->eq;
 	u_int available, dbdiff;	/* # of hardware descriptors */
 	u_int n;
 	struct wrqe *wr;
 	struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
 	wr = STAILQ_FIRST(&wrq->wr_list);
 	MPASS(wr != NULL);	/* Must be called with something useful to do */
 	MPASS(eq->pidx == eq->dbidx);
 	dbdiff = 0;
 
 	do {
 		eq->cidx = read_hw_cidx(eq);
 		if (eq->pidx == eq->cidx)
 			available = eq->sidx - 1;
 		else
 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 
 		MPASS(wr->wrq == wrq);
 		n = howmany(wr->wr_len, EQ_ESIZE);
 		if (available < n)
 			break;
 
 		dst = (void *)&eq->desc[eq->pidx];
 		if (__predict_true(eq->sidx - eq->pidx > n)) {
 			/* Won't wrap, won't end exactly at the status page. */
 			bcopy(&wr->wr[0], dst, wr->wr_len);
 			eq->pidx += n;
 		} else {
 			int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;
 
 			bcopy(&wr->wr[0], dst, first_portion);
 			if (wr->wr_len > first_portion) {
 				bcopy(&wr->wr[first_portion], &eq->desc[0],
 				    wr->wr_len - first_portion);
 			}
 			eq->pidx = n - (eq->sidx - eq->pidx);
 		}
 		wrq->tx_wrs_copied++;
 
 		if (available < eq->sidx / 4 &&
 		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 				/*
 				 * XXX: This is not 100% reliable with some
 				 * types of WRs.  But this is a very unusual
 				 * situation for an ofld/ctrl queue anyway.
 				 */
 			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 			    F_FW_WR_EQUEQ);
 		}
 
 		dbdiff += n;
 		if (dbdiff >= 16) {
 			ring_eq_db(sc, eq, dbdiff);
 			dbdiff = 0;
 		}
 
 		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
 		free_wrqe(wr);
 		MPASS(wrq->nwr_pending > 0);
 		wrq->nwr_pending--;
 		MPASS(wrq->ndesc_needed >= n);
 		wrq->ndesc_needed -= n;
 	} while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);
 
 	if (dbdiff)
 		ring_eq_db(sc, eq, dbdiff);
 }
 
 /*
  * Doesn't fail.  Holds on to work requests it can't send right away.
  */
 void
 t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
 {
 #ifdef INVARIANTS
 	struct sge_eq *eq = &wrq->eq;
 #endif
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 	MPASS(wr != NULL);
 	MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
 	MPASS((wr->wr_len & 0x7) == 0);
 
 	STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
 	wrq->nwr_pending++;
 	wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);
 
 	if (!TAILQ_EMPTY(&wrq->incomplete_wrs))
 		return;	/* commit_wrq_wr will drain wr_list as well. */
 
 	drain_wrq_wr_list(sc, wrq);
 
 	/* Doorbell must have caught up to the pidx. */
 	MPASS(eq->pidx == eq->dbidx);
 }
 
 void
 t4_update_fl_bufsize(struct ifnet *ifp)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sge_rxq *rxq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 	struct sge_fl *fl;
 	int i, maxp, mtu = ifp->if_mtu;
 
 	maxp = mtu_to_max_payload(sc, mtu, 0);
 	for_each_rxq(vi, i, rxq) {
 		fl = &rxq->fl;
 
 		FL_LOCK(fl);
 		find_best_refill_source(sc, fl, maxp);
 		FL_UNLOCK(fl);
 	}
 #ifdef TCP_OFFLOAD
 	maxp = mtu_to_max_payload(sc, mtu, 1);
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		fl = &ofld_rxq->fl;
 
 		FL_LOCK(fl);
 		find_best_refill_source(sc, fl, maxp);
 		FL_UNLOCK(fl);
 	}
 #endif
 }
 
 static inline int
 mbuf_nsegs(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	KASSERT(m->m_pkthdr.l5hlen > 0,
 	    ("%s: mbuf %p missing information on # of segments.", __func__, m));
 
 	return (m->m_pkthdr.l5hlen);
 }
 
 static inline void
 set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.l5hlen = nsegs;
 }
 
 static inline int
 mbuf_cflags(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.PH_loc.eight[4]);
 }
 
 static inline void
 set_mbuf_cflags(struct mbuf *m, uint8_t flags)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[4] = flags;
 }
 
 static inline int
 mbuf_len16(struct mbuf *m)
 {
 	int n;
 
 	M_ASSERTPKTHDR(m);
 	n = m->m_pkthdr.PH_loc.eight[0];
 	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
 
 	return (n);
 }
 
 static inline void
 set_mbuf_len16(struct mbuf *m, uint8_t len16)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[0] = len16;
 }
 
 #ifdef RATELIMIT
 static inline int
 mbuf_eo_nsegs(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.PH_loc.eight[1]);
 }
 
 static inline void
 set_mbuf_eo_nsegs(struct mbuf *m, uint8_t nsegs)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[1] = nsegs;
 }
 
 static inline int
 mbuf_eo_len16(struct mbuf *m)
 {
 	int n;
 
 	M_ASSERTPKTHDR(m);
 	n = m->m_pkthdr.PH_loc.eight[2];
 	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
 
 	return (n);
 }
 
 static inline void
 set_mbuf_eo_len16(struct mbuf *m, uint8_t len16)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[2] = len16;
 }
 
 static inline int
 mbuf_eo_tsclk_tsoff(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.PH_loc.eight[3]);
 }
 
 static inline void
 set_mbuf_eo_tsclk_tsoff(struct mbuf *m, uint8_t tsclk_tsoff)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[3] = tsclk_tsoff;
 }
 
 static inline int
 needs_eo(struct mbuf *m)
 {
 
 	return (m->m_pkthdr.snd_tag != NULL);
 }
 #endif
 
 /*
  * Try to allocate an mbuf to contain a raw work request.  To make it
  * easy to construct the work request, don't allocate a chain but a
  * single mbuf.
  */
 struct mbuf *
 alloc_wr_mbuf(int len, int how)
 {
 	struct mbuf *m;
 
 	if (len <= MHLEN)
 		m = m_gethdr(how, MT_DATA);
 	else if (len <= MCLBYTES)
 		m = m_getcl(how, MT_DATA, M_PKTHDR);
 	else
 		m = NULL;
 	if (m == NULL)
 		return (NULL);
 	m->m_pkthdr.len = len;
 	m->m_len = len;
 	set_mbuf_cflags(m, MC_RAW_WR);
 	set_mbuf_len16(m, howmany(len, 16));
 	return (m);
 }
 
 static inline int
 needs_tso(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return (m->m_pkthdr.csum_flags & CSUM_TSO);
 }
 
 static inline int
 needs_l3_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO));
 }
 
 static inline int
 needs_l4_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
 	    CSUM_TCP_IPV6 | CSUM_TSO));
 }
 
 static inline int
 needs_tcp_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_TCP_IPV6 | CSUM_TSO));
 }
 
 #ifdef RATELIMIT
 static inline int
 needs_udp_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_UDP_IPV6));
 }
 #endif
 
 static inline int
 needs_vlan_insertion(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return (m->m_flags & M_VLANTAG);
 }
 
 static void *
 m_advance(struct mbuf **pm, int *poffset, int len)
 {
 	struct mbuf *m = *pm;
 	int offset = *poffset;
 	uintptr_t p = 0;
 
 	MPASS(len > 0);
 
 	for (;;) {
 		if (offset + len < m->m_len) {
 			offset += len;
 			p = mtod(m, uintptr_t) + offset;
 			break;
 		}
 		len -= m->m_len - offset;
 		m = m->m_next;
 		offset = 0;
 		MPASS(m != NULL);
 	}
 	*poffset = offset;
 	*pm = m;
 	return ((void *)p);
 }
 
 /*
  * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
  * must have at least one mbuf that's not empty.  It is possible for this
  * routine to return 0 if skip accounts for all the contents of the mbuf chain.
  */
 static inline int
 count_mbuf_nsegs(struct mbuf *m, int skip)
 {
 	vm_paddr_t lastb, next;
 	vm_offset_t va;
 	int len, nsegs;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.len > 0);
 	MPASS(m->m_pkthdr.len >= skip);
 
 	nsegs = 0;
 	lastb = 0;
 	for (; m; m = m->m_next) {
 
 		len = m->m_len;
 		if (__predict_false(len == 0))
 			continue;
 		if (skip >= len) {
 			skip -= len;
 			continue;
 		}
 		va = mtod(m, vm_offset_t) + skip;
 		len -= skip;
 		skip = 0;
 		next = pmap_kextract(va);
 		nsegs += sglist_count((void *)(uintptr_t)va, len);
 		if (lastb + 1 == next)
 			nsegs--;
 		lastb = pmap_kextract(va + len - 1);
 	}
 
 	return (nsegs);
 }
 
 /*
  * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
  * a) caller can assume it's been freed if this function returns with an error.
  * b) it may get defragged up if the gather list is too long for the hardware.
  */
 int
 parse_pkt(struct adapter *sc, struct mbuf **mp)
 {
 	struct mbuf *m0 = *mp, *m;
 	int rc, nsegs, defragged = 0, offset;
 	struct ether_header *eh;
 	void *l3hdr;
 #if defined(INET) || defined(INET6)
 	struct tcphdr *tcp;
 #endif
 	uint16_t eh_type;
 
 	M_ASSERTPKTHDR(m0);
 	if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
 		rc = EINVAL;
 fail:
 		m_freem(m0);
 		*mp = NULL;
 		return (rc);
 	}
 restart:
 	/*
 	 * First count the number of gather list segments in the payload.
 	 * Defrag the mbuf if nsegs exceeds the hardware limit.
 	 */
 	M_ASSERTPKTHDR(m0);
 	MPASS(m0->m_pkthdr.len > 0);
 	nsegs = count_mbuf_nsegs(m0, 0);
 	if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
 		if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
 			rc = EFBIG;
 			goto fail;
 		}
 		*mp = m0 = m;	/* update caller's copy after defrag */
 		goto restart;
 	}
 
 	if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) {
 		m0 = m_pullup(m0, m0->m_pkthdr.len);
 		if (m0 == NULL) {
 			/* Should have left well enough alone. */
 			rc = EFBIG;
 			goto fail;
 		}
 		*mp = m0;	/* update caller's copy after pullup */
 		goto restart;
 	}
 	set_mbuf_nsegs(m0, nsegs);
 	set_mbuf_cflags(m0, 0);
 	if (sc->flags & IS_VF)
 		set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0)));
 	else
 		set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
 
 #ifdef RATELIMIT
 	/*
 	 * Ethofld is limited to TCP and UDP for now, and only when L4 hw
 	 * checksumming is enabled.  needs_l4_csum happens to check for all the
 	 * right things.
 	 */
 	if (__predict_false(needs_eo(m0) && !needs_l4_csum(m0)))
 		m0->m_pkthdr.snd_tag = NULL;
 #endif
 
 	if (!needs_tso(m0) &&
 #ifdef RATELIMIT
 	    !needs_eo(m0) &&
 #endif
 	    !(sc->flags & IS_VF && (needs_l3_csum(m0) || needs_l4_csum(m0))))
 		return (0);
 
 	m = m0;
 	eh = mtod(m, struct ether_header *);
 	eh_type = ntohs(eh->ether_type);
 	if (eh_type == ETHERTYPE_VLAN) {
 		struct ether_vlan_header *evh = (void *)eh;
 
 		eh_type = ntohs(evh->evl_proto);
 		m0->m_pkthdr.l2hlen = sizeof(*evh);
 	} else
 		m0->m_pkthdr.l2hlen = sizeof(*eh);
 
 	offset = 0;
 	l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
 
 	switch (eh_type) {
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 	{
 		struct ip6_hdr *ip6 = l3hdr;
 
 		MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP);
 
 		m0->m_pkthdr.l3hlen = sizeof(*ip6);
 		break;
 	}
 #endif
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		struct ip *ip = l3hdr;
 
 		m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
 		break;
 	}
 #endif
 	default:
 		panic("%s: ethertype 0x%04x unknown.  if_cxgbe must be compiled"
 		    " with the same INET/INET6 options as the kernel.",
 		    __func__, eh_type);
 	}
 
 #if defined(INET) || defined(INET6)
 	if (needs_tcp_csum(m0)) {
 		tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
 		m0->m_pkthdr.l4hlen = tcp->th_off * 4;
 #ifdef RATELIMIT
 		if (tsclk >= 0 && *(uint32_t *)(tcp + 1) == ntohl(0x0101080a)) {
 			set_mbuf_eo_tsclk_tsoff(m0,
 			    V_FW_ETH_TX_EO_WR_TSCLK(tsclk) |
 			    V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1));
 		} else
 			set_mbuf_eo_tsclk_tsoff(m0, 0);
 	} else if (needs_udp_csum(m)) {
 		m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
 #endif
 	}
 #ifdef RATELIMIT
 	if (needs_eo(m0)) {
 		u_int immhdrs;
 
 		/* EO WRs have the headers in the WR and not the GL. */
 		immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen +
 		    m0->m_pkthdr.l4hlen;
 		nsegs = count_mbuf_nsegs(m0, immhdrs);
 		set_mbuf_eo_nsegs(m0, nsegs);
 		set_mbuf_eo_len16(m0,
 		    txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0)));
 	}
 #endif
 #endif
 	MPASS(m0 == *mp);
 	return (0);
 }
 
 void *
 start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie)
 {
 	struct sge_eq *eq = &wrq->eq;
 	struct adapter *sc = wrq->adapter;
 	int ndesc, available;
 	struct wrqe *wr;
 	void *w;
 
 	MPASS(len16 > 0);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC);
 
 	EQ_LOCK(eq);
 
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(sc, wrq);
 
 	if (!STAILQ_EMPTY(&wrq->wr_list)) {
 slowpath:
 		EQ_UNLOCK(eq);
 		wr = alloc_wrqe(len16 * 16, wrq);
 		if (__predict_false(wr == NULL))
 			return (NULL);
 		cookie->pidx = -1;
 		cookie->ndesc = ndesc;
 		return (&wr->wr);
 	}
 
 	eq->cidx = read_hw_cidx(eq);
 	if (eq->pidx == eq->cidx)
 		available = eq->sidx - 1;
 	else
 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 	if (available < ndesc)
 		goto slowpath;
 
 	cookie->pidx = eq->pidx;
 	cookie->ndesc = ndesc;
 	TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link);
 
 	w = &eq->desc[eq->pidx];
 	IDXINCR(eq->pidx, ndesc, eq->sidx);
 	if (__predict_false(cookie->pidx + ndesc > eq->sidx)) {
 		w = &wrq->ss[0];
 		wrq->ss_pidx = cookie->pidx;
 		wrq->ss_len = len16 * 16;
 	}
 
 	EQ_UNLOCK(eq);
 
 	return (w);
 }
 
 void
 commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie)
 {
 	struct sge_eq *eq = &wrq->eq;
 	struct adapter *sc = wrq->adapter;
 	int ndesc, pidx;
 	struct wrq_cookie *prev, *next;
 
 	if (cookie->pidx == -1) {
 		struct wrqe *wr = __containerof(w, struct wrqe, wr);
 
 		t4_wrq_tx(sc, wr);
 		return;
 	}
 
 	if (__predict_false(w == &wrq->ss[0])) {
 		int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE;
 
 		MPASS(wrq->ss_len > n);	/* WR had better wrap around. */
 		bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n);
 		bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n);
 		wrq->tx_wrs_ss++;
 	} else
 		wrq->tx_wrs_direct++;
 
 	EQ_LOCK(eq);
 	ndesc = cookie->ndesc;	/* Can be more than SGE_MAX_WR_NDESC here. */
 	pidx = cookie->pidx;
 	MPASS(pidx >= 0 && pidx < eq->sidx);
 	prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link);
 	next = TAILQ_NEXT(cookie, link);
 	if (prev == NULL) {
 		MPASS(pidx == eq->dbidx);
 		if (next == NULL || ndesc >= 16) {
 			int available;
 			struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
 
 			/*
 			 * Note that the WR via which we'll request tx updates
 			 * is at pidx and not eq->pidx, which has moved on
 			 * already.
 			 */
 			dst = (void *)&eq->desc[pidx];
 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 			if (available < eq->sidx / 4 &&
 			    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 				/*
 				 * XXX: This is not 100% reliable with some
 				 * types of WRs.  But this is a very unusual
 				 * situation for an ofld/ctrl queue anyway.
 				 */
 				dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 				    F_FW_WR_EQUEQ);
 			}
 
 			ring_eq_db(wrq->adapter, eq, ndesc);
 		} else {
 			MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc);
 			next->pidx = pidx;
 			next->ndesc += ndesc;
 		}
 	} else {
 		MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc);
 		prev->ndesc += ndesc;
 	}
 	TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link);
 
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(sc, wrq);
 
 #ifdef INVARIANTS
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
 		/* Doorbell must have caught up to the pidx. */
 		MPASS(wrq->eq.pidx == wrq->eq.dbidx);
 	}
 #endif
 	EQ_UNLOCK(eq);
 }
 
 static u_int
 can_resume_eth_tx(struct mp_ring *r)
 {
 	struct sge_eq *eq = r->cookie;
 
 	return (total_available_tx_desc(eq) > eq->sidx / 8);
 }
 
 static inline int
 cannot_use_txpkts(struct mbuf *m)
 {
 	/* maybe put a GL limit too, to avoid silliness? */
 
 	return (needs_tso(m) || (mbuf_cflags(m) & MC_RAW_WR) != 0);
 }
 
 static inline int
 discard_tx(struct sge_eq *eq)
 {
 
 	return ((eq->flags & (EQ_ENABLED | EQ_QFLUSH)) != EQ_ENABLED);
 }
 
 static inline int
 wr_can_update_eq(struct fw_eth_tx_pkts_wr *wr)
 {
 
 	switch (G_FW_WR_OP(be32toh(wr->op_pkd))) {
 	case FW_ULPTX_WR:
 	case FW_ETH_TX_PKT_WR:
 	case FW_ETH_TX_PKTS_WR:
 	case FW_ETH_TX_PKT_VM_WR:
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 /*
  * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
  * be consumed.  Return the actual number consumed.  0 indicates a stall.
  */
 static u_int
 eth_tx(struct mp_ring *r, u_int cidx, u_int pidx)
 {
 	struct sge_txq *txq = r->cookie;
 	struct sge_eq *eq = &txq->eq;
 	struct ifnet *ifp = txq->ifp;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	u_int total, remaining;		/* # of packets */
 	u_int available, dbdiff;	/* # of hardware descriptors */
 	u_int n, next_cidx;
 	struct mbuf *m0, *tail;
 	struct txpkts txp;
 	struct fw_eth_tx_pkts_wr *wr;	/* any fw WR struct will do */
 
 	remaining = IDXDIFF(pidx, cidx, r->size);
 	MPASS(remaining > 0);	/* Must not be called without work to do. */
 	total = 0;
 
 	TXQ_LOCK(txq);
 	if (__predict_false(discard_tx(eq))) {
 		while (cidx != pidx) {
 			m0 = r->items[cidx];
 			m_freem(m0);
 			if (++cidx == r->size)
 				cidx = 0;
 		}
 		reclaim_tx_descs(txq, 2048);
 		total = remaining;
 		goto done;
 	}
 
 	/* How many hardware descriptors do we have readily available. */
 	if (eq->pidx == eq->cidx)
 		available = eq->sidx - 1;
 	else
 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
 
 	while (remaining > 0) {
 
 		m0 = r->items[cidx];
 		M_ASSERTPKTHDR(m0);
 		MPASS(m0->m_nextpkt == NULL);
 
 		if (available < SGE_MAX_WR_NDESC) {
 			available += reclaim_tx_descs(txq, 64);
 			if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16))
 				break;	/* out of descriptors */
 		}
 
 		next_cidx = cidx + 1;
 		if (__predict_false(next_cidx == r->size))
 			next_cidx = 0;
 
 		wr = (void *)&eq->desc[eq->pidx];
 		if (sc->flags & IS_VF) {
 			total++;
 			remaining--;
 			ETHER_BPF_MTAP(ifp, m0);
 			n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0,
 			    available);
 		} else if (remaining > 1 &&
 		    try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) {
 
 			/* pkts at cidx, next_cidx should both be in txp. */
 			MPASS(txp.npkt == 2);
 			tail = r->items[next_cidx];
 			MPASS(tail->m_nextpkt == NULL);
 			ETHER_BPF_MTAP(ifp, m0);
 			ETHER_BPF_MTAP(ifp, tail);
 			m0->m_nextpkt = tail;
 
 			if (__predict_false(++next_cidx == r->size))
 				next_cidx = 0;
 
 			while (next_cidx != pidx) {
 				if (add_to_txpkts(r->items[next_cidx], &txp,
 				    available) != 0)
 					break;
 				tail->m_nextpkt = r->items[next_cidx];
 				tail = tail->m_nextpkt;
 				ETHER_BPF_MTAP(ifp, tail);
 				if (__predict_false(++next_cidx == r->size))
 					next_cidx = 0;
 			}
 
 			n = write_txpkts_wr(txq, wr, m0, &txp, available);
 			total += txp.npkt;
 			remaining -= txp.npkt;
 		} else if (mbuf_cflags(m0) & MC_RAW_WR) {
 			total++;
 			remaining--;
 			n = write_raw_wr(txq, (void *)wr, m0, available);
 		} else {
 			total++;
 			remaining--;
 			ETHER_BPF_MTAP(ifp, m0);
 			n = write_txpkt_wr(txq, (void *)wr, m0, available);
 		}
 		MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC);
 
 		available -= n;
 		dbdiff += n;
 		IDXINCR(eq->pidx, n, eq->sidx);
 
 		if (wr_can_update_eq(wr)) {
 			if (total_available_tx_desc(eq) < eq->sidx / 4 &&
 			    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 				wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 				    F_FW_WR_EQUEQ);
 				eq->equeqidx = eq->pidx;
 			} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >=
 			    32) {
 				wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
 				eq->equeqidx = eq->pidx;
 			}
 		}
 
 		if (dbdiff >= 16 && remaining >= 4) {
 			ring_eq_db(sc, eq, dbdiff);
 			available += reclaim_tx_descs(txq, 4 * dbdiff);
 			dbdiff = 0;
 		}
 
 		cidx = next_cidx;
 	}
 	if (dbdiff != 0) {
 		ring_eq_db(sc, eq, dbdiff);
 		reclaim_tx_descs(txq, 32);
 	}
 done:
 	TXQ_UNLOCK(txq);
 
 	return (total);
 }
 
 static inline void
 init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
     int qsize)
 {
 
 	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
 	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
 	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
 	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
 
 	iq->flags = 0;
 	iq->adapter = sc;
 	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
 	iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
 	if (pktc_idx >= 0) {
 		iq->intr_params |= F_QINTR_CNT_EN;
 		iq->intr_pktc_idx = pktc_idx;
 	}
 	iq->qsize = roundup2(qsize, 16);	/* See FW_IQ_CMD/iqsize */
 	iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE;
 }
 
 static inline void
 init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
 {
 
 	fl->qsize = qsize;
 	fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
 	strlcpy(fl->lockname, name, sizeof(fl->lockname));
 	if (sc->flags & BUF_PACKING_OK &&
 	    ((!is_t4(sc) && buffer_packing) ||	/* T5+: enabled unless 0 */
 	    (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */
 		fl->flags |= FL_BUF_PACKING;
 	find_best_refill_source(sc, fl, maxp);
 	find_safe_refill_source(sc, fl);
 }
 
 static inline void
 init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize,
     uint8_t tx_chan, uint16_t iqid, char *name)
 {
 	KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype));
 
 	eq->flags = eqtype & EQ_TYPEMASK;
 	eq->tx_chan = tx_chan;
 	eq->iqid = iqid;
 	eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
 	strlcpy(eq->lockname, name, sizeof(eq->lockname));
 }
 
 static int
 alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
     bus_dmamap_t *map, bus_addr_t *pa, void **va)
 {
 	int rc;
 
 	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
 	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
 	if (rc != 0) {
 		device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc);
 		goto done;
 	}
 
 	rc = bus_dmamem_alloc(*tag, va,
 	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
 	if (rc != 0) {
 		device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc);
 		goto done;
 	}
 
 	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
 	if (rc != 0) {
 		device_printf(sc->dev, "cannot load DMA map: %d\n", rc);
 		goto done;
 	}
 done:
 	if (rc)
 		free_ring(sc, *tag, *map, *pa, *va);
 
 	return (rc);
 }
 
 static int
 free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
     bus_addr_t pa, void *va)
 {
 	if (pa)
 		bus_dmamap_unload(tag, map);
 	if (va)
 		bus_dmamem_free(tag, va, map);
 	if (tag)
 		bus_dma_tag_destroy(tag);
 
 	return (0);
 }
 
 /*
  * Allocates the ring for an ingress queue and an optional freelist.  If the
  * freelist is specified it will be allocated and then associated with the
  * ingress queue.
  *
  * Returns errno on failure.  Resources allocated up to that point may still be
  * allocated.  Caller is responsible for cleanup in case this function fails.
  *
  * If the ingress queue will take interrupts directly then the intr_idx
  * specifies the vector, starting from 0.  -1 means the interrupts for this
  * queue should be forwarded to the fwq.
  */
 static int
 alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl,
     int intr_idx, int cong)
 {
 	int rc, i, cntxt_id;
 	size_t len;
 	struct fw_iq_cmd c;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = iq->adapter;
 	struct sge_params *sp = &sc->params.sge;
 	__be32 v = 0;
 
 	len = iq->qsize * IQ_ESIZE;
 	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
 	    (void **)&iq->desc);
 	if (rc != 0)
 		return (rc);
 
 	bzero(&c, sizeof(c));
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
 	    V_FW_IQ_CMD_VFN(0));
 
 	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
 	    FW_LEN16(c));
 
 	/* Special handling for firmware event queue */
 	if (iq == &sc->sge.fwq)
 		v |= F_FW_IQ_CMD_IQASYNCH;
 
 	if (intr_idx < 0) {
 		/* Forwarded interrupts, all headed to fwq */
 		v |= F_FW_IQ_CMD_IQANDST;
 		v |= V_FW_IQ_CMD_IQANDSTINDEX(sc->sge.fwq.cntxt_id);
 	} else {
 		KASSERT(intr_idx < sc->intr_count,
 		    ("%s: invalid direct intr_idx %d", __func__, intr_idx));
 		v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
 	}
 
 	c.type_to_iqandstindex = htobe32(v |
 	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
 	    V_FW_IQ_CMD_VIID(vi->viid) |
 	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
 	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
 	    F_FW_IQ_CMD_IQGTSMODE |
 	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
 	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
 	c.iqsize = htobe16(iq->qsize);
 	c.iqaddr = htobe64(iq->ba);
 	if (cong >= 0)
 		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
 
 	if (fl) {
 		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
 
 		len = fl->qsize * EQ_ESIZE;
 		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
 		    &fl->ba, (void **)&fl->desc);
 		if (rc)
 			return (rc);
 
 		/* Allocate space for one software descriptor per buffer. */
 		rc = alloc_fl_sdesc(fl);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to setup fl software descriptors: %d\n",
 			    rc);
 			return (rc);
 		}
 
 		if (fl->flags & FL_BUF_PACKING) {
 			fl->lowat = roundup2(sp->fl_starve_threshold2, 8);
 			fl->buf_boundary = sp->pack_boundary;
 		} else {
 			fl->lowat = roundup2(sp->fl_starve_threshold, 8);
 			fl->buf_boundary = 16;
 		}
 		if (fl_pad && fl->buf_boundary < sp->pad_boundary)
 			fl->buf_boundary = sp->pad_boundary;
 
 		c.iqns_to_fl0congen |=
 		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
 			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
 			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
 			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
 			    0));
 		if (cong >= 0) {
 			c.iqns_to_fl0congen |=
 				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
 				    F_FW_IQ_CMD_FL0CONGCIF |
 				    F_FW_IQ_CMD_FL0CONGEN);
 		}
 		c.fl0dcaen_to_fl0cidxfthresh =
 		    htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 			X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B) |
 			V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ?
 			X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B));
 		c.fl0size = htobe16(fl->qsize);
 		c.fl0addr = htobe64(fl->ba);
 	}
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create ingress queue: %d\n", rc);
 		return (rc);
 	}
 
 	iq->cidx = 0;
 	iq->gen = F_RSPD_GEN;
 	iq->intr_next = iq->intr_params;
 	iq->cntxt_id = be16toh(c.iqid);
 	iq->abs_id = be16toh(c.physiqid);
 	iq->flags |= IQ_ALLOCATED;
 
 	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
 	if (cntxt_id >= sc->sge.niq) {
 		panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
 		    cntxt_id, sc->sge.niq - 1);
 	}
 	sc->sge.iqmap[cntxt_id] = iq;
 
 	if (fl) {
 		u_int qid;
 
 		iq->flags |= IQ_HAS_FL;
 		fl->cntxt_id = be16toh(c.fl0id);
 		fl->pidx = fl->cidx = 0;
 
 		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
 		if (cntxt_id >= sc->sge.neq) {
 			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
 			    __func__, cntxt_id, sc->sge.neq - 1);
 		}
 		sc->sge.eqmap[cntxt_id] = (void *)fl;
 
 		qid = fl->cntxt_id;
 		if (isset(&sc->doorbells, DOORBELL_UDB)) {
 			uint32_t s_qpp = sc->params.sge.eq_s_qpp;
 			uint32_t mask = (1 << s_qpp) - 1;
 			volatile uint8_t *udb;
 
 			udb = sc->udbs_base + UDBS_DB_OFFSET;
 			udb += (qid >> s_qpp) << PAGE_SHIFT;
 			qid &= mask;
 			if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
 				udb += qid << UDBS_SEG_SHIFT;
 				qid = 0;
 			}
 			fl->udb = (volatile void *)udb;
 		}
 		fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db;
 
 		FL_LOCK(fl);
 		/* Enough to make sure the SGE doesn't think it's starved */
 		refill_fl(sc, fl, fl->lowat);
 		FL_UNLOCK(fl);
 	}
 
 	if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) && cong >= 0) {
 		uint32_t param, val;
 
 		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
 		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
 		if (cong == 0)
 			val = 1 << 19;
 		else {
 			val = 2 << 19;
 			for (i = 0; i < 4; i++) {
 				if (cong & (1 << i))
 					val |= 1 << (i << 2);
 			}
 		}
 
 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		if (rc != 0) {
 			/* report error but carry on */
 			device_printf(sc->dev,
 			    "failed to set congestion manager context for "
 			    "ingress queue %d: %d\n", iq->cntxt_id, rc);
 		}
 	}
 
 	/* Enable IQ interrupts */
 	atomic_store_rel_int(&iq->state, IQS_IDLE);
 	t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) |
 	    V_INGRESSQID(iq->cntxt_id));
 
 	return (0);
 }
 
 static int
 free_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl)
 {
 	int rc;
 	struct adapter *sc = iq->adapter;
 	device_t dev;
 
 	if (sc == NULL)
 		return (0);	/* nothing to do */
 
 	dev = vi ? vi->dev : sc->dev;
 
 	if (iq->flags & IQ_ALLOCATED) {
 		rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
 		    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
 		    fl ? fl->cntxt_id : 0xffff, 0xffff);
 		if (rc != 0) {
 			device_printf(dev,
 			    "failed to free queue %p: %d\n", iq, rc);
 			return (rc);
 		}
 		iq->flags &= ~IQ_ALLOCATED;
 	}
 
 	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
 
 	bzero(iq, sizeof(*iq));
 
 	if (fl) {
 		free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba,
 		    fl->desc);
 
 		if (fl->sdesc)
 			free_fl_sdesc(sc, fl);
 
 		if (mtx_initialized(&fl->fl_lock))
 			mtx_destroy(&fl->fl_lock);
 
 		bzero(fl, sizeof(*fl));
 	}
 
 	return (0);
 }
 
 static void
 add_iq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
     struct sge_iq *iq)
 {
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &iq->ba,
 	    "bus address of descriptor ring");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    iq->qsize * IQ_ESIZE, "descriptor ring size in bytes");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &iq->abs_id, 0, sysctl_uint16, "I",
 	    "absolute id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &iq->cntxt_id, 0, sysctl_uint16, "I",
 	    "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &iq->cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 }
 
 static void
 add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
     struct sysctl_oid *oid, struct sge_fl *fl)
 {
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
 	    "freelist");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &fl->ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    fl->sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I",
 	    "SGE context id of the freelist");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL,
 	    fl_pad ? 1 : 0, "padding enabled");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL,
 	    fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
 	    0, "consumer index");
 	if (fl->flags & FL_BUF_PACKING) {
 		SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset",
 		    CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
 	}
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
 	    0, "producer index");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated",
 	    CTLFLAG_RD, &fl->mbuf_allocated, "# of mbuf allocated");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined",
 	    CTLFLAG_RD, &fl->mbuf_inlined, "# of mbuf inlined in clusters");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated",
 	    CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled",
 	    CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled",
 	    CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
 }
 
 static int
 alloc_fwq(struct adapter *sc)
 {
 	int rc, intr_idx;
 	struct sge_iq *fwq = &sc->sge.fwq;
 	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE);
 	if (sc->flags & IS_VF)
 		intr_idx = 0;
 	else
 		intr_idx = sc->intr_count > 1 ? 1 : 0;
 	rc = alloc_iq_fl(&sc->port[0]->vi[0], fwq, NULL, intr_idx, -1);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create firmware event queue: %d\n", rc);
 		return (rc);
 	}
 
 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD,
 	    NULL, "firmware event queue");
 	add_iq_sysctls(&sc->ctx, oid, fwq);
 
 	return (0);
 }
 
 static int
 free_fwq(struct adapter *sc)
 {
 	return free_iq_fl(NULL, &sc->sge.fwq, NULL);
 }
 
 static int
 alloc_ctrlq(struct adapter *sc, struct sge_wrq *ctrlq, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	char name[16];
 	struct sysctl_oid_list *children;
 
 	snprintf(name, sizeof(name), "%s ctrlq%d", device_get_nameunit(sc->dev),
 	    idx);
 	init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[idx]->tx_chan,
 	    sc->sge.fwq.cntxt_id, name);
 
 	children = SYSCTL_CHILDREN(oid);
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "ctrl queue");
 	rc = alloc_wrq(sc, NULL, ctrlq, oid);
 
 	return (rc);
 }
 
 int
 tnl_cong(struct port_info *pi, int drop)
 {
 
 	if (drop == -1)
 		return (-1);
 	else if (drop == 1)
 		return (0);
 	else
 		return (pi->rx_e_chan_map);
 }
 
 static int
 alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int intr_idx, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sysctl_oid_list *children;
 	char name[16];
 
 	rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx,
 	    tnl_cong(vi->pi, cong_drop));
 	if (rc != 0)
 		return (rc);
 
 	if (idx == 0)
 		sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id;
 	else
 		KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id,
 		    ("iq_base mismatch"));
 	KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF,
 	    ("PF with non-zero iq_base"));
 
 	/*
 	 * The freelist is just barely above the starvation threshold right now,
 	 * fill it up a bit more.
 	 */
 	FL_LOCK(&rxq->fl);
 	refill_fl(sc, &rxq->fl, 128);
 	FL_UNLOCK(&rxq->fl);
 
 #if defined(INET) || defined(INET6)
 	rc = tcp_lro_init_args(&rxq->lro, vi->ifp, lro_entries, lro_mbufs);
 	if (rc != 0)
 		return (rc);
 	MPASS(rxq->lro.ifp == vi->ifp);	/* also indicates LRO init'ed */
 
 	if (vi->ifp->if_capenable & IFCAP_LRO)
 		rxq->iq.flags |= IQ_LRO_ENABLED;
 #endif
 	if (vi->ifp->if_capenable & IFCAP_HWRXTSTMP)
 		rxq->iq.flags |= IQ_RX_TIMESTAMP;
 	rxq->ifp = vi->ifp;
 
 	children = SYSCTL_CHILDREN(oid);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "rx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	add_iq_sysctls(&vi->ctx, oid, &rxq->iq);
 #if defined(INET) || defined(INET6)
 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
 	    &rxq->lro.lro_queued, 0, NULL);
 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
 	    &rxq->lro.lro_flushed, 0, NULL);
 #endif
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
 	    &rxq->rxcsum, "# of times hardware assisted with checksum");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction",
 	    CTLFLAG_RD, &rxq->vlan_extraction,
 	    "# of times hardware extracted 802.1Q tag");
 
 	add_fl_sysctls(sc, &vi->ctx, oid, &rxq->fl);
 
 	return (rc);
 }
 
 static int
 free_rxq(struct vi_info *vi, struct sge_rxq *rxq)
 {
 	int rc;
 
 #if defined(INET) || defined(INET6)
 	if (rxq->lro.ifp) {
 		tcp_lro_free(&rxq->lro);
 		rxq->lro.ifp = NULL;
 	}
 #endif
 
 	rc = free_iq_fl(vi, &rxq->iq, &rxq->fl);
 	if (rc == 0)
 		bzero(rxq, sizeof(*rxq));
 
 	return (rc);
 }
 
 #ifdef TCP_OFFLOAD
 static int
 alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq,
     int intr_idx, int idx, struct sysctl_oid *oid)
 {
 	struct port_info *pi = vi->pi;
 	int rc;
 	struct sysctl_oid_list *children;
 	char name[16];
 
 	rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx, 0);
 	if (rc != 0)
 		return (rc);
 
 	children = SYSCTL_CHILDREN(oid);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "rx queue");
 	add_iq_sysctls(&vi->ctx, oid, &ofld_rxq->iq);
 	add_fl_sysctls(pi->adapter, &vi->ctx, oid, &ofld_rxq->fl);
 
 	return (rc);
 }
 
 static int
 free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq)
 {
 	int rc;
 
 	rc = free_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl);
 	if (rc == 0)
 		bzero(ofld_rxq, sizeof(*ofld_rxq));
 
 	return (rc);
 }
 #endif
 
 #ifdef DEV_NETMAP
 static int
 alloc_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int intr_idx,
     int idx, struct sysctl_oid *oid)
 {
 	int rc;
 	struct sysctl_oid_list *children;
 	struct sysctl_ctx_list *ctx;
 	char name[16];
 	size_t len;
 	struct adapter *sc = vi->pi->adapter;
 	struct netmap_adapter *na = NA(vi->ifp);
 
 	MPASS(na != NULL);
 
 	len = vi->qsize_rxq * IQ_ESIZE;
 	rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map,
 	    &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc);
 	if (rc != 0)
 		return (rc);
 
 	len = na->num_rx_desc * EQ_ESIZE + sc->params.sge.spg_len;
 	rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map,
 	    &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc);
 	if (rc != 0)
 		return (rc);
 
 	nm_rxq->vi = vi;
 	nm_rxq->nid = idx;
 	nm_rxq->iq_cidx = 0;
 	nm_rxq->iq_sidx = vi->qsize_rxq - sc->params.sge.spg_len / IQ_ESIZE;
 	nm_rxq->iq_gen = F_RSPD_GEN;
 	nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0;
 	nm_rxq->fl_sidx = na->num_rx_desc;
 	nm_rxq->intr_idx = intr_idx;
 	nm_rxq->iq_cntxt_id = INVALID_NM_RXQ_CNTXT_ID;
 
 	ctx = &vi->ctx;
 	children = SYSCTL_CHILDREN(oid);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL,
 	    "rx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16,
 	    "I", "absolute id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16,
 	    "I", "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 
 	children = SYSCTL_CHILDREN(oid);
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
 	    "freelist");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16,
 	    "I", "SGE context id of the freelist");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
 	    &nm_rxq->fl_cidx, 0, "consumer index");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
 	    &nm_rxq->fl_pidx, 0, "producer index");
 
 	return (rc);
 }
 
 
 static int
 free_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq)
 {
 	struct adapter *sc = vi->pi->adapter;
 
 	if (vi->flags & VI_INIT_DONE)
 		MPASS(nm_rxq->iq_cntxt_id == INVALID_NM_RXQ_CNTXT_ID);
 	else
 		MPASS(nm_rxq->iq_cntxt_id == 0);
 
 	free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba,
 	    nm_rxq->iq_desc);
 	free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba,
 	    nm_rxq->fl_desc);
 
 	return (0);
 }
 
 static int
 alloc_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq, int iqidx, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	size_t len;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct netmap_adapter *na = NA(vi->ifp);
 	char name[16];
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len;
 	rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map,
 	    &nm_txq->ba, (void **)&nm_txq->desc);
 	if (rc)
 		return (rc);
 
 	nm_txq->pidx = nm_txq->cidx = 0;
 	nm_txq->sidx = na->num_tx_desc;
 	nm_txq->nid = idx;
 	nm_txq->iqidx = iqidx;
 	nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
 	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
 	    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
 	nm_txq->cntxt_id = INVALID_NM_TXQ_CNTXT_ID;
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "netmap tx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &nm_txq->cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I",
 	    "producer index");
 
 	return (rc);
 }
 
 static int
 free_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq)
 {
 	struct adapter *sc = vi->pi->adapter;
 
 	if (vi->flags & VI_INIT_DONE)
 		MPASS(nm_txq->cntxt_id == INVALID_NM_TXQ_CNTXT_ID);
 	else
 		MPASS(nm_txq->cntxt_id == 0);
 
 	free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba,
 	    nm_txq->desc);
 
 	return (0);
 }
 #endif
 
 /*
  * Returns a reasonable automatic cidx flush threshold for a given queue size.
  */
 static u_int
 qsize_to_fthresh(int qsize)
 {
 	u_int fthresh;
 
 	while (!powerof2(qsize))
 		qsize++;
 	fthresh = ilog2(qsize);
 	if (fthresh > X_CIDXFLUSHTHRESH_128)
 		fthresh = X_CIDXFLUSHTHRESH_128;
 
 	return (fthresh);
 }
 
 static int
 ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_ctrl_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
 	    V_FW_EQ_CTRL_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
 	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
 	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
 	c.physeqid_pkd = htobe32(0);
 	c.fetchszm_to_iqid =
 	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
 		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
 		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_CTRL_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
 		V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create control queue %d: %d\n", eq->tx_chan, rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 
 static int
 eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_eth_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
 	    V_FW_EQ_ETH_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
 	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
 	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
 	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid));
 	c.fetchszm_to_iqid =
 	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
 		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 	    V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 	    V_FW_EQ_ETH_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(vi->dev,
 		    "failed to create Ethernet egress queue: %d\n", rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
 	eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 static int
 ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_ofld_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
 	    V_FW_EQ_OFLD_CMD_VFN(0));
 	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
 	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
 	c.fetchszm_to_iqid =
 		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
 		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
 		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_OFLD_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
 		V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(vi->dev,
 		    "failed to create egress queue for TCP offload: %d\n", rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 #endif
 
 static int
 alloc_eq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, qsize;
 	size_t len;
 
 	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
 
 	qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 	len = qsize * EQ_ESIZE;
 	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
 	    &eq->ba, (void **)&eq->desc);
 	if (rc)
 		return (rc);
 
 	eq->pidx = eq->cidx = eq->dbidx = 0;
 	/* Note that equeqidx is not used with sge_wrq (OFLD/CTRL) queues. */
 	eq->equeqidx = 0;
 	eq->doorbells = sc->doorbells;
 
 	switch (eq->flags & EQ_TYPEMASK) {
 	case EQ_CTRL:
 		rc = ctrl_eq_alloc(sc, eq);
 		break;
 
 	case EQ_ETH:
 		rc = eth_eq_alloc(sc, vi, eq);
 		break;
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	case EQ_OFLD:
 		rc = ofld_eq_alloc(sc, vi, eq);
 		break;
 #endif
 
 	default:
 		panic("%s: invalid eq type %d.", __func__,
 		    eq->flags & EQ_TYPEMASK);
 	}
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to allocate egress queue(%d): %d\n",
 		    eq->flags & EQ_TYPEMASK, rc);
 	}
 
 	if (isset(&eq->doorbells, DOORBELL_UDB) ||
 	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
 	    isset(&eq->doorbells, DOORBELL_WCWR)) {
 		uint32_t s_qpp = sc->params.sge.eq_s_qpp;
 		uint32_t mask = (1 << s_qpp) - 1;
 		volatile uint8_t *udb;
 
 		udb = sc->udbs_base + UDBS_DB_OFFSET;
 		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
 		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
 		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
 	    		clrbit(&eq->doorbells, DOORBELL_WCWR);
 		else {
 			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
 			eq->udb_qid = 0;
 		}
 		eq->udb = (volatile void *)udb;
 	}
 
 	return (rc);
 }
 
 static int
 free_eq(struct adapter *sc, struct sge_eq *eq)
 {
 	int rc;
 
 	if (eq->flags & EQ_ALLOCATED) {
 		switch (eq->flags & EQ_TYPEMASK) {
 		case EQ_CTRL:
 			rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 
 		case EQ_ETH:
 			rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 		case EQ_OFLD:
 			rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 #endif
 
 		default:
 			panic("%s: invalid eq type %d.", __func__,
 			    eq->flags & EQ_TYPEMASK);
 		}
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to free egress queue (%d): %d\n",
 			    eq->flags & EQ_TYPEMASK, rc);
 			return (rc);
 		}
 		eq->flags &= ~EQ_ALLOCATED;
 	}
 
 	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
 
 	if (mtx_initialized(&eq->eq_lock))
 		mtx_destroy(&eq->eq_lock);
 
 	bzero(eq, sizeof(*eq));
 	return (0);
 }
 
 static int
 alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq,
     struct sysctl_oid *oid)
 {
 	int rc;
 	struct sysctl_ctx_list *ctx = vi ? &vi->ctx : &sc->ctx;
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	rc = alloc_eq(sc, vi, &wrq->eq);
 	if (rc)
 		return (rc);
 
 	wrq->adapter = sc;
 	TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
 	TAILQ_INIT(&wrq->incomplete_wrs);
 	STAILQ_INIT(&wrq->wr_list);
 	wrq->nwr_pending = 0;
 	wrq->ndesc_needed = 0;
 
 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &wrq->eq.ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    wrq->eq.sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I",
 	    "producer index");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
 	    wrq->eq.sidx, "status page index");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD,
 	    &wrq->tx_wrs_direct, "# of work requests (direct)");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD,
 	    &wrq->tx_wrs_copied, "# of work requests (copied)");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD,
 	    &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)");
 
 	return (rc);
 }
 
 static int
 free_wrq(struct adapter *sc, struct sge_wrq *wrq)
 {
 	int rc;
 
 	rc = free_eq(sc, &wrq->eq);
 	if (rc)
 		return (rc);
 
 	bzero(wrq, sizeof(*wrq));
 	return (0);
 }
 
 static int
 alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct sge_eq *eq = &txq->eq;
 	char name[16];
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
 	    M_CXGBE, M_WAITOK);
 	if (rc != 0) {
 		device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc);
 		return (rc);
 	}
 
 	rc = alloc_eq(sc, vi, eq);
 	if (rc != 0) {
 		mp_ring_free(txq->r);
 		txq->r = NULL;
 		return (rc);
 	}
 
 	/* Can't fail after this point. */
 
 	if (idx == 0)
 		sc->sge.eq_base = eq->abs_id - eq->cntxt_id;
 	else
 		KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id,
 		    ("eq_base mismatch"));
 	KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF,
 	    ("PF with non-zero eq_base"));
 
 	TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
 	txq->ifp = vi->ifp;
 	txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
 	if (sc->flags & IS_VF)
 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
 		    V_TXPKT_INTF(pi->tx_chan));
 	else
 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
 		    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
 		    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
 	txq->tc_idx = -1;
 	txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "tx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &eq->ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    eq->sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD,
 	    &eq->abs_id, 0, "absolute id of the queue");
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &eq->cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I",
 	    "producer index");
 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
 	    eq->sidx, "status page index");
 
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "tc",
 	    CTLTYPE_INT | CTLFLAG_RW, vi, idx, sysctl_tc, "I",
 	    "traffic class (-1 means none)");
 
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
 	    &txq->txcsum, "# of times hardware assisted with checksum");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_insertion",
 	    CTLFLAG_RD, &txq->vlan_insertion,
 	    "# of times hardware inserted 802.1Q tag");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
 	    &txq->tso_wrs, "# of TSO work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
 	    &txq->imm_wrs, "# of work requests with immediate data");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
 	    &txq->sgl_wrs, "# of work requests with direct SGL");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
 	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_wrs",
 	    CTLFLAG_RD, &txq->txpkts0_wrs,
 	    "# of txpkts (type 0) work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_wrs",
 	    CTLFLAG_RD, &txq->txpkts1_wrs,
 	    "# of txpkts (type 1) work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_pkts",
 	    CTLFLAG_RD, &txq->txpkts0_pkts,
 	    "# of frames tx'd using type0 txpkts work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_pkts",
 	    CTLFLAG_RD, &txq->txpkts1_pkts,
 	    "# of frames tx'd using type1 txpkts work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD,
 	    &txq->raw_wrs, "# of raw work requests (non-packets)");
 
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues",
 	    CTLFLAG_RD, &txq->r->enqueues,
 	    "# of enqueues to the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops",
 	    CTLFLAG_RD, &txq->r->drops,
 	    "# of drops in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts",
 	    CTLFLAG_RD, &txq->r->starts,
 	    "# of normal consumer starts in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls",
 	    CTLFLAG_RD, &txq->r->stalls,
 	    "# of consumer stalls in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts",
 	    CTLFLAG_RD, &txq->r->restarts,
 	    "# of consumer restarts in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications",
 	    CTLFLAG_RD, &txq->r->abdications,
 	    "# of consumer abdications in the mp_ring for this queue");
 
 	return (0);
 }
 
 static int
 free_txq(struct vi_info *vi, struct sge_txq *txq)
 {
 	int rc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sge_eq *eq = &txq->eq;
 
 	rc = free_eq(sc, eq);
 	if (rc)
 		return (rc);
 
 	sglist_free(txq->gl);
 	free(txq->sdesc, M_CXGBE);
 	mp_ring_free(txq->r);
 
 	bzero(txq, sizeof(*txq));
 	return (0);
 }
 
 static void
 oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 {
 	bus_addr_t *ba = arg;
 
 	KASSERT(nseg == 1,
 	    ("%s meant for single segment mappings only.", __func__));
 
 	*ba = error ? 0 : segs->ds_addr;
 }
 
 static inline void
 ring_fl_db(struct adapter *sc, struct sge_fl *fl)
 {
 	uint32_t n, v;
 
 	n = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx);
 	MPASS(n > 0);
 
 	wmb();
 	v = fl->dbval | V_PIDX(n);
 	if (fl->udb)
 		*fl->udb = htole32(v);
 	else
 		t4_write_reg(sc, sc->sge_kdoorbell_reg, v);
 	IDXINCR(fl->dbidx, n, fl->sidx);
 }
 
 /*
  * Fills up the freelist by allocating up to 'n' buffers.  Buffers that are
  * recycled do not count towards this allocation budget.
  *
  * Returns non-zero to indicate that this freelist should be added to the list
  * of starving freelists.
  */
 static int
 refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
 {
 	__be64 *d;
 	struct fl_sdesc *sd;
 	uintptr_t pa;
 	caddr_t cl;
 	struct cluster_layout *cll;
 	struct sw_zone_info *swz;
 	struct cluster_metadata *clm;
 	uint16_t max_pidx;
 	uint16_t hw_cidx = fl->hw_cidx;		/* stable snapshot */
 
 	FL_LOCK_ASSERT_OWNED(fl);
 
 	/*
 	 * We always stop at the beginning of the hardware descriptor that's just
 	 * before the one with the hw cidx.  This is to avoid hw pidx = hw cidx,
 	 * which would mean an empty freelist to the chip.
 	 */
 	max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
 	if (fl->pidx == max_pidx * 8)
 		return (0);
 
 	d = &fl->desc[fl->pidx];
 	sd = &fl->sdesc[fl->pidx];
 	cll = &fl->cll_def;	/* default layout */
 	swz = &sc->sge.sw_zone_info[cll->zidx];
 
 	while (n > 0) {
 
 		if (sd->cl != NULL) {
 
 			if (sd->nmbuf == 0) {
 				/*
 				 * Fast recycle without involving any atomics on
 				 * the cluster's metadata (if the cluster has
 				 * metadata).  This happens when all frames
 				 * received in the cluster were small enough to
 				 * fit within a single mbuf each.
 				 */
 				fl->cl_fast_recycled++;
 #ifdef INVARIANTS
 				clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
 				if (clm != NULL)
 					MPASS(clm->refcount == 1);
 #endif
 				goto recycled_fast;
 			}
 
 			/*
 			 * Cluster is guaranteed to have metadata.  Clusters
 			 * without metadata always take the fast recycle path
 			 * when they're recycled.
 			 */
 			clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
 			MPASS(clm != NULL);
 
 			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
 				fl->cl_recycled++;
 				counter_u64_add(extfree_rels, 1);
 				goto recycled;
 			}
 			sd->cl = NULL;	/* gave up my reference */
 		}
 		MPASS(sd->cl == NULL);
 alloc:
 		cl = uma_zalloc(swz->zone, M_NOWAIT);
 		if (__predict_false(cl == NULL)) {
 			if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 ||
 			    fl->cll_def.zidx == fl->cll_alt.zidx)
 				break;
 
 			/* fall back to the safe zone */
 			cll = &fl->cll_alt;
 			swz = &sc->sge.sw_zone_info[cll->zidx];
 			goto alloc;
 		}
 		fl->cl_allocated++;
 		n--;
 
 		pa = pmap_kextract((vm_offset_t)cl);
 		pa += cll->region1;
 		sd->cl = cl;
 		sd->cll = *cll;
 		*d = htobe64(pa | cll->hwidx);
 		clm = cl_metadata(sc, fl, cll, cl);
 		if (clm != NULL) {
 recycled:
 #ifdef INVARIANTS
 			clm->sd = sd;
 #endif
 			clm->refcount = 1;
 		}
 		sd->nmbuf = 0;
 recycled_fast:
 		d++;
 		sd++;
 		if (__predict_false(++fl->pidx % 8 == 0)) {
 			uint16_t pidx = fl->pidx / 8;
 
 			if (__predict_false(pidx == fl->sidx)) {
 				fl->pidx = 0;
 				pidx = 0;
 				sd = fl->sdesc;
 				d = fl->desc;
 			}
 			if (pidx == max_pidx)
 				break;
 
 			if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
 				ring_fl_db(sc, fl);
 		}
 	}
 
 	if (fl->pidx / 8 != fl->dbidx)
 		ring_fl_db(sc, fl);
 
 	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
 }
 
 /*
  * Attempt to refill all starving freelists.
  */
 static void
 refill_sfl(void *arg)
 {
 	struct adapter *sc = arg;
 	struct sge_fl *fl, *fl_temp;
 
 	mtx_assert(&sc->sfl_lock, MA_OWNED);
 	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
 		FL_LOCK(fl);
 		refill_fl(sc, fl, 64);
 		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
 			TAILQ_REMOVE(&sc->sfl, fl, link);
 			fl->flags &= ~FL_STARVING;
 		}
 		FL_UNLOCK(fl);
 	}
 
 	if (!TAILQ_EMPTY(&sc->sfl))
 		callout_schedule(&sc->sfl_callout, hz / 5);
 }
 
 static int
 alloc_fl_sdesc(struct sge_fl *fl)
 {
 
 	fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	return (0);
 }
 
 static void
 free_fl_sdesc(struct adapter *sc, struct sge_fl *fl)
 {
 	struct fl_sdesc *sd;
 	struct cluster_metadata *clm;
 	struct cluster_layout *cll;
 	int i;
 
 	sd = fl->sdesc;
 	for (i = 0; i < fl->sidx * 8; i++, sd++) {
 		if (sd->cl == NULL)
 			continue;
 
 		cll = &sd->cll;
 		clm = cl_metadata(sc, fl, cll, sd->cl);
 		if (sd->nmbuf == 0)
 			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
 		else if (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1) {
 			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
 			counter_u64_add(extfree_rels, 1);
 		}
 		sd->cl = NULL;
 	}
 
 	free(fl->sdesc, M_CXGBE);
 	fl->sdesc = NULL;
 }
 
 static inline void
 get_pkt_gl(struct mbuf *m, struct sglist *gl)
 {
 	int rc;
 
 	M_ASSERTPKTHDR(m);
 
 	sglist_reset(gl);
 	rc = sglist_append_mbuf(gl, m);
 	if (__predict_false(rc != 0)) {
 		panic("%s: mbuf %p (%d segs) was vetted earlier but now fails "
 		    "with %d.", __func__, m, mbuf_nsegs(m), rc);
 	}
 
 	KASSERT(gl->sg_nseg == mbuf_nsegs(m),
 	    ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
 	    mbuf_nsegs(m), gl->sg_nseg));
 	KASSERT(gl->sg_nseg > 0 &&
 	    gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
 	    ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
 		gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
 }
 
 /*
  * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
  */
 static inline u_int
 txpkt_len16(u_int nsegs, u_int tso)
 {
 	u_int n;
 
 	MPASS(nsegs > 0);
 
 	nsegs--; /* first segment is part of ulptx_sgl */
 	n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) +
 	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
 	if (tso)
 		n += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(n, 16));
 }
 
 /*
  * len16 for a txpkt_vm WR with a GL.  Includes the firmware work
  * request header.
  */
 static inline u_int
 txpkt_vm_len16(u_int nsegs, u_int tso)
 {
 	u_int n;
 
 	MPASS(nsegs > 0);
 
 	nsegs--; /* first segment is part of ulptx_sgl */
 	n = sizeof(struct fw_eth_tx_pkt_vm_wr) +
 	    sizeof(struct cpl_tx_pkt_core) +
 	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
 	if (tso)
 		n += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(n, 16));
 }
 
 /*
  * len16 for a txpkts type 0 WR with a GL.  Does not include the firmware work
  * request header.
  */
 static inline u_int
 txpkts0_len16(u_int nsegs)
 {
 	u_int n;
 
 	MPASS(nsegs > 0);
 
 	nsegs--; /* first segment is part of ulptx_sgl */
 	n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) +
 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) +
 	    8 * ((3 * nsegs) / 2 + (nsegs & 1));
 
 	return (howmany(n, 16));
 }
 
 /*
  * len16 for a txpkts type 1 WR with a GL.  Does not include the firmware work
  * request header.
  */
 static inline u_int
 txpkts1_len16(void)
 {
 	u_int n;
 
 	n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
 
 	return (howmany(n, 16));
 }
 
 static inline u_int
 imm_payload(u_int ndesc)
 {
 	u_int n;
 
 	n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) -
 	    sizeof(struct cpl_tx_pkt_core);
 
 	return (n);
 }
 
 /*
  * Write a VM txpkt WR for this packet to the hardware descriptors, update the
  * software descriptor, and advance the pidx.  It is guaranteed that enough
  * descriptors are available.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq,
     struct fw_eth_tx_pkt_vm_wr *wr, struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
 	int csum_type, len16, ndesc, pktlen, nsegs;
 	caddr_t dst;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m0);
 	MPASS(available > 0 && available < eq->sidx);
 
 	len16 = mbuf_len16(m0);
 	nsegs = mbuf_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3[0] = 0;
 	wr->r3[1] = 0;
 	
 	/*
 	 * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci.
 	 * vlantci is ignored unless the ethtype is 0x8100, so it's
 	 * simpler to always copy it rather than making it
 	 * conditional.  Also, it seems that we do not have to set
 	 * vlantci or fake the ethtype when doing VLAN tag insertion.
 	 */
 	m_copydata(m0, 0, sizeof(struct ether_header) + 2, wr->ethmacdst);
 
 	csum_type = -1;
 	if (needs_tso(m0)) {
 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 		    m0->m_pkthdr.l4hlen > 0,
 		    ("%s: mbuf %p needs TSO but missing header lengths",
 			__func__, m0));
 
 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
 		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
 		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
 			ctrl |= V_LSO_ETHHDR_LEN(1);
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			ctrl |= F_LSO_IPV6;
 
 		lso->lso_ctrl = htobe32(ctrl);
 		lso->ipid_ofst = htobe16(0);
 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 		lso->seqno_offset = htobe32(0);
 		lso->len = htobe32(pktlen);
 
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			csum_type = TX_CSUM_TCPIP6;
 		else
 			csum_type = TX_CSUM_TCPIP;
 
 		cpl = (void *)(lso + 1);
 
 		txq->tso_wrs++;
 	} else {
 		if (m0->m_pkthdr.csum_flags & CSUM_IP_TCP)
 			csum_type = TX_CSUM_TCPIP;
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP_UDP)
 			csum_type = TX_CSUM_UDPIP;
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP6_TCP)
 			csum_type = TX_CSUM_TCPIP6;
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP6_UDP)
 			csum_type = TX_CSUM_UDPIP6;
 #if defined(INET)
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP) {
 			/*
 			 * XXX: The firmware appears to stomp on the
 			 * fragment/flags field of the IP header when
 			 * using TX_CSUM_IP.  Fall back to doing
 			 * software checksums.
 			 */
 			u_short *sump;
 			struct mbuf *m;
 			int offset;
 
 			m = m0;
 			offset = 0;
 			sump = m_advance(&m, &offset, m0->m_pkthdr.l2hlen +
 			    offsetof(struct ip, ip_sum));
 			*sump = in_cksum_skip(m0, m0->m_pkthdr.l2hlen +
 			    m0->m_pkthdr.l3hlen, m0->m_pkthdr.l2hlen);
 			m0->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 #endif
 
 		cpl = (void *)(wr + 1);
 	}
 
 	/* Checksum offload */
 	ctrl1 = 0;
 	if (needs_l3_csum(m0) == 0)
 		ctrl1 |= F_TXPKT_IPCSUM_DIS;
 	if (csum_type >= 0) {
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0,
 	    ("%s: mbuf %p needs checksum offload but missing header lengths",
 			__func__, m0));
 
 		if (chip_id(sc) <= CHELSIO_T5) {
 			ctrl1 |= V_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen -
 			    ETHER_HDR_LEN);
 		} else {
 			ctrl1 |= V_T6_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen -
 			    ETHER_HDR_LEN);
 		}
 		ctrl1 |= V_TXPKT_IPHDR_LEN(m0->m_pkthdr.l3hlen);
 		ctrl1 |= V_TXPKT_CSUM_TYPE(csum_type);
 	} else
 		ctrl1 |= F_TXPKT_L4CSUM_DIS;
 	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
 		txq->txcsum++;	/* some hardware assistance provided */
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD |
 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 		txq->vlan_insertion++;
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 
 	/*
 	 * A packet using TSO will use up an entire descriptor for the
 	 * firmware work request header, LSO CPL, and TX_PKT_XT CPL.
 	 * If this descriptor is the last descriptor in the ring, wrap
 	 * around to the front of the ring explicitly for the start of
 	 * the sgl.
 	 */
 	if (dst == (void *)&eq->desc[eq->sidx]) {
 		dst = (void *)&eq->desc[0];
 		write_gl_to_txd(txq, m0, &dst, 0);
 	} else
 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
 	txq->sgl_wrs++;
 
 	txq->txpkt_wrs++;
 
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write a raw WR to the hardware descriptors, update the software
  * descriptor, and advance the pidx.  It is guaranteed that enough
  * descriptors are available.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_raw_wr(struct sge_txq *txq, void *wr, struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct mbuf *m;
 	caddr_t dst;
 	int len16, ndesc;
 
 	len16 = mbuf_len16(m0);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	dst = wr;
 	for (m = m0; m != NULL; m = m->m_next)
 		copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
 
 	txq->raw_wrs++;
 
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write a txpkt WR for this packet to the hardware descriptors, update the
  * software descriptor, and advance the pidx.  It is guaranteed that enough
  * descriptors are available.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr,
     struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen, nsegs;
 	caddr_t dst;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m0);
 	MPASS(available > 0 && available < eq->sidx);
 
 	len16 = mbuf_len16(m0);
 	nsegs = mbuf_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	else if (pktlen <= imm_payload(2) && available >= 2) {
 		/* Immediate data.  Recalculate len16 and set nsegs to 0. */
 		ctrl += pktlen;
 		len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
 		    sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
 		nsegs = 0;
 	}
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	if (needs_tso(m0)) {
 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 		    m0->m_pkthdr.l4hlen > 0,
 		    ("%s: mbuf %p needs TSO but missing header lengths",
 			__func__, m0));
 
 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
 		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
 		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
 			ctrl |= V_LSO_ETHHDR_LEN(1);
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			ctrl |= F_LSO_IPV6;
 
 		lso->lso_ctrl = htobe32(ctrl);
 		lso->ipid_ofst = htobe16(0);
 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 		lso->seqno_offset = htobe32(0);
 		lso->len = htobe32(pktlen);
 
 		cpl = (void *)(lso + 1);
 
 		txq->tso_wrs++;
 	} else
 		cpl = (void *)(wr + 1);
 
 	/* Checksum offload */
 	ctrl1 = 0;
 	if (needs_l3_csum(m0) == 0)
 		ctrl1 |= F_TXPKT_IPCSUM_DIS;
 	if (needs_l4_csum(m0) == 0)
 		ctrl1 |= F_TXPKT_L4CSUM_DIS;
 	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
 		txq->txcsum++;	/* some hardware assistance provided */
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 		txq->vlan_insertion++;
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 	if (nsegs > 0) {
 
 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
 		txq->sgl_wrs++;
 	} else {
 		struct mbuf *m;
 
 		for (m = m0; m != NULL; m = m->m_next) {
 			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
 #ifdef INVARIANTS
 			pktlen -= m->m_len;
 #endif
 		}
 #ifdef INVARIANTS
 		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
 #endif
 		txq->imm_wrs++;
 	}
 
 	txq->txpkt_wrs++;
 
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 static int
 try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available)
 {
 	u_int needed, nsegs1, nsegs2, l1, l2;
 
 	if (cannot_use_txpkts(m) || cannot_use_txpkts(n))
 		return (1);
 
 	nsegs1 = mbuf_nsegs(m);
 	nsegs2 = mbuf_nsegs(n);
 	if (nsegs1 + nsegs2 == 2) {
 		txp->wr_type = 1;
 		l1 = l2 = txpkts1_len16();
 	} else {
 		txp->wr_type = 0;
 		l1 = txpkts0_len16(nsegs1);
 		l2 = txpkts0_len16(nsegs2);
 	}
 	txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2;
 	needed = howmany(txp->len16, EQ_ESIZE / 16);
 	if (needed > SGE_MAX_WR_NDESC || needed > available)
 		return (1);
 
 	txp->plen = m->m_pkthdr.len + n->m_pkthdr.len;
 	if (txp->plen > 65535)
 		return (1);
 
 	txp->npkt = 2;
 	set_mbuf_len16(m, l1);
 	set_mbuf_len16(n, l2);
 
 	return (0);
 }
 
 static int
 add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
 {
 	u_int plen, len16, needed, nsegs;
 
 	MPASS(txp->wr_type == 0 || txp->wr_type == 1);
 
 	if (cannot_use_txpkts(m))
 		return (1);
 
 	nsegs = mbuf_nsegs(m);
 	if (txp->wr_type == 1 && nsegs != 1)
 		return (1);
 
 	plen = txp->plen + m->m_pkthdr.len;
 	if (plen > 65535)
 		return (1);
 
 	if (txp->wr_type == 0)
 		len16 = txpkts0_len16(nsegs);
 	else
 		len16 = txpkts1_len16();
 	needed = howmany(txp->len16 + len16, EQ_ESIZE / 16);
 	if (needed > SGE_MAX_WR_NDESC || needed > available)
 		return (1);
 
 	txp->npkt++;
 	txp->plen = plen;
 	txp->len16 += len16;
 	set_mbuf_len16(m, len16);
 
 	return (0);
 }
 
 /*
  * Write a txpkts WR for the packets in txp to the hardware descriptors, update
  * the software descriptor, and advance the pidx.  It is guaranteed that enough
  * descriptors are available.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr,
     struct mbuf *m0, const struct txpkts *txp, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int ndesc, checkwrap;
 	struct mbuf *m;
 	void *flitp;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	MPASS(txp->npkt > 0);
 	MPASS(txp->plen < 65536);
 	MPASS(m0 != NULL);
 	MPASS(m0->m_nextpkt != NULL);
 	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
 	MPASS(available > 0 && available < eq->sidx);
 
 	ndesc = howmany(txp->len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
 	ctrl = V_FW_WR_LEN16(txp->len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->plen = htobe16(txp->plen);
 	wr->npkt = txp->npkt;
 	wr->r3 = 0;
 	wr->type = txp->wr_type;
 	flitp = wr + 1;
 
 	/*
 	 * At this point we are 16B into a hardware descriptor.  If checkwrap is
 	 * set then we know the WR is going to wrap around somewhere.  We'll
 	 * check for that at appropriate points.
 	 */
 	checkwrap = eq->sidx - ndesc < eq->pidx;
 	for (m = m0; m != NULL; m = m->m_nextpkt) {
 		if (txp->wr_type == 0) {
 			struct ulp_txpkt *ulpmc;
 			struct ulptx_idata *ulpsc;
 
 			/* ULP master command */
 			ulpmc = flitp;
 			ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
 			    V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
 			ulpmc->len = htobe32(mbuf_len16(m));
 
 			/* ULP subcommand */
 			ulpsc = (void *)(ulpmc + 1);
 			ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
 			    F_ULP_TX_SC_MORE);
 			ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
 
 			cpl = (void *)(ulpsc + 1);
 			if (checkwrap &&
 			    (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx])
 				cpl = (void *)&eq->desc[0];
 		} else {
 			cpl = flitp;
 		}
 
 		/* Checksum offload */
 		ctrl1 = 0;
 		if (needs_l3_csum(m) == 0)
 			ctrl1 |= F_TXPKT_IPCSUM_DIS;
 		if (needs_l4_csum(m) == 0)
 			ctrl1 |= F_TXPKT_L4CSUM_DIS;
 		if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
 		    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
 			txq->txcsum++;	/* some hardware assistance provided */
 
 		/* VLAN tag insertion */
 		if (needs_vlan_insertion(m)) {
 			ctrl1 |= F_TXPKT_VLAN_VLD |
 			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
 			txq->vlan_insertion++;
 		}
 
 		/* CPL header */
 		cpl->ctrl0 = txq->cpl_ctrl0;
 		cpl->pack = 0;
 		cpl->len = htobe16(m->m_pkthdr.len);
 		cpl->ctrl1 = htobe64(ctrl1);
 
 		flitp = cpl + 1;
 		if (checkwrap &&
 		    (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
 			flitp = (void *)&eq->desc[0];
 
 		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
 
 	}
 
 	if (txp->wr_type == 0) {
 		txq->txpkts0_pkts += txp->npkt;
 		txq->txpkts0_wrs++;
 	} else {
 		txq->txpkts1_pkts += txp->npkt;
 		txq->txpkts1_wrs++;
 	}
 
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * If the SGL ends on an address that is not 16 byte aligned, this function will
  * add a 0 filled flit at the end.
  */
 static void
 write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct sglist *gl = txq->gl;
 	struct sglist_seg *seg;
 	__be64 *flitp, *wrap;
 	struct ulptx_sgl *usgl;
 	int i, nflits, nsegs;
 
 	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	get_pkt_gl(m, gl);
 	nsegs = gl->sg_nseg;
 	MPASS(nsegs > 0);
 
 	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
 	flitp = (__be64 *)(*to);
 	wrap = (__be64 *)(&eq->desc[eq->sidx]);
 	seg = &gl->sg_segs[0];
 	usgl = (void *)flitp;
 
 	/*
 	 * We start at a 16 byte boundary somewhere inside the tx descriptor
 	 * ring, so we're at least 16 bytes away from the status page.  There is
 	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
 	 */
 
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 	usgl->len0 = htobe32(seg->ss_len);
 	usgl->addr0 = htobe64(seg->ss_paddr);
 	seg++;
 
 	if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {
 
 		/* Won't wrap around at all */
 
 		for (i = 0; i < nsegs - 1; i++, seg++) {
 			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
 			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
 		}
 		if (i & 1)
 			usgl->sge[i / 2].len[1] = htobe32(0);
 		flitp += nflits;
 	} else {
 
 		/* Will wrap somewhere in the rest of the SGL */
 
 		/* 2 flits already written, write the rest flit by flit */
 		flitp = (void *)(usgl + 1);
 		for (i = 0; i < nflits - 2; i++) {
 			if (flitp == wrap)
 				flitp = (void *)eq->desc;
 			*flitp++ = get_flit(seg, nsegs - 1, i);
 		}
 	}
 
 	if (nflits & 1) {
 		MPASS(((uintptr_t)flitp) & 0xf);
 		*flitp++ = 0;
 	}
 
 	MPASS((((uintptr_t)flitp) & 0xf) == 0);
 	if (__predict_false(flitp == wrap))
 		*to = (void *)eq->desc;
 	else
 		*to = (void *)flitp;
 }
 
 static inline void
 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
 {
 
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	if (__predict_true((uintptr_t)(*to) + len <=
 	    (uintptr_t)&eq->desc[eq->sidx])) {
 		bcopy(from, *to, len);
 		(*to) += len;
 	} else {
 		int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to);
 
 		bcopy(from, *to, portion);
 		from += portion;
 		portion = len - portion;	/* remaining */
 		bcopy(from, (void *)eq->desc, portion);
 		(*to) = (caddr_t)eq->desc + portion;
 	}
 }
 
 static inline void
 ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
 {
 	u_int db;
 
 	MPASS(n > 0);
 
 	db = eq->doorbells;
 	if (n > 1)
 		clrbit(&db, DOORBELL_WCWR);
 	wmb();
 
 	switch (ffs(db) - 1) {
 	case DOORBELL_UDB:
 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
 		break;
 
 	case DOORBELL_WCWR: {
 		volatile uint64_t *dst, *src;
 		int i;
 
 		/*
 		 * Queues whose 128B doorbell segment fits in the page do not
 		 * use relative qid (udb_qid is always 0).  Only queues with
 		 * doorbell segments can do WCWR.
 		 */
 		KASSERT(eq->udb_qid == 0 && n == 1,
 		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
 		    __func__, eq->doorbells, n, eq->dbidx, eq));
 
 		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
 		    UDBS_DB_OFFSET);
 		i = eq->dbidx;
 		src = (void *)&eq->desc[i];
 		while (src != (void *)&eq->desc[i + 1])
 			*dst++ = *src++;
 		wmb();
 		break;
 	}
 
 	case DOORBELL_UDBWC:
 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
 		wmb();
 		break;
 
 	case DOORBELL_KDB:
 		t4_write_reg(sc, sc->sge_kdoorbell_reg,
 		    V_QID(eq->cntxt_id) | V_PIDX(n));
 		break;
 	}
 
 	IDXINCR(eq->dbidx, n, eq->sidx);
 }
 
 static inline u_int
 reclaimable_tx_desc(struct sge_eq *eq)
 {
 	uint16_t hw_cidx;
 
 	hw_cidx = read_hw_cidx(eq);
 	return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx));
 }
 
 static inline u_int
 total_available_tx_desc(struct sge_eq *eq)
 {
 	uint16_t hw_cidx, pidx;
 
 	hw_cidx = read_hw_cidx(eq);
 	pidx = eq->pidx;
 
 	if (pidx == hw_cidx)
 		return (eq->sidx - 1);
 	else
 		return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1);
 }
 
 static inline uint16_t
 read_hw_cidx(struct sge_eq *eq)
 {
 	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
 	uint16_t cidx = spg->cidx;	/* stable snapshot */
 
 	return (be16toh(cidx));
 }
 
 /*
  * Reclaim 'n' descriptors approximately.
  */
 static u_int
 reclaim_tx_descs(struct sge_txq *txq, u_int n)
 {
 	struct tx_sdesc *txsd;
 	struct sge_eq *eq = &txq->eq;
 	u_int can_reclaim, reclaimed;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	MPASS(n > 0);
 
 	reclaimed = 0;
 	can_reclaim = reclaimable_tx_desc(eq);
 	while (can_reclaim && reclaimed < n) {
 		int ndesc;
 		struct mbuf *m, *nextpkt;
 
 		txsd = &txq->sdesc[eq->cidx];
 		ndesc = txsd->desc_used;
 
 		/* Firmware doesn't return "partial" credits. */
 		KASSERT(can_reclaim >= ndesc,
 		    ("%s: unexpected number of credits: %d, %d",
 		    __func__, can_reclaim, ndesc));
 		KASSERT(ndesc != 0,
 		    ("%s: descriptor with no credits: cidx %d",
 		    __func__, eq->cidx));
 
 		for (m = txsd->m; m != NULL; m = nextpkt) {
 			nextpkt = m->m_nextpkt;
 			m->m_nextpkt = NULL;
 			m_freem(m);
 		}
 		reclaimed += ndesc;
 		can_reclaim -= ndesc;
 		IDXINCR(eq->cidx, ndesc, eq->sidx);
 	}
 
 	return (reclaimed);
 }
 
 static void
 tx_reclaim(void *arg, int n)
 {
 	struct sge_txq *txq = arg;
 	struct sge_eq *eq = &txq->eq;
 
 	do {
 		if (TXQ_TRYLOCK(txq) == 0)
 			break;
 		n = reclaim_tx_descs(txq, 32);
 		if (eq->cidx == eq->pidx)
 			eq->equeqidx = eq->pidx;
 		TXQ_UNLOCK(txq);
 	} while (n > 0);
 }
 
 static __be64
 get_flit(struct sglist_seg *segs, int nsegs, int idx)
 {
 	int i = (idx / 3) * 2;
 
 	switch (idx % 3) {
 	case 0: {
 		uint64_t rc;
 
 		rc = (uint64_t)segs[i].ss_len << 32;
 		if (i + 1 < nsegs)
 			rc |= (uint64_t)(segs[i + 1].ss_len);
 
 		return (htobe64(rc));
 	}
 	case 1:
 		return (htobe64(segs[i].ss_paddr));
 	case 2:
 		return (htobe64(segs[i + 1].ss_paddr));
 	}
 
 	return (0);
 }
 
 static void
 find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp)
 {
 	int8_t zidx, hwidx, idx;
 	uint16_t region1, region3;
 	int spare, spare_needed, n;
 	struct sw_zone_info *swz;
 	struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0];
 
 	/*
 	 * Buffer Packing: Look for PAGE_SIZE or larger zone which has a bufsize
 	 * large enough for the max payload and cluster metadata.  Otherwise
 	 * settle for the largest bufsize that leaves enough room in the cluster
 	 * for metadata.
 	 *
 	 * Without buffer packing: Look for the smallest zone which has a
 	 * bufsize large enough for the max payload.  Settle for the largest
 	 * bufsize available if there's nothing big enough for max payload.
 	 */
 	spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0;
 	swz = &sc->sge.sw_zone_info[0];
 	hwidx = -1;
 	for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) {
 		if (swz->size > largest_rx_cluster) {
 			if (__predict_true(hwidx != -1))
 				break;
 
 			/*
 			 * This is a misconfiguration.  largest_rx_cluster is
 			 * preventing us from finding a refill source.  See
 			 * dev.t5nex.<n>.buffer_sizes to figure out why.
 			 */
 			device_printf(sc->dev, "largest_rx_cluster=%u leaves no"
 			    " refill source for fl %p (dma %u).  Ignored.\n",
 			    largest_rx_cluster, fl, maxp);
 		}
 		for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) {
 			hwb = &hwb_list[idx];
 			spare = swz->size - hwb->size;
 			if (spare < spare_needed)
 				continue;
 
 			hwidx = idx;		/* best option so far */
 			if (hwb->size >= maxp) {
 
 				if ((fl->flags & FL_BUF_PACKING) == 0)
 					goto done; /* stop looking (not packing) */
 
 				if (swz->size >= safest_rx_cluster)
 					goto done; /* stop looking (packing) */
 			}
 			break;		/* keep looking, next zone */
 		}
 	}
 done:
 	/* A usable hwidx has been located. */
 	MPASS(hwidx != -1);
 	hwb = &hwb_list[hwidx];
 	zidx = hwb->zidx;
 	swz = &sc->sge.sw_zone_info[zidx];
 	region1 = 0;
 	region3 = swz->size - hwb->size;
 
 	/*
 	 * Stay within this zone and see if there is a better match when mbuf
 	 * inlining is allowed.  Remember that the hwidx's are sorted in
 	 * decreasing order of size (so in increasing order of spare area).
 	 */
 	for (idx = hwidx; idx != -1; idx = hwb->next) {
 		hwb = &hwb_list[idx];
 		spare = swz->size - hwb->size;
 
 		if (allow_mbufs_in_cluster == 0 || hwb->size < maxp)
 			break;
 
 		/*
 		 * Do not inline mbufs if doing so would violate the pad/pack
 		 * boundary alignment requirement.
 		 */
 		if (fl_pad && (MSIZE % sc->params.sge.pad_boundary) != 0)
 			continue;
 		if (fl->flags & FL_BUF_PACKING &&
 		    (MSIZE % sc->params.sge.pack_boundary) != 0)
 			continue;
 
 		if (spare < CL_METADATA_SIZE + MSIZE)
 			continue;
 		n = (spare - CL_METADATA_SIZE) / MSIZE;
 		if (n > howmany(hwb->size, maxp))
 			break;
 
 		hwidx = idx;
 		if (fl->flags & FL_BUF_PACKING) {
 			region1 = n * MSIZE;
 			region3 = spare - region1;
 		} else {
 			region1 = MSIZE;
 			region3 = spare - region1;
 			break;
 		}
 	}
 
 	KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES,
 	    ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp));
 	KASSERT(hwidx >= 0 && hwidx <= SGE_FLBUF_SIZES,
 	    ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp));
 	KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 ==
 	    sc->sge.sw_zone_info[zidx].size,
 	    ("%s: bad buffer layout for fl %p, maxp %d. "
 		"cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
 		sc->sge.sw_zone_info[zidx].size, region1,
 		sc->sge.hw_buf_info[hwidx].size, region3));
 	if (fl->flags & FL_BUF_PACKING || region1 > 0) {
 		KASSERT(region3 >= CL_METADATA_SIZE,
 		    ("%s: no room for metadata.  fl %p, maxp %d; "
 		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
 		    sc->sge.sw_zone_info[zidx].size, region1,
 		    sc->sge.hw_buf_info[hwidx].size, region3));
 		KASSERT(region1 % MSIZE == 0,
 		    ("%s: bad mbuf region for fl %p, maxp %d. "
 		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
 		    sc->sge.sw_zone_info[zidx].size, region1,
 		    sc->sge.hw_buf_info[hwidx].size, region3));
 	}
 
 	fl->cll_def.zidx = zidx;
 	fl->cll_def.hwidx = hwidx;
 	fl->cll_def.region1 = region1;
 	fl->cll_def.region3 = region3;
 }
 
 static void
 find_safe_refill_source(struct adapter *sc, struct sge_fl *fl)
 {
 	struct sge *s = &sc->sge;
 	struct hw_buf_info *hwb;
 	struct sw_zone_info *swz;
 	int spare;
 	int8_t hwidx;
 
 	if (fl->flags & FL_BUF_PACKING)
 		hwidx = s->safe_hwidx2;	/* with room for metadata */
 	else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) {
 		hwidx = s->safe_hwidx2;
 		hwb = &s->hw_buf_info[hwidx];
 		swz = &s->sw_zone_info[hwb->zidx];
 		spare = swz->size - hwb->size;
 
 		/* no good if there isn't room for an mbuf as well */
 		if (spare < CL_METADATA_SIZE + MSIZE)
 			hwidx = s->safe_hwidx1;
 	} else
 		hwidx = s->safe_hwidx1;
 
 	if (hwidx == -1) {
 		/* No fallback source */
 		fl->cll_alt.hwidx = -1;
 		fl->cll_alt.zidx = -1;
 
 		return;
 	}
 
 	hwb = &s->hw_buf_info[hwidx];
 	swz = &s->sw_zone_info[hwb->zidx];
 	spare = swz->size - hwb->size;
 	fl->cll_alt.hwidx = hwidx;
 	fl->cll_alt.zidx = hwb->zidx;
 	if (allow_mbufs_in_cluster &&
 	    (fl_pad == 0 || (MSIZE % sc->params.sge.pad_boundary) == 0))
 		fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE;
 	else
 		fl->cll_alt.region1 = 0;
 	fl->cll_alt.region3 = spare - fl->cll_alt.region1;
 }
 
 static void
 add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
 {
 	mtx_lock(&sc->sfl_lock);
 	FL_LOCK(fl);
 	if ((fl->flags & FL_DOOMED) == 0) {
 		fl->flags |= FL_STARVING;
 		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
 		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
 	}
 	FL_UNLOCK(fl);
 	mtx_unlock(&sc->sfl_lock);
 }
 
 static void
 handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
 {
 	struct sge_wrq *wrq = (void *)eq;
 
 	atomic_readandclear_int(&eq->equiq);
 	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
 }
 
 static void
 handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
 {
 	struct sge_txq *txq = (void *)eq;
 
 	MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
 
 	atomic_readandclear_int(&eq->equiq);
 	mp_ring_check_drainage(txq->r, 0);
 	taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
 }
 
 static int
 handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
 	unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid));
 	struct adapter *sc = iq->adapter;
 	struct sge *s = &sc->sge;
 	struct sge_eq *eq;
 	static void (*h[])(struct adapter *, struct sge_eq *) = {NULL,
 		&handle_wrq_egr_update, &handle_eth_egr_update,
 		&handle_wrq_egr_update};
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	eq = s->eqmap[qid - s->eq_start - s->eq_base];
 	(*h[eq->flags & EQ_TYPEMASK])(sc, eq);
 
 	return (0);
 }
 
 /* handle_fw_msg works for both fw4_msg and fw6_msg because this is valid */
 CTASSERT(offsetof(struct cpl_fw4_msg, data) == \
     offsetof(struct cpl_fw6_msg, data));
 
 static int
 handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) {
 		const struct rss_header *rss2;
 
 		rss2 = (const struct rss_header *)&cpl->data[0];
 		return (t4_cpl_handler[rss2->opcode](iq, rss2, m));
 	}
 
 	return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0]));
 }
 
 /**
  *	t4_handle_wrerr_rpl - process a FW work request error message
  *	@adap: the adapter
  *	@rpl: start of the FW message
  */
 static int
 t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl)
 {
 	u8 opcode = *(const u8 *)rpl;
 	const struct fw_error_cmd *e = (const void *)rpl;
 	unsigned int i;
 
 	if (opcode != FW_ERROR_CMD) {
 		log(LOG_ERR,
 		    "%s: Received WRERR_RPL message with opcode %#x\n",
 		    device_get_nameunit(adap->dev), opcode);
 		return (EINVAL);
 	}
 	log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev),
 	    G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" :
 	    "non-fatal");
 	switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) {
 	case FW_ERROR_TYPE_EXCEPTION:
 		log(LOG_ERR, "exception info:\n");
 		for (i = 0; i < nitems(e->u.exception.info); i++)
 			log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ",
 			    be32toh(e->u.exception.info[i]));
 		log(LOG_ERR, "\n");
 		break;
 	case FW_ERROR_TYPE_HWMODULE:
 		log(LOG_ERR, "HW module regaddr %08x regval %08x\n",
 		    be32toh(e->u.hwmodule.regaddr),
 		    be32toh(e->u.hwmodule.regval));
 		break;
 	case FW_ERROR_TYPE_WR:
 		log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n",
 		    be16toh(e->u.wr.cidx),
 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)),
 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)),
 		    be32toh(e->u.wr.eqid));
 		for (i = 0; i < nitems(e->u.wr.wrhdr); i++)
 			log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ",
 			    e->u.wr.wrhdr[i]);
 		log(LOG_ERR, "\n");
 		break;
 	case FW_ERROR_TYPE_ACL:
 		log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s",
 		    be16toh(e->u.acl.cidx),
 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)),
 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)),
 		    be32toh(e->u.acl.eqid),
 		    G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? "vlanid" :
 		    "MAC");
 		for (i = 0; i < nitems(e->u.acl.val); i++)
 			log(LOG_ERR, " %02x", e->u.acl.val[i]);
 		log(LOG_ERR, "\n");
 		break;
 	default:
 		log(LOG_ERR, "type %#x\n",
 		    G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type)));
 		return (EINVAL);
 	}
 	return (0);
 }
 
 static int
 sysctl_uint16(SYSCTL_HANDLER_ARGS)
 {
 	uint16_t *id = arg1;
 	int i = *id;
 
 	return sysctl_handle_int(oidp, &i, 0, req);
 }
 
 static int
 sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
 {
 	struct sge *s = arg1;
 	struct hw_buf_info *hwb = &s->hw_buf_info[0];
 	struct sw_zone_info *swz = &s->sw_zone_info[0];
 	int i, rc;
 	struct sbuf sb;
 	char c;
 
 	sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND);
 	for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) {
 		if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster)
 			c = '*';
 		else
 			c = '\0';
 
 		sbuf_printf(&sb, "%u%c ", hwb->size, c);
 	}
 	sbuf_trim(&sb);
 	sbuf_finish(&sb);
 	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
 	sbuf_delete(&sb);
 	return (rc);
 }
 
 #ifdef RATELIMIT
 /*
  * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
  */
 static inline u_int
 txpkt_eo_len16(u_int nsegs, u_int immhdrs, u_int tso)
 {
 	u_int n;
 
 	MPASS(immhdrs > 0);
 
 	n = roundup2(sizeof(struct fw_eth_tx_eo_wr) +
 	    sizeof(struct cpl_tx_pkt_core) + immhdrs, 16);
 	if (__predict_false(nsegs == 0))
 		goto done;
 
 	nsegs--; /* first segment is part of ulptx_sgl */
 	n += sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
 	if (tso)
 		n += sizeof(struct cpl_tx_pkt_lso_core);
 
 done:
 	return (howmany(n, 16));
 }
 
 #define ETID_FLOWC_NPARAMS 6
 #define ETID_FLOWC_LEN (roundup2((sizeof(struct fw_flowc_wr) + \
     ETID_FLOWC_NPARAMS * sizeof(struct fw_flowc_mnemval)), 16))
 #define ETID_FLOWC_LEN16 (howmany(ETID_FLOWC_LEN, 16))
 
 static int
 send_etid_flowc_wr(struct cxgbe_snd_tag *cst, struct port_info *pi,
     struct vi_info *vi)
 {
 	struct wrq_cookie cookie;
 	u_int pfvf = pi->adapter->pf << S_FW_VIID_PFN;
 	struct fw_flowc_wr *flowc;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 	MPASS((cst->flags & (EO_FLOWC_PENDING | EO_FLOWC_RPL_PENDING)) ==
 	    EO_FLOWC_PENDING);
 
 	flowc = start_wrq_wr(cst->eo_txq, ETID_FLOWC_LEN16, &cookie);
 	if (__predict_false(flowc == NULL))
 		return (ENOMEM);
 
 	bzero(flowc, ETID_FLOWC_LEN);
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(ETID_FLOWC_NPARAMS) | V_FW_WR_COMPL(0));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(ETID_FLOWC_LEN16) |
 	    V_FW_WR_FLOWID(cst->etid));
 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
 	flowc->mnemval[0].val = htobe32(pfvf);
 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
 	flowc->mnemval[1].val = htobe32(pi->tx_chan);
 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
 	flowc->mnemval[2].val = htobe32(pi->tx_chan);
 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
 	flowc->mnemval[3].val = htobe32(cst->iqid);
 	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_EOSTATE;
 	flowc->mnemval[4].val = htobe32(FW_FLOWC_MNEM_EOSTATE_ESTABLISHED);
 	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
 	flowc->mnemval[5].val = htobe32(cst->schedcl);
 
 	commit_wrq_wr(cst->eo_txq, flowc, &cookie);
 
 	cst->flags &= ~EO_FLOWC_PENDING;
 	cst->flags |= EO_FLOWC_RPL_PENDING;
 	MPASS(cst->tx_credits >= ETID_FLOWC_LEN16);	/* flowc is first WR. */
 	cst->tx_credits -= ETID_FLOWC_LEN16;
 
 	return (0);
 }
 
 #define ETID_FLUSH_LEN16 (howmany(sizeof (struct fw_flowc_wr), 16))
 
 void
 send_etid_flush_wr(struct cxgbe_snd_tag *cst)
 {
 	struct fw_flowc_wr *flowc;
 	struct wrq_cookie cookie;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 
 	flowc = start_wrq_wr(cst->eo_txq, ETID_FLUSH_LEN16, &cookie);
 	if (__predict_false(flowc == NULL))
 		CXGBE_UNIMPLEMENTED(__func__);
 
 	bzero(flowc, ETID_FLUSH_LEN16 * 16);
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(0) | F_FW_WR_COMPL);
 	flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLUSH_LEN16) |
 	    V_FW_WR_FLOWID(cst->etid));
 
 	commit_wrq_wr(cst->eo_txq, flowc, &cookie);
 
 	cst->flags |= EO_FLUSH_RPL_PENDING;
 	MPASS(cst->tx_credits >= ETID_FLUSH_LEN16);
 	cst->tx_credits -= ETID_FLUSH_LEN16;
 	cst->ncompl++;
 }
 
 static void
 write_ethofld_wr(struct cxgbe_snd_tag *cst, struct fw_eth_tx_eo_wr *wr,
     struct mbuf *m0, int compl)
 {
 	struct cpl_tx_pkt_core *cpl;
 	uint64_t ctrl1;
 	uint32_t ctrl;	/* used in many unrelated places */
 	int len16, pktlen, nsegs, immhdrs;
 	caddr_t dst;
 	uintptr_t p;
 	struct ulptx_sgl *usgl;
 	struct sglist sg;
 	struct sglist_seg segs[38];	/* XXX: find real limit.  XXX: get off the stack */
 
 	mtx_assert(&cst->lock, MA_OWNED);
 	M_ASSERTPKTHDR(m0);
 	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 	    m0->m_pkthdr.l4hlen > 0,
 	    ("%s: ethofld mbuf %p is missing header lengths", __func__, m0));
 
 	len16 = mbuf_eo_len16(m0);
 	nsegs = mbuf_eo_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen;
 	ctrl += immhdrs;
 
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_EO_WR) |
 	    V_FW_ETH_TX_EO_WR_IMMDLEN(ctrl) | V_FW_WR_COMPL(!!compl));
 	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) |
 	    V_FW_WR_FLOWID(cst->etid));
 	wr->r3 = 0;
 	if (needs_udp_csum(m0)) {
 		wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG;
 		wr->u.udpseg.ethlen = m0->m_pkthdr.l2hlen;
 		wr->u.udpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
 		wr->u.udpseg.udplen = m0->m_pkthdr.l4hlen;
 		wr->u.udpseg.rtplen = 0;
 		wr->u.udpseg.r4 = 0;
 		wr->u.udpseg.mss = htobe16(pktlen - immhdrs);
 		wr->u.udpseg.schedpktsize = wr->u.udpseg.mss;
 		wr->u.udpseg.plen = htobe32(pktlen - immhdrs);
 		cpl = (void *)(wr + 1);
 	} else {
 		MPASS(needs_tcp_csum(m0));
 		wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG;
 		wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen;
 		wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
 		wr->u.tcpseg.tcplen = m0->m_pkthdr.l4hlen;
 		wr->u.tcpseg.tsclk_tsoff = mbuf_eo_tsclk_tsoff(m0);
 		wr->u.tcpseg.r4 = 0;
 		wr->u.tcpseg.r5 = 0;
 		wr->u.tcpseg.plen = htobe32(pktlen - immhdrs);
 
 		if (needs_tso(m0)) {
 			struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 			wr->u.tcpseg.mss = htobe16(m0->m_pkthdr.tso_segsz);
 
 			ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) |
 			    F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE |
 			    V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
 			    V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 			if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
 				ctrl |= V_LSO_ETHHDR_LEN(1);
 			if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 				ctrl |= F_LSO_IPV6;
 			lso->lso_ctrl = htobe32(ctrl);
 			lso->ipid_ofst = htobe16(0);
 			lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 			lso->seqno_offset = htobe32(0);
 			lso->len = htobe32(pktlen);
 
 			cpl = (void *)(lso + 1);
 		} else {
 			wr->u.tcpseg.mss = htobe16(0xffff);
 			cpl = (void *)(wr + 1);
 		}
 	}
 
 	/* Checksum offload must be requested for ethofld. */
 	ctrl1 = 0;
 	MPASS(needs_l4_csum(m0));
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD |
 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = cst->ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* Copy Ethernet, IP & TCP/UDP hdrs as immediate data */
 	p = (uintptr_t)(cpl + 1);
 	m_copydata(m0, 0, immhdrs, (void *)p);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 	if (nsegs > 0) {
 		int i, pad;
 
 		/* zero-pad upto next 16Byte boundary, if not 16Byte aligned */
 		p += immhdrs;
 		pad = 16 - (immhdrs & 0xf);
 		bzero((void *)p, pad);
 
 		usgl = (void *)(p + pad);
 		usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 		    V_ULPTX_NSGE(nsegs));
 
 		sglist_init(&sg, nitems(segs), segs);
 		for (; m0 != NULL; m0 = m0->m_next) {
 			if (__predict_false(m0->m_len == 0))
 				continue;
 			if (immhdrs >= m0->m_len) {
 				immhdrs -= m0->m_len;
 				continue;
 			}
 
 			sglist_append(&sg, mtod(m0, char *) + immhdrs,
 			    m0->m_len - immhdrs);
 			immhdrs = 0;
 		}
 		MPASS(sg.sg_nseg == nsegs);
 
 		/*
 		 * Zero pad last 8B in case the WR doesn't end on a 16B
 		 * boundary.
 		 */
 		*(uint64_t *)((char *)wr + len16 * 16 - 8) = 0;
 
 		usgl->len0 = htobe32(segs[0].ss_len);
 		usgl->addr0 = htobe64(segs[0].ss_paddr);
 		for (i = 0; i < nsegs - 1; i++) {
 			usgl->sge[i / 2].len[i & 1] = htobe32(segs[i + 1].ss_len);
 			usgl->sge[i / 2].addr[i & 1] = htobe64(segs[i + 1].ss_paddr);
 		}
 		if (i & 1)
 			usgl->sge[i / 2].len[1] = htobe32(0);
 	}
 
 }
 
 static void
 ethofld_tx(struct cxgbe_snd_tag *cst)
 {
 	struct mbuf *m;
 	struct wrq_cookie cookie;
 	int next_credits, compl;
 	struct fw_eth_tx_eo_wr *wr;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 
 	while ((m = mbufq_first(&cst->pending_tx)) != NULL) {
 		M_ASSERTPKTHDR(m);
 
 		/* How many len16 credits do we need to send this mbuf. */
 		next_credits = mbuf_eo_len16(m);
 		MPASS(next_credits > 0);
 		if (next_credits > cst->tx_credits) {
 			/*
 			 * Tx will make progress eventually because there is at
 			 * least one outstanding fw4_ack that will return
 			 * credits and kick the tx.
 			 */
 			MPASS(cst->ncompl > 0);
 			return;
 		}
 		wr = start_wrq_wr(cst->eo_txq, next_credits, &cookie);
 		if (__predict_false(wr == NULL)) {
 			/* XXX: wishful thinking, not a real assertion. */
 			MPASS(cst->ncompl > 0);
 			return;
 		}
 		cst->tx_credits -= next_credits;
 		cst->tx_nocompl += next_credits;
 		compl = cst->ncompl == 0 || cst->tx_nocompl >= cst->tx_total / 2;
 		ETHER_BPF_MTAP(cst->com.ifp, m);
 		write_ethofld_wr(cst, wr, m, compl);
 		commit_wrq_wr(cst->eo_txq, wr, &cookie);
 		if (compl) {
 			cst->ncompl++;
 			cst->tx_nocompl	= 0;
 		}
 		(void) mbufq_dequeue(&cst->pending_tx);
 		mbufq_enqueue(&cst->pending_fwack, m);
 	}
 }
 
 int
 ethofld_transmit(struct ifnet *ifp, struct mbuf *m0)
 {
 	struct cxgbe_snd_tag *cst;
 	int rc;
 
 	MPASS(m0->m_nextpkt == NULL);
 	MPASS(m0->m_pkthdr.snd_tag != NULL);
 	cst = mst_to_cst(m0->m_pkthdr.snd_tag);
 
 	mtx_lock(&cst->lock);
 	MPASS(cst->flags & EO_SND_TAG_REF);
 
 	if (__predict_false(cst->flags & EO_FLOWC_PENDING)) {
 		struct vi_info *vi = ifp->if_softc;
 		struct port_info *pi = vi->pi;
 		struct adapter *sc = pi->adapter;
 		const uint32_t rss_mask = vi->rss_size - 1;
 		uint32_t rss_hash;
 
 		cst->eo_txq = &sc->sge.ofld_txq[vi->first_ofld_txq];
 		if (M_HASHTYPE_ISHASH(m0))
 			rss_hash = m0->m_pkthdr.flowid;
 		else
 			rss_hash = arc4random();
 		/* We assume RSS hashing */
 		cst->iqid = vi->rss[rss_hash & rss_mask];
 		cst->eo_txq += rss_hash % vi->nofldtxq;
 		rc = send_etid_flowc_wr(cst, pi, vi);
 		if (rc != 0)
 			goto done;
 	}
 
 	if (__predict_false(cst->plen + m0->m_pkthdr.len > eo_max_backlog)) {
 		rc = ENOBUFS;
 		goto done;
 	}
 
 	mbufq_enqueue(&cst->pending_tx, m0);
 	cst->plen += m0->m_pkthdr.len;
 
 	ethofld_tx(cst);
 	rc = 0;
 done:
 	mtx_unlock(&cst->lock);
 	if (__predict_false(rc != 0))
 		m_freem(m0);
 	return (rc);
 }
 
 static int
 ethofld_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	struct mbuf *m;
 	u_int etid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 	struct cxgbe_snd_tag *cst;
 	uint8_t credits = cpl->credits;
 
 	cst = lookup_etid(sc, etid);
 	mtx_lock(&cst->lock);
 	if (__predict_false(cst->flags & EO_FLOWC_RPL_PENDING)) {
 		MPASS(credits >= ETID_FLOWC_LEN16);
 		credits -= ETID_FLOWC_LEN16;
 		cst->flags &= ~EO_FLOWC_RPL_PENDING;
 	}
 
 	KASSERT(cst->ncompl > 0,
 	    ("%s: etid %u (%p) wasn't expecting completion.",
 	    __func__, etid, cst));
 	cst->ncompl--;
 
 	while (credits > 0) {
 		m = mbufq_dequeue(&cst->pending_fwack);
 		if (__predict_false(m == NULL)) {
 			/*
 			 * The remaining credits are for the final flush that
 			 * was issued when the tag was freed by the kernel.
 			 */
 			MPASS((cst->flags &
 			    (EO_FLUSH_RPL_PENDING | EO_SND_TAG_REF)) ==
 			    EO_FLUSH_RPL_PENDING);
 			MPASS(credits == ETID_FLUSH_LEN16);
 			MPASS(cst->tx_credits + cpl->credits == cst->tx_total);
 			MPASS(cst->ncompl == 0);
 
 			cst->flags &= ~EO_FLUSH_RPL_PENDING;
 			cst->tx_credits += cpl->credits;
 freetag:
 			cxgbe_snd_tag_free_locked(cst);
 			return (0);	/* cst is gone. */
 		}
 		KASSERT(m != NULL,
 		    ("%s: too many credits (%u, %u)", __func__, cpl->credits,
 		    credits));
 		KASSERT(credits >= mbuf_eo_len16(m),
 		    ("%s: too few credits (%u, %u, %u)", __func__,
 		    cpl->credits, credits, mbuf_eo_len16(m)));
 		credits -= mbuf_eo_len16(m);
 		cst->plen -= m->m_pkthdr.len;
 		m_freem(m);
 	}
 
 	cst->tx_credits += cpl->credits;
 	MPASS(cst->tx_credits <= cst->tx_total);
 
 	m = mbufq_first(&cst->pending_tx);
 	if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m))
 		ethofld_tx(cst);
 
 	if (__predict_false((cst->flags & EO_SND_TAG_REF) == 0) &&
 	    cst->ncompl == 0) {
 		if (cst->tx_credits == cst->tx_total)
 			goto freetag;
 		else {
 			MPASS((cst->flags & EO_FLUSH_RPL_PENDING) == 0);
 			send_etid_flush_wr(cst);
 		}
 	}
 
 	mtx_unlock(&cst->lock);
 
 	return (0);
 }
 #endif
Index: projects/capsicum-test/sys/dev/cxgbe/tom/t4_cpl_io.c
===================================================================
--- projects/capsicum-test/sys/dev/cxgbe/tom/t4_cpl_io.c	(revision 345709)
+++ projects/capsicum-test/sys/dev/cxgbe/tom/t4_cpl_io.c	(revision 345710)
@@ -1,2326 +1,2326 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/aio.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sglist.h>
 #include <sys/taskqueue.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 static void	t4_aiotx_cancel(struct kaiocb *job);
 static void	t4_aiotx_queue_toep(struct toepcb *toep);
 
 static size_t
 aiotx_mbuf_pgoff(struct mbuf *m)
 {
 	struct aiotx_buffer *ab;
 
 	MPASS(IS_AIOTX_MBUF(m));
 	ab = m->m_ext.ext_arg1;
 	return ((ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) % PAGE_SIZE);
 }
 
 static vm_page_t *
 aiotx_mbuf_pages(struct mbuf *m)
 {
 	struct aiotx_buffer *ab;
 	int npages;
 
 	MPASS(IS_AIOTX_MBUF(m));
 	ab = m->m_ext.ext_arg1;
 	npages = (ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) / PAGE_SIZE;
 	return (ab->ps.pages + npages);
 }
 
 void
 send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
 {
 	struct wrqe *wr;
 	struct fw_flowc_wr *flowc;
 	unsigned int nparams, flowclen, paramidx;
 	struct vi_info *vi = toep->vi;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
 	    ("%s: flowc for tid %u sent already", __func__, toep->tid));
 
 	if (ftxp != NULL)
 		nparams = 8;
 	else
 		nparams = 6;
 	if (toep->ulp_mode == ULP_MODE_TLS)
 		nparams++;
 	if (toep->tls.fcplenmax != 0)
 		nparams++;
 	if (toep->tc_idx != -1) {
 		MPASS(toep->tc_idx >= 0 &&
 		    toep->tc_idx < sc->chip_params->nsched_cls);
 		nparams++;
 	}
 
 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
 
 	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	flowc = wrtod(wr);
 	memset(flowc, 0, wr->wr_len);
 
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(nparams));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
 	    V_FW_WR_FLOWID(toep->tid));
 
 #define FLOWC_PARAM(__m, __v) \
 	do { \
 		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
 		flowc->mnemval[paramidx].val = htobe32(__v); \
 		paramidx++; \
 	} while (0)
 
 	paramidx = 0;
 
 	FLOWC_PARAM(PFNVFN, pfvf);
 	FLOWC_PARAM(CH, pi->tx_chan);
 	FLOWC_PARAM(PORT, pi->tx_chan);
 	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
 	if (ftxp) {
 		uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf);
 
 		FLOWC_PARAM(SNDNXT, ftxp->snd_nxt);
 		FLOWC_PARAM(RCVNXT, ftxp->rcv_nxt);
 		FLOWC_PARAM(SNDBUF, sndbuf);
 		FLOWC_PARAM(MSS, ftxp->mss);
 
 		CTR6(KTR_CXGBE,
 		    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
 		    __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt,
 		    ftxp->rcv_nxt);
 	} else {
 		FLOWC_PARAM(SNDBUF, 512);
 		FLOWC_PARAM(MSS, 512);
 
 		CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
 	}
 	if (toep->ulp_mode == ULP_MODE_TLS)
 		FLOWC_PARAM(ULP_MODE, toep->ulp_mode);
 	if (toep->tls.fcplenmax != 0)
 		FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
 	if (toep->tc_idx != -1)
 		FLOWC_PARAM(SCHEDCLASS, toep->tc_idx);
 #undef FLOWC_PARAM
 
 	KASSERT(paramidx == nparams, ("nparams mismatch"));
 
 	txsd->tx_credits = howmany(flowclen, 16);
 	txsd->plen = 0;
 	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
 	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
 	toep->tx_credits -= txsd->tx_credits;
 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
 	toep->flags |= TPF_FLOWC_WR_SENT;
         t4_wrq_tx(sc, wr);
 }
 
 #ifdef RATELIMIT
 /*
  * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
  */
 static int
 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
 {
 	int tc_idx, rc;
 	const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
 	const int port_id = toep->vi->pi->port_id;
 
 	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);
 
 	if (kbps == 0) {
 		/* unbind */
 		tc_idx = -1;
 	} else {
 		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
 		if (rc != 0)
 			return (rc);
 		MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls);
 	}
 
 	if (toep->tc_idx != tc_idx) {
 		struct wrqe *wr;
 		struct fw_flowc_wr *flowc;
 		int nparams = 1, flowclen, flowclen16;
 		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 		flowclen = sizeof(*flowc) + nparams * sizeof(struct
 		    fw_flowc_mnemval);
 		flowclen16 = howmany(flowclen, 16);
 		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
 		    (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) {
 			if (tc_idx >= 0)
 				t4_release_cl_rl(sc, port_id, tc_idx);
 			return (ENOMEM);
 		}
 
 		flowc = wrtod(wr);
 		memset(flowc, 0, wr->wr_len);
 
 		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 		    V_FW_FLOWC_WR_NPARAMS(nparams));
 		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
 		    V_FW_WR_FLOWID(toep->tid));
 
 		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
 		if (tc_idx == -1)
 			flowc->mnemval[0].val = htobe32(0xff);
 		else
 			flowc->mnemval[0].val = htobe32(tc_idx);
 
 		txsd->tx_credits = flowclen16;
 		txsd->plen = 0;
 		toep->tx_credits -= txsd->tx_credits;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 			toep->txsd_pidx = 0;
 		toep->txsd_avail--;
 		t4_wrq_tx(sc, wr);
 	}
 
 	if (toep->tc_idx >= 0)
 		t4_release_cl_rl(sc, port_id, toep->tc_idx);
 	toep->tc_idx = tc_idx;
 
 	return (0);
 }
 #endif
 
 void
 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
 {
 	struct wrqe *wr;
 	struct cpl_abort_req *req;
 	int tid = toep->tid;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */
 
 	INP_WLOCK_ASSERT(inp);
 
 	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
 	    __func__, toep->tid,
 	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
 	    tcpstates[tp->t_state],
 	    toep->flags, inp->inp_flags,
 	    toep->flags & TPF_ABORT_SHUTDOWN ?
 	    " (abort already in progress)" : "");
 
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		return;	/* abort already in progress */
 
 	toep->flags |= TPF_ABORT_SHUTDOWN;
 
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));
 
 	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
 	if (inp->inp_flags & INP_DROPPED)
 		req->rsvd0 = htobe32(snd_nxt);
 	else
 		req->rsvd0 = htobe32(tp->snd_nxt);
 	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
 	req->cmd = CPL_ABORT_SEND_RST;
 
 	/*
 	 * XXX: What's the correct way to tell that the inp hasn't been detached
 	 * from its socket?  Should I even be flushing the snd buffer here?
 	 */
 	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
 		struct socket *so = inp->inp_socket;
 
 		if (so != NULL)	/* because I'm not sure.  See comment above */
 			sbflush(&so->so_snd);
 	}
 
 	t4_l2t_send(sc, wr, toep->l2te);
 }
 
 /*
  * Called when a connection is established to translate the TCP options
  * reported by HW to FreeBSD's native format.
  */
 static void
 assign_rxopt(struct tcpcb *tp, unsigned int opt)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct inpcb *inp = tp->t_inpcb;
 	struct adapter *sc = td_adapter(toep->td);
 	int n;
 
 	INP_LOCK_ASSERT(inp);
 
 	if (inp->inp_inc.inc_flags & INC_ISIPV6)
 		n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 		n = sizeof(struct ip) + sizeof(struct tcphdr);
 	tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(opt)] - n;
 
 	if (G_TCPOPT_TSTAMP(opt)) {
 		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
 		tp->ts_recent = 0;		/* hmmm */
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
 	}
 
 	CTR5(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), mss %u", __func__,
 	    toep->tid, G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)],
 	    tp->t_maxseg);
 
 	if (G_TCPOPT_SACK(opt))
 		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
 	else
 		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
 
 	if (G_TCPOPT_WSCALE_OK(opt))
 		tp->t_flags |= TF_RCVD_SCALE;
 
 	/* Doing window scaling? */
 	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 		tp->rcv_scale = tp->request_r_scale;
 		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
 	}
 }
 
 /*
  * Completes some final bits of initialization for just established connections
  * and changes their state to TCPS_ESTABLISHED.
  *
  * The ISNs are from the exchange of SYNs.
  */
 void
 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
 {
 	struct inpcb *inp = toep->inp;
 	struct socket *so = inp->inp_socket;
 	struct tcpcb *tp = intotcpcb(inp);
 	long bufsize;
 	uint16_t tcpopt = be16toh(opt);
 	struct flowc_tx_params ftxp;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(tp->t_state == TCPS_SYN_SENT ||
 	    tp->t_state == TCPS_SYN_RECEIVED,
 	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));
 
 	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
 	    __func__, toep->tid, so, inp, tp, toep);
 
 	tcp_state_change(tp, TCPS_ESTABLISHED);
 	tp->t_starttime = ticks;
 	TCPSTAT_INC(tcps_connects);
 
 	tp->irs = irs;
 	tcp_rcvseqinit(tp);
 	tp->rcv_wnd = toep->rx_credits << 10;
 	tp->rcv_adv += tp->rcv_wnd;
 	tp->last_ack_sent = tp->rcv_nxt;
 
 	/*
 	 * If we were unable to send all rx credits via opt0, save the remainder
 	 * in rx_credits so that they can be handed over with the next credit
 	 * update.
 	 */
 	SOCKBUF_LOCK(&so->so_rcv);
 	bufsize = select_rcv_wnd(so);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	toep->rx_credits = bufsize - tp->rcv_wnd;
 
 	tp->iss = iss;
 	tcp_sendseqinit(tp);
 	tp->snd_una = iss + 1;
 	tp->snd_nxt = iss + 1;
 	tp->snd_max = iss + 1;
 
 	assign_rxopt(tp, tcpopt);
 
 	SOCKBUF_LOCK(&so->so_snd);
 	if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf)
 		bufsize = V_tcp_autosndbuf_max;
 	else
 		bufsize = sbspace(&so->so_snd);
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	ftxp.snd_nxt = tp->snd_nxt;
 	ftxp.rcv_nxt = tp->rcv_nxt;
 	ftxp.snd_space = bufsize;
 	ftxp.mss = tp->t_maxseg;
 	send_flowc_wr(toep, &ftxp);
 
 	soisconnected(so);
 }
 
 int
 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
 {
 	struct wrqe *wr;
 	struct cpl_rx_data_ack *req;
 	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 
 	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));
 
 	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
 	if (wr == NULL)
 		return (0);
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
 	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));
 
 	t4_wrq_tx(sc, wr);
 	return (credits);
 }
 
 void
 send_rx_modulate(struct adapter *sc, struct toepcb *toep)
 {
 	struct wrqe *wr;
 	struct cpl_rx_data_ack *req;
 
 	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
 	if (wr == NULL)
 		return;
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
 	req->credit_dack = htobe32(F_RX_MODULATE_RX);
 
 	t4_wrq_tx(sc, wr);
 }
 
 void
 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_rcv;
 	struct toepcb *toep = tp->t_toe;
 	int credits;
 
 	INP_WLOCK_ASSERT(inp);
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	KASSERT(toep->sb_cc >= sbused(sb),
 	    ("%s: sb %p has more data (%d) than last time (%d).",
 	    __func__, sb, sbused(sb), toep->sb_cc));
 
 	credits = toep->sb_cc - sbused(sb);
 	toep->sb_cc = sbused(sb);
 	if (toep->ulp_mode == ULP_MODE_TLS) {
 		if (toep->tls.rcv_over >= credits) {
 			toep->tls.rcv_over -= credits;
 			credits = 0;
 		} else {
 			credits -= toep->tls.rcv_over;
 			toep->tls.rcv_over = 0;
 		}
 	}
 	toep->rx_credits += credits;
 
 	if (toep->rx_credits > 0 &&
 	    (tp->rcv_wnd <= 32 * 1024 || toep->rx_credits >= 64 * 1024 ||
 	    (toep->rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
 	    toep->sb_cc + tp->rcv_wnd < sb->sb_lowat)) {
 
 		credits = send_rx_credits(sc, toep, toep->rx_credits);
 		toep->rx_credits -= credits;
 		tp->rcv_wnd += credits;
 		tp->rcv_adv += credits;
 	} else if (toep->flags & TPF_FORCE_CREDITS)
 		send_rx_modulate(sc, toep);
 }
 
 void
 t4_rcvd(struct toedev *tod, struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_rcv;
 
 	SOCKBUF_LOCK(sb);
 	t4_rcvd_locked(tod, tp);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /*
  * Close a connection by sending a CPL_CLOSE_CON_REQ message.
  */
 int
 t4_close_conn(struct adapter *sc, struct toepcb *toep)
 {
 	struct wrqe *wr;
 	struct cpl_close_con_req *req;
 	unsigned int tid = toep->tid;
 
 	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
 	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");
 
 	if (toep->flags & TPF_FIN_SENT)
 		return (0);
 
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));
 
 	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
         req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
 	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
 	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
 	    V_FW_WR_FLOWID(tid));
         req->wr.wr_lo = cpu_to_be64(0);
         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
 	req->rsvd = 0;
 
 	toep->flags |= TPF_FIN_SENT;
 	toep->flags &= ~TPF_SEND_FIN;
 	t4_l2t_send(sc, wr, toep->l2te);
 
 	return (0);
 }
 
 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
 
 /* Maximum amount of immediate data we could stuff in a WR */
 static inline int
 max_imm_payload(int tx_credits)
 {
 	const int n = 2;	/* Use only up to 2 desc for imm. data WR */
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits < MIN_OFLD_TX_CREDITS)
 		return (0);
 
 	if (tx_credits >= (n * EQ_ESIZE) / 16)
 		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr));
 	else
 		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr));
 }
 
 /* Maximum number of SGL entries we could stuff in a WR */
 static inline int
 max_dsgl_nsegs(int tx_credits)
 {
 	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
 	int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits < MIN_OFLD_TX_CREDITS)
 		return (0);
 
 	nseg += 2 * (sge_pair_credits * 16 / 24);
 	if ((sge_pair_credits * 16) % 24 == 16)
 		nseg++;
 
 	return (nseg);
 }
 
 static inline void
 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
     unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign)
 {
 	struct fw_ofld_tx_data_wr *txwr = dst;
 
 	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
 	    V_FW_WR_IMMDLEN(immdlen));
 	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
 	    V_FW_WR_LEN16(credits));
 	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) |
 	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
 	txwr->plen = htobe32(plen);
 
 	if (txalign > 0) {
 		struct tcpcb *tp = intotcpcb(toep->inp);
 
 		if (plen < 2 * tp->t_maxseg)
 			txwr->lsodisable_to_flags |=
 			    htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
 		else
 			txwr->lsodisable_to_flags |=
 			    htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
 				(tp->t_flags & TF_NODELAY ? 0 :
 				F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
 	}
 }
 
 /*
  * Generate a DSGL from a starting mbuf.  The total number of segments and the
  * maximum segments in any one mbuf are provided.
  */
 static void
 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
 {
 	struct mbuf *m;
 	struct ulptx_sgl *usgl = dst;
 	int i, j, rc;
 	struct sglist sg;
 	struct sglist_seg segs[n];
 
 	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
 
 	sglist_init(&sg, n, segs);
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 
 	i = -1;
 	for (m = start; m != stop; m = m->m_next) {
 		if (IS_AIOTX_MBUF(m))
 			rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m),
 			    aiotx_mbuf_pgoff(m), m->m_len);
 		else
 			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
 		if (__predict_false(rc != 0))
 			panic("%s: sglist_append %d", __func__, rc);
 
 		for (j = 0; j < sg.sg_nseg; i++, j++) {
 			if (i < 0) {
 				usgl->len0 = htobe32(segs[j].ss_len);
 				usgl->addr0 = htobe64(segs[j].ss_paddr);
 			} else {
 				usgl->sge[i / 2].len[i & 1] =
 				    htobe32(segs[j].ss_len);
 				usgl->sge[i / 2].addr[i & 1] =
 				    htobe64(segs[j].ss_paddr);
 			}
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 		}
 		sglist_reset(&sg);
 	}
 	if (i & 1)
 		usgl->sge[i / 2].len[1] = htobe32(0);
 	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
 	    __func__, nsegs, start, stop));
 }
 
 /*
  * Max number of SGL entries an offload tx work request can have.  This is 41
  * (1 + 40) for a full 512B work request.
  * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
  */
 #define OFLD_SGL_LEN (41)
 
 /*
  * Send data and/or a FIN to the peer.
  *
  * The socket's so_snd buffer consists of a stream of data starting with sb_mb
  * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
  * was transmitted.
  *
  * drop indicates the number of bytes that should be dropped from the head of
  * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
  * contention on the send buffer lock (before this change it used to do
  * sowwakeup and then t4_push_frames right after that when recovering from tx
  * stalls).  When drop is set this function MUST drop the bytes and wake up any
  * writers.
  */
 void
 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct mbuf *sndptr, *m, *sb_sndptr;
 	struct fw_ofld_tx_data_wr *txwr;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_snd;
 	int tx_credits, shove, compl, sowwakeup;
 	struct ofld_tx_sdesc *txsd;
 	bool aiotx_mbuf_seen;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 
 	KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
 	    toep->ulp_mode == ULP_MODE_TCPDDP ||
 	    toep->ulp_mode == ULP_MODE_TLS ||
 	    toep->ulp_mode == ULP_MODE_RDMA,
 	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
 
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
 	    __func__, toep->tid, toep->flags, tp->t_flags);
 #endif
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 #ifdef RATELIMIT
 	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
 	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	}
 #endif
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	txsd = &toep->txsd[toep->txsd_pidx];
 	do {
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		max_imm = max_imm_payload(tx_credits);
 		max_nsegs = max_dsgl_nsegs(tx_credits);
 
 		SOCKBUF_LOCK(sb);
 		sowwakeup = drop;
 		if (drop) {
 			sbdrop_locked(sb, drop);
 			drop = 0;
 		}
 		sb_sndptr = sb->sb_sndptr;
 		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
 		plen = 0;
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		aiotx_mbuf_seen = false;
 		for (m = sndptr; m != NULL; m = m->m_next) {
 			int n;
 
 			if (IS_AIOTX_MBUF(m))
 				n = sglist_count_vmpages(aiotx_mbuf_pages(m),
 				    aiotx_mbuf_pgoff(m), m->m_len);
 			else
 				n = sglist_count(mtod(m, void *), m->m_len);
 
 			nsegs += n;
 			plen += m->m_len;
 
 			/* This mbuf sent us _over_ the nsegs limit, back out */
 			if (plen > max_imm && nsegs > max_nsegs) {
 				nsegs -= n;
 				plen -= m->m_len;
 				if (plen == 0) {
 					/* Too few credits */
 					toep->flags |= TPF_TX_SUSPENDED;
 					if (sowwakeup) {
 						if (!TAILQ_EMPTY(
 						    &toep->aiotx_jobq))
 							t4_aiotx_queue_toep(
 							    toep);
 						sowwakeup_locked(so);
 					} else
 						SOCKBUF_UNLOCK(sb);
 					SOCKBUF_UNLOCK_ASSERT(sb);
 					return;
 				}
 				break;
 			}
 
 			if (IS_AIOTX_MBUF(m))
 				aiotx_mbuf_seen = true;
 			if (max_nsegs_1mbuf < n)
 				max_nsegs_1mbuf = n;
 			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */
 
 			/* This mbuf put us right at the max_nsegs limit */
 			if (plen > max_imm && nsegs == max_nsegs) {
 				m = m->m_next;
 				break;
 			}
 		}
 
 		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
 		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
 			compl = 1;
 		else
 			compl = 0;
 
 		if (sb->sb_flags & SB_AUTOSIZE &&
 		    V_tcp_do_autosndbuf &&
 		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
 		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
 			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
 			    V_tcp_autosndbuf_max);
 
 			if (!sbreserve_locked(sb, newsize, so, NULL))
 				sb->sb_flags &= ~SB_AUTOSIZE;
 			else
 				sowwakeup = 1;	/* room available */
 		}
 		if (sowwakeup) {
 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 				t4_aiotx_queue_toep(toep);
 			sowwakeup_locked(so);
 		} else
 			SOCKBUF_UNLOCK(sb);
 		SOCKBUF_UNLOCK_ASSERT(sb);
 
 		/* nothing to send */
 		if (plen == 0) {
 			KASSERT(m == NULL,
 			    ("%s: nothing to send, but m != NULL", __func__));
 			break;
 		}
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
 		if (plen <= max_imm && !aiotx_mbuf_seen) {
 
 			/* Immediate data tx */
 
 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
 					toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr->wr_len, 16);
 			write_tx_wr(txwr, toep, plen, plen, credits, shove, 0,
 			    sc->tt.tx_align);
 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
 			nsegs = 0;
 		} else {
 			int wr_len;
 
 			/* DSGL tx */
 
 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr_len, 16);
 			write_tx_wr(txwr, toep, 0, plen, credits, shove, 0,
 			    sc->tt.tx_align);
 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
 			    max_nsegs_1mbuf);
 			if (wr_len & 0xf) {
 				uint64_t *pad = (uint64_t *)
 				    ((uintptr_t)txwr + wr_len);
 				*pad = 0;
 			}
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		toep->tx_credits -= credits;
 		toep->tx_nocompl += credits;
 		toep->plen_nocompl += plen;
 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
 		    toep->tx_nocompl >= toep->tx_total / 4)
 			compl = 1;
 
 		if (compl || toep->ulp_mode == ULP_MODE_RDMA) {
 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
 		}
 
 		tp->snd_nxt += plen;
 		tp->snd_max += plen;
 
 		SOCKBUF_LOCK(sb);
 		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
 		sb->sb_sndptr = sb_sndptr;
 		SOCKBUF_UNLOCK(sb);
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	} while (m != NULL);
 
 	/* Send a FIN if requested, but only if there's no more data to send */
 	if (m == NULL && toep->flags & TPF_SEND_FIN)
 		t4_close_conn(sc, toep);
 }
 
 static inline void
 rqdrop_locked(struct mbufq *q, int plen)
 {
 	struct mbuf *m;
 
 	while (plen > 0) {
 		m = mbufq_dequeue(q);
 
 		/* Too many credits. */
 		MPASS(m != NULL);
 		M_ASSERTPKTHDR(m);
 
 		/* Partial credits. */
 		MPASS(plen >= m->m_pkthdr.len);
 
 		plen -= m->m_pkthdr.len;
 		m_freem(m);
 	}
 }
 
 void
 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct mbuf *sndptr, *m;
 	struct fw_ofld_tx_data_wr *txwr;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
 	u_int adjusted_plen, ulp_submode;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	int tx_credits, shove;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 	struct mbufq *pduq = &toep->ulp_pduq;
 	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 	KASSERT(toep->ulp_mode == ULP_MODE_ISCSI,
 	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
 
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	if (drop)
 		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
 
 	while ((sndptr = mbufq_first(pduq)) != NULL) {
 		M_ASSERTPKTHDR(sndptr);
 
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		max_imm = max_imm_payload(tx_credits);
 		max_nsegs = max_dsgl_nsegs(tx_credits);
 
 		plen = 0;
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		for (m = sndptr; m != NULL; m = m->m_next) {
 			int n = sglist_count(mtod(m, void *), m->m_len);
 
 			nsegs += n;
 			plen += m->m_len;
 
 			/*
 			 * This mbuf would send us _over_ the nsegs limit.
 			 * Suspend tx because the PDU can't be sent out.
 			 */
 			if (plen > max_imm && nsegs > max_nsegs) {
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 
 			if (max_nsegs_1mbuf < n)
 				max_nsegs_1mbuf = n;
 		}
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		/*
 		 * We have a PDU to send.  All of it goes out in one WR so 'm'
 		 * is NULL.  A PDU's length is always a multiple of 4.
 		 */
 		MPASS(m == NULL);
 		MPASS((plen & 3) == 0);
 		MPASS(sndptr->m_pkthdr.len == plen);
 
 		shove = !(tp->t_flags & TF_MORETOCOME);
 		ulp_submode = mbuf_ulp_submode(sndptr);
 		MPASS(ulp_submode < nitems(ulp_extra_len));
 
 		/*
 		 * plen doesn't include header and data digests, which are
 		 * generated and inserted in the right places by the TOE, but
 		 * they do occupy TCP sequence space and need to be accounted
 		 * for.
 		 */
 		adjusted_plen = plen + ulp_extra_len[ulp_submode];
 		if (plen <= max_imm) {
 
 			/* Immediate data tx */
 
 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
 					toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr->wr_len, 16);
 			write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
 			    shove, ulp_submode, sc->tt.tx_align);
 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
 			nsegs = 0;
 		} else {
 			int wr_len;
 
 			/* DSGL tx */
 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr_len, 16);
 			write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
 			    shove, ulp_submode, sc->tt.tx_align);
 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
 			    max_nsegs_1mbuf);
 			if (wr_len & 0xf) {
 				uint64_t *pad = (uint64_t *)
 				    ((uintptr_t)txwr + wr_len);
 				*pad = 0;
 			}
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		m = mbufq_dequeue(pduq);
 		MPASS(m == sndptr);
 		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);
 
 		toep->tx_credits -= credits;
 		toep->tx_nocompl += credits;
 		toep->plen_nocompl += plen;
 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
 		    toep->tx_nocompl >= toep->tx_total / 4) {
 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
 		}
 
 		tp->snd_nxt += adjusted_plen;
 		tp->snd_max += adjusted_plen;
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	}
 
 	/* Send a FIN if requested, but only if there are no more PDUs to send */
 	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
 		t4_close_conn(sc, toep);
 }
 
 int
 t4_tod_output(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 #ifdef INVARIANTS
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	if (toep->ulp_mode == ULP_MODE_ISCSI)
 		t4_push_pdus(sc, toep, 0);
 	else if (tls_tx_key(toep))
 		t4_push_tls_records(sc, toep, 0);
 	else
 		t4_push_frames(sc, toep, 0);
 
 	return (0);
 }
 
 int
 t4_send_fin(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 #ifdef INVARIANTS
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	toep->flags |= TPF_SEND_FIN;
 	if (tp->t_state >= TCPS_ESTABLISHED) {
 		if (toep->ulp_mode == ULP_MODE_ISCSI)
 			t4_push_pdus(sc, toep, 0);
 		else if (tls_tx_key(toep))
 			t4_push_tls_records(sc, toep, 0);
 		else
 			t4_push_frames(sc, toep, 0);
 	}
 
 	return (0);
 }
 
 int
 t4_send_rst(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 #if defined(INVARIANTS)
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	/* hmmmm */
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc for tid %u [%s] not sent already",
 	    __func__, toep->tid, tcpstates[tp->t_state]));
 
 	send_reset(sc, toep, 0);
 	return (0);
 }
 
 /*
  * Peer has sent us a FIN.
  */
 static int
 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
 	struct socket *so;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_PEER_CLOSE,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		/*
 		 * do_pass_establish must have run before do_peer_close and if
 		 * this is still a synqe instead of a toepcb then the connection
 		 * must be getting aborted.
 		 */
 		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 		    toep, toep->flags);
 		return (0);
 	}
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CURVNET_SET(toep->vnet);
 	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
 	    tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp);
 
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
 	tp->rcv_nxt++;	/* FIN */
 
 	so = inp->inp_socket;
 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
 		DDP_LOCK(toep);
 		if (__predict_false(toep->ddp.flags &
 		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
 			handle_ddp_close(toep, tp, cpl->rcv_nxt);
 		DDP_UNLOCK(toep);
 	}
 	socantrcvmore(so);
 
 	if (toep->ulp_mode != ULP_MODE_RDMA) {
 		KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
 	    		("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
 	    		be32toh(cpl->rcv_nxt)));
 	}
 
 	switch (tp->t_state) {
 	case TCPS_SYN_RECEIVED:
 		tp->t_starttime = ticks;
 		/* FALLTHROUGH */ 
 
 	case TCPS_ESTABLISHED:
 		tcp_state_change(tp, TCPS_CLOSE_WAIT);
 		break;
 
 	case TCPS_FIN_WAIT_1:
 		tcp_state_change(tp, TCPS_CLOSING);
 		break;
 
 	case TCPS_FIN_WAIT_2:
 		tcp_twstart(tp);
 		INP_UNLOCK_ASSERT(inp);	 /* safe, we have a ref on the inp */
 		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 		CURVNET_RESTORE();
 
 		INP_WLOCK(inp);
 		final_cpl_received(toep);
 		return (0);
 
 	default:
 		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
 		    __func__, tid, tp->t_state);
 	}
 done:
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Peer has ACK'd our FIN.
  */
 static int
 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
 	struct socket *so = NULL;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_CLOSE_CON_RPL,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CURVNET_SET(toep->vnet);
 	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);
 
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
 	so = inp->inp_socket;
 	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */
 
 	switch (tp->t_state) {
 	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
 		tcp_twstart(tp);
 release:
 		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
 		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 		CURVNET_RESTORE();
 
 		INP_WLOCK(inp);
 		final_cpl_received(toep);	/* no more CPLs expected */
 
 		return (0);
 	case TCPS_LAST_ACK:
 		if (tcp_close(tp))
 			INP_WUNLOCK(inp);
 		goto release;
 
 	case TCPS_FIN_WAIT_1:
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 			soisdisconnected(so);
 		tcp_state_change(tp, TCPS_FIN_WAIT_2);
 		break;
 
 	default:
 		log(LOG_ERR,
 		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
 		    __func__, tid, tcpstates[tp->t_state]);
 	}
 done:
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 void
 send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
     int rst_status)
 {
 	struct wrqe *wr;
 	struct cpl_abort_rpl *cpl;
 
 	wr = alloc_wrqe(sizeof(*cpl), ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	cpl = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
 	cpl->cmd = rst_status;
 
 	t4_wrq_tx(sc, wr);
 }
 
 static int
 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
 {
 	switch (abort_reason) {
 	case CPL_ERR_BAD_SYN:
 	case CPL_ERR_CONN_RESET:
 		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
 	case CPL_ERR_XMIT_TIMEDOUT:
 	case CPL_ERR_PERSIST_TIMEDOUT:
 	case CPL_ERR_FINWAIT2_TIMEDOUT:
 	case CPL_ERR_KEEPALIVE_TIMEDOUT:
 		return (ETIMEDOUT);
 	default:
 		return (EIO);
 	}
 }
 
 /*
  * TCP RST from the peer, timeout, or some other such critical error.
  */
 static int
 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct sge_wrq *ofld_txq = toep->ofld_txq;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (toep->flags & TPF_SYNQE)
 		return (do_abort_req_synqe(iq, rss, m));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	if (negative_advice(cpl->status)) {
 		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
 		    __func__, cpl->status, tid, toep->flags);
 		return (0);	/* Ignore negative advice */
 	}
 
 	inp = toep->inp;
 	CURVNET_SET(toep->vnet);
 	INP_INFO_RLOCK_ET(&V_tcbinfo, et);	/* for tcp_close */
 	INP_WLOCK(inp);
 
 	tp = intotcpcb(inp);
 
 	CTR6(KTR_CXGBE,
 	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
 	    inp->inp_flags, cpl->status);
 
 	/*
 	 * If we'd initiated an abort earlier the reply to it is responsible for
 	 * cleaning up resources.  Otherwise we tear everything down right here
 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
 	 */
 	if (toep->flags & TPF_ABORT_SHUTDOWN) {
 		INP_WUNLOCK(inp);
 		goto done;
 	}
 	toep->flags |= TPF_ABORT_SHUTDOWN;
 
 	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
 		struct socket *so = inp->inp_socket;
 
 		if (so != NULL)
 			so_error_set(so, abort_status_to_errno(tp,
 			    cpl->status));
 		tp = tcp_close(tp);
 		if (tp == NULL)
 			INP_WLOCK(inp);	/* re-acquire */
 	}
 
 	final_cpl_received(toep);
 done:
 	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 	CURVNET_RESTORE();
 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
 	return (0);
 }
 
 /*
  * Reply to the CPL_ABORT_REQ (send_reset)
  */
 static int
 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_RPL_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (toep->flags & TPF_SYNQE)
 		return (do_abort_rpl_synqe(iq, rss, m));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
 	    __func__, tid, toep, inp, cpl->status);
 
 	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 	    ("%s: wasn't expecting abort reply", __func__));
 
 	INP_WLOCK(inp);
 	final_cpl_received(toep);
 
 	return (0);
 }
 
 static int
 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_data *cpl = mtod(m, const void *);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct epoch_tracker et;
 	int len;
 	uint32_t ddp_placed = 0;
 
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		/*
 		 * do_pass_establish must have run before do_rx_data and if this
 		 * is still a synqe instead of a toepcb then the connection must
 		 * be getting aborted.
 		 */
 		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 		    toep, toep->flags);
 		m_freem(m);
 		return (0);
 	}
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	tp = intotcpcb(inp);
 
 	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
 		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;
 
 	tp->rcv_nxt += len;
 	if (tp->rcv_wnd < len) {
 		KASSERT(toep->ulp_mode == ULP_MODE_RDMA,
 				("%s: negative window size", __func__));
 	}
 
 	tp->rcv_wnd -= len;
 	tp->t_rcvtime = ticks;
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
 		DDP_LOCK(toep);
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
 		    __func__, tid, len);
 		m_freem(m);
 		SOCKBUF_UNLOCK(sb);
 		if (toep->ulp_mode == ULP_MODE_TCPDDP)
 			DDP_UNLOCK(toep);
 		INP_WUNLOCK(inp);
 
 		CURVNET_SET(toep->vnet);
 		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		if (tp)
 			INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 		CURVNET_RESTORE();
 
 		return (0);
 	}
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    len > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
-		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
+		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
 		else
 			toep->rx_credits += newsize - hiwat;
 	}
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
 		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;
 
 		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
 			CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
 			    __func__, tid, len);
 
 		if (changed) {
 			if (toep->ddp.flags & DDP_SC_REQ)
 				toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
 			else {
 				KASSERT(cpl->ddp_off == 1,
 				    ("%s: DDP switched on by itself.",
 				    __func__));
 
 				/* Fell out of DDP mode */
 				toep->ddp.flags &= ~DDP_ON;
 				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
 				    __func__);
 
 				insert_ddp_data(toep, ddp_placed);
 			}
 		}
 
 		if (toep->ddp.flags & DDP_ON) {
 			/*
 			 * CPL_RX_DATA with DDP on can only be an indicate.
 			 * Start posting queued AIO requests via DDP.  The
 			 * payload that arrived in this indicate is appended
 			 * to the socket buffer as usual.
 			 */
 			handle_ddp_indicate(toep);
 		}
 	}
 
 	KASSERT(toep->sb_cc >= sbused(sb),
 	    ("%s: sb %p has more data (%d) than last time (%d).",
 	    __func__, sb, sbused(sb), toep->sb_cc));
 	toep->rx_credits += toep->sb_cc - sbused(sb);
 	sbappendstream_locked(sb, m, 0);
 	toep->sb_cc = sbused(sb);
 	if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
 		int credits;
 
 		credits = send_rx_credits(sc, toep, toep->rx_credits);
 		toep->rx_credits -= credits;
 		tp->rcv_wnd += credits;
 		tp->rcv_adv += credits;
 	}
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
 	    sbavail(sb) != 0) {
 		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
 		    tid);
 		ddp_queue_toep(toep);
 	}
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
 		DDP_UNLOCK(toep);
 
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 static int
 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	uint8_t credits = cpl->credits;
 	struct ofld_tx_sdesc *txsd;
 	int plen;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	/*
 	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
 	 * now this comes back carrying the credits for the flowc.
 	 */
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 		    ("%s: credits for a synq entry %p", __func__, toep));
 		return (0);
 	}
 
 	inp = toep->inp;
 
 	KASSERT(opcode == CPL_FW4_ACK,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	INP_WLOCK(inp);
 
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
 		INP_WUNLOCK(inp);
 		return (0);
 	}
 
 	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
 	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));
 
 	tp = intotcpcb(inp);
 
 	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
 		tcp_seq snd_una = be32toh(cpl->snd_una);
 
 #ifdef INVARIANTS
 		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
 			log(LOG_ERR,
 			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
 			    __func__, snd_una, toep->tid, tp->snd_una);
 		}
 #endif
 
 		if (tp->snd_una != snd_una) {
 			tp->snd_una = snd_una;
 			tp->ts_recent_age = tcp_ts_getticks();
 		}
 	}
 
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
 #endif
 	so = inp->inp_socket;
 	txsd = &toep->txsd[toep->txsd_cidx];
 	plen = 0;
 	while (credits) {
 		KASSERT(credits >= txsd->tx_credits,
 		    ("%s: too many (or partial) credits", __func__));
 		credits -= txsd->tx_credits;
 		toep->tx_credits += txsd->tx_credits;
 		plen += txsd->plen;
 		if (txsd->iv_buffer) {
 			free(txsd->iv_buffer, M_CXGBE);
 			txsd->iv_buffer = NULL;
 		}
 		txsd++;
 		toep->txsd_avail++;
 		KASSERT(toep->txsd_avail <= toep->txsd_total,
 		    ("%s: txsd avail > total", __func__));
 		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
 			txsd = &toep->txsd[0];
 			toep->txsd_cidx = 0;
 		}
 	}
 
 	if (toep->tx_credits == toep->tx_total) {
 		toep->tx_nocompl = 0;
 		toep->plen_nocompl = 0;
 	}
 
 	if (toep->flags & TPF_TX_SUSPENDED &&
 	    toep->tx_credits >= toep->tx_total / 4) {
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
 		    tid);
 #endif
 		toep->flags &= ~TPF_TX_SUSPENDED;
 		CURVNET_SET(toep->vnet);
 		if (toep->ulp_mode == ULP_MODE_ISCSI)
 			t4_push_pdus(sc, toep, plen);
 		else if (tls_tx_key(toep))
 			t4_push_tls_records(sc, toep, plen);
 		else
 			t4_push_frames(sc, toep, plen);
 		CURVNET_RESTORE();
 	} else if (plen > 0) {
 		struct sockbuf *sb = &so->so_snd;
 		int sbu;
 
 		SOCKBUF_LOCK(sb);
 		sbu = sbused(sb);
 		if (toep->ulp_mode == ULP_MODE_ISCSI) {
 
 			if (__predict_false(sbu > 0)) {
 				/*
 				 * The data trasmitted before the tid's ULP mode
 				 * changed to ISCSI is still in so_snd.
 				 * Incoming credits should account for so_snd
 				 * first.
 				 */
 				sbdrop_locked(sb, min(sbu, plen));
 				plen -= min(sbu, plen);
 			}
 			sowwakeup_locked(so);	/* unlocks so_snd */
 			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
 		} else {
 #ifdef VERBOSE_TRACES
 			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
 			    tid, plen);
 #endif
 			sbdrop_locked(sb, plen);
 			if (tls_tx_key(toep)) {
 				struct tls_ofld_info *tls_ofld = &toep->tls;
 
 				MPASS(tls_ofld->sb_off >= plen);
 				tls_ofld->sb_off -= plen;
 			}
 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 				t4_aiotx_queue_toep(toep);
 			sowwakeup_locked(so);	/* unlocks so_snd */
 		}
 		SOCKBUF_UNLOCK_ASSERT(sb);
 	}
 
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 void
 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
     uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
 {
 	struct wrqe *wr;
 	struct cpl_set_tcb_field *req;
 	struct ofld_tx_sdesc *txsd;
 
 	MPASS((cookie & ~M_COOKIE) == 0);
 	if (reply) {
 		MPASS(cookie != CPL_COOKIE_RESERVED);
 	}
 
 	wr = alloc_wrqe(sizeof(*req), wrq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
 	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
 	if (reply == 0)
 		req->reply_ctrl |= htobe16(F_NO_REPLY);
 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
 	req->mask = htobe64(mask);
 	req->val = htobe64(val);
 	if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) {
 		txsd = &toep->txsd[toep->txsd_pidx];
 		txsd->tx_credits = howmany(sizeof(*req), 16);
 		txsd->plen = 0;
 		KASSERT(toep->tx_credits >= txsd->tx_credits &&
 		    toep->txsd_avail > 0,
 		    ("%s: not enough credits (%d)", __func__,
 		    toep->tx_credits));
 		toep->tx_credits -= txsd->tx_credits;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 			toep->txsd_pidx = 0;
 		toep->txsd_avail--;
 	}
 
 	t4_wrq_tx(sc, wr);
 }
 
 void
 t4_init_cpl_io_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
 	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
 	    CPL_COOKIE_TOM);
 	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
 }
 
 void
 t4_uninit_cpl_io_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
 	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
 	t4_register_cpl_handler(CPL_RX_DATA, NULL);
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
 }
 
 /*
  * Use the 'backend3' field in AIO jobs to store the amount of data
  * sent by the AIO job so far and the 'backend4' field to hold an
  * error that should be reported when the job is completed.
  */
 #define	aio_sent	backend3
 #define	aio_error	backend4
 
 #define	jobtotid(job)							\
 	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)
 	
 static void
 free_aiotx_buffer(struct aiotx_buffer *ab)
 {
 	struct kaiocb *job;
 	long status;
 	int error;
 
 	if (refcount_release(&ab->refcount) == 0)
 		return;
 
 	job = ab->job;
 	error = job->aio_error;
 	status = job->aio_sent;
 	vm_page_unhold_pages(ab->ps.pages, ab->ps.npages);
 	free(ab, M_CXGBE);
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
 	    jobtotid(job), job, status, error);
 #endif
 	if (error == ECANCELED && status != 0)
 		error = 0;
 	if (error == ECANCELED)
 		aio_cancel(job);
 	else if (error)
 		aio_complete(job, -1, error);
 	else
 		aio_complete(job, status, 0);
 }
 
 static void
 t4_aiotx_mbuf_free(struct mbuf *m)
 {
 	struct aiotx_buffer *ab = m->m_ext.ext_arg1;
 
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
 	    m->m_len, jobtotid(ab->job));
 #endif
 	free_aiotx_buffer(ab);
 }
 
 /*
  * Hold the buffer backing an AIO request and return an AIO transmit
  * buffer.
  */
 static int
 hold_aio(struct kaiocb *job)
 {
 	struct aiotx_buffer *ab;
 	struct vmspace *vm;
 	vm_map_t map;
 	vm_offset_t start, end, pgoff;
 	int n;
 
 	MPASS(job->backend1 == NULL);
 
 	/*
 	 * The AIO subsystem will cancel and drain all requests before
 	 * permitting a process to exit or exec, so p_vmspace should
 	 * be stable here.
 	 */
 	vm = job->userproc->p_vmspace;
 	map = &vm->vm_map;
 	start = (uintptr_t)job->uaiocb.aio_buf;
 	pgoff = start & PAGE_MASK;
 	end = round_page(start + job->uaiocb.aio_nbytes);
 	start = trunc_page(start);
 	n = atop(end - start);
 
 	ab = malloc(sizeof(*ab) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
 	    M_ZERO);
 	refcount_init(&ab->refcount, 1);
 	ab->ps.pages = (vm_page_t *)(ab + 1);
 	ab->ps.npages = vm_fault_quick_hold_pages(map, start, end - start,
 	    VM_PROT_WRITE, ab->ps.pages, n);
 	if (ab->ps.npages < 0) {
 		free(ab, M_CXGBE);
 		return (EFAULT);
 	}
 
 	KASSERT(ab->ps.npages == n,
 	    ("hold_aio: page count mismatch: %d vs %d", ab->ps.npages, n));
 
 	ab->ps.offset = pgoff;
 	ab->ps.len = job->uaiocb.aio_nbytes;
 	ab->job = job;
 	job->backend1 = ab;
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
 	    __func__, jobtotid(job), &ab->ps, job, ab->ps.npages);
 #endif
 	return (0);
 }
 
 static void
 t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
 {
 	struct adapter *sc;
 	struct sockbuf *sb;
 	struct file *fp;
 	struct aiotx_buffer *ab;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct mbuf *m;
 	int error;
 	bool moretocome, sendmore;
 
 	sc = td_adapter(toep->td);
 	sb = &so->so_snd;
 	SOCKBUF_UNLOCK(sb);
 	fp = job->fd_file;
 	ab = job->backend1;
 	m = NULL;
 
 #ifdef MAC
 	error = mac_socket_check_send(fp->f_cred, so);
 	if (error != 0)
 		goto out;
 #endif
 
 	if (ab == NULL) {
 		error = hold_aio(job);
 		if (error != 0)
 			goto out;
 		ab = job->backend1;
 	}
 
 	/* Inline sosend_generic(). */
 
 	job->msgsnd = 1;
 
 	error = sblock(sb, SBL_WAIT);
 	MPASS(error == 0);
 
 sendanother:
 	m = m_get(M_WAITOK, MT_DATA);
 
 	SOCKBUF_LOCK(sb);
 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		if ((so->so_options & SO_NOSIGPIPE) == 0) {
 			PROC_LOCK(job->userproc);
 			kern_psignal(job->userproc, SIGPIPE);
 			PROC_UNLOCK(job->userproc);
 		}
 		error = EPIPE;
 		goto out;
 	}
 	if (so->so_error) {
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		goto out;
 	}
 	if ((so->so_state & SS_ISCONNECTED) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		error = ENOTCONN;
 		goto out;
 	}
 	if (sbspace(sb) < sb->sb_lowat) {
 		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));
 
 		/*
 		 * Don't block if there is too little room in the socket
 		 * buffer.  Instead, requeue the request.
 		 */
 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 			SOCKBUF_UNLOCK(sb);
 			sbunlock(sb);
 			error = ECANCELED;
 			goto out;
 		}
 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 		SOCKBUF_UNLOCK(sb);
 		sbunlock(sb);
 		goto out;
 	}
 
 	/*
 	 * Write as much data as the socket permits, but no more than a
 	 * a single sndbuf at a time.
 	 */
 	m->m_len = sbspace(sb);
 	if (m->m_len > ab->ps.len - job->aio_sent) {
 		m->m_len = ab->ps.len - job->aio_sent;
 		moretocome = false;
 	} else
 		moretocome = true;
 	if (m->m_len > sc->tt.sndbuf) {
 		m->m_len = sc->tt.sndbuf;
 		sendmore = true;
 	} else
 		sendmore = false;
 
 	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 		moretocome = true;
 	SOCKBUF_UNLOCK(sb);
 	MPASS(m->m_len != 0);
 
 	/* Inlined tcp_usr_send(). */
 
 	inp = toep->inp;
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		sbunlock(sb);
 		error = ECONNRESET;
 		goto out;
 	}
 
 	refcount_acquire(&ab->refcount);
 	m_extadd(m, NULL, ab->ps.len, t4_aiotx_mbuf_free, ab,
 	    (void *)(uintptr_t)job->aio_sent, 0, EXT_NET_DRV);
 	m->m_ext.ext_flags |= EXT_FLAG_AIOTX;
 	job->aio_sent += m->m_len;
 	
 	sbappendstream(sb, m, 0);
 	m = NULL;
 
 	if (!(inp->inp_flags & INP_DROPPED)) {
 		tp = intotcpcb(inp);
 		if (moretocome)
 			tp->t_flags |= TF_MORETOCOME;
 		error = tp->t_fb->tfb_tcp_output(tp);
 		if (moretocome)
 			tp->t_flags &= ~TF_MORETOCOME;
 	}
 
 	INP_WUNLOCK(inp);
 	if (sendmore)
 		goto sendanother;
 	sbunlock(sb);
 
 	if (error)
 		goto out;
 
 	/*
 	 * If this is a non-blocking socket and the request has not
 	 * been fully completed, requeue it until the socket is ready
 	 * again.
 	 */
 	if (job->aio_sent < job->uaiocb.aio_nbytes &&
 	    !(so->so_state & SS_NBIO)) {
 		SOCKBUF_LOCK(sb);
 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 			SOCKBUF_UNLOCK(sb);
 			error = ECANCELED;
 			goto out;
 		}
 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 		return;
 	}
 
 	/*
 	 * If the request will not be requeued, drop a reference on
 	 * the aiotx buffer.  Any mbufs in flight should still
 	 * contain a reference, but this drops the reference that the
 	 * job owns while it is waiting to queue mbufs to the socket.
 	 */
 	free_aiotx_buffer(ab);
 
 out:
 	if (error) {
 		if (ab != NULL) {
 			job->aio_error = error;
 			free_aiotx_buffer(ab);
 		} else {
 			MPASS(job->aio_sent == 0);
 			aio_complete(job, -1, error);
 		}
 	}
 	if (m != NULL)
 		m_free(m);
 	SOCKBUF_LOCK(sb);
 }
 
 static void
 t4_aiotx_task(void *context, int pending)
 {
 	struct toepcb *toep = context;
 	struct inpcb *inp = toep->inp;
 	struct socket *so = inp->inp_socket;
 	struct kaiocb *job;
 
 	CURVNET_SET(toep->vnet);
 	SOCKBUF_LOCK(&so->so_snd);
 	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
 		job = TAILQ_FIRST(&toep->aiotx_jobq);
 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 		if (!aio_clear_cancel_function(job))
 			continue;
 
 		t4_aiotx_process_job(toep, so, job);
 	}
 	toep->aiotx_task_active = false;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	CURVNET_RESTORE();
 
 	free_toepcb(toep);
 }
 
 static void
 t4_aiotx_queue_toep(struct toepcb *toep)
 {
 
 	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
 	    __func__, toep->tid, toep->aiotx_task_active ? "true" : "false");
 #endif
 	if (toep->aiotx_task_active)
 		return;
 	toep->aiotx_task_active = true;
 	hold_toepcb(toep);
 	soaio_enqueue(&toep->aiotx_task);
 }
 
 static void
 t4_aiotx_cancel(struct kaiocb *job)
 {
 	struct aiotx_buffer *ab;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 
 	so = job->fd_file->f_data;
 	tp = so_sototcpcb(so);
 	toep = tp->t_toe;
 	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
 	sb = &so->so_snd;
 
 	SOCKBUF_LOCK(sb);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 	SOCKBUF_UNLOCK(sb);
 
 	ab = job->backend1;
 	if (ab != NULL)
 		free_aiotx_buffer(ab);
 	else
 		aio_cancel(job);
 }
 
 int
 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = td_adapter(toep->td);
 
 	/* This only handles writes. */
 	if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
 		return (EOPNOTSUPP);
 
 	if (!sc->tt.tx_zcopy)
 		return (EOPNOTSUPP);
 
 	if (tls_tx_key(toep))
 		return (EOPNOTSUPP);
 
 	SOCKBUF_LOCK(&so->so_snd);
 #ifdef VERBOSE_TRACES
 	CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job);
 #endif
 	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
 		panic("new job was cancelled");
 	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
 	if (sowriteable(so))
 		t4_aiotx_queue_toep(toep);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	return (0);
 }
 
 void
 aiotx_init_toep(struct toepcb *toep)
 {
 
 	TAILQ_INIT(&toep->aiotx_jobq);
 	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
 }
 #endif
Index: projects/capsicum-test/sys/dev/cxgbe/tom/t4_ddp.c
===================================================================
--- projects/capsicum-test/sys/dev/cxgbe/tom/t4_ddp.c	(revision 345709)
+++ projects/capsicum-test/sys/dev/cxgbe/tom/t4_ddp.c	(revision 345710)
@@ -1,1978 +1,1979 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/aio.h>
 #include <sys/file.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/taskqueue.h>
 #include <sys/uio.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/tcp_var.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/toecore.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom.h"
 
 /*
  * Use the 'backend3' field in AIO jobs to store the amount of data
  * received by the AIO job so far.
  */
 #define	aio_received	backend3
 
 static void aio_ddp_requeue_task(void *context, int pending);
 static void ddp_complete_all(struct toepcb *toep, int error);
 static void t4_aio_cancel_active(struct kaiocb *job);
 static void t4_aio_cancel_queued(struct kaiocb *job);
 
 static TAILQ_HEAD(, pageset) ddp_orphan_pagesets;
 static struct mtx ddp_orphan_pagesets_lock;
 static struct task ddp_orphan_task;
 
 #define MAX_DDP_BUFFER_SIZE		(M_TCB_RX_DDP_BUF0_LEN)
 
 /*
  * A page set holds information about a buffer used for DDP.  The page
  * set holds resources such as the VM pages backing the buffer (either
  * held or wired) and the page pods associated with the buffer.
  * Recently used page sets are cached to allow for efficient reuse of
  * buffers (avoiding the need to re-fault in pages, hold them, etc.).
  * Note that cached page sets keep the backing pages wired.  The
  * number of wired pages is capped by only allowing for two wired
  * pagesets per connection.  This is not a perfect cap, but is a
  * trade-off for performance.
  *
  * If an application ping-pongs two buffers for a connection via
  * aio_read(2) then those buffers should remain wired and expensive VM
  * fault lookups should be avoided after each buffer has been used
  * once.  If an application uses more than two buffers then this will
  * fall back to doing expensive VM fault lookups for each operation.
  */
 static void
 free_pageset(struct tom_data *td, struct pageset *ps)
 {
 	vm_page_t p;
 	int i;
 
 	if (ps->prsv.prsv_nppods > 0)
 		t4_free_page_pods(&ps->prsv);
 
 	if (ps->flags & PS_WIRED) {
 		for (i = 0; i < ps->npages; i++) {
 			p = ps->pages[i];
 			vm_page_lock(p);
 			vm_page_unwire(p, PQ_INACTIVE);
 			vm_page_unlock(p);
 		}
 	} else
 		vm_page_unhold_pages(ps->pages, ps->npages);
 	mtx_lock(&ddp_orphan_pagesets_lock);
 	TAILQ_INSERT_TAIL(&ddp_orphan_pagesets, ps, link);
 	taskqueue_enqueue(taskqueue_thread, &ddp_orphan_task);
 	mtx_unlock(&ddp_orphan_pagesets_lock);
 }
 
 static void
 ddp_free_orphan_pagesets(void *context, int pending)
 {
 	struct pageset *ps;
 
 	mtx_lock(&ddp_orphan_pagesets_lock);
 	while (!TAILQ_EMPTY(&ddp_orphan_pagesets)) {
 		ps = TAILQ_FIRST(&ddp_orphan_pagesets);
 		TAILQ_REMOVE(&ddp_orphan_pagesets, ps, link);
 		mtx_unlock(&ddp_orphan_pagesets_lock);
 		if (ps->vm)
 			vmspace_free(ps->vm);
 		free(ps, M_CXGBE);
 		mtx_lock(&ddp_orphan_pagesets_lock);
 	}
 	mtx_unlock(&ddp_orphan_pagesets_lock);
 }
 
 static void
 recycle_pageset(struct toepcb *toep, struct pageset *ps)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	if (!(toep->ddp.flags & DDP_DEAD) && ps->flags & PS_WIRED) {
 		KASSERT(toep->ddp.cached_count + toep->ddp.active_count <
 		    nitems(toep->ddp.db), ("too many wired pagesets"));
 		TAILQ_INSERT_HEAD(&toep->ddp.cached_pagesets, ps, link);
 		toep->ddp.cached_count++;
 	} else
 		free_pageset(toep->td, ps);
 }
 
 static void
 ddp_complete_one(struct kaiocb *job, int error)
 {
 	long copied;
 
 	/*
 	 * If this job had copied data out of the socket buffer before
 	 * it was cancelled, report it as a short read rather than an
 	 * error.
 	 */
 	copied = job->aio_received;
 	if (copied != 0 || error == 0)
 		aio_complete(job, copied, 0);
 	else
 		aio_complete(job, -1, error);
 }
 
 static void
 free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db)
 {
 
 	if (db->job) {
 		/*
 		 * XXX: If we are un-offloading the socket then we
 		 * should requeue these on the socket somehow.  If we
 		 * got a FIN from the remote end, then this completes
 		 * any remaining requests with an EOF read.
 		 */
 		if (!aio_clear_cancel_function(db->job))
 			ddp_complete_one(db->job, 0);
 	}
 
 	if (db->ps)
 		free_pageset(td, db->ps);
 }
 
 void
 ddp_init_toep(struct toepcb *toep)
 {
 
 	TAILQ_INIT(&toep->ddp.aiojobq);
 	TASK_INIT(&toep->ddp.requeue_task, 0, aio_ddp_requeue_task, toep);
 	toep->ddp.flags = DDP_OK;
 	toep->ddp.active_id = -1;
 	mtx_init(&toep->ddp.lock, "t4 ddp", NULL, MTX_DEF);
 }
 
 void
 ddp_uninit_toep(struct toepcb *toep)
 {
 
 	mtx_destroy(&toep->ddp.lock);
 }
 
 void
 release_ddp_resources(struct toepcb *toep)
 {
 	struct pageset *ps;
 	int i;
 
 	DDP_LOCK(toep);
 	toep->flags |= DDP_DEAD;
 	for (i = 0; i < nitems(toep->ddp.db); i++) {
 		free_ddp_buffer(toep->td, &toep->ddp.db[i]);
 	}
 	while ((ps = TAILQ_FIRST(&toep->ddp.cached_pagesets)) != NULL) {
 		TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
 		free_pageset(toep->td, ps);
 	}
 	ddp_complete_all(toep, 0);
 	DDP_UNLOCK(toep);
 }
 
 #ifdef INVARIANTS
 void
 ddp_assert_empty(struct toepcb *toep)
 {
 	int i;
 
 	MPASS(!(toep->ddp.flags & DDP_TASK_ACTIVE));
 	for (i = 0; i < nitems(toep->ddp.db); i++) {
 		MPASS(toep->ddp.db[i].job == NULL);
 		MPASS(toep->ddp.db[i].ps == NULL);
 	}
 	MPASS(TAILQ_EMPTY(&toep->ddp.cached_pagesets));
 	MPASS(TAILQ_EMPTY(&toep->ddp.aiojobq));
 }
 #endif
 
 static void
 complete_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db,
     unsigned int db_idx)
 {
 	unsigned int db_flag;
 
 	toep->ddp.active_count--;
 	if (toep->ddp.active_id == db_idx) {
 		if (toep->ddp.active_count == 0) {
 			KASSERT(toep->ddp.db[db_idx ^ 1].job == NULL,
 			    ("%s: active_count mismatch", __func__));
 			toep->ddp.active_id = -1;
 		} else
 			toep->ddp.active_id ^= 1;
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
 		    toep->ddp.active_id);
 #endif
 	} else {
 		KASSERT(toep->ddp.active_count != 0 &&
 		    toep->ddp.active_id != -1,
 		    ("%s: active count mismatch", __func__));
 	}
 
 	db->cancel_pending = 0;
 	db->job = NULL;
 	recycle_pageset(toep, db->ps);
 	db->ps = NULL;
 
 	db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 	KASSERT(toep->ddp.flags & db_flag,
 	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x",
 	    __func__, toep, toep->ddp.flags));
 	toep->ddp.flags &= ~db_flag;
 }
 
 /* XXX: handle_ddp_data code duplication */
 void
 insert_ddp_data(struct toepcb *toep, uint32_t n)
 {
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	size_t placed;
 	long copied;
 	unsigned int db_flag, db_idx;
 
 	INP_WLOCK_ASSERT(inp);
 	DDP_ASSERT_LOCKED(toep);
 
 	tp->rcv_nxt += n;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= n;
 #endif
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	toep->rx_credits += n;
 #endif
 	CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP",
 	    __func__, n);
 	while (toep->ddp.active_count > 0) {
 		MPASS(toep->ddp.active_id != -1);
 		db_idx = toep->ddp.active_id;
 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 		MPASS((toep->ddp.flags & db_flag) != 0);
 		db = &toep->ddp.db[db_idx];
 		job = db->job;
 		copied = job->aio_received;
 		placed = n;
 		if (placed > job->uaiocb.aio_nbytes - copied)
 			placed = job->uaiocb.aio_nbytes - copied;
 		if (placed > 0)
 			job->msgrcv = 1;
 		if (!aio_clear_cancel_function(job)) {
 			/*
 			 * Update the copied length for when
 			 * t4_aio_cancel_active() completes this
 			 * request.
 			 */
 			job->aio_received += placed;
 		} else if (copied + placed != 0) {
 			CTR4(KTR_CXGBE,
 			    "%s: completing %p (copied %ld, placed %lu)",
 			    __func__, job, copied, placed);
 			/* XXX: This always completes if there is some data. */
 			aio_complete(job, copied + placed, 0);
 		} else if (aio_set_cancel_function(job, t4_aio_cancel_queued)) {
 			TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
 			toep->ddp.waiting_count++;
 		} else
 			aio_cancel(job);
 		n -= placed;
 		complete_ddp_buffer(toep, db, db_idx);
 	}
 
 	MPASS(n == 0);
 }
 
 /* SET_TCB_FIELD sent as a ULP command looks like this */
 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
 
 /* RX_DATA_ACK sent as a ULP command looks like this */
 #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))
 
 static inline void *
 mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
     uint64_t word, uint64_t mask, uint64_t val)
 {
 	struct ulptx_idata *ulpsc;
 	struct cpl_set_tcb_field_core *req;
 
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
 
 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	ulpsc->len = htobe32(sizeof(*req));
 
 	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
 	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
 	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
         req->mask = htobe64(mask);
         req->val = htobe64(val);
 
 	ulpsc = (struct ulptx_idata *)(req + 1);
 	if (LEN__SET_TCB_FIELD_ULP % 16) {
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 		ulpsc->len = htobe32(0);
 		return (ulpsc + 1);
 	}
 	return (ulpsc);
 }
 
 static inline void *
 mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
 {
 	struct ulptx_idata *ulpsc;
 	struct cpl_rx_data_ack_core *req;
 
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));
 
 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	ulpsc->len = htobe32(sizeof(*req));
 
 	req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
 	req->credit_dack = htobe32(F_RX_MODULATE_RX);
 
 	ulpsc = (struct ulptx_idata *)(req + 1);
 	if (LEN__RX_DATA_ACK_ULP % 16) {
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 		ulpsc->len = htobe32(0);
 		return (ulpsc + 1);
 	}
 	return (ulpsc);
 }
 
 static struct wrqe *
 mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
     struct pageset *ps, int offset, uint64_t ddp_flags, uint64_t ddp_flags_mask)
 {
 	struct wrqe *wr;
 	struct work_request_hdr *wrh;
 	struct ulp_txpkt *ulpmc;
 	int len;
 
 	KASSERT(db_idx == 0 || db_idx == 1,
 	    ("%s: bad DDP buffer index %d", __func__, db_idx));
 
 	/*
 	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
 	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
 	 *
 	 * The work request header is 16B and always ends at a 16B boundary.
 	 * The ULPTX master commands that follow must all end at 16B boundaries
 	 * too so we round up the size to 16.
 	 */
 	len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
 	    roundup2(LEN__RX_DATA_ACK_ULP, 16);
 
 	wr = alloc_wrqe(len, toep->ctrlq);
 	if (wr == NULL)
 		return (NULL);
 	wrh = wrtod(wr);
 	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
 	ulpmc = (struct ulp_txpkt *)(wrh + 1);
 
 	/* Write the buffer's tag */
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
 	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
 	    V_TCB_RX_DDP_BUF0_TAG(ps->prsv.prsv_tag));
 
 	/* Update the current offset in the DDP buffer and its total length */
 	if (db_idx == 0)
 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 		    W_TCB_RX_DDP_BUF0_OFFSET,
 		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
 		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
 		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
 		    V_TCB_RX_DDP_BUF0_LEN(ps->len));
 	else
 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 		    W_TCB_RX_DDP_BUF1_OFFSET,
 		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
 		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
 		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
 		    V_TCB_RX_DDP_BUF1_LEN((u64)ps->len << 32));
 
 	/* Update DDP flags */
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
 	    ddp_flags_mask, ddp_flags);
 
 	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
 	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);
 
 	return (wr);
 }
 
 static int
 handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
 {
 	uint32_t report = be32toh(ddp_report);
 	unsigned int db_idx;
 	struct inpcb *inp = toep->inp;
 	struct ddp_buffer *db;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct kaiocb *job;
 	long copied;
 
 	db_idx = report & F_DDP_BUF_IDX ? 1 : 0;
 
 	if (__predict_false(!(report & F_DDP_INV)))
 		CXGBE_UNIMPLEMENTED("DDP buffer still valid");
 
 	INP_WLOCK(inp);
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	DDP_LOCK(toep);
 
 	KASSERT(toep->ddp.active_id == db_idx,
 	    ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx,
 	    toep->ddp.active_id, toep->tid));
 	db = &toep->ddp.db[db_idx];
 	job = db->job;
 
 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
 		/*
 		 * This can happen due to an administrative tcpdrop(8).
 		 * Just fail the request with ECONNRESET.
 		 */
 		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
 		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
 		if (aio_clear_cancel_function(job))
 			ddp_complete_one(job, ECONNRESET);
 		goto completed;
 	}
 
 	tp = intotcpcb(inp);
 
 	/*
 	 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
 	 * sequence number of the next byte to receive.  The length of
 	 * the data received for this message must be computed by
 	 * comparing the new and old values of rcv_nxt.
 	 *
 	 * For RX_DATA_DDP, len might be non-zero, but it is only the
 	 * length of the most recent DMA.  It does not include the
 	 * total length of the data received since the previous update
 	 * for this DDP buffer.  rcv_nxt is the sequence number of the
 	 * first received byte from the most recent DMA.
 	 */
 	len += be32toh(rcv_nxt) - tp->rcv_nxt;
 	tp->rcv_nxt += len;
 	tp->t_rcvtime = ticks;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= len;
 #endif
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: DDP[%d] placed %d bytes (%#x)", __func__, db_idx,
 	    len, report);
 #endif
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	SOCKBUF_LOCK(sb);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    len > (sbspace(sb) / 8 * 7)) {
+		struct adapter *sc = td_adapter(toep->td);
 		unsigned int hiwat = sb->sb_hiwat;
-		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
+		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
 		else
 			toep->rx_credits += newsize - hiwat;
 	}
 	SOCKBUF_UNLOCK(sb);
 	CURVNET_RESTORE();
 
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	toep->rx_credits += len;
 #endif
 
 	job->msgrcv = 1;
 	if (db->cancel_pending) {
 		/*
 		 * Update the job's length but defer completion to the
 		 * TCB_RPL callback.
 		 */
 		job->aio_received += len;
 		goto out;
 	} else if (!aio_clear_cancel_function(job)) {
 		/*
 		 * Update the copied length for when
 		 * t4_aio_cancel_active() completes this request.
 		 */
 		job->aio_received += len;
 	} else {
 		copied = job->aio_received;
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: completing %p (copied %ld, placed %d)",
 		    __func__, job, copied, len);
 #endif
 		aio_complete(job, copied + len, 0);
 		t4_rcvd(&toep->td->tod, tp);
 	}
 
 completed:
 	complete_ddp_buffer(toep, db, db_idx);
 	if (toep->ddp.waiting_count > 0)
 		ddp_queue_toep(toep);
 out:
 	DDP_UNLOCK(toep);
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 void
 handle_ddp_indicate(struct toepcb *toep)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	MPASS(toep->ddp.active_count == 0);
 	MPASS((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0);
 	if (toep->ddp.waiting_count == 0) {
 		/*
 		 * The pending requests that triggered the request for an
 		 * an indicate were cancelled.  Those cancels should have
 		 * already disabled DDP.  Just ignore this as the data is
 		 * going into the socket buffer anyway.
 		 */
 		return;
 	}
 	CTR3(KTR_CXGBE, "%s: tid %d indicated (%d waiting)", __func__,
 	    toep->tid, toep->ddp.waiting_count);
 	ddp_queue_toep(toep);
 }
 
 enum {
 	DDP_BUF0_INVALIDATED = 0x2,
 	DDP_BUF1_INVALIDATED
 };
 
 CTASSERT(DDP_BUF0_INVALIDATED == CPL_COOKIE_DDP0);
 
 static int
 do_ddp_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	unsigned int db_idx;
 	struct toepcb *toep;
 	struct inpcb *inp;
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	long copied;
 
 	if (cpl->status != CPL_ERR_NONE)
 		panic("XXX: tcp_rpl failed: %d", cpl->status);
 
 	toep = lookup_tid(sc, tid);
 	inp = toep->inp;
 	switch (cpl->cookie) {
 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(DDP_BUF0_INVALIDATED):
 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(DDP_BUF1_INVALIDATED):
 		/*
 		 * XXX: This duplicates a lot of code with handle_ddp_data().
 		 */
 		db_idx = G_COOKIE(cpl->cookie) - DDP_BUF0_INVALIDATED;
 		MPASS(db_idx < nitems(toep->ddp.db));
 		INP_WLOCK(inp);
 		DDP_LOCK(toep);
 		db = &toep->ddp.db[db_idx];
 
 		/*
 		 * handle_ddp_data() should leave the job around until
 		 * this callback runs once a cancel is pending.
 		 */
 		MPASS(db != NULL);
 		MPASS(db->job != NULL);
 		MPASS(db->cancel_pending);
 
 		/*
 		 * XXX: It's not clear what happens if there is data
 		 * placed when the buffer is invalidated.  I suspect we
 		 * need to read the TCB to see how much data was placed.
 		 *
 		 * For now this just pretends like nothing was placed.
 		 *
 		 * XXX: Note that if we did check the PCB we would need to
 		 * also take care of updating the tp, etc.
 		 */
 		job = db->job;
 		copied = job->aio_received;
 		if (copied == 0) {
 			CTR2(KTR_CXGBE, "%s: cancelling %p", __func__, job);
 			aio_cancel(job);
 		} else {
 			CTR3(KTR_CXGBE, "%s: completing %p (copied %ld)",
 			    __func__, job, copied);
 			aio_complete(job, copied, 0);
 			t4_rcvd(&toep->td->tod, intotcpcb(inp));
 		}
 
 		complete_ddp_buffer(toep, db, db_idx);
 		if (toep->ddp.waiting_count > 0)
 			ddp_queue_toep(toep);
 		DDP_UNLOCK(toep);
 		INP_WUNLOCK(inp);
 		break;
 	default:
 		panic("XXX: unknown tcb_rpl offset %#x, cookie %#x",
 		    G_WORD(cpl->cookie), G_COOKIE(cpl->cookie));
 	}
 
 	return (0);
 }
 
 void
 handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt)
 {
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	long copied;
 	unsigned int db_flag, db_idx;
 	int len, placed;
 
 	INP_WLOCK_ASSERT(toep->inp);
 	DDP_ASSERT_LOCKED(toep);
 	len = be32toh(rcv_nxt) - tp->rcv_nxt;
 
 	tp->rcv_nxt += len;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	toep->rx_credits += len;
 #endif
 
 	while (toep->ddp.active_count > 0) {
 		MPASS(toep->ddp.active_id != -1);
 		db_idx = toep->ddp.active_id;
 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 		MPASS((toep->ddp.flags & db_flag) != 0);
 		db = &toep->ddp.db[db_idx];
 		job = db->job;
 		copied = job->aio_received;
 		placed = len;
 		if (placed > job->uaiocb.aio_nbytes - copied)
 			placed = job->uaiocb.aio_nbytes - copied;
 		if (placed > 0)
 			job->msgrcv = 1;
 		if (!aio_clear_cancel_function(job)) {
 			/*
 			 * Update the copied length for when
 			 * t4_aio_cancel_active() completes this
 			 * request.
 			 */
 			job->aio_received += placed;
 		} else {
 			CTR4(KTR_CXGBE, "%s: tid %d completed buf %d len %d",
 			    __func__, toep->tid, db_idx, placed);
 			aio_complete(job, copied + placed, 0);
 		}
 		len -= placed;
 		complete_ddp_buffer(toep, db, db_idx);
 	}
 
 	MPASS(len == 0);
 	ddp_complete_all(toep, 0);
 }
 
 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
 	 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
 	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
 	 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)
 
 extern cpl_handler_t t4_cpl_handler[];
 
 static int
 do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	uint32_t vld;
 	struct toepcb *toep = lookup_tid(sc, tid);
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	vld = be32toh(cpl->ddpvld);
 	if (__predict_false(vld & DDP_ERR)) {
 		panic("%s: DDP error 0x%x (tid %d, toep %p)",
 		    __func__, vld, tid, toep);
 	}
 
 	if (toep->ulp_mode == ULP_MODE_ISCSI) {
 		t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m);
 		return (0);
 	}
 
 	handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));
 
 	return (0);
 }
 
 static int
 do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);
 
 	return (0);
 }
 
 static void
 enable_ddp(struct adapter *sc, struct toepcb *toep)
 {
 
 	KASSERT((toep->ddp.flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
 	    ("%s: toep %p has bad ddp_flags 0x%x",
 	    __func__, toep, toep->ddp.flags));
 
 	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
 	    __func__, toep->tid, time_uptime);
 
 	DDP_ASSERT_LOCKED(toep);
 	toep->ddp.flags |= DDP_SC_REQ;
 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_RX_DDP_FLAGS,
 	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
 	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1), 0, 0);
 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
 	    V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0);
 }
 
 static int
 calculate_hcf(int n1, int n2)
 {
 	int a, b, t;
 
 	if (n1 <= n2) {
 		a = n1;
 		b = n2;
 	} else {
 		a = n2;
 		b = n1;
 	}
 
 	while (a != 0) {
 		t = a;
 		a = b % a;
 		b = t;
 	}
 
 	return (b);
 }
 
 static inline int
 pages_to_nppods(int npages, int ddp_page_shift)
 {
 
 	MPASS(ddp_page_shift >= PAGE_SHIFT);
 
 	return (howmany(npages >> (ddp_page_shift - PAGE_SHIFT), PPOD_PAGES));
 }
 
 static int
 alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx,
     struct ppod_reservation *prsv)
 {
 	vmem_addr_t addr;       /* relative to start of region */
 
 	if (vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT,
 	    &addr) != 0)
 		return (ENOMEM);
 
 	CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d",
 	    __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask,
 	    nppods, 1 << pr->pr_page_shift[pgsz_idx]);
 
 	/*
 	 * The hardware tagmask includes an extra invalid bit but the arena was
 	 * seeded with valid values only.  An allocation out of this arena will
 	 * fit inside the tagmask but won't have the invalid bit set.
 	 */
 	MPASS((addr & pr->pr_tag_mask) == addr);
 	MPASS((addr & pr->pr_invalid_bit) == 0);
 
 	prsv->prsv_pr = pr;
 	prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr;
 	prsv->prsv_nppods = nppods;
 
 	return (0);
 }
 
 int
 t4_alloc_page_pods_for_ps(struct ppod_region *pr, struct pageset *ps)
 {
 	int i, hcf, seglen, idx, nppods;
 	struct ppod_reservation *prsv = &ps->prsv;
 
 	KASSERT(prsv->prsv_nppods == 0,
 	    ("%s: page pods already allocated", __func__));
 
 	/*
 	 * The DDP page size is unrelated to the VM page size.  We combine
 	 * contiguous physical pages into larger segments to get the best DDP
 	 * page size possible.  This is the largest of the four sizes in
 	 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
 	 * the page list.
 	 */
 	hcf = 0;
 	for (i = 0; i < ps->npages; i++) {
 		seglen = PAGE_SIZE;
 		while (i < ps->npages - 1 &&
 		    ps->pages[i]->phys_addr + PAGE_SIZE ==
 		    ps->pages[i + 1]->phys_addr) {
 			seglen += PAGE_SIZE;
 			i++;
 		}
 
 		hcf = calculate_hcf(hcf, seglen);
 		if (hcf < (1 << pr->pr_page_shift[1])) {
 			idx = 0;
 			goto have_pgsz;	/* give up, short circuit */
 		}
 	}
 
 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
 			break;
 	}
 #undef PR_PAGE_MASK
 
 have_pgsz:
 	MPASS(idx <= M_PPOD_PGSZ);
 
 	nppods = pages_to_nppods(ps->npages, pr->pr_page_shift[idx]);
 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
 		return (0);
 	MPASS(prsv->prsv_nppods > 0);
 
 	return (1);
 }
 
 int
 t4_alloc_page_pods_for_buf(struct ppod_region *pr, vm_offset_t buf, int len,
     struct ppod_reservation *prsv)
 {
 	int hcf, seglen, idx, npages, nppods;
 	uintptr_t start_pva, end_pva, pva, p1;
 
 	MPASS(buf > 0);
 	MPASS(len > 0);
 
 	/*
 	 * The DDP page size is unrelated to the VM page size.  We combine
 	 * contiguous physical pages into larger segments to get the best DDP
 	 * page size possible.  This is the largest of the four sizes in
 	 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
 	 * in the page list.
 	 */
 	hcf = 0;
 	start_pva = trunc_page(buf);
 	end_pva = trunc_page(buf + len - 1);
 	pva = start_pva;
 	while (pva <= end_pva) {
 		seglen = PAGE_SIZE;
 		p1 = pmap_kextract(pva);
 		pva += PAGE_SIZE;
 		while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) {
 			seglen += PAGE_SIZE;
 			pva += PAGE_SIZE;
 		}
 
 		hcf = calculate_hcf(hcf, seglen);
 		if (hcf < (1 << pr->pr_page_shift[1])) {
 			idx = 0;
 			goto have_pgsz;	/* give up, short circuit */
 		}
 	}
 
 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
 			break;
 	}
 #undef PR_PAGE_MASK
 
 have_pgsz:
 	MPASS(idx <= M_PPOD_PGSZ);
 
 	npages = 1;
 	npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
 	nppods = howmany(npages, PPOD_PAGES);
 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
 		return (ENOMEM);
 	MPASS(prsv->prsv_nppods > 0);
 
 	return (0);
 }
 
 void
 t4_free_page_pods(struct ppod_reservation *prsv)
 {
 	struct ppod_region *pr = prsv->prsv_pr;
 	vmem_addr_t addr;
 
 	MPASS(prsv != NULL);
 	MPASS(prsv->prsv_nppods != 0);
 
 	addr = prsv->prsv_tag & pr->pr_tag_mask;
 	MPASS((addr & pr->pr_invalid_bit) == 0);
 
 	CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__,
 	    pr->pr_arena, addr, prsv->prsv_nppods);
 
 	vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods));
 	prsv->prsv_nppods = 0;
 }
 
 #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
 
 int
 t4_write_page_pods_for_ps(struct adapter *sc, struct sge_wrq *wrq, int tid,
     struct pageset *ps)
 {
 	struct wrqe *wr;
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
 	int i, j, k, n, chunk, len, ddp_pgsz, idx;
 	u_int ppod_addr;
 	uint32_t cmd;
 	struct ppod_reservation *prsv = &ps->prsv;
 	struct ppod_region *pr = prsv->prsv_pr;
 
 	KASSERT(!(ps->flags & PS_PPODS_WRITTEN),
 	    ("%s: page pods already written", __func__));
 	MPASS(prsv->prsv_nppods > 0);
 
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
 	else
 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
 		chunk = PPOD_SZ(n);
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
 		wr = alloc_wrqe(len, wrq);
 		if (wr == NULL)
 			return (ENOMEM);	/* ok to just bail out */
 		ulpmc = wrtod(wr);
 
 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
 		ulpmc->cmd = cmd;
 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 		ulpsc->len = htobe32(chunk);
 
 		ppod = (struct pagepod *)(ulpsc + 1);
 		for (j = 0; j < n; i++, j++, ppod++) {
 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
 			    V_PPOD_TID(tid) | prsv->prsv_tag);
 			ppod->len_offset = htobe64(V_PPOD_LEN(ps->len) |
 			    V_PPOD_OFST(ps->offset));
 			ppod->rsvd = 0;
 			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
 			for (k = 0; k < nitems(ppod->addr); k++) {
 				if (idx < ps->npages) {
 					ppod->addr[k] =
 					    htobe64(ps->pages[idx]->phys_addr);
 					idx += ddp_pgsz / PAGE_SIZE;
 				} else
 					ppod->addr[k] = 0;
 #if 0
 				CTR5(KTR_CXGBE,
 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
 				    __func__, toep->tid, i, k,
 				    htobe64(ppod->addr[k]));
 #endif
 			}
 
 		}
 
 		t4_wrq_tx(sc, wr);
 	}
 	ps->flags |= PS_PPODS_WRITTEN;
 
 	return (0);
 }
 
 int
 t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid,
     struct ppod_reservation *prsv, vm_offset_t buf, int buflen)
 {
 	struct wrqe *wr;
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
 	int i, j, k, n, chunk, len, ddp_pgsz;
 	u_int ppod_addr, offset;
 	uint32_t cmd;
 	struct ppod_region *pr = prsv->prsv_pr;
 	uintptr_t end_pva, pva, pa;
 
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
 	else
 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
 	offset = buf & PAGE_MASK;
 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
 	pva = trunc_page(buf);
 	end_pva = trunc_page(buf + buflen - 1);
 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
 		MPASS(n > 0);
 		chunk = PPOD_SZ(n);
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
 		wr = alloc_wrqe(len, wrq);
 		if (wr == NULL)
 			return (ENOMEM);	/* ok to just bail out */
 		ulpmc = wrtod(wr);
 
 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
 		ulpmc->cmd = cmd;
 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 		ulpsc->len = htobe32(chunk);
 
 		ppod = (struct pagepod *)(ulpsc + 1);
 		for (j = 0; j < n; i++, j++, ppod++) {
 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
 			    V_PPOD_TID(tid) |
 			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
 			ppod->len_offset = htobe64(V_PPOD_LEN(buflen) |
 			    V_PPOD_OFST(offset));
 			ppod->rsvd = 0;
 
 			for (k = 0; k < nitems(ppod->addr); k++) {
 				if (pva > end_pva)
 					ppod->addr[k] = 0;
 				else {
 					pa = pmap_kextract(pva);
 					ppod->addr[k] = htobe64(pa);
 					pva += ddp_pgsz;
 				}
 #if 0
 				CTR5(KTR_CXGBE,
 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
 				    __func__, tid, i, k,
 				    htobe64(ppod->addr[k]));
 #endif
 			}
 
 			/*
 			 * Walk back 1 segment so that the first address in the
 			 * next pod is the same as the last one in the current
 			 * pod.
 			 */
 			pva -= ddp_pgsz;
 		}
 
 		t4_wrq_tx(sc, wr);
 	}
 
 	MPASS(pva <= end_pva);
 
 	return (0);
 }
 
 static void
 wire_pageset(struct pageset *ps)
 {
 	vm_page_t p;
 	int i;
 
 	KASSERT(!(ps->flags & PS_WIRED), ("pageset already wired"));
 
 	for (i = 0; i < ps->npages; i++) {
 		p = ps->pages[i];
 		vm_page_lock(p);
 		vm_page_wire(p);
 		vm_page_unhold(p);
 		vm_page_unlock(p);
 	}
 	ps->flags |= PS_WIRED;
 }
 
 /*
  * Prepare a pageset for DDP.  This wires the pageset and sets up page
  * pods.
  */
 static int
 prep_pageset(struct adapter *sc, struct toepcb *toep, struct pageset *ps)
 {
 	struct tom_data *td = sc->tom_softc;
 
 	if (!(ps->flags & PS_WIRED))
 		wire_pageset(ps);
 	if (ps->prsv.prsv_nppods == 0 &&
 	    !t4_alloc_page_pods_for_ps(&td->pr, ps)) {
 		return (0);
 	}
 	if (!(ps->flags & PS_PPODS_WRITTEN) &&
 	    t4_write_page_pods_for_ps(sc, toep->ctrlq, toep->tid, ps) != 0) {
 		return (0);
 	}
 
 	return (1);
 }
 
 int
 t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz,
     const char *name)
 {
 	int i;
 
 	MPASS(pr != NULL);
 	MPASS(r->size > 0);
 
 	pr->pr_start = r->start;
 	pr->pr_len = r->size;
 	pr->pr_page_shift[0] = 12 + G_HPZ0(psz);
 	pr->pr_page_shift[1] = 12 + G_HPZ1(psz);
 	pr->pr_page_shift[2] = 12 + G_HPZ2(psz);
 	pr->pr_page_shift[3] = 12 + G_HPZ3(psz);
 
 	/* The SGL -> page pod algorithm requires the sizes to be in order. */
 	for (i = 1; i < nitems(pr->pr_page_shift); i++) {
 		if (pr->pr_page_shift[i] <= pr->pr_page_shift[i - 1])
 			return (ENXIO);
 	}
 
 	pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG);
 	pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask;
 	if (pr->pr_tag_mask == 0 || pr->pr_alias_mask == 0)
 		return (ENXIO);
 	pr->pr_alias_shift = fls(pr->pr_tag_mask);
 	pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1);
 
 	pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0,
 	    M_FIRSTFIT | M_NOWAIT);
 	if (pr->pr_arena == NULL)
 		return (ENOMEM);
 
 	return (0);
 }
 
 void
 t4_free_ppod_region(struct ppod_region *pr)
 {
 
 	MPASS(pr != NULL);
 
 	if (pr->pr_arena)
 		vmem_destroy(pr->pr_arena);
 	bzero(pr, sizeof(*pr));
 }
 
 static int
 pscmp(struct pageset *ps, struct vmspace *vm, vm_offset_t start, int npages,
     int pgoff, int len)
 {
 
 	if (ps->start != start || ps->npages != npages ||
 	    ps->offset != pgoff || ps->len != len)
 		return (1);
 
 	return (ps->vm != vm || ps->vm_timestamp != vm->vm_map.timestamp);
 }
 
 static int
 hold_aio(struct toepcb *toep, struct kaiocb *job, struct pageset **pps)
 {
 	struct vmspace *vm;
 	vm_map_t map;
 	vm_offset_t start, end, pgoff;
 	struct pageset *ps;
 	int n;
 
 	DDP_ASSERT_LOCKED(toep);
 
 	/*
 	 * The AIO subsystem will cancel and drain all requests before
 	 * permitting a process to exit or exec, so p_vmspace should
 	 * be stable here.
 	 */
 	vm = job->userproc->p_vmspace;
 	map = &vm->vm_map;
 	start = (uintptr_t)job->uaiocb.aio_buf;
 	pgoff = start & PAGE_MASK;
 	end = round_page(start + job->uaiocb.aio_nbytes);
 	start = trunc_page(start);
 
 	if (end - start > MAX_DDP_BUFFER_SIZE) {
 		/*
 		 * Truncate the request to a short read.
 		 * Alternatively, we could DDP in chunks to the larger
 		 * buffer, but that would be quite a bit more work.
 		 *
 		 * When truncating, round the request down to avoid
 		 * crossing a cache line on the final transaction.
 		 */
 		end = rounddown2(start + MAX_DDP_BUFFER_SIZE, CACHE_LINE_SIZE);
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: tid %d, truncating size from %lu to %lu",
 		    __func__, toep->tid, (unsigned long)job->uaiocb.aio_nbytes,
 		    (unsigned long)(end - (start + pgoff)));
 		job->uaiocb.aio_nbytes = end - (start + pgoff);
 #endif
 		end = round_page(end);
 	}
 
 	n = atop(end - start);
 
 	/*
 	 * Try to reuse a cached pageset.
 	 */
 	TAILQ_FOREACH(ps, &toep->ddp.cached_pagesets, link) {
 		if (pscmp(ps, vm, start, n, pgoff,
 		    job->uaiocb.aio_nbytes) == 0) {
 			TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
 			toep->ddp.cached_count--;
 			*pps = ps;
 			return (0);
 		}
 	}
 
 	/*
 	 * If there are too many cached pagesets to create a new one,
 	 * free a pageset before creating a new one.
 	 */
 	KASSERT(toep->ddp.active_count + toep->ddp.cached_count <=
 	    nitems(toep->ddp.db), ("%s: too many wired pagesets", __func__));
 	if (toep->ddp.active_count + toep->ddp.cached_count ==
 	    nitems(toep->ddp.db)) {
 		KASSERT(toep->ddp.cached_count > 0,
 		    ("no cached pageset to free"));
 		ps = TAILQ_LAST(&toep->ddp.cached_pagesets, pagesetq);
 		TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
 		toep->ddp.cached_count--;
 		free_pageset(toep->td, ps);
 	}
 	DDP_UNLOCK(toep);
 
 	/* Create a new pageset. */
 	ps = malloc(sizeof(*ps) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
 	    M_ZERO);
 	ps->pages = (vm_page_t *)(ps + 1);
 	ps->vm_timestamp = map->timestamp;
 	ps->npages = vm_fault_quick_hold_pages(map, start, end - start,
 	    VM_PROT_WRITE, ps->pages, n);
 
 	DDP_LOCK(toep);
 	if (ps->npages < 0) {
 		free(ps, M_CXGBE);
 		return (EFAULT);
 	}
 
 	KASSERT(ps->npages == n, ("hold_aio: page count mismatch: %d vs %d",
 	    ps->npages, n));
 
 	ps->offset = pgoff;
 	ps->len = job->uaiocb.aio_nbytes;
 	atomic_add_int(&vm->vm_refcnt, 1);
 	ps->vm = vm;
 	ps->start = start;
 
 	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
 	    __func__, toep->tid, ps, job, ps->npages);
 	*pps = ps;
 	return (0);
 }
 
 static void
 ddp_complete_all(struct toepcb *toep, int error)
 {
 	struct kaiocb *job;
 
 	DDP_ASSERT_LOCKED(toep);
 	while (!TAILQ_EMPTY(&toep->ddp.aiojobq)) {
 		job = TAILQ_FIRST(&toep->ddp.aiojobq);
 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 		toep->ddp.waiting_count--;
 		if (aio_clear_cancel_function(job))
 			ddp_complete_one(job, error);
 	}
 }
 
 static void
 aio_ddp_cancel_one(struct kaiocb *job)
 {
 	long copied;
 
 	/*
 	 * If this job had copied data out of the socket buffer before
 	 * it was cancelled, report it as a short read rather than an
 	 * error.
 	 */
 	copied = job->aio_received;
 	if (copied != 0)
 		aio_complete(job, copied, 0);
 	else
 		aio_cancel(job);
 }
 
 /*
  * Called when the main loop wants to requeue a job to retry it later.
  * Deals with the race of the job being cancelled while it was being
  * examined.
  */
 static void
 aio_ddp_requeue_one(struct toepcb *toep, struct kaiocb *job)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	if (!(toep->ddp.flags & DDP_DEAD) &&
 	    aio_set_cancel_function(job, t4_aio_cancel_queued)) {
 		TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
 		toep->ddp.waiting_count++;
 	} else
 		aio_ddp_cancel_one(job);
 }
 
 static void
 aio_ddp_requeue(struct toepcb *toep)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	struct socket *so;
 	struct sockbuf *sb;
 	struct inpcb *inp;
 	struct kaiocb *job;
 	struct ddp_buffer *db;
 	size_t copied, offset, resid;
 	struct pageset *ps;
 	struct mbuf *m;
 	uint64_t ddp_flags, ddp_flags_mask;
 	struct wrqe *wr;
 	int buf_flag, db_idx, error;
 
 	DDP_ASSERT_LOCKED(toep);
 
 restart:
 	if (toep->ddp.flags & DDP_DEAD) {
 		MPASS(toep->ddp.waiting_count == 0);
 		MPASS(toep->ddp.active_count == 0);
 		return;
 	}
 
 	if (toep->ddp.waiting_count == 0 ||
 	    toep->ddp.active_count == nitems(toep->ddp.db)) {
 		return;
 	}
 
 	job = TAILQ_FIRST(&toep->ddp.aiojobq);
 	so = job->fd_file->f_data;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	/* We will never get anything unless we are or were connected. */
 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
 		SOCKBUF_UNLOCK(sb);
 		ddp_complete_all(toep, ENOTCONN);
 		return;
 	}
 
 	KASSERT(toep->ddp.active_count == 0 || sbavail(sb) == 0,
 	    ("%s: pending sockbuf data and DDP is active", __func__));
 
 	/* Abort if socket has reported problems. */
 	/* XXX: Wait for any queued DDP's to finish and/or flush them? */
 	if (so->so_error && sbavail(sb) == 0) {
 		toep->ddp.waiting_count--;
 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 		if (!aio_clear_cancel_function(job)) {
 			SOCKBUF_UNLOCK(sb);
 			goto restart;
 		}
 
 		/*
 		 * If this job has previously copied some data, report
 		 * a short read and leave the error to be reported by
 		 * a future request.
 		 */
 		copied = job->aio_received;
 		if (copied != 0) {
 			SOCKBUF_UNLOCK(sb);
 			aio_complete(job, copied, 0);
 			goto restart;
 		}
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		aio_complete(job, -1, error);
 		goto restart;
 	}
 
 	/*
 	 * Door is closed.  If there is pending data in the socket buffer,
 	 * deliver it.  If there are pending DDP requests, wait for those
 	 * to complete.  Once they have completed, return EOF reads.
 	 */
 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		if (toep->ddp.active_count != 0)
 			return;
 		ddp_complete_all(toep, 0);
 		return;
 	}
 
 	/*
 	 * If DDP is not enabled and there is no pending socket buffer
 	 * data, try to enable DDP.
 	 */
 	if (sbavail(sb) == 0 && (toep->ddp.flags & DDP_ON) == 0) {
 		SOCKBUF_UNLOCK(sb);
 
 		/*
 		 * Wait for the card to ACK that DDP is enabled before
 		 * queueing any buffers.  Currently this waits for an
 		 * indicate to arrive.  This could use a TCB_SET_FIELD_RPL
 		 * message to know that DDP was enabled instead of waiting
 		 * for the indicate which would avoid copying the indicate
 		 * if no data is pending.
 		 *
 		 * XXX: Might want to limit the indicate size to the size
 		 * of the first queued request.
 		 */
 		if ((toep->ddp.flags & DDP_SC_REQ) == 0)
 			enable_ddp(sc, toep);
 		return;
 	}
 	SOCKBUF_UNLOCK(sb);
 
 	/*
 	 * If another thread is queueing a buffer for DDP, let it
 	 * drain any work and return.
 	 */
 	if (toep->ddp.queueing != NULL)
 		return;
 
 	/* Take the next job to prep it for DDP. */
 	toep->ddp.waiting_count--;
 	TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 	if (!aio_clear_cancel_function(job))
 		goto restart;
 	toep->ddp.queueing = job;
 
 	/* NB: This drops DDP_LOCK while it holds the backing VM pages. */
 	error = hold_aio(toep, job, &ps);
 	if (error != 0) {
 		ddp_complete_one(job, error);
 		toep->ddp.queueing = NULL;
 		goto restart;
 	}
 
 	SOCKBUF_LOCK(sb);
 	if (so->so_error && sbavail(sb) == 0) {
 		copied = job->aio_received;
 		if (copied != 0) {
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_complete(job, copied, 0);
 			toep->ddp.queueing = NULL;
 			goto restart;
 		}
 
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		recycle_pageset(toep, ps);
 		aio_complete(job, -1, error);
 		toep->ddp.queueing = NULL;
 		goto restart;
 	}
 
 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		recycle_pageset(toep, ps);
 		if (toep->ddp.active_count != 0) {
 			/*
 			 * The door is closed, but there are still pending
 			 * DDP buffers.  Requeue.  These jobs will all be
 			 * completed once those buffers drain.
 			 */
 			aio_ddp_requeue_one(toep, job);
 			toep->ddp.queueing = NULL;
 			return;
 		}
 		ddp_complete_one(job, 0);
 		ddp_complete_all(toep, 0);
 		toep->ddp.queueing = NULL;
 		return;
 	}
 
 sbcopy:
 	/*
 	 * If the toep is dead, there shouldn't be any data in the socket
 	 * buffer, so the above case should have handled this.
 	 */
 	MPASS(!(toep->ddp.flags & DDP_DEAD));
 
 	/*
 	 * If there is pending data in the socket buffer (either
 	 * from before the requests were queued or a DDP indicate),
 	 * copy those mbufs out directly.
 	 */
 	copied = 0;
 	offset = ps->offset + job->aio_received;
 	MPASS(job->aio_received <= job->uaiocb.aio_nbytes);
 	resid = job->uaiocb.aio_nbytes - job->aio_received;
 	m = sb->sb_mb;
 	KASSERT(m == NULL || toep->ddp.active_count == 0,
 	    ("%s: sockbuf data with active DDP", __func__));
 	while (m != NULL && resid > 0) {
 		struct iovec iov[1];
 		struct uio uio;
 		int error;
 
 		iov[0].iov_base = mtod(m, void *);
 		iov[0].iov_len = m->m_len;
 		if (iov[0].iov_len > resid)
 			iov[0].iov_len = resid;
 		uio.uio_iov = iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_offset = 0;
 		uio.uio_resid = iov[0].iov_len;
 		uio.uio_segflg = UIO_SYSSPACE;
 		uio.uio_rw = UIO_WRITE;
 		error = uiomove_fromphys(ps->pages, offset + copied,
 		    uio.uio_resid, &uio);
 		MPASS(error == 0 && uio.uio_resid == 0);
 		copied += uio.uio_offset;
 		resid -= uio.uio_offset;
 		m = m->m_next;
 	}
 	if (copied != 0) {
 		sbdrop_locked(sb, copied);
 		job->aio_received += copied;
 		job->msgrcv = 1;
 		copied = job->aio_received;
 		inp = sotoinpcb(so);
 		if (!INP_TRY_WLOCK(inp)) {
 			/*
 			 * The reference on the socket file descriptor in
 			 * the AIO job should keep 'sb' and 'inp' stable.
 			 * Our caller has a reference on the 'toep' that
 			 * keeps it stable.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			DDP_UNLOCK(toep);
 			INP_WLOCK(inp);
 			DDP_LOCK(toep);
 			SOCKBUF_LOCK(sb);
 
 			/*
 			 * If the socket has been closed, we should detect
 			 * that and complete this request if needed on
 			 * the next trip around the loop.
 			 */
 		}
 		t4_rcvd_locked(&toep->td->tod, intotcpcb(inp));
 		INP_WUNLOCK(inp);
 		if (resid == 0 || toep->ddp.flags & DDP_DEAD) {
 			/*
 			 * We filled the entire buffer with socket
 			 * data, DDP is not being used, or the socket
 			 * is being shut down, so complete the
 			 * request.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_complete(job, copied, 0);
 			toep->ddp.queueing = NULL;
 			goto restart;
 		}
 
 		/*
 		 * If DDP is not enabled, requeue this request and restart.
 		 * This will either enable DDP or wait for more data to
 		 * arrive on the socket buffer.
 		 */
 		if ((toep->ddp.flags & (DDP_ON | DDP_SC_REQ)) != DDP_ON) {
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_ddp_requeue_one(toep, job);
 			toep->ddp.queueing = NULL;
 			goto restart;
 		}
 
 		/*
 		 * An indicate might have arrived and been added to
 		 * the socket buffer while it was unlocked after the
 		 * copy to lock the INP.  If so, restart the copy.
 		 */
 		if (sbavail(sb) != 0)
 			goto sbcopy;
 	}
 	SOCKBUF_UNLOCK(sb);
 
 	if (prep_pageset(sc, toep, ps) == 0) {
 		recycle_pageset(toep, ps);
 		aio_ddp_requeue_one(toep, job);
 		toep->ddp.queueing = NULL;
 
 		/*
 		 * XXX: Need to retry this later.  Mostly need a trigger
 		 * when page pods are freed up.
 		 */
 		printf("%s: prep_pageset failed\n", __func__);
 		return;
 	}
 
 	/* Determine which DDP buffer to use. */
 	if (toep->ddp.db[0].job == NULL) {
 		db_idx = 0;
 	} else {
 		MPASS(toep->ddp.db[1].job == NULL);
 		db_idx = 1;
 	}
 
 	ddp_flags = 0;
 	ddp_flags_mask = 0;
 	if (db_idx == 0) {
 		ddp_flags |= V_TF_DDP_BUF0_VALID(1);
 		if (so->so_state & SS_NBIO)
 			ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
 		    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) |
 		    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1);
 		buf_flag = DDP_BUF0_ACTIVE;
 	} else {
 		ddp_flags |= V_TF_DDP_BUF1_VALID(1);
 		if (so->so_state & SS_NBIO)
 			ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
 		    V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) |
 		    V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1);
 		buf_flag = DDP_BUF1_ACTIVE;
 	}
 	MPASS((toep->ddp.flags & buf_flag) == 0);
 	if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) {
 		MPASS(db_idx == 0);
 		MPASS(toep->ddp.active_id == -1);
 		MPASS(toep->ddp.active_count == 0);
 		ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1);
 	}
 
 	/*
 	 * The TID for this connection should still be valid.  If DDP_DEAD
 	 * is set, SBS_CANTRCVMORE should be set, so we shouldn't be
 	 * this far anyway.  Even if the socket is closing on the other
 	 * end, the AIO job holds a reference on this end of the socket
 	 * which will keep it open and keep the TCP PCB attached until
 	 * after the job is completed.
 	 */
 	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, ps, job->aio_received,
 	    ddp_flags, ddp_flags_mask);
 	if (wr == NULL) {
 		recycle_pageset(toep, ps);
 		aio_ddp_requeue_one(toep, job);
 		toep->ddp.queueing = NULL;
 
 		/*
 		 * XXX: Need a way to kick a retry here.
 		 *
 		 * XXX: We know the fixed size needed and could
 		 * preallocate this using a blocking request at the
 		 * start of the task to avoid having to handle this
 		 * edge case.
 		 */
 		printf("%s: mk_update_tcb_for_ddp failed\n", __func__);
 		return;
 	}
 
 	if (!aio_set_cancel_function(job, t4_aio_cancel_active)) {
 		free_wrqe(wr);
 		recycle_pageset(toep, ps);
 		aio_ddp_cancel_one(job);
 		toep->ddp.queueing = NULL;
 		goto restart;
 	}
 
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: scheduling %p for DDP[%d] (flags %#lx/%#lx)",
 	    __func__, job, db_idx, ddp_flags, ddp_flags_mask);
 #endif
 	/* Give the chip the go-ahead. */
 	t4_wrq_tx(sc, wr);
 	db = &toep->ddp.db[db_idx];
 	db->cancel_pending = 0;
 	db->job = job;
 	db->ps = ps;
 	toep->ddp.queueing = NULL;
 	toep->ddp.flags |= buf_flag;
 	toep->ddp.active_count++;
 	if (toep->ddp.active_count == 1) {
 		MPASS(toep->ddp.active_id == -1);
 		toep->ddp.active_id = db_idx;
 		CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
 		    toep->ddp.active_id);
 	}
 	goto restart;
 }
 
 void
 ddp_queue_toep(struct toepcb *toep)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	if (toep->ddp.flags & DDP_TASK_ACTIVE)
 		return;
 	toep->ddp.flags |= DDP_TASK_ACTIVE;
 	hold_toepcb(toep);
 	soaio_enqueue(&toep->ddp.requeue_task);
 }
 
 static void
 aio_ddp_requeue_task(void *context, int pending)
 {
 	struct toepcb *toep = context;
 
 	DDP_LOCK(toep);
 	aio_ddp_requeue(toep);
 	toep->ddp.flags &= ~DDP_TASK_ACTIVE;
 	DDP_UNLOCK(toep);
 
 	free_toepcb(toep);
 }
 
 static void
 t4_aio_cancel_active(struct kaiocb *job)
 {
 	struct socket *so = job->fd_file->f_data;
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = td_adapter(toep->td);
 	uint64_t valid_flag;
 	int i;
 
 	DDP_LOCK(toep);
 	if (aio_cancel_cleared(job)) {
 		DDP_UNLOCK(toep);
 		aio_ddp_cancel_one(job);
 		return;
 	}
 
 	for (i = 0; i < nitems(toep->ddp.db); i++) {
 		if (toep->ddp.db[i].job == job) {
 			/* Should only ever get one cancel request for a job. */
 			MPASS(toep->ddp.db[i].cancel_pending == 0);
 
 			/*
 			 * Invalidate this buffer.  It will be
 			 * cancelled or partially completed once the
 			 * card ACKs the invalidate.
 			 */
 			valid_flag = i == 0 ? V_TF_DDP_BUF0_VALID(1) :
 			    V_TF_DDP_BUF1_VALID(1);
 			t4_set_tcb_field(sc, toep->ctrlq, toep,
 			    W_TCB_RX_DDP_FLAGS, valid_flag, 0, 1,
 			    i + DDP_BUF0_INVALIDATED);
 			toep->ddp.db[i].cancel_pending = 1;
 			CTR2(KTR_CXGBE, "%s: request %p marked pending",
 			    __func__, job);
 			break;
 		}
 	}
 	DDP_UNLOCK(toep);
 }
 
 static void
 t4_aio_cancel_queued(struct kaiocb *job)
 {
 	struct socket *so = job->fd_file->f_data;
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 
 	DDP_LOCK(toep);
 	if (!aio_cancel_cleared(job)) {
 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 		toep->ddp.waiting_count--;
 		if (toep->ddp.waiting_count == 0)
 			ddp_queue_toep(toep);
 	}
 	CTR2(KTR_CXGBE, "%s: request %p cancelled", __func__, job);
 	DDP_UNLOCK(toep);
 
 	aio_ddp_cancel_one(job);
 }
 
 int
 t4_aio_queue_ddp(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 
 
 	/* Ignore writes. */
 	if (job->uaiocb.aio_lio_opcode != LIO_READ)
 		return (EOPNOTSUPP);
 
 	DDP_LOCK(toep);
 
 	/*
 	 * XXX: Think about possibly returning errors for ENOTCONN,
 	 * etc.  Perhaps the caller would only queue the request
 	 * if it failed with EOPNOTSUPP?
 	 */
 
 #ifdef VERBOSE_TRACES
 	CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job);
 #endif
 	if (!aio_set_cancel_function(job, t4_aio_cancel_queued))
 		panic("new job was cancelled");
 	TAILQ_INSERT_TAIL(&toep->ddp.aiojobq, job, list);
 	toep->ddp.waiting_count++;
 	toep->ddp.flags |= DDP_OK;
 
 	/*
 	 * Try to handle this request synchronously.  If this has
 	 * to block because the task is running, it will just bail
 	 * and let the task handle it instead.
 	 */
 	aio_ddp_requeue(toep);
 	DDP_UNLOCK(toep);
 	return (0);
 }
 
 void
 t4_ddp_mod_load(void)
 {
 
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl,
 	    CPL_COOKIE_DDP0);
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl,
 	    CPL_COOKIE_DDP1);
 	t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
 	TAILQ_INIT(&ddp_orphan_pagesets);
 	mtx_init(&ddp_orphan_pagesets_lock, "ddp orphans", NULL, MTX_DEF);
 	TASK_INIT(&ddp_orphan_task, 0, ddp_free_orphan_pagesets, NULL);
 }
 
 void
 t4_ddp_mod_unload(void)
 {
 
 	taskqueue_drain(taskqueue_thread, &ddp_orphan_task);
 	MPASS(TAILQ_EMPTY(&ddp_orphan_pagesets));
 	mtx_destroy(&ddp_orphan_pagesets_lock);
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP0);
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP1);
 	t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL);
 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL);
 }
 #endif
Index: projects/capsicum-test/sys/dev/cxgbe/tom/t4_tls.c
===================================================================
--- projects/capsicum-test/sys/dev/cxgbe/tom/t4_tls.c	(revision 345709)
+++ projects/capsicum-test/sys/dev/cxgbe/tom/t4_tls.c	(revision 345710)
@@ -1,1644 +1,1644 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2017-2018 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_inet.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/sglist.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/systm.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_tcb.h"
 #include "crypto/t4_crypto.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 /*
  * The TCP sequence number of a CPL_TLS_DATA mbuf is saved here while
  * the mbuf is in the ulp_pdu_reclaimq.
  */
 #define	tls_tcp_seq	PH_loc.thirtytwo[0]
 
 /*
  * Handshake lock used for the handshake timer.  Having a global lock
  * is perhaps not ideal, but it avoids having to use callout_drain()
  * in tls_uninit_toep() which can't block.  Also, the timer shouldn't
  * actually fire for most connections.
  */
 static struct mtx tls_handshake_lock;
 
 static void
 t4_set_tls_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask,
     uint64_t val)
 {
 	struct adapter *sc = td_adapter(toep->td);
 
 	t4_set_tcb_field(sc, toep->ofld_txq, toep, word, mask, val, 0, 0);
 }
 
 /* TLS and DTLS common routines */
 bool
 can_tls_offload(struct adapter *sc)
 {
 
 	return (sc->tt.tls && sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS);
 }
 
 int
 tls_tx_key(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 
 	return (tls_ofld->tx_key_addr >= 0);
 }
 
 int
 tls_rx_key(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 
 	return (tls_ofld->rx_key_addr >= 0);
 }
 
 static int
 key_size(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 
 	return ((tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE) ?
 		tls_ofld->k_ctx.tx_key_info_size : KEY_IN_DDR_SIZE);
 }
 
 /* Set TLS Key-Id in TCB */
 static void
 t4_set_tls_keyid(struct toepcb *toep, unsigned int key_id)
 {
 
 	t4_set_tls_tcb_field(toep, W_TCB_RX_TLS_KEY_TAG,
 			 V_TCB_RX_TLS_KEY_TAG(M_TCB_RX_TLS_BUF_TAG),
 			 V_TCB_RX_TLS_KEY_TAG(key_id));
 }
 
 /* Clear TF_RX_QUIESCE to re-enable receive. */
 static void
 t4_clear_rx_quiesce(struct toepcb *toep)
 {
 
 	t4_set_tls_tcb_field(toep, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1), 0);
 }
 
 static void
 tls_clr_ofld_mode(struct toepcb *toep)
 {
 
 	tls_stop_handshake_timer(toep);
 
 	/* Operate in PDU extraction mode only. */
 	t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW,
 	    V_TCB_ULP_RAW(M_TCB_ULP_RAW),
 	    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1)));
 	t4_clear_rx_quiesce(toep);
 }
 
 static void
 tls_clr_quiesce(struct toepcb *toep)
 {
 
 	tls_stop_handshake_timer(toep);
 	t4_clear_rx_quiesce(toep);
 }
 
 /*
  * Calculate the TLS data expansion size
  */
 static int
 tls_expansion_size(struct toepcb *toep, int data_len, int full_pdus_only,
     unsigned short *pdus_per_ulp)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct tls_scmd *scmd = &tls_ofld->scmd0;
 	int expn_size = 0, frag_count = 0, pad_per_pdu = 0,
 	    pad_last_pdu = 0, last_frag_size = 0, max_frag_size = 0;
 	int exp_per_pdu = 0;
 	int hdr_len = TLS_HEADER_LENGTH;
 
 	do {
 		max_frag_size = tls_ofld->k_ctx.frag_size;
 		if (G_SCMD_CIPH_MODE(scmd->seqno_numivs) ==
 		   SCMD_CIPH_MODE_AES_GCM) {
 			frag_count = (data_len / max_frag_size);
 			exp_per_pdu = GCM_TAG_SIZE + AEAD_EXPLICIT_DATA_SIZE +
 				hdr_len;
 			expn_size =  frag_count * exp_per_pdu;
 			if (full_pdus_only) {
 				*pdus_per_ulp = data_len / (exp_per_pdu +
 					max_frag_size);
 				if (*pdus_per_ulp > 32)
 					*pdus_per_ulp = 32;
 				else if(!*pdus_per_ulp)
 					*pdus_per_ulp = 1;
 				expn_size = (*pdus_per_ulp) * exp_per_pdu;
 				break;
 			}
 			if ((last_frag_size = data_len % max_frag_size) > 0) {
 				frag_count += 1;
 				expn_size += exp_per_pdu;
 			}
 			break;
 		} else if (G_SCMD_CIPH_MODE(scmd->seqno_numivs) !=
 			   SCMD_CIPH_MODE_NOP) {
 			/* Calculate the number of fragments we can make */
 			frag_count  = (data_len / max_frag_size);
 			if (frag_count > 0) {
 				pad_per_pdu = (((howmany((max_frag_size +
 						       tls_ofld->mac_length),
 						      CIPHER_BLOCK_SIZE)) *
 						CIPHER_BLOCK_SIZE) -
 					       (max_frag_size +
 						tls_ofld->mac_length));
 				if (!pad_per_pdu)
 					pad_per_pdu = CIPHER_BLOCK_SIZE;
 				exp_per_pdu = pad_per_pdu +
 				       	tls_ofld->mac_length +
 					hdr_len + CIPHER_BLOCK_SIZE;
 				expn_size = frag_count * exp_per_pdu;
 			}
 			if (full_pdus_only) {
 				*pdus_per_ulp = data_len / (exp_per_pdu +
 					max_frag_size);
 				if (*pdus_per_ulp > 32)
 					*pdus_per_ulp = 32;
 				else if (!*pdus_per_ulp)
 					*pdus_per_ulp = 1;
 				expn_size = (*pdus_per_ulp) * exp_per_pdu;
 				break;
 			}
 			/* Consider the last fragment */
 			if ((last_frag_size = data_len % max_frag_size) > 0) {
 				pad_last_pdu = (((howmany((last_frag_size +
 							tls_ofld->mac_length),
 						       CIPHER_BLOCK_SIZE)) *
 						 CIPHER_BLOCK_SIZE) -
 						(last_frag_size +
 						 tls_ofld->mac_length));
 				if (!pad_last_pdu)
 					pad_last_pdu = CIPHER_BLOCK_SIZE;
 				expn_size += (pad_last_pdu +
 					      tls_ofld->mac_length + hdr_len +
 					      CIPHER_BLOCK_SIZE);
 			}
 		}
 	} while (0);
 
 	return (expn_size);
 }
 
 /* Copy Key to WR */
 static void
 tls_copy_tx_key(struct toepcb *toep, void *dst)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct ulptx_sc_memrd *sc_memrd;
 	struct ulptx_idata *sc;
 
 	if (tls_ofld->k_ctx.tx_key_info_size <= 0)
 		return;
 
 	if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_DDR) {
 		sc = dst;
 		sc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 		sc->len = htobe32(0);
 		sc_memrd = (struct ulptx_sc_memrd *)(sc + 1);
 		sc_memrd->cmd_to_len = htobe32(V_ULPTX_CMD(ULP_TX_SC_MEMRD) |
 		    V_ULP_TX_SC_MORE(1) |
 		    V_ULPTX_LEN16(tls_ofld->k_ctx.tx_key_info_size >> 4));
 		sc_memrd->addr = htobe32(tls_ofld->tx_key_addr >> 5);
 	} else if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE) {
 		memcpy(dst, &tls_ofld->k_ctx.tx,
 		    tls_ofld->k_ctx.tx_key_info_size);
 	}
 }
 
 /* TLS/DTLS content type  for CPL SFO */
 static inline unsigned char
 tls_content_type(unsigned char content_type)
 {
 	/*
 	 * XXX: Shouldn't this map CONTENT_TYPE_APP_DATA to DATA and
 	 * default to "CUSTOM" for all other types including
 	 * heartbeat?
 	 */
 	switch (content_type) {
 	case CONTENT_TYPE_CCS:
 		return CPL_TX_TLS_SFO_TYPE_CCS;
 	case CONTENT_TYPE_ALERT:
 		return CPL_TX_TLS_SFO_TYPE_ALERT;
 	case CONTENT_TYPE_HANDSHAKE:
 		return CPL_TX_TLS_SFO_TYPE_HANDSHAKE;
 	case CONTENT_TYPE_HEARTBEAT:
 		return CPL_TX_TLS_SFO_TYPE_HEARTBEAT;
 	}
 	return CPL_TX_TLS_SFO_TYPE_DATA;
 }
 
 static unsigned char
 get_cipher_key_size(unsigned int ck_size)
 {
 	switch (ck_size) {
 	case AES_NOP: /* NOP */
 		return 15;
 	case AES_128: /* AES128 */
 		return CH_CK_SIZE_128;
 	case AES_192: /* AES192 */
 		return CH_CK_SIZE_192;
 	case AES_256: /* AES256 */
 		return CH_CK_SIZE_256;
 	default:
 		return CH_CK_SIZE_256;
 	}
 }
 
 static unsigned char
 get_mac_key_size(unsigned int mk_size)
 {
 	switch (mk_size) {
 	case SHA_NOP: /* NOP */
 		return CH_MK_SIZE_128;
 	case SHA_GHASH: /* GHASH */
 	case SHA_512: /* SHA512 */
 		return CH_MK_SIZE_512;
 	case SHA_224: /* SHA2-224 */
 		return CH_MK_SIZE_192;
 	case SHA_256: /* SHA2-256*/
 		return CH_MK_SIZE_256;
 	case SHA_384: /* SHA384 */
 		return CH_MK_SIZE_512;
 	case SHA1: /* SHA1 */
 	default:
 		return CH_MK_SIZE_160;
 	}
 }
 
 static unsigned int
 get_proto_ver(int proto_ver)
 {
 	switch (proto_ver) {
 	case TLS1_2_VERSION:
 		return TLS_1_2_VERSION;
 	case TLS1_1_VERSION:
 		return TLS_1_1_VERSION;
 	case DTLS1_2_VERSION:
 		return DTLS_1_2_VERSION;
 	default:
 		return TLS_VERSION_MAX;
 	}
 }
 
 static void
 tls_rxkey_flit1(struct tls_keyctx *kwr, struct tls_key_context *kctx)
 {
 
 	if (kctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) {
 		kwr->u.rxhdr.ivinsert_to_authinsrt =
 		    htobe64(V_TLS_KEYCTX_TX_WR_IVINSERT(6ULL) |
 			V_TLS_KEYCTX_TX_WR_AADSTRTOFST(1ULL) |
 			V_TLS_KEYCTX_TX_WR_AADSTOPOFST(5ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHSRTOFST(14ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHSTOPOFST(16ULL) |
 			V_TLS_KEYCTX_TX_WR_CIPHERSRTOFST(14ULL) |
 			V_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST(0ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHINSRT(16ULL));
 		kwr->u.rxhdr.ivpresent_to_rxmk_size &=
 			~(V_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT(1));
 		kwr->u.rxhdr.authmode_to_rxvalid &=
 			~(V_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL(1));
 	} else {
 		kwr->u.rxhdr.ivinsert_to_authinsrt =
 		    htobe64(V_TLS_KEYCTX_TX_WR_IVINSERT(6ULL) |
 			V_TLS_KEYCTX_TX_WR_AADSTRTOFST(1ULL) |
 			V_TLS_KEYCTX_TX_WR_AADSTOPOFST(5ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHSRTOFST(22ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHSTOPOFST(0ULL) |
 			V_TLS_KEYCTX_TX_WR_CIPHERSRTOFST(22ULL) |
 			V_TLS_KEYCTX_TX_WR_CIPHERSTOPOFST(0ULL) |
 			V_TLS_KEYCTX_TX_WR_AUTHINSRT(0ULL));
 	}
 }
 
 /* Rx key */
 static void
 prepare_rxkey_wr(struct tls_keyctx *kwr, struct tls_key_context *kctx)
 {
 	unsigned int ck_size = kctx->cipher_secret_size;
 	unsigned int mk_size = kctx->mac_secret_size;
 	int proto_ver = kctx->proto_ver;
 
 	kwr->u.rxhdr.flitcnt_hmacctrl =
 		((kctx->tx_key_info_size >> 4) << 3) | kctx->hmac_ctrl;
 
 	kwr->u.rxhdr.protover_ciphmode =
 		V_TLS_KEYCTX_TX_WR_PROTOVER(get_proto_ver(proto_ver)) |
 		V_TLS_KEYCTX_TX_WR_CIPHMODE(kctx->state.enc_mode);
 
 	kwr->u.rxhdr.authmode_to_rxvalid =
 		V_TLS_KEYCTX_TX_WR_AUTHMODE(kctx->state.auth_mode) |
 		V_TLS_KEYCTX_TX_WR_CIPHAUTHSEQCTRL(1) |
 		V_TLS_KEYCTX_TX_WR_SEQNUMCTRL(3) |
 		V_TLS_KEYCTX_TX_WR_RXVALID(1);
 
 	kwr->u.rxhdr.ivpresent_to_rxmk_size =
 		V_TLS_KEYCTX_TX_WR_IVPRESENT(0) |
 		V_TLS_KEYCTX_TX_WR_RXOPAD_PRESENT(1) |
 		V_TLS_KEYCTX_TX_WR_RXCK_SIZE(get_cipher_key_size(ck_size)) |
 		V_TLS_KEYCTX_TX_WR_RXMK_SIZE(get_mac_key_size(mk_size));
 
 	tls_rxkey_flit1(kwr, kctx);
 
 	/* No key reversal for GCM */
 	if (kctx->state.enc_mode != CH_EVP_CIPH_GCM_MODE) {
 		t4_aes_getdeckey(kwr->keys.edkey, kctx->rx.key,
 				 (kctx->cipher_secret_size << 3));
 		memcpy(kwr->keys.edkey + kctx->cipher_secret_size,
 		       kctx->rx.key + kctx->cipher_secret_size,
 		       (IPAD_SIZE + OPAD_SIZE));
 	} else {
 		memcpy(kwr->keys.edkey, kctx->rx.key,
 		       (kctx->tx_key_info_size - SALT_SIZE));
 		memcpy(kwr->u.rxhdr.rxsalt, kctx->rx.salt, SALT_SIZE);
 	}
 }
 
 /* Tx key */
 static void
 prepare_txkey_wr(struct tls_keyctx *kwr, struct tls_key_context *kctx)
 {
 	unsigned int ck_size = kctx->cipher_secret_size;
 	unsigned int mk_size = kctx->mac_secret_size;
 
 	kwr->u.txhdr.ctxlen =
 		(kctx->tx_key_info_size >> 4);
 	kwr->u.txhdr.dualck_to_txvalid =
 		V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1) |
 		V_TLS_KEYCTX_TX_WR_SALT_PRESENT(1) |
 		V_TLS_KEYCTX_TX_WR_TXCK_SIZE(get_cipher_key_size(ck_size)) |
 		V_TLS_KEYCTX_TX_WR_TXMK_SIZE(get_mac_key_size(mk_size)) |
 		V_TLS_KEYCTX_TX_WR_TXVALID(1);
 
 	memcpy(kwr->keys.edkey, kctx->tx.key, HDR_KCTX_SIZE);
 	if (kctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) {
 		memcpy(kwr->u.txhdr.txsalt, kctx->tx.salt, SALT_SIZE);
 		kwr->u.txhdr.dualck_to_txvalid &=
 			~(V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1));
 	}
 	kwr->u.txhdr.dualck_to_txvalid = htons(kwr->u.txhdr.dualck_to_txvalid);
 }
 
 /* TLS Key memory management */
 static int
 get_new_keyid(struct toepcb *toep, struct tls_key_context *k_ctx)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	vmem_addr_t addr;
 
 	if (vmem_alloc(sc->key_map, TLS_KEY_CONTEXT_SZ, M_NOWAIT | M_FIRSTFIT,
 	    &addr) != 0)
 		return (-1);
 
 	return (addr);
 }
 
 static void
 free_keyid(struct toepcb *toep, int keyid)
 {
 	struct adapter *sc = td_adapter(toep->td);
 
 	vmem_free(sc->key_map, keyid, TLS_KEY_CONTEXT_SZ);
 }
 
 static void
 clear_tls_keyid(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 
 	if (tls_ofld->rx_key_addr >= 0) {
 		free_keyid(toep, tls_ofld->rx_key_addr);
 		tls_ofld->rx_key_addr = -1;
 	}
 	if (tls_ofld->tx_key_addr >= 0) {
 		free_keyid(toep, tls_ofld->tx_key_addr);
 		tls_ofld->tx_key_addr = -1;
 	}
 }
 
 static int
 get_keyid(struct tls_ofld_info *tls_ofld, unsigned int ops)
 {
 	return (ops & KEY_WRITE_RX ? tls_ofld->rx_key_addr :
 		((ops & KEY_WRITE_TX) ? tls_ofld->tx_key_addr : -1));
 }
 
 static int
 get_tp_plen_max(struct tls_ofld_info *tls_ofld)
 {
 	int plen = ((min(3*4096, TP_TX_PG_SZ))/1448) * 1448;
 
 	return (tls_ofld->k_ctx.frag_size <= 8192 ? plen : FC_TP_PLEN_MAX);
 }
 
 /* Send request to get the key-id */
 static int
 tls_program_key_id(struct toepcb *toep, struct tls_key_context *k_ctx)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct adapter *sc = td_adapter(toep->td);
 	struct ofld_tx_sdesc *txsd;
 	int kwrlen, kctxlen, keyid, len;
 	struct wrqe *wr;
 	struct tls_key_req *kwr;
 	struct tls_keyctx *kctx;
 
 	kwrlen = sizeof(*kwr);
 	kctxlen = roundup2(sizeof(*kctx), 32);
 	len = roundup2(kwrlen + kctxlen, 16);
 
 	if (toep->txsd_avail == 0)
 		return (EAGAIN);
 
 	/* Dont initialize key for re-neg */
 	if (!G_KEY_CLR_LOC(k_ctx->l_p_key)) {
 		if ((keyid = get_new_keyid(toep, k_ctx)) < 0) {
 			return (ENOSPC);
 		}
 	} else {
 		keyid = get_keyid(tls_ofld, k_ctx->l_p_key);
 	}
 
 	wr = alloc_wrqe(len, toep->ofld_txq);
 	if (wr == NULL) {
 		free_keyid(toep, keyid);
 		return (ENOMEM);
 	}
 	kwr = wrtod(wr);
 	memset(kwr, 0, kwrlen);
 
 	kwr->wr_hi = htobe32(V_FW_WR_OP(FW_ULPTX_WR) | F_FW_WR_COMPL |
 	    F_FW_WR_ATOMIC);
 	kwr->wr_mid = htobe32(V_FW_WR_LEN16(DIV_ROUND_UP(len, 16)) |
 	    V_FW_WR_FLOWID(toep->tid));
 	kwr->protocol = get_proto_ver(k_ctx->proto_ver);
 	kwr->mfs = htons(k_ctx->frag_size);
 	kwr->reneg_to_write_rx = k_ctx->l_p_key;
 
 	/* master command */
 	kwr->cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) |
 	    V_T5_ULP_MEMIO_ORDER(1) | V_T5_ULP_MEMIO_IMM(1));
 	kwr->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(kctxlen >> 5));
 	kwr->len16 = htobe32((toep->tid << 8) |
 	    DIV_ROUND_UP(len - sizeof(struct work_request_hdr), 16));
 	kwr->kaddr = htobe32(V_ULP_MEMIO_ADDR(keyid >> 5));
 
 	/* sub command */
 	kwr->sc_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	kwr->sc_len = htobe32(kctxlen);
 
 	kctx = (struct tls_keyctx *)(kwr + 1);
 	memset(kctx, 0, kctxlen);
 
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_TX) {
 		tls_ofld->tx_key_addr = keyid;
 		prepare_txkey_wr(kctx, k_ctx);
 	} else if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 		tls_ofld->rx_key_addr = keyid;
 		prepare_rxkey_wr(kctx, k_ctx);
 	}
 
 	txsd = &toep->txsd[toep->txsd_pidx];
 	txsd->tx_credits = DIV_ROUND_UP(len, 16);
 	txsd->plen = 0;
 	toep->tx_credits -= txsd->tx_credits;
 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
 	t4_wrq_tx(sc, wr);
 
 	return (0);
 }
 
 /* Store a key received from SSL in DDR. */
 static int
 program_key_context(struct tcpcb *tp, struct toepcb *toep,
     struct tls_key_context *uk_ctx)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct tls_key_context *k_ctx;
 	int error, key_offset;
 
 	if (tp->t_state != TCPS_ESTABLISHED) {
 		/*
 		 * XXX: Matches Linux driver, but not sure this is a
 		 * very appropriate error.
 		 */
 		return (ENOENT);
 	}
 
 	/* Stop timer on handshake completion */
 	tls_stop_handshake_timer(toep);
 
 	toep->flags &= ~TPF_FORCE_CREDITS;
 
 	CTR4(KTR_CXGBE, "%s: tid %d %s proto_ver %#x", __func__, toep->tid,
 	    G_KEY_GET_LOC(uk_ctx->l_p_key) == KEY_WRITE_RX ? "KEY_WRITE_RX" :
 	    "KEY_WRITE_TX", uk_ctx->proto_ver);
 
 	if (G_KEY_GET_LOC(uk_ctx->l_p_key) == KEY_WRITE_RX &&
 	    toep->ulp_mode != ULP_MODE_TLS)
 		return (EOPNOTSUPP);
 
 	/* Don't copy the 'tx' and 'rx' fields. */
 	k_ctx = &tls_ofld->k_ctx;
 	memcpy(&k_ctx->l_p_key, &uk_ctx->l_p_key,
 	    sizeof(*k_ctx) - offsetof(struct tls_key_context, l_p_key));
 
 	/* TLS version != 1.1 and !1.2 OR DTLS != 1.2 */
 	if (get_proto_ver(k_ctx->proto_ver) > DTLS_1_2_VERSION) {
 		if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 			tls_ofld->rx_key_addr = -1;
 			t4_clear_rx_quiesce(toep);
 		} else {
 			tls_ofld->tx_key_addr = -1;
 		}
 		return (0);
 	}
 
 	if (k_ctx->state.enc_mode == CH_EVP_CIPH_GCM_MODE) {
 		k_ctx->iv_size = 4;
 		k_ctx->mac_first = 0;
 		k_ctx->hmac_ctrl = 0;
 	} else {
 		k_ctx->iv_size = 8; /* for CBC, iv is 16B, unit of 2B */
 		k_ctx->mac_first = 1;
 	}
 
 	tls_ofld->scmd0.seqno_numivs =
 		(V_SCMD_SEQ_NO_CTRL(3) |
 		 V_SCMD_PROTO_VERSION(get_proto_ver(k_ctx->proto_ver)) |
 		 V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) |
 		 V_SCMD_CIPH_AUTH_SEQ_CTRL((k_ctx->mac_first == 0)) |
 		 V_SCMD_CIPH_MODE(k_ctx->state.enc_mode) |
 		 V_SCMD_AUTH_MODE(k_ctx->state.auth_mode) |
 		 V_SCMD_HMAC_CTRL(k_ctx->hmac_ctrl) |
 		 V_SCMD_IV_SIZE(k_ctx->iv_size));
 
 	tls_ofld->scmd0.ivgen_hdrlen =
 		(V_SCMD_IV_GEN_CTRL(k_ctx->iv_ctrl) |
 		 V_SCMD_KEY_CTX_INLINE(0) |
 		 V_SCMD_TLS_FRAG_ENABLE(1));
 
 	tls_ofld->mac_length = k_ctx->mac_secret_size;
 
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 		k_ctx->rx = uk_ctx->rx;
 		/* Dont initialize key for re-neg */
 		if (!G_KEY_CLR_LOC(k_ctx->l_p_key))
 			tls_ofld->rx_key_addr = -1;
 	} else {
 		k_ctx->tx = uk_ctx->tx;
 		/* Dont initialize key for re-neg */
 		if (!G_KEY_CLR_LOC(k_ctx->l_p_key))
 			tls_ofld->tx_key_addr = -1;
 	}
 
 	/* Flush pending data before new Tx key becomes active */
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_TX) {
 		struct sockbuf *sb;
 
 		/* XXX: This might not drain everything. */
 		t4_push_frames(sc, toep, 0);
 		sb = &toep->inp->inp_socket->so_snd;
 		SOCKBUF_LOCK(sb);
 
 		/* XXX: This asserts that everything has been pushed. */
 		MPASS(sb->sb_sndptr == NULL || sb->sb_sndptr->m_next == NULL);
 		sb->sb_sndptr = NULL;
 		tls_ofld->sb_off = sbavail(sb);
 		SOCKBUF_UNLOCK(sb);
 		tls_ofld->tx_seq_no = 0;
 	}
 
 	if ((G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) ||
 	    (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_DDR)) {
 		error = tls_program_key_id(toep, k_ctx);
 		if (error) {
 			/* XXX: Only clear quiesce for KEY_WRITE_RX? */
 			t4_clear_rx_quiesce(toep);
 			return (error);
 		}
 	}
 
 	if (G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) {
 		/*
 		 * RX key tags are an index into the key portion of MA
 		 * memory stored as an offset from the base address in
 		 * units of 64 bytes.
 		 */
 		key_offset = tls_ofld->rx_key_addr - sc->vres.key.start;
 		t4_set_tls_keyid(toep, key_offset / 64);
 		t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW,
 				 V_TCB_ULP_RAW(M_TCB_ULP_RAW),
 				 V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) |
 						V_TF_TLS_CONTROL(1) |
 						V_TF_TLS_ACTIVE(1) |
 						V_TF_TLS_ENABLE(1))));
 		t4_set_tls_tcb_field(toep, W_TCB_TLS_SEQ,
 				 V_TCB_TLS_SEQ(M_TCB_TLS_SEQ),
 				 V_TCB_TLS_SEQ(0));
 		t4_clear_rx_quiesce(toep);
 	} else {
 		unsigned short pdus_per_ulp;
 
 		if (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_IMMEDIATE)
 			tls_ofld->tx_key_addr = 1;
 
 		tls_ofld->fcplenmax = get_tp_plen_max(tls_ofld);
 		tls_ofld->expn_per_ulp = tls_expansion_size(toep,
 				tls_ofld->fcplenmax, 1, &pdus_per_ulp);
 		tls_ofld->pdus_per_ulp = pdus_per_ulp;
 		tls_ofld->adjusted_plen = tls_ofld->pdus_per_ulp *
 			((tls_ofld->expn_per_ulp/tls_ofld->pdus_per_ulp) +
 			 tls_ofld->k_ctx.frag_size);
 	}
 
 	return (0);
 }
 
 /*
  * In some cases a client connection can hang without sending the
  * ServerHelloDone message from the NIC to the host.  Send a dummy
  * RX_DATA_ACK with RX_MODULATE to unstick the connection.
  */
 static void
 tls_send_handshake_ack(void *arg)
 {
 	struct toepcb *toep = arg;
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct adapter *sc = td_adapter(toep->td);
 
 	/*
 	 * XXX: Does not have the t4_get_tcb() checks to refine the
 	 * workaround.
 	 */
 	callout_schedule(&tls_ofld->handshake_timer, TLS_SRV_HELLO_RD_TM * hz);
 
 	CTR2(KTR_CXGBE, "%s: tid %d sending RX_DATA_ACK", __func__, toep->tid);
 	send_rx_modulate(sc, toep);
 }
 
 static void
 tls_start_handshake_timer(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 
 	mtx_lock(&tls_handshake_lock);
 	callout_reset(&tls_ofld->handshake_timer, TLS_SRV_HELLO_BKOFF_TM * hz,
 	    tls_send_handshake_ack, toep);
 	mtx_unlock(&tls_handshake_lock);
 }
 
 void
 tls_stop_handshake_timer(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 
 	mtx_lock(&tls_handshake_lock);
 	callout_stop(&tls_ofld->handshake_timer);
 	mtx_unlock(&tls_handshake_lock);
 }
 
 int
 t4_ctloutput_tls(struct socket *so, struct sockopt *sopt)
 {
 	struct tls_key_context uk_ctx;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 	int error, optval;
 
 	error = 0;
 	if (sopt->sopt_dir == SOPT_SET &&
 	    sopt->sopt_name == TCP_TLSOM_SET_TLS_CONTEXT) {
 		error = sooptcopyin(sopt, &uk_ctx, sizeof(uk_ctx),
 		    sizeof(uk_ctx));
 		if (error)
 			return (error);
 	}
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	toep = tp->t_toe;
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case TCP_TLSOM_SET_TLS_CONTEXT:
 			error = program_key_context(tp, toep, &uk_ctx);
 			INP_WUNLOCK(inp);
 			break;
 		case TCP_TLSOM_CLR_TLS_TOM:
 			if (toep->ulp_mode == ULP_MODE_TLS) {
 				CTR2(KTR_CXGBE, "%s: tid %d CLR_TLS_TOM",
 				    __func__, toep->tid);
 				tls_clr_ofld_mode(toep);
 			} else
 				error = EOPNOTSUPP;
 			INP_WUNLOCK(inp);
 			break;
 		case TCP_TLSOM_CLR_QUIES:
 			if (toep->ulp_mode == ULP_MODE_TLS) {
 				CTR2(KTR_CXGBE, "%s: tid %d CLR_QUIES",
 				    __func__, toep->tid);
 				tls_clr_quiesce(toep);
 			} else
 				error = EOPNOTSUPP;
 			INP_WUNLOCK(inp);
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = EOPNOTSUPP;
 			break;
 		}
 		break;
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case TCP_TLSOM_GET_TLS_TOM:
 			/*
 			 * TLS TX is permitted on any TOE socket, but
 			 * TLS RX requires a TLS ULP mode.
 			 */
 			optval = TLS_TOM_NONE;
 			if (can_tls_offload(td_adapter(toep->td))) {
 				switch (toep->ulp_mode) {
 				case ULP_MODE_NONE:
 				case ULP_MODE_TCPDDP:
 					optval = TLS_TOM_TXONLY;
 					break;
 				case ULP_MODE_TLS:
 					optval = TLS_TOM_BOTH;
 					break;
 				}
 			}
 			CTR3(KTR_CXGBE, "%s: tid %d GET_TLS_TOM = %d",
 			    __func__, toep->tid, optval);
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = EOPNOTSUPP;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 
 void
 tls_init_toep(struct toepcb *toep)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 
 	tls_ofld->key_location = TLS_SFO_WR_CONTEXTLOC_DDR;
 	tls_ofld->rx_key_addr = -1;
 	tls_ofld->tx_key_addr = -1;
 	if (toep->ulp_mode == ULP_MODE_TLS)
 		callout_init_mtx(&tls_ofld->handshake_timer,
 		    &tls_handshake_lock, 0);
 }
 
 void
 tls_establish(struct toepcb *toep)
 {
 
 	/*
 	 * Enable PDU extraction.
 	 *
 	 * XXX: Supposedly this should be done by the firmware when
 	 * the ULP_MODE FLOWC parameter is set in send_flowc_wr(), but
 	 * in practice this seems to be required.
 	 */
 	CTR2(KTR_CXGBE, "%s: tid %d setting TLS_ENABLE", __func__, toep->tid);
 	t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW, V_TCB_ULP_RAW(M_TCB_ULP_RAW),
 	    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1)));
 
 	toep->flags |= TPF_FORCE_CREDITS;
 
 	tls_start_handshake_timer(toep);
 }
 
 void
 tls_uninit_toep(struct toepcb *toep)
 {
 
 	if (toep->ulp_mode == ULP_MODE_TLS)
 		tls_stop_handshake_timer(toep);
 	clear_tls_keyid(toep);
 }
 
 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
 #define	MIN_OFLD_TLSTX_CREDITS(toep)					\
 	(howmany(sizeof(struct fw_tlstx_data_wr) +			\
 	    sizeof(struct cpl_tx_tls_sfo) + key_size((toep)) +		\
 	    CIPHER_BLOCK_SIZE + 1, 16))
 
 static inline u_int
 max_imm_tls_space(int tx_credits)
 {
 	const int n = 2;	/* Use only up to 2 desc for imm. data WR */
 	int space;
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits >= (n * EQ_ESIZE) / 16)
 		space = (n * EQ_ESIZE);
 	else
 		space = tx_credits * 16;
 	return (space);
 }
 
 static int
 count_mbuf_segs(struct mbuf *m, int skip, int len, int *max_nsegs_1mbufp)
 {
 	int max_nsegs_1mbuf, n, nsegs;
 
 	while (skip >= m->m_len) {
 		skip -= m->m_len;
 		m = m->m_next;
 	}
 
 	nsegs = 0;
 	max_nsegs_1mbuf = 0;
 	while (len > 0) {
 		n = sglist_count(mtod(m, char *) + skip, m->m_len - skip);
 		if (n > max_nsegs_1mbuf)
 			max_nsegs_1mbuf = n;
 		nsegs += n;
 		len -= m->m_len - skip;
 		skip = 0;
 		m = m->m_next;
 	}
 	*max_nsegs_1mbufp = max_nsegs_1mbuf;
 	return (nsegs);
 }
 
 static void
 write_tlstx_wr(struct fw_tlstx_data_wr *txwr, struct toepcb *toep,
     unsigned int immdlen, unsigned int plen, unsigned int expn,
     unsigned int pdus, uint8_t credits, int shove, int imm_ivs)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	unsigned int len = plen + expn;
 
 	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_TLSTX_DATA_WR) |
 	    V_FW_TLSTX_DATA_WR_COMPL(1) |
 	    V_FW_TLSTX_DATA_WR_IMMDLEN(immdlen));
 	txwr->flowid_len16 = htobe32(V_FW_TLSTX_DATA_WR_FLOWID(toep->tid) |
 	    V_FW_TLSTX_DATA_WR_LEN16(credits));
 	txwr->plen = htobe32(len);
 	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ULP_MODE_TLS) |
 	    V_TX_URG(0) | /* F_T6_TX_FORCE | */ V_TX_SHOVE(shove));
 	txwr->ctxloc_to_exp = htobe32(V_FW_TLSTX_DATA_WR_NUMIVS(pdus) |
 	    V_FW_TLSTX_DATA_WR_EXP(expn) |
 	    V_FW_TLSTX_DATA_WR_CTXLOC(tls_ofld->key_location) |
 	    V_FW_TLSTX_DATA_WR_IVDSGL(!imm_ivs) |
 	    V_FW_TLSTX_DATA_WR_KEYSIZE(tls_ofld->k_ctx.tx_key_info_size >> 4));
 	txwr->mfs = htobe16(tls_ofld->k_ctx.frag_size);
 	txwr->adjustedplen_pkd = htobe16(
 	    V_FW_TLSTX_DATA_WR_ADJUSTEDPLEN(tls_ofld->adjusted_plen));
 	txwr->expinplenmax_pkd = htobe16(
 	    V_FW_TLSTX_DATA_WR_EXPINPLENMAX(tls_ofld->expn_per_ulp));
 	txwr->pdusinplenmax_pkd = htobe16(
 	    V_FW_TLSTX_DATA_WR_PDUSINPLENMAX(tls_ofld->pdus_per_ulp));
 }
 
 static void
 write_tlstx_cpl(struct cpl_tx_tls_sfo *cpl, struct toepcb *toep,
     struct tls_hdr *tls_hdr, unsigned int plen, unsigned int pdus)
 {
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	int data_type, seglen;
 
 	if (plen < tls_ofld->k_ctx.frag_size)
 		seglen = plen;
 	else
 		seglen = tls_ofld->k_ctx.frag_size;
 	data_type = tls_content_type(tls_hdr->type);
 	cpl->op_to_seg_len = htobe32(V_CPL_TX_TLS_SFO_OPCODE(CPL_TX_TLS_SFO) |
 	    V_CPL_TX_TLS_SFO_DATA_TYPE(data_type) |
 	    V_CPL_TX_TLS_SFO_CPL_LEN(2) | V_CPL_TX_TLS_SFO_SEG_LEN(seglen));
 	cpl->pld_len = htobe32(plen);
 	if (data_type == CPL_TX_TLS_SFO_TYPE_HEARTBEAT)
 		cpl->type_protover = htobe32(
 		    V_CPL_TX_TLS_SFO_TYPE(tls_hdr->type));
 	cpl->seqno_numivs = htobe32(tls_ofld->scmd0.seqno_numivs |
 	    V_SCMD_NUM_IVS(pdus));
 	cpl->ivgen_hdrlen = htobe32(tls_ofld->scmd0.ivgen_hdrlen);
 	cpl->scmd1 = htobe64(tls_ofld->tx_seq_no);
 	tls_ofld->tx_seq_no += pdus;
 }
 
 /*
  * Similar to write_tx_sgl() except that it accepts an optional
  * trailer buffer for IVs.
  */
 static void
 write_tlstx_sgl(void *dst, struct mbuf *start, int skip, int plen,
     void *iv_buffer, int iv_len, int nsegs, int n)
 {
 	struct mbuf *m;
 	struct ulptx_sgl *usgl = dst;
 	int i, j, rc;
 	struct sglist sg;
 	struct sglist_seg segs[n];
 
 	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
 
 	sglist_init(&sg, n, segs);
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 
 	for (m = start; skip >= m->m_len; m = m->m_next)
 		skip -= m->m_len;
 
 	i = -1;
 	for (m = start; plen > 0; m = m->m_next) {
 		rc = sglist_append(&sg, mtod(m, char *) + skip,
 		    m->m_len - skip);
 		if (__predict_false(rc != 0))
 			panic("%s: sglist_append %d", __func__, rc);
 		plen -= m->m_len - skip;
 		skip = 0;
 
 		for (j = 0; j < sg.sg_nseg; i++, j++) {
 			if (i < 0) {
 				usgl->len0 = htobe32(segs[j].ss_len);
 				usgl->addr0 = htobe64(segs[j].ss_paddr);
 			} else {
 				usgl->sge[i / 2].len[i & 1] =
 				    htobe32(segs[j].ss_len);
 				usgl->sge[i / 2].addr[i & 1] =
 				    htobe64(segs[j].ss_paddr);
 			}
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 		}
 		sglist_reset(&sg);
 	}
 	if (iv_buffer != NULL) {
 		rc = sglist_append(&sg, iv_buffer, iv_len);
 		if (__predict_false(rc != 0))
 			panic("%s: sglist_append %d", __func__, rc);
 
 		for (j = 0; j < sg.sg_nseg; i++, j++) {
 			if (i < 0) {
 				usgl->len0 = htobe32(segs[j].ss_len);
 				usgl->addr0 = htobe64(segs[j].ss_paddr);
 			} else {
 				usgl->sge[i / 2].len[i & 1] =
 				    htobe32(segs[j].ss_len);
 				usgl->sge[i / 2].addr[i & 1] =
 				    htobe64(segs[j].ss_paddr);
 			}
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 		}
 	}
 	if (i & 1)
 		usgl->sge[i / 2].len[1] = htobe32(0);
 	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, iv_buffer %p",
 	    __func__, nsegs, start, iv_buffer));
 }
 
 /*
  * Similar to t4_push_frames() but handles TLS sockets when TLS offload
  * is enabled.  Rather than transmitting bulk data, the socket buffer
  * contains TLS records.  The work request requires a full TLS record,
  * so batch mbufs up until a full TLS record is seen.  This requires
  * reading the TLS header out of the start of each record to determine
  * its length.
  */
 void
 t4_push_tls_records(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct tls_hdr thdr;
 	struct mbuf *sndptr;
 	struct fw_tlstx_data_wr *txwr;
 	struct cpl_tx_tls_sfo *cpl;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, space, max_nsegs_1mbuf, wr_len;
 	u_int expn_size, iv_len, pdus, sndptroff;
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_snd;
 	int tls_size, tx_credits, shove, /* compl,*/ sowwakeup;
 	struct ofld_tx_sdesc *txsd;
 	bool imm_ivs, imm_payload;
 	void *iv_buffer, *iv_dst, *buf;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 
 	KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
 	    toep->ulp_mode == ULP_MODE_TCPDDP || toep->ulp_mode == ULP_MODE_TLS,
 	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
 	KASSERT(tls_tx_key(toep),
 	    ("%s: TX key not set for toep %p", __func__, toep));
 
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
 	    __func__, toep->tid, toep->flags, tp->t_flags);
 #endif
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 #ifdef RATELIMIT
 	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
 	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	}
 #endif
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	txsd = &toep->txsd[toep->txsd_pidx];
 	for (;;) {
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		space = max_imm_tls_space(tx_credits);
 		wr_len = sizeof(struct fw_tlstx_data_wr) +
 		    sizeof(struct cpl_tx_tls_sfo) + key_size(toep);
 		if (wr_len + CIPHER_BLOCK_SIZE + 1 > space) {
 #ifdef VERBOSE_TRACES
 			CTR5(KTR_CXGBE,
 			    "%s: tid %d tx_credits %d min_wr %d space %d",
 			    __func__, toep->tid, tx_credits, wr_len +
 			    CIPHER_BLOCK_SIZE + 1, space);
 #endif
 			return;
 		}
 
 		SOCKBUF_LOCK(sb);
 		sowwakeup = drop;
 		if (drop) {
 			sbdrop_locked(sb, drop);
 			MPASS(tls_ofld->sb_off >= drop);
 			tls_ofld->sb_off -= drop;
 			drop = 0;
 		}
 
 		/*
 		 * Send a FIN if requested, but only if there's no
 		 * more data to send.
 		 */
 		if (sbavail(sb) == tls_ofld->sb_off &&
 		    toep->flags & TPF_SEND_FIN) {
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 			t4_close_conn(sc, toep);
 			return;
 		}
 
 		if (sbavail(sb) < tls_ofld->sb_off + TLS_HEADER_LENGTH) {
 			/*
 			 * A full TLS header is not yet queued, stop
 			 * for now until more data is added to the
 			 * socket buffer.  However, if the connection
 			 * has been closed, we will never get the rest
 			 * of the header so just discard the partial
 			 * header and close the connection.
 			 */
 #ifdef VERBOSE_TRACES
 			CTR5(KTR_CXGBE, "%s: tid %d sbavail %d sb_off %d%s",
 			    __func__, toep->tid, sbavail(sb), tls_ofld->sb_off,
 			    toep->flags & TPF_SEND_FIN ? "" : " SEND_FIN");
 #endif
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 			if (toep->flags & TPF_SEND_FIN)
 				t4_close_conn(sc, toep);
 			return;
 		}
 
 		/* Read the header of the next TLS record. */
 		sndptr = sbsndmbuf(sb, tls_ofld->sb_off, &sndptroff);
 		MPASS(!IS_AIOTX_MBUF(sndptr));
 		m_copydata(sndptr, sndptroff, sizeof(thdr), (caddr_t)&thdr);
 		tls_size = htons(thdr.length);
 		plen = TLS_HEADER_LENGTH + tls_size;
 		pdus = howmany(tls_size, tls_ofld->k_ctx.frag_size);
 		iv_len = pdus * CIPHER_BLOCK_SIZE;
 
 		if (sbavail(sb) < tls_ofld->sb_off + plen) {
 			/*
 			 * The full TLS record is not yet queued, stop
 			 * for now until more data is added to the
 			 * socket buffer.  However, if the connection
 			 * has been closed, we will never get the rest
 			 * of the record so just discard the partial
 			 * record and close the connection.
 			 */
 #ifdef VERBOSE_TRACES
 			CTR6(KTR_CXGBE,
 			    "%s: tid %d sbavail %d sb_off %d plen %d%s",
 			    __func__, toep->tid, sbavail(sb), tls_ofld->sb_off,
 			    plen, toep->flags & TPF_SEND_FIN ? "" :
 			    " SEND_FIN");
 #endif
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 			if (toep->flags & TPF_SEND_FIN)
 				t4_close_conn(sc, toep);
 			return;
 		}
 
 		/* Shove if there is no additional data pending. */
 		shove = (sbavail(sb) == tls_ofld->sb_off + plen) &&
 		    !(tp->t_flags & TF_MORETOCOME);
 
 		if (sb->sb_flags & SB_AUTOSIZE &&
 		    V_tcp_do_autosndbuf &&
 		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
 		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
 			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
 			    V_tcp_autosndbuf_max);
 
 			if (!sbreserve_locked(sb, newsize, so, NULL))
 				sb->sb_flags &= ~SB_AUTOSIZE;
 			else
 				sowwakeup = 1;	/* room available */
 		}
 		if (sowwakeup)
 			sowwakeup_locked(so);
 		else
 			SOCKBUF_UNLOCK(sb);
 		SOCKBUF_UNLOCK_ASSERT(sb);
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		/* Determine whether to use immediate vs SGL. */
 		imm_payload = false;
 		imm_ivs = false;
 		if (wr_len + iv_len <= space) {
 			imm_ivs = true;
 			wr_len += iv_len;
 			if (wr_len + tls_size <= space) {
 				wr_len += tls_size;
 				imm_payload = true;
 			}
 		}
 
 		/* Allocate space for IVs if needed. */
 		if (!imm_ivs) {
 			iv_buffer = malloc(iv_len, M_CXGBE, M_NOWAIT);
 			if (iv_buffer == NULL) {
 				/*
 				 * XXX: How to restart this?
 				 */
 				if (sowwakeup)
 					sowwakeup_locked(so);
 				else
 					SOCKBUF_UNLOCK(sb);
 				SOCKBUF_UNLOCK_ASSERT(sb);
 				CTR3(KTR_CXGBE,
 			    "%s: tid %d failed to alloc IV space len %d",
 				    __func__, toep->tid, iv_len);
 				return;
 			}
 		} else
 			iv_buffer = NULL;
 
 		/* Determine size of SGL. */
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		if (!imm_payload) {
 			nsegs = count_mbuf_segs(sndptr, sndptroff +
 			    TLS_HEADER_LENGTH, tls_size, &max_nsegs_1mbuf);
 			if (!imm_ivs) {
 				int n = sglist_count(iv_buffer, iv_len);
 				nsegs += n;
 				if (n > max_nsegs_1mbuf)
 					max_nsegs_1mbuf = n;
 			}
 
 			/* Account for SGL in work request length. */
 			wr_len += sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 		}
 
 		wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
 		if (wr == NULL) {
 			/* XXX: how will we recover from this? */
 			toep->flags |= TPF_TX_SUSPENDED;
 			return;
 		}
 
 #ifdef VERBOSE_TRACES
 		CTR5(KTR_CXGBE, "%s: tid %d TLS record %d len %#x pdus %d",
 		    __func__, toep->tid, thdr.type, tls_size, pdus);
 #endif
 		txwr = wrtod(wr);
 		cpl = (struct cpl_tx_tls_sfo *)(txwr + 1);
 		memset(txwr, 0, roundup2(wr_len, 16));
 		credits = howmany(wr_len, 16);
 		expn_size = tls_expansion_size(toep, tls_size, 0, NULL);
 		write_tlstx_wr(txwr, toep, imm_payload ? tls_size : 0,
 		    tls_size, expn_size, pdus, credits, shove, imm_ivs ? 1 : 0);
 		write_tlstx_cpl(cpl, toep, &thdr, tls_size, pdus);
 		tls_copy_tx_key(toep, cpl + 1);
 
 		/* Generate random IVs */
 		buf = (char *)(cpl + 1) + key_size(toep);
 		if (imm_ivs) {
 			MPASS(iv_buffer == NULL);
 			iv_dst = buf;
 			buf = (char *)iv_dst + iv_len;
 		} else
 			iv_dst = iv_buffer;
 		arc4rand(iv_dst, iv_len, 0);
 
 		if (imm_payload) {
 			m_copydata(sndptr, sndptroff + TLS_HEADER_LENGTH,
 			    tls_size, buf);
 		} else {
 			write_tlstx_sgl(buf, sndptr,
 			    sndptroff + TLS_HEADER_LENGTH, tls_size, iv_buffer,
 			    iv_len, nsegs, max_nsegs_1mbuf);
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		toep->tx_credits -= credits;
 
 		tp->snd_nxt += plen;
 		tp->snd_max += plen;
 
 		SOCKBUF_LOCK(sb);
 		sbsndptr_adv(sb, sb->sb_sndptr, plen);
 		tls_ofld->sb_off += plen;
 		SOCKBUF_UNLOCK(sb);
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TLSTX_CREDITS(toep))
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd->iv_buffer = iv_buffer;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		atomic_add_long(&toep->vi->pi->tx_tls_records, 1);
 		atomic_add_long(&toep->vi->pi->tx_tls_octets, plen);
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	}
 }
 
 /*
  * For TLS data we place received mbufs received via CPL_TLS_DATA into
  * an mbufq in the TLS offload state.  When CPL_RX_TLS_CMP is
  * received, the completed PDUs are placed into the socket receive
  * buffer.
  *
  * The TLS code reuses the ulp_pdu_reclaimq to hold the pending mbufs.
  */
 static int
 do_tls_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_tls_data *cpl = mtod(m, const void *);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	int len;
 
 	/* XXX: Should this match do_rx_data instead? */
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	atomic_add_long(&toep->vi->pi->rx_tls_octets, len);
 
 	KASSERT(len == G_CPL_TLS_DATA_LENGTH(be32toh(cpl->length_pkd)),
 	    ("%s: payload length mismatch", __func__));
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	/* Save TCP sequence number. */
 	m->m_pkthdr.tls_tcp_seq = be32toh(cpl->seq);
 
 	if (mbufq_enqueue(&toep->ulp_pdu_reclaimq, m)) {
 #ifdef INVARIANTS
 		panic("Failed to queue TLS data packet");
 #else
 		printf("%s: Failed to queue TLS data packet\n", __func__);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 #endif
 	}
 
 	tp = intotcpcb(inp);
 	tp->t_rcvtime = ticks;
 
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len,
 	    be32toh(cpl->seq));
 #endif
 
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_tls_cmp *cpl = mtod(m, const void *);
 	struct tlsrx_hdr_pkt *tls_hdr_pkt;
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct mbuf *tls_data;
 	int len, pdu_length, pdu_overhead, sb_length;
 
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	atomic_add_long(&toep->vi->pi->rx_tls_records, 1);
 
 	KASSERT(len == G_CPL_RX_TLS_CMP_LENGTH(be32toh(cpl->pdulength_length)),
 	    ("%s: payload length mismatch", __func__));
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	pdu_length = G_CPL_RX_TLS_CMP_PDULENGTH(be32toh(cpl->pdulength_length));
 
 	tp = intotcpcb(inp);
 
 #ifdef VERBOSE_TRACES
 	CTR6(KTR_CXGBE, "%s: tid %u PDU len %d len %d seq %u, rcv_nxt %u",
 	    __func__, tid, pdu_length, len, be32toh(cpl->seq), tp->rcv_nxt);
 #endif
 
 	tp->rcv_nxt += pdu_length;
 	if (tp->rcv_wnd < pdu_length) {
 		toep->tls.rcv_over += pdu_length - tp->rcv_wnd;
 		tp->rcv_wnd = 0;
 	} else
 		tp->rcv_wnd -= pdu_length;
 
 	/* XXX: Not sure what to do about urgent data. */
 
 	/*
 	 * The payload of this CPL is the TLS header followed by
 	 * additional fields.
 	 */
 	KASSERT(m->m_len >= sizeof(*tls_hdr_pkt),
 	    ("%s: payload too small", __func__));
 	tls_hdr_pkt = mtod(m, void *);
 
 	/*
 	 * Only the TLS header is sent to OpenSSL, so report errors by
 	 * altering the record type.
 	 */
 	if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) != 0)
 		tls_hdr_pkt->type = CONTENT_TYPE_ERROR;
 
 	/* Trim this CPL's mbuf to only include the TLS header. */
 	KASSERT(m->m_len == len && m->m_next == NULL,
 	    ("%s: CPL spans multiple mbufs", __func__));
 	m->m_len = TLS_HEADER_LENGTH;
 	m->m_pkthdr.len = TLS_HEADER_LENGTH;
 
 	tls_data = mbufq_dequeue(&toep->ulp_pdu_reclaimq);
 	if (tls_data != NULL) {
 		KASSERT(be32toh(cpl->seq) == tls_data->m_pkthdr.tls_tcp_seq,
 		    ("%s: sequence mismatch", __func__));
 
 		/*
 		 * Update the TLS header length to be the length of
 		 * the payload data.
 		 */
 		tls_hdr_pkt->length = htobe16(tls_data->m_pkthdr.len);
 
 		m->m_next = tls_data;
 		m->m_pkthdr.len += tls_data->m_len;
 	}
 
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		struct epoch_tracker et;
 
 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
 		    __func__, tid, pdu_length);
 		m_freem(m);
 		SOCKBUF_UNLOCK(sb);
 		INP_WUNLOCK(inp);
 
 		CURVNET_SET(toep->vnet);
 		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		if (tp)
 			INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 		CURVNET_RESTORE();
 
 		return (0);
 	}
 
 	/*
 	 * Not all of the bytes on the wire are included in the socket
 	 * buffer (e.g. the MAC of the TLS record).  However, those
 	 * bytes are included in the TCP sequence space.  To handle
 	 * this, compute the delta for this TLS record in
 	 * 'pdu_overhead' and treat those bytes as having already been
 	 * "read" by the application for the purposes of expanding the
 	 * window.  The meat of the TLS record passed to the
 	 * application ('sb_length') will still not be counted as
 	 * "read" until userland actually reads the bytes.
 	 *
 	 * XXX: Some of the calculations below are probably still not
 	 * really correct.
 	 */
 	sb_length = m->m_pkthdr.len;
 	pdu_overhead = pdu_length - sb_length;
 	toep->rx_credits += pdu_overhead;
 	tp->rcv_wnd += pdu_overhead;
 	tp->rcv_adv += pdu_overhead;
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    sb_length > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
-		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
+		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
 		else
 			toep->rx_credits += newsize - hiwat;
 	}
 
 	KASSERT(toep->sb_cc >= sbused(sb),
 	    ("%s: sb %p has more data (%d) than last time (%d).",
 	    __func__, sb, sbused(sb), toep->sb_cc));
 	toep->rx_credits += toep->sb_cc - sbused(sb);
 	sbappendstream_locked(sb, m, 0);
 	toep->sb_cc = sbused(sb);
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %u PDU overhead %d rx_credits %u rcv_wnd %u",
 	    __func__, tid, pdu_overhead, toep->rx_credits, tp->rcv_wnd);
 #endif
 	if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
 		int credits;
 
 		credits = send_rx_credits(sc, toep, toep->rx_credits);
 		toep->rx_credits -= credits;
 		tp->rcv_wnd += credits;
 		tp->rcv_adv += credits;
 	}
 
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 void
 t4_tls_mod_load(void)
 {
 
 	mtx_init(&tls_handshake_lock, "t4tls handshake", NULL, MTX_DEF);
 	t4_register_cpl_handler(CPL_TLS_DATA, do_tls_data);
 	t4_register_cpl_handler(CPL_RX_TLS_CMP, do_rx_tls_cmp);
 }
 
 void
 t4_tls_mod_unload(void)
 {
 
 	t4_register_cpl_handler(CPL_TLS_DATA, NULL);
 	t4_register_cpl_handler(CPL_RX_TLS_CMP, NULL);
 	mtx_destroy(&tls_handshake_lock);
 }
 #endif	/* TCP_OFFLOAD */
Index: projects/capsicum-test/sys/fs/fuse/fuse_debug.h
===================================================================
--- projects/capsicum-test/sys/fs/fuse/fuse_debug.h	(revision 345709)
+++ projects/capsicum-test/sys/fs/fuse/fuse_debug.h	(nonexistent)
@@ -1,81 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Copyright (c) 2007-2009 Google Inc.
- * All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * 
- * * Redistributions of source code must retain the above copyright
- *   notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above
- *   copyright notice, this list of conditions and the following disclaimer
- *   in the documentation and/or other materials provided with the
- *   distribution.
- * * Neither the name of Google Inc. nor the names of its
- *   contributors may be used to endorse or promote products derived from
- *   this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- * 
- * Copyright (C) 2005 Csaba Henk.
- * All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 
- * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#include <sys/cdefs.h>
-
-/* Debug related stuff */
-
-#ifndef FUSE_DEBUG_MODULE
-#error "FUSE_DEBUG_MODULE is not defined"
-#else
-#define FUSE_DEBUG_VAR		__CONCAT(FUSE_DEBUG_,FUSE_DEBUG_MODULE)
-#endif
-
-#define	FS_DEBUG(fmt, ...)	DEBUGX(FUSE_DEBUG_VAR >= 1, fmt, ## __VA_ARGS__)
-#define	FS_DEBUG2G(fmt, ...)	DEBUGX(FUSE_DEBUG_VAR >= 2, fmt, ## __VA_ARGS__)
-
-#define	debug_printf(fmt, ...)	FS_DEBUG(fmt, ## __VA_ARGS__)
-#define	kdebug_printf(fmt, ...)	FS_DEBUG(fmt, ## __VA_ARGS__)
-
-#define fuse_trace_printf(fmt, ...) \
-    DEBUGX(FUSE_DEBUG_VAR && FUSE_TRACE, fmt, ## __VA_ARGS__)
-#define fuse_trace_printf_func() \
-    fuse_trace_printf("%s:%d\n", __FILE__, __LINE__)
-#define fuse_trace_printf_vfsop() fuse_trace_printf_func()
-#define fuse_trace_printf_vnop()  fuse_trace_printf_func()

Property changes on: projects/capsicum-test/sys/fs/fuse/fuse_debug.h
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:keywords
## -1 +0,0 ##
-FreeBSD=%H
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property
Index: projects/capsicum-test/sys/fs/fuse/fuse.h
===================================================================
--- projects/capsicum-test/sys/fs/fuse/fuse.h	(revision 345709)
+++ projects/capsicum-test/sys/fs/fuse/fuse.h	(revision 345710)
@@ -1,226 +1,169 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Google Inc.
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  * 
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of Google Inc. nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  * 
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  * 
  * Copyright (C) 2005 Csaba Henk.
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "fuse_kernel.h"
 
 #define FUSE_DEFAULT_DAEMON_TIMEOUT                60     /* s */
 #define FUSE_MIN_DAEMON_TIMEOUT                    0      /* s */
 #define FUSE_MAX_DAEMON_TIMEOUT                    600    /* s */
 
 #ifndef FUSE_FREEBSD_VERSION
 #define	FUSE_FREEBSD_VERSION	"0.4.4"
 #endif
 
 /* Mapping versions to features */
 
 #define FUSE_KERNELABI_GEQ(maj, min)	\
 (FUSE_KERNEL_VERSION > (maj) || (FUSE_KERNEL_VERSION == (maj) && FUSE_KERNEL_MINOR_VERSION >= (min)))
 
 /*
  * Appearance of new FUSE operations is not always in par with version
  * numbering... At least, 7.3 is a sufficient condition for having
  * FUSE_{ACCESS,CREATE}.
  */
 #if FUSE_KERNELABI_GEQ(7, 3)
 #ifndef FUSE_HAS_ACCESS
 #define FUSE_HAS_ACCESS 1
 #endif
 #ifndef FUSE_HAS_CREATE
 #define FUSE_HAS_CREATE 1
 #endif
 #else /* FUSE_KERNELABI_GEQ(7, 3) */
 #ifndef FUSE_HAS_ACCESS
 #define FUSE_HAS_ACCESS 0
 #endif
 #ifndef FUSE_HAS_CREATE
 #define FUSE_HAS_CREATE 0
 #endif
 #endif
 
 #if FUSE_KERNELABI_GEQ(7, 7)
 #ifndef FUSE_HAS_GETLK
 #define FUSE_HAS_GETLK 1
 #endif
 #ifndef FUSE_HAS_SETLK
 #define FUSE_HAS_SETLK 1
 #endif
 #ifndef FUSE_HAS_SETLKW
 #define FUSE_HAS_SETLKW 1
 #endif
 #ifndef FUSE_HAS_INTERRUPT
 #define FUSE_HAS_INTERRUPT 1
 #endif
 #else /* FUSE_KERNELABI_GEQ(7, 7) */
 #ifndef FUSE_HAS_GETLK
 #define FUSE_HAS_GETLK 0
 #endif
 #ifndef FUSE_HAS_SETLK
 #define FUSE_HAS_SETLK 0
 #endif
 #ifndef FUSE_HAS_SETLKW
 #define FUSE_HAS_SETLKW 0
 #endif
 #ifndef FUSE_HAS_INTERRUPT
 #define FUSE_HAS_INTERRUPT 0
 #endif
 #endif
 
 #if FUSE_KERNELABI_GEQ(7, 8)
 #ifndef FUSE_HAS_FLUSH_RELEASE
 #define FUSE_HAS_FLUSH_RELEASE 1
 /*
  * "DESTROY" came in the middle of the 7.8 era,
  * so this is not completely exact...
  */
 #ifndef FUSE_HAS_DESTROY
 #define FUSE_HAS_DESTROY 1
 #endif
 #endif
 #else /* FUSE_KERNELABI_GEQ(7, 8) */
 #ifndef FUSE_HAS_FLUSH_RELEASE
 #define FUSE_HAS_FLUSH_RELEASE 0
 #ifndef FUSE_HAS_DESTROY
 #define FUSE_HAS_DESTROY 0
 #endif
 #endif
 #endif
 
 /* misc */
 
 SYSCTL_DECL(_vfs_fusefs);
 
 /* Fuse locking */
 
 extern struct mtx fuse_mtx;
 #define FUSE_LOCK() fuse_lck_mtx_lock(fuse_mtx)
 #define FUSE_UNLOCK() fuse_lck_mtx_unlock(fuse_mtx)
 
 #define RECTIFY_TDCR(td, cred)			\
 do {						\
 	if (! (td))				\
 		(td) = curthread;		\
 	if (! (cred))				\
 		(cred) = (td)->td_ucred;	\
 } while (0)
 
-/* Debug related stuff */
-
-#ifndef FUSE_DEBUG_DEVICE
-#define FUSE_DEBUG_DEVICE               0
-#endif
-
-#ifndef FUSE_DEBUG_FILE
-#define FUSE_DEBUG_FILE                 0
-#endif
-
-#ifndef FUSE_DEBUG_INTERNAL
-#define FUSE_DEBUG_INTERNAL             0
-#endif
-
-#ifndef FUSE_DEBUG_IO
-#define FUSE_DEBUG_IO                   0
-#endif
-
-#ifndef FUSE_DEBUG_IPC
-#define FUSE_DEBUG_IPC                  0
-#endif
-
-#ifndef FUSE_DEBUG_LOCK
-#define FUSE_DEBUG_LOCK                 0
-#endif
-
-#ifndef FUSE_DEBUG_VFSOPS
-#define FUSE_DEBUG_VFSOPS               0
-#endif
-
-#ifndef FUSE_DEBUG_VNOPS
-#define FUSE_DEBUG_VNOPS                0
-#endif
-
-#ifndef FUSE_TRACE
-#define FUSE_TRACE                      0
-#endif
-
-#define DEBUGX(cond, fmt, ...) do {					\
-	if (((cond))) {							\
-		printf("%s: " fmt, __func__, ## __VA_ARGS__);		\
-	}								\
-} while (0)
-
-#define fuse_lck_mtx_lock(mtx) do {						\
-	DEBUGX(FUSE_DEBUG_LOCK, "0:   lock(%s): %s@%d by %d\n",			\
-	    __STRING(mtx), __func__, __LINE__, curthread->td_proc->p_pid);	\
-	mtx_lock(&(mtx));							\
-	DEBUGX(FUSE_DEBUG_LOCK, "1:   lock(%s): %s@%d by %d\n",			\
-	    __STRING(mtx), __func__, __LINE__, curthread->td_proc->p_pid);	\
-} while (0)
-
-#define fuse_lck_mtx_unlock(mtx) do {						\
-	DEBUGX(FUSE_DEBUG_LOCK, "0: unlock(%s): %s@%d by %d\n",			\
-	    __STRING(mtx), __func__, __LINE__, curthread->td_proc->p_pid);	\
-	mtx_unlock(&(mtx));							\
-	DEBUGX(FUSE_DEBUG_LOCK, "1: unlock(%s): %s@%d by %d\n",			\
-	    __STRING(mtx), __func__, __LINE__, curthread->td_proc->p_pid);	\
-} while (0)
+#define fuse_lck_mtx_lock(mtx) mtx_lock(&(mtx))
+#define fuse_lck_mtx_unlock(mtx) mtx_unlock(&(mtx))
 
 void fuse_ipc_init(void);
 void fuse_ipc_destroy(void);
 
 int fuse_device_init(void);
 void fuse_device_destroy(void);
Index: projects/capsicum-test/sys/fs/fuse/fuse_device.c
===================================================================
--- projects/capsicum-test/sys/fs/fuse/fuse_device.c	(revision 345709)
+++ projects/capsicum-test/sys/fs/fuse/fuse_device.c	(revision 345710)
@@ -1,448 +1,460 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Google Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of Google Inc. nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * Copyright (C) 2005 Csaba Henk.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
+#include <sys/sdt.h>
 #include <sys/stat.h>
 #include <sys/fcntl.h>
 #include <sys/sysctl.h>
 #include <sys/poll.h>
 #include <sys/selinfo.h>
 
 #include "fuse.h"
 #include "fuse_ipc.h"
 
-#define FUSE_DEBUG_MODULE DEVICE
-#include "fuse_debug.h"
+SDT_PROVIDER_DECLARE(fuse);
+/* 
+ * Fuse trace probe:
+ * arg0: verbosity.  Higher numbers give more verbose messages
+ * arg1: Textual message
+ */
+SDT_PROBE_DEFINE2(fuse, , device, trace, "int", "char*");
 
 static struct cdev *fuse_dev;
 
 static d_open_t fuse_device_open;
 static d_close_t fuse_device_close;
 static d_poll_t fuse_device_poll;
 static d_read_t fuse_device_read;
 static d_write_t fuse_device_write;
 
 static struct cdevsw fuse_device_cdevsw = {
 	.d_open = fuse_device_open,
 	.d_close = fuse_device_close,
 	.d_name = "fuse",
 	.d_poll = fuse_device_poll,
 	.d_read = fuse_device_read,
 	.d_write = fuse_device_write,
 	.d_version = D_VERSION,
 };
 
 /****************************
  *
  * >>> Fuse device op defs
  *
  ****************************/
 
 static void
 fdata_dtor(void *arg)
 {
 	struct fuse_data *fdata;
 
 	fdata = arg;
 	fdata_trydestroy(fdata);
 }
 
 /*
  * Resources are set up on a per-open basis
  */
 static int
 fuse_device_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
 {
 	struct fuse_data *fdata;
 	int error;
 
-	FS_DEBUG("device %p\n", dev);
+	SDT_PROBE2(fuse, , device, trace, 1, "device open");
 
 	fdata = fdata_alloc(dev, td->td_ucred);
 	error = devfs_set_cdevpriv(fdata, fdata_dtor);
 	if (error != 0)
 		fdata_trydestroy(fdata);
 	else
-		FS_DEBUG("%s: device opened by thread %d.\n", dev->si_name,
-		    td->td_tid);
+		SDT_PROBE2(fuse, , device, trace, 1, "device open success");
 	return (error);
 }
 
 static int
 fuse_device_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
 {
 	struct fuse_data *data;
 	struct fuse_ticket *tick;
 	int error;
 
 	error = devfs_get_cdevpriv((void **)&data);
 	if (error != 0)
 		return (error);
 	if (!data)
 		panic("no fuse data upon fuse device close");
 	fdata_set_dead(data);
 
 	FUSE_LOCK();
 	fuse_lck_mtx_lock(data->aw_mtx);
 	/* wakup poll()ers */
 	selwakeuppri(&data->ks_rsel, PZERO + 1);
 	/* Don't let syscall handlers wait in vain */
 	while ((tick = fuse_aw_pop(data))) {
 		fuse_lck_mtx_lock(tick->tk_aw_mtx);
 		fticket_set_answered(tick);
 		tick->tk_aw_errno = ENOTCONN;
 		wakeup(tick);
 		fuse_lck_mtx_unlock(tick->tk_aw_mtx);
 		FUSE_ASSERT_AW_DONE(tick);
 		fuse_ticket_drop(tick);
 	}
 	fuse_lck_mtx_unlock(data->aw_mtx);
 	FUSE_UNLOCK();
 
-	FS_DEBUG("%s: device closed by thread %d.\n", dev->si_name, td->td_tid);
+	SDT_PROBE2(fuse, , device, trace, 1, "device close");
 	return (0);
 }
 
 int
 fuse_device_poll(struct cdev *dev, int events, struct thread *td)
 {
 	struct fuse_data *data;
 	int error, revents = 0;
 
 	error = devfs_get_cdevpriv((void **)&data);
 	if (error != 0)
 		return (events &
 		    (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM));
 
 	if (events & (POLLIN | POLLRDNORM)) {
 		fuse_lck_mtx_lock(data->ms_mtx);
 		if (fdata_get_dead(data) || STAILQ_FIRST(&data->ms_head))
 			revents |= events & (POLLIN | POLLRDNORM);
 		else
 			selrecord(td, &data->ks_rsel);
 		fuse_lck_mtx_unlock(data->ms_mtx);
 	}
 	if (events & (POLLOUT | POLLWRNORM)) {
 		revents |= events & (POLLOUT | POLLWRNORM);
 	}
 	return (revents);
 }
 
 /*
  * fuse_device_read hangs on the queue of VFS messages.
  * When it's notified that there is a new one, it picks that and
  * passes up to the daemon
  */
 int
 fuse_device_read(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	int err;
 	struct fuse_data *data;
 	struct fuse_ticket *tick;
 	void *buf[] = {NULL, NULL, NULL};
 	int buflen[3];
 	int i;
 
-	FS_DEBUG("fuse device being read on thread %d\n", uio->uio_td->td_tid);
+	SDT_PROBE2(fuse, , device, trace, 1, "fuse device read");
 
 	err = devfs_get_cdevpriv((void **)&data);
 	if (err != 0)
 		return (err);
 
 	fuse_lck_mtx_lock(data->ms_mtx);
 again:
 	if (fdata_get_dead(data)) {
-		FS_DEBUG2G("we know early on that reader should be kicked so we don't wait for news\n");
+		SDT_PROBE2(fuse, , device, trace, 2,
+			"we know early on that reader should be kicked so we "
+			"don't wait for news");
 		fuse_lck_mtx_unlock(data->ms_mtx);
 		return (ENODEV);
 	}
 	if (!(tick = fuse_ms_pop(data))) {
 		/* check if we may block */
 		if (ioflag & O_NONBLOCK) {
 			/* get outa here soon */
 			fuse_lck_mtx_unlock(data->ms_mtx);
 			return (EAGAIN);
 		} else {
 			err = msleep(data, &data->ms_mtx, PCATCH, "fu_msg", 0);
 			if (err != 0) {
 				fuse_lck_mtx_unlock(data->ms_mtx);
 				return (fdata_get_dead(data) ? ENODEV : err);
 			}
 			tick = fuse_ms_pop(data);
 		}
 	}
 	if (!tick) {
 		/*
 		 * We can get here if fuse daemon suddenly terminates,
 		 * eg, by being hit by a SIGKILL
 		 * -- and some other cases, too, tho not totally clear, when
 		 * (cv_signal/wakeup_one signals the whole process ?)
 		 */
-		FS_DEBUG("no message on thread #%d\n", uio->uio_td->td_tid);
+		SDT_PROBE2(fuse, , device, trace, 1, "no message on thread");
 		goto again;
 	}
 	fuse_lck_mtx_unlock(data->ms_mtx);
 
 	if (fdata_get_dead(data)) {
 		/*
 		 * somebody somewhere -- eg., umount routine --
 		 * wants this liaison finished off
 		 */
-		FS_DEBUG2G("reader is to be sacked\n");
+		SDT_PROBE2(fuse, , device, trace, 2, "reader is to be sacked");
 		if (tick) {
-			FS_DEBUG2G("weird -- \"kick\" is set tho there is message\n");
+			SDT_PROBE2(fuse, , device, trace, 2, "weird -- "
+				"\"kick\" is set tho there is message");
 			FUSE_ASSERT_MS_DONE(tick);
 			fuse_ticket_drop(tick);
 		}
 		return (ENODEV);	/* This should make the daemon get off
 					 * of us */
 	}
-	FS_DEBUG("message got on thread #%d\n", uio->uio_td->td_tid);
+	SDT_PROBE2(fuse, , device, trace, 1,
+		"fuse device read message successfully");
 
 	KASSERT(tick->tk_ms_bufdata || tick->tk_ms_bufsize == 0,
 	    ("non-null buf pointer with positive size"));
 
 	switch (tick->tk_ms_type) {
 	case FT_M_FIOV:
 		buf[0] = tick->tk_ms_fiov.base;
 		buflen[0] = tick->tk_ms_fiov.len;
 		break;
 	case FT_M_BUF:
 		buf[0] = tick->tk_ms_fiov.base;
 		buflen[0] = tick->tk_ms_fiov.len;
 		buf[1] = tick->tk_ms_bufdata;
 		buflen[1] = tick->tk_ms_bufsize;
 		break;
 	default:
 		panic("unknown message type for fuse_ticket %p", tick);
 	}
 
 	for (i = 0; buf[i]; i++) {
 		/*
 		 * Why not ban mercilessly stupid daemons who can't keep up
 		 * with us? (There is no much use of a partial read here...)
 		 */
 		/*
 		 * XXX note that in such cases Linux FUSE throws EIO at the
 		 * syscall invoker and stands back to the message queue. The
 		 * rationale should be made clear (and possibly adopt that
 		 * behaviour). Keeping the current scheme at least makes
 		 * fallacy as loud as possible...
 		 */
 		if (uio->uio_resid < buflen[i]) {
 			fdata_set_dead(data);
-			FS_DEBUG2G("daemon is stupid, kick it off...\n");
+			SDT_PROBE2(fuse, , device, trace, 2,
+			    "daemon is stupid, kick it off...");
 			err = ENODEV;
 			break;
 		}
 		err = uiomove(buf[i], buflen[i], uio);
 		if (err)
 			break;
 	}
 
 	FUSE_ASSERT_MS_DONE(tick);
 	fuse_ticket_drop(tick);
 
 	return (err);
 }
 
 static inline int
 fuse_ohead_audit(struct fuse_out_header *ohead, struct uio *uio)
 {
-	FS_DEBUG("Out header -- len: %i, error: %i, unique: %llu; iovecs: %d\n",
-	    ohead->len, ohead->error, (unsigned long long)ohead->unique,
-	    uio->uio_iovcnt);
-
 	if (uio->uio_resid + sizeof(struct fuse_out_header) != ohead->len) {
-		FS_DEBUG("Format error: body size differs from size claimed by header\n");
+		SDT_PROBE2(fuse, , device, trace, 1, "Format error: body size "
+			"differs from size claimed by header");
 		return (EINVAL);
 	}
 	if (uio->uio_resid && ohead->error) {
-		FS_DEBUG("Format error: non zero error but message had a body\n");
+		SDT_PROBE2(fuse, , device, trace, 1, 
+			"Format error: non zero error but message had a body");
 		return (EINVAL);
 	}
 	/* Sanitize the linuxism of negative errnos */
 	ohead->error = -(ohead->error);
 
 	return (0);
 }
 
+SDT_PROBE_DEFINE1(fuse, , device, fuse_device_write_bumped_into_callback,
+		"uint64_t");
 /*
  * fuse_device_write first reads the header sent by the daemon.
  * If that's OK, looks up ticket/callback node by the unique id seen in header.
  * If the callback node contains a handler function, the uio is passed over
  * that.
  */
 static int
 fuse_device_write(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct fuse_out_header ohead;
 	int err = 0;
 	struct fuse_data *data;
 	struct fuse_ticket *tick, *x_tick;
 	int found = 0;
 
-	FS_DEBUG("resid: %zd, iovcnt: %d, thread: %d\n",
-	    uio->uio_resid, uio->uio_iovcnt, uio->uio_td->td_tid);
-
 	err = devfs_get_cdevpriv((void **)&data);
 	if (err != 0)
 		return (err);
 
 	if (uio->uio_resid < sizeof(struct fuse_out_header)) {
-		FS_DEBUG("got less than a header!\n");
+		SDT_PROBE2(fuse, , device, trace, 1,
+			"fuse_device_write got less than a header!");
 		fdata_set_dead(data);
 		return (EINVAL);
 	}
 	if ((err = uiomove(&ohead, sizeof(struct fuse_out_header), uio)) != 0)
 		return (err);
 
 	/*
 	 * We check header information (which is redundant) and compare it
 	 * with what we see. If we see some inconsistency we discard the
 	 * whole answer and proceed on as if it had never existed. In
 	 * particular, no pretender will be woken up, regardless the
 	 * "unique" value in the header.
 	 */
 	if ((err = fuse_ohead_audit(&ohead, uio))) {
 		fdata_set_dead(data);
 		return (err);
 	}
 	/* Pass stuff over to callback if there is one installed */
 
 	/* Looking for ticket with the unique id of header */
 	fuse_lck_mtx_lock(data->aw_mtx);
 	TAILQ_FOREACH_SAFE(tick, &data->aw_head, tk_aw_link,
 	    x_tick) {
-		FS_DEBUG("bumped into callback #%llu\n",
-		    (unsigned long long)tick->tk_unique);
+		SDT_PROBE1(fuse, , device,
+			fuse_device_write_bumped_into_callback,
+			tick->tk_unique);
 		if (tick->tk_unique == ohead.unique) {
 			found = 1;
 			fuse_aw_remove(tick);
 			break;
 		}
 	}
 	fuse_lck_mtx_unlock(data->aw_mtx);
 
 	if (found) {
 		if (tick->tk_aw_handler) {
 			/*
 			 * We found a callback with proper handler. In this
 			 * case the out header will be 0wnd by the callback,
 			 * so the fun of freeing that is left for her.
 			 * (Then, by all chance, she'll just get that's done
 			 * via ticket_drop(), so no manual mucking
 			 * around...)
 			 */
-			FS_DEBUG("pass ticket to a callback\n");
+			SDT_PROBE2(fuse, , device, trace, 1,
+				"pass ticket to a callback");
 			memcpy(&tick->tk_aw_ohead, &ohead, sizeof(ohead));
 			err = tick->tk_aw_handler(tick, uio);
 		} else {
 			/* pretender doesn't wanna do anything with answer */
-			FS_DEBUG("stuff devalidated, so we drop it\n");
+			SDT_PROBE2(fuse, , device, trace, 1,
+				"stuff devalidated, so we drop it");
 		}
 
 		/*
 		 * As aw_mtx was not held during the callback execution the
 		 * ticket may have been inserted again.  However, this is safe
 		 * because fuse_ticket_drop() will deal with refcount anyway.
 		 */
 		fuse_ticket_drop(tick);
 	} else {
 		/* no callback at all! */
-		FS_DEBUG("erhm, no handler for this response\n");
+		SDT_PROBE2(fuse, , device, trace, 1,
+			"erhm, no handler for this response");
 		err = EINVAL;
 	}
 
 	return (err);
 }
 
 int
 fuse_device_init(void)
 {
 
 	fuse_dev = make_dev(&fuse_device_cdevsw, 0, UID_ROOT, GID_OPERATOR,
 	    S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP, "fuse");
 	if (fuse_dev == NULL)
 		return (ENOMEM);
 	return (0);
 }
 
 void
 fuse_device_destroy(void)
 {
 
 	MPASS(fuse_dev != NULL);
 	destroy_dev(fuse_dev);
 }
Index: projects/capsicum-test/sys/fs/fuse/fuse_file.c
===================================================================
--- projects/capsicum-test/sys/fs/fuse/fuse_file.c	(revision 345709)
+++ projects/capsicum-test/sys/fs/fuse/fuse_file.c	(revision 345710)
@@ -1,281 +1,282 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Google Inc. and Amit Singh
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of Google Inc. nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * Copyright (C) 2005 Csaba Henk.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include <sys/types.h>
+#include <sys/param.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
-#include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
+#include <sys/sdt.h>
 #include <sys/sysctl.h>
 
 #include "fuse.h"
 #include "fuse_file.h"
 #include "fuse_internal.h"
 #include "fuse_ipc.h"
 #include "fuse_node.h"
 
-#define FUSE_DEBUG_MODULE FILE
-#include "fuse_debug.h"
+SDT_PROVIDER_DECLARE(fuse);
+/* 
+ * Fuse trace probe:
+ * arg0: verbosity.  Higher numbers give more verbose messages
+ * arg1: Textual message
+ */
+SDT_PROBE_DEFINE2(fuse, , file, trace, "int", "char*");
 
 static int fuse_fh_count = 0;
 
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, filehandle_count, CTLFLAG_RD,
     &fuse_fh_count, 0, "number of open FUSE filehandles");
 
 int
 fuse_filehandle_open(struct vnode *vp, fufh_type_t fufh_type,
     struct fuse_filehandle **fufhp, struct thread *td, struct ucred *cred)
 {
 	struct fuse_dispatcher fdi;
 	struct fuse_open_in *foi;
 	struct fuse_open_out *foo;
 
 	int err = 0;
 	int oflags = 0;
 	int op = FUSE_OPEN;
 
-	fuse_trace_printf("fuse_filehandle_open(vp=%p, fufh_type=%d)\n",
-	    vp, fufh_type);
-
 	if (fuse_filehandle_valid(vp, fufh_type)) {
 		panic("FUSE: filehandle_open called despite valid fufh (type=%d)",
 		    fufh_type);
 		/* NOTREACHED */
 	}
 	/*
 	 * Note that this means we are effectively FILTERING OUT open() flags.
 	 */
 	oflags = fuse_filehandle_xlate_to_oflags(fufh_type);
 
 	if (vnode_isdir(vp)) {
 		op = FUSE_OPENDIR;
 		if (fufh_type != FUFH_RDONLY) {
+			SDT_PROBE2(fuse, , file, trace, 1,
+				"non-rdonly fh requested for a directory?");
 			printf("FUSE:non-rdonly fh requested for a directory?\n");
 			fufh_type = FUFH_RDONLY;
 		}
 	}
 	fdisp_init(&fdi, sizeof(*foi));
 	fdisp_make_vp(&fdi, op, vp, td, cred);
 
 	foi = fdi.indata;
 	foi->flags = oflags;
 
 	if ((err = fdisp_wait_answ(&fdi))) {
-		debug_printf("OUCH ... daemon didn't give fh (err = %d)\n", err);
+		SDT_PROBE2(fuse, , file, trace, 1,
+			"OUCH ... daemon didn't give fh");
 		if (err == ENOENT) {
 			fuse_internal_vnode_disappear(vp);
 		}
 		goto out;
 	}
 	foo = fdi.answ;
 
 	fuse_filehandle_init(vp, fufh_type, fufhp, foo->fh);
 
 	/*
 	 * For WRONLY opens, force DIRECT_IO.  This is necessary
 	 * since writing a partial block through the buffer cache
 	 * will result in a read of the block and that read won't
 	 * be allowed by the WRONLY open.
 	 */
 	if (fufh_type == FUFH_WRONLY)
 		fuse_vnode_open(vp, foo->open_flags | FOPEN_DIRECT_IO, td);
 	else
 		fuse_vnode_open(vp, foo->open_flags, td);
 
 out:
 	fdisp_destroy(&fdi);
 	return err;
 }
 
 int
 fuse_filehandle_close(struct vnode *vp, fufh_type_t fufh_type,
     struct thread *td, struct ucred *cred)
 {
 	struct fuse_dispatcher fdi;
 	struct fuse_release_in *fri;
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct fuse_filehandle *fufh = NULL;
 
 	int err = 0;
 	int op = FUSE_RELEASE;
 
-	fuse_trace_printf("fuse_filehandle_put(vp=%p, fufh_type=%d)\n",
-	    vp, fufh_type);
-
 	fufh = &(fvdat->fufh[fufh_type]);
 	if (!FUFH_IS_VALID(fufh)) {
 		panic("FUSE: filehandle_put called on invalid fufh (type=%d)",
 		    fufh_type);
 		/* NOTREACHED */
 	}
 	if (fuse_isdeadfs(vp)) {
 		goto out;
 	}
 	if (vnode_isdir(vp))
 		op = FUSE_RELEASEDIR;
 	fdisp_init(&fdi, sizeof(*fri));
 	fdisp_make_vp(&fdi, op, vp, td, cred);
 	fri = fdi.indata;
 	fri->fh = fufh->fh_id;
 	fri->flags = fuse_filehandle_xlate_to_oflags(fufh_type);
 
 	err = fdisp_wait_answ(&fdi);
 	fdisp_destroy(&fdi);
 
 out:
 	atomic_subtract_acq_int(&fuse_fh_count, 1);
 	fufh->fh_id = (uint64_t)-1;
 	fufh->fh_type = FUFH_INVALID;
 
 	return err;
 }
 
 int
 fuse_filehandle_valid(struct vnode *vp, fufh_type_t fufh_type)
 {
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct fuse_filehandle *fufh;
 
 	fufh = &(fvdat->fufh[fufh_type]);
 	return FUFH_IS_VALID(fufh);
 }
 
 /*
  * Check for a valid file handle, first the type requested, but if that
  * isn't valid, try for FUFH_RDWR.
  * Return the FUFH type that is valid or FUFH_INVALID if there are none.
  * This is a variant of fuse_filehandle_vaild() analogous to
  * fuse_filehandle_getrw().
  */
 fufh_type_t
 fuse_filehandle_validrw(struct vnode *vp, fufh_type_t fufh_type)
 {
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct fuse_filehandle *fufh;
 
 	fufh = &fvdat->fufh[fufh_type];
 	if (FUFH_IS_VALID(fufh) != 0)
 		return (fufh_type);
 	fufh = &fvdat->fufh[FUFH_RDWR];
 	if (FUFH_IS_VALID(fufh) != 0)
 		return (FUFH_RDWR);
 	return (FUFH_INVALID);
 }
 
 int
 fuse_filehandle_get(struct vnode *vp, fufh_type_t fufh_type,
     struct fuse_filehandle **fufhp)
 {
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct fuse_filehandle *fufh;
 
 	fufh = &(fvdat->fufh[fufh_type]);
 	if (!FUFH_IS_VALID(fufh))
 		return EBADF;
 	if (fufhp != NULL)
 		*fufhp = fufh;
 	return 0;
 }
 
 int
 fuse_filehandle_getrw(struct vnode *vp, fufh_type_t fufh_type,
     struct fuse_filehandle **fufhp)
 {
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct fuse_filehandle *fufh;
 
 	fufh = &(fvdat->fufh[fufh_type]);
 	if (!FUFH_IS_VALID(fufh)) {
 		fufh_type = FUFH_RDWR;
 	}
 	return fuse_filehandle_get(vp, fufh_type, fufhp);
 }
 
 void
 fuse_filehandle_init(struct vnode *vp, fufh_type_t fufh_type,
     struct fuse_filehandle **fufhp, uint64_t fh_id)
 {
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct fuse_filehandle *fufh;
 
-	FS_DEBUG("id=%jd type=%d\n", (intmax_t)fh_id, fufh_type);
 	fufh = &(fvdat->fufh[fufh_type]);
 	MPASS(!FUFH_IS_VALID(fufh));
 	fufh->fh_id = fh_id;
 	fufh->fh_type = fufh_type;
 	if (!FUFH_IS_VALID(fufh)) {
 		panic("FUSE: init: invalid filehandle id (type=%d)", fufh_type);
 	}
 	if (fufhp != NULL)
 		*fufhp = fufh;
 
 	atomic_add_acq_int(&fuse_fh_count, 1);
 }
Index: projects/capsicum-test/sys/fs/fuse/fuse_internal.c
===================================================================
--- projects/capsicum-test/sys/fs/fuse/fuse_internal.c	(revision 345709)
+++ projects/capsicum-test/sys/fs/fuse/fuse_internal.c	(revision 345710)
@@ -1,640 +1,694 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Google Inc. and Amit Singh
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of Google Inc. nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * Copyright (C) 2005 Csaba Henk.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include <sys/types.h>
+#include <sys/param.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
-#include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
+#include <sys/sdt.h>
 #include <sys/sx.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/stat.h>
 #include <sys/unistd.h>
 #include <sys/filedesc.h>
 #include <sys/file.h>
 #include <sys/fcntl.h>
 #include <sys/dirent.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sysctl.h>
 #include <sys/priv.h>
 
 #include "fuse.h"
 #include "fuse_file.h"
 #include "fuse_internal.h"
 #include "fuse_ipc.h"
 #include "fuse_node.h"
 #include "fuse_file.h"
 #include "fuse_param.h"
 
-#define FUSE_DEBUG_MODULE INTERNAL
-#include "fuse_debug.h"
+SDT_PROVIDER_DECLARE(fuse);
+/* 
+ * Fuse trace probe:
+ * arg0: verbosity.  Higher numbers give more verbose messages
+ * arg1: Textual message
+ */
+SDT_PROBE_DEFINE2(fuse, , internal, trace, "int", "char*");
 
 #ifdef ZERO_PAD_INCOMPLETE_BUFS
 static int isbzero(void *buf, size_t len);
 
 #endif
 
 /* access */
 
 int
 fuse_internal_access(struct vnode *vp,
     mode_t mode,
     struct fuse_access_param *facp,
     struct thread *td,
     struct ucred *cred)
 {
 	int err = 0;
 	uint32_t mask = 0;
 	int dataflags;
 	int vtype;
 	struct mount *mp;
 	struct fuse_dispatcher fdi;
 	struct fuse_access_in *fai;
 	struct fuse_data *data;
 
 	/* NOT YET DONE */
 	/*
 	 * If this vnop gives you trouble, just return 0 here for a lazy
 	 * kludge.
 	 */
 	/* return 0;*/
 
-	fuse_trace_printf_func();
-
 	mp = vnode_mount(vp);
 	vtype = vnode_vtype(vp);
 
 	data = fuse_get_mpdata(mp);
 	dataflags = data->dataflags;
 
 	if ((mode & VWRITE) && vfs_isrdonly(mp)) {
 		return EACCES;
 	}
 	/* Unless explicitly permitted, deny everyone except the fs owner. */
 	    if (vnode_isvroot(vp) && !(facp->facc_flags & FACCESS_NOCHECKSPY)) {
 		if (!(dataflags & FSESS_DAEMON_CAN_SPY)) {
 			int denied = fuse_match_cred(data->daemoncred,
 			    cred);
 
 			if (denied) {
 				return EPERM;
 			}
 		}
 		facp->facc_flags |= FACCESS_NOCHECKSPY;
 	}
 	if (!(facp->facc_flags & FACCESS_DO_ACCESS)) {
 		return 0;
 	}
 	if (((vtype == VREG) && (mode & VEXEC))) {
 #ifdef NEED_MOUNT_ARGUMENT_FOR_THIS
 		/* Let	 the kernel handle this through open / close heuristics.*/
 		    return ENOTSUP;
 #else
 		    /* 	Let the kernel handle this. */
 		    return 0;
 #endif
 	}
 	if (!fsess_isimpl(mp, FUSE_ACCESS)) {
 		/* Let the kernel handle this. */
 		    return 0;
 	}
 	if (dataflags & FSESS_DEFAULT_PERMISSIONS) {
 		/* Let the kernel handle this. */
 		    return 0;
 	}
 	if ((mode & VADMIN) != 0) {
 		err = priv_check_cred(cred, PRIV_VFS_ADMIN);
 		if (err) {
 			return err;
 		}
 	}
 	if ((mode & (VWRITE | VAPPEND | VADMIN)) != 0) {
 		mask |= W_OK;
 	}
 	if ((mode & VREAD) != 0) {
 		mask |= R_OK;
 	}
 	if ((mode & VEXEC) != 0) {
 		mask |= X_OK;
 	}
 	bzero(&fdi, sizeof(fdi));
 
 	fdisp_init(&fdi, sizeof(*fai));
 	fdisp_make_vp(&fdi, FUSE_ACCESS, vp, td, cred);
 
 	fai = fdi.indata;
 	fai->mask = F_OK;
 	fai->mask |= mask;
 
 	err = fdisp_wait_answ(&fdi);
 	fdisp_destroy(&fdi);
 
 	if (err == ENOSYS) {
 		fsess_set_notimpl(mp, FUSE_ACCESS);
 		err = 0;
 	}
 	return err;
 }
 
+/*
+ * Cache FUSE attributes from feo, in attr cache associated with vnode 'vp'.
+ * Optionally, if argument 'vap' is not NULL, store a copy of the converted
+ * attributes there as well.
+ *
+ * If the nominal attribute cache TTL is zero, do not cache on the 'vp' (but do
+ * return the result to the caller).
+ */
+void
+fuse_internal_cache_attrs(struct vnode *vp, struct fuse_attr *attr,
+	uint64_t attr_valid, uint32_t attr_valid_nsec, struct vattr *vap)
+{
+	struct mount *mp;
+	struct fuse_vnode_data *fvdat;
+	struct vattr *vp_cache_at;
+
+	mp = vnode_mount(vp);
+	fvdat = VTOFUD(vp);
+
+	/* Honor explicit do-not-cache requests from user filesystems. */
+	if (attr_valid == 0 && attr_valid_nsec == 0)
+		fvdat->valid_attr_cache = false;
+	else
+		fvdat->valid_attr_cache = true;
+
+	vp_cache_at = VTOVA(vp);
+
+	if (vap == NULL && vp_cache_at == NULL)
+		return;
+
+	if (vap == NULL)
+		vap = vp_cache_at;
+
+	vattr_null(vap);
+
+	vap->va_fsid = mp->mnt_stat.f_fsid.val[0];
+	vap->va_fileid = attr->ino;
+	vap->va_mode = attr->mode & ~S_IFMT;
+	vap->va_nlink     = attr->nlink;
+	vap->va_uid       = attr->uid;
+	vap->va_gid       = attr->gid;
+	vap->va_rdev      = attr->rdev;
+	vap->va_size      = attr->size;
+	/* XXX on i386, seconds are truncated to 32 bits */
+	vap->va_atime.tv_sec  = attr->atime;
+	vap->va_atime.tv_nsec = attr->atimensec;
+	vap->va_mtime.tv_sec  = attr->mtime;
+	vap->va_mtime.tv_nsec = attr->mtimensec;
+	vap->va_ctime.tv_sec  = attr->ctime;
+	vap->va_ctime.tv_nsec = attr->ctimensec;
+	vap->va_blocksize = PAGE_SIZE;
+	vap->va_type = IFTOVT(attr->mode);
+	vap->va_bytes = attr->blocks * S_BLKSIZE;
+	vap->va_flags = 0;
+
+	if (vap != vp_cache_at && vp_cache_at != NULL)
+		memcpy(vp_cache_at, vap, sizeof(*vap));
+}
+
+
 /* fsync */
 
 int
 fuse_internal_fsync_callback(struct fuse_ticket *tick, struct uio *uio)
 {
-	fuse_trace_printf_func();
-
 	if (tick->tk_aw_ohead.error == ENOSYS) {
 		fsess_set_notimpl(tick->tk_data->mp, fticket_opcode(tick));
 	}
 	return 0;
 }
 
 int
 fuse_internal_fsync(struct vnode *vp,
     struct thread *td,
     struct ucred *cred,
     struct fuse_filehandle *fufh)
 {
 	int op = FUSE_FSYNC;
 	struct fuse_fsync_in *ffsi;
 	struct fuse_dispatcher fdi;
 
-	fuse_trace_printf_func();
-
 	if (vnode_isdir(vp)) {
 		op = FUSE_FSYNCDIR;
 	}
 	fdisp_init(&fdi, sizeof(*ffsi));
 	fdisp_make_vp(&fdi, op, vp, td, cred);
 	ffsi = fdi.indata;
 	ffsi->fh = fufh->fh_id;
 
 	ffsi->fsync_flags = 1;		/* datasync */
 
 	fuse_insert_callback(fdi.tick, fuse_internal_fsync_callback);
 	fuse_insert_message(fdi.tick);
 
 	fdisp_destroy(&fdi);
 
 	return 0;
 
 }
 
 /* readdir */
 
 int
 fuse_internal_readdir(struct vnode *vp,
     struct uio *uio,
     struct fuse_filehandle *fufh,
     struct fuse_iov *cookediov)
 {
 	int err = 0;
 	struct fuse_dispatcher fdi;
 	struct fuse_read_in *fri;
 
 	if (uio_resid(uio) == 0) {
 		return 0;
 	}
 	fdisp_init(&fdi, 0);
 
 	/*
 	 * Note that we DO NOT have a UIO_SYSSPACE here (so no need for p2p
 	 * I/O).
 	 */
 
 	while (uio_resid(uio) > 0) {
 
 		fdi.iosize = sizeof(*fri);
 		fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
 
 		fri = fdi.indata;
 		fri->fh = fufh->fh_id;
 		fri->offset = uio_offset(uio);
 		fri->size = min(uio_resid(uio), FUSE_DEFAULT_IOSIZE);
 		/* mp->max_read */
 
 		    if ((err = fdisp_wait_answ(&fdi))) {
 			break;
 		}
 		if ((err = fuse_internal_readdir_processdata(uio, fri->size, fdi.answ,
 		    fdi.iosize, cookediov))) {
 			break;
 		}
 	}
 
 	fdisp_destroy(&fdi);
 	return ((err == -1) ? 0 : err);
 }
 
 int
 fuse_internal_readdir_processdata(struct uio *uio,
     size_t reqsize,
     void *buf,
     size_t bufsize,
     void *param)
 {
 	int err = 0;
 	int cou = 0;
 	int bytesavail;
 	size_t freclen;
 
 	struct dirent *de;
 	struct fuse_dirent *fudge;
 	struct fuse_iov *cookediov = param;
 
 	if (bufsize < FUSE_NAME_OFFSET) {
 		return -1;
 	}
 	for (;;) {
 
 		if (bufsize < FUSE_NAME_OFFSET) {
 			err = -1;
 			break;
 		}
 		fudge = (struct fuse_dirent *)buf;
 		freclen = FUSE_DIRENT_SIZE(fudge);
 
 		cou++;
 
 		if (bufsize < freclen) {
 			err = ((cou == 1) ? -1 : 0);
 			break;
 		}
 #ifdef ZERO_PAD_INCOMPLETE_BUFS
 		if (isbzero(buf, FUSE_NAME_OFFSET)) {
 			err = -1;
 			break;
 		}
 #endif
 
 		if (!fudge->namelen || fudge->namelen > MAXNAMLEN) {
 			err = EINVAL;
 			break;
 		}
 		bytesavail = GENERIC_DIRSIZ((struct pseudo_dirent *)
 					    &fudge->namelen);
 
 		if (bytesavail > uio_resid(uio)) {
 			err = -1;
 			break;
 		}
 		fiov_refresh(cookediov);
 		fiov_adjust(cookediov, bytesavail);
 
 		de = (struct dirent *)cookediov->base;
 		de->d_fileno = fudge->ino;
 		de->d_reclen = bytesavail;
 		de->d_type = fudge->type;
 		de->d_namlen = fudge->namelen;
 		memcpy((char *)cookediov->base + sizeof(struct dirent) - 
 		       MAXNAMLEN - 1,
 		       (char *)buf + FUSE_NAME_OFFSET, fudge->namelen);
 		dirent_terminate(de);
 
 		err = uiomove(cookediov->base, cookediov->len, uio);
 		if (err) {
 			break;
 		}
 		buf = (char *)buf + freclen;
 		bufsize -= freclen;
 		uio_setoffset(uio, fudge->off);
 	}
 
 	return err;
 }
 
 /* remove */
 
 int
 fuse_internal_remove(struct vnode *dvp,
     struct vnode *vp,
     struct componentname *cnp,
     enum fuse_opcode op)
 {
 	struct fuse_dispatcher fdi;
 	struct fuse_vnode_data *fvdat;
 	int err;
 
 	err = 0;
 	fvdat = VTOFUD(vp);
 
-	debug_printf("dvp=%p, cnp=%p, op=%d\n", vp, cnp, op);
-
 	fdisp_init(&fdi, cnp->cn_namelen + 1);
 	fdisp_make_vp(&fdi, op, dvp, cnp->cn_thread, cnp->cn_cred);
 
 	memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen);
 	((char *)fdi.indata)[cnp->cn_namelen] = '\0';
 
 	err = fdisp_wait_answ(&fdi);
 	fdisp_destroy(&fdi);
 	return err;
 }
 
 /* rename */
 
 int
 fuse_internal_rename(struct vnode *fdvp,
     struct componentname *fcnp,
     struct vnode *tdvp,
     struct componentname *tcnp)
 {
 	struct fuse_dispatcher fdi;
 	struct fuse_rename_in *fri;
 	int err = 0;
 
 	fdisp_init(&fdi, sizeof(*fri) + fcnp->cn_namelen + tcnp->cn_namelen + 2);
 	fdisp_make_vp(&fdi, FUSE_RENAME, fdvp, tcnp->cn_thread, tcnp->cn_cred);
 
 	fri = fdi.indata;
 	fri->newdir = VTOI(tdvp);
 	memcpy((char *)fdi.indata + sizeof(*fri), fcnp->cn_nameptr,
 	    fcnp->cn_namelen);
 	((char *)fdi.indata)[sizeof(*fri) + fcnp->cn_namelen] = '\0';
 	memcpy((char *)fdi.indata + sizeof(*fri) + fcnp->cn_namelen + 1,
 	    tcnp->cn_nameptr, tcnp->cn_namelen);
 	((char *)fdi.indata)[sizeof(*fri) + fcnp->cn_namelen +
 	    tcnp->cn_namelen + 1] = '\0';
 
 	err = fdisp_wait_answ(&fdi);
 	fdisp_destroy(&fdi);
 	return err;
 }
 
 /* strategy */
 
 /* entity creation */
 
 void
 fuse_internal_newentry_makerequest(struct mount *mp,
     uint64_t dnid,
     struct componentname *cnp,
     enum fuse_opcode op,
     void *buf,
     size_t bufsize,
     struct fuse_dispatcher *fdip)
 {
-	debug_printf("fdip=%p\n", fdip);
-
 	fdip->iosize = bufsize + cnp->cn_namelen + 1;
 
 	fdisp_make(fdip, op, mp, dnid, cnp->cn_thread, cnp->cn_cred);
 	memcpy(fdip->indata, buf, bufsize);
 	memcpy((char *)fdip->indata + bufsize, cnp->cn_nameptr, cnp->cn_namelen);
 	((char *)fdip->indata)[bufsize + cnp->cn_namelen] = '\0';
 }
 
 int
 fuse_internal_newentry_core(struct vnode *dvp,
     struct vnode **vpp,
     struct componentname *cnp,
     enum vtype vtyp,
     struct fuse_dispatcher *fdip)
 {
 	int err = 0;
 	struct fuse_entry_out *feo;
 	struct mount *mp = vnode_mount(dvp);
 
 	if ((err = fdisp_wait_answ(fdip))) {
 		return err;
 	}
 	feo = fdip->answ;
 
 	if ((err = fuse_internal_checkentry(feo, vtyp))) {
 		return err;
 	}
 	err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, vtyp);
 	if (err) {
 		fuse_internal_forget_send(mp, cnp->cn_thread, cnp->cn_cred,
 		    feo->nodeid, 1);
 		return err;
 	}
-	cache_attrs(*vpp, feo, NULL);
+	fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid,
+		feo->attr_valid_nsec, NULL);
 
 	return err;
 }
 
 int
 fuse_internal_newentry(struct vnode *dvp,
     struct vnode **vpp,
     struct componentname *cnp,
     enum fuse_opcode op,
     void *buf,
     size_t bufsize,
     enum vtype vtype)
 {
 	int err;
 	struct fuse_dispatcher fdi;
 	struct mount *mp = vnode_mount(dvp);
 
 	fdisp_init(&fdi, 0);
 	fuse_internal_newentry_makerequest(mp, VTOI(dvp), cnp, op, buf,
 	    bufsize, &fdi);
 	err = fuse_internal_newentry_core(dvp, vpp, cnp, vtype, &fdi);
 	fdisp_destroy(&fdi);
 
 	return err;
 }
 
 /* entity destruction */
 
 int
 fuse_internal_forget_callback(struct fuse_ticket *ftick, struct uio *uio)
 {
 	fuse_internal_forget_send(ftick->tk_data->mp, curthread, NULL,
 	    ((struct fuse_in_header *)ftick->tk_ms_fiov.base)->nodeid, 1);
 
 	return 0;
 }
 
 void
 fuse_internal_forget_send(struct mount *mp,
     struct thread *td,
     struct ucred *cred,
     uint64_t nodeid,
     uint64_t nlookup)
 {
 
 	struct fuse_dispatcher fdi;
 	struct fuse_forget_in *ffi;
 
-	debug_printf("mp=%p, nodeid=%ju, nlookup=%ju\n",
-	    mp, (uintmax_t)nodeid, (uintmax_t)nlookup);
-
 	/*
          * KASSERT(nlookup > 0, ("zero-times forget for vp #%llu",
          *         (long long unsigned) nodeid));
          */
 
 	fdisp_init(&fdi, sizeof(*ffi));
 	fdisp_make(&fdi, FUSE_FORGET, mp, nodeid, td, cred);
 
 	ffi = fdi.indata;
 	ffi->nlookup = nlookup;
 
 	fuse_insert_message(fdi.tick);
 	fdisp_destroy(&fdi);
 }
 
 void
 fuse_internal_vnode_disappear(struct vnode *vp)
 {
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 
 	ASSERT_VOP_ELOCKED(vp, "fuse_internal_vnode_disappear");
 	fvdat->flag |= FN_REVOKED;
 	fvdat->valid_attr_cache = false;
 	cache_purge(vp);
 }
 
 /* fuse start/stop */
 
 int
 fuse_internal_init_callback(struct fuse_ticket *tick, struct uio *uio)
 {
 	int err = 0;
 	struct fuse_data *data = tick->tk_data;
 	struct fuse_init_out *fiio;
 
 	if ((err = tick->tk_aw_ohead.error)) {
 		goto out;
 	}
 	if ((err = fticket_pull(tick, uio))) {
 		goto out;
 	}
 	fiio = fticket_resp(tick)->base;
 
 	/* XXX: Do we want to check anything further besides this? */
 	if (fiio->major < 7) {
-		debug_printf("userpace version too low\n");
+		SDT_PROBE2(fuse, , internal, trace, 1,
+			"userpace version too low");
 		err = EPROTONOSUPPORT;
 		goto out;
 	}
 	data->fuse_libabi_major = fiio->major;
 	data->fuse_libabi_minor = fiio->minor;
 
 	if (fuse_libabi_geq(data, 7, 5)) {
 		if (fticket_resp(tick)->len == sizeof(struct fuse_init_out)) {
 			data->max_write = fiio->max_write;
 		} else {
 			err = EINVAL;
 		}
 	} else {
 		/* Old fix values */
 		data->max_write = 4096;
 	}
 
 out:
 	if (err) {
 		fdata_set_dead(data);
 	}
 	FUSE_LOCK();
 	data->dataflags |= FSESS_INITED;
 	wakeup(&data->ticketer);
 	FUSE_UNLOCK();
 
 	return 0;
 }
 
 void
 fuse_internal_send_init(struct fuse_data *data, struct thread *td)
 {
 	struct fuse_init_in *fiii;
 	struct fuse_dispatcher fdi;
 
 	fdisp_init(&fdi, sizeof(*fiii));
 	fdisp_make(&fdi, FUSE_INIT, data->mp, 0, td, NULL);
 	fiii = fdi.indata;
 	fiii->major = FUSE_KERNEL_VERSION;
 	fiii->minor = FUSE_KERNEL_MINOR_VERSION;
 	fiii->max_readahead = FUSE_DEFAULT_IOSIZE * 16;
 	fiii->flags = 0;
 
 	fuse_insert_callback(fdi.tick, fuse_internal_init_callback);
 	fuse_insert_message(fdi.tick);
 	fdisp_destroy(&fdi);
 }
 
 #ifdef ZERO_PAD_INCOMPLETE_BUFS
 static int
 isbzero(void *buf, size_t len)
 {
 	int i;
 
 	for (i = 0; i < len; i++) {
 		if (((char *)buf)[i])
 			return (0);
 	}
 
 	return (1);
 }
 
 #endif
Index: projects/capsicum-test/sys/fs/fuse/fuse_internal.h
===================================================================
--- projects/capsicum-test/sys/fs/fuse/fuse_internal.h	(revision 345709)
+++ projects/capsicum-test/sys/fs/fuse/fuse_internal.h	(revision 345710)
@@ -1,349 +1,274 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Google Inc. and Amit Singh
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  * 
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of Google Inc. nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  * 
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  * 
  * Copyright (C) 2005 Csaba Henk.
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _FUSE_INTERNAL_H_
 #define _FUSE_INTERNAL_H_
 
 #include <sys/types.h>
 #include <sys/uio.h>
 #include <sys/stat.h>
 #include <sys/vnode.h>
 
 #include "fuse_ipc.h"
 #include "fuse_node.h"
 
 static inline bool
 vfs_isrdonly(struct mount *mp)
 {
 	return ((mp->mnt_flag & MNT_RDONLY) != 0);
 }
 
 static inline struct mount *
 vnode_mount(struct vnode *vp)
 {
 	return (vp->v_mount);
 }
 
 static inline bool
 vnode_mountedhere(struct vnode *vp)
 {
 	return (vp->v_mountedhere != NULL);
 }
 
 static inline enum vtype
 vnode_vtype(struct vnode *vp)
 {
 	return (vp->v_type);
 }
 
 static inline bool
 vnode_isvroot(struct vnode *vp)
 {
 	return ((vp->v_vflag & VV_ROOT) != 0);
 }
 
 static inline bool
 vnode_isreg(struct vnode *vp)
 {
 	return (vp->v_type == VREG);
 }
 
 static inline bool
 vnode_isdir(struct vnode *vp)
 {
 	return (vp->v_type == VDIR);
 }
 
 static inline bool
 vnode_islnk(struct vnode *vp)
 {
 	return (vp->v_type == VLNK);
 }
 
 static inline ssize_t
 uio_resid(struct uio *uio)
 {
 	return (uio->uio_resid);
 }
 
 static inline off_t
 uio_offset(struct uio *uio)
 {
 	return (uio->uio_offset);
 }
 
 static inline void
 uio_setoffset(struct uio *uio, off_t offset)
 {
 	uio->uio_offset = offset;
 }
 
 static inline void
 uio_setresid(struct uio *uio, ssize_t resid)
 {
 	uio->uio_resid = resid;
 }
 
 /* miscellaneous */
 
 static inline bool
 fuse_isdeadfs(struct vnode *vp)
 {
 	struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp));
 
 	return (data->dataflags & FSESS_DEAD);
 }
 
 static inline uint64_t
 fuse_iosize(struct vnode *vp)
 {
 	return (vp->v_mount->mnt_stat.f_iosize);
 }
 
 /* access */
 
 #define FVP_ACCESS_NOOP		0x01
 
 #define FACCESS_VA_VALID	0x01
 #define FACCESS_DO_ACCESS	0x02
 #define FACCESS_STICKY		0x04
 #define FACCESS_CHOWN		0x08
 #define FACCESS_NOCHECKSPY	0x10
 #define FACCESS_SETGID		0x12
 
 #define FACCESS_XQUERIES	(FACCESS_STICKY | FACCESS_CHOWN | FACCESS_SETGID)
 
 struct fuse_access_param {
 	uid_t		xuid;
 	gid_t		xgid;
 	uint32_t	facc_flags;
 };
 
 static inline int
 fuse_match_cred(struct ucred *basecred, struct ucred *usercred)
 {
 	if (basecred->cr_uid == usercred->cr_uid             &&
 	    basecred->cr_uid == usercred->cr_ruid            &&
 	    basecred->cr_uid == usercred->cr_svuid           &&
 	    basecred->cr_groups[0] == usercred->cr_groups[0] &&
 	    basecred->cr_groups[0] == usercred->cr_rgid      &&
 	    basecred->cr_groups[0] == usercred->cr_svgid)
 		return (0);
 
 	return (EPERM);
 }
 
 int fuse_internal_access(struct vnode *vp, mode_t mode,
     struct fuse_access_param *facp, struct thread *td, struct ucred *cred);
 
 /* attributes */
+void fuse_internal_cache_attrs(struct vnode *vp, struct fuse_attr *attr,
+	uint64_t attr_valid, uint32_t attr_valid_nsec, struct vattr *vap);
 
-/*
- * Cache FUSE attributes 'fat', with nominal expiration
- * 'attr_valid'.'attr_valid_nsec', in attr cache associated with vnode 'vp'.
- * Optionally, if argument 'vap' is not NULL, store a copy of the converted
- * attributes there as well.
- *
- * If the nominal attribute cache TTL is zero, do not cache on the 'vp' (but do
- * return the result to the caller).
- */
-static inline void
-fuse_internal_attr_fat2vat(struct vnode *vp, struct fuse_attr *fat,
-    uint64_t attr_valid, uint32_t attr_valid_nsec, struct vattr *vap)
-{
-	struct mount *mp;
-	struct fuse_vnode_data *fvdat;
-	struct vattr *vp_cache_at;
-
-	mp = vnode_mount(vp);
-	fvdat = VTOFUD(vp);
-
-	DEBUGX(FUSE_DEBUG_INTERNAL, "node #%ju, mode 0%o\n",
-	    (uintmax_t)fat->ino, fat->mode);
-
-	/* Honor explicit do-not-cache requests from user filesystems. */
-	if (attr_valid == 0 && attr_valid_nsec == 0)
-		fvdat->valid_attr_cache = false;
-	else
-		fvdat->valid_attr_cache = true;
-
-	vp_cache_at = VTOVA(vp);
-
-	if (vap == NULL && vp_cache_at == NULL)
-		return;
-
-	if (vap == NULL)
-		vap = vp_cache_at;
-
-	vattr_null(vap);
-
-	vap->va_fsid = mp->mnt_stat.f_fsid.val[0];
-	vap->va_fileid = fat->ino;
-	vap->va_mode = fat->mode & ~S_IFMT;
-	vap->va_nlink     = fat->nlink;
-	vap->va_uid       = fat->uid;
-	vap->va_gid       = fat->gid;
-	vap->va_rdev      = fat->rdev;
-	vap->va_size      = fat->size;
-	/* XXX on i386, seconds are truncated to 32 bits */
-	vap->va_atime.tv_sec  = fat->atime;
-	vap->va_atime.tv_nsec = fat->atimensec;
-	vap->va_mtime.tv_sec  = fat->mtime;
-	vap->va_mtime.tv_nsec = fat->mtimensec;
-	vap->va_ctime.tv_sec  = fat->ctime;
-	vap->va_ctime.tv_nsec = fat->ctimensec;
-	vap->va_blocksize = PAGE_SIZE;
-	vap->va_type = IFTOVT(fat->mode);
-	vap->va_bytes = fat->blocks * S_BLKSIZE;
-	vap->va_flags = 0;
-
-	if (vap != vp_cache_at && vp_cache_at != NULL)
-		memcpy(vp_cache_at, vap, sizeof(*vap));
-}
-
-
-#define	cache_attrs(vp, fuse_out, vap_out)				\
-	fuse_internal_attr_fat2vat((vp), &(fuse_out)->attr,		\
-	    (fuse_out)->attr_valid, (fuse_out)->attr_valid_nsec, (vap_out))
-
 /* fsync */
 
 int fuse_internal_fsync(struct vnode *vp, struct thread *td,
     struct ucred *cred, struct fuse_filehandle *fufh);
 int fuse_internal_fsync_callback(struct fuse_ticket *tick, struct uio *uio);
 
 /* readdir */
 
 struct pseudo_dirent {
 	uint32_t d_namlen;
 };
 
 int fuse_internal_readdir(struct vnode *vp, struct uio *uio,
     struct fuse_filehandle *fufh, struct fuse_iov *cookediov);
 int fuse_internal_readdir_processdata(struct uio *uio, size_t reqsize,
     void *buf, size_t bufsize, void *param);
 
 /* remove */
 
 int fuse_internal_remove(struct vnode *dvp, struct vnode *vp,
     struct componentname *cnp, enum fuse_opcode op);
 
 /* rename */
 
 int fuse_internal_rename(struct vnode *fdvp, struct componentname *fcnp,
     struct vnode *tdvp, struct componentname *tcnp);
 
 /* revoke */
 
 void fuse_internal_vnode_disappear(struct vnode *vp);
 
 /* strategy */
 
 /* entity creation */
 
 static inline int
 fuse_internal_checkentry(struct fuse_entry_out *feo, enum vtype vtyp)
 {
-	DEBUGX(FUSE_DEBUG_INTERNAL,
-	    "feo=%p, vtype=%d\n", feo, vtyp);
-
 	if (vtyp != IFTOVT(feo->attr.mode)) {
-		DEBUGX(FUSE_DEBUG_INTERNAL,
-		    "EINVAL -- %x != %x\n", vtyp, IFTOVT(feo->attr.mode));
 		return (EINVAL);
 	}
 
 	if (feo->nodeid == FUSE_NULL_ID) {
-		DEBUGX(FUSE_DEBUG_INTERNAL,
-		    "EINVAL -- feo->nodeid is NULL\n");
 		return (EINVAL);
 	}
 
 	if (feo->nodeid == FUSE_ROOT_ID) {
-		DEBUGX(FUSE_DEBUG_INTERNAL,
-		    "EINVAL -- feo->nodeid is FUSE_ROOT_ID\n");
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 int fuse_internal_newentry(struct vnode *dvp, struct vnode **vpp,
     struct componentname *cnp, enum fuse_opcode op, void *buf, size_t bufsize,
     enum vtype vtyp);
 
 void fuse_internal_newentry_makerequest(struct mount *mp, uint64_t dnid,
     struct componentname *cnp, enum fuse_opcode op, void *buf, size_t bufsize,
     struct fuse_dispatcher *fdip);
 
 int fuse_internal_newentry_core(struct vnode *dvp, struct vnode **vpp,
     struct componentname *cnp, enum vtype vtyp, struct fuse_dispatcher *fdip);
 
 /* entity destruction */
 
 int fuse_internal_forget_callback(struct fuse_ticket *tick, struct uio *uio);
 void fuse_internal_forget_send(struct mount *mp, struct thread *td,
     struct ucred *cred, uint64_t nodeid, uint64_t nlookup);
 
 /* fuse start/stop */
 
 int fuse_internal_init_callback(struct fuse_ticket *tick, struct uio *uio);
 void fuse_internal_send_init(struct fuse_data *data, struct thread *td);
 
 #endif /* _FUSE_INTERNAL_H_ */
Index: projects/capsicum-test/sys/fs/fuse/fuse_io.c
===================================================================
--- projects/capsicum-test/sys/fs/fuse/fuse_io.c	(revision 345709)
+++ projects/capsicum-test/sys/fs/fuse/fuse_io.c	(revision 345710)
@@ -1,824 +1,841 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Google Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of Google Inc. nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * Copyright (C) 2005 Csaba Henk.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/stat.h>
 #include <sys/unistd.h>
 #include <sys/filedesc.h>
 #include <sys/file.h>
 #include <sys/fcntl.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 
 #include "fuse.h"
 #include "fuse_file.h"
 #include "fuse_node.h"
 #include "fuse_internal.h"
 #include "fuse_ipc.h"
 #include "fuse_io.h"
 
-#define FUSE_DEBUG_MODULE IO
-#include "fuse_debug.h"
+SDT_PROVIDER_DECLARE(fuse);
+/* 
+ * Fuse trace probe:
+ * arg0: verbosity.  Higher numbers give more verbose messages
+ * arg1: Textual message
+ */
+SDT_PROBE_DEFINE2(fuse, , io, trace, "int", "char*");
 
-
 static int 
 fuse_read_directbackend(struct vnode *vp, struct uio *uio,
     struct ucred *cred, struct fuse_filehandle *fufh);
 static int 
 fuse_read_biobackend(struct vnode *vp, struct uio *uio,
     struct ucred *cred, struct fuse_filehandle *fufh);
 static int 
 fuse_write_directbackend(struct vnode *vp, struct uio *uio,
     struct ucred *cred, struct fuse_filehandle *fufh, int ioflag);
 static int 
 fuse_write_biobackend(struct vnode *vp, struct uio *uio,
     struct ucred *cred, struct fuse_filehandle *fufh, int ioflag);
 
+SDT_PROBE_DEFINE5(fuse, , io, io_dispatch, "struct vnode*", "struct uio*",
+		"int", "struct ucred*", "struct fuse_filehandle*");
 int
 fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag,
     struct ucred *cred)
 {
 	struct fuse_filehandle *fufh;
 	int err, directio;
 
 	MPASS(vp->v_type == VREG || vp->v_type == VDIR);
 
 	err = fuse_filehandle_getrw(vp,
 	    (uio->uio_rw == UIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh);
 	if (err) {
 		printf("FUSE: io dispatch: filehandles are closed\n");
 		return err;
 	}
+	SDT_PROBE5(fuse, , io, io_dispatch, vp, uio, ioflag, cred, fufh);
+
 	/*
          * Ideally, when the daemon asks for direct io at open time, the
          * standard file flag should be set according to this, so that would
          * just change the default mode, which later on could be changed via
          * fcntl(2).
          * But this doesn't work, the O_DIRECT flag gets cleared at some point
          * (don't know where). So to make any use of the Fuse direct_io option,
          * we hardwire it into the file's private data (similarly to Linux,
          * btw.).
          */
 	directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp));
 
 	switch (uio->uio_rw) {
 	case UIO_READ:
 		if (directio) {
-			FS_DEBUG("direct read of vnode %ju via file handle %ju\n",
-			    (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id);
+			SDT_PROBE2(fuse, , io, trace, 1,
+				"direct read of vnode");
 			err = fuse_read_directbackend(vp, uio, cred, fufh);
 		} else {
-			FS_DEBUG("buffered read of vnode %ju\n", 
-			      (uintmax_t)VTOILLU(vp));
+			SDT_PROBE2(fuse, , io, trace, 1,
+				"buffered read of vnode");
 			err = fuse_read_biobackend(vp, uio, cred, fufh);
 		}
 		break;
 	case UIO_WRITE:
 		/*
 		 * Kludge: simulate write-through caching via write-around
 		 * caching.  Same effect, as far as never caching dirty data,
 		 * but slightly pessimal in that newly written data is not
 		 * cached.
 		 */
 		if (directio || fuse_data_cache_mode == FUSE_CACHE_WT) {
-			FS_DEBUG("direct write of vnode %ju via file handle %ju\n",
-			    (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id);
+			SDT_PROBE2(fuse, , io, trace, 1,
+				"direct write of vnode");
 			err = fuse_write_directbackend(vp, uio, cred, fufh, ioflag);
 		} else {
-			FS_DEBUG("buffered write of vnode %ju\n", 
-			      (uintmax_t)VTOILLU(vp));
+			SDT_PROBE2(fuse, , io, trace, 1,
+				"buffered write of vnode");
 			err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag);
 		}
 		break;
 	default:
 		panic("uninterpreted mode passed to fuse_io_dispatch");
 	}
 
 	return (err);
 }
 
+SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_start, "int", "int", "int");
+SDT_PROBE_DEFINE2(fuse, , io, read_bio_backend_feed, "int", "int");
+SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_end, "int", "ssize_t", "int");
 static int
 fuse_read_biobackend(struct vnode *vp, struct uio *uio,
     struct ucred *cred, struct fuse_filehandle *fufh)
 {
 	struct buf *bp;
 	daddr_t lbn;
 	int bcount;
 	int err = 0, n = 0, on = 0;
 	off_t filesize;
 
 	const int biosize = fuse_iosize(vp);
 
-	FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n",
-	    uio->uio_resid, uio->uio_offset, VTOFUD(vp)->filesize);
-
 	if (uio->uio_resid == 0)
 		return (0);
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	bcount = biosize;
 	filesize = VTOFUD(vp)->filesize;
 
 	do {
 		if (fuse_isdeadfs(vp)) {
 			err = ENXIO;
 			break;
 		}
 		lbn = uio->uio_offset / biosize;
 		on = uio->uio_offset & (biosize - 1);
 
-		FS_DEBUG2G("biosize %d, lbn %d, on %d\n", biosize, (int)lbn, on);
+		SDT_PROBE3(fuse, , io, read_bio_backend_start,
+			biosize, (int)lbn, on);
 
 		/*
 	         * Obtain the buffer cache block.  Figure out the buffer size
 	         * when we are at EOF.  If we are modifying the size of the
 	         * buffer based on an EOF condition we need to hold
 	         * nfs_rslock() through obtaining the buffer to prevent
 	         * a potential writer-appender from messing with n_size.
 	         * Otherwise we may accidentally truncate the buffer and
 	         * lose dirty data.
 	         *
 	         * Note that bcount is *not* DEV_BSIZE aligned.
 	         */
 		if ((off_t)lbn * biosize >= filesize) {
 			bcount = 0;
 		} else if ((off_t)(lbn + 1) * biosize > filesize) {
 			bcount = filesize - (off_t)lbn *biosize;
 		}
 		bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);
 
 		if (!bp)
 			return (EINTR);
 
 		/*
 	         * If B_CACHE is not set, we must issue the read.  If this
 	         * fails, we return an error.
 	         */
 
 		if ((bp->b_flags & B_CACHE) == 0) {
 			bp->b_iocmd = BIO_READ;
 			vfs_busy_pages(bp, 0);
 			err = fuse_io_strategy(vp, bp);
 			if (err) {
 				brelse(bp);
 				return (err);
 			}
 		}
 		/*
 	         * on is the offset into the current bp.  Figure out how many
 	         * bytes we can copy out of the bp.  Note that bcount is
 	         * NOT DEV_BSIZE aligned.
 	         *
 	         * Then figure out how many bytes we can copy into the uio.
 	         */
 
 		n = 0;
 		if (on < bcount)
 			n = MIN((unsigned)(bcount - on), uio->uio_resid);
 		if (n > 0) {
-			FS_DEBUG2G("feeding buffeater with %d bytes of buffer %p,"
-				" saying %d was asked for\n",
-				n, bp->b_data + on, n + (int)bp->b_resid);
+			SDT_PROBE2(fuse, , io, read_bio_backend_feed,
+				n, n + (int)bp->b_resid);
 			err = uiomove(bp->b_data + on, n, uio);
 		}
 		brelse(bp);
-		FS_DEBUG2G("end of turn, err %d, uio->uio_resid %zd, n %d\n",
-		    err, uio->uio_resid, n);
+		SDT_PROBE3(fuse, , io, read_bio_backend_end, err,
+			uio->uio_resid, n);
 	} while (err == 0 && uio->uio_resid > 0 && n > 0);
 
 	return (err);
 }
 
+SDT_PROBE_DEFINE1(fuse, , io, read_directbackend_start, "struct fuse_read_in*");
+SDT_PROBE_DEFINE2(fuse, , io, read_directbackend_complete,
+	"struct fuse_dispatcher*", "struct uio*");
+
 static int
 fuse_read_directbackend(struct vnode *vp, struct uio *uio,
     struct ucred *cred, struct fuse_filehandle *fufh)
 {
 	struct fuse_dispatcher fdi;
 	struct fuse_read_in *fri;
 	int err = 0;
 
 	if (uio->uio_resid == 0)
 		return (0);
 
 	fdisp_init(&fdi, 0);
 
 	/*
          * XXX In "normal" case we use an intermediate kernel buffer for
          * transmitting data from daemon's context to ours. Eventually, we should
          * get rid of this. Anyway, if the target uio lives in sysspace (we are
          * called from pageops), and the input data doesn't need kernel-side
          * processing (we are not called from readdir) we can already invoke
          * an optimized, "peer-to-peer" I/O routine.
          */
 	while (uio->uio_resid > 0) {
 		fdi.iosize = sizeof(*fri);
 		fdisp_make_vp(&fdi, FUSE_READ, vp, uio->uio_td, cred);
 		fri = fdi.indata;
 		fri->fh = fufh->fh_id;
 		fri->offset = uio->uio_offset;
 		fri->size = MIN(uio->uio_resid,
 		    fuse_get_mpdata(vp->v_mount)->max_read);
 
-		FS_DEBUG2G("fri->fh %ju, fri->offset %ju, fri->size %ju\n",
-			(uintmax_t)fri->fh, (uintmax_t)fri->offset, 
-			(uintmax_t)fri->size);
+		SDT_PROBE1(fuse, , io, read_directbackend_start, fri);
 
 		if ((err = fdisp_wait_answ(&fdi)))
 			goto out;
 
-		FS_DEBUG2G("complete: got iosize=%d, requested fri.size=%zd; "
-			"resid=%zd offset=%ju\n",
-			fri->size, fdi.iosize, uio->uio_resid, 
-			(uintmax_t)uio->uio_offset);
+		SDT_PROBE2(fuse, , io, read_directbackend_complete,
+			fdi.iosize, uio);
 
 		if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio)))
 			break;
 		if (fdi.iosize < fri->size)
 			break;
 	}
 
 out:
 	fdisp_destroy(&fdi);
 	return (err);
 }
 
 static int
 fuse_write_directbackend(struct vnode *vp, struct uio *uio,
     struct ucred *cred, struct fuse_filehandle *fufh, int ioflag)
 {
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct fuse_write_in *fwi;
 	struct fuse_dispatcher fdi;
 	size_t chunksize;
 	int diff;
 	int err = 0;
 
 	if (uio->uio_resid == 0)
 		return (0);
 	if (ioflag & IO_APPEND)
 		uio_setoffset(uio, fvdat->filesize);
 
 	fdisp_init(&fdi, 0);
 
 	while (uio->uio_resid > 0) {
 		chunksize = MIN(uio->uio_resid,
 		    fuse_get_mpdata(vp->v_mount)->max_write);
 
 		fdi.iosize = sizeof(*fwi) + chunksize;
 		fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred);
 
 		fwi = fdi.indata;
 		fwi->fh = fufh->fh_id;
 		fwi->offset = uio->uio_offset;
 		fwi->size = chunksize;
 
 		if ((err = uiomove((char *)fdi.indata + sizeof(*fwi),
 		    chunksize, uio)))
 			break;
 
 		if ((err = fdisp_wait_answ(&fdi)))
 			break;
 
+		/* Adjust the uio in the case of short writes */
 		diff = chunksize - ((struct fuse_write_out *)fdi.answ)->size;
 		if (diff < 0) {
 			err = EINVAL;
 			break;
+		} else if (diff > 0 && !(ioflag & IO_DIRECT)) {
+			/* 
+			 * XXX We really should be directly checking whether
+			 * the file was opened with FOPEN_DIRECT_IO, not
+			 * IO_DIRECT.  IO_DIRECT can be set in multiple ways.
+			 */
+			SDT_PROBE2(fuse, , io, trace, 1,
+				"misbehaving filesystem: short writes are only "
+				"allowed with direct_io");
 		}
 		uio->uio_resid += diff;
 		uio->uio_offset -= diff;
+
 		if (uio->uio_offset > fvdat->filesize &&
 		    fuse_data_cache_mode != FUSE_CACHE_UC) {
 			fuse_vnode_setsize(vp, cred, uio->uio_offset);
 			fvdat->flag &= ~FN_SIZECHANGE;
 		}
 	}
 
 	fdisp_destroy(&fdi);
 
 	return (err);
 }
 
+SDT_PROBE_DEFINE6(fuse, , io, write_biobackend_start, "int64_t", "int", "int",
+		"struct uio*", "int", "bool");
+SDT_PROBE_DEFINE2(fuse, , io, write_biobackend_append_race, "long", "int");
+
 static int
 fuse_write_biobackend(struct vnode *vp, struct uio *uio,
     struct ucred *cred, struct fuse_filehandle *fufh, int ioflag)
 {
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct buf *bp;
 	daddr_t lbn;
 	int bcount;
 	int n, on, err = 0;
 
 	const int biosize = fuse_iosize(vp);
 
 	KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode"));
-	FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n",
-	    uio->uio_resid, uio->uio_offset, fvdat->filesize);
 	if (vp->v_type != VREG)
 		return (EIO);
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	if (uio->uio_resid == 0)
 		return (0);
 	if (ioflag & IO_APPEND)
 		uio_setoffset(uio, fvdat->filesize);
 
 	/*
          * Find all of this file's B_NEEDCOMMIT buffers.  If our writes
          * would exceed the local maximum per-file write commit size when
          * combined with those, we must decide whether to flush,
          * go synchronous, or return err.  We don't bother checking
          * IO_UNIT -- we just make all writes atomic anyway, as there's
          * no point optimizing for something that really won't ever happen.
          */
 	do {
 		if (fuse_isdeadfs(vp)) {
 			err = ENXIO;
 			break;
 		}
 		lbn = uio->uio_offset / biosize;
 		on = uio->uio_offset & (biosize - 1);
 		n = MIN((unsigned)(biosize - on), uio->uio_resid);
 
-		FS_DEBUG2G("lbn %ju, on %d, n %d, uio offset %ju, uio resid %zd\n",
-			(uintmax_t)lbn, on, n, 
-			(uintmax_t)uio->uio_offset, uio->uio_resid);
-
 again:
 		/*
 	         * Handle direct append and file extension cases, calculate
 	         * unaligned buffer size.
 	         */
 		if (uio->uio_offset == fvdat->filesize && n) {
 			/*
 	                 * Get the buffer (in its pre-append state to maintain
 	                 * B_CACHE if it was previously set).  Resize the
 	                 * nfsnode after we have locked the buffer to prevent
 	                 * readers from reading garbage.
 	                 */
 			bcount = on;
-			FS_DEBUG("getting block from OS, bcount %d\n", bcount);
+			SDT_PROBE6(fuse, , io, write_biobackend_start,
+				lbn, on, n, uio, bcount, true);
 			bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);
 
 			if (bp != NULL) {
 				long save;
 
 				err = fuse_vnode_setsize(vp, cred, 
 							 uio->uio_offset + n);
 				if (err) {
 					brelse(bp);
 					break;
 				}
 				save = bp->b_flags & B_CACHE;
 				bcount += n;
 				allocbuf(bp, bcount);
 				bp->b_flags |= save;
 			}
 		} else {
 			/*
 	                 * Obtain the locked cache block first, and then
 	                 * adjust the file's size as appropriate.
 	                 */
 			bcount = on + n;
 			if ((off_t)lbn * biosize + bcount < fvdat->filesize) {
 				if ((off_t)(lbn + 1) * biosize < fvdat->filesize)
 					bcount = biosize;
 				else
 					bcount = fvdat->filesize - 
 					  (off_t)lbn *biosize;
 			}
-			FS_DEBUG("getting block from OS, bcount %d\n", bcount);
+			SDT_PROBE6(fuse, , io, write_biobackend_start,
+				lbn, on, n, uio, bcount, false);
 			bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);
 			if (bp && uio->uio_offset + n > fvdat->filesize) {
 				err = fuse_vnode_setsize(vp, cred, 
 							 uio->uio_offset + n);
 				if (err) {
 					brelse(bp);
 					break;
 				}
 			}
 		}
 
 		if (!bp) {
 			err = EINTR;
 			break;
 		}
 		/*
 	         * Issue a READ if B_CACHE is not set.  In special-append
 	         * mode, B_CACHE is based on the buffer prior to the write
 	         * op and is typically set, avoiding the read.  If a read
 	         * is required in special append mode, the server will
 	         * probably send us a short-read since we extended the file
 	         * on our end, resulting in b_resid == 0 and, thusly,
 	         * B_CACHE getting set.
 	         *
 	         * We can also avoid issuing the read if the write covers
 	         * the entire buffer.  We have to make sure the buffer state
 	         * is reasonable in this case since we will not be initiating
 	         * I/O.  See the comments in kern/vfs_bio.c's getblk() for
 	         * more information.
 	         *
 	         * B_CACHE may also be set due to the buffer being cached
 	         * normally.
 	         */
 
 		if (on == 0 && n == bcount) {
 			bp->b_flags |= B_CACHE;
 			bp->b_flags &= ~B_INVAL;
 			bp->b_ioflags &= ~BIO_ERROR;
 		}
 		if ((bp->b_flags & B_CACHE) == 0) {
 			bp->b_iocmd = BIO_READ;
 			vfs_busy_pages(bp, 0);
 			fuse_io_strategy(vp, bp);
 			if ((err = bp->b_error)) {
 				brelse(bp);
 				break;
 			}
 		}
 		if (bp->b_wcred == NOCRED)
 			bp->b_wcred = crhold(cred);
 
 		/*
 	         * If dirtyend exceeds file size, chop it down.  This should
 	         * not normally occur but there is an append race where it
 	         * might occur XXX, so we log it.
 	         *
 	         * If the chopping creates a reverse-indexed or degenerate
 	         * situation with dirtyoff/end, we 0 both of them.
 	         */
 
 		if (bp->b_dirtyend > bcount) {
-			FS_DEBUG("FUSE append race @%lx:%d\n",
+			SDT_PROBE2(fuse, , io, write_biobackend_append_race,
 			    (long)bp->b_blkno * biosize,
 			    bp->b_dirtyend - bcount);
 			bp->b_dirtyend = bcount;
 		}
 		if (bp->b_dirtyoff >= bp->b_dirtyend)
 			bp->b_dirtyoff = bp->b_dirtyend = 0;
 
 		/*
 	         * If the new write will leave a contiguous dirty
 	         * area, just update the b_dirtyoff and b_dirtyend,
 	         * otherwise force a write rpc of the old dirty area.
 	         *
 	         * While it is possible to merge discontiguous writes due to
 	         * our having a B_CACHE buffer ( and thus valid read data
 	         * for the hole), we don't because it could lead to
 	         * significant cache coherency problems with multiple clients,
 	         * especially if locking is implemented later on.
 	         *
 	         * as an optimization we could theoretically maintain
 	         * a linked list of discontinuous areas, but we would still
 	         * have to commit them separately so there isn't much
 	         * advantage to it except perhaps a bit of asynchronization.
 	         */
 
 		if (bp->b_dirtyend > 0 &&
 		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
 			/*
 	                 * Yes, we mean it. Write out everything to "storage"
 	                 * immediately, without hesitation. (Apart from other
 	                 * reasons: the only way to know if a write is valid
 	                 * if its actually written out.)
 	                 */
 			bwrite(bp);
 			if (bp->b_error == EINTR) {
 				err = EINTR;
 				break;
 			}
 			goto again;
 		}
 		err = uiomove((char *)bp->b_data + on, n, uio);
 
 		/*
 	         * Since this block is being modified, it must be written
 	         * again and not just committed.  Since write clustering does
 	         * not work for the stage 1 data write, only the stage 2
 	         * commit rpc, we have to clear B_CLUSTEROK as well.
 	         */
 		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 
 		if (err) {
 			bp->b_ioflags |= BIO_ERROR;
 			bp->b_error = err;
 			brelse(bp);
 			break;
 		}
 		/*
 	         * Only update dirtyoff/dirtyend if not a degenerate
 	         * condition.
 	         */
 		if (n) {
 			if (bp->b_dirtyend > 0) {
 				bp->b_dirtyoff = MIN(on, bp->b_dirtyoff);
 				bp->b_dirtyend = MAX((on + n), bp->b_dirtyend);
 			} else {
 				bp->b_dirtyoff = on;
 				bp->b_dirtyend = on + n;
 			}
 			vfs_bio_set_valid(bp, on, n);
 		}
 		err = bwrite(bp);
 		if (err)
 			break;
 	} while (uio->uio_resid > 0 && n > 0);
 
 	if (fuse_sync_resize && (fvdat->flag & FN_SIZECHANGE) != 0)
 		fuse_vnode_savesize(vp, cred);
 
 	return (err);
 }
 
 int
 fuse_io_strategy(struct vnode *vp, struct buf *bp)
 {
 	struct fuse_filehandle *fufh;
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct ucred *cred;
 	struct uio *uiop;
 	struct uio uio;
 	struct iovec io;
 	int error = 0;
 
 	const int biosize = fuse_iosize(vp);
 
 	MPASS(vp->v_type == VREG || vp->v_type == VDIR);
 	MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE);
-	FS_DEBUG("inode=%ju offset=%jd resid=%ld\n",
-	    (uintmax_t)VTOI(vp), (intmax_t)(((off_t)bp->b_blkno) * biosize),
-	    bp->b_bcount);
 
 	error = fuse_filehandle_getrw(vp,
 	    (bp->b_iocmd == BIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh);
 	if (error) {
 		printf("FUSE: strategy: filehandles are closed\n");
 		bp->b_ioflags |= BIO_ERROR;
 		bp->b_error = error;
 		return (error);
 	}
 	cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred;
 
 	uiop = &uio;
 	uiop->uio_iov = &io;
 	uiop->uio_iovcnt = 1;
 	uiop->uio_segflg = UIO_SYSSPACE;
 	uiop->uio_td = curthread;
 
 	/*
          * clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
          * do this here so we do not have to do it in all the code that
          * calls us.
          */
 	bp->b_flags &= ~B_INVAL;
 	bp->b_ioflags &= ~BIO_ERROR;
 
 	KASSERT(!(bp->b_flags & B_DONE),
 	    ("fuse_io_strategy: bp %p already marked done", bp));
 	if (bp->b_iocmd == BIO_READ) {
 		io.iov_len = uiop->uio_resid = bp->b_bcount;
 		io.iov_base = bp->b_data;
 		uiop->uio_rw = UIO_READ;
 
 		uiop->uio_offset = ((off_t)bp->b_blkno) * biosize;
 		error = fuse_read_directbackend(vp, uiop, cred, fufh);
 
 		/* XXXCEM: Potentially invalid access to cached_attrs here */
 		if ((!error && uiop->uio_resid) ||
 		    (fsess_opt_brokenio(vnode_mount(vp)) && error == EIO &&
 		    uiop->uio_offset < fvdat->filesize && fvdat->filesize > 0 &&
 		    uiop->uio_offset >= fvdat->cached_attrs.va_size)) {
 			/*
 	                 * If we had a short read with no error, we must have
 	                 * hit a file hole.  We should zero-fill the remainder.
 	                 * This can also occur if the server hits the file EOF.
 	                 *
 	                 * Holes used to be able to occur due to pending
 	                 * writes, but that is not possible any longer.
 	                 */
 			int nread = bp->b_bcount - uiop->uio_resid;
 			int left = uiop->uio_resid;
 
 			if (error != 0) {
 				printf("FUSE: Fix broken io: offset %ju, "
 				       " resid %zd, file size %ju/%ju\n", 
 				       (uintmax_t)uiop->uio_offset,
 				    uiop->uio_resid, fvdat->filesize,
 				    fvdat->cached_attrs.va_size);
 				error = 0;
 			}
 			if (left > 0)
 				bzero((char *)bp->b_data + nread, left);
 			uiop->uio_resid = 0;
 		}
 		if (error) {
 			bp->b_ioflags |= BIO_ERROR;
 			bp->b_error = error;
 		}
 	} else {
 		/*
 	         * If we only need to commit, try to commit
 	         */
 		if (bp->b_flags & B_NEEDCOMMIT) {
-			FS_DEBUG("write: B_NEEDCOMMIT flags set\n");
+			SDT_PROBE2(fuse, , io, trace, 1,
+				"write: B_NEEDCOMMIT flags set");
 		}
 		/*
 	         * Setup for actual write
 	         */
 		if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend > 
 		    fvdat->filesize)
 			bp->b_dirtyend = fvdat->filesize - 
 				(off_t)bp->b_blkno * biosize;
 
 		if (bp->b_dirtyend > bp->b_dirtyoff) {
 			io.iov_len = uiop->uio_resid = bp->b_dirtyend
 			    - bp->b_dirtyoff;
 			uiop->uio_offset = (off_t)bp->b_blkno * biosize
 			    + bp->b_dirtyoff;
 			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
 			uiop->uio_rw = UIO_WRITE;
 
 			error = fuse_write_directbackend(vp, uiop, cred, fufh, 0);
 
 			if (error == EINTR || error == ETIMEDOUT
 			    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
 
 				bp->b_flags &= ~(B_INVAL | B_NOCACHE);
 				if ((bp->b_flags & B_PAGING) == 0) {
 					bdirty(bp);
 					bp->b_flags &= ~B_DONE;
 				}
 				if ((error == EINTR || error == ETIMEDOUT) &&
 				    (bp->b_flags & B_ASYNC) == 0)
 					bp->b_flags |= B_EINTR;
 			} else {
 				if (error) {
 					bp->b_ioflags |= BIO_ERROR;
 					bp->b_flags |= B_INVAL;
 					bp->b_error = error;
 				}
 				bp->b_dirtyoff = bp->b_dirtyend = 0;
 			}
 		} else {
 			bp->b_resid = 0;
 			bufdone(bp);
 			return (0);
 		}
 	}
 	bp->b_resid = uiop->uio_resid;
 	bufdone(bp);
 	return (error);
 }
 
 int
 fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td)
 {
 	struct vop_fsync_args a = {
 		.a_vp = vp,
 		.a_waitfor = waitfor,
 		.a_td = td,
 	};
 
 	return (vop_stdfsync(&a));
 }
 
 /*
  * Flush and invalidate all dirty buffers. If another process is already
  * doing the flush, just wait for completion.
  */
 int
 fuse_io_invalbuf(struct vnode *vp, struct thread *td)
 {
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	int error = 0;
 
 	if (vp->v_iflag & VI_DOOMED)
 		return 0;
 
 	ASSERT_VOP_ELOCKED(vp, "fuse_io_invalbuf");
 
 	while (fvdat->flag & FN_FLUSHINPROG) {
 		struct proc *p = td->td_proc;
 
 		if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF)
 			return EIO;
 		fvdat->flag |= FN_FLUSHWANT;
 		tsleep(&fvdat->flag, PRIBIO + 2, "fusevinv", 2 * hz);
 		error = 0;
 		if (p != NULL) {
 			PROC_LOCK(p);
 			if (SIGNOTEMPTY(p->p_siglist) ||
 			    SIGNOTEMPTY(td->td_siglist))
 				error = EINTR;
 			PROC_UNLOCK(p);
 		}
 		if (error == EINTR)
 			return EINTR;
 	}
 	fvdat->flag |= FN_FLUSHINPROG;
 
 	if (vp->v_bufobj.bo_object != NULL) {
 		VM_OBJECT_WLOCK(vp->v_bufobj.bo_object);
 		vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC);
 		VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object);
 	}
 	error = vinvalbuf(vp, V_SAVE, PCATCH, 0);
 	while (error) {
 		if (error == ERESTART || error == EINTR) {
 			fvdat->flag &= ~FN_FLUSHINPROG;
 			if (fvdat->flag & FN_FLUSHWANT) {
 				fvdat->flag &= ~FN_FLUSHWANT;
 				wakeup(&fvdat->flag);
 			}
 			return EINTR;
 		}
 		error = vinvalbuf(vp, V_SAVE, PCATCH, 0);
 	}
 	fvdat->flag &= ~FN_FLUSHINPROG;
 	if (fvdat->flag & FN_FLUSHWANT) {
 		fvdat->flag &= ~FN_FLUSHWANT;
 		wakeup(&fvdat->flag);
 	}
 	return (error);
 }
Index: projects/capsicum-test/sys/fs/fuse/fuse_ipc.c
===================================================================
--- projects/capsicum-test/sys/fs/fuse/fuse_ipc.c	(revision 345709)
+++ projects/capsicum-test/sys/fs/fuse/fuse_ipc.c	(revision 345710)
@@ -1,880 +1,824 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Google Inc. and Amit Singh
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of Google Inc. nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * Copyright (C) 2005 Csaba Henk.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include <sys/types.h>
+#include <sys/param.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
-#include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
+#include <sys/sdt.h>
 #include <sys/vnode.h>
 #include <sys/signalvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <vm/uma.h>
 
 #include "fuse.h"
 #include "fuse_node.h"
 #include "fuse_ipc.h"
 #include "fuse_internal.h"
 
-#define FUSE_DEBUG_MODULE IPC
-#include "fuse_debug.h"
+SDT_PROVIDER_DECLARE(fuse);
+/* 
+ * Fuse trace probe:
+ * arg0: verbosity.  Higher numbers give more verbose messages
+ * arg1: Textual message
+ */
+SDT_PROBE_DEFINE2(fuse, , ipc, trace, "int", "char*");
 
 static struct fuse_ticket *fticket_alloc(struct fuse_data *data);
 static void fticket_refresh(struct fuse_ticket *ftick);
 static void fticket_destroy(struct fuse_ticket *ftick);
 static int fticket_wait_answer(struct fuse_ticket *ftick);
 static inline int 
 fticket_aw_pull_uio(struct fuse_ticket *ftick,
     struct uio *uio);
 
 static int fuse_body_audit(struct fuse_ticket *ftick, size_t blen);
 
 static fuse_handler_t fuse_standard_handler;
 
 SYSCTL_NODE(_vfs, OID_AUTO, fusefs, CTLFLAG_RW, 0, "FUSE tunables");
 SYSCTL_STRING(_vfs_fusefs, OID_AUTO, version, CTLFLAG_RD,
     FUSE_FREEBSD_VERSION, 0, "fuse-freebsd version");
 static int fuse_ticket_count = 0;
 
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, ticket_count, CTLFLAG_RW,
     &fuse_ticket_count, 0, "number of allocated tickets");
 static long fuse_iov_permanent_bufsize = 1 << 19;
 
 SYSCTL_LONG(_vfs_fusefs, OID_AUTO, iov_permanent_bufsize, CTLFLAG_RW,
     &fuse_iov_permanent_bufsize, 0,
     "limit for permanently stored buffer size for fuse_iovs");
 static int fuse_iov_credit = 16;
 
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, iov_credit, CTLFLAG_RW,
     &fuse_iov_credit, 0,
     "how many times is an oversized fuse_iov tolerated");
 
 MALLOC_DEFINE(M_FUSEMSG, "fuse_msgbuf", "fuse message buffer");
 static uma_zone_t ticket_zone;
 
 static void
 fuse_block_sigs(sigset_t *oldset)
 {
 	sigset_t newset;
 
 	SIGFILLSET(newset);
 	SIGDELSET(newset, SIGKILL);
 	if (kern_sigprocmask(curthread, SIG_BLOCK, &newset, oldset, 0))
 		panic("%s: Invalid operation for kern_sigprocmask()",
 		    __func__);
 }
 
 static void
 fuse_restore_sigs(sigset_t *oldset)
 {
 
 	if (kern_sigprocmask(curthread, SIG_SETMASK, oldset, NULL, 0))
 		panic("%s: Invalid operation for kern_sigprocmask()",
 		    __func__);
 }
 
 void
 fiov_init(struct fuse_iov *fiov, size_t size)
 {
 	uint32_t msize = FU_AT_LEAST(size);
 
-	debug_printf("fiov=%p, size=%zd\n", fiov, size);
-
 	fiov->len = 0;
 
 	fiov->base = malloc(msize, M_FUSEMSG, M_WAITOK | M_ZERO);
 
 	fiov->allocated_size = msize;
 	fiov->credit = fuse_iov_credit;
 }
 
 void
 fiov_teardown(struct fuse_iov *fiov)
 {
-	debug_printf("fiov=%p\n", fiov);
-
 	MPASS(fiov->base != NULL);
 	free(fiov->base, M_FUSEMSG);
 }
 
 void
 fiov_adjust(struct fuse_iov *fiov, size_t size)
 {
-	debug_printf("fiov=%p, size=%zd\n", fiov, size);
-
 	if (fiov->allocated_size < size ||
 	    (fuse_iov_permanent_bufsize >= 0 &&
 	    fiov->allocated_size - size > fuse_iov_permanent_bufsize &&
 	    --fiov->credit < 0)) {
 
 		fiov->base = realloc(fiov->base, FU_AT_LEAST(size), M_FUSEMSG,
 		    M_WAITOK | M_ZERO);
 		if (!fiov->base) {
 			panic("FUSE: realloc failed");
 		}
 		fiov->allocated_size = FU_AT_LEAST(size);
 		fiov->credit = fuse_iov_credit;
 	}
 	fiov->len = size;
 }
 
 void
 fiov_refresh(struct fuse_iov *fiov)
 {
-	debug_printf("fiov=%p\n", fiov);
-
 	bzero(fiov->base, fiov->len);
 	fiov_adjust(fiov, 0);
 }
 
 static int
 fticket_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct fuse_ticket *ftick = mem;
 	struct fuse_data *data = arg;
 
-	debug_printf("ftick=%p data=%p\n", ftick, data);
-
 	FUSE_ASSERT_MS_DONE(ftick);
 	FUSE_ASSERT_AW_DONE(ftick);
 
 	ftick->tk_data = data;
 
 	if (ftick->tk_unique != 0)
 		fticket_refresh(ftick);
 
 	/* May be truncated to 32 bits */
 	ftick->tk_unique = atomic_fetchadd_long(&data->ticketer, 1);
 	if (ftick->tk_unique == 0)
 		ftick->tk_unique = atomic_fetchadd_long(&data->ticketer, 1);
 
 	refcount_init(&ftick->tk_refcount, 1);
 	atomic_add_acq_int(&fuse_ticket_count, 1);
 
 	return 0;
 }
 
 static void
 fticket_dtor(void *mem, int size, void *arg)
 {
+#ifdef INVARIANTS
 	struct fuse_ticket *ftick = mem;
+#endif
 
-	debug_printf("ftick=%p\n", ftick);
-
 	FUSE_ASSERT_MS_DONE(ftick);
 	FUSE_ASSERT_AW_DONE(ftick);
 
 	atomic_subtract_acq_int(&fuse_ticket_count, 1);
 }
 
 static int
 fticket_init(void *mem, int size, int flags)
 {
 	struct fuse_ticket *ftick = mem;
 
-	FS_DEBUG("ftick=%p\n", ftick);
-
 	bzero(ftick, sizeof(struct fuse_ticket));
 
 	fiov_init(&ftick->tk_ms_fiov, sizeof(struct fuse_in_header));
 	ftick->tk_ms_type = FT_M_FIOV;
 
 	mtx_init(&ftick->tk_aw_mtx, "fuse answer delivery mutex", NULL, MTX_DEF);
 	fiov_init(&ftick->tk_aw_fiov, 0);
 	ftick->tk_aw_type = FT_A_FIOV;
 
 	return 0;
 }
 
 static void
 fticket_fini(void *mem, int size)
 {
 	struct fuse_ticket *ftick = mem;
 
-	FS_DEBUG("ftick=%p\n", ftick);
-
 	fiov_teardown(&ftick->tk_ms_fiov);
 	fiov_teardown(&ftick->tk_aw_fiov);
 	mtx_destroy(&ftick->tk_aw_mtx);
 }
 
 static inline struct fuse_ticket *
 fticket_alloc(struct fuse_data *data)
 {
 	return uma_zalloc_arg(ticket_zone, data, M_WAITOK);
 }
 
 static inline void
 fticket_destroy(struct fuse_ticket *ftick)
 {
 	return uma_zfree(ticket_zone, ftick);
 }
 
 static	inline
 void
 fticket_refresh(struct fuse_ticket *ftick)
 {
-	debug_printf("ftick=%p\n", ftick);
-
 	FUSE_ASSERT_MS_DONE(ftick);
 	FUSE_ASSERT_AW_DONE(ftick);
 
 	fiov_refresh(&ftick->tk_ms_fiov);
 	ftick->tk_ms_bufdata = NULL;
 	ftick->tk_ms_bufsize = 0;
 	ftick->tk_ms_type = FT_M_FIOV;
 
 	bzero(&ftick->tk_aw_ohead, sizeof(struct fuse_out_header));
 
 	fiov_refresh(&ftick->tk_aw_fiov);
 	ftick->tk_aw_errno = 0;
 	ftick->tk_aw_bufdata = NULL;
 	ftick->tk_aw_bufsize = 0;
 	ftick->tk_aw_type = FT_A_FIOV;
 
 	ftick->tk_flag = 0;
 }
 
 static int
 fticket_wait_answer(struct fuse_ticket *ftick)
 {
 	sigset_t tset;
 	int err = 0;
 	struct fuse_data *data;
 
-	debug_printf("ftick=%p\n", ftick);
 	fuse_lck_mtx_lock(ftick->tk_aw_mtx);
 
 	if (fticket_answered(ftick)) {
 		goto out;
 	}
 	data = ftick->tk_data;
 
 	if (fdata_get_dead(data)) {
 		err = ENOTCONN;
 		fticket_set_answered(ftick);
 		goto out;
 	}
 	fuse_block_sigs(&tset);
 	err = msleep(ftick, &ftick->tk_aw_mtx, PCATCH, "fu_ans",
 	    data->daemon_timeout * hz);
 	fuse_restore_sigs(&tset);
 	if (err == EAGAIN) {		/* same as EWOULDBLOCK */
 #ifdef XXXIP				/* die conditionally */
 		if (!fdata_get_dead(data)) {
 			fdata_set_dead(data);
 		}
 #endif
 		err = ETIMEDOUT;
 		fticket_set_answered(ftick);
 	}
 out:
 	if (!(err || fticket_answered(ftick))) {
-		debug_printf("FUSE: requester was woken up but still no answer");
+		SDT_PROBE2(fuse, , ipc, trace, 1,
+			"FUSE: requester was woken up but still no answer");
 		err = ENXIO;
 	}
 	fuse_lck_mtx_unlock(ftick->tk_aw_mtx);
 
 	return err;
 }
 
 static	inline
 int
 fticket_aw_pull_uio(struct fuse_ticket *ftick, struct uio *uio)
 {
 	int err = 0;
 	size_t len = uio_resid(uio);
 
-	debug_printf("ftick=%p, uio=%p\n", ftick, uio);
-
 	if (len) {
 		switch (ftick->tk_aw_type) {
 		case FT_A_FIOV:
 			fiov_adjust(fticket_resp(ftick), len);
 			err = uiomove(fticket_resp(ftick)->base, len, uio);
-			if (err) {
-				debug_printf("FUSE: FT_A_FIOV: error is %d"
-					     " (%p, %zd, %p)\n",
-					     err, fticket_resp(ftick)->base, 
-					     len, uio);
-			}
 			break;
 
 		case FT_A_BUF:
 			ftick->tk_aw_bufsize = len;
 			err = uiomove(ftick->tk_aw_bufdata, len, uio);
-			if (err) {
-				debug_printf("FUSE: FT_A_BUF: error is %d"
-					     " (%p, %zd, %p)\n",
-					     err, ftick->tk_aw_bufdata, len, uio);
-			}
 			break;
 
 		default:
 			panic("FUSE: unknown answer type for ticket %p", ftick);
 		}
 	}
 	return err;
 }
 
 int
 fticket_pull(struct fuse_ticket *ftick, struct uio *uio)
 {
 	int err = 0;
 
-	debug_printf("ftick=%p, uio=%p\n", ftick, uio);
-
 	if (ftick->tk_aw_ohead.error) {
 		return 0;
 	}
 	err = fuse_body_audit(ftick, uio_resid(uio));
 	if (!err) {
 		err = fticket_aw_pull_uio(ftick, uio);
 	}
 	return err;
 }
 
 struct fuse_data *
 fdata_alloc(struct cdev *fdev, struct ucred *cred)
 {
 	struct fuse_data *data;
 
-	debug_printf("fdev=%p\n", fdev);
-
 	data = malloc(sizeof(struct fuse_data), M_FUSEMSG, M_WAITOK | M_ZERO);
 
 	data->fdev = fdev;
 	mtx_init(&data->ms_mtx, "fuse message list mutex", NULL, MTX_DEF);
 	STAILQ_INIT(&data->ms_head);
 	mtx_init(&data->aw_mtx, "fuse answer list mutex", NULL, MTX_DEF);
 	TAILQ_INIT(&data->aw_head);
 	data->daemoncred = crhold(cred);
 	data->daemon_timeout = FUSE_DEFAULT_DAEMON_TIMEOUT;
 	sx_init(&data->rename_lock, "fuse rename lock");
 	data->ref = 1;
 
 	return data;
 }
 
 void
 fdata_trydestroy(struct fuse_data *data)
 {
-	FS_DEBUG("data=%p data.mp=%p data.fdev=%p data.flags=%04x\n",
-	    data, data->mp, data->fdev, data->dataflags);
-
-	FS_DEBUG("destroy: data=%p\n", data);
 	data->ref--;
 	MPASS(data->ref >= 0);
 	if (data->ref != 0)
 		return;
 
 	/* Driving off stage all that stuff thrown at device... */
 	mtx_destroy(&data->ms_mtx);
 	mtx_destroy(&data->aw_mtx);
 	sx_destroy(&data->rename_lock);
 
 	crfree(data->daemoncred);
 
 	free(data, M_FUSEMSG);
 }
 
 void
 fdata_set_dead(struct fuse_data *data)
 {
-	debug_printf("data=%p\n", data);
-
 	FUSE_LOCK();
 	if (fdata_get_dead(data)) {
 		FUSE_UNLOCK();
 		return;
 	}
 	fuse_lck_mtx_lock(data->ms_mtx);
 	data->dataflags |= FSESS_DEAD;
 	wakeup_one(data);
 	selwakeuppri(&data->ks_rsel, PZERO + 1);
 	wakeup(&data->ticketer);
 	fuse_lck_mtx_unlock(data->ms_mtx);
 	FUSE_UNLOCK();
 }
 
 struct fuse_ticket *
 fuse_ticket_fetch(struct fuse_data *data)
 {
 	int err = 0;
 	struct fuse_ticket *ftick;
 
-	debug_printf("data=%p\n", data);
-
 	ftick = fticket_alloc(data);
 
 	if (!(data->dataflags & FSESS_INITED)) {
 		/* Sleep until get answer for INIT messsage */
 		FUSE_LOCK();
 		if (!(data->dataflags & FSESS_INITED) && data->ticketer > 2) {
 			err = msleep(&data->ticketer, &fuse_mtx, PCATCH | PDROP,
 			    "fu_ini", 0);
 			if (err)
 				fdata_set_dead(data);
 		} else
 			FUSE_UNLOCK();
 	}
 	return ftick;
 }
 
 int
 fuse_ticket_drop(struct fuse_ticket *ftick)
 {
 	int die;
 
 	die = refcount_release(&ftick->tk_refcount);
-	debug_printf("ftick=%p refcount=%d\n", ftick, ftick->tk_refcount);
 	if (die)
 		fticket_destroy(ftick);
 
 	return die;
 }
 
 void
 fuse_insert_callback(struct fuse_ticket *ftick, fuse_handler_t * handler)
 {
-	debug_printf("ftick=%p, handler=%p data=%p\n", ftick, ftick->tk_data, 
-		     handler);
-
 	if (fdata_get_dead(ftick->tk_data)) {
 		return;
 	}
 	ftick->tk_aw_handler = handler;
 
 	fuse_lck_mtx_lock(ftick->tk_data->aw_mtx);
 	fuse_aw_push(ftick);
 	fuse_lck_mtx_unlock(ftick->tk_data->aw_mtx);
 }
 
 void
 fuse_insert_message(struct fuse_ticket *ftick)
 {
-	debug_printf("ftick=%p\n", ftick);
-
 	if (ftick->tk_flag & FT_DIRTY) {
 		panic("FUSE: ticket reused without being refreshed");
 	}
 	ftick->tk_flag |= FT_DIRTY;
 
 	if (fdata_get_dead(ftick->tk_data)) {
 		return;
 	}
 	fuse_lck_mtx_lock(ftick->tk_data->ms_mtx);
 	fuse_ms_push(ftick);
 	wakeup_one(ftick->tk_data);
 	selwakeuppri(&ftick->tk_data->ks_rsel, PZERO + 1);
 	fuse_lck_mtx_unlock(ftick->tk_data->ms_mtx);
 }
 
 static int
 fuse_body_audit(struct fuse_ticket *ftick, size_t blen)
 {
 	int err = 0;
 	enum fuse_opcode opcode;
 
-	debug_printf("ftick=%p, blen = %zu\n", ftick, blen);
-
 	opcode = fticket_opcode(ftick);
 
 	switch (opcode) {
 	case FUSE_LOOKUP:
 		err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL;
 		break;
 
 	case FUSE_FORGET:
 		panic("FUSE: a handler has been intalled for FUSE_FORGET");
 		break;
 
 	case FUSE_GETATTR:
 		err = (blen == sizeof(struct fuse_attr_out)) ? 0 : EINVAL;
 		break;
 
 	case FUSE_SETATTR:
 		err = (blen == sizeof(struct fuse_attr_out)) ? 0 : EINVAL;
 		break;
 
 	case FUSE_READLINK:
 		err = (PAGE_SIZE >= blen) ? 0 : EINVAL;
 		break;
 
 	case FUSE_SYMLINK:
 		err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL;
 		break;
 
 	case FUSE_MKNOD:
 		err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL;
 		break;
 
 	case FUSE_MKDIR:
 		err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL;
 		break;
 
 	case FUSE_UNLINK:
 		err = (blen == 0) ? 0 : EINVAL;
 		break;
 
 	case FUSE_RMDIR:
 		err = (blen == 0) ? 0 : EINVAL;
 		break;
 
 	case FUSE_RENAME:
 		err = (blen == 0) ? 0 : EINVAL;
 		break;
 
 	case FUSE_LINK:
 		err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL;
 		break;
 
 	case FUSE_OPEN:
 		err = (blen == sizeof(struct fuse_open_out)) ? 0 : EINVAL;
 		break;
 
 	case FUSE_READ:
 		err = (((struct fuse_read_in *)(
 		    (char *)ftick->tk_ms_fiov.base +
 		    sizeof(struct fuse_in_header)
 		    ))->size >= blen) ? 0 : EINVAL;
 		break;
 
 	case FUSE_WRITE:
 		err = (blen == sizeof(struct fuse_write_out)) ? 0 : EINVAL;
 		break;
 
 	case FUSE_STATFS:
 		if (fuse_libabi_geq(ftick->tk_data, 7, 4)) {
 			err = (blen == sizeof(struct fuse_statfs_out)) ? 
 			  0 : EINVAL;
 		} else {
 			err = (blen == FUSE_COMPAT_STATFS_SIZE) ? 0 : EINVAL;
 		}
 		break;
 
 	case FUSE_RELEASE:
 		err = (blen == 0) ? 0 : EINVAL;
 		break;
 
 	case FUSE_FSYNC:
 		err = (blen == 0) ? 0 : EINVAL;
 		break;
 
 	case FUSE_SETXATTR:
 		err = (blen == 0) ? 0 : EINVAL;
 		break;
 
 	case FUSE_GETXATTR:
 	case FUSE_LISTXATTR:
 		/*
 		 * These can have varying response lengths, and 0 length
 		 * isn't necessarily invalid.
 		 */
 		err = 0;
 		break;
 
 	case FUSE_REMOVEXATTR:
 		err = (blen == 0) ? 0 : EINVAL;
 		break;
 
 	case FUSE_FLUSH:
 		err = (blen == 0) ? 0 : EINVAL;
 		break;
 
 	case FUSE_INIT:
 		if (blen == sizeof(struct fuse_init_out) || blen == 8) {
 			err = 0;
 		} else {
 			err = EINVAL;
 		}
 		break;
 
 	case FUSE_OPENDIR:
 		err = (blen == sizeof(struct fuse_open_out)) ? 0 : EINVAL;
 		break;
 
 	case FUSE_READDIR:
 		err = (((struct fuse_read_in *)(
 		    (char *)ftick->tk_ms_fiov.base +
 		    sizeof(struct fuse_in_header)
 		    ))->size >= blen) ? 0 : EINVAL;
 		break;
 
 	case FUSE_RELEASEDIR:
 		err = (blen == 0) ? 0 : EINVAL;
 		break;
 
 	case FUSE_FSYNCDIR:
 		err = (blen == 0) ? 0 : EINVAL;
 		break;
 
 	case FUSE_GETLK:
 		panic("FUSE: no response body format check for FUSE_GETLK");
 		break;
 
 	case FUSE_SETLK:
 		panic("FUSE: no response body format check for FUSE_SETLK");
 		break;
 
 	case FUSE_SETLKW:
 		panic("FUSE: no response body format check for FUSE_SETLKW");
 		break;
 
 	case FUSE_ACCESS:
 		err = (blen == 0) ? 0 : EINVAL;
 		break;
 
 	case FUSE_CREATE:
 		err = (blen == sizeof(struct fuse_entry_out) +
 		    sizeof(struct fuse_open_out)) ? 0 : EINVAL;
 		break;
 
 	case FUSE_DESTROY:
 		err = (blen == 0) ? 0 : EINVAL;
 		break;
 
 	default:
 		panic("FUSE: opcodes out of sync (%d)\n", opcode);
 	}
 
 	return err;
 }
 
 static inline void
 fuse_setup_ihead(struct fuse_in_header *ihead, struct fuse_ticket *ftick,
     uint64_t nid, enum fuse_opcode op, size_t blen, pid_t pid,
     struct ucred *cred)
 {
 	ihead->len = sizeof(*ihead) + blen;
 	ihead->unique = ftick->tk_unique;
 	ihead->nodeid = nid;
 	ihead->opcode = op;
 
-	debug_printf("ihead=%p, ftick=%p, nid=%ju, op=%d, blen=%zu\n",
-	    ihead, ftick, (uintmax_t)nid, op, blen);
-
 	ihead->pid = pid;
 	ihead->uid = cred->cr_uid;
 	ihead->gid = cred->cr_rgid;
 }
 
 /*
  * fuse_standard_handler just pulls indata and wakes up pretender.
  * Doesn't try to interpret data, that's left for the pretender.
  * Though might do a basic size verification before the pull-in takes place
  */
 
 static int
 fuse_standard_handler(struct fuse_ticket *ftick, struct uio *uio)
 {
 	int err = 0;
 
-	debug_printf("ftick=%p, uio=%p\n", ftick, uio);
-
 	err = fticket_pull(ftick, uio);
 
 	fuse_lck_mtx_lock(ftick->tk_aw_mtx);
 
 	if (!fticket_answered(ftick)) {
 		fticket_set_answered(ftick);
 		ftick->tk_aw_errno = err;
 		wakeup(ftick);
 	}
 	fuse_lck_mtx_unlock(ftick->tk_aw_mtx);
 
 	return err;
 }
 
 void
 fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op,
     struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred)
 {
 	struct fuse_data *data = fuse_get_mpdata(mp);
 
-	debug_printf("fdip=%p, op=%d, mp=%p, nid=%ju\n",
-	    fdip, op, mp, (uintmax_t)nid);
-
 	if (fdip->tick) {
 		fticket_refresh(fdip->tick);
 	} else {
 		fdip->tick = fuse_ticket_fetch(data);
 	}
 
 	FUSE_DIMALLOC(&fdip->tick->tk_ms_fiov, fdip->finh,
 	    fdip->indata, fdip->iosize);
 
 	fuse_setup_ihead(fdip->finh, fdip->tick, nid, op, fdip->iosize, pid, cred);
 }
 
 void
 fdisp_make(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct mount *mp,
     uint64_t nid, struct thread *td, struct ucred *cred)
 {
 	RECTIFY_TDCR(td, cred);
 
 	return fdisp_make_pid(fdip, op, mp, nid, td->td_proc->p_pid, cred);
 }
 
 void
 fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op,
     struct vnode *vp, struct thread *td, struct ucred *cred)
 {
-	debug_printf("fdip=%p, op=%d, vp=%p\n", fdip, op, vp);
 	RECTIFY_TDCR(td, cred);
 	return fdisp_make_pid(fdip, op, vnode_mount(vp), VTOI(vp),
 	    td->td_proc->p_pid, cred);
 }
 
+SDT_PROBE_DEFINE2(fuse, , ipc, fdisp_wait_answ_error, "char*", "int");
+
 int
 fdisp_wait_answ(struct fuse_dispatcher *fdip)
 {
 	int err = 0;
 
 	fdip->answ_stat = 0;
 	fuse_insert_callback(fdip->tick, fuse_standard_handler);
 	fuse_insert_message(fdip->tick);
 
 	if ((err = fticket_wait_answer(fdip->tick))) {
-		debug_printf("IPC: interrupted, err = %d\n", err);
-
 		fuse_lck_mtx_lock(fdip->tick->tk_aw_mtx);
 
 		if (fticket_answered(fdip->tick)) {
 			/*
 	                 * Just between noticing the interrupt and getting here,
 	                 * the standard handler has completed his job.
 	                 * So we drop the ticket and exit as usual.
 	                 */
-			debug_printf("IPC: already answered\n");
+			SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error,
+				"IPC: interrupted, already answered", err);
 			fuse_lck_mtx_unlock(fdip->tick->tk_aw_mtx);
 			goto out;
 		} else {
 			/*
 	                 * So we were faster than the standard handler.
 	                 * Then by setting the answered flag we get *him*
 	                 * to drop the ticket.
 	                 */
-			debug_printf("IPC: setting to answered\n");
+			SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error,
+				"IPC: interrupted, setting to answered", err);
 			fticket_set_answered(fdip->tick);
 			fuse_lck_mtx_unlock(fdip->tick->tk_aw_mtx);
 			return err;
 		}
 	}
-	debug_printf("IPC: not interrupted, err = %d\n", err);
 
 	if (fdip->tick->tk_aw_errno) {
-		debug_printf("IPC: explicit EIO-ing, tk_aw_errno = %d\n",
-		    fdip->tick->tk_aw_errno);
+		SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error,
+			"IPC: explicit EIO-ing", fdip->tick->tk_aw_errno);
 		err = EIO;
 		goto out;
 	}
 	if ((err = fdip->tick->tk_aw_ohead.error)) {
-		debug_printf("IPC: setting status to %d\n",
-		    fdip->tick->tk_aw_ohead.error);
+		SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error,
+			"IPC: setting status", fdip->tick->tk_aw_ohead.error);
 		/*
 	         * This means a "proper" fuse syscall error.
 	         * We record this value so the caller will
 	         * be able to know it's not a boring messaging
 	         * failure, if she wishes so (and if not, she can
 	         * just simply propagate the return value of this routine).
 	         * [XXX Maybe a bitflag would do the job too,
 	         * if other flags needed, this will be converted thusly.]
 	         */
 		fdip->answ_stat = err;
 		goto out;
 	}
 	fdip->answ = fticket_resp(fdip->tick)->base;
 	fdip->iosize = fticket_resp(fdip->tick)->len;
 
-	debug_printf("IPC: all is well\n");
-
 	return 0;
 
 out:
-	debug_printf("IPC: dropping ticket, err = %d\n", err);
-
 	return err;
 }
 
 void
 fuse_ipc_init(void)
 {
 	ticket_zone = uma_zcreate("fuse_ticket", sizeof(struct fuse_ticket),
 	    fticket_ctor, fticket_dtor, fticket_init, fticket_fini,
 	    UMA_ALIGN_PTR, 0);
 }
 
 void
 fuse_ipc_destroy(void)
 {
 	uma_zdestroy(ticket_zone);
 }
Index: projects/capsicum-test/sys/fs/fuse/fuse_ipc.h
===================================================================
--- projects/capsicum-test/sys/fs/fuse/fuse_ipc.h	(revision 345709)
+++ projects/capsicum-test/sys/fs/fuse/fuse_ipc.h	(revision 345710)
@@ -1,412 +1,396 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Google Inc. and Amit Singh
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  * 
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of Google Inc. nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  * 
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  * 
  * Copyright (C) 2005 Csaba Henk.
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _FUSE_IPC_H_
 #define _FUSE_IPC_H_
 
 #include <sys/param.h>
 #include <sys/refcount.h>
 
 struct fuse_iov {
 	void   *base;
 	size_t  len;
 	size_t  allocated_size;
 	int     credit;
 };
 
 void fiov_init(struct fuse_iov *fiov, size_t size);
 void fiov_teardown(struct fuse_iov *fiov);
 void fiov_refresh(struct fuse_iov *fiov);
 void fiov_adjust(struct fuse_iov *fiov, size_t size);
 
 #define FUSE_DIMALLOC(fiov, spc1, spc2, amnt) do {		\
 	fiov_adjust(fiov, (sizeof(*(spc1)) + (amnt)));		\
 	(spc1) = (fiov)->base;					\
 	(spc2) = (char *)(fiov)->base + (sizeof(*(spc1)));	\
 } while (0)
 
 #define FU_AT_LEAST(siz) max((siz), 160)
 
 #define FUSE_ASSERT_AW_DONE(ftick)					\
 	KASSERT((ftick)->tk_aw_link.tqe_next == NULL &&			\
 	    (ftick)->tk_aw_link.tqe_prev == NULL,			\
 	    ("FUSE: ticket still on answer delivery list %p", (ftick)))
 
 #define FUSE_ASSERT_MS_DONE(ftick)				\
 	KASSERT((ftick)->tk_ms_link.stqe_next == NULL,		\
 	    ("FUSE: ticket still on message list %p", (ftick)))
 
 struct fuse_ticket;
 struct fuse_data;
 
 typedef int fuse_handler_t(struct fuse_ticket *ftick, struct uio *uio);
 
 struct fuse_ticket {
 	/* fields giving the identity of the ticket */
 	uint64_t			tk_unique;
 	struct fuse_data		*tk_data;
 	int				tk_flag;
 	u_int				tk_refcount;
 
 	/* fields for initiating an upgoing message */
 	struct fuse_iov			tk_ms_fiov;
 	void				*tk_ms_bufdata;
 	size_t				tk_ms_bufsize;
 	enum { FT_M_FIOV, FT_M_BUF }	tk_ms_type;
 	STAILQ_ENTRY(fuse_ticket)	tk_ms_link;
 
 	/* fields for handling answers coming from userspace */
 	struct fuse_iov			tk_aw_fiov;
 	void				*tk_aw_bufdata;
 	size_t				tk_aw_bufsize;
 	enum { FT_A_FIOV, FT_A_BUF }	tk_aw_type;
 
 	struct fuse_out_header		tk_aw_ohead;
 	int				tk_aw_errno;
 	struct mtx			tk_aw_mtx;
 	fuse_handler_t			*tk_aw_handler;
 	TAILQ_ENTRY(fuse_ticket)	tk_aw_link;
 };
 
 #define FT_ANSW  0x01  /* request of ticket has already been answered */
 #define FT_DIRTY 0x04  /* ticket has been used */
 
 static inline struct fuse_iov *
 fticket_resp(struct fuse_ticket *ftick)
 {
 	return (&ftick->tk_aw_fiov);
 }
 
 static inline bool
 fticket_answered(struct fuse_ticket *ftick)
 {
-	DEBUGX(FUSE_DEBUG_IPC, "-> ftick=%p\n", ftick);
 	mtx_assert(&ftick->tk_aw_mtx, MA_OWNED);
 	return (ftick->tk_flag & FT_ANSW);
 }
 
 static inline void
 fticket_set_answered(struct fuse_ticket *ftick)
 {
-	DEBUGX(FUSE_DEBUG_IPC, "-> ftick=%p\n", ftick);
 	mtx_assert(&ftick->tk_aw_mtx, MA_OWNED);
 	ftick->tk_flag |= FT_ANSW;
 }
 
 static inline enum fuse_opcode
 fticket_opcode(struct fuse_ticket *ftick)
 {
-	DEBUGX(FUSE_DEBUG_IPC, "-> ftick=%p\n", ftick);
 	return (((struct fuse_in_header *)(ftick->tk_ms_fiov.base))->opcode);
 }
 
 int fticket_pull(struct fuse_ticket *ftick, struct uio *uio);
 
 enum mountpri { FM_NOMOUNTED, FM_PRIMARY, FM_SECONDARY };
 
 /*
  * The data representing a FUSE session.
  */
 struct fuse_data {
 	struct cdev			*fdev;
 	struct mount			*mp;
 	struct vnode			*vroot;
 	struct ucred			*daemoncred;
 	int				dataflags;
 	int				ref;
 
 	struct mtx			ms_mtx;
 	STAILQ_HEAD(, fuse_ticket)	ms_head;
 
 	struct mtx			aw_mtx;
 	TAILQ_HEAD(, fuse_ticket)	aw_head;
 
 	u_long				ticketer;
 
 	struct sx			rename_lock;
 
 	uint32_t			fuse_libabi_major;
 	uint32_t			fuse_libabi_minor;
 
 	uint32_t			max_write;
 	uint32_t			max_read;
 	uint32_t			subtype;
 	char				volname[MAXPATHLEN];
 
 	struct selinfo			ks_rsel;
 
 	int				daemon_timeout;
 	uint64_t			notimpl;
 };
 
 #define FSESS_DEAD                0x0001 /* session is to be closed */
 #define FSESS_UNUSED0             0x0002 /* unused */
 #define FSESS_INITED              0x0004 /* session has been inited */
 #define FSESS_DAEMON_CAN_SPY      0x0010 /* let non-owners access this fs */
                                          /* (and being observed by the daemon) */
 #define FSESS_PUSH_SYMLINKS_IN    0x0020 /* prefix absolute symlinks with mp */
 #define FSESS_DEFAULT_PERMISSIONS 0x0040 /* kernel does permission checking */
 #define FSESS_NO_ATTRCACHE        0x0080 /* no attribute caching */
 #define FSESS_NO_READAHEAD        0x0100 /* no readaheads */
 #define FSESS_NO_DATACACHE        0x0200 /* disable buffer cache */
 #define FSESS_NO_NAMECACHE        0x0400 /* disable name cache */
 #define FSESS_NO_MMAP             0x0800 /* disable mmap */
 #define FSESS_BROKENIO            0x1000 /* fix broken io */
 
 enum fuse_data_cache_mode {
 	FUSE_CACHE_UC,
 	FUSE_CACHE_WT,
 	FUSE_CACHE_WB,
 };
 
 extern int fuse_data_cache_mode;
 extern int fuse_data_cache_invalidate;
 extern int fuse_mmap_enable;
 extern int fuse_sync_resize;
 extern int fuse_fix_broken_io;
 
 static inline struct fuse_data *
 fuse_get_mpdata(struct mount *mp)
 {
 	return mp->mnt_data;
 }
 
 static inline bool
 fsess_isimpl(struct mount *mp, int opcode)
 {
 	struct fuse_data *data = fuse_get_mpdata(mp);
 
 	return ((data->notimpl & (1ULL << opcode)) == 0);
 
 }
 static inline void
 fsess_set_notimpl(struct mount *mp, int opcode)
 {
 	struct fuse_data *data = fuse_get_mpdata(mp);
 
 	data->notimpl |= (1ULL << opcode);
 }
 
 static inline bool
 fsess_opt_datacache(struct mount *mp)
 {
 	struct fuse_data *data = fuse_get_mpdata(mp);
 
 	return (fuse_data_cache_mode != FUSE_CACHE_UC &&
 	    (data->dataflags & FSESS_NO_DATACACHE) == 0);
 }
 
 static inline bool
 fsess_opt_mmap(struct mount *mp)
 {
 	struct fuse_data *data = fuse_get_mpdata(mp);
 
 	if (!fuse_mmap_enable || fuse_data_cache_mode == FUSE_CACHE_UC)
 		return (false);
 	return ((data->dataflags & (FSESS_NO_DATACACHE | FSESS_NO_MMAP)) == 0);
 }
 
 static inline bool
 fsess_opt_brokenio(struct mount *mp)
 {
 	struct fuse_data *data = fuse_get_mpdata(mp);
 
 	return (fuse_fix_broken_io || (data->dataflags & FSESS_BROKENIO));
 }
 
 static inline void
 fuse_ms_push(struct fuse_ticket *ftick)
 {
-	DEBUGX(FUSE_DEBUG_IPC, "ftick=%p refcount=%d\n", ftick,
-	    ftick->tk_refcount + 1);
 	mtx_assert(&ftick->tk_data->ms_mtx, MA_OWNED);
 	refcount_acquire(&ftick->tk_refcount);
 	STAILQ_INSERT_TAIL(&ftick->tk_data->ms_head, ftick, tk_ms_link);
 }
 
 static inline struct fuse_ticket *
 fuse_ms_pop(struct fuse_data *data)
 {
 	struct fuse_ticket *ftick = NULL;
 
 	mtx_assert(&data->ms_mtx, MA_OWNED);
 
 	if ((ftick = STAILQ_FIRST(&data->ms_head))) {
 		STAILQ_REMOVE_HEAD(&data->ms_head, tk_ms_link);
 #ifdef INVARIANTS
 		ftick->tk_ms_link.stqe_next = NULL;
 #endif
 	}
-	DEBUGX(FUSE_DEBUG_IPC, "ftick=%p refcount=%d\n", ftick,
-	    ftick ? ftick->tk_refcount : -1);
 
 	return (ftick);
 }
 
 static inline void
 fuse_aw_push(struct fuse_ticket *ftick)
 {
-	DEBUGX(FUSE_DEBUG_IPC, "ftick=%p refcount=%d\n", ftick,
-	    ftick->tk_refcount + 1);
 	mtx_assert(&ftick->tk_data->aw_mtx, MA_OWNED);
 	refcount_acquire(&ftick->tk_refcount);
 	TAILQ_INSERT_TAIL(&ftick->tk_data->aw_head, ftick, tk_aw_link);
 }
 
 static inline void
 fuse_aw_remove(struct fuse_ticket *ftick)
 {
-	DEBUGX(FUSE_DEBUG_IPC, "ftick=%p refcount=%d\n",
-	    ftick, ftick->tk_refcount);
 	mtx_assert(&ftick->tk_data->aw_mtx, MA_OWNED);
 	TAILQ_REMOVE(&ftick->tk_data->aw_head, ftick, tk_aw_link);
 #ifdef INVARIANTS
 	ftick->tk_aw_link.tqe_next = NULL;
 	ftick->tk_aw_link.tqe_prev = NULL;
 #endif
 }
 
 static inline struct fuse_ticket *
 fuse_aw_pop(struct fuse_data *data)
 {
 	struct fuse_ticket *ftick;
 
 	mtx_assert(&data->aw_mtx, MA_OWNED);
 
 	if ((ftick = TAILQ_FIRST(&data->aw_head)) != NULL)
 		fuse_aw_remove(ftick);
-	DEBUGX(FUSE_DEBUG_IPC, "ftick=%p refcount=%d\n", ftick,
-	    ftick ? ftick->tk_refcount : -1);
 
 	return (ftick);
 }
 
 struct fuse_ticket *fuse_ticket_fetch(struct fuse_data *data);
 int fuse_ticket_drop(struct fuse_ticket *ftick);
 void fuse_insert_callback(struct fuse_ticket *ftick, fuse_handler_t *handler);
 void fuse_insert_message(struct fuse_ticket *ftick);
 
 static inline bool
 fuse_libabi_geq(struct fuse_data *data, uint32_t abi_maj, uint32_t abi_min)
 {
 	return (data->fuse_libabi_major > abi_maj ||
 	    (data->fuse_libabi_major == abi_maj &&
 	     data->fuse_libabi_minor >= abi_min));
 }
 
 struct fuse_data *fdata_alloc(struct cdev *dev, struct ucred *cred);
 void fdata_trydestroy(struct fuse_data *data);
 void fdata_set_dead(struct fuse_data *data);
 
 static inline bool
 fdata_get_dead(struct fuse_data *data)
 {
 	return (data->dataflags & FSESS_DEAD);
 }
 
 struct fuse_dispatcher {
 	struct fuse_ticket    *tick;
 	struct fuse_in_header *finh;
 
 	void    *indata;
 	size_t   iosize;
 	uint64_t nodeid;
 	int      answ_stat;
 	void    *answ;
 };
 
 static inline void
 fdisp_init(struct fuse_dispatcher *fdisp, size_t iosize)
 {
-	DEBUGX(FUSE_DEBUG_IPC, "-> fdisp=%p, iosize=%zx\n", fdisp, iosize);
 	fdisp->iosize = iosize;
 	fdisp->tick = NULL;
 }
 
 static inline void
 fdisp_destroy(struct fuse_dispatcher *fdisp)
 {
-	DEBUGX(FUSE_DEBUG_IPC, "-> fdisp=%p, ftick=%p\n", fdisp, fdisp->tick);
 	fuse_ticket_drop(fdisp->tick);
 #ifdef INVARIANTS
 	fdisp->tick = NULL;
 #endif
 }
 
 void fdisp_make(struct fuse_dispatcher *fdip, enum fuse_opcode op,
     struct mount *mp, uint64_t nid, struct thread *td, struct ucred *cred);
 
 void fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op,
     struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred);
 
 void fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op,
     struct vnode *vp, struct thread *td, struct ucred *cred);
 
 int fdisp_wait_answ(struct fuse_dispatcher *fdip);
 
 static inline int
 fdisp_simple_putget_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op,
     struct vnode *vp, struct thread *td, struct ucred *cred)
 {
-	DEBUGX(FUSE_DEBUG_IPC, "-> fdip=%p, opcode=%d, vp=%p\n", fdip, op, vp);
 	fdisp_make_vp(fdip, op, vp, td, cred);
 	return (fdisp_wait_answ(fdip));
 }
 
 #endif /* _FUSE_IPC_H_ */
Index: projects/capsicum-test/sys/fs/fuse/fuse_main.c
===================================================================
--- projects/capsicum-test/sys/fs/fuse/fuse_main.c	(revision 345709)
+++ projects/capsicum-test/sys/fs/fuse/fuse_main.c	(revision 345710)
@@ -1,165 +1,168 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Google Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of Google Inc. nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * Copyright (C) 2005 Csaba Henk.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/queue.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/stat.h>
 #include <sys/file.h>
 #include <sys/buf.h>
+#include <sys/sdt.h>
 #include <sys/sysctl.h>
 
 #include "fuse.h"
 
 static void fuse_bringdown(eventhandler_tag eh_tag);
 static int fuse_loader(struct module *m, int what, void *arg);
 
 struct mtx fuse_mtx;
 
 extern struct vfsops fuse_vfsops;
 extern struct cdevsw fuse_cdevsw;
 extern struct vop_vector fuse_vnops;
 extern uma_zone_t fuse_pbuf_zone;
 
 static struct vfsconf fuse_vfsconf = {
 	.vfc_version = VFS_VERSION,
 	.vfc_name = "fusefs",
 	.vfc_vfsops = &fuse_vfsops,
 	.vfc_typenum = -1,
 	.vfc_flags = VFCF_JAIL | VFCF_SYNTHETIC
 };
 
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, kernelabi_major, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, FUSE_KERNEL_VERSION, "FUSE kernel abi major version");
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, kernelabi_minor, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, FUSE_KERNEL_MINOR_VERSION, "FUSE kernel abi minor version");
+SDT_PROVIDER_DEFINE(fuse);
 
 /******************************
  *
  * >>> Module management stuff
  *
  ******************************/
 
 static void
 fuse_bringdown(eventhandler_tag eh_tag)
 {
 
 	fuse_ipc_destroy();
 	fuse_device_destroy();
 	mtx_destroy(&fuse_mtx);
 }
 
 static int
 fuse_loader(struct module *m, int what, void *arg)
 {
 	static eventhandler_tag eh_tag = NULL;
 	int err = 0;
 
 	switch (what) {
 	case MOD_LOAD:			/* kldload */
 		mtx_init(&fuse_mtx, "fuse_mtx", NULL, MTX_DEF);
 		err = fuse_device_init();
 		if (err) {
 			mtx_destroy(&fuse_mtx);
 			return (err);
 		}
 		fuse_ipc_init();
 		fuse_pbuf_zone = pbuf_zsecond_create("fusepbuf", nswbuf / 2);
 
 		/* vfs_modevent ignores its first arg */
 		if ((err = vfs_modevent(NULL, what, &fuse_vfsconf)))
 			fuse_bringdown(eh_tag);
 		else
 			printf("fuse-freebsd: version %s, FUSE ABI %d.%d\n",
 			    FUSE_FREEBSD_VERSION,
 			    FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
 
 		break;
 	case MOD_UNLOAD:
 		if ((err = vfs_modevent(NULL, what, &fuse_vfsconf)))
 			return (err);
 		fuse_bringdown(eh_tag);
 		uma_zdestroy(fuse_pbuf_zone);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	return (err);
 }
 
 /* Registering the module */
 
 static moduledata_t fuse_moddata = {
 	"fusefs",
 	fuse_loader,
 	&fuse_vfsconf
 };
 
 DECLARE_MODULE(fusefs, fuse_moddata, SI_SUB_VFS, SI_ORDER_MIDDLE);
 MODULE_VERSION(fusefs, 1);
Index: projects/capsicum-test/sys/fs/fuse/fuse_node.c
===================================================================
--- projects/capsicum-test/sys/fs/fuse/fuse_node.c	(revision 345709)
+++ projects/capsicum-test/sys/fs/fuse/fuse_node.c	(revision 345710)
@@ -1,432 +1,428 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Google Inc. and Amit Singh
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of Google Inc. nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * Copyright (C) 2005 Csaba Henk.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/mount.h>
 #include <sys/sysctl.h>
 #include <sys/fcntl.h>
 #include <sys/fnv_hash.h>
 #include <sys/priv.h>
 #include <security/mac/mac_framework.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #include "fuse.h"
 #include "fuse_node.h"
 #include "fuse_internal.h"
 #include "fuse_io.h"
 #include "fuse_ipc.h"
 
-#define FUSE_DEBUG_MODULE VNOPS
-#include "fuse_debug.h"
+SDT_PROVIDER_DECLARE(fuse);
+/* 
+ * Fuse trace probe:
+ * arg0: verbosity.  Higher numbers give more verbose messages
+ * arg1: Textual message
+ */
+SDT_PROBE_DEFINE2(fuse, , node, trace, "int", "char*");
 
 MALLOC_DEFINE(M_FUSEVN, "fuse_vnode", "fuse vnode private data");
 
 static int sysctl_fuse_cache_mode(SYSCTL_HANDLER_ARGS);
 
 static int fuse_node_count = 0;
 
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, node_count, CTLFLAG_RD,
     &fuse_node_count, 0, "Count of FUSE vnodes");
 
 int	fuse_data_cache_mode = FUSE_CACHE_WT;
 
 SYSCTL_PROC(_vfs_fusefs, OID_AUTO, data_cache_mode, CTLTYPE_INT|CTLFLAG_RW,
     &fuse_data_cache_mode, 0, sysctl_fuse_cache_mode, "I",
     "Zero: disable caching of FUSE file data; One: write-through caching "
     "(default); Two: write-back caching (generally unsafe)");
 
 int	fuse_data_cache_invalidate = 0;
 
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, data_cache_invalidate, CTLFLAG_RW,
     &fuse_data_cache_invalidate, 0,
     "If non-zero, discard cached clean file data when there are no active file"
     " users");
 
 int	fuse_mmap_enable = 1;
 
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, mmap_enable, CTLFLAG_RW,
     &fuse_mmap_enable, 0,
     "If non-zero, and data_cache_mode is also non-zero, enable mmap(2) of "
     "FUSE files");
 
 int	fuse_refresh_size = 0;
 
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, refresh_size, CTLFLAG_RW,
     &fuse_refresh_size, 0,
     "If non-zero, and no dirty file extension data is buffered, fetch file "
     "size before write operations");
 
 int	fuse_sync_resize = 1;
 
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, sync_resize, CTLFLAG_RW,
     &fuse_sync_resize, 0,
     "If a cached write extended a file, inform FUSE filesystem of the changed"
     "size immediately subsequent to the issued writes");
 
 int	fuse_fix_broken_io = 0;
 
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, fix_broken_io, CTLFLAG_RW,
     &fuse_fix_broken_io, 0,
     "If non-zero, print a diagnostic warning if a userspace filesystem returns"
     " EIO on reads of recently extended portions of files");
 
 static int
 sysctl_fuse_cache_mode(SYSCTL_HANDLER_ARGS)
 {
 	int val, error;
 
 	val = *(int *)arg1;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	switch (val) {
 	case FUSE_CACHE_UC:
 	case FUSE_CACHE_WT:
 	case FUSE_CACHE_WB:
 		*(int *)arg1 = val;
 		break;
 	default:
 		return (EDOM);
 	}
 	return (0);
 }
 
 static void
 fuse_vnode_init(struct vnode *vp, struct fuse_vnode_data *fvdat,
     uint64_t nodeid, enum vtype vtyp)
 {
 	int i;
 
 	fvdat->nid = nodeid;
 	vattr_null(&fvdat->cached_attrs);
 	if (nodeid == FUSE_ROOT_ID) {
 		vp->v_vflag |= VV_ROOT;
 	}
 	vp->v_type = vtyp;
 	vp->v_data = fvdat;
 
 	for (i = 0; i < FUFH_MAXTYPE; i++)
 		fvdat->fufh[i].fh_type = FUFH_INVALID;
 
 	atomic_add_acq_int(&fuse_node_count, 1);
 }
 
 void
 fuse_vnode_destroy(struct vnode *vp)
 {
 	struct fuse_vnode_data *fvdat = vp->v_data;
 
 	vp->v_data = NULL;
 	free(fvdat, M_FUSEVN);
 
 	atomic_subtract_acq_int(&fuse_node_count, 1);
 }
 
 static int
 fuse_vnode_cmp(struct vnode *vp, void *nidp)
 {
 	return (VTOI(vp) != *((uint64_t *)nidp));
 }
 
 static uint32_t inline
 fuse_vnode_hash(uint64_t id)
 {
 	return (fnv_32_buf(&id, sizeof(id), FNV1_32_INIT));
 }
 
 static int
 fuse_vnode_alloc(struct mount *mp,
     struct thread *td,
     uint64_t nodeid,
     enum vtype vtyp,
     struct vnode **vpp)
 {
 	struct fuse_vnode_data *fvdat;
 	struct vnode *vp2;
 	int err = 0;
 
-	FS_DEBUG("been asked for vno #%ju\n", (uintmax_t)nodeid);
-
 	if (vtyp == VNON) {
 		return EINVAL;
 	}
 	*vpp = NULL;
 	err = vfs_hash_get(mp, fuse_vnode_hash(nodeid), LK_EXCLUSIVE, td, vpp,
 	    fuse_vnode_cmp, &nodeid);
 	if (err)
 		return (err);
 
 	if (*vpp) {
 		MPASS((*vpp)->v_type == vtyp && (*vpp)->v_data != NULL);
-		FS_DEBUG("vnode taken from hash\n");
+		SDT_PROBE2(fuse, , node, trace, 1, "vnode taken from hash");
 		return (0);
 	}
 	fvdat = malloc(sizeof(*fvdat), M_FUSEVN, M_WAITOK | M_ZERO);
 	err = getnewvnode("fuse", mp, &fuse_vnops, vpp);
 	if (err) {
 		free(fvdat, M_FUSEVN);
 		return (err);
 	}
 	lockmgr((*vpp)->v_vnlock, LK_EXCLUSIVE, NULL);
 	fuse_vnode_init(*vpp, fvdat, nodeid, vtyp);
 	err = insmntque(*vpp, mp);
 	ASSERT_VOP_ELOCKED(*vpp, "fuse_vnode_alloc");
 	if (err) {
 		free(fvdat, M_FUSEVN);
 		*vpp = NULL;
 		return (err);
 	}
 	err = vfs_hash_insert(*vpp, fuse_vnode_hash(nodeid), LK_EXCLUSIVE,
 	    td, &vp2, fuse_vnode_cmp, &nodeid);
 	if (err)
 		return (err);
 	if (vp2 != NULL) {
 		*vpp = vp2;
 		return (0);
 	}
 
 	ASSERT_VOP_ELOCKED(*vpp, "fuse_vnode_alloc");
 
 	return (0);
 }
 
 int
 fuse_vnode_get(struct mount *mp,
     struct fuse_entry_out *feo,
     uint64_t nodeid,
     struct vnode *dvp,
     struct vnode **vpp,
     struct componentname *cnp,
     enum vtype vtyp)
 {
 	struct thread *td = (cnp != NULL ? cnp->cn_thread : curthread);
 	int err = 0;
 
-	debug_printf("dvp=%p\n", dvp);
-
 	err = fuse_vnode_alloc(mp, td, nodeid, vtyp, vpp);
 	if (err) {
 		return err;
 	}
 	if (dvp != NULL) {
 		MPASS((cnp->cn_flags & ISDOTDOT) == 0);
 		MPASS(!(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'));
 		fuse_vnode_setparent(*vpp, dvp);
 	}
 	if (dvp != NULL && cnp != NULL && (cnp->cn_flags & MAKEENTRY) != 0 &&
 	    feo != NULL &&
 	    (feo->entry_valid != 0 || feo->entry_valid_nsec != 0)) {
 		ASSERT_VOP_LOCKED(*vpp, "fuse_vnode_get");
 		ASSERT_VOP_LOCKED(dvp, "fuse_vnode_get");
 		cache_enter(dvp, *vpp, cnp);
 	}
 
 	/*
 	 * In userland, libfuse uses cached lookups for dot and dotdot entries,
 	 * thus it does not really bump the nlookup counter for forget.
 	 * Follow the same semantic and avoid tu bump it in order to keep
 	 * nlookup counters consistent.
 	 */
 	if (cnp == NULL || ((cnp->cn_flags & ISDOTDOT) == 0 &&
 	    (cnp->cn_namelen != 1 || cnp->cn_nameptr[0] != '.')))
 		VTOFUD(*vpp)->nlookup++;
 
 	return 0;
 }
 
 void
 fuse_vnode_open(struct vnode *vp, int32_t fuse_open_flags, struct thread *td)
 {
 	/*
 	 * Funcation is called for every vnode open.
 	 * Merge fuse_open_flags it may be 0
 	 */
 	/*
 	 * Ideally speaking, direct io should be enabled on
 	 * fd's but do not see of any way of providing that
 	 * this implementation.
 	 *
 	 * Also cannot think of a reason why would two
 	 * different fd's on same vnode would like
 	 * have DIRECT_IO turned on and off. But linux
 	 * based implementation works on an fd not an
 	 * inode and provides such a feature.
 	 *
 	 * XXXIP: Handle fd based DIRECT_IO
 	 */
 	if (fuse_open_flags & FOPEN_DIRECT_IO) {
 		ASSERT_VOP_ELOCKED(vp, __func__);
 		VTOFUD(vp)->flag |= FN_DIRECTIO;
 		fuse_io_invalbuf(vp, td);
 	} else {
 		if ((fuse_open_flags & FOPEN_KEEP_CACHE) == 0)
 			fuse_io_invalbuf(vp, td);
 	        VTOFUD(vp)->flag &= ~FN_DIRECTIO;
 	}
 
 	if (vnode_vtype(vp) == VREG) {
 		/* XXXIP prevent getattr, by using cached node size */
 		vnode_create_vobject(vp, 0, td);
 	}
 }
 
 int
 fuse_vnode_savesize(struct vnode *vp, struct ucred *cred)
 {
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct thread *td = curthread;
 	struct fuse_filehandle *fufh = NULL;
 	struct fuse_dispatcher fdi;
 	struct fuse_setattr_in *fsai;
 	int err = 0;
 
-	FS_DEBUG("inode=%ju size=%ju\n", (uintmax_t)VTOI(vp),
-	    (uintmax_t)fvdat->filesize);
 	ASSERT_VOP_ELOCKED(vp, "fuse_io_extend");
 
 	if (fuse_isdeadfs(vp)) {
 		return EBADF;
 	}
 	if (vnode_vtype(vp) == VDIR) {
 		return EISDIR;
 	}
 	if (vfs_isrdonly(vnode_mount(vp))) {
 		return EROFS;
 	}
 	if (cred == NULL) {
 		cred = td->td_ucred;
 	}
 	fdisp_init(&fdi, sizeof(*fsai));
 	fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred);
 	fsai = fdi.indata;
 	fsai->valid = 0;
 
 	/* Truncate to a new value. */
 	fsai->size = fvdat->filesize;
 	fsai->valid |= FATTR_SIZE;
 
 	fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh);
 	if (fufh) {
 		fsai->fh = fufh->fh_id;
 		fsai->valid |= FATTR_FH;
 	}
 	err = fdisp_wait_answ(&fdi);
 	fdisp_destroy(&fdi);
 	if (err == 0)
 		fvdat->flag &= ~FN_SIZECHANGE;
 
 	return err;
 }
 
 void
 fuse_vnode_refreshsize(struct vnode *vp, struct ucred *cred)
 {
 
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct vattr va;
 
 	if ((fvdat->flag & FN_SIZECHANGE) != 0 ||
 	    fuse_data_cache_mode == FUSE_CACHE_UC ||
 	    (fuse_refresh_size == 0 && fvdat->filesize != 0))
 		return;
 
 	VOP_GETATTR(vp, &va, cred);
-	FS_DEBUG("refreshed file size: %jd\n", (intmax_t)VTOFUD(vp)->filesize);
+	SDT_PROBE2(fuse, , node, trace, 1, "refreshed file size");
 }
 
 int
 fuse_vnode_setsize(struct vnode *vp, struct ucred *cred, off_t newsize)
 {
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	off_t oldsize;
 	int err = 0;
 
-	FS_DEBUG("inode=%ju oldsize=%ju newsize=%ju\n",
-	    (uintmax_t)VTOI(vp), (uintmax_t)fvdat->filesize,
-	    (uintmax_t)newsize);
 	ASSERT_VOP_ELOCKED(vp, "fuse_vnode_setsize");
 
 	oldsize = fvdat->filesize;
 	fvdat->filesize = newsize;
 	fvdat->flag |= FN_SIZECHANGE;
 
 	if (newsize < oldsize) {
 		err = vtruncbuf(vp, cred, newsize, fuse_iosize(vp));
 	}
 	vnode_pager_setsize(vp, newsize);
 	return err;
 }
Index: projects/capsicum-test/sys/fs/fuse/fuse_vfsops.c
===================================================================
--- projects/capsicum-test/sys/fs/fuse/fuse_vfsops.c	(revision 345709)
+++ projects/capsicum-test/sys/fs/fuse/fuse_vfsops.c	(revision 345710)
@@ -1,533 +1,529 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Google Inc. and Amit Singh
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of Google Inc. nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * Copyright (C) 2005 Csaba Henk.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/buf.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/filedesc.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/mount.h>
 #include <sys/sysctl.h>
 #include <sys/fcntl.h>
 
 #include "fuse.h"
 #include "fuse_param.h"
 #include "fuse_node.h"
 #include "fuse_ipc.h"
 #include "fuse_internal.h"
 
 #include <sys/priv.h>
 #include <security/mac/mac_framework.h>
 
-#define FUSE_DEBUG_MODULE VFSOPS
-#include "fuse_debug.h"
+SDT_PROVIDER_DECLARE(fuse);
+/* 
+ * Fuse trace probe:
+ * arg0: verbosity.  Higher numbers give more verbose messages
+ * arg1: Textual message
+ */
+SDT_PROBE_DEFINE2(fuse, , vfsops, trace, "int", "char*");
 
 /* This will do for privilege types for now */
 #ifndef PRIV_VFS_FUSE_ALLOWOTHER
 #define PRIV_VFS_FUSE_ALLOWOTHER PRIV_VFS_MOUNT_NONUSER
 #endif
 #ifndef PRIV_VFS_FUSE_MOUNT_NONUSER
 #define PRIV_VFS_FUSE_MOUNT_NONUSER PRIV_VFS_MOUNT_NONUSER
 #endif
 #ifndef PRIV_VFS_FUSE_SYNC_UNMOUNT
 #define PRIV_VFS_FUSE_SYNC_UNMOUNT PRIV_VFS_MOUNT_NONUSER
 #endif
 
 static vfs_mount_t fuse_vfsop_mount;
 static vfs_unmount_t fuse_vfsop_unmount;
 static vfs_root_t fuse_vfsop_root;
 static vfs_statfs_t fuse_vfsop_statfs;
 
 struct vfsops fuse_vfsops = {
 	.vfs_mount = fuse_vfsop_mount,
 	.vfs_unmount = fuse_vfsop_unmount,
 	.vfs_root = fuse_vfsop_root,
 	.vfs_statfs = fuse_vfsop_statfs,
 };
 
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, init_backgrounded, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 1, "indicate async handshake");
 static int fuse_enforce_dev_perms = 0;
 
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, enforce_dev_perms, CTLFLAG_RW,
     &fuse_enforce_dev_perms, 0,
     "enforce fuse device permissions for secondary mounts");
 static unsigned sync_unmount = 1;
 
 SYSCTL_UINT(_vfs_fusefs, OID_AUTO, sync_unmount, CTLFLAG_RW,
     &sync_unmount, 0, "specify when to use synchronous unmount");
 
 MALLOC_DEFINE(M_FUSEVFS, "fuse_filesystem", "buffer for fuse vfs layer");
 
 static int
 fuse_getdevice(const char *fspec, struct thread *td, struct cdev **fdevp)
 {
 	struct nameidata nd, *ndp = &nd;
 	struct vnode *devvp;
 	struct cdev *fdev;
 	int err;
 
 	/*
 	 * Not an update, or updating the name: look up the name
 	 * and verify that it refers to a sensible disk device.
 	 */
 
 	NDINIT(ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, fspec, td);
 	if ((err = namei(ndp)) != 0)
 		return err;
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 	devvp = ndp->ni_vp;
 
 	if (devvp->v_type != VCHR) {
 		vrele(devvp);
 		return ENXIO;
 	}
 	fdev = devvp->v_rdev;
 	dev_ref(fdev);
 
 	if (fuse_enforce_dev_perms) {
 		/*
 	         * Check if mounter can open the fuse device.
 	         *
 	         * This has significance only if we are doing a secondary mount
 	         * which doesn't involve actually opening fuse devices, but we
 	         * still want to enforce the permissions of the device (in
 	         * order to keep control over the circle of fuse users).
 	         *
 	         * (In case of primary mounts, we are either the superuser so
 	         * we can do anything anyway, or we can mount only if the
 	         * device is already opened by us, ie. we are permitted to open
 	         * the device.)
 	         */
 #if 0
 #ifdef MAC
 		err = mac_check_vnode_open(td->td_ucred, devvp, VREAD | VWRITE);
 		if (!err)
 #endif
 #endif /* 0 */
 			err = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td);
 		if (err) {
 			vrele(devvp);
 			dev_rel(fdev);
 			return err;
 		}
 	}
 	/*
 	 * according to coda code, no extra lock is needed --
 	 * although in sys/vnode.h this field is marked "v"
 	 */
 	vrele(devvp);
 
 	if (!fdev->si_devsw ||
 	    strcmp("fuse", fdev->si_devsw->d_name)) {
 		dev_rel(fdev);
 		return ENXIO;
 	}
 	*fdevp = fdev;
 
 	return 0;
 }
 
 #define FUSE_FLAGOPT(fnam, fval) do {				\
 	vfs_flagopt(opts, #fnam, &mntopts, fval);		\
 	vfs_flagopt(opts, "__" #fnam, &__mntopts, fval);	\
 } while (0)
 
+SDT_PROBE_DEFINE1(fuse, , vfsops, mntopts, "uint64_t");
+SDT_PROBE_DEFINE4(fuse, , vfsops, mount_err, "char*", "struct fuse_data*",
+	"struct mount*", "int");
+
 static int
 fuse_vfsop_mount(struct mount *mp)
 {
 	int err;
 
 	uint64_t mntopts, __mntopts;
 	uint32_t max_read;
 	int daemon_timeout;
 	int fd;
 
 	size_t len;
 
 	struct cdev *fdev;
 	struct fuse_data *data;
 	struct thread *td;
 	struct file *fp, *fptmp;
 	char *fspec, *subtype;
 	struct vfsoptlist *opts;
 
 	subtype = NULL;
 	max_read = ~0;
 	err = 0;
 	mntopts = 0;
 	__mntopts = 0;
 	td = curthread;
 
-	fuse_trace_printf_vfsop();
-
 	if (mp->mnt_flag & MNT_UPDATE)
 		return EOPNOTSUPP;
 
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_SYNCHRONOUS;
 	mp->mnt_data = NULL;
 	MNT_IUNLOCK(mp);
 	/* Get the new options passed to mount */
 	opts = mp->mnt_optnew;
 
 	if (!opts)
 		return EINVAL;
 
 	/* `fspath' contains the mount point (eg. /mnt/fuse/sshfs); REQUIRED */
 	if (!vfs_getopts(opts, "fspath", &err))
 		return err;
 
 	/* `from' contains the device name (eg. /dev/fuse0); REQUIRED */
 	fspec = vfs_getopts(opts, "from", &err);
 	if (!fspec)
 		return err;
 
 	/* `fd' contains the filedescriptor for this session; REQUIRED */
 	if (vfs_scanopt(opts, "fd", "%d", &fd) != 1)
 		return EINVAL;
 
 	err = fuse_getdevice(fspec, td, &fdev);
 	if (err != 0)
 		return err;
 
 	/*
 	 * With the help of underscored options the mount program
 	 * can inform us from the flags it sets by default
 	 */
 	FUSE_FLAGOPT(allow_other, FSESS_DAEMON_CAN_SPY);
 	FUSE_FLAGOPT(push_symlinks_in, FSESS_PUSH_SYMLINKS_IN);
 	FUSE_FLAGOPT(default_permissions, FSESS_DEFAULT_PERMISSIONS);
 	FUSE_FLAGOPT(no_attrcache, FSESS_NO_ATTRCACHE);
 	FUSE_FLAGOPT(no_readahed, FSESS_NO_READAHEAD);
 	FUSE_FLAGOPT(no_datacache, FSESS_NO_DATACACHE);
 	FUSE_FLAGOPT(no_namecache, FSESS_NO_NAMECACHE);
 	FUSE_FLAGOPT(no_mmap, FSESS_NO_MMAP);
 	FUSE_FLAGOPT(brokenio, FSESS_BROKENIO);
 
 	(void)vfs_scanopt(opts, "max_read=", "%u", &max_read);
 	if (vfs_scanopt(opts, "timeout=", "%u", &daemon_timeout) == 1) {
 		if (daemon_timeout < FUSE_MIN_DAEMON_TIMEOUT)
 			daemon_timeout = FUSE_MIN_DAEMON_TIMEOUT;
 		else if (daemon_timeout > FUSE_MAX_DAEMON_TIMEOUT)
 			daemon_timeout = FUSE_MAX_DAEMON_TIMEOUT;
 	} else {
 		daemon_timeout = FUSE_DEFAULT_DAEMON_TIMEOUT;
 	}
 	subtype = vfs_getopts(opts, "subtype=", &err);
 
-	FS_DEBUG2G("mntopts 0x%jx\n", (uintmax_t)mntopts);
+	SDT_PROBE1(fuse, , vfsops, mntopts, mntopts);
 
 	err = fget(td, fd, &cap_read_rights, &fp);
 	if (err != 0) {
-		FS_DEBUG("invalid or not opened device: data=%p\n", data);
+		SDT_PROBE2(fuse, , vfsops, trace, 1,
+			"invalid or not opened device");
 		goto out;
 	}
 	fptmp = td->td_fpop;
 	td->td_fpop = fp;
 	err = devfs_get_cdevpriv((void **)&data);
 	td->td_fpop = fptmp;
 	fdrop(fp, td);
 	FUSE_LOCK();
 	if (err != 0 || data == NULL || data->mp != NULL) {
-		FS_DEBUG("invalid or not opened device: data=%p data.mp=%p\n",
-		    data, data != NULL ? data->mp : NULL);
 		err = ENXIO;
+		SDT_PROBE4(fuse, , vfsops, mount_err,
+			"invalid or not opened device", data, mp, err);
 		FUSE_UNLOCK();
 		goto out;
 	}
 	if (fdata_get_dead(data)) {
-		FS_DEBUG("device is dead during mount: data=%p\n", data);
 		err = ENOTCONN;
+		SDT_PROBE4(fuse, , vfsops, mount_err,
+			"device is dead during mount", data, mp, err);
 		FUSE_UNLOCK();
 		goto out;
 	}
 	/* Sanity + permission checks */
 	if (!data->daemoncred)
 		panic("fuse daemon found, but identity unknown");
 	if (mntopts & FSESS_DAEMON_CAN_SPY)
 		err = priv_check(td, PRIV_VFS_FUSE_ALLOWOTHER);
 	if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid)
 		/* are we allowed to do the first mount? */
 		err = priv_check(td, PRIV_VFS_FUSE_MOUNT_NONUSER);
 	if (err) {
 		FUSE_UNLOCK();
 		goto out;
 	}
 	data->ref++;
 	data->mp = mp;
 	data->dataflags |= mntopts;
 	data->max_read = max_read;
 	data->daemon_timeout = daemon_timeout;
 	FUSE_UNLOCK();
 
 	vfs_getnewfsid(mp);
 	MNT_ILOCK(mp);
 	mp->mnt_data = data;
 	mp->mnt_flag |= MNT_LOCAL;
 	mp->mnt_kern_flag |= MNTK_USES_BCACHE;
 	MNT_IUNLOCK(mp);
 	/* We need this here as this slot is used by getnewvnode() */
 	mp->mnt_stat.f_iosize = maxbcachebuf;
 	if (subtype) {
 		strlcat(mp->mnt_stat.f_fstypename, ".", MFSNAMELEN);
 		strlcat(mp->mnt_stat.f_fstypename, subtype, MFSNAMELEN);
 	}
 	copystr(fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &len);
 	bzero(mp->mnt_stat.f_mntfromname + len, MNAMELEN - len);
-	FS_DEBUG2G("mp %p: %s\n", mp, mp->mnt_stat.f_mntfromname);
 
 	/* Now handshaking with daemon */
 	fuse_internal_send_init(data, td);
 
 out:
 	if (err) {
 		FUSE_LOCK();
 		if (data->mp == mp) {
 			/*
 			 * Destroy device only if we acquired reference to
 			 * it
 			 */
-			FS_DEBUG("mount failed, destroy device: data=%p mp=%p"
-			      " err=%d\n",
-			    data, mp, err);
+			SDT_PROBE4(fuse, , vfsops, mount_err,
+				"mount failed, destroy device", data, mp, err);
 			data->mp = NULL;
 			fdata_trydestroy(data);
 		}
 		FUSE_UNLOCK();
 		dev_rel(fdev);
 	}
 	return err;
 }
 
 static int
 fuse_vfsop_unmount(struct mount *mp, int mntflags)
 {
 	int err = 0;
 	int flags = 0;
 
 	struct cdev *fdev;
 	struct fuse_data *data;
 	struct fuse_dispatcher fdi;
 	struct thread *td = curthread;
 
-	fuse_trace_printf_vfsop();
-
 	if (mntflags & MNT_FORCE) {
 		flags |= FORCECLOSE;
 	}
 	data = fuse_get_mpdata(mp);
 	if (!data) {
 		panic("no private data for mount point?");
 	}
 	/* There is 1 extra root vnode reference (mp->mnt_data). */
 	FUSE_LOCK();
 	if (data->vroot != NULL) {
 		struct vnode *vroot = data->vroot;
 
 		data->vroot = NULL;
 		FUSE_UNLOCK();
 		vrele(vroot);
 	} else
 		FUSE_UNLOCK();
 	err = vflush(mp, 0, flags, td);
 	if (err) {
-		debug_printf("vflush failed");
 		return err;
 	}
 	if (fdata_get_dead(data)) {
 		goto alreadydead;
 	}
 	fdisp_init(&fdi, 0);
 	fdisp_make(&fdi, FUSE_DESTROY, mp, 0, td, NULL);
 
 	err = fdisp_wait_answ(&fdi);
 	fdisp_destroy(&fdi);
 
 	fdata_set_dead(data);
 
 alreadydead:
 	FUSE_LOCK();
 	data->mp = NULL;
 	fdev = data->fdev;
 	fdata_trydestroy(data);
 	FUSE_UNLOCK();
 
 	MNT_ILOCK(mp);
 	mp->mnt_data = NULL;
 	mp->mnt_flag &= ~MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 
 	dev_rel(fdev);
 
 	return 0;
 }
 
 static int
 fuse_vfsop_root(struct mount *mp, int lkflags, struct vnode **vpp)
 {
 	struct fuse_data *data = fuse_get_mpdata(mp);
 	int err = 0;
 
 	if (data->vroot != NULL) {
 		err = vget(data->vroot, lkflags, curthread);
 		if (err == 0)
 			*vpp = data->vroot;
 	} else {
 		err = fuse_vnode_get(mp, NULL, FUSE_ROOT_ID, NULL, vpp, NULL,
 		    VDIR);
 		if (err == 0) {
 			FUSE_LOCK();
 			MPASS(data->vroot == NULL || data->vroot == *vpp);
 			if (data->vroot == NULL) {
-				FS_DEBUG("new root vnode\n");
+				SDT_PROBE2(fuse, , vfsops, trace, 1,
+					"new root vnode");
 				data->vroot = *vpp;
 				FUSE_UNLOCK();
 				vref(*vpp);
 			} else if (data->vroot != *vpp) {
-				FS_DEBUG("root vnode race\n");
+				SDT_PROBE2(fuse, , vfsops, trace, 1,
+					"root vnode race");
 				FUSE_UNLOCK();
 				VOP_UNLOCK(*vpp, 0);
 				vrele(*vpp);
 				vrecycle(*vpp);
 				*vpp = data->vroot;
 			} else
 				FUSE_UNLOCK();
 		}
 	}
 	return err;
 }
 
 static int
 fuse_vfsop_statfs(struct mount *mp, struct statfs *sbp)
 {
 	struct fuse_dispatcher fdi;
 	int err = 0;
 
 	struct fuse_statfs_out *fsfo;
 	struct fuse_data *data;
 
-	FS_DEBUG2G("mp %p: %s\n", mp, mp->mnt_stat.f_mntfromname);
 	data = fuse_get_mpdata(mp);
 
 	if (!(data->dataflags & FSESS_INITED))
 		goto fake;
 
 	fdisp_init(&fdi, 0);
 	fdisp_make(&fdi, FUSE_STATFS, mp, FUSE_ROOT_ID, NULL, NULL);
 	err = fdisp_wait_answ(&fdi);
 	if (err) {
 		fdisp_destroy(&fdi);
 		if (err == ENOTCONN) {
 			/*
 	                 * We want to seem a legitimate fs even if the daemon
 	                 * is stiff dead... (so that, eg., we can still do path
 	                 * based unmounting after the daemon dies).
 	                 */
 			goto fake;
 		}
 		return err;
 	}
 	fsfo = fdi.answ;
 
 	sbp->f_blocks = fsfo->st.blocks;
 	sbp->f_bfree = fsfo->st.bfree;
 	sbp->f_bavail = fsfo->st.bavail;
 	sbp->f_files = fsfo->st.files;
 	sbp->f_ffree = fsfo->st.ffree;	/* cast from uint64_t to int64_t */
 	sbp->f_namemax = fsfo->st.namelen;
 	sbp->f_bsize = fsfo->st.frsize;	/* cast from uint32_t to uint64_t */
-
-	FS_DEBUG("fuse_statfs_out -- blocks: %llu, bfree: %llu, bavail: %llu, "
-	      "fil	es: %llu, ffree: %llu, bsize: %i, namelen: %i\n",
-	      (unsigned long long)fsfo->st.blocks, 
-	      (unsigned long long)fsfo->st.bfree,
-	      (unsigned long long)fsfo->st.bavail, 
-	      (unsigned long long)fsfo->st.files,
-	      (unsigned long long)fsfo->st.ffree, fsfo->st.bsize, 
-	      fsfo->st.namelen);
 
 	fdisp_destroy(&fdi);
 	return 0;
 
 fake:
 	sbp->f_blocks = 0;
 	sbp->f_bfree = 0;
 	sbp->f_bavail = 0;
 	sbp->f_files = 0;
 	sbp->f_ffree = 0;
 	sbp->f_namemax = 0;
 	sbp->f_bsize = FUSE_DEFAULT_BLOCKSIZE;
 
 	return 0;
 }
Index: projects/capsicum-test/sys/fs/fuse/fuse_vnops.c
===================================================================
--- projects/capsicum-test/sys/fs/fuse/fuse_vnops.c	(revision 345709)
+++ projects/capsicum-test/sys/fs/fuse/fuse_vnops.c	(revision 345710)
@@ -1,2422 +1,2375 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Google Inc. and Amit Singh
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of Google Inc. nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * Copyright (C) 2005 Csaba Henk.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include <sys/types.h>
+#include <sys/param.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
-#include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/extattr.h>
 #include <sys/stat.h>
 #include <sys/unistd.h>
 #include <sys/filedesc.h>
 #include <sys/file.h>
 #include <sys/fcntl.h>
 #include <sys/dirent.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 #include <vm/vm_object.h>
 
 #include "fuse.h"
 #include "fuse_file.h"
 #include "fuse_internal.h"
 #include "fuse_ipc.h"
 #include "fuse_node.h"
 #include "fuse_param.h"
 #include "fuse_io.h"
 
 #include <sys/priv.h>
 
-#define FUSE_DEBUG_MODULE VNOPS
-#include "fuse_debug.h"
+SDT_PROVIDER_DECLARE(fuse);
+/* 
+ * Fuse trace probe:
+ * arg0: verbosity.  Higher numbers give more verbose messages
+ * arg1: Textual message
+ */
+SDT_PROBE_DEFINE2(fuse, , vnops, trace, "int", "char*");
 
 /* vnode ops */
 static vop_access_t fuse_vnop_access;
 static vop_close_t fuse_vnop_close;
 static vop_create_t fuse_vnop_create;
 static vop_deleteextattr_t fuse_vnop_deleteextattr;
 static vop_fsync_t fuse_vnop_fsync;
 static vop_getattr_t fuse_vnop_getattr;
 static vop_getextattr_t fuse_vnop_getextattr;
 static vop_inactive_t fuse_vnop_inactive;
 static vop_link_t fuse_vnop_link;
 static vop_listextattr_t fuse_vnop_listextattr;
 static vop_lookup_t fuse_vnop_lookup;
 static vop_mkdir_t fuse_vnop_mkdir;
 static vop_mknod_t fuse_vnop_mknod;
 static vop_open_t fuse_vnop_open;
 static vop_pathconf_t fuse_vnop_pathconf;
 static vop_read_t fuse_vnop_read;
 static vop_readdir_t fuse_vnop_readdir;
 static vop_readlink_t fuse_vnop_readlink;
 static vop_reclaim_t fuse_vnop_reclaim;
 static vop_remove_t fuse_vnop_remove;
 static vop_rename_t fuse_vnop_rename;
 static vop_rmdir_t fuse_vnop_rmdir;
 static vop_setattr_t fuse_vnop_setattr;
 static vop_setextattr_t fuse_vnop_setextattr;
 static vop_strategy_t fuse_vnop_strategy;
 static vop_symlink_t fuse_vnop_symlink;
 static vop_write_t fuse_vnop_write;
 static vop_getpages_t fuse_vnop_getpages;
 static vop_putpages_t fuse_vnop_putpages;
 static vop_print_t fuse_vnop_print;
 
 struct vop_vector fuse_vnops = {
 	.vop_default = &default_vnodeops,
 	.vop_access = fuse_vnop_access,
 	.vop_close = fuse_vnop_close,
 	.vop_create = fuse_vnop_create,
 	.vop_deleteextattr = fuse_vnop_deleteextattr,
 	.vop_fsync = fuse_vnop_fsync,
 	.vop_getattr = fuse_vnop_getattr,
 	.vop_getextattr = fuse_vnop_getextattr,
 	.vop_inactive = fuse_vnop_inactive,
 	.vop_link = fuse_vnop_link,
 	.vop_listextattr = fuse_vnop_listextattr,
 	.vop_lookup = fuse_vnop_lookup,
 	.vop_mkdir = fuse_vnop_mkdir,
 	.vop_mknod = fuse_vnop_mknod,
 	.vop_open = fuse_vnop_open,
 	.vop_pathconf = fuse_vnop_pathconf,
 	.vop_read = fuse_vnop_read,
 	.vop_readdir = fuse_vnop_readdir,
 	.vop_readlink = fuse_vnop_readlink,
 	.vop_reclaim = fuse_vnop_reclaim,
 	.vop_remove = fuse_vnop_remove,
 	.vop_rename = fuse_vnop_rename,
 	.vop_rmdir = fuse_vnop_rmdir,
 	.vop_setattr = fuse_vnop_setattr,
 	.vop_setextattr = fuse_vnop_setextattr,
 	.vop_strategy = fuse_vnop_strategy,
 	.vop_symlink = fuse_vnop_symlink,
 	.vop_write = fuse_vnop_write,
 	.vop_getpages = fuse_vnop_getpages,
 	.vop_putpages = fuse_vnop_putpages,
 	.vop_print = fuse_vnop_print,
 };
 
 static u_long fuse_lookup_cache_hits = 0;
 
 SYSCTL_ULONG(_vfs_fusefs, OID_AUTO, lookup_cache_hits, CTLFLAG_RD,
     &fuse_lookup_cache_hits, 0, "number of positive cache hits in lookup");
 
 static u_long fuse_lookup_cache_misses = 0;
 
 SYSCTL_ULONG(_vfs_fusefs, OID_AUTO, lookup_cache_misses, CTLFLAG_RD,
     &fuse_lookup_cache_misses, 0, "number of cache misses in lookup");
 
 int	fuse_lookup_cache_enable = 1;
 
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, lookup_cache_enable, CTLFLAG_RW,
     &fuse_lookup_cache_enable, 0, "if non-zero, enable lookup cache");
 
 /*
  * XXX: This feature is highly experimental and can bring to instabilities,
  * needs revisiting before to be enabled by default.
  */
 static int fuse_reclaim_revoked = 0;
 
 SYSCTL_INT(_vfs_fusefs, OID_AUTO, reclaim_revoked, CTLFLAG_RW,
     &fuse_reclaim_revoked, 0, "");
 
 uma_zone_t fuse_pbuf_zone;
 
 #define fuse_vm_page_lock(m)		vm_page_lock((m));
 #define fuse_vm_page_unlock(m)		vm_page_unlock((m));
 #define fuse_vm_page_lock_queues()	((void)0)
 #define fuse_vm_page_unlock_queues()	((void)0)
 
 /*
     struct vnop_access_args {
 	struct vnode *a_vp;
 #if VOP_ACCESS_TAKES_ACCMODE_T
 	accmode_t a_accmode;
 #else
 	int a_mode;
 #endif
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int accmode = ap->a_accmode;
 	struct ucred *cred = ap->a_cred;
 
 	struct fuse_access_param facp;
 	struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp));
 
 	int err;
 
-	FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp));
-
 	if (fuse_isdeadfs(vp)) {
 		if (vnode_isvroot(vp)) {
 			return 0;
 		}
 		return ENXIO;
 	}
 	if (!(data->dataflags & FSESS_INITED)) {
 		if (vnode_isvroot(vp)) {
 			if (priv_check_cred(cred, PRIV_VFS_ADMIN) ||
 			    (fuse_match_cred(data->daemoncred, cred) == 0)) {
 				return 0;
 			}
 		}
 		return EBADF;
 	}
 	if (vnode_islnk(vp)) {
 		return 0;
 	}
 	bzero(&facp, sizeof(facp));
 
 	err = fuse_internal_access(vp, accmode, &facp, ap->a_td, ap->a_cred);
-	FS_DEBUG2G("err=%d accmode=0x%x\n", err, accmode);
 	return err;
 }
 
 /*
     struct vnop_close_args {
 	struct vnode *a_vp;
 	int  a_fflag;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct ucred *cred = ap->a_cred;
 	int fflag = ap->a_fflag;
 	fufh_type_t fufh_type;
 
-	fuse_trace_printf_vnop();
-
 	if (fuse_isdeadfs(vp)) {
 		return 0;
 	}
 	if (vnode_isdir(vp)) {
 		if (fuse_filehandle_valid(vp, FUFH_RDONLY)) {
 			fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred);
 		}
 		return 0;
 	}
 	if (fflag & IO_NDELAY) {
 		return 0;
 	}
 	fufh_type = fuse_filehandle_xlate_from_fflags(fflag);
 
 	if (!fuse_filehandle_valid(vp, fufh_type)) {
 		int i;
 
 		for (i = 0; i < FUFH_MAXTYPE; i++)
 			if (fuse_filehandle_valid(vp, i))
 				break;
 		if (i == FUFH_MAXTYPE)
 			panic("FUSE: fufh type %d found to be invalid in close"
 			      " (fflag=0x%x)\n",
 			      fufh_type, fflag);
 	}
 	if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) {
 		fuse_vnode_savesize(vp, cred);
 	}
 	return 0;
 }
 
 /*
     struct vnop_create_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
     };
 */
 static int
 fuse_vnop_create(struct vop_create_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct vattr *vap = ap->a_vap;
 	struct thread *td = cnp->cn_thread;
 	struct ucred *cred = cnp->cn_cred;
 
 	struct fuse_open_in *foi;
 	struct fuse_entry_out *feo;
 	struct fuse_dispatcher fdi;
 	struct fuse_dispatcher *fdip = &fdi;
 
 	int err;
 
 	struct mount *mp = vnode_mount(dvp);
 	uint64_t parentnid = VTOFUD(dvp)->nid;
 	mode_t mode = MAKEIMODE(vap->va_type, vap->va_mode);
 	uint64_t x_fh_id;
 	uint32_t x_open_flags;
 
-	fuse_trace_printf_vnop();
-
 	if (fuse_isdeadfs(dvp)) {
 		return ENXIO;
 	}
 	bzero(&fdi, sizeof(fdi));
 
 	/* XXX:	Will we ever want devices ? */
 	if ((vap->va_type != VREG)) {
 		printf("fuse_vnop_create: unsupported va_type %d\n",
 		    vap->va_type);
 		return (EINVAL);
 	}
-	debug_printf("parent nid = %ju, mode = %x\n", (uintmax_t)parentnid,
-	    mode);
 
 	fdisp_init(fdip, sizeof(*foi) + cnp->cn_namelen + 1);
 	if (!fsess_isimpl(mp, FUSE_CREATE)) {
-		debug_printf("eh, daemon doesn't implement create?\n");
+		SDT_PROBE2(fuse, , vnops, trace, 1,
+			"eh, daemon doesn't implement create?");
 		return (EINVAL);
 	}
 	fdisp_make(fdip, FUSE_CREATE, vnode_mount(dvp), parentnid, td, cred);
 
 	foi = fdip->indata;
 	foi->mode = mode;
 	foi->flags = O_CREAT | O_RDWR;
 
 	memcpy((char *)fdip->indata + sizeof(*foi), cnp->cn_nameptr,
 	    cnp->cn_namelen);
 	((char *)fdip->indata)[sizeof(*foi) + cnp->cn_namelen] = '\0';
 
 	err = fdisp_wait_answ(fdip);
 
 	if (err) {
 		if (err == ENOSYS)
 			fsess_set_notimpl(mp, FUSE_CREATE);
-		debug_printf("create: got err=%d from daemon\n", err);
 		goto out;
 	}
 
 	feo = fdip->answ;
 
 	if ((err = fuse_internal_checkentry(feo, VREG))) {
 		goto out;
 	}
 	err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, VREG);
 	if (err) {
 		struct fuse_release_in *fri;
 		uint64_t nodeid = feo->nodeid;
 		uint64_t fh_id = ((struct fuse_open_out *)(feo + 1))->fh;
 
 		fdisp_init(fdip, sizeof(*fri));
 		fdisp_make(fdip, FUSE_RELEASE, mp, nodeid, td, cred);
 		fri = fdip->indata;
 		fri->fh = fh_id;
 		fri->flags = OFLAGS(mode);
 		fuse_insert_callback(fdip->tick, fuse_internal_forget_callback);
 		fuse_insert_message(fdip->tick);
 		return err;
 	}
 	ASSERT_VOP_ELOCKED(*vpp, "fuse_vnop_create");
 
 	fdip->answ = feo + 1;
 
 	x_fh_id = ((struct fuse_open_out *)(feo + 1))->fh;
 	x_open_flags = ((struct fuse_open_out *)(feo + 1))->open_flags;
 	fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, x_fh_id);
 	fuse_vnode_open(*vpp, x_open_flags, td);
 	cache_purge_negative(dvp);
 
 out:
 	fdisp_destroy(fdip);
 	return err;
 }
 
 /*
  * Our vnop_fsync roughly corresponds to the FUSE_FSYNC method. The Linux
  * version of FUSE also has a FUSE_FLUSH method.
  *
  * On Linux, fsync() synchronizes a file's complete in-core state with that
  * on disk. The call is not supposed to return until the system has completed
  * that action or until an error is detected.
  *
  * Linux also has an fdatasync() call that is similar to fsync() but is not
  * required to update the metadata such as access time and modification time.
  */
 
 /*
     struct vnop_fsync_args {
 	struct vnodeop_desc *a_desc;
 	struct vnode * a_vp;
 	struct ucred * a_cred;
 	int  a_waitfor;
 	struct thread * a_td;
     };
 */
 static int
 fuse_vnop_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 
 	struct fuse_filehandle *fufh;
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 
 	int type, err = 0;
 
-	fuse_trace_printf_vnop();
-
 	if (fuse_isdeadfs(vp)) {
 		return 0;
 	}
 	if ((err = vop_stdfsync(ap)))
 		return err;
 
 	if (!fsess_isimpl(vnode_mount(vp),
 	    (vnode_vtype(vp) == VDIR ? FUSE_FSYNCDIR : FUSE_FSYNC))) {
 		goto out;
 	}
 	for (type = 0; type < FUFH_MAXTYPE; type++) {
 		fufh = &(fvdat->fufh[type]);
 		if (FUFH_IS_VALID(fufh)) {
 			fuse_internal_fsync(vp, td, NULL, fufh);
 		}
 	}
 
 out:
 	return 0;
 }
 
 /*
     struct vnop_getattr_args {
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = curthread;
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
+	struct fuse_attr_out *fao;
 
 	int err = 0;
 	int dataflags;
 	struct fuse_dispatcher fdi;
 
-	FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp));
-
 	dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags;
 
 	/* Note that we are not bailing out on a dead file system just yet. */
 
 	if (!(dataflags & FSESS_INITED)) {
 		if (!vnode_isvroot(vp)) {
 			fdata_set_dead(fuse_get_mpdata(vnode_mount(vp)));
 			err = ENOTCONN;
-			debug_printf("fuse_getattr b: returning ENOTCONN\n");
 			return err;
 		} else {
 			goto fake;
 		}
 	}
 	fdisp_init(&fdi, 0);
 	if ((err = fdisp_simple_putget_vp(&fdi, FUSE_GETATTR, vp, td, cred))) {
 		if ((err == ENOTCONN) && vnode_isvroot(vp)) {
-			/* see comment at similar place in fuse_statfs() */
+			/* see comment in fuse_vfsop_statfs() */
 			fdisp_destroy(&fdi);
 			goto fake;
 		}
 		if (err == ENOENT) {
 			fuse_internal_vnode_disappear(vp);
 		}
 		goto out;
 	}
 
-	cache_attrs(vp, (struct fuse_attr_out *)fdi.answ, vap);
+	fao = (struct fuse_attr_out *)fdi.answ;
+	fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid,
+		fao->attr_valid_nsec, vap);
 	if (vap->va_type != vnode_vtype(vp)) {
 		fuse_internal_vnode_disappear(vp);
 		err = ENOENT;
 		goto out;
 	}
 	if ((fvdat->flag & FN_SIZECHANGE) != 0)
 		vap->va_size = fvdat->filesize;
 
 	if (vnode_isreg(vp) && (fvdat->flag & FN_SIZECHANGE) == 0) {
 		/*
 	         * This is for those cases when the file size changed without us
 	         * knowing, and we want to catch up.
 	         */
 		off_t new_filesize = ((struct fuse_attr_out *)
 				      fdi.answ)->attr.size;
 
 		if (fvdat->filesize != new_filesize) {
 			fuse_vnode_setsize(vp, cred, new_filesize);
 			fvdat->flag &= ~FN_SIZECHANGE;
 		}
 	}
-	debug_printf("fuse_getattr e: returning 0\n");
 
 out:
 	fdisp_destroy(&fdi);
 	return err;
 
 fake:
 	bzero(vap, sizeof(*vap));
 	vap->va_type = vnode_vtype(vp);
 
 	return 0;
 }
 
 /*
     struct vnop_inactive_args {
 	struct vnode *a_vp;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_inactive(struct vop_inactive_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct fuse_filehandle *fufh = NULL;
 
 	int type, need_flush = 1;
 
-	FS_DEBUG("inode=%ju\n", (uintmax_t)VTOI(vp));
-
 	for (type = 0; type < FUFH_MAXTYPE; type++) {
 		fufh = &(fvdat->fufh[type]);
 		if (FUFH_IS_VALID(fufh)) {
 			if (need_flush && vp->v_type == VREG) {
 				if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) {
 					fuse_vnode_savesize(vp, NULL);
 				}
 				if (fuse_data_cache_invalidate ||
 				    (fvdat->flag & FN_REVOKED) != 0)
 					fuse_io_invalbuf(vp, td);
 				else
 					fuse_io_flushbuf(vp, MNT_WAIT, td);
 				need_flush = 0;
 			}
 			fuse_filehandle_close(vp, type, td, NULL);
 		}
 	}
 
 	if ((fvdat->flag & FN_REVOKED) != 0 && fuse_reclaim_revoked) {
 		vrecycle(vp);
 	}
 	return 0;
 }
 
 /*
     struct vnop_link_args {
 	struct vnode *a_tdvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
     };
 */
 static int
 fuse_vnop_link(struct vop_link_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *cnp = ap->a_cnp;
 
 	struct vattr *vap = VTOVA(vp);
 
 	struct fuse_dispatcher fdi;
 	struct fuse_entry_out *feo;
 	struct fuse_link_in fli;
 
 	int err;
 
-	fuse_trace_printf_vnop();
-
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 	if (vnode_mount(tdvp) != vnode_mount(vp)) {
 		return EXDEV;
 	}
 
 	/*
 	 * This is a seatbelt check to protect naive userspace filesystems from
 	 * themselves and the limitations of the FUSE IPC protocol.  If a
 	 * filesystem does not allow attribute caching, assume it is capable of
 	 * validating that nlink does not overflow.
 	 */
 	if (vap != NULL && vap->va_nlink >= FUSE_LINK_MAX)
 		return EMLINK;
 	fli.oldnodeid = VTOI(vp);
 
 	fdisp_init(&fdi, 0);
 	fuse_internal_newentry_makerequest(vnode_mount(tdvp), VTOI(tdvp), cnp,
 	    FUSE_LINK, &fli, sizeof(fli), &fdi);
 	if ((err = fdisp_wait_answ(&fdi))) {
 		goto out;
 	}
 	feo = fdi.answ;
 
 	err = fuse_internal_checkentry(feo, vnode_vtype(vp));
 out:
 	fdisp_destroy(&fdi);
 	return err;
 }
 
 /*
     struct vnop_lookup_args {
 	struct vnodeop_desc *a_desc;
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
     };
 */
 int
 fuse_vnop_lookup(struct vop_lookup_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct thread *td = cnp->cn_thread;
 	struct ucred *cred = cnp->cn_cred;
 
 	int nameiop = cnp->cn_nameiop;
 	int flags = cnp->cn_flags;
 	int wantparent = flags & (LOCKPARENT | WANTPARENT);
 	int islastcn = flags & ISLASTCN;
 	struct mount *mp = vnode_mount(dvp);
 
 	int err = 0;
 	int lookup_err = 0;
 	struct vnode *vp = NULL;
 
 	struct fuse_dispatcher fdi;
 	enum fuse_opcode op;
 
 	uint64_t nid;
 	struct fuse_access_param facp;
 
-	FS_DEBUG2G("parent_inode=%ju - %*s\n",
-	    (uintmax_t)VTOI(dvp), (int)cnp->cn_namelen, cnp->cn_nameptr);
-
 	if (fuse_isdeadfs(dvp)) {
 		*vpp = NULL;
 		return ENXIO;
 	}
 	if (!vnode_isdir(dvp)) {
 		return ENOTDIR;
 	}
 	if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP)) {
 		return EROFS;
 	}
 	/*
 	 * We do access check prior to doing anything else only in the case
 	 * when we are at fs root (we'd like to say, "we are at the first
 	 * component", but that's not exactly the same... nevermind).
 	 * See further comments at further access checks.
 	 */
 
 	bzero(&facp, sizeof(facp));
 	if (vnode_isvroot(dvp)) {	/* early permission check hack */
 		if ((err = fuse_internal_access(dvp, VEXEC, &facp, td, cred))) {
 			return err;
 		}
 	}
 	if (flags & ISDOTDOT) {
 		nid = VTOFUD(dvp)->parent_nid;
 		if (nid == 0) {
 			return ENOENT;
 		}
 		fdisp_init(&fdi, 0);
 		op = FUSE_GETATTR;
 		goto calldaemon;
 	} else if (cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.') {
 		nid = VTOI(dvp);
 		fdisp_init(&fdi, 0);
 		op = FUSE_GETATTR;
 		goto calldaemon;
 	} else if (fuse_lookup_cache_enable) {
 		err = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 		switch (err) {
 
 		case -1:		/* positive match */
 			atomic_add_acq_long(&fuse_lookup_cache_hits, 1);
 			return 0;
 
 		case 0:		/* no match in cache */
 			atomic_add_acq_long(&fuse_lookup_cache_misses, 1);
 			break;
 
 		case ENOENT:		/* negative match */
 			/* fall through */
 		default:
 			return err;
 		}
 	}
 	nid = VTOI(dvp);
 	fdisp_init(&fdi, cnp->cn_namelen + 1);
 	op = FUSE_LOOKUP;
 
 calldaemon:
 	fdisp_make(&fdi, op, mp, nid, td, cred);
 
 	if (op == FUSE_LOOKUP) {
 		memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen);
 		((char *)fdi.indata)[cnp->cn_namelen] = '\0';
 	}
 	lookup_err = fdisp_wait_answ(&fdi);
 
 	if ((op == FUSE_LOOKUP) && !lookup_err) {	/* lookup call succeeded */
 		nid = ((struct fuse_entry_out *)fdi.answ)->nodeid;
 		if (!nid) {
 			/*
 	                 * zero nodeid is the same as "not found",
 	                 * but it's also cacheable (which we keep
 	                 * keep on doing not as of writing this)
 	                 */
 			lookup_err = ENOENT;
 		} else if (nid == FUSE_ROOT_ID) {
 			lookup_err = EINVAL;
 		}
 	}
 	if (lookup_err &&
 	    (!fdi.answ_stat || lookup_err != ENOENT || op != FUSE_LOOKUP)) {
 		fdisp_destroy(&fdi);
 		return lookup_err;
 	}
 	/* lookup_err, if non-zero, must be ENOENT at this point */
 
 	if (lookup_err) {
 
 		if ((nameiop == CREATE || nameiop == RENAME) && islastcn
 		     /* && directory dvp has not been removed */ ) {
 
 			if (vfs_isrdonly(mp)) {
 				err = EROFS;
 				goto out;
 			}
 #if 0 /* THINK_ABOUT_THIS */
 			if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) {
 				goto out;
 			}
 #endif
 
 			/*
 	                 * Possibly record the position of a slot in the
 	                 * directory large enough for the new component name.
 	                 * This can be recorded in the vnode private data for
 	                 * dvp. Set the SAVENAME flag to hold onto the
 	                 * pathname for use later in VOP_CREATE or VOP_RENAME.
 	                 */
 			cnp->cn_flags |= SAVENAME;
 
 			err = EJUSTRETURN;
 			goto out;
 		}
 		/* Consider inserting name into cache. */
 
 		/*
 	         * No we can't use negative caching, as the fs
 	         * changes are out of our control.
 	         * False positives' falseness turns out just as things
 	         * go by, but false negatives' falseness doesn't.
 	         * (and aiding the caching mechanism with extra control
 	         * mechanisms comes quite close to beating the whole purpose
 	         * caching...)
 	         */
 #if 0
 		if ((cnp->cn_flags & MAKEENTRY) != 0) {
-			FS_DEBUG("inserting NULL into cache\n");
+			SDT_PROBE2(fuse, , vnops, trace, 1,
+				"inserting NULL into cache");
 			cache_enter(dvp, NULL, cnp);
 		}
 #endif
 		err = ENOENT;
 		goto out;
 
 	} else {
 
 		/* !lookup_err */
 
 		struct fuse_entry_out *feo = NULL;
 		struct fuse_attr *fattr = NULL;
 
 		if (op == FUSE_GETATTR) {
 			fattr = &((struct fuse_attr_out *)fdi.answ)->attr;
 		} else {
 			feo = (struct fuse_entry_out *)fdi.answ;
 			fattr = &(feo->attr);
 		}
 
 		/*
 	         * If deleting, and at end of pathname, return parameters
 	         * which can be used to remove file.  If the wantparent flag
 	         * isn't set, we return only the directory, otherwise we go on
 	         * and lock the inode, being careful with ".".
 	         */
 		if (nameiop == DELETE && islastcn) {
 			/*
 	                 * Check for write access on directory.
 	                 */
 			facp.xuid = fattr->uid;
 			facp.facc_flags |= FACCESS_STICKY;
 			err = fuse_internal_access(dvp, VWRITE, &facp, td, cred);
 			facp.facc_flags &= ~FACCESS_XQUERIES;
 
 			if (err) {
 				goto out;
 			}
 			if (nid == VTOI(dvp)) {
 				vref(dvp);
 				*vpp = dvp;
 			} else {
 				err = fuse_vnode_get(dvp->v_mount, feo, nid,
 				    dvp, &vp, cnp, IFTOVT(fattr->mode));
 				if (err)
 					goto out;
 				*vpp = vp;
 			}
 
 			/*
 			 * Save the name for use in VOP_RMDIR and VOP_REMOVE
 			 * later.
 			 */
 			cnp->cn_flags |= SAVENAME;
 			goto out;
 
 		}
 		/*
 	         * If rewriting (RENAME), return the inode and the
 	         * information required to rewrite the present directory
 	         * Must get inode of directory entry to verify it's a
 	         * regular file, or empty directory.
 	         */
 		if (nameiop == RENAME && wantparent && islastcn) {
 
 #if 0 /* THINK_ABOUT_THIS */
 			if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) {
 				goto out;
 			}
 #endif
 
 			/*
 	                 * Check for "."
 	                 */
 			if (nid == VTOI(dvp)) {
 				err = EISDIR;
 				goto out;
 			}
 			err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp,
 			    &vp, cnp, IFTOVT(fattr->mode));
 			if (err) {
 				goto out;
 			}
 			*vpp = vp;
 			/*
 	                 * Save the name for use in VOP_RENAME later.
 	                 */
 			cnp->cn_flags |= SAVENAME;
 
 			goto out;
 		}
 		if (flags & ISDOTDOT) {
 			struct mount *mp;
 			int ltype;
 
 			/*
 			 * Expanded copy of vn_vget_ino() so that
 			 * fuse_vnode_get() can be used.
 			 */
 			mp = dvp->v_mount;
 			ltype = VOP_ISLOCKED(dvp);
 			err = vfs_busy(mp, MBF_NOWAIT);
 			if (err != 0) {
 				vfs_ref(mp);
 				VOP_UNLOCK(dvp, 0);
 				err = vfs_busy(mp, 0);
 				vn_lock(dvp, ltype | LK_RETRY);
 				vfs_rel(mp);
 				if (err)
 					goto out;
 				if ((dvp->v_iflag & VI_DOOMED) != 0) {
 					err = ENOENT;
 					vfs_unbusy(mp);
 					goto out;
 				}
 			}
 			VOP_UNLOCK(dvp, 0);
 			err = fuse_vnode_get(vnode_mount(dvp), feo, nid, NULL,
 			    &vp, cnp, IFTOVT(fattr->mode));
 			vfs_unbusy(mp);
 			vn_lock(dvp, ltype | LK_RETRY);
 			if ((dvp->v_iflag & VI_DOOMED) != 0) {
 				if (err == 0)
 					vput(vp);
 				err = ENOENT;
 			}
 			if (err)
 				goto out;
 			*vpp = vp;
 		} else if (nid == VTOI(dvp)) {
 			vref(dvp);
 			*vpp = dvp;
 		} else {
 			struct fuse_vnode_data *fvdat;
 
 			err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp,
 			    &vp, cnp, IFTOVT(fattr->mode));
 			if (err) {
 				goto out;
 			}
 			fuse_vnode_setparent(vp, dvp);
 
 			/*
 			 * In the case where we are looking up a FUSE node
 			 * represented by an existing cached vnode, and the
 			 * true size reported by FUSE_LOOKUP doesn't match
 			 * the vnode's cached size, fix the vnode cache to
 			 * match the real object size.
 			 *
 			 * This can occur via FUSE distributed filesystems,
 			 * irregular files, etc.
 			 */
 			fvdat = VTOFUD(vp);
 			if (vnode_isreg(vp) &&
 			    fattr->size != fvdat->filesize) {
 				/*
 				 * The FN_SIZECHANGE flag reflects a dirty
 				 * append.  If userspace lets us know our cache
 				 * is invalid, that write was lost.  (Dirty
 				 * writes that do not cause append are also
 				 * lost, but we don't detect them here.)
 				 *
 				 * XXX: Maybe disable WB caching on this mount.
 				 */
 				if (fvdat->flag & FN_SIZECHANGE)
 					printf("%s: WB cache incoherent on "
 					    "%s!\n", __func__,
 					    vnode_mount(vp)->mnt_stat.f_mntonname);
 
 				(void)fuse_vnode_setsize(vp, cred, fattr->size);
 				fvdat->flag &= ~FN_SIZECHANGE;
 			}
 			*vpp = vp;
 		}
 
 		if (op == FUSE_GETATTR) {
-			cache_attrs(*vpp, (struct fuse_attr_out *)fdi.answ,
-			    NULL);
+			struct fuse_attr_out *fao =
+				(struct fuse_attr_out*)fdi.answ;
+			fuse_internal_cache_attrs(*vpp,
+				&fao->attr, fao->attr_valid,
+				fao->attr_valid_nsec, NULL);
 		} else {
-			cache_attrs(*vpp, (struct fuse_entry_out *)fdi.answ,
-			    NULL);
+			struct fuse_entry_out *feo =
+				(struct fuse_entry_out*)fdi.answ;
+			fuse_internal_cache_attrs(*vpp,
+				&feo->attr, feo->attr_valid,
+				feo->attr_valid_nsec, NULL);
 		}
 
 		/* Insert name into cache if appropriate. */
 
 		/*
 	         * Nooo, caching is evil. With caching, we can't avoid stale
 	         * information taking over the playground (cached info is not
 	         * just positive/negative, it does have qualitative aspects,
 	         * too). And a (VOP/FUSE)_GETATTR is always thrown anyway, when
 	         * walking down along cached path components, and that's not
 	         * any cheaper than FUSE_LOOKUP. This might change with
 	         * implementing kernel side attr caching, but... In Linux,
 	         * lookup results are not cached, and the daemon is bombarded
 	         * with FUSE_LOOKUPS on and on. This shows that by design, the
 	         * daemon is expected to handle frequent lookup queries
 	         * efficiently, do its caching in userspace, and so on.
 	         *
 	         * So just leave the name cache alone.
 	         */
 
 		/*
 	         * Well, now I know, Linux caches lookups, but with a
 	         * timeout... So it's the same thing as attribute caching:
 	         * we can deal with it when implement timeouts.
 	         */
 #if 0
 		if (cnp->cn_flags & MAKEENTRY) {
 			cache_enter(dvp, *vpp, cnp);
 		}
 #endif
 	}
 out:
 	if (!lookup_err) {
 
 		/* No lookup error; need to clean up. */
 
 		if (err) {		/* Found inode; exit with no vnode. */
 			if (op == FUSE_LOOKUP) {
 				fuse_internal_forget_send(vnode_mount(dvp), td, cred,
 				    nid, 1);
 			}
 			fdisp_destroy(&fdi);
 			return err;
 		} else {
 #ifndef NO_EARLY_PERM_CHECK_HACK
 			if (!islastcn) {
 				/*
 				 * We have the attributes of the next item
 				 * *now*, and it's a fact, and we do not
 				 * have to do extra work for it (ie, beg the
 				 * daemon), and it neither depends on such
 				 * accidental things like attr caching. So
 				 * the big idea: check credentials *now*,
 				 * not at the beginning of the next call to
 				 * lookup.
 				 * 
 				 * The first item of the lookup chain (fs root)
 				 * won't be checked then here, of course, as
 				 * its never "the next". But go and see that
 				 * the root is taken care about at the very
 				 * beginning of this function.
 				 * 
 				 * Now, given we want to do the access check
 				 * this way, one might ask: so then why not
 				 * do the access check just after fetching
 				 * the inode and its attributes from the
 				 * daemon? Why bother with producing the
 				 * corresponding vnode at all if something
 				 * is not OK? We know what's the deal as
 				 * soon as we get those attrs... There is
 				 * one bit of info though not given us by
 				 * the daemon: whether his response is
 				 * authoritative or not... His response should
 				 * be ignored if something is mounted over
 				 * the dir in question. But that can be
 				 * known only by having the vnode...
 				 */
 				int tmpvtype = vnode_vtype(*vpp);
 
 				bzero(&facp, sizeof(facp));
 				/*the early perm check hack */
 				    facp.facc_flags |= FACCESS_VA_VALID;
 
 				if ((tmpvtype != VDIR) && (tmpvtype != VLNK)) {
 					err = ENOTDIR;
 				}
 				if (!err && !vnode_mountedhere(*vpp)) {
 					err = fuse_internal_access(*vpp, VEXEC, &facp, td, cred);
 				}
 				if (err) {
 					if (tmpvtype == VLNK)
-						FS_DEBUG("weird, permission error with a symlink?\n");
+						SDT_PROBE2(fuse, , vnops, trace,
+						    1, "weird, permission "
+						    "error with a symlink?");
 					vput(*vpp);
 					*vpp = NULL;
 				}
 			}
 #endif
 		}
 	}
 	fdisp_destroy(&fdi);
 
 	return err;
 }
 
 /*
     struct vnop_mkdir_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
     };
 */
 static int
 fuse_vnop_mkdir(struct vop_mkdir_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct vattr *vap = ap->a_vap;
 
 	struct fuse_mkdir_in fmdi;
 
-	fuse_trace_printf_vnop();
-
 	if (fuse_isdeadfs(dvp)) {
 		return ENXIO;
 	}
 	fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode);
 
 	return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKDIR, &fmdi,
 	    sizeof(fmdi), VDIR));
 }
 
 /*
     struct vnop_mknod_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
     };
 */
 static int
 fuse_vnop_mknod(struct vop_mknod_args *ap)
 {
 
 	return (EINVAL);
 }
 
 
 /*
     struct vnop_open_args {
 	struct vnode *a_vp;
 	int  a_mode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 	int a_fdidx; / struct file *a_fp;
     };
 */
 static int
 fuse_vnop_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int mode = ap->a_mode;
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 
 	fufh_type_t fufh_type;
 	struct fuse_vnode_data *fvdat;
 
 	int error, isdir = 0;
 	int32_t fuse_open_flags;
 
-	FS_DEBUG2G("inode=%ju mode=0x%x\n", (uintmax_t)VTOI(vp), mode);
-
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 	if ((mode & (FREAD | FWRITE)) == 0)
 		return EINVAL;
 
 	fvdat = VTOFUD(vp);
 
 	if (vnode_isdir(vp)) {
 		isdir = 1;
 	}
 	fuse_open_flags = 0;
 	if (isdir) {
 		fufh_type = FUFH_RDONLY;
 	} else {
 		fufh_type = fuse_filehandle_xlate_from_fflags(mode);
 		/*
 		 * For WRONLY opens, force DIRECT_IO.  This is necessary
 		 * since writing a partial block through the buffer cache
 		 * will result in a read of the block and that read won't
 		 * be allowed by the WRONLY open.
 		 */
 		if (fufh_type == FUFH_WRONLY ||
 		    (fvdat->flag & FN_DIRECTIO) != 0)
 			fuse_open_flags = FOPEN_DIRECT_IO;
 	}
 
 	if (fuse_filehandle_validrw(vp, fufh_type) != FUFH_INVALID) {
 		fuse_vnode_open(vp, fuse_open_flags, td);
 		return 0;
 	}
 	error = fuse_filehandle_open(vp, fufh_type, NULL, td, cred);
 
 	return error;
 }
 
 static int
 fuse_vnop_pathconf(struct vop_pathconf_args *ap)
 {
 
 	switch (ap->a_name) {
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		return (0);
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		return (0);
 	case _PC_LINK_MAX:
 		*ap->a_retval = MIN(LONG_MAX, FUSE_LINK_MAX);
 		return (0);
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = MAXPATHLEN;
 		return (0);
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 1;
 		return (0);
 	default:
 		return (vop_stdpathconf(ap));
 	}
 }
 
 /*
     struct vnop_read_args {
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	int  a_ioflag;
 	struct ucred *a_cred;
     };
 */
 static int
 fuse_vnop_read(struct vop_read_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	int ioflag = ap->a_ioflag;
 	struct ucred *cred = ap->a_cred;
 
-	FS_DEBUG2G("inode=%ju offset=%jd resid=%zd\n",
-	    (uintmax_t)VTOI(vp), uio->uio_offset, uio->uio_resid);
-
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 
 	if (VTOFUD(vp)->flag & FN_DIRECTIO) {
 		ioflag |= IO_DIRECT;
 	}
 
 	return fuse_io_dispatch(vp, uio, ioflag, cred);
 }
 
 /*
     struct vnop_readdir_args {
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	struct ucred *a_cred;
 	int *a_eofflag;
 	int *ncookies;
 	u_long **a_cookies;
     };
 */
 static int
 fuse_vnop_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct ucred *cred = ap->a_cred;
 
 	struct fuse_filehandle *fufh = NULL;
 	struct fuse_iov cookediov;
 
 	int err = 0;
 	int freefufh = 0;
 
-	FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp));
-
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 	if (				/* XXXIP ((uio_iovcnt(uio) > 1)) || */
 	    (uio_resid(uio) < sizeof(struct dirent))) {
 		return EINVAL;
 	}
 
 	if (!fuse_filehandle_valid(vp, FUFH_RDONLY)) {
-		FS_DEBUG("calling readdir() before open()");
+		SDT_PROBE2(fuse, , vnops, trace, 1,
+			"calling readdir() before open()");
 		err = fuse_filehandle_open(vp, FUFH_RDONLY, &fufh, NULL, cred);
 		freefufh = 1;
 	} else {
 		err = fuse_filehandle_get(vp, FUFH_RDONLY, &fufh);
 	}
 	if (err) {
 		return (err);
 	}
 #define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1)
 	fiov_init(&cookediov, DIRCOOKEDSIZE);
 
 	err = fuse_internal_readdir(vp, uio, fufh, &cookediov);
 
 	fiov_teardown(&cookediov);
 	if (freefufh) {
 		fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred);
 	}
 	return err;
 }
 
 /*
     struct vnop_readlink_args {
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	struct ucred *a_cred;
     };
 */
 static int
 fuse_vnop_readlink(struct vop_readlink_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct ucred *cred = ap->a_cred;
 
 	struct fuse_dispatcher fdi;
 	int err;
 
-	FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp));
-
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 	if (!vnode_islnk(vp)) {
 		return EINVAL;
 	}
 	fdisp_init(&fdi, 0);
 	err = fdisp_simple_putget_vp(&fdi, FUSE_READLINK, vp, curthread, cred);
 	if (err) {
 		goto out;
 	}
 	if (((char *)fdi.answ)[0] == '/' &&
 	    fuse_get_mpdata(vnode_mount(vp))->dataflags & FSESS_PUSH_SYMLINKS_IN) {
 		char *mpth = vnode_mount(vp)->mnt_stat.f_mntonname;
 
 		err = uiomove(mpth, strlen(mpth), uio);
 	}
 	if (!err) {
 		err = uiomove(fdi.answ, fdi.iosize, uio);
 	}
 out:
 	fdisp_destroy(&fdi);
 	return err;
 }
 
 /*
     struct vnop_reclaim_args {
 	struct vnode *a_vp;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct fuse_filehandle *fufh = NULL;
 
 	int type;
 
 	if (!fvdat) {
 		panic("FUSE: no vnode data during recycling");
 	}
-	FS_DEBUG("inode=%ju\n", (uintmax_t)VTOI(vp));
-
 	for (type = 0; type < FUFH_MAXTYPE; type++) {
 		fufh = &(fvdat->fufh[type]);
 		if (FUFH_IS_VALID(fufh)) {
 			printf("FUSE: vnode being reclaimed but fufh (type=%d) is valid",
 			    type);
 			fuse_filehandle_close(vp, type, td, NULL);
 		}
 	}
 
 	if ((!fuse_isdeadfs(vp)) && (fvdat->nlookup)) {
 		fuse_internal_forget_send(vnode_mount(vp), td, NULL, VTOI(vp),
 		    fvdat->nlookup);
 	}
 	fuse_vnode_setparent(vp, NULL);
 	cache_purge(vp);
 	vfs_hash_remove(vp);
 	vnode_destroy_vobject(vp);
 	fuse_vnode_destroy(vp);
 
 	return 0;
 }
 
 /*
     struct vnop_remove_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
     };
 */
 static int
 fuse_vnop_remove(struct vop_remove_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode *vp = ap->a_vp;
 	struct componentname *cnp = ap->a_cnp;
 
 	int err;
 
-	FS_DEBUG2G("inode=%ju name=%*s\n",
-	    (uintmax_t)VTOI(vp), (int)cnp->cn_namelen, cnp->cn_nameptr);
-
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 	if (vnode_isdir(vp)) {
 		return EPERM;
 	}
 	cache_purge(vp);
 
 	err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK);
 
 	if (err == 0)
 		fuse_internal_vnode_disappear(vp);
 	return err;
 }
 
 /*
     struct vnop_rename_args {
 	struct vnode *a_fdvp;
 	struct vnode *a_fvp;
 	struct componentname *a_fcnp;
 	struct vnode *a_tdvp;
 	struct vnode *a_tvp;
 	struct componentname *a_tcnp;
     };
 */
 static int
 fuse_vnop_rename(struct vop_rename_args *ap)
 {
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct fuse_data *data;
 
 	int err = 0;
 
-	FS_DEBUG2G("from: inode=%ju name=%*s -> to: inode=%ju name=%*s\n",
-	    (uintmax_t)VTOI(fvp), (int)fcnp->cn_namelen, fcnp->cn_nameptr,
-	    (uintmax_t)(tvp == NULL ? -1 : VTOI(tvp)),
-	    (int)tcnp->cn_namelen, tcnp->cn_nameptr);
-
 	if (fuse_isdeadfs(fdvp)) {
 		return ENXIO;
 	}
 	if (fvp->v_mount != tdvp->v_mount ||
 	    (tvp && fvp->v_mount != tvp->v_mount)) {
-		FS_DEBUG("cross-device rename: %s -> %s\n",
-		    fcnp->cn_nameptr, (tcnp != NULL ? tcnp->cn_nameptr : "(NULL)"));
+		SDT_PROBE2(fuse, , vnops, trace, 1, "cross-device rename");
 		err = EXDEV;
 		goto out;
 	}
 	cache_purge(fvp);
 
 	/*
 	 * FUSE library is expected to check if target directory is not
 	 * under the source directory in the file system tree.
 	 * Linux performs this check at VFS level.
 	 */
 	data = fuse_get_mpdata(vnode_mount(tdvp));
 	sx_xlock(&data->rename_lock);
 	err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp);
 	if (err == 0) {
 		if (tdvp != fdvp)
 			fuse_vnode_setparent(fvp, tdvp);
 		if (tvp != NULL)
 			fuse_vnode_setparent(tvp, NULL);
 	}
 	sx_unlock(&data->rename_lock);
 
 	if (tvp != NULL && tvp != fvp) {
 		cache_purge(tvp);
 	}
 	if (vnode_isdir(fvp)) {
 		if ((tvp != NULL) && vnode_isdir(tvp)) {
 			cache_purge(tdvp);
 		}
 		cache_purge(fdvp);
 	}
 out:
 	if (tdvp == tvp) {
 		vrele(tdvp);
 	} else {
 		vput(tdvp);
 	}
 	if (tvp != NULL) {
 		vput(tvp);
 	}
 	vrele(fdvp);
 	vrele(fvp);
 
 	return err;
 }
 
 /*
     struct vnop_rmdir_args {
 	    struct vnode *a_dvp;
 	    struct vnode *a_vp;
 	    struct componentname *a_cnp;
     } *ap;
 */
 static int
 fuse_vnop_rmdir(struct vop_rmdir_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode *vp = ap->a_vp;
 
 	int err;
 
-	FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp));
-
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 	if (VTOFUD(vp) == VTOFUD(dvp)) {
 		return EINVAL;
 	}
 	err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR);
 
 	if (err == 0)
 		fuse_internal_vnode_disappear(vp);
 	return err;
 }
 
 /*
     struct vnop_setattr_args {
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_setattr(struct vop_setattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = curthread;
 
 	struct fuse_dispatcher fdi;
 	struct fuse_setattr_in *fsai;
 	struct fuse_access_param facp;
 
 	int err = 0;
 	enum vtype vtyp;
 	int sizechanged = 0;
 	uint64_t newsize = 0;
 
-	FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp));
-
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 	fdisp_init(&fdi, sizeof(*fsai));
 	fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred);
 	fsai = fdi.indata;
 	fsai->valid = 0;
 
 	bzero(&facp, sizeof(facp));
 
 	facp.xuid = vap->va_uid;
 	facp.xgid = vap->va_gid;
 
 	if (vap->va_uid != (uid_t)VNOVAL) {
 		facp.facc_flags |= FACCESS_CHOWN;
 		fsai->uid = vap->va_uid;
 		fsai->valid |= FATTR_UID;
 	}
 	if (vap->va_gid != (gid_t)VNOVAL) {
 		facp.facc_flags |= FACCESS_CHOWN;
 		fsai->gid = vap->va_gid;
 		fsai->valid |= FATTR_GID;
 	}
 	if (vap->va_size != VNOVAL) {
 
 		struct fuse_filehandle *fufh = NULL;
 
 		/*Truncate to a new value. */
 		    fsai->size = vap->va_size;
 		sizechanged = 1;
 		newsize = vap->va_size;
 		fsai->valid |= FATTR_SIZE;
 
 		fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh);
 		if (fufh) {
 			fsai->fh = fufh->fh_id;
 			fsai->valid |= FATTR_FH;
 		}
 	}
 	if (vap->va_atime.tv_sec != VNOVAL) {
 		fsai->atime = vap->va_atime.tv_sec;
 		fsai->atimensec = vap->va_atime.tv_nsec;
 		fsai->valid |= FATTR_ATIME;
 	}
 	if (vap->va_mtime.tv_sec != VNOVAL) {
 		fsai->mtime = vap->va_mtime.tv_sec;
 		fsai->mtimensec = vap->va_mtime.tv_nsec;
 		fsai->valid |= FATTR_MTIME;
 	}
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		fsai->mode = vap->va_mode & ALLPERMS;
 		fsai->valid |= FATTR_MODE;
 	}
 	if (!fsai->valid) {
 		goto out;
 	}
 	vtyp = vnode_vtype(vp);
 
 	if (fsai->valid & FATTR_SIZE && vtyp == VDIR) {
 		err = EISDIR;
 		goto out;
 	}
 	if (vfs_isrdonly(vnode_mount(vp)) && (fsai->valid & ~FATTR_SIZE || vtyp == VREG)) {
 		err = EROFS;
 		goto out;
 	}
 	if (fsai->valid & ~FATTR_SIZE) {
 	  /*err = fuse_internal_access(vp, VADMIN, context, &facp); */
 	  /*XXX */
 		    err = 0;
 	}
 	facp.facc_flags &= ~FACCESS_XQUERIES;
 
 	if (err && !(fsai->valid & ~(FATTR_ATIME | FATTR_MTIME)) &&
 	    vap->va_vaflags & VA_UTIMES_NULL) {
 		err = fuse_internal_access(vp, VWRITE, &facp, td, cred);
 	}
 	if (err)
 		goto out;
 	if ((err = fdisp_wait_answ(&fdi)))
 		goto out;
 	vtyp = IFTOVT(((struct fuse_attr_out *)fdi.answ)->attr.mode);
 
 	if (vnode_vtype(vp) != vtyp) {
 		if (vnode_vtype(vp) == VNON && vtyp != VNON) {
-			debug_printf("FUSE: Dang! vnode_vtype is VNON and vtype isn't.\n");
+			SDT_PROBE2(fuse, , vnops, trace, 1, "FUSE: Dang! "
+				"vnode_vtype is VNON and vtype isn't.");
 		} else {
 			/*
 	                 * STALE vnode, ditch
 	                 *
-	                 * The vnode has changed its type "behind our back". There's
-	                 * nothing really we can do, so let us just force an internal
-	                 * revocation and tell the caller to try again, if interested.
+			 * The vnode has changed its type "behind our back".
+			 * There's nothing really we can do, so let us just
+			 * force an internal revocation and tell the caller to
+			 * try again, if interested.
 	                 */
 			fuse_internal_vnode_disappear(vp);
 			err = EAGAIN;
 		}
 	}
-	if (err == 0)
-		cache_attrs(vp, (struct fuse_attr_out *)fdi.answ, NULL);
+	if (err == 0) {
+		struct fuse_attr_out *fao = (struct fuse_attr_out*)fdi.answ;
+		fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid,
+			fao->attr_valid_nsec, NULL);
+	}
 
 out:
 	fdisp_destroy(&fdi);
 	if (!err && sizechanged) {
 		fuse_vnode_setsize(vp, cred, newsize);
 		VTOFUD(vp)->flag &= ~FN_SIZECHANGE;
 	}
 	return err;
 }
 
 /*
     struct vnop_strategy_args {
 	struct vnode *a_vp;
 	struct buf *a_bp;
     };
 */
 static int
 fuse_vnop_strategy(struct vop_strategy_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct buf *bp = ap->a_bp;
 
-	fuse_trace_printf_vnop();
-
 	if (!vp || fuse_isdeadfs(vp)) {
 		bp->b_ioflags |= BIO_ERROR;
 		bp->b_error = ENXIO;
 		bufdone(bp);
 		return ENXIO;
 	}
 	if (bp->b_iocmd == BIO_WRITE)
 		fuse_vnode_refreshsize(vp, NOCRED);
 
 	(void)fuse_io_strategy(vp, bp);
 
 	/*
 	 * This is a dangerous function. If returns error, that might mean a
 	 * panic. We prefer pretty much anything over being forced to panic
 	 * by a malicious daemon (a demon?). So we just return 0 anyway. You
 	 * should never mind this: this function has its own error
 	 * propagation mechanism via the argument buffer, so
 	 * not-that-melodramatic residents of the call chain still will be
 	 * able to know what to do.
 	 */
 	return 0;
 }
 
 
 /*
     struct vnop_symlink_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
 	char *a_target;
     };
 */
 static int
 fuse_vnop_symlink(struct vop_symlink_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	const char *target = ap->a_target;
 
 	struct fuse_dispatcher fdi;
 
 	int err;
 	size_t len;
 
-	FS_DEBUG2G("inode=%ju name=%*s\n",
-	    (uintmax_t)VTOI(dvp), (int)cnp->cn_namelen, cnp->cn_nameptr);
-
 	if (fuse_isdeadfs(dvp)) {
 		return ENXIO;
 	}
 	/*
 	 * Unlike the other creator type calls, here we have to create a message
 	 * where the name of the new entry comes first, and the data describing
 	 * the entry comes second.
 	 * Hence we can't rely on our handy fuse_internal_newentry() routine,
 	 * but put together the message manually and just call the core part.
 	 */
 
 	len = strlen(target) + 1;
 	fdisp_init(&fdi, len + cnp->cn_namelen + 1);
 	fdisp_make_vp(&fdi, FUSE_SYMLINK, dvp, curthread, NULL);
 
 	memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen);
 	((char *)fdi.indata)[cnp->cn_namelen] = '\0';
 	memcpy((char *)fdi.indata + cnp->cn_namelen + 1, target, len);
 
 	err = fuse_internal_newentry_core(dvp, vpp, cnp, VLNK, &fdi);
 	fdisp_destroy(&fdi);
 	return err;
 }
 
 /*
     struct vnop_write_args {
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	int  a_ioflag;
 	struct ucred *a_cred;
     };
 */
 static int
 fuse_vnop_write(struct vop_write_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	int ioflag = ap->a_ioflag;
 	struct ucred *cred = ap->a_cred;
 
-	fuse_trace_printf_vnop();
-
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 	fuse_vnode_refreshsize(vp, cred);
 
 	if (VTOFUD(vp)->flag & FN_DIRECTIO) {
 		ioflag |= IO_DIRECT;
 	}
 
 	return fuse_io_dispatch(vp, uio, ioflag, cred);
 }
 
+SDT_PROBE_DEFINE1(fuse, , vnops, vnop_getpages_error, "int");
 /*
     struct vnop_getpages_args {
 	struct vnode *a_vp;
 	vm_page_t *a_m;
 	int a_count;
 	int a_reqpage;
     };
 */
 static int
 fuse_vnop_getpages(struct vop_getpages_args *ap)
 {
 	int i, error, nextoff, size, toff, count, npages;
 	struct uio uio;
 	struct iovec iov;
 	vm_offset_t kva;
 	struct buf *bp;
 	struct vnode *vp;
 	struct thread *td;
 	struct ucred *cred;
 	vm_page_t *pages;
 
-	FS_DEBUG2G("heh\n");
-
 	vp = ap->a_vp;
 	KASSERT(vp->v_object, ("objectless vp passed to getpages"));
 	td = curthread;			/* XXX */
 	cred = curthread->td_ucred;	/* XXX */
 	pages = ap->a_m;
 	npages = ap->a_count;
 
 	if (!fsess_opt_mmap(vnode_mount(vp))) {
-		FS_DEBUG("called on non-cacheable vnode??\n");
+		SDT_PROBE2(fuse, , vnops, trace, 1,
+			"called on non-cacheable vnode??\n");
 		return (VM_PAGER_ERROR);
 	}
 
 	/*
 	 * If the last page is partially valid, just return it and allow
 	 * the pager to zero-out the blanks.  Partially valid pages can
 	 * only occur at the file EOF.
 	 *
 	 * XXXGL: is that true for FUSE, which is a local filesystem,
 	 * but still somewhat disconnected from the kernel?
 	 */
 	VM_OBJECT_WLOCK(vp->v_object);
 	if (pages[npages - 1]->valid != 0 && --npages == 0)
 		goto out;
 	VM_OBJECT_WUNLOCK(vp->v_object);
 
 	/*
 	 * We use only the kva address for the buffer, but this is extremely
 	 * convenient and fast.
 	 */
 	bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK);
 
 	kva = (vm_offset_t)bp->b_data;
 	pmap_qenter(kva, pages, npages);
 	VM_CNT_INC(v_vnodein);
 	VM_CNT_ADD(v_vnodepgsin, npages);
 
 	count = npages << PAGE_SHIFT;
 	iov.iov_base = (caddr_t)kva;
 	iov.iov_len = count;
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
 	uio.uio_resid = count;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_READ;
 	uio.uio_td = td;
 
 	error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred);
 	pmap_qremove(kva, npages);
 
 	uma_zfree(fuse_pbuf_zone, bp);
 
 	if (error && (uio.uio_resid == count)) {
-		FS_DEBUG("error %d\n", error);
+		SDT_PROBE1(fuse, , vnops, vnop_getpages_error, error);
 		return VM_PAGER_ERROR;
 	}
 	/*
 	 * Calculate the number of bytes read and validate only that number
 	 * of bytes.  Note that due to pending writes, size may be 0.  This
 	 * does not mean that the remaining data is invalid!
 	 */
 
 	size = count - uio.uio_resid;
 	VM_OBJECT_WLOCK(vp->v_object);
 	fuse_vm_page_lock_queues();
 	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
 		vm_page_t m;
 
 		nextoff = toff + PAGE_SIZE;
 		m = pages[i];
 
 		if (nextoff <= size) {
 			/*
 			 * Read operation filled an entire page
 			 */
 			m->valid = VM_PAGE_BITS_ALL;
 			KASSERT(m->dirty == 0,
 			    ("fuse_getpages: page %p is dirty", m));
 		} else if (size > toff) {
 			/*
 			 * Read operation filled a partial page.
 			 */
 			m->valid = 0;
 			vm_page_set_valid_range(m, 0, size - toff);
 			KASSERT(m->dirty == 0,
 			    ("fuse_getpages: page %p is dirty", m));
 		} else {
 			/*
 			 * Read operation was short.  If no error occurred
 			 * we may have hit a zero-fill section.   We simply
 			 * leave valid set to 0.
 			 */
 			;
 		}
 	}
 	fuse_vm_page_unlock_queues();
 out:
 	VM_OBJECT_WUNLOCK(vp->v_object);
 	if (ap->a_rbehind)
 		*ap->a_rbehind = 0;
 	if (ap->a_rahead)
 		*ap->a_rahead = 0;
 	return (VM_PAGER_OK);
 }
 
 /*
     struct vnop_putpages_args {
 	struct vnode *a_vp;
 	vm_page_t *a_m;
 	int a_count;
 	int a_sync;
 	int *a_rtvals;
 	vm_ooffset_t a_offset;
     };
 */
 static int
 fuse_vnop_putpages(struct vop_putpages_args *ap)
 {
 	struct uio uio;
 	struct iovec iov;
 	vm_offset_t kva;
 	struct buf *bp;
 	int i, error, npages, count;
 	off_t offset;
 	int *rtvals;
 	struct vnode *vp;
 	struct thread *td;
 	struct ucred *cred;
 	vm_page_t *pages;
 	vm_ooffset_t fsize;
 
-	FS_DEBUG2G("heh\n");
-
 	vp = ap->a_vp;
 	KASSERT(vp->v_object, ("objectless vp passed to putpages"));
 	fsize = vp->v_object->un_pager.vnp.vnp_size;
 	td = curthread;			/* XXX */
 	cred = curthread->td_ucred;	/* XXX */
 	pages = ap->a_m;
 	count = ap->a_count;
 	rtvals = ap->a_rtvals;
 	npages = btoc(count);
 	offset = IDX_TO_OFF(pages[0]->pindex);
 
 	if (!fsess_opt_mmap(vnode_mount(vp))) {
-		FS_DEBUG("called on non-cacheable vnode??\n");
+		SDT_PROBE2(fuse, , vnops, trace, 1,
+			"called on non-cacheable vnode??\n");
 	}
 	for (i = 0; i < npages; i++)
 		rtvals[i] = VM_PAGER_AGAIN;
 
 	/*
 	 * When putting pages, do not extend file past EOF.
 	 */
 
 	if (offset + count > fsize) {
 		count = fsize - offset;
 		if (count < 0)
 			count = 0;
 	}
 	/*
 	 * We use only the kva address for the buffer, but this is extremely
 	 * convenient and fast.
 	 */
 	bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK);
 
 	kva = (vm_offset_t)bp->b_data;
 	pmap_qenter(kva, pages, npages);
 	VM_CNT_INC(v_vnodeout);
 	VM_CNT_ADD(v_vnodepgsout, count);
 
 	iov.iov_base = (caddr_t)kva;
 	iov.iov_len = count;
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = offset;
 	uio.uio_resid = count;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_WRITE;
 	uio.uio_td = td;
 
 	error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred);
 
 	pmap_qremove(kva, npages);
 	uma_zfree(fuse_pbuf_zone, bp);
 
 	if (!error) {
 		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
 
 		for (i = 0; i < nwritten; i++) {
 			rtvals[i] = VM_PAGER_OK;
 			VM_OBJECT_WLOCK(pages[i]->object);
 			vm_page_undirty(pages[i]);
 			VM_OBJECT_WUNLOCK(pages[i]->object);
 		}
 	}
 	return rtvals[0];
 }
 
 static const char extattr_namespace_separator = '.';
 
 /*
     struct vop_getextattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_attrnamespace;
 	const char *a_name;
 	struct uio *a_uio;
 	size_t *a_size;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_getextattr(struct vop_getextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct fuse_dispatcher fdi;
 	struct fuse_getxattr_in *get_xattr_in;
 	struct fuse_getxattr_out *get_xattr_out;
 	struct mount *mp = vnode_mount(vp);
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	char *prefix;
 	char *attr_str;
 	size_t len;
 	int err;
 
-	fuse_trace_printf_vnop();
-
 	if (fuse_isdeadfs(vp))
 		return (ENXIO);
 
 	/* Default to looking for user attributes. */
 	if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
 		prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
 	else
 		prefix = EXTATTR_NAMESPACE_USER_STRING;
 
 	len = strlen(prefix) + sizeof(extattr_namespace_separator) +
 	    strlen(ap->a_name) + 1;
 
 	fdisp_init(&fdi, len + sizeof(*get_xattr_in));
 	fdisp_make_vp(&fdi, FUSE_GETXATTR, vp, td, cred);
 
 	get_xattr_in = fdi.indata;
 	/*
 	 * Check to see whether we're querying the available size or
 	 * issuing the actual request.  If we pass in 0, we get back struct
 	 * fuse_getxattr_out.  If we pass in a non-zero size, we get back
 	 * that much data, without the struct fuse_getxattr_out header.
 	 */
 	if (uio == NULL)
 		get_xattr_in->size = 0;
 	else
 		get_xattr_in->size = uio->uio_resid;
 
 	attr_str = (char *)fdi.indata + sizeof(*get_xattr_in);
 	snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator,
 	    ap->a_name);
 
 	err = fdisp_wait_answ(&fdi);
 	if (err != 0) {
 		if (err == ENOSYS)
 			fsess_set_notimpl(mp, FUSE_GETXATTR);
-		debug_printf("getxattr: got err=%d from daemon\n", err);
 		goto out;
 	}
 
 	get_xattr_out = fdi.answ;
 
 	if (ap->a_size != NULL)
 		*ap->a_size = get_xattr_out->size;
 
 	if (uio != NULL)
 		err = uiomove(fdi.answ, fdi.iosize, uio);
 
 out:
 	fdisp_destroy(&fdi);
 	return (err);
 }
 
 /*
     struct vop_setextattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_attrnamespace;
 	const char *a_name;
 	struct uio *a_uio;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_setextattr(struct vop_setextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct fuse_dispatcher fdi;
 	struct fuse_setxattr_in *set_xattr_in;
 	struct mount *mp = vnode_mount(vp);
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	char *prefix;
 	size_t len;
 	char *attr_str;
 	int err;
 	
-	fuse_trace_printf_vnop();
-
 	if (fuse_isdeadfs(vp))
 		return (ENXIO);
 
 	/* Default to looking for user attributes. */
 	if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
 		prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
 	else
 		prefix = EXTATTR_NAMESPACE_USER_STRING;
 
 	len = strlen(prefix) + sizeof(extattr_namespace_separator) +
 	    strlen(ap->a_name) + 1;
 
 	fdisp_init(&fdi, len + sizeof(*set_xattr_in) + uio->uio_resid);
 	fdisp_make_vp(&fdi, FUSE_SETXATTR, vp, td, cred);
 
 	set_xattr_in = fdi.indata;
 	set_xattr_in->size = uio->uio_resid;
 
 	attr_str = (char *)fdi.indata + sizeof(*set_xattr_in);
 	snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator,
 	    ap->a_name);
 
 	err = uiomove((char *)fdi.indata + sizeof(*set_xattr_in) + len,
 	    uio->uio_resid, uio);
 	if (err != 0) {
-		debug_printf("setxattr: got error %d from uiomove\n", err);
 		goto out;
 	}
 
 	err = fdisp_wait_answ(&fdi);
 
 	if (err != 0) {
 		if (err == ENOSYS)
 			fsess_set_notimpl(mp, FUSE_SETXATTR);
-		debug_printf("setxattr: got err=%d from daemon\n", err);
 		goto out;
 	}
 
 out:
 	fdisp_destroy(&fdi);
 	return (err);
 }
 
 /*
  * The Linux / FUSE extended attribute list is simply a collection of
  * NUL-terminated strings.  The FreeBSD extended attribute list is a single
  * byte length followed by a non-NUL terminated string.  So, this allows
  * conversion of the Linux / FUSE format to the FreeBSD format in place.
  * Linux attribute names are reported with the namespace as a prefix (e.g.
  * "user.attribute_name"), but in FreeBSD they are reported without the
  * namespace prefix (e.g. "attribute_name").  So, we're going from:
  *
  * user.attr_name1\0user.attr_name2\0
  *
  * to:
  *
  * <num>attr_name1<num>attr_name2
  *
  * Where "<num>" is a single byte number of characters in the attribute name.
  * 
  * Args:
  * prefix - exattr namespace prefix string
  * list, list_len - input list with namespace prefixes
  * bsd_list, bsd_list_len - output list compatible with bsd vfs
  */
 static int
 fuse_xattrlist_convert(char *prefix, const char *list, int list_len,
     char *bsd_list, int *bsd_list_len)
 {
 	int len, pos, dist_to_next, prefix_len;
 
 	pos = 0;
 	*bsd_list_len = 0;
 	prefix_len = strlen(prefix);
 
 	while (pos < list_len && list[pos] != '\0') {
 		dist_to_next = strlen(&list[pos]) + 1;
 		if (bcmp(&list[pos], prefix, prefix_len) == 0 &&
 		    list[pos + prefix_len] == extattr_namespace_separator) {
 			len = dist_to_next -
 			    (prefix_len + sizeof(extattr_namespace_separator)) - 1;
 			if (len >= EXTATTR_MAXNAMELEN)
 				return (ENAMETOOLONG);
 
 			bsd_list[*bsd_list_len] = len;
 			memcpy(&bsd_list[*bsd_list_len + 1],
 			    &list[pos + prefix_len +
 			    sizeof(extattr_namespace_separator)], len);
 
 			*bsd_list_len += len + 1;
 		}
 
 		pos += dist_to_next;
 	}
 
 	return (0);
 }
 
 /*
     struct vop_listextattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_attrnamespace;
 	struct uio *a_uio;
 	size_t *a_size;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_listextattr(struct vop_listextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct fuse_dispatcher fdi;
 	struct fuse_listxattr_in *list_xattr_in;
 	struct fuse_listxattr_out *list_xattr_out;
 	struct mount *mp = vnode_mount(vp);
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	size_t len;
 	char *prefix;
 	char *attr_str;
 	char *bsd_list = NULL;
 	char *linux_list;
 	int bsd_list_len;
 	int linux_list_len;
 	int err;
 
-	fuse_trace_printf_vnop();
-
 	if (fuse_isdeadfs(vp))
 		return (ENXIO);
 
 	/*
 	 * Add space for a NUL and the period separator if enabled.
 	 * Default to looking for user attributes.
 	 */
 	if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
 		prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
 	else
 		prefix = EXTATTR_NAMESPACE_USER_STRING;
 
 	len = strlen(prefix) + sizeof(extattr_namespace_separator) + 1;
 
 	fdisp_init(&fdi, sizeof(*list_xattr_in) + len);
 	fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred);
 
 	/*
 	 * Retrieve Linux / FUSE compatible list size.
 	 */
 	list_xattr_in = fdi.indata;
 	list_xattr_in->size = 0;
 	attr_str = (char *)fdi.indata + sizeof(*list_xattr_in);
 	snprintf(attr_str, len, "%s%c", prefix, extattr_namespace_separator);
 
 	err = fdisp_wait_answ(&fdi);
 	if (err != 0) {
 		if (err == ENOSYS)
 			fsess_set_notimpl(mp, FUSE_LISTXATTR);
-		debug_printf("listextattr: got err=%d from daemon\n", err);
 		goto out;
 	}
 
 	list_xattr_out = fdi.answ;
 	linux_list_len = list_xattr_out->size;
 	if (linux_list_len == 0) {
 		if (ap->a_size != NULL)
 			*ap->a_size = linux_list_len;
 		goto out;
 	}
 
 	/*
 	 * Retrieve Linux / FUSE compatible list values.
 	 */
 	fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred);
 	list_xattr_in = fdi.indata;
 	list_xattr_in->size = linux_list_len + sizeof(*list_xattr_out);
 	attr_str = (char *)fdi.indata + sizeof(*list_xattr_in);
 	snprintf(attr_str, len, "%s%c", prefix, extattr_namespace_separator);
 
 	err = fdisp_wait_answ(&fdi);
 	if (err != 0)
 		goto out;
 
 	linux_list = fdi.answ;
 	linux_list_len = fdi.iosize;
 
 	/*
 	 * Retrieve the BSD compatible list values.
 	 * The Linux / FUSE attribute list format isn't the same
 	 * as FreeBSD's format. So we need to transform it into
 	 * FreeBSD's format before giving it to the user.
 	 */
 	bsd_list = malloc(linux_list_len, M_TEMP, M_WAITOK);
 	err = fuse_xattrlist_convert(prefix, linux_list, linux_list_len,
 	    bsd_list, &bsd_list_len);
 	if (err != 0)
 		goto out;
 
 	if (ap->a_size != NULL)
 		*ap->a_size = bsd_list_len;
 
 	if (uio != NULL)
 		err = uiomove(bsd_list, bsd_list_len, uio);
 
 out:
 	free(bsd_list, M_TEMP);
 	fdisp_destroy(&fdi);
 	return (err);
 }
 
 /*
     struct vop_deleteextattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_attrnamespace;
 	const char *a_name;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct fuse_dispatcher fdi;
 	struct mount *mp = vnode_mount(vp);
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	char *prefix;
 	size_t len;
 	char *attr_str;
 	int err;
 
-	fuse_trace_printf_vnop();
-
 	if (fuse_isdeadfs(vp))
 		return (ENXIO);
 
 	/* Default to looking for user attributes. */
 	if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
 		prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
 	else
 		prefix = EXTATTR_NAMESPACE_USER_STRING;
 
 	len = strlen(prefix) + sizeof(extattr_namespace_separator) +
 	    strlen(ap->a_name) + 1;
 
 	fdisp_init(&fdi, len);
 	fdisp_make_vp(&fdi, FUSE_REMOVEXATTR, vp, td, cred);
 
 	attr_str = fdi.indata;
 	snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator,
 	    ap->a_name);
 
 	err = fdisp_wait_answ(&fdi);
 	if (err != 0) {
 		if (err == ENOSYS)
 			fsess_set_notimpl(mp, FUSE_REMOVEXATTR);
-		debug_printf("removexattr: got err=%d from daemon\n", err);
 	}
 
 	fdisp_destroy(&fdi);
 	return (err);
 }
 
 /*
     struct vnop_print_args {
 	struct vnode *a_vp;
     };
 */
 static int
 fuse_vnop_print(struct vop_print_args *ap)
 {
 	struct fuse_vnode_data *fvdat = VTOFUD(ap->a_vp);
 
 	printf("nodeid: %ju, parent nodeid: %ju, nlookup: %ju, flag: %#x\n",
 	    (uintmax_t)VTOILLU(ap->a_vp), (uintmax_t)fvdat->parent_nid,
 	    (uintmax_t)fvdat->nlookup,
 	    fvdat->flag);
 
 	return 0;
 }
Index: projects/capsicum-test/sys/kern/imgact_elf.c
===================================================================
--- projects/capsicum-test/sys/kern/imgact_elf.c	(revision 345709)
+++ projects/capsicum-test/sys/kern/imgact_elf.c	(revision 345710)
@@ -1,2677 +1,2704 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2017 Dell EMC
  * Copyright (c) 2000-2001, 2003 David O'Brien
  * Copyright (c) 1995-1996 Søren Schmidt
  * Copyright (c) 1996 Peter Wemm
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/compressor.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/imgact.h>
 #include <sys/imgact_elf.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mman.h>
 #include <sys/namei.h>
 #include <sys/pioctl.h>
 #include <sys/proc.h>
 #include <sys/procfs.h>
 #include <sys/ptrace.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sf_buf.h>
 #include <sys/smp.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 #include <sys/syslog.h>
 #include <sys/eventhandler.h>
 #include <sys/user.h>
 
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 
 #include <machine/elf.h>
 #include <machine/md_var.h>
 
 #define ELF_NOTE_ROUNDSIZE	4
 #define OLD_EI_BRAND	8
 
 static int __elfN(check_header)(const Elf_Ehdr *hdr);
 static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp,
-    const char *interp, int interp_name_len, int32_t *osrel, uint32_t *fctl0);
+    const char *interp, int32_t *osrel, uint32_t *fctl0);
 static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
     u_long *entry);
 static int __elfN(load_section)(struct image_params *imgp, vm_ooffset_t offset,
     caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot);
 static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp);
 static bool __elfN(freebsd_trans_osrel)(const Elf_Note *note,
     int32_t *osrel);
 static bool kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel);
 static boolean_t __elfN(check_note)(struct image_params *imgp,
     Elf_Brandnote *checknote, int32_t *osrel, uint32_t *fctl0);
 static vm_prot_t __elfN(trans_prot)(Elf_Word);
 static Elf_Word __elfN(untrans_prot)(vm_prot_t);
 
 SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0,
     "");
 
 #define	CORE_BUF_SIZE	(16 * 1024)
 
 int __elfN(fallback_brand) = -1;
 SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
     fallback_brand, CTLFLAG_RWTUN, &__elfN(fallback_brand), 0,
     __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort");
 
 static int elf_legacy_coredump = 0;
 SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW, 
     &elf_legacy_coredump, 0,
     "include all and only RW pages in core dumps");
 
 int __elfN(nxstack) =
 #if defined(__amd64__) || defined(__powerpc64__) /* both 64 and 32 bit */ || \
     (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__) || \
     defined(__riscv)
 	1;
 #else
 	0;
 #endif
 SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
     nxstack, CTLFLAG_RW, &__elfN(nxstack), 0,
     __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable non-executable stack");
 
 #if __ELF_WORD_SIZE == 32 && (defined(__amd64__) || defined(__i386__))
 int i386_read_exec = 0;
 SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0,
     "enable execution from readable segments");
 #endif
 
 SYSCTL_NODE(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, aslr, CTLFLAG_RW, 0,
     "");
 #define	ASLR_NODE_OID	__CONCAT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), _aslr)
 
 static int __elfN(aslr_enabled) = 0;
 SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, enable, CTLFLAG_RWTUN,
     &__elfN(aslr_enabled), 0,
     __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
     ": enable address map randomization");
 
 static int __elfN(pie_aslr_enabled) = 0;
 SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, pie_enable, CTLFLAG_RWTUN,
     &__elfN(pie_aslr_enabled), 0,
     __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
     ": enable address map randomization for PIE binaries");
 
 static int __elfN(aslr_honor_sbrk) = 1;
 SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, honor_sbrk, CTLFLAG_RW,
     &__elfN(aslr_honor_sbrk), 0,
     __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": assume sbrk is used");
 
 static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
 
 #define	aligned(a, t)	(rounddown2((u_long)(a), sizeof(t)) == (u_long)(a))
 
 static const char FREEBSD_ABI_VENDOR[] = "FreeBSD";
 
 Elf_Brandnote __elfN(freebsd_brandnote) = {
 	.hdr.n_namesz	= sizeof(FREEBSD_ABI_VENDOR),
 	.hdr.n_descsz	= sizeof(int32_t),
 	.hdr.n_type	= NT_FREEBSD_ABI_TAG,
 	.vendor		= FREEBSD_ABI_VENDOR,
 	.flags		= BN_TRANSLATE_OSREL,
 	.trans_osrel	= __elfN(freebsd_trans_osrel)
 };
 
 static bool
 __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel)
 {
 	uintptr_t p;
 
 	p = (uintptr_t)(note + 1);
 	p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
 	*osrel = *(const int32_t *)(p);
 
 	return (true);
 }
 
 static const char GNU_ABI_VENDOR[] = "GNU";
 static int GNU_KFREEBSD_ABI_DESC = 3;
 
 Elf_Brandnote __elfN(kfreebsd_brandnote) = {
 	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
 	.hdr.n_descsz	= 16,	/* XXX at least 16 */
 	.hdr.n_type	= 1,
 	.vendor		= GNU_ABI_VENDOR,
 	.flags		= BN_TRANSLATE_OSREL,
 	.trans_osrel	= kfreebsd_trans_osrel
 };
 
 static bool
 kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel)
 {
 	const Elf32_Word *desc;
 	uintptr_t p;
 
 	p = (uintptr_t)(note + 1);
 	p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
 
 	desc = (const Elf32_Word *)p;
 	if (desc[0] != GNU_KFREEBSD_ABI_DESC)
 		return (false);
 
 	/*
 	 * Debian GNU/kFreeBSD embed the earliest compatible kernel version
 	 * (__FreeBSD_version: <major><two digit minor>Rxx) in the LSB way.
 	 */
 	*osrel = desc[1] * 100000 + desc[2] * 1000 + desc[3];
 
 	return (true);
 }
 
 int
 __elfN(insert_brand_entry)(Elf_Brandinfo *entry)
 {
 	int i;
 
 	for (i = 0; i < MAX_BRANDS; i++) {
 		if (elf_brand_list[i] == NULL) {
 			elf_brand_list[i] = entry;
 			break;
 		}
 	}
 	if (i == MAX_BRANDS) {
 		printf("WARNING: %s: could not insert brandinfo entry: %p\n",
 			__func__, entry);
 		return (-1);
 	}
 	return (0);
 }
 
 int
 __elfN(remove_brand_entry)(Elf_Brandinfo *entry)
 {
 	int i;
 
 	for (i = 0; i < MAX_BRANDS; i++) {
 		if (elf_brand_list[i] == entry) {
 			elf_brand_list[i] = NULL;
 			break;
 		}
 	}
 	if (i == MAX_BRANDS)
 		return (-1);
 	return (0);
 }
 
 int
 __elfN(brand_inuse)(Elf_Brandinfo *entry)
 {
 	struct proc *p;
 	int rval = FALSE;
 
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_sysent == entry->sysvec) {
 			rval = TRUE;
 			break;
 		}
 	}
 	sx_sunlock(&allproc_lock);
 
 	return (rval);
 }
 
 static Elf_Brandinfo *
 __elfN(get_brandinfo)(struct image_params *imgp, const char *interp,
-    int interp_name_len, int32_t *osrel, uint32_t *fctl0)
+    int32_t *osrel, uint32_t *fctl0)
 {
 	const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
 	Elf_Brandinfo *bi, *bi_m;
 	boolean_t ret;
-	int i;
+	int i, interp_name_len;
 
+	interp_name_len = interp != NULL ? strlen(interp) : 0;
+
 	/*
 	 * We support four types of branding -- (1) the ELF EI_OSABI field
 	 * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string
 	 * branding w/in the ELF header, (3) path of the `interp_path'
 	 * field, and (4) the ".note.ABI-tag" ELF section.
 	 */
 
 	/* Look for an ".note.ABI-tag" ELF section */
 	bi_m = NULL;
 	for (i = 0; i < MAX_BRANDS; i++) {
 		bi = elf_brand_list[i];
 		if (bi == NULL)
 			continue;
 		if (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0)
 			continue;
 		if (hdr->e_machine == bi->machine && (bi->flags &
 		    (BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) {
 			ret = __elfN(check_note)(imgp, bi->brand_note, osrel,
 			    fctl0);
 			/* Give brand a chance to veto check_note's guess */
 			if (ret && bi->header_supported)
 				ret = bi->header_supported(imgp);
 			/*
 			 * If note checker claimed the binary, but the
 			 * interpreter path in the image does not
 			 * match default one for the brand, try to
 			 * search for other brands with the same
 			 * interpreter.  Either there is better brand
 			 * with the right interpreter, or, failing
 			 * this, we return first brand which accepted
 			 * our note and, optionally, header.
 			 */
 			if (ret && bi_m == NULL && interp != NULL &&
 			    (bi->interp_path == NULL ||
 			    (strlen(bi->interp_path) + 1 != interp_name_len ||
 			    strncmp(interp, bi->interp_path, interp_name_len)
 			    != 0))) {
 				bi_m = bi;
 				ret = 0;
 			}
 			if (ret)
 				return (bi);
 		}
 	}
 	if (bi_m != NULL)
 		return (bi_m);
 
 	/* If the executable has a brand, search for it in the brand list. */
 	for (i = 0; i < MAX_BRANDS; i++) {
 		bi = elf_brand_list[i];
 		if (bi == NULL || (bi->flags & BI_BRAND_NOTE_MANDATORY) != 0 ||
 		    (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0))
 			continue;
 		if (hdr->e_machine == bi->machine &&
 		    (hdr->e_ident[EI_OSABI] == bi->brand ||
 		    (bi->compat_3_brand != NULL &&
 		    strcmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
 		    bi->compat_3_brand) == 0))) {
 			/* Looks good, but give brand a chance to veto */
 			if (bi->header_supported == NULL ||
 			    bi->header_supported(imgp)) {
 				/*
 				 * Again, prefer strictly matching
 				 * interpreter path.
 				 */
 				if (interp_name_len == 0 &&
 				    bi->interp_path == NULL)
 					return (bi);
 				if (bi->interp_path != NULL &&
 				    strlen(bi->interp_path) + 1 ==
 				    interp_name_len && strncmp(interp,
 				    bi->interp_path, interp_name_len) == 0)
 					return (bi);
 				if (bi_m == NULL)
 					bi_m = bi;
 			}
 		}
 	}
 	if (bi_m != NULL)
 		return (bi_m);
 
 	/* No known brand, see if the header is recognized by any brand */
 	for (i = 0; i < MAX_BRANDS; i++) {
 		bi = elf_brand_list[i];
 		if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY ||
 		    bi->header_supported == NULL)
 			continue;
 		if (hdr->e_machine == bi->machine) {
 			ret = bi->header_supported(imgp);
 			if (ret)
 				return (bi);
 		}
 	}
 
 	/* Lacking a known brand, search for a recognized interpreter. */
 	if (interp != NULL) {
 		for (i = 0; i < MAX_BRANDS; i++) {
 			bi = elf_brand_list[i];
 			if (bi == NULL || (bi->flags &
 			    (BI_BRAND_NOTE_MANDATORY | BI_BRAND_ONLY_STATIC))
 			    != 0)
 				continue;
 			if (hdr->e_machine == bi->machine &&
 			    bi->interp_path != NULL &&
 			    /* ELF image p_filesz includes terminating zero */
 			    strlen(bi->interp_path) + 1 == interp_name_len &&
 			    strncmp(interp, bi->interp_path, interp_name_len)
 			    == 0 && (bi->header_supported == NULL ||
 			    bi->header_supported(imgp)))
 				return (bi);
 		}
 	}
 
 	/* Lacking a recognized interpreter, try the default brand */
 	for (i = 0; i < MAX_BRANDS; i++) {
 		bi = elf_brand_list[i];
 		if (bi == NULL || (bi->flags & BI_BRAND_NOTE_MANDATORY) != 0 ||
 		    (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0))
 			continue;
 		if (hdr->e_machine == bi->machine &&
 		    __elfN(fallback_brand) == bi->brand &&
 		    (bi->header_supported == NULL ||
 		    bi->header_supported(imgp)))
 			return (bi);
 	}
 	return (NULL);
 }
 
 static int
 __elfN(check_header)(const Elf_Ehdr *hdr)
 {
 	Elf_Brandinfo *bi;
 	int i;
 
 	if (!IS_ELF(*hdr) ||
 	    hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
 	    hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
 	    hdr->e_ident[EI_VERSION] != EV_CURRENT ||
 	    hdr->e_phentsize != sizeof(Elf_Phdr) ||
 	    hdr->e_version != ELF_TARG_VER)
 		return (ENOEXEC);
 
 	/*
 	 * Make sure we have at least one brand for this machine.
 	 */
 
 	for (i = 0; i < MAX_BRANDS; i++) {
 		bi = elf_brand_list[i];
 		if (bi != NULL && bi->machine == hdr->e_machine)
 			break;
 	}
 	if (i == MAX_BRANDS)
 		return (ENOEXEC);
 
 	return (0);
 }
 
 static int
 __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t start, vm_offset_t end, vm_prot_t prot)
 {
 	struct sf_buf *sf;
 	int error;
 	vm_offset_t off;
 
 	/*
 	 * Create the page if it doesn't exist yet. Ignore errors.
 	 */
 	vm_map_fixed(map, NULL, 0, trunc_page(start), round_page(end) -
 	    trunc_page(start), VM_PROT_ALL, VM_PROT_ALL, MAP_CHECK_EXCL);
 
 	/*
 	 * Find the page from the underlying object.
 	 */
 	if (object != NULL) {
 		sf = vm_imgact_map_page(object, offset);
 		if (sf == NULL)
 			return (KERN_FAILURE);
 		off = offset - trunc_page(offset);
 		error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start,
 		    end - start);
 		vm_imgact_unmap_page(sf);
 		if (error != 0)
 			return (KERN_FAILURE);
 	}
 
 	return (KERN_SUCCESS);
 }
 
 static int
 __elfN(map_insert)(struct image_params *imgp, vm_map_t map, vm_object_t object,
     vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot,
     int cow)
 {
 	struct sf_buf *sf;
 	vm_offset_t off;
 	vm_size_t sz;
 	int error, locked, rv;
 
 	if (start != trunc_page(start)) {
 		rv = __elfN(map_partial)(map, object, offset, start,
 		    round_page(start), prot);
 		if (rv != KERN_SUCCESS)
 			return (rv);
 		offset += round_page(start) - start;
 		start = round_page(start);
 	}
 	if (end != round_page(end)) {
 		rv = __elfN(map_partial)(map, object, offset +
 		    trunc_page(end) - start, trunc_page(end), end, prot);
 		if (rv != KERN_SUCCESS)
 			return (rv);
 		end = trunc_page(end);
 	}
 	if (start >= end)
 		return (KERN_SUCCESS);
 	if ((offset & PAGE_MASK) != 0) {
 		/*
 		 * The mapping is not page aligned.  This means that we have
 		 * to copy the data.
 		 */
 		rv = vm_map_fixed(map, NULL, 0, start, end - start,
 		    prot | VM_PROT_WRITE, VM_PROT_ALL, MAP_CHECK_EXCL);
 		if (rv != KERN_SUCCESS)
 			return (rv);
 		if (object == NULL)
 			return (KERN_SUCCESS);
 		for (; start < end; start += sz) {
 			sf = vm_imgact_map_page(object, offset);
 			if (sf == NULL)
 				return (KERN_FAILURE);
 			off = offset - trunc_page(offset);
 			sz = end - start;
 			if (sz > PAGE_SIZE - off)
 				sz = PAGE_SIZE - off;
 			error = copyout((caddr_t)sf_buf_kva(sf) + off,
 			    (caddr_t)start, sz);
 			vm_imgact_unmap_page(sf);
 			if (error != 0)
 				return (KERN_FAILURE);
 			offset += sz;
 		}
 	} else {
 		vm_object_reference(object);
 		rv = vm_map_fixed(map, object, offset, start, end - start,
 		    prot, VM_PROT_ALL, cow | MAP_CHECK_EXCL);
 		if (rv != KERN_SUCCESS) {
 			locked = VOP_ISLOCKED(imgp->vp);
 			VOP_UNLOCK(imgp->vp, 0);
 			vm_object_deallocate(object);
 			vn_lock(imgp->vp, locked | LK_RETRY);
 			return (rv);
 		}
 	}
 	return (KERN_SUCCESS);
 }
 
 static int
 __elfN(load_section)(struct image_params *imgp, vm_ooffset_t offset,
     caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)
 {
 	struct sf_buf *sf;
 	size_t map_len;
 	vm_map_t map;
 	vm_object_t object;
 	vm_offset_t off, map_addr;
 	int error, rv, cow;
 	size_t copy_len;
 	vm_ooffset_t file_addr;
 
 	/*
 	 * It's necessary to fail if the filsz + offset taken from the
 	 * header is greater than the actual file pager object's size.
 	 * If we were to allow this, then the vm_map_find() below would
 	 * walk right off the end of the file object and into the ether.
 	 *
 	 * While I'm here, might as well check for something else that
 	 * is invalid: filsz cannot be greater than memsz.
 	 */
 	if ((filsz != 0 && (off_t)filsz + offset > imgp->attr->va_size) ||
 	    filsz > memsz) {
 		uprintf("elf_load_section: truncated ELF file\n");
 		return (ENOEXEC);
 	}
 
 	object = imgp->object;
 	map = &imgp->proc->p_vmspace->vm_map;
 	map_addr = trunc_page((vm_offset_t)vmaddr);
 	file_addr = trunc_page(offset);
 
 	/*
 	 * We have two choices.  We can either clear the data in the last page
 	 * of an oversized mapping, or we can start the anon mapping a page
 	 * early and copy the initialized data into that first page.  We
 	 * choose the second.
 	 */
 	if (filsz == 0)
 		map_len = 0;
 	else if (memsz > filsz)
 		map_len = trunc_page(offset + filsz) - file_addr;
 	else
 		map_len = round_page(offset + filsz) - file_addr;
 
 	if (map_len != 0) {
 		/* cow flags: don't dump readonly sections in core */
 		cow = MAP_COPY_ON_WRITE | MAP_PREFAULT |
 		    (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP);
 
 		rv = __elfN(map_insert)(imgp, map,
 				      object,
 				      file_addr,	/* file offset */
 				      map_addr,		/* virtual start */
 				      map_addr + map_len,/* virtual end */
 				      prot,
 				      cow);
 		if (rv != KERN_SUCCESS)
 			return (EINVAL);
 
 		/* we can stop now if we've covered it all */
 		if (memsz == filsz)
 			return (0);
 	}
 
 
 	/*
 	 * We have to get the remaining bit of the file into the first part
 	 * of the oversized map segment.  This is normally because the .data
 	 * segment in the file is extended to provide bss.  It's a neat idea
 	 * to try and save a page, but it's a pain in the behind to implement.
 	 */
 	copy_len = filsz == 0 ? 0 : (offset + filsz) - trunc_page(offset +
 	    filsz);
 	map_addr = trunc_page((vm_offset_t)vmaddr + filsz);
 	map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr;
 
 	/* This had damn well better be true! */
 	if (map_len != 0) {
 		rv = __elfN(map_insert)(imgp, map, NULL, 0, map_addr,
 		    map_addr + map_len, prot, 0);
 		if (rv != KERN_SUCCESS)
 			return (EINVAL);
 	}
 
 	if (copy_len != 0) {
 		sf = vm_imgact_map_page(object, offset + filsz);
 		if (sf == NULL)
 			return (EIO);
 
 		/* send the page fragment to user space */
 		off = trunc_page(offset + filsz) - trunc_page(offset + filsz);
 		error = copyout((caddr_t)sf_buf_kva(sf) + off,
 		    (caddr_t)map_addr, copy_len);
 		vm_imgact_unmap_page(sf);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Remove write access to the page if it was only granted by map_insert
 	 * to allow copyout.
 	 */
 	if ((prot & VM_PROT_WRITE) == 0)
 		vm_map_protect(map, trunc_page(map_addr), round_page(map_addr +
 		    map_len), prot, FALSE);
 
 	return (0);
 }
 
 /*
  * Load the file "file" into memory.  It may be either a shared object
  * or an executable.
  *
  * The "addr" reference parameter is in/out.  On entry, it specifies
  * the address where a shared object should be loaded.  If the file is
  * an executable, this value is ignored.  On exit, "addr" specifies
  * where the file was actually loaded.
  *
  * The "entry" reference parameter is out only.  On exit, it specifies
  * the entry point for the loaded file.
  */
 static int
 __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
 	u_long *entry)
 {
 	struct {
 		struct nameidata nd;
 		struct vattr attr;
 		struct image_params image_params;
 	} *tempdata;
 	const Elf_Ehdr *hdr = NULL;
 	const Elf_Phdr *phdr = NULL;
 	struct nameidata *nd;
 	struct vattr *attr;
 	struct image_params *imgp;
 	vm_prot_t prot;
 	u_long rbase;
 	u_long base_addr = 0;
 	int error, i, numsegs;
 
 #ifdef CAPABILITY_MODE
 	/*
 	 * XXXJA: This check can go away once we are sufficiently confident
 	 * that the checks in namei() are correct.
 	 */
 	if (IN_CAPABILITY_MODE(curthread))
 		return (ECAPMODE);
 #endif
 
 	tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK);
 	nd = &tempdata->nd;
 	attr = &tempdata->attr;
 	imgp = &tempdata->image_params;
 
 	/*
 	 * Initialize part of the common data
 	 */
 	imgp->proc = p;
 	imgp->attr = attr;
 	imgp->firstpage = NULL;
 	imgp->image_header = NULL;
 	imgp->object = NULL;
 	imgp->execlabel = NULL;
 
 	NDINIT(nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, file, curthread);
 	if ((error = namei(nd)) != 0) {
 		nd->ni_vp = NULL;
 		goto fail;
 	}
 	NDFREE(nd, NDF_ONLY_PNBUF);
 	imgp->vp = nd->ni_vp;
 
 	/*
 	 * Check permissions, modes, uid, etc on the file, and "open" it.
 	 */
 	error = exec_check_permissions(imgp);
 	if (error)
 		goto fail;
 
 	error = exec_map_first_page(imgp);
 	if (error)
 		goto fail;
 
 	/*
 	 * Also make certain that the interpreter stays the same, so set
 	 * its VV_TEXT flag, too.
 	 */
 	VOP_SET_TEXT(nd->ni_vp);
 
 	imgp->object = nd->ni_vp->v_object;
 
 	hdr = (const Elf_Ehdr *)imgp->image_header;
 	if ((error = __elfN(check_header)(hdr)) != 0)
 		goto fail;
 	if (hdr->e_type == ET_DYN)
 		rbase = *addr;
 	else if (hdr->e_type == ET_EXEC)
 		rbase = 0;
 	else {
 		error = ENOEXEC;
 		goto fail;
 	}
 
 	/* Only support headers that fit within first page for now      */
 	if ((hdr->e_phoff > PAGE_SIZE) ||
 	    (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) {
 		error = ENOEXEC;
 		goto fail;
 	}
 
 	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
 	if (!aligned(phdr, Elf_Addr)) {
 		error = ENOEXEC;
 		goto fail;
 	}
 
 	for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
 		if (phdr[i].p_type == PT_LOAD && phdr[i].p_memsz != 0) {
 			/* Loadable segment */
 			prot = __elfN(trans_prot)(phdr[i].p_flags);
 			error = __elfN(load_section)(imgp, phdr[i].p_offset,
 			    (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
 			    phdr[i].p_memsz, phdr[i].p_filesz, prot);
 			if (error != 0)
 				goto fail;
 			/*
 			 * Establish the base address if this is the
 			 * first segment.
 			 */
 			if (numsegs == 0)
   				base_addr = trunc_page(phdr[i].p_vaddr +
 				    rbase);
 			numsegs++;
 		}
 	}
 	*addr = base_addr;
 	*entry = (unsigned long)hdr->e_entry + rbase;
 
 fail:
 	if (imgp->firstpage)
 		exec_unmap_first_page(imgp);
 
 	if (nd->ni_vp)
 		vput(nd->ni_vp);
 
 	free(tempdata, M_TEMP);
 
 	return (error);
 }
 
 static u_long
 __CONCAT(rnd_, __elfN(base))(vm_map_t map __unused, u_long minv, u_long maxv,
     u_int align)
 {
 	u_long rbase, res;
 
 	MPASS(vm_map_min(map) <= minv);
 	MPASS(maxv <= vm_map_max(map));
 	MPASS(minv < maxv);
 	MPASS(minv + align < maxv);
 	arc4rand(&rbase, sizeof(rbase), 0);
 	res = roundup(minv, (u_long)align) + rbase % (maxv - minv);
 	res &= ~((u_long)align - 1);
 	if (res >= maxv)
 		res -= align;
 	KASSERT(res >= minv,
 	    ("res %#lx < minv %#lx, maxv %#lx rbase %#lx",
 	    res, minv, maxv, rbase));
 	KASSERT(res < maxv,
 	    ("res %#lx > maxv %#lx, minv %#lx rbase %#lx",
 	    res, maxv, minv, rbase));
 	return (res);
 }
 
 static int
 __elfN(enforce_limits)(struct image_params *imgp, const Elf_Ehdr *hdr,
     const Elf_Phdr *phdr, u_long et_dyn_addr)
 {
 	struct vmspace *vmspace;
 	const char *err_str;
 	u_long text_size, data_size, total_size, text_addr, data_addr;
 	u_long seg_size, seg_addr;
 	int i;
 
 	err_str = NULL;
 	text_size = data_size = total_size = text_addr = data_addr = 0;
 
 	for (i = 0; i < hdr->e_phnum; i++) {
 		if (phdr[i].p_type != PT_LOAD || phdr[i].p_memsz == 0)
 			continue;
 
 		seg_addr = trunc_page(phdr[i].p_vaddr + et_dyn_addr);
 		seg_size = round_page(phdr[i].p_memsz +
 		    phdr[i].p_vaddr + et_dyn_addr - seg_addr);
 
 		/*
 		 * Make the largest executable segment the official
 		 * text segment and all others data.
 		 *
 		 * Note that obreak() assumes that data_addr + data_size == end
 		 * of data load area, and the ELF file format expects segments
 		 * to be sorted by address.  If multiple data segments exist,
 		 * the last one will be used.
 		 */
 
 		if ((phdr[i].p_flags & PF_X) != 0 && text_size < seg_size) {
 			text_size = seg_size;
 			text_addr = seg_addr;
 		} else {
 			data_size = seg_size;
 			data_addr = seg_addr;
 		}
 		total_size += seg_size;
 	}
 	
 	if (data_addr == 0 && data_size == 0) {
 		data_addr = text_addr;
 		data_size = text_size;
 	}
 
 	/*
 	 * Check limits.  It should be safe to check the
 	 * limits after loading the segments since we do
 	 * not actually fault in all the segments pages.
 	 */
 	PROC_LOCK(imgp->proc);
 	if (data_size > lim_cur_proc(imgp->proc, RLIMIT_DATA))
 		err_str = "Data segment size exceeds process limit";
 	else if (text_size > maxtsiz)
 		err_str = "Text segment size exceeds system limit";
 	else if (total_size > lim_cur_proc(imgp->proc, RLIMIT_VMEM))
 		err_str = "Total segment size exceeds process limit";
 	else if (racct_set(imgp->proc, RACCT_DATA, data_size) != 0)
 		err_str = "Data segment size exceeds resource limit";
 	else if (racct_set(imgp->proc, RACCT_VMEM, total_size) != 0)
 		err_str = "Total segment size exceeds resource limit";
 	PROC_UNLOCK(imgp->proc);
 	if (err_str != NULL) {
 		uprintf("%s\n", err_str);
 		return (ENOMEM);
 	}
 
 	vmspace = imgp->proc->p_vmspace;
 	vmspace->vm_tsize = text_size >> PAGE_SHIFT;
 	vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
 	vmspace->vm_dsize = data_size >> PAGE_SHIFT;
 	vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;
 
 	return (0);
 }
 
+static int
+__elfN(get_interp)(struct image_params *imgp, const Elf_Phdr *phdr,
+    char **interpp, bool *free_interpp)
+{
+	struct thread *td;
+	char *interp;
+	int error, interp_name_len;
+
+	KASSERT(phdr->p_type == PT_INTERP,
+	    ("%s: p_type %u != PT_INTERP", __func__, phdr->p_type));
+	KASSERT(VOP_ISLOCKED(imgp->vp),
+	    ("%s: vp %p is not locked", __func__, imgp->vp));
+
+	td = curthread;
+
+	/* Path to interpreter */
+	if (phdr->p_filesz < 2 || phdr->p_filesz > MAXPATHLEN) {
+		uprintf("Invalid PT_INTERP\n");
+		return (ENOEXEC);
+	}
+
+	interp_name_len = phdr->p_filesz;
+	if (phdr->p_offset > PAGE_SIZE ||
+	    interp_name_len > PAGE_SIZE - phdr->p_offset) {
+		VOP_UNLOCK(imgp->vp, 0);
+		interp = malloc(interp_name_len + 1, M_TEMP, M_WAITOK);
+		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+		error = vn_rdwr(UIO_READ, imgp->vp, interp,
+		    interp_name_len, phdr->p_offset,
+		    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
+		    NOCRED, NULL, td);
+		if (error != 0) {
+			free(interp, M_TEMP);
+			uprintf("i/o error PT_INTERP %d\n", error);
+			return (error);
+		}
+		interp[interp_name_len] = '\0';
+
+		*interpp = interp;
+		*free_interpp = true;
+		return (0);
+	}
+
+	interp = __DECONST(char *, imgp->image_header) + phdr->p_offset;
+	if (interp[interp_name_len - 1] != '\0') {
+		uprintf("Invalid PT_INTERP\n");
+		return (ENOEXEC);
+	}
+
+	*interpp = interp;
+	*free_interpp = false;
+	return (0);
+}
+
 /*
  * Impossible et_dyn_addr initial value indicating that the real base
  * must be calculated later with some randomization applied.
  */
 #define	ET_DYN_ADDR_RAND	1
 
 static int
 __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
 {
 	struct thread *td;
 	const Elf_Ehdr *hdr;
 	const Elf_Phdr *phdr;
 	Elf_Auxargs *elf_auxargs;
 	struct vmspace *vmspace;
 	vm_map_t map;
 	const char *newinterp;
-	char *interp, *interp_buf, *path;
+	char *interp, *path;
 	Elf_Brandinfo *brand_info;
 	struct sysentvec *sv;
 	vm_prot_t prot;
 	u_long addr, baddr, et_dyn_addr, entry, proghdr;
 	u_long maxalign, mapsz, maxv, maxv1;
 	uint32_t fctl0;
 	int32_t osrel;
-	int error, i, n, interp_name_len, have_interp;
+	bool free_interp;
+	int error, i, n, have_interp;
 
 	hdr = (const Elf_Ehdr *)imgp->image_header;
 
 	/*
 	 * Do we have a valid ELF header ?
 	 *
 	 * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later
 	 * if particular brand doesn't support it.
 	 */
 	if (__elfN(check_header)(hdr) != 0 ||
 	    (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN))
 		return (-1);
 
 	/*
 	 * From here on down, we return an errno, not -1, as we've
 	 * detected an ELF file.
 	 */
 
 	if ((hdr->e_phoff > PAGE_SIZE) ||
 	    (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) {
 		/* Only support headers in first page for now */
 		uprintf("Program headers not in the first page\n");
 		return (ENOEXEC);
 	}
 	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); 
 	if (!aligned(phdr, Elf_Addr)) {
 		uprintf("Unaligned program headers\n");
 		return (ENOEXEC);
 	}
 
 	n = error = 0;
 	baddr = 0;
 	osrel = 0;
 	fctl0 = 0;
 	entry = proghdr = 0;
-	interp_name_len = 0;
-	newinterp = NULL;
-	interp = interp_buf = NULL;
+	newinterp = interp = NULL;
+	free_interp = false;
 	td = curthread;
 	maxalign = PAGE_SIZE;
 	mapsz = 0;
 
 	for (i = 0; i < hdr->e_phnum; i++) {
 		switch (phdr[i].p_type) {
 		case PT_LOAD:
 			if (n == 0)
 				baddr = phdr[i].p_vaddr;
 			if (phdr[i].p_align > maxalign)
 				maxalign = phdr[i].p_align;
 			mapsz += phdr[i].p_memsz;
 			n++;
 			break;
 		case PT_INTERP:
 			/* Path to interpreter */
-			if (phdr[i].p_filesz < 2 ||
-			    phdr[i].p_filesz > MAXPATHLEN) {
-				uprintf("Invalid PT_INTERP\n");
-				error = ENOEXEC;
-				goto ret;
-			}
 			if (interp != NULL) {
 				uprintf("Multiple PT_INTERP headers\n");
 				error = ENOEXEC;
 				goto ret;
 			}
-			interp_name_len = phdr[i].p_filesz;
-			if (phdr[i].p_offset > PAGE_SIZE ||
-			    interp_name_len > PAGE_SIZE - phdr[i].p_offset) {
-				VOP_UNLOCK(imgp->vp, 0);
-				interp_buf = malloc(interp_name_len + 1, M_TEMP,
-				    M_WAITOK);
-				vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
-				error = vn_rdwr(UIO_READ, imgp->vp, interp_buf,
-				    interp_name_len, phdr[i].p_offset,
-				    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
-				    NOCRED, NULL, td);
-				if (error != 0) {
-					uprintf("i/o error PT_INTERP %d\n",
-					    error);
-					goto ret;
-				}
-				interp_buf[interp_name_len] = '\0';
-				interp = interp_buf;
-			} else {
-				interp = __DECONST(char *, imgp->image_header) +
-				    phdr[i].p_offset;
-				if (interp[interp_name_len - 1] != '\0') {
-					uprintf("Invalid PT_INTERP\n");
-					error = ENOEXEC;
-					goto ret;
-				}
-			}
+			error = __elfN(get_interp)(imgp, &phdr[i], &interp,
+			    &free_interp);
+			if (error != 0)
+				goto ret;
 			break;
 		case PT_GNU_STACK:
 			if (__elfN(nxstack))
 				imgp->stack_prot =
 				    __elfN(trans_prot)(phdr[i].p_flags);
 			imgp->stack_sz = phdr[i].p_memsz;
 			break;
 		}
 	}
 
-	brand_info = __elfN(get_brandinfo)(imgp, interp, interp_name_len,
-	    &osrel, &fctl0);
+	brand_info = __elfN(get_brandinfo)(imgp, interp, &osrel, &fctl0);
 	if (brand_info == NULL) {
 		uprintf("ELF binary type \"%u\" not known.\n",
 		    hdr->e_ident[EI_OSABI]);
 		error = ENOEXEC;
 		goto ret;
 	}
 	sv = brand_info->sysvec;
 	et_dyn_addr = 0;
 	if (hdr->e_type == ET_DYN) {
 		if ((brand_info->flags & BI_CAN_EXEC_DYN) == 0) {
 			uprintf("Cannot execute shared object\n");
 			error = ENOEXEC;
 			goto ret;
 		}
 		/*
 		 * Honour the base load address from the dso if it is
 		 * non-zero for some reason.
 		 */
 		if (baddr == 0) {
 			if ((sv->sv_flags & SV_ASLR) == 0 ||
 			    (fctl0 & NT_FREEBSD_FCTL_ASLR_DISABLE) != 0)
 				et_dyn_addr = ET_DYN_LOAD_ADDR;
 			else if ((__elfN(pie_aslr_enabled) &&
 			    (imgp->proc->p_flag2 & P2_ASLR_DISABLE) == 0) ||
 			    (imgp->proc->p_flag2 & P2_ASLR_ENABLE) != 0)
 				et_dyn_addr = ET_DYN_ADDR_RAND;
 			else
 				et_dyn_addr = ET_DYN_LOAD_ADDR;
 		}
 	}
 	if (interp != NULL && brand_info->interp_newpath != NULL)
 		newinterp = brand_info->interp_newpath;
 
 	/*
 	 * Avoid a possible deadlock if the current address space is destroyed
 	 * and that address space maps the locked vnode.  In the common case,
 	 * the locked vnode's v_usecount is decremented but remains greater
 	 * than zero.  Consequently, the vnode lock is not needed by vrele().
 	 * However, in cases where the vnode lock is external, such as nullfs,
 	 * v_usecount may become zero.
 	 *
 	 * The VV_TEXT flag prevents modifications to the executable while
 	 * the vnode is unlocked.
 	 */
 	VOP_UNLOCK(imgp->vp, 0);
 
 	/*
 	 * Decide whether to enable randomization of user mappings.
 	 * First, reset user preferences for the setid binaries.
 	 * Then, account for the support of the randomization by the
 	 * ABI, by user preferences, and make special treatment for
 	 * PIE binaries.
 	 */
 	if (imgp->credential_setid) {
 		PROC_LOCK(imgp->proc);
 		imgp->proc->p_flag2 &= ~(P2_ASLR_ENABLE | P2_ASLR_DISABLE);
 		PROC_UNLOCK(imgp->proc);
 	}
 	if ((sv->sv_flags & SV_ASLR) == 0 ||
 	    (imgp->proc->p_flag2 & P2_ASLR_DISABLE) != 0 ||
 	    (fctl0 & NT_FREEBSD_FCTL_ASLR_DISABLE) != 0) {
 		KASSERT(et_dyn_addr != ET_DYN_ADDR_RAND,
 		    ("et_dyn_addr == RAND and !ASLR"));
 	} else if ((imgp->proc->p_flag2 & P2_ASLR_ENABLE) != 0 ||
 	    (__elfN(aslr_enabled) && hdr->e_type == ET_EXEC) ||
 	    et_dyn_addr == ET_DYN_ADDR_RAND) {
 		imgp->map_flags |= MAP_ASLR;
 		/*
 		 * If user does not care about sbrk, utilize the bss
 		 * grow region for mappings as well.  We can select
 		 * the base for the image anywere and still not suffer
 		 * from the fragmentation.
 		 */
 		if (!__elfN(aslr_honor_sbrk) ||
 		    (imgp->proc->p_flag2 & P2_ASLR_IGNSTART) != 0)
 			imgp->map_flags |= MAP_ASLR_IGNSTART;
 	}
 
 	error = exec_new_vmspace(imgp, sv);
 	vmspace = imgp->proc->p_vmspace;
 	map = &vmspace->vm_map;
 
 	imgp->proc->p_sysent = sv;
 
 	maxv = vm_map_max(map) - lim_max(td, RLIMIT_STACK);
 	if (et_dyn_addr == ET_DYN_ADDR_RAND) {
 		KASSERT((map->flags & MAP_ASLR) != 0,
 		    ("ET_DYN_ADDR_RAND but !MAP_ASLR"));
 		et_dyn_addr = __CONCAT(rnd_, __elfN(base))(map,
 		    vm_map_min(map) + mapsz + lim_max(td, RLIMIT_DATA),
 		    /* reserve half of the address space to interpreter */
 		    maxv / 2, 1UL << flsl(maxalign));
 	}
 
 	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 	if (error != 0)
 		goto ret;
 
 	for (i = 0; i < hdr->e_phnum; i++) {
 		switch (phdr[i].p_type) {
 		case PT_LOAD:	/* Loadable segment */
 			if (phdr[i].p_memsz == 0)
 				break;
 			prot = __elfN(trans_prot)(phdr[i].p_flags);
 			error = __elfN(load_section)(imgp, phdr[i].p_offset,
 			    (caddr_t)(uintptr_t)phdr[i].p_vaddr + et_dyn_addr,
 			    phdr[i].p_memsz, phdr[i].p_filesz, prot);
 			if (error != 0)
 				goto ret;
 
 			/*
 			 * If this segment contains the program headers,
 			 * remember their virtual address for the AT_PHDR
 			 * aux entry. Static binaries don't usually include
 			 * a PT_PHDR entry.
 			 */
 			if (phdr[i].p_offset == 0 &&
 			    hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize
 				<= phdr[i].p_filesz)
 				proghdr = phdr[i].p_vaddr + hdr->e_phoff +
 				    et_dyn_addr;
 			break;
 		case PT_PHDR: 	/* Program header table info */
 			proghdr = phdr[i].p_vaddr + et_dyn_addr;
 			break;
 		default:
 			break;
 		}
 	}
 
 	error = __elfN(enforce_limits)(imgp, hdr, phdr, et_dyn_addr);
 	if (error != 0)
 		goto ret;
 
 	entry = (u_long)hdr->e_entry + et_dyn_addr;
 
 	/*
 	 * We load the dynamic linker where a userland call
 	 * to mmap(0, ...) would put it.  The rationale behind this
 	 * calculation is that it leaves room for the heap to grow to
 	 * its maximum allowed size.
 	 */
 	addr = round_page((vm_offset_t)vmspace->vm_daddr + lim_max(td,
 	    RLIMIT_DATA));
 	if ((map->flags & MAP_ASLR) != 0) {
 		maxv1 = maxv / 2 + addr / 2;
 		MPASS(maxv1 >= addr);	/* No overflow */
 		map->anon_loc = __CONCAT(rnd_, __elfN(base))(map, addr, maxv1,
 		    MAXPAGESIZES > 1 ? pagesizes[1] : pagesizes[0]);
 	} else {
 		map->anon_loc = addr;
 	}
 
 	imgp->entry_addr = entry;
 
 	if (interp != NULL) {
 		have_interp = FALSE;
 		VOP_UNLOCK(imgp->vp, 0);
 		if ((map->flags & MAP_ASLR) != 0) {
 			/* Assume that interpeter fits into 1/4 of AS */
 			maxv1 = maxv / 2 + addr / 2;
 			MPASS(maxv1 >= addr);	/* No overflow */
 			addr = __CONCAT(rnd_, __elfN(base))(map, addr,
 			    maxv1, PAGE_SIZE);
 		}
 		if (brand_info->emul_path != NULL &&
 		    brand_info->emul_path[0] != '\0') {
 			path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 			snprintf(path, MAXPATHLEN, "%s%s",
 			    brand_info->emul_path, interp);
 			error = __elfN(load_file)(imgp->proc, path, &addr,
 			    &imgp->entry_addr);
 			free(path, M_TEMP);
 			if (error == 0)
 				have_interp = TRUE;
 		}
 		if (!have_interp && newinterp != NULL &&
 		    (brand_info->interp_path == NULL ||
 		    strcmp(interp, brand_info->interp_path) == 0)) {
 			error = __elfN(load_file)(imgp->proc, newinterp, &addr,
 			    &imgp->entry_addr);
 			if (error == 0)
 				have_interp = TRUE;
 		}
 		if (!have_interp) {
 			error = __elfN(load_file)(imgp->proc, interp, &addr,
 			    &imgp->entry_addr);
 		}
 		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 		if (error != 0) {
 			uprintf("ELF interpreter %s not found, error %d\n",
 			    interp, error);
 			goto ret;
 		}
 	} else
 		addr = et_dyn_addr;
 
 	/*
 	 * Construct auxargs table (used by the fixup routine)
 	 */
 	elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
 	elf_auxargs->execfd = -1;
 	elf_auxargs->phdr = proghdr;
 	elf_auxargs->phent = hdr->e_phentsize;
 	elf_auxargs->phnum = hdr->e_phnum;
 	elf_auxargs->pagesz = PAGE_SIZE;
 	elf_auxargs->base = addr;
 	elf_auxargs->flags = 0;
 	elf_auxargs->entry = entry;
 	elf_auxargs->hdr_eflags = hdr->e_flags;
 
 	imgp->auxargs = elf_auxargs;
 	imgp->interpreted = 0;
 	imgp->reloc_base = addr;
 	imgp->proc->p_osrel = osrel;
 	imgp->proc->p_fctl0 = fctl0;
 	imgp->proc->p_elf_machine = hdr->e_machine;
 	imgp->proc->p_elf_flags = hdr->e_flags;
 
 ret:
-	free(interp_buf, M_TEMP);
+	if (free_interp)
+		free(interp, M_TEMP);
 	return (error);
 }
 
 #define	suword __CONCAT(suword, __ELF_WORD_SIZE)
 
 int
 __elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp)
 {
 	Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
 	Elf_Auxinfo *argarray, *pos;
 	Elf_Addr *base, *auxbase;
 	int error;
 
 	base = (Elf_Addr *)*stack_base;
 	auxbase = base + imgp->args->argc + 1 + imgp->args->envc + 1;
 	argarray = pos = malloc(AT_COUNT * sizeof(*pos), M_TEMP,
 	    M_WAITOK | M_ZERO);
 
 	if (args->execfd != -1)
 		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
 	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
 	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
 	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
 	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
 	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
 	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
 	AUXARGS_ENTRY(pos, AT_BASE, args->base);
 	AUXARGS_ENTRY(pos, AT_EHDRFLAGS, args->hdr_eflags);
 	if (imgp->execpathp != 0)
 		AUXARGS_ENTRY(pos, AT_EXECPATH, imgp->execpathp);
 	AUXARGS_ENTRY(pos, AT_OSRELDATE,
 	    imgp->proc->p_ucred->cr_prison->pr_osreldate);
 	if (imgp->canary != 0) {
 		AUXARGS_ENTRY(pos, AT_CANARY, imgp->canary);
 		AUXARGS_ENTRY(pos, AT_CANARYLEN, imgp->canarylen);
 	}
 	AUXARGS_ENTRY(pos, AT_NCPUS, mp_ncpus);
 	if (imgp->pagesizes != 0) {
 		AUXARGS_ENTRY(pos, AT_PAGESIZES, imgp->pagesizes);
 		AUXARGS_ENTRY(pos, AT_PAGESIZESLEN, imgp->pagesizeslen);
 	}
 	if (imgp->sysent->sv_timekeep_base != 0) {
 		AUXARGS_ENTRY(pos, AT_TIMEKEEP,
 		    imgp->sysent->sv_timekeep_base);
 	}
 	AUXARGS_ENTRY(pos, AT_STACKPROT, imgp->sysent->sv_shared_page_obj
 	    != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
 	    imgp->sysent->sv_stackprot);
 	if (imgp->sysent->sv_hwcap != NULL)
 		AUXARGS_ENTRY(pos, AT_HWCAP, *imgp->sysent->sv_hwcap);
 	if (imgp->sysent->sv_hwcap2 != NULL)
 		AUXARGS_ENTRY(pos, AT_HWCAP2, *imgp->sysent->sv_hwcap2);
 	AUXARGS_ENTRY(pos, AT_NULL, 0);
 
 	free(imgp->auxargs, M_TEMP);
 	imgp->auxargs = NULL;
 	KASSERT(pos - argarray <= AT_COUNT, ("Too many auxargs"));
 
 	error = copyout(argarray, auxbase, sizeof(*argarray) * AT_COUNT);
 	free(argarray, M_TEMP);
 	if (error != 0)
 		return (error);
 
 	base--;
 	if (suword(base, imgp->args->argc) == -1)
 		return (EFAULT);
 	*stack_base = (register_t *)base;
 	return (0);
 }
 
 /*
  * Code for generating ELF core dumps.
  */
 
 typedef void (*segment_callback)(vm_map_entry_t, void *);
 
 /* Closure for cb_put_phdr(). */
 struct phdr_closure {
 	Elf_Phdr *phdr;		/* Program header to fill in */
 	Elf_Off offset;		/* Offset of segment in core file */
 };
 
 /* Closure for cb_size_segment(). */
 struct sseg_closure {
 	int count;		/* Count of writable segments. */
 	size_t size;		/* Total size of all writable segments. */
 };
 
 typedef void (*outfunc_t)(void *, struct sbuf *, size_t *);
 
 struct note_info {
 	int		type;		/* Note type. */
 	outfunc_t 	outfunc; 	/* Output function. */
 	void		*outarg;	/* Argument for the output function. */
 	size_t		outsize;	/* Output size. */
 	TAILQ_ENTRY(note_info) link;	/* Link to the next note info. */
 };
 
 TAILQ_HEAD(note_info_list, note_info);
 
 /* Coredump output parameters. */
 struct coredump_params {
 	off_t		offset;
 	struct ucred	*active_cred;
 	struct ucred	*file_cred;
 	struct thread	*td;
 	struct vnode	*vp;
 	struct compressor *comp;
 };
 
 extern int compress_user_cores;
 extern int compress_user_cores_level;
 
 static void cb_put_phdr(vm_map_entry_t, void *);
 static void cb_size_segment(vm_map_entry_t, void *);
 static int core_write(struct coredump_params *, const void *, size_t, off_t,
     enum uio_seg);
 static void each_dumpable_segment(struct thread *, segment_callback, void *);
 static int __elfN(corehdr)(struct coredump_params *, int, void *, size_t,
     struct note_info_list *, size_t);
 static void __elfN(prepare_notes)(struct thread *, struct note_info_list *,
     size_t *);
 static void __elfN(puthdr)(struct thread *, void *, size_t, int, size_t);
 static void __elfN(putnote)(struct note_info *, struct sbuf *);
 static size_t register_note(struct note_info_list *, int, outfunc_t, void *);
 static int sbuf_drain_core_output(void *, const char *, int);
 static int sbuf_drain_count(void *arg, const char *data, int len);
 
 static void __elfN(note_fpregset)(void *, struct sbuf *, size_t *);
 static void __elfN(note_prpsinfo)(void *, struct sbuf *, size_t *);
 static void __elfN(note_prstatus)(void *, struct sbuf *, size_t *);
 static void __elfN(note_threadmd)(void *, struct sbuf *, size_t *);
 static void __elfN(note_thrmisc)(void *, struct sbuf *, size_t *);
 static void __elfN(note_ptlwpinfo)(void *, struct sbuf *, size_t *);
 static void __elfN(note_procstat_auxv)(void *, struct sbuf *, size_t *);
 static void __elfN(note_procstat_proc)(void *, struct sbuf *, size_t *);
 static void __elfN(note_procstat_psstrings)(void *, struct sbuf *, size_t *);
 static void note_procstat_files(void *, struct sbuf *, size_t *);
 static void note_procstat_groups(void *, struct sbuf *, size_t *);
 static void note_procstat_osrel(void *, struct sbuf *, size_t *);
 static void note_procstat_rlimit(void *, struct sbuf *, size_t *);
 static void note_procstat_umask(void *, struct sbuf *, size_t *);
 static void note_procstat_vmmap(void *, struct sbuf *, size_t *);
 
 /*
  * Write out a core segment to the compression stream.
  */
 static int
 compress_chunk(struct coredump_params *p, char *base, char *buf, u_int len)
 {
 	u_int chunk_len;
 	int error;
 
 	while (len > 0) {
 		chunk_len = MIN(len, CORE_BUF_SIZE);
 
 		/*
 		 * We can get EFAULT error here.
 		 * In that case zero out the current chunk of the segment.
 		 */
 		error = copyin(base, buf, chunk_len);
 		if (error != 0)
 			bzero(buf, chunk_len);
 		error = compressor_write(p->comp, buf, chunk_len);
 		if (error != 0)
 			break;
 		base += chunk_len;
 		len -= chunk_len;
 	}
 	return (error);
 }
 
 static int
 core_compressed_write(void *base, size_t len, off_t offset, void *arg)
 {
 
 	return (core_write((struct coredump_params *)arg, base, len, offset,
 	    UIO_SYSSPACE));
 }
 
 static int
 core_write(struct coredump_params *p, const void *base, size_t len,
     off_t offset, enum uio_seg seg)
 {
 
 	return (vn_rdwr_inchunks(UIO_WRITE, p->vp, __DECONST(void *, base),
 	    len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED,
 	    p->active_cred, p->file_cred, NULL, p->td));
 }
 
 static int
 core_output(void *base, size_t len, off_t offset, struct coredump_params *p,
     void *tmpbuf)
 {
 	int error;
 
 	if (p->comp != NULL)
 		return (compress_chunk(p, base, tmpbuf, len));
 
 	/*
 	 * EFAULT is a non-fatal error that we can get, for example,
 	 * if the segment is backed by a file but extends beyond its
 	 * end.
 	 */
 	error = core_write(p, base, len, offset, UIO_USERSPACE);
 	if (error == EFAULT) {
 		log(LOG_WARNING, "Failed to fully fault in a core file segment "
 		    "at VA %p with size 0x%zx to be written at offset 0x%jx "
 		    "for process %s\n", base, len, offset, curproc->p_comm);
 
 		/*
 		 * Write a "real" zero byte at the end of the target region
 		 * in the case this is the last segment.
 		 * The intermediate space will be implicitly zero-filled.
 		 */
 		error = core_write(p, zero_region, 1, offset + len - 1,
 		    UIO_SYSSPACE);
 	}
 	return (error);
 }
 
 /*
  * Drain into a core file.
  */
 static int
 sbuf_drain_core_output(void *arg, const char *data, int len)
 {
 	struct coredump_params *p;
 	int error, locked;
 
 	p = (struct coredump_params *)arg;
 
 	/*
 	 * Some kern_proc out routines that print to this sbuf may
 	 * call us with the process lock held. Draining with the
 	 * non-sleepable lock held is unsafe. The lock is needed for
 	 * those routines when dumping a live process. In our case we
 	 * can safely release the lock before draining and acquire
 	 * again after.
 	 */
 	locked = PROC_LOCKED(p->td->td_proc);
 	if (locked)
 		PROC_UNLOCK(p->td->td_proc);
 	if (p->comp != NULL)
 		error = compressor_write(p->comp, __DECONST(char *, data), len);
 	else
 		error = core_write(p, __DECONST(void *, data), len, p->offset,
 		    UIO_SYSSPACE);
 	if (locked)
 		PROC_LOCK(p->td->td_proc);
 	if (error != 0)
 		return (-error);
 	p->offset += len;
 	return (len);
 }
 
 /*
  * Drain into a counter.
  */
 static int
 sbuf_drain_count(void *arg, const char *data __unused, int len)
 {
 	size_t *sizep;
 
 	sizep = (size_t *)arg;
 	*sizep += len;
 	return (len);
 }
 
 int
 __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
 {
 	struct ucred *cred = td->td_ucred;
 	int error = 0;
 	struct sseg_closure seginfo;
 	struct note_info_list notelst;
 	struct coredump_params params;
 	struct note_info *ninfo;
 	void *hdr, *tmpbuf;
 	size_t hdrsize, notesz, coresize;
 
 	hdr = NULL;
 	tmpbuf = NULL;
 	TAILQ_INIT(&notelst);
 
 	/* Size the program segments. */
 	seginfo.count = 0;
 	seginfo.size = 0;
 	each_dumpable_segment(td, cb_size_segment, &seginfo);
 
 	/*
 	 * Collect info about the core file header area.
 	 */
 	hdrsize = sizeof(Elf_Ehdr) + sizeof(Elf_Phdr) * (1 + seginfo.count);
 	if (seginfo.count + 1 >= PN_XNUM)
 		hdrsize += sizeof(Elf_Shdr);
 	__elfN(prepare_notes)(td, &notelst, &notesz);
 	coresize = round_page(hdrsize + notesz) + seginfo.size;
 
 	/* Set up core dump parameters. */
 	params.offset = 0;
 	params.active_cred = cred;
 	params.file_cred = NOCRED;
 	params.td = td;
 	params.vp = vp;
 	params.comp = NULL;
 
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(td->td_proc);
 		error = racct_add(td->td_proc, RACCT_CORE, coresize);
 		PROC_UNLOCK(td->td_proc);
 		if (error != 0) {
 			error = EFAULT;
 			goto done;
 		}
 	}
 #endif
 	if (coresize >= limit) {
 		error = EFAULT;
 		goto done;
 	}
 
 	/* Create a compression stream if necessary. */
 	if (compress_user_cores != 0) {
 		params.comp = compressor_init(core_compressed_write,
 		    compress_user_cores, CORE_BUF_SIZE,
 		    compress_user_cores_level, &params);
 		if (params.comp == NULL) {
 			error = EFAULT;
 			goto done;
 		}
 		tmpbuf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO);
         }
 
 	/*
 	 * Allocate memory for building the header, fill it up,
 	 * and write it out following the notes.
 	 */
 	hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
 	error = __elfN(corehdr)(&params, seginfo.count, hdr, hdrsize, &notelst,
 	    notesz);
 
 	/* Write the contents of all of the writable segments. */
 	if (error == 0) {
 		Elf_Phdr *php;
 		off_t offset;
 		int i;
 
 		php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
 		offset = round_page(hdrsize + notesz);
 		for (i = 0; i < seginfo.count; i++) {
 			error = core_output((caddr_t)(uintptr_t)php->p_vaddr,
 			    php->p_filesz, offset, &params, tmpbuf);
 			if (error != 0)
 				break;
 			offset += php->p_filesz;
 			php++;
 		}
 		if (error == 0 && params.comp != NULL)
 			error = compressor_flush(params.comp);
 	}
 	if (error) {
 		log(LOG_WARNING,
 		    "Failed to write core file for process %s (error %d)\n",
 		    curproc->p_comm, error);
 	}
 
 done:
 	free(tmpbuf, M_TEMP);
 	if (params.comp != NULL)
 		compressor_fini(params.comp);
 	while ((ninfo = TAILQ_FIRST(&notelst)) != NULL) {
 		TAILQ_REMOVE(&notelst, ninfo, link);
 		free(ninfo, M_TEMP);
 	}
 	if (hdr != NULL)
 		free(hdr, M_TEMP);
 
 	return (error);
 }
 
 /*
  * A callback for each_dumpable_segment() to write out the segment's
  * program header entry.
  */
 static void
 cb_put_phdr(vm_map_entry_t entry, void *closure)
 {
 	struct phdr_closure *phc = (struct phdr_closure *)closure;
 	Elf_Phdr *phdr = phc->phdr;
 
 	phc->offset = round_page(phc->offset);
 
 	phdr->p_type = PT_LOAD;
 	phdr->p_offset = phc->offset;
 	phdr->p_vaddr = entry->start;
 	phdr->p_paddr = 0;
 	phdr->p_filesz = phdr->p_memsz = entry->end - entry->start;
 	phdr->p_align = PAGE_SIZE;
 	phdr->p_flags = __elfN(untrans_prot)(entry->protection);
 
 	phc->offset += phdr->p_filesz;
 	phc->phdr++;
 }
 
 /*
  * A callback for each_dumpable_segment() to gather information about
  * the number of segments and their total size.
  */
 static void
 cb_size_segment(vm_map_entry_t entry, void *closure)
 {
 	struct sseg_closure *ssc = (struct sseg_closure *)closure;
 
 	ssc->count++;
 	ssc->size += entry->end - entry->start;
 }
 
 /*
  * For each writable segment in the process's memory map, call the given
  * function with a pointer to the map entry and some arbitrary
  * caller-supplied data.
  */
 static void
 each_dumpable_segment(struct thread *td, segment_callback func, void *closure)
 {
 	struct proc *p = td->td_proc;
 	vm_map_t map = &p->p_vmspace->vm_map;
 	vm_map_entry_t entry;
 	vm_object_t backing_object, object;
 	boolean_t ignore_entry;
 
 	vm_map_lock_read(map);
 	for (entry = map->header.next; entry != &map->header;
 	    entry = entry->next) {
 		/*
 		 * Don't dump inaccessible mappings, deal with legacy
 		 * coredump mode.
 		 *
 		 * Note that read-only segments related to the elf binary
 		 * are marked MAP_ENTRY_NOCOREDUMP now so we no longer
 		 * need to arbitrarily ignore such segments.
 		 */
 		if (elf_legacy_coredump) {
 			if ((entry->protection & VM_PROT_RW) != VM_PROT_RW)
 				continue;
 		} else {
 			if ((entry->protection & VM_PROT_ALL) == 0)
 				continue;
 		}
 
 		/*
 		 * Dont include memory segment in the coredump if
 		 * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in
 		 * madvise(2).  Do not dump submaps (i.e. parts of the
 		 * kernel map).
 		 */
 		if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP))
 			continue;
 
 		if ((object = entry->object.vm_object) == NULL)
 			continue;
 
 		/* Ignore memory-mapped devices and such things. */
 		VM_OBJECT_RLOCK(object);
 		while ((backing_object = object->backing_object) != NULL) {
 			VM_OBJECT_RLOCK(backing_object);
 			VM_OBJECT_RUNLOCK(object);
 			object = backing_object;
 		}
 		ignore_entry = object->type != OBJT_DEFAULT &&
 		    object->type != OBJT_SWAP && object->type != OBJT_VNODE &&
 		    object->type != OBJT_PHYS;
 		VM_OBJECT_RUNLOCK(object);
 		if (ignore_entry)
 			continue;
 
 		(*func)(entry, closure);
 	}
 	vm_map_unlock_read(map);
 }
 
 /*
  * Write the core file header to the file, including padding up to
  * the page boundary.
  */
 static int
 __elfN(corehdr)(struct coredump_params *p, int numsegs, void *hdr,
     size_t hdrsize, struct note_info_list *notelst, size_t notesz)
 {
 	struct note_info *ninfo;
 	struct sbuf *sb;
 	int error;
 
 	/* Fill in the header. */
 	bzero(hdr, hdrsize);
 	__elfN(puthdr)(p->td, hdr, hdrsize, numsegs, notesz);
 
 	sb = sbuf_new(NULL, NULL, CORE_BUF_SIZE, SBUF_FIXEDLEN);
 	sbuf_set_drain(sb, sbuf_drain_core_output, p);
 	sbuf_start_section(sb, NULL);
 	sbuf_bcat(sb, hdr, hdrsize);
 	TAILQ_FOREACH(ninfo, notelst, link)
 	    __elfN(putnote)(ninfo, sb);
 	/* Align up to a page boundary for the program segments. */
 	sbuf_end_section(sb, -1, PAGE_SIZE, 0);
 	error = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (error);
 }
 
 static void
 __elfN(prepare_notes)(struct thread *td, struct note_info_list *list,
     size_t *sizep)
 {
 	struct proc *p;
 	struct thread *thr;
 	size_t size;
 
 	p = td->td_proc;
 	size = 0;
 
 	size += register_note(list, NT_PRPSINFO, __elfN(note_prpsinfo), p);
 
 	/*
 	 * To have the debugger select the right thread (LWP) as the initial
 	 * thread, we dump the state of the thread passed to us in td first.
 	 * This is the thread that causes the core dump and thus likely to
 	 * be the right thread one wants to have selected in the debugger.
 	 */
 	thr = td;
 	while (thr != NULL) {
 		size += register_note(list, NT_PRSTATUS,
 		    __elfN(note_prstatus), thr);
 		size += register_note(list, NT_FPREGSET,
 		    __elfN(note_fpregset), thr);
 		size += register_note(list, NT_THRMISC,
 		    __elfN(note_thrmisc), thr);
 		size += register_note(list, NT_PTLWPINFO,
 		    __elfN(note_ptlwpinfo), thr);
 		size += register_note(list, -1,
 		    __elfN(note_threadmd), thr);
 
 		thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) :
 		    TAILQ_NEXT(thr, td_plist);
 		if (thr == td)
 			thr = TAILQ_NEXT(thr, td_plist);
 	}
 
 	size += register_note(list, NT_PROCSTAT_PROC,
 	    __elfN(note_procstat_proc), p);
 	size += register_note(list, NT_PROCSTAT_FILES,
 	    note_procstat_files, p);
 	size += register_note(list, NT_PROCSTAT_VMMAP,
 	    note_procstat_vmmap, p);
 	size += register_note(list, NT_PROCSTAT_GROUPS,
 	    note_procstat_groups, p);
 	size += register_note(list, NT_PROCSTAT_UMASK,
 	    note_procstat_umask, p);
 	size += register_note(list, NT_PROCSTAT_RLIMIT,
 	    note_procstat_rlimit, p);
 	size += register_note(list, NT_PROCSTAT_OSREL,
 	    note_procstat_osrel, p);
 	size += register_note(list, NT_PROCSTAT_PSSTRINGS,
 	    __elfN(note_procstat_psstrings), p);
 	size += register_note(list, NT_PROCSTAT_AUXV,
 	    __elfN(note_procstat_auxv), p);
 
 	*sizep = size;
 }
 
 static void
 __elfN(puthdr)(struct thread *td, void *hdr, size_t hdrsize, int numsegs,
     size_t notesz)
 {
 	Elf_Ehdr *ehdr;
 	Elf_Phdr *phdr;
 	Elf_Shdr *shdr;
 	struct phdr_closure phc;
 
 	ehdr = (Elf_Ehdr *)hdr;
 
 	ehdr->e_ident[EI_MAG0] = ELFMAG0;
 	ehdr->e_ident[EI_MAG1] = ELFMAG1;
 	ehdr->e_ident[EI_MAG2] = ELFMAG2;
 	ehdr->e_ident[EI_MAG3] = ELFMAG3;
 	ehdr->e_ident[EI_CLASS] = ELF_CLASS;
 	ehdr->e_ident[EI_DATA] = ELF_DATA;
 	ehdr->e_ident[EI_VERSION] = EV_CURRENT;
 	ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD;
 	ehdr->e_ident[EI_ABIVERSION] = 0;
 	ehdr->e_ident[EI_PAD] = 0;
 	ehdr->e_type = ET_CORE;
 	ehdr->e_machine = td->td_proc->p_elf_machine;
 	ehdr->e_version = EV_CURRENT;
 	ehdr->e_entry = 0;
 	ehdr->e_phoff = sizeof(Elf_Ehdr);
 	ehdr->e_flags = td->td_proc->p_elf_flags;
 	ehdr->e_ehsize = sizeof(Elf_Ehdr);
 	ehdr->e_phentsize = sizeof(Elf_Phdr);
 	ehdr->e_shentsize = sizeof(Elf_Shdr);
 	ehdr->e_shstrndx = SHN_UNDEF;
 	if (numsegs + 1 < PN_XNUM) {
 		ehdr->e_phnum = numsegs + 1;
 		ehdr->e_shnum = 0;
 	} else {
 		ehdr->e_phnum = PN_XNUM;
 		ehdr->e_shnum = 1;
 
 		ehdr->e_shoff = ehdr->e_phoff +
 		    (numsegs + 1) * ehdr->e_phentsize;
 		KASSERT(ehdr->e_shoff == hdrsize - sizeof(Elf_Shdr),
 		    ("e_shoff: %zu, hdrsize - shdr: %zu",
 		     (size_t)ehdr->e_shoff, hdrsize - sizeof(Elf_Shdr)));
 
 		shdr = (Elf_Shdr *)((char *)hdr + ehdr->e_shoff);
 		memset(shdr, 0, sizeof(*shdr));
 		/*
 		 * A special first section is used to hold large segment and
 		 * section counts.  This was proposed by Sun Microsystems in
 		 * Solaris and has been adopted by Linux; the standard ELF
 		 * tools are already familiar with the technique.
 		 *
 		 * See table 7-7 of the Solaris "Linker and Libraries Guide"
 		 * (or 12-7 depending on the version of the document) for more
 		 * details.
 		 */
 		shdr->sh_type = SHT_NULL;
 		shdr->sh_size = ehdr->e_shnum;
 		shdr->sh_link = ehdr->e_shstrndx;
 		shdr->sh_info = numsegs + 1;
 	}
 
 	/*
 	 * Fill in the program header entries.
 	 */
 	phdr = (Elf_Phdr *)((char *)hdr + ehdr->e_phoff);
 
 	/* The note segement. */
 	phdr->p_type = PT_NOTE;
 	phdr->p_offset = hdrsize;
 	phdr->p_vaddr = 0;
 	phdr->p_paddr = 0;
 	phdr->p_filesz = notesz;
 	phdr->p_memsz = 0;
 	phdr->p_flags = PF_R;
 	phdr->p_align = ELF_NOTE_ROUNDSIZE;
 	phdr++;
 
 	/* All the writable segments from the program. */
 	phc.phdr = phdr;
 	phc.offset = round_page(hdrsize + notesz);
 	each_dumpable_segment(td, cb_put_phdr, &phc);
 }
 
 static size_t
 register_note(struct note_info_list *list, int type, outfunc_t out, void *arg)
 {
 	struct note_info *ninfo;
 	size_t size, notesize;
 
 	size = 0;
 	out(arg, NULL, &size);
 	ninfo = malloc(sizeof(*ninfo), M_TEMP, M_ZERO | M_WAITOK);
 	ninfo->type = type;
 	ninfo->outfunc = out;
 	ninfo->outarg = arg;
 	ninfo->outsize = size;
 	TAILQ_INSERT_TAIL(list, ninfo, link);
 
 	if (type == -1)
 		return (size);
 
 	notesize = sizeof(Elf_Note) +		/* note header */
 	    roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) +
 						/* note name */
 	    roundup2(size, ELF_NOTE_ROUNDSIZE);	/* note description */
 
 	return (notesize);
 }
 
 static size_t
 append_note_data(const void *src, void *dst, size_t len)
 {
 	size_t padded_len;
 
 	padded_len = roundup2(len, ELF_NOTE_ROUNDSIZE);
 	if (dst != NULL) {
 		bcopy(src, dst, len);
 		bzero((char *)dst + len, padded_len - len);
 	}
 	return (padded_len);
 }
 
 size_t
 __elfN(populate_note)(int type, void *src, void *dst, size_t size, void **descp)
 {
 	Elf_Note *note;
 	char *buf;
 	size_t notesize;
 
 	buf = dst;
 	if (buf != NULL) {
 		note = (Elf_Note *)buf;
 		note->n_namesz = sizeof(FREEBSD_ABI_VENDOR);
 		note->n_descsz = size;
 		note->n_type = type;
 		buf += sizeof(*note);
 		buf += append_note_data(FREEBSD_ABI_VENDOR, buf,
 		    sizeof(FREEBSD_ABI_VENDOR));
 		append_note_data(src, buf, size);
 		if (descp != NULL)
 			*descp = buf;
 	}
 
 	notesize = sizeof(Elf_Note) +		/* note header */
 	    roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) +
 						/* note name */
 	    roundup2(size, ELF_NOTE_ROUNDSIZE);	/* note description */
 
 	return (notesize);
 }
 
 static void
 __elfN(putnote)(struct note_info *ninfo, struct sbuf *sb)
 {
 	Elf_Note note;
 	ssize_t old_len, sect_len;
 	size_t new_len, descsz, i;
 
 	if (ninfo->type == -1) {
 		ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize);
 		return;
 	}
 
 	note.n_namesz = sizeof(FREEBSD_ABI_VENDOR);
 	note.n_descsz = ninfo->outsize;
 	note.n_type = ninfo->type;
 
 	sbuf_bcat(sb, &note, sizeof(note));
 	sbuf_start_section(sb, &old_len);
 	sbuf_bcat(sb, FREEBSD_ABI_VENDOR, sizeof(FREEBSD_ABI_VENDOR));
 	sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0);
 	if (note.n_descsz == 0)
 		return;
 	sbuf_start_section(sb, &old_len);
 	ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize);
 	sect_len = sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0);
 	if (sect_len < 0)
 		return;
 
 	new_len = (size_t)sect_len;
 	descsz = roundup(note.n_descsz, ELF_NOTE_ROUNDSIZE);
 	if (new_len < descsz) {
 		/*
 		 * It is expected that individual note emitters will correctly
 		 * predict their expected output size and fill up to that size
 		 * themselves, padding in a format-specific way if needed.
 		 * However, in case they don't, just do it here with zeros.
 		 */
 		for (i = 0; i < descsz - new_len; i++)
 			sbuf_putc(sb, 0);
 	} else if (new_len > descsz) {
 		/*
 		 * We can't always truncate sb -- we may have drained some
 		 * of it already.
 		 */
 		KASSERT(new_len == descsz, ("%s: Note type %u changed as we "
 		    "read it (%zu > %zu).  Since it is longer than "
 		    "expected, this coredump's notes are corrupt.  THIS "
 		    "IS A BUG in the note_procstat routine for type %u.\n",
 		    __func__, (unsigned)note.n_type, new_len, descsz,
 		    (unsigned)note.n_type));
 	}
 }
 
 /*
  * Miscellaneous note out functions.
  */
 
 #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 
 typedef struct prstatus32 elf_prstatus_t;
 typedef struct prpsinfo32 elf_prpsinfo_t;
 typedef struct fpreg32 elf_prfpregset_t;
 typedef struct fpreg32 elf_fpregset_t;
 typedef struct reg32 elf_gregset_t;
 typedef struct thrmisc32 elf_thrmisc_t;
 #define ELF_KERN_PROC_MASK	KERN_PROC_MASK32
 typedef struct kinfo_proc32 elf_kinfo_proc_t;
 typedef uint32_t elf_ps_strings_t;
 #else
 typedef prstatus_t elf_prstatus_t;
 typedef prpsinfo_t elf_prpsinfo_t;
 typedef prfpregset_t elf_prfpregset_t;
 typedef prfpregset_t elf_fpregset_t;
 typedef gregset_t elf_gregset_t;
 typedef thrmisc_t elf_thrmisc_t;
 #define ELF_KERN_PROC_MASK	0
 typedef struct kinfo_proc elf_kinfo_proc_t;
 typedef vm_offset_t elf_ps_strings_t;
 #endif
 
 static void
 __elfN(note_prpsinfo)(void *arg, struct sbuf *sb, size_t *sizep)
 {
 	struct sbuf sbarg;
 	size_t len;
 	char *cp, *end;
 	struct proc *p;
 	elf_prpsinfo_t *psinfo;
 	int error;
 
 	p = (struct proc *)arg;
 	if (sb != NULL) {
 		KASSERT(*sizep == sizeof(*psinfo), ("invalid size"));
 		psinfo = malloc(sizeof(*psinfo), M_TEMP, M_ZERO | M_WAITOK);
 		psinfo->pr_version = PRPSINFO_VERSION;
 		psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t);
 		strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname));
 		PROC_LOCK(p);
 		if (p->p_args != NULL) {
 			len = sizeof(psinfo->pr_psargs) - 1;
 			if (len > p->p_args->ar_length)
 				len = p->p_args->ar_length;
 			memcpy(psinfo->pr_psargs, p->p_args->ar_args, len);
 			PROC_UNLOCK(p);
 			error = 0;
 		} else {
 			_PHOLD(p);
 			PROC_UNLOCK(p);
 			sbuf_new(&sbarg, psinfo->pr_psargs,
 			    sizeof(psinfo->pr_psargs), SBUF_FIXEDLEN);
 			error = proc_getargv(curthread, p, &sbarg);
 			PRELE(p);
 			if (sbuf_finish(&sbarg) == 0)
 				len = sbuf_len(&sbarg) - 1;
 			else
 				len = sizeof(psinfo->pr_psargs) - 1;
 			sbuf_delete(&sbarg);
 		}
 		if (error || len == 0)
 			strlcpy(psinfo->pr_psargs, p->p_comm,
 			    sizeof(psinfo->pr_psargs));
 		else {
 			KASSERT(len < sizeof(psinfo->pr_psargs),
 			    ("len is too long: %zu vs %zu", len,
 			    sizeof(psinfo->pr_psargs)));
 			cp = psinfo->pr_psargs;
 			end = cp + len - 1;
 			for (;;) {
 				cp = memchr(cp, '\0', end - cp);
 				if (cp == NULL)
 					break;
 				*cp = ' ';
 			}
 		}
 		psinfo->pr_pid = p->p_pid;
 		sbuf_bcat(sb, psinfo, sizeof(*psinfo));
 		free(psinfo, M_TEMP);
 	}
 	*sizep = sizeof(*psinfo);
 }
 
 static void
 __elfN(note_prstatus)(void *arg, struct sbuf *sb, size_t *sizep)
 {
 	struct thread *td;
 	elf_prstatus_t *status;
 
 	td = (struct thread *)arg;
 	if (sb != NULL) {
 		KASSERT(*sizep == sizeof(*status), ("invalid size"));
 		status = malloc(sizeof(*status), M_TEMP, M_ZERO | M_WAITOK);
 		status->pr_version = PRSTATUS_VERSION;
 		status->pr_statussz = sizeof(elf_prstatus_t);
 		status->pr_gregsetsz = sizeof(elf_gregset_t);
 		status->pr_fpregsetsz = sizeof(elf_fpregset_t);
 		status->pr_osreldate = osreldate;
 		status->pr_cursig = td->td_proc->p_sig;
 		status->pr_pid = td->td_tid;
 #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
 		fill_regs32(td, &status->pr_reg);
 #else
 		fill_regs(td, &status->pr_reg);
 #endif
 		sbuf_bcat(sb, status, sizeof(*status));
 		free(status, M_TEMP);
 	}
 	*sizep = sizeof(*status);
 }
 
 static void
 __elfN(note_fpregset)(void *arg, struct sbuf *sb, size_t *sizep)
 {
 	struct thread *td;
 	elf_prfpregset_t *fpregset;
 
 	td = (struct thread *)arg;
 	if (sb != NULL) {
 		KASSERT(*sizep == sizeof(*fpregset), ("invalid size"));
 		fpregset = malloc(sizeof(*fpregset), M_TEMP, M_ZERO | M_WAITOK);
 #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
 		fill_fpregs32(td, fpregset);
 #else
 		fill_fpregs(td, fpregset);
 #endif
 		sbuf_bcat(sb, fpregset, sizeof(*fpregset));
 		free(fpregset, M_TEMP);
 	}
 	*sizep = sizeof(*fpregset);
 }
 
 static void
 __elfN(note_thrmisc)(void *arg, struct sbuf *sb, size_t *sizep)
 {
 	struct thread *td;
 	elf_thrmisc_t thrmisc;
 
 	td = (struct thread *)arg;
 	if (sb != NULL) {
 		KASSERT(*sizep == sizeof(thrmisc), ("invalid size"));
 		bzero(&thrmisc._pad, sizeof(thrmisc._pad));
 		strcpy(thrmisc.pr_tname, td->td_name);
 		sbuf_bcat(sb, &thrmisc, sizeof(thrmisc));
 	}
 	*sizep = sizeof(thrmisc);
 }
 
 static void
 __elfN(note_ptlwpinfo)(void *arg, struct sbuf *sb, size_t *sizep)
 {
 	struct thread *td;
 	size_t size;
 	int structsize;
 #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
 	struct ptrace_lwpinfo32 pl;
 #else
 	struct ptrace_lwpinfo pl;
 #endif
 
 	td = (struct thread *)arg;
 	size = sizeof(structsize) + sizeof(pl);
 	if (sb != NULL) {
 		KASSERT(*sizep == size, ("invalid size"));
 		structsize = sizeof(pl);
 		sbuf_bcat(sb, &structsize, sizeof(structsize));
 		bzero(&pl, sizeof(pl));
 		pl.pl_lwpid = td->td_tid;
 		pl.pl_event = PL_EVENT_NONE;
 		pl.pl_sigmask = td->td_sigmask;
 		pl.pl_siglist = td->td_siglist;
 		if (td->td_si.si_signo != 0) {
 			pl.pl_event = PL_EVENT_SIGNAL;
 			pl.pl_flags |= PL_FLAG_SI;
 #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
 			siginfo_to_siginfo32(&td->td_si, &pl.pl_siginfo);
 #else
 			pl.pl_siginfo = td->td_si;
 #endif
 		}
 		strcpy(pl.pl_tdname, td->td_name);
 		/* XXX TODO: supply more information in struct ptrace_lwpinfo*/
 		sbuf_bcat(sb, &pl, sizeof(pl));
 	}
 	*sizep = size;
 }
 
 /*
  * Allow for MD specific notes, as well as any MD
  * specific preparations for writing MI notes.
  */
 static void
 __elfN(note_threadmd)(void *arg, struct sbuf *sb, size_t *sizep)
 {
 	struct thread *td;
 	void *buf;
 	size_t size;
 
 	td = (struct thread *)arg;
 	size = *sizep;
 	if (size != 0 && sb != NULL)
 		buf = malloc(size, M_TEMP, M_ZERO | M_WAITOK);
 	else
 		buf = NULL;
 	size = 0;
 	__elfN(dump_thread)(td, buf, &size);
 	KASSERT(sb == NULL || *sizep == size, ("invalid size"));
 	if (size != 0 && sb != NULL)
 		sbuf_bcat(sb, buf, size);
 	free(buf, M_TEMP);
 	*sizep = size;
 }
 
 #ifdef KINFO_PROC_SIZE
 CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
 #endif
 
 static void
 __elfN(note_procstat_proc)(void *arg, struct sbuf *sb, size_t *sizep)
 {
 	struct proc *p;
 	size_t size;
 	int structsize;
 
 	p = (struct proc *)arg;
 	size = sizeof(structsize) + p->p_numthreads *
 	    sizeof(elf_kinfo_proc_t);
 
 	if (sb != NULL) {
 		KASSERT(*sizep == size, ("invalid size"));
 		structsize = sizeof(elf_kinfo_proc_t);
 		sbuf_bcat(sb, &structsize, sizeof(structsize));
 		PROC_LOCK(p);
 		kern_proc_out(p, sb, ELF_KERN_PROC_MASK);
 	}
 	*sizep = size;
 }
 
 #ifdef KINFO_FILE_SIZE
 CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
 #endif
 
 static void
 note_procstat_files(void *arg, struct sbuf *sb, size_t *sizep)
 {
 	struct proc *p;
 	size_t size, sect_sz, i;
 	ssize_t start_len, sect_len;
 	int structsize, filedesc_flags;
 
 	if (coredump_pack_fileinfo)
 		filedesc_flags = KERN_FILEDESC_PACK_KINFO;
 	else
 		filedesc_flags = 0;
 
 	p = (struct proc *)arg;
 	structsize = sizeof(struct kinfo_file);
 	if (sb == NULL) {
 		size = 0;
 		sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN);
 		sbuf_set_drain(sb, sbuf_drain_count, &size);
 		sbuf_bcat(sb, &structsize, sizeof(structsize));
 		PROC_LOCK(p);
 		kern_proc_filedesc_out(p, sb, -1, filedesc_flags);
 		sbuf_finish(sb);
 		sbuf_delete(sb);
 		*sizep = size;
 	} else {
 		sbuf_start_section(sb, &start_len);
 
 		sbuf_bcat(sb, &structsize, sizeof(structsize));
 		PROC_LOCK(p);
 		kern_proc_filedesc_out(p, sb, *sizep - sizeof(structsize),
 		    filedesc_flags);
 
 		sect_len = sbuf_end_section(sb, start_len, 0, 0);
 		if (sect_len < 0)
 			return;
 		sect_sz = sect_len;
 
 		KASSERT(sect_sz <= *sizep,
 		    ("kern_proc_filedesc_out did not respect maxlen; "
 		     "requested %zu, got %zu", *sizep - sizeof(structsize),
 		     sect_sz - sizeof(structsize)));
 
 		for (i = 0; i < *sizep - sect_sz && sb->s_error == 0; i++)
 			sbuf_putc(sb, 0);
 	}
 }
 
 #ifdef KINFO_VMENTRY_SIZE
 CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE);
 #endif
 
 static void
 note_procstat_vmmap(void *arg, struct sbuf *sb, size_t *sizep)
 {
 	struct proc *p;
 	size_t size;
 	int structsize, vmmap_flags;
 
 	if (coredump_pack_vmmapinfo)
 		vmmap_flags = KERN_VMMAP_PACK_KINFO;
 	else
 		vmmap_flags = 0;
 
 	p = (struct proc *)arg;
 	structsize = sizeof(struct kinfo_vmentry);
 	if (sb == NULL) {
 		size = 0;
 		sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN);
 		sbuf_set_drain(sb, sbuf_drain_count, &size);
 		sbuf_bcat(sb, &structsize, sizeof(structsize));
 		PROC_LOCK(p);
 		kern_proc_vmmap_out(p, sb, -1, vmmap_flags);
 		sbuf_finish(sb);
 		sbuf_delete(sb);
 		*sizep = size;
 	} else {
 		sbuf_bcat(sb, &structsize, sizeof(structsize));
 		PROC_LOCK(p);
 		kern_proc_vmmap_out(p, sb, *sizep - sizeof(structsize),
 		    vmmap_flags);
 	}
 }
 
 static void
 note_procstat_groups(void *arg, struct sbuf *sb, size_t *sizep)
 {
 	struct proc *p;
 	size_t size;
 	int structsize;
 
 	p = (struct proc *)arg;
 	size = sizeof(structsize) + p->p_ucred->cr_ngroups * sizeof(gid_t);
 	if (sb != NULL) {
 		KASSERT(*sizep == size, ("invalid size"));
 		structsize = sizeof(gid_t);
 		sbuf_bcat(sb, &structsize, sizeof(structsize));
 		sbuf_bcat(sb, p->p_ucred->cr_groups, p->p_ucred->cr_ngroups *
 		    sizeof(gid_t));
 	}
 	*sizep = size;
 }
 
 static void
 note_procstat_umask(void *arg, struct sbuf *sb, size_t *sizep)
 {
 	struct proc *p;
 	size_t size;
 	int structsize;
 
 	p = (struct proc *)arg;
 	size = sizeof(structsize) + sizeof(p->p_fd->fd_cmask);
 	if (sb != NULL) {
 		KASSERT(*sizep == size, ("invalid size"));
 		structsize = sizeof(p->p_fd->fd_cmask);
 		sbuf_bcat(sb, &structsize, sizeof(structsize));
 		sbuf_bcat(sb, &p->p_fd->fd_cmask, sizeof(p->p_fd->fd_cmask));
 	}
 	*sizep = size;
 }
 
 static void
 note_procstat_rlimit(void *arg, struct sbuf *sb, size_t *sizep)
 {
 	struct proc *p;
 	struct rlimit rlim[RLIM_NLIMITS];
 	size_t size;
 	int structsize, i;
 
 	p = (struct proc *)arg;
 	size = sizeof(structsize) + sizeof(rlim);
 	if (sb != NULL) {
 		KASSERT(*sizep == size, ("invalid size"));
 		structsize = sizeof(rlim);
 		sbuf_bcat(sb, &structsize, sizeof(structsize));
 		PROC_LOCK(p);
 		for (i = 0; i < RLIM_NLIMITS; i++)
 			lim_rlimit_proc(p, i, &rlim[i]);
 		PROC_UNLOCK(p);
 		sbuf_bcat(sb, rlim, sizeof(rlim));
 	}
 	*sizep = size;
 }
 
 static void
 note_procstat_osrel(void *arg, struct sbuf *sb, size_t *sizep)
 {
 	struct proc *p;
 	size_t size;
 	int structsize;
 
 	p = (struct proc *)arg;
 	size = sizeof(structsize) + sizeof(p->p_osrel);
 	if (sb != NULL) {
 		KASSERT(*sizep == size, ("invalid size"));
 		structsize = sizeof(p->p_osrel);
 		sbuf_bcat(sb, &structsize, sizeof(structsize));
 		sbuf_bcat(sb, &p->p_osrel, sizeof(p->p_osrel));
 	}
 	*sizep = size;
 }
 
 static void
 __elfN(note_procstat_psstrings)(void *arg, struct sbuf *sb, size_t *sizep)
 {
 	struct proc *p;
 	elf_ps_strings_t ps_strings;
 	size_t size;
 	int structsize;
 
 	p = (struct proc *)arg;
 	size = sizeof(structsize) + sizeof(ps_strings);
 	if (sb != NULL) {
 		KASSERT(*sizep == size, ("invalid size"));
 		structsize = sizeof(ps_strings);
 #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
 		ps_strings = PTROUT(p->p_sysent->sv_psstrings);
 #else
 		ps_strings = p->p_sysent->sv_psstrings;
 #endif
 		sbuf_bcat(sb, &structsize, sizeof(structsize));
 		sbuf_bcat(sb, &ps_strings, sizeof(ps_strings));
 	}
 	*sizep = size;
 }
 
 static void
 __elfN(note_procstat_auxv)(void *arg, struct sbuf *sb, size_t *sizep)
 {
 	struct proc *p;
 	size_t size;
 	int structsize;
 
 	p = (struct proc *)arg;
 	if (sb == NULL) {
 		size = 0;
 		sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN);
 		sbuf_set_drain(sb, sbuf_drain_count, &size);
 		sbuf_bcat(sb, &structsize, sizeof(structsize));
 		PHOLD(p);
 		proc_getauxv(curthread, p, sb);
 		PRELE(p);
 		sbuf_finish(sb);
 		sbuf_delete(sb);
 		*sizep = size;
 	} else {
 		structsize = sizeof(Elf_Auxinfo);
 		sbuf_bcat(sb, &structsize, sizeof(structsize));
 		PHOLD(p);
 		proc_getauxv(curthread, p, sb);
 		PRELE(p);
 	}
 }
 
 static boolean_t
 __elfN(parse_notes)(struct image_params *imgp, Elf_Note *checknote,
     const char *note_vendor, const Elf_Phdr *pnote,
     boolean_t (*cb)(const Elf_Note *, void *, boolean_t *), void *cb_arg)
 {
 	const Elf_Note *note, *note0, *note_end;
 	const char *note_name;
 	char *buf;
 	int i, error;
 	boolean_t res;
 
 	/* We need some limit, might as well use PAGE_SIZE. */
 	if (pnote == NULL || pnote->p_filesz > PAGE_SIZE)
 		return (FALSE);
 	ASSERT_VOP_LOCKED(imgp->vp, "parse_notes");
 	if (pnote->p_offset > PAGE_SIZE ||
 	    pnote->p_filesz > PAGE_SIZE - pnote->p_offset) {
 		VOP_UNLOCK(imgp->vp, 0);
 		buf = malloc(pnote->p_filesz, M_TEMP, M_WAITOK);
 		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 		error = vn_rdwr(UIO_READ, imgp->vp, buf, pnote->p_filesz,
 		    pnote->p_offset, UIO_SYSSPACE, IO_NODELOCKED,
 		    curthread->td_ucred, NOCRED, NULL, curthread);
 		if (error != 0) {
 			uprintf("i/o error PT_NOTE\n");
 			goto retf;
 		}
 		note = note0 = (const Elf_Note *)buf;
 		note_end = (const Elf_Note *)(buf + pnote->p_filesz);
 	} else {
 		note = note0 = (const Elf_Note *)(imgp->image_header +
 		    pnote->p_offset);
 		note_end = (const Elf_Note *)(imgp->image_header +
 		    pnote->p_offset + pnote->p_filesz);
 		buf = NULL;
 	}
 	for (i = 0; i < 100 && note >= note0 && note < note_end; i++) {
 		if (!aligned(note, Elf32_Addr) || (const char *)note_end -
 		    (const char *)note < sizeof(Elf_Note)) {
 			goto retf;
 		}
 		if (note->n_namesz != checknote->n_namesz ||
 		    note->n_descsz != checknote->n_descsz ||
 		    note->n_type != checknote->n_type)
 			goto nextnote;
 		note_name = (const char *)(note + 1);
 		if (note_name + checknote->n_namesz >=
 		    (const char *)note_end || strncmp(note_vendor,
 		    note_name, checknote->n_namesz) != 0)
 			goto nextnote;
 
 		if (cb(note, cb_arg, &res))
 			goto ret;
 nextnote:
 		note = (const Elf_Note *)((const char *)(note + 1) +
 		    roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE) +
 		    roundup2(note->n_descsz, ELF_NOTE_ROUNDSIZE));
 	}
 retf:
 	res = FALSE;
 ret:
 	free(buf, M_TEMP);
 	return (res);
 }
 
 struct brandnote_cb_arg {
 	Elf_Brandnote *brandnote;
 	int32_t *osrel;
 };
 
 static boolean_t
 brandnote_cb(const Elf_Note *note, void *arg0, boolean_t *res)
 {
 	struct brandnote_cb_arg *arg;
 
 	arg = arg0;
 
 	/*
 	 * Fetch the osreldate for binary from the ELF OSABI-note if
 	 * necessary.
 	 */
 	*res = (arg->brandnote->flags & BN_TRANSLATE_OSREL) != 0 &&
 	    arg->brandnote->trans_osrel != NULL ?
 	    arg->brandnote->trans_osrel(note, arg->osrel) : TRUE;
 
 	return (TRUE);
 }
 
 static Elf_Note fctl_note = {
 	.n_namesz = sizeof(FREEBSD_ABI_VENDOR),
 	.n_descsz = sizeof(uint32_t),
 	.n_type = NT_FREEBSD_FEATURE_CTL,
 };
 
 struct fctl_cb_arg {
 	uint32_t *fctl0;
 };
 
 static boolean_t
 note_fctl_cb(const Elf_Note *note, void *arg0, boolean_t *res)
 {
 	struct fctl_cb_arg *arg;
 	const Elf32_Word *desc;
 	uintptr_t p;
 
 	arg = arg0;
 	p = (uintptr_t)(note + 1);
 	p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
 	desc = (const Elf32_Word *)p;
 	*arg->fctl0 = desc[0];
 	return (TRUE);
 }
 
 /*
  * Try to find the appropriate ABI-note section for checknote, fetch
  * the osreldate and feature control flags for binary from the ELF
  * OSABI-note.  Only the first page of the image is searched, the same
  * as for headers.
  */
 static boolean_t
 __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *brandnote,
     int32_t *osrel, uint32_t *fctl0)
 {
 	const Elf_Phdr *phdr;
 	const Elf_Ehdr *hdr;
 	struct brandnote_cb_arg b_arg;
 	struct fctl_cb_arg f_arg;
 	int i, j;
 
 	hdr = (const Elf_Ehdr *)imgp->image_header;
 	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
 	b_arg.brandnote = brandnote;
 	b_arg.osrel = osrel;
 	f_arg.fctl0 = fctl0;
 
 	for (i = 0; i < hdr->e_phnum; i++) {
 		if (phdr[i].p_type == PT_NOTE && __elfN(parse_notes)(imgp,
 		    &brandnote->hdr, brandnote->vendor, &phdr[i], brandnote_cb,
 		    &b_arg)) {
 			for (j = 0; j < hdr->e_phnum; j++) {
 				if (phdr[j].p_type == PT_NOTE &&
 				    __elfN(parse_notes)(imgp, &fctl_note,
 				    FREEBSD_ABI_VENDOR, &phdr[j],
 				    note_fctl_cb, &f_arg))
 					break;
 			}
 			return (TRUE);
 		}
 	}
 	return (FALSE);
 
 }
 
 /*
  * Tell kern_execve.c about it, with a little help from the linker.
  */
 static struct execsw __elfN(execsw) = {
 	.ex_imgact = __CONCAT(exec_, __elfN(imgact)),
 	.ex_name = __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
 };
 EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw));
 
 static vm_prot_t
 __elfN(trans_prot)(Elf_Word flags)
 {
 	vm_prot_t prot;
 
 	prot = 0;
 	if (flags & PF_X)
 		prot |= VM_PROT_EXECUTE;
 	if (flags & PF_W)
 		prot |= VM_PROT_WRITE;
 	if (flags & PF_R)
 		prot |= VM_PROT_READ;
 #if __ELF_WORD_SIZE == 32 && (defined(__amd64__) || defined(__i386__))
 	if (i386_read_exec && (flags & PF_R))
 		prot |= VM_PROT_EXECUTE;
 #endif
 	return (prot);
 }
 
 static Elf_Word
 __elfN(untrans_prot)(vm_prot_t prot)
 {
 	Elf_Word flags;
 
 	flags = 0;
 	if (prot & VM_PROT_EXECUTE)
 		flags |= PF_X;
 	if (prot & VM_PROT_READ)
 		flags |= PF_R;
 	if (prot & VM_PROT_WRITE)
 		flags |= PF_W;
 	return (flags);
 }
Index: projects/capsicum-test/sys/net/if_lagg.c
===================================================================
--- projects/capsicum-test/sys/net/if_lagg.c	(revision 345709)
+++ projects/capsicum-test/sys/net/if_lagg.c	(revision 345710)
@@ -1,2258 +1,2267 @@
 /*	$OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $	*/
 
 /*
  * Copyright (c) 2005, 2006 Reyk Floeter <reyk@openbsd.org>
  * Copyright (c) 2007 Andrew Thompson <thompsa@FreeBSD.org>
  * Copyright (c) 2014, 2016 Marcelo Araujo <araujo@FreeBSD.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
  * copyright notice and this permission notice appear in all copies.
  *
  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/sx.h>
 #include <sys/taskqueue.h>
 #include <sys/eventhandler.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_clone.h>
 #include <net/if_arp.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/bpf.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #endif
 #ifdef INET
 #include <netinet/in_systm.h>
 #include <netinet/if_ether.h>
 #endif
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
 #endif
 
 #include <net/if_vlan_var.h>
 #include <net/if_lagg.h>
 #include <net/ieee8023ad_lacp.h>
 
 #define	LAGG_RLOCK()	struct epoch_tracker lagg_et; epoch_enter_preempt(net_epoch_preempt, &lagg_et)
 #define	LAGG_RUNLOCK()	epoch_exit_preempt(net_epoch_preempt, &lagg_et)
 #define	LAGG_RLOCK_ASSERT()	MPASS(in_epoch(net_epoch_preempt))
 #define	LAGG_UNLOCK_ASSERT()	MPASS(!in_epoch(net_epoch_preempt))
 
 #define	LAGG_SX_INIT(_sc)	sx_init(&(_sc)->sc_sx, "if_lagg sx")
 #define	LAGG_SX_DESTROY(_sc)	sx_destroy(&(_sc)->sc_sx)
 #define	LAGG_XLOCK(_sc)		sx_xlock(&(_sc)->sc_sx)
 #define	LAGG_XUNLOCK(_sc)	sx_xunlock(&(_sc)->sc_sx)
 #define	LAGG_SXLOCK_ASSERT(_sc)	sx_assert(&(_sc)->sc_sx, SA_LOCKED)
 #define	LAGG_XLOCK_ASSERT(_sc)	sx_assert(&(_sc)->sc_sx, SA_XLOCKED)
 
 /* Special flags we should propagate to the lagg ports. */
 static struct {
 	int flag;
 	int (*func)(struct ifnet *, int);
 } lagg_pflags[] = {
 	{IFF_PROMISC, ifpromisc},
 	{IFF_ALLMULTI, if_allmulti},
 	{0, NULL}
 };
 
 VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */
 #define	V_lagg_list	VNET(lagg_list)
 VNET_DEFINE_STATIC(struct mtx, lagg_list_mtx);
 #define	V_lagg_list_mtx	VNET(lagg_list_mtx)
 #define	LAGG_LIST_LOCK_INIT(x)		mtx_init(&V_lagg_list_mtx, \
 					"if_lagg list", NULL, MTX_DEF)
 #define	LAGG_LIST_LOCK_DESTROY(x)	mtx_destroy(&V_lagg_list_mtx)
 #define	LAGG_LIST_LOCK(x)		mtx_lock(&V_lagg_list_mtx)
 #define	LAGG_LIST_UNLOCK(x)		mtx_unlock(&V_lagg_list_mtx)
 eventhandler_tag	lagg_detach_cookie = NULL;
 
 static int	lagg_clone_create(struct if_clone *, int, caddr_t);
 static void	lagg_clone_destroy(struct ifnet *);
 VNET_DEFINE_STATIC(struct if_clone *, lagg_cloner);
 #define	V_lagg_cloner	VNET(lagg_cloner)
 static const char laggname[] = "lagg";
+static MALLOC_DEFINE(M_LAGG, laggname, "802.3AD Link Aggregation Interface");
 
 static void	lagg_capabilities(struct lagg_softc *);
 static int	lagg_port_create(struct lagg_softc *, struct ifnet *);
 static int	lagg_port_destroy(struct lagg_port *, int);
 static struct mbuf *lagg_input(struct ifnet *, struct mbuf *);
 static void	lagg_linkstate(struct lagg_softc *);
 static void	lagg_port_state(struct ifnet *, int);
 static int	lagg_port_ioctl(struct ifnet *, u_long, caddr_t);
 static int	lagg_port_output(struct ifnet *, struct mbuf *,
 		    const struct sockaddr *, struct route *);
 static void	lagg_port_ifdetach(void *arg __unused, struct ifnet *);
 #ifdef LAGG_PORT_STACKING
 static int	lagg_port_checkstacking(struct lagg_softc *);
 #endif
 static void	lagg_port2req(struct lagg_port *, struct lagg_reqport *);
 static void	lagg_init(void *);
 static void	lagg_stop(struct lagg_softc *);
 static int	lagg_ioctl(struct ifnet *, u_long, caddr_t);
 #ifdef RATELIMIT
 static int	lagg_snd_tag_alloc(struct ifnet *,
 		    union if_snd_tag_alloc_params *,
 		    struct m_snd_tag **);
 static void	lagg_snd_tag_free(struct m_snd_tag *);
 #endif
 static int	lagg_setmulti(struct lagg_port *);
 static int	lagg_clrmulti(struct lagg_port *);
 static	int	lagg_setcaps(struct lagg_port *, int cap);
 static	int	lagg_setflag(struct lagg_port *, int, int,
 		    int (*func)(struct ifnet *, int));
 static	int	lagg_setflags(struct lagg_port *, int status);
 static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt);
 static int	lagg_transmit(struct ifnet *, struct mbuf *);
 static void	lagg_qflush(struct ifnet *);
 static int	lagg_media_change(struct ifnet *);
 static void	lagg_media_status(struct ifnet *, struct ifmediareq *);
 static struct lagg_port *lagg_link_active(struct lagg_softc *,
 	    struct lagg_port *);
 
 /* Simple round robin */
 static void	lagg_rr_attach(struct lagg_softc *);
 static int	lagg_rr_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 
 /* Active failover */
 static int	lagg_fail_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 
 /* Loadbalancing */
 static void	lagg_lb_attach(struct lagg_softc *);
 static void	lagg_lb_detach(struct lagg_softc *);
 static int	lagg_lb_port_create(struct lagg_port *);
 static void	lagg_lb_port_destroy(struct lagg_port *);
 static int	lagg_lb_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 static int	lagg_lb_porttable(struct lagg_softc *, struct lagg_port *);
 
 /* Broadcast */
 static int    lagg_bcast_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_bcast_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 
 /* 802.3ad LACP */
 static void	lagg_lacp_attach(struct lagg_softc *);
 static void	lagg_lacp_detach(struct lagg_softc *);
 static int	lagg_lacp_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 static void	lagg_lacp_lladdr(struct lagg_softc *);
 
 /* lagg protocol table */
 static const struct lagg_proto {
 	lagg_proto	pr_num;
 	void		(*pr_attach)(struct lagg_softc *);
 	void		(*pr_detach)(struct lagg_softc *);
 	int		(*pr_start)(struct lagg_softc *, struct mbuf *);
 	struct mbuf *	(*pr_input)(struct lagg_softc *, struct lagg_port *,
 			    struct mbuf *);
 	int		(*pr_addport)(struct lagg_port *);
 	void		(*pr_delport)(struct lagg_port *);
 	void		(*pr_linkstate)(struct lagg_port *);
 	void 		(*pr_init)(struct lagg_softc *);
 	void 		(*pr_stop)(struct lagg_softc *);
 	void 		(*pr_lladdr)(struct lagg_softc *);
 	void		(*pr_request)(struct lagg_softc *, void *);
 	void		(*pr_portreq)(struct lagg_port *, void *);
 } lagg_protos[] = {
     {
 	.pr_num = LAGG_PROTO_NONE
     },
     {
 	.pr_num = LAGG_PROTO_ROUNDROBIN,
 	.pr_attach = lagg_rr_attach,
 	.pr_start = lagg_rr_start,
 	.pr_input = lagg_rr_input,
     },
     {
 	.pr_num = LAGG_PROTO_FAILOVER,
 	.pr_start = lagg_fail_start,
 	.pr_input = lagg_fail_input,
     },
     {
 	.pr_num = LAGG_PROTO_LOADBALANCE,
 	.pr_attach = lagg_lb_attach,
 	.pr_detach = lagg_lb_detach,
 	.pr_start = lagg_lb_start,
 	.pr_input = lagg_lb_input,
 	.pr_addport = lagg_lb_port_create,
 	.pr_delport = lagg_lb_port_destroy,
     },
     {
 	.pr_num = LAGG_PROTO_LACP,
 	.pr_attach = lagg_lacp_attach,
 	.pr_detach = lagg_lacp_detach,
 	.pr_start = lagg_lacp_start,
 	.pr_input = lagg_lacp_input,
 	.pr_addport = lacp_port_create,
 	.pr_delport = lacp_port_destroy,
 	.pr_linkstate = lacp_linkstate,
 	.pr_init = lacp_init,
 	.pr_stop = lacp_stop,
 	.pr_lladdr = lagg_lacp_lladdr,
 	.pr_request = lacp_req,
 	.pr_portreq = lacp_portreq,
     },
     {
 	.pr_num = LAGG_PROTO_BROADCAST,
 	.pr_start = lagg_bcast_start,
 	.pr_input = lagg_bcast_input,
     },
 };
 
 SYSCTL_DECL(_net_link);
 SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0,
     "Link Aggregation");
 
 /* Allow input on any failover links */
 VNET_DEFINE_STATIC(int, lagg_failover_rx_all);
 #define	V_lagg_failover_rx_all	VNET(lagg_failover_rx_all)
 SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(lagg_failover_rx_all), 0,
     "Accept input from any interface in a failover lagg");
 
 /* Default value for using flowid */
 VNET_DEFINE_STATIC(int, def_use_flowid) = 0;
 #define	V_def_use_flowid	VNET(def_use_flowid)
 SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN,
     &VNET_NAME(def_use_flowid), 0,
     "Default setting for using flow id for load sharing");
 
 /* Default value for flowid shift */
 VNET_DEFINE_STATIC(int, def_flowid_shift) = 16;
 #define	V_def_flowid_shift	VNET(def_flowid_shift)
 SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN,
     &VNET_NAME(def_flowid_shift), 0,
     "Default setting for flowid shift for load sharing");
 
 static void
 vnet_lagg_init(const void *unused __unused)
 {
 
 	LAGG_LIST_LOCK_INIT();
 	SLIST_INIT(&V_lagg_list);
 	V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create,
 	    lagg_clone_destroy, 0);
 }
 VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_lagg_init, NULL);
 
 static void
 vnet_lagg_uninit(const void *unused __unused)
 {
 
 	if_clone_detach(V_lagg_cloner);
 	LAGG_LIST_LOCK_DESTROY();
 }
 VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_lagg_uninit, NULL);
 
 static int
 lagg_modevent(module_t mod, int type, void *data)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		lagg_input_p = lagg_input;
 		lagg_linkstate_p = lagg_port_state;
 		lagg_detach_cookie = EVENTHANDLER_REGISTER(
 		    ifnet_departure_event, lagg_port_ifdetach, NULL,
 		    EVENTHANDLER_PRI_ANY);
 		break;
 	case MOD_UNLOAD:
 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 		    lagg_detach_cookie);
 		lagg_input_p = NULL;
 		lagg_linkstate_p = NULL;
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 static moduledata_t lagg_mod = {
 	"if_lagg",
 	lagg_modevent,
 	0
 };
 
 DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_lagg, 1);
 
 static void
 lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr)
 {
 
 	LAGG_XLOCK_ASSERT(sc);
 	KASSERT(sc->sc_proto == LAGG_PROTO_NONE, ("%s: sc %p has proto",
 	    __func__, sc));
 
 	if (sc->sc_ifflags & IFF_DEBUG)
 		if_printf(sc->sc_ifp, "using proto %u\n", pr);
 
 	if (lagg_protos[pr].pr_attach != NULL)
 		lagg_protos[pr].pr_attach(sc);
 	sc->sc_proto = pr;
 }
 
 static void
 lagg_proto_detach(struct lagg_softc *sc)
 {
 	lagg_proto pr;
 
 	LAGG_XLOCK_ASSERT(sc);
 	pr = sc->sc_proto;
 	sc->sc_proto = LAGG_PROTO_NONE;
 
 	if (lagg_protos[pr].pr_detach != NULL)
 		lagg_protos[pr].pr_detach(sc);
 }
 
 static int
 lagg_proto_start(struct lagg_softc *sc, struct mbuf *m)
 {
 
 	return (lagg_protos[sc->sc_proto].pr_start(sc, m));
 }
 
 static struct mbuf *
 lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 
 	return (lagg_protos[sc->sc_proto].pr_input(sc, lp, m));
 }
 
 static int
 lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_addport == NULL)
 		return (0);
 	else
 		return (lagg_protos[sc->sc_proto].pr_addport(lp));
 }
 
 static void
 lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_delport != NULL)
 		lagg_protos[sc->sc_proto].pr_delport(lp);
 }
 
 static void
 lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_linkstate != NULL)
 		lagg_protos[sc->sc_proto].pr_linkstate(lp);
 }
 
 static void
 lagg_proto_init(struct lagg_softc *sc)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_init != NULL)
 		lagg_protos[sc->sc_proto].pr_init(sc);
 }
 
 static void
 lagg_proto_stop(struct lagg_softc *sc)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_stop != NULL)
 		lagg_protos[sc->sc_proto].pr_stop(sc);
 }
 
 static void
 lagg_proto_lladdr(struct lagg_softc *sc)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_lladdr != NULL)
 		lagg_protos[sc->sc_proto].pr_lladdr(sc);
 }
 
 static void
 lagg_proto_request(struct lagg_softc *sc, void *v)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_request != NULL)
 		lagg_protos[sc->sc_proto].pr_request(sc, v);
 }
 
 static void
 lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_portreq != NULL)
 		lagg_protos[sc->sc_proto].pr_portreq(lp, v);
 }
 
 /*
  * This routine is run via an vlan
  * config EVENT
  */
 static void
 lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
 {
 	struct lagg_softc *sc = ifp->if_softc;
 	struct lagg_port *lp;
 
 	if (ifp->if_softc !=  arg)   /* Not our event */
 		return;
 
 	LAGG_RLOCK();
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag);
 	LAGG_RUNLOCK();
 }
 
 /*
  * This routine is run via an vlan
  * unconfig EVENT
  */
 static void
 lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
 {
 	struct lagg_softc *sc = ifp->if_softc;
 	struct lagg_port *lp;
 
 	if (ifp->if_softc !=  arg)   /* Not our event */
 		return;
 
 	LAGG_RLOCK();
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag);
 	LAGG_RUNLOCK();
 }
 
 static int
 lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct lagg_softc *sc;
 	struct ifnet *ifp;
 	static const u_char eaddr[6];	/* 00:00:00:00:00:00 */
 
-	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
+	sc = malloc(sizeof(*sc), M_LAGG, M_WAITOK|M_ZERO);
 	ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
-		free(sc, M_DEVBUF);
+		free(sc, M_LAGG);
 		return (ENOSPC);
 	}
 	LAGG_SX_INIT(sc);
 
 	LAGG_XLOCK(sc);
 	if (V_def_use_flowid)
 		sc->sc_opts |= LAGG_OPT_USE_FLOWID;
 	sc->flowid_shift = V_def_flowid_shift;
 
 	/* Hash all layers by default */
 	sc->sc_flags = MBUF_HASHFLAG_L2|MBUF_HASHFLAG_L3|MBUF_HASHFLAG_L4;
 
 	lagg_proto_attach(sc, LAGG_PROTO_DEFAULT);
 
 	CK_SLIST_INIT(&sc->sc_ports);
 
 	/* Initialise pseudo media types */
 	ifmedia_init(&sc->sc_media, 0, lagg_media_change,
 	    lagg_media_status);
 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
 	ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);
 
 	if_initname(ifp, laggname, unit);
 	ifp->if_softc = sc;
 	ifp->if_transmit = lagg_transmit;
 	ifp->if_qflush = lagg_qflush;
 	ifp->if_init = lagg_init;
 	ifp->if_ioctl = lagg_ioctl;
 	ifp->if_get_counter = lagg_get_counter;
 	ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;
 #ifdef RATELIMIT
 	ifp->if_snd_tag_alloc = lagg_snd_tag_alloc;
 	ifp->if_snd_tag_free = lagg_snd_tag_free;
 #endif
 	ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
 
 	/*
 	 * Attach as an ordinary ethernet device, children will be attached
 	 * as special device IFT_IEEE8023ADLAG.
 	 */
 	ether_ifattach(ifp, eaddr);
 
 	sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
 		lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
 	sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
 		lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);
 
 	/* Insert into the global list of laggs */
 	LAGG_LIST_LOCK();
 	SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries);
 	LAGG_LIST_UNLOCK();
 	LAGG_XUNLOCK(sc);
 
 	return (0);
 }
 
 static void
 lagg_clone_destroy(struct ifnet *ifp)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct lagg_port *lp;
 
 	LAGG_XLOCK(sc);
 	sc->sc_destroying = 1;
 	lagg_stop(sc);
 	ifp->if_flags &= ~IFF_UP;
 
 	EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach);
 	EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach);
 
 	/* Shutdown and remove lagg ports */
 	while ((lp = CK_SLIST_FIRST(&sc->sc_ports)) != NULL)
 		lagg_port_destroy(lp, 1);
 
 	/* Unhook the aggregation protocol */
 	lagg_proto_detach(sc);
 	LAGG_XUNLOCK(sc);
 
 	ifmedia_removeall(&sc->sc_media);
 	ether_ifdetach(ifp);
 	if_free(ifp);
 
 	LAGG_LIST_LOCK();
 	SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries);
 	LAGG_LIST_UNLOCK();
 
 	LAGG_SX_DESTROY(sc);
-	free(sc, M_DEVBUF);
+	free(sc, M_LAGG);
 }
 
 static void
 lagg_capabilities(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 	int cap, ena, pena;
 	uint64_t hwa;
 	struct ifnet_hw_tsomax hw_tsomax;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	/* Get common enabled capabilities for the lagg ports */
 	ena = ~0;
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		ena &= lp->lp_ifp->if_capenable;
 	ena = (ena == ~0 ? 0 : ena);
 
 	/*
 	 * Apply common enabled capabilities back to the lagg ports.
 	 * May require several iterations if they are dependent.
 	 */
 	do {
 		pena = ena;
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			lagg_setcaps(lp, ena);
 			ena &= lp->lp_ifp->if_capenable;
 		}
 	} while (pena != ena);
 
 	/* Get other capabilities from the lagg ports */
 	cap = ~0;
 	hwa = ~(uint64_t)0;
 	memset(&hw_tsomax, 0, sizeof(hw_tsomax));
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		cap &= lp->lp_ifp->if_capabilities;
 		hwa &= lp->lp_ifp->if_hwassist;
 		if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax);
 	}
 	cap = (cap == ~0 ? 0 : cap);
 	hwa = (hwa == ~(uint64_t)0 ? 0 : hwa);
 
 	if (sc->sc_ifp->if_capabilities != cap ||
 	    sc->sc_ifp->if_capenable != ena ||
 	    sc->sc_ifp->if_hwassist != hwa ||
 	    if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) {
 		sc->sc_ifp->if_capabilities = cap;
 		sc->sc_ifp->if_capenable = ena;
 		sc->sc_ifp->if_hwassist = hwa;
 		getmicrotime(&sc->sc_ifp->if_lastchange);
 
 		if (sc->sc_ifflags & IFF_DEBUG)
 			if_printf(sc->sc_ifp,
 			    "capabilities 0x%08x enabled 0x%08x\n", cap, ena);
 	}
 }
 
 static int
 lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp)
 {
 	struct lagg_softc *sc_ptr;
 	struct lagg_port *lp, *tlp;
 	struct ifreq ifr;
 	int error, i, oldmtu;
 	uint64_t *pval;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	if (sc->sc_ifp == ifp) {
 		if_printf(sc->sc_ifp,
 		    "cannot add a lagg to itself as a port\n");
 		return (EINVAL);
 	}
 
 	/* Limit the maximal number of lagg ports */
 	if (sc->sc_count >= LAGG_MAX_PORTS)
 		return (ENOSPC);
 
 	/* Check if port has already been associated to a lagg */
 	if (ifp->if_lagg != NULL) {
 		/* Port is already in the current lagg? */
 		lp = (struct lagg_port *)ifp->if_lagg;
 		if (lp->lp_softc == sc)
 			return (EEXIST);
 		return (EBUSY);
 	}
 
 	/* XXX Disallow non-ethernet interfaces (this should be any of 802) */
 	if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN)
 		return (EPROTONOSUPPORT);
 
 	/* Allow the first Ethernet member to define the MTU */
 	oldmtu = -1;
 	if (CK_SLIST_EMPTY(&sc->sc_ports)) {
 		sc->sc_ifp->if_mtu = ifp->if_mtu;
 	} else if (sc->sc_ifp->if_mtu != ifp->if_mtu) {
 		if (ifp->if_ioctl == NULL) {
 			if_printf(sc->sc_ifp, "cannot change MTU for %s\n",
 			    ifp->if_xname);
 			return (EINVAL);
 		}
 		oldmtu = ifp->if_mtu;
 		strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name));
 		ifr.ifr_mtu = sc->sc_ifp->if_mtu;
 		error = (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr);
 		if (error != 0) {
 			if_printf(sc->sc_ifp, "invalid MTU for %s\n",
 			    ifp->if_xname);
 			return (error);
 		}
 		ifr.ifr_mtu = oldmtu;
 	}
 
-	lp = malloc(sizeof(struct lagg_port), M_DEVBUF, M_WAITOK|M_ZERO);
+	lp = malloc(sizeof(struct lagg_port), M_LAGG, M_WAITOK|M_ZERO);
 	lp->lp_softc = sc;
 
 	/* Check if port is a stacked lagg */
 	LAGG_LIST_LOCK();
 	SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) {
 		if (ifp == sc_ptr->sc_ifp) {
 			LAGG_LIST_UNLOCK();
-			free(lp, M_DEVBUF);
+			free(lp, M_LAGG);
 			if (oldmtu != -1)
 				(*ifp->if_ioctl)(ifp, SIOCSIFMTU,
 				    (caddr_t)&ifr);
 			return (EINVAL);
 			/* XXX disable stacking for the moment, its untested */
 #ifdef LAGG_PORT_STACKING
 			lp->lp_flags |= LAGG_PORT_STACK;
 			if (lagg_port_checkstacking(sc_ptr) >=
 			    LAGG_MAX_STACKING) {
 				LAGG_LIST_UNLOCK();
-				free(lp, M_DEVBUF);
+				free(lp, M_LAGG);
 				if (oldmtu != -1)
 					(*ifp->if_ioctl)(ifp, SIOCSIFMTU,
 					    (caddr_t)&ifr);
 				return (E2BIG);
 			}
 #endif
 		}
 	}
 	LAGG_LIST_UNLOCK();
 
 	if_ref(ifp);
 	lp->lp_ifp = ifp;
 
 	bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN);
 	lp->lp_ifcapenable = ifp->if_capenable;
 	if (CK_SLIST_EMPTY(&sc->sc_ports)) {
 		bcopy(IF_LLADDR(ifp), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
 		lagg_proto_lladdr(sc);
 		EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
 	} else {
 		if_setlladdr(ifp, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
 	}
 	lagg_setflags(lp, 1);
 
 	if (CK_SLIST_EMPTY(&sc->sc_ports))
 		sc->sc_primary = lp;
 
 	/* Change the interface type */
 	lp->lp_iftype = ifp->if_type;
 	ifp->if_type = IFT_IEEE8023ADLAG;
 	ifp->if_lagg = lp;
 	lp->lp_ioctl = ifp->if_ioctl;
 	ifp->if_ioctl = lagg_port_ioctl;
 	lp->lp_output = ifp->if_output;
 	ifp->if_output = lagg_port_output;
 
 	/* Read port counters */
 	pval = lp->port_counters.val;
 	for (i = 0; i < IFCOUNTERS; i++, pval++)
 		*pval = ifp->if_get_counter(ifp, i);
 
 	/*
 	 * Insert into the list of ports.
 	 * Keep ports sorted by if_index. It is handy, when configuration
 	 * is predictable and `ifconfig laggN create ...` command
 	 * will lead to the same result each time.
 	 */
-	LAGG_RLOCK();
 	CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) {
 		if (tlp->lp_ifp->if_index < ifp->if_index && (
 		    CK_SLIST_NEXT(tlp, lp_entries) == NULL ||
 		    ((struct  lagg_port*)CK_SLIST_NEXT(tlp, lp_entries))->lp_ifp->if_index >
 		    ifp->if_index))
 			break;
 	}
-	LAGG_RUNLOCK();
 	if (tlp != NULL)
 		CK_SLIST_INSERT_AFTER(tlp, lp, lp_entries);
 	else
 		CK_SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries);
 	sc->sc_count++;
 
 	lagg_setmulti(lp);
 
 
 	if ((error = lagg_proto_addport(sc, lp)) != 0) {
 		/* Remove the port, without calling pr_delport. */
 		lagg_port_destroy(lp, 0);
 		if (oldmtu != -1)
 			(*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr);
 		return (error);
 	}
 
 	/* Update lagg capabilities */
 	lagg_capabilities(sc);
 	lagg_linkstate(sc);
 
 	return (0);
 }
 
 #ifdef LAGG_PORT_STACKING
 static int
 lagg_port_checkstacking(struct lagg_softc *sc)
 {
 	struct lagg_softc *sc_ptr;
 	struct lagg_port *lp;
 	int m = 0;
 
 	LAGG_SXLOCK_ASSERT(sc);
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (lp->lp_flags & LAGG_PORT_STACK) {
 			sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc;
 			m = MAX(m, lagg_port_checkstacking(sc_ptr));
 		}
 	}
 
 	return (m + 1);
 }
 #endif
 
 static void
 lagg_port_destroy_cb(epoch_context_t ec)
 {
 	struct lagg_port *lp;
 	struct ifnet *ifp;
 
 	lp = __containerof(ec, struct lagg_port, lp_epoch_ctx);
 	ifp = lp->lp_ifp;
 
 	if_rele(ifp);
-	free(lp, M_DEVBUF);
+	free(lp, M_LAGG);
 }
 
 static int
 lagg_port_destroy(struct lagg_port *lp, int rundelport)
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	struct lagg_port *lp_ptr, *lp0;
 	struct ifnet *ifp = lp->lp_ifp;
 	uint64_t *pval, vdiff;
 	int i;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	if (rundelport)
 		lagg_proto_delport(sc, lp);
 
 	if (lp->lp_detaching == 0)
 		lagg_clrmulti(lp);
 
 	/* Restore interface */
 	ifp->if_type = lp->lp_iftype;
 	ifp->if_ioctl = lp->lp_ioctl;
 	ifp->if_output = lp->lp_output;
 	ifp->if_lagg = NULL;
 
 	/* Update detached port counters */
 	pval = lp->port_counters.val;
 	for (i = 0; i < IFCOUNTERS; i++, pval++) {
 		vdiff = ifp->if_get_counter(ifp, i) - *pval;
 		sc->detached_counters.val[i] += vdiff;
 	}
 
 	/* Finally, remove the port from the lagg */
 	CK_SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries);
 	sc->sc_count--;
 
 	/* Update the primary interface */
 	if (lp == sc->sc_primary) {
 		uint8_t lladdr[ETHER_ADDR_LEN];
 
 		if ((lp0 = CK_SLIST_FIRST(&sc->sc_ports)) == NULL)
 			bzero(&lladdr, ETHER_ADDR_LEN);
 		else
 			bcopy(lp0->lp_lladdr, lladdr, ETHER_ADDR_LEN);
 		sc->sc_primary = lp0;
 		if (sc->sc_destroying == 0) {
 			bcopy(lladdr, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
 			lagg_proto_lladdr(sc);
 			EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
 		}
 
 		/*
 		 * Update lladdr for each port (new primary needs update
 		 * as well, to switch from old lladdr to its 'real' one)
 		 */
 		CK_SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries)
 			if_setlladdr(lp_ptr->lp_ifp, lladdr, ETHER_ADDR_LEN);
 	}
 
 	if (lp->lp_ifflags)
 		if_printf(ifp, "%s: lp_ifflags unclean\n", __func__);
 
 	if (lp->lp_detaching == 0) {
 		lagg_setflags(lp, 0);
 		lagg_setcaps(lp, lp->lp_ifcapenable);
 		if_setlladdr(ifp, lp->lp_lladdr, ETHER_ADDR_LEN);
 	}
 
 	/*
 	 * free port and release it's ifnet reference after a grace period has
 	 * elapsed.
 	 */
 	epoch_call(net_epoch_preempt, &lp->lp_epoch_ctx, lagg_port_destroy_cb);
 	/* Update lagg capabilities */
 	lagg_capabilities(sc);
 	lagg_linkstate(sc);
 
 	return (0);
 }
 
 static int
 lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct lagg_reqport *rp = (struct lagg_reqport *)data;
 	struct lagg_softc *sc;
 	struct lagg_port *lp = NULL;
 	int error = 0;
 
 	/* Should be checked by the caller */
 	if (ifp->if_type != IFT_IEEE8023ADLAG ||
 	    (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL)
 		goto fallback;
 
 	switch (cmd) {
 	case SIOCGLAGGPORT:
 		if (rp->rp_portname[0] == '\0' ||
 		    ifunit(rp->rp_portname) != ifp) {
 			error = EINVAL;
 			break;
 		}
 
 		LAGG_RLOCK();
 		if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) {
 			error = ENOENT;
 			LAGG_RUNLOCK();
 			break;
 		}
 
 		lagg_port2req(lp, rp);
 		LAGG_RUNLOCK();
 		break;
 
 	case SIOCSIFCAP:
 		if (lp->lp_ioctl == NULL) {
 			error = EINVAL;
 			break;
 		}
 		error = (*lp->lp_ioctl)(ifp, cmd, data);
 		if (error)
 			break;
 
 		/* Update lagg interface capabilities */
 		LAGG_XLOCK(sc);
 		lagg_capabilities(sc);
 		LAGG_XUNLOCK(sc);
 		VLAN_CAPABILITIES(sc->sc_ifp);
 		break;
 
 	case SIOCSIFMTU:
 		/* Do not allow the MTU to be changed once joined */
 		error = EINVAL;
 		break;
 
 	default:
 		goto fallback;
 	}
 
 	return (error);
 
 fallback:
 	if (lp != NULL && lp->lp_ioctl != NULL)
 		return ((*lp->lp_ioctl)(ifp, cmd, data));
 
 	return (EINVAL);
 }
 
 /*
  * Requests counter @cnt data. 
  *
  * Counter value is calculated the following way:
  * 1) for each port, sum  difference between current and "initial" measurements.
  * 2) add lagg logical interface counters.
  * 3) add data from detached_counters array.
  *
  * We also do the following things on ports attach/detach:
  * 1) On port attach we store all counters it has into port_counter array. 
  * 2) On port detach we add the different between "initial" and
  *   current counters data to detached_counters array.
  */
 static uint64_t
 lagg_get_counter(struct ifnet *ifp, ift_counter cnt)
 {
 	struct lagg_softc *sc;
 	struct lagg_port *lp;
 	struct ifnet *lpifp;
 	uint64_t newval, oldval, vsum;
 
 	/* Revise this when we've got non-generic counters. */
 	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
 
 	sc = (struct lagg_softc *)ifp->if_softc;
 
 	vsum = 0;
 	LAGG_RLOCK();
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		/* Saved attached value */
 		oldval = lp->port_counters.val[cnt];
 		/* current value */
 		lpifp = lp->lp_ifp;
 		newval = lpifp->if_get_counter(lpifp, cnt);
 		/* Calculate diff and save new */
 		vsum += newval - oldval;
 	}
 	LAGG_RUNLOCK();
 
 	/*
 	 * Add counter data which might be added by upper
 	 * layer protocols operating on logical interface.
 	 */
 	vsum += if_get_counter_default(ifp, cnt);
 
 	/*
 	 * Add counter data from detached ports counters
 	 */
 	vsum += sc->detached_counters.val[cnt];
 
 
 	return (vsum);
 }
 
 /*
  * For direct output to child ports.
  */
 static int
 lagg_port_output(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
 	struct lagg_port *lp = ifp->if_lagg;
 
 	switch (dst->sa_family) {
 		case pseudo_AF_HDRCMPLT:
 		case AF_UNSPEC:
 			return ((*lp->lp_output)(ifp, m, dst, ro));
 	}
 
 	/* drop any other frames */
 	m_freem(m);
 	return (ENETDOWN);
 }
 
 static void
 lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp)
 {
 	struct lagg_port *lp;
 	struct lagg_softc *sc;
 
 	if ((lp = ifp->if_lagg) == NULL)
 		return;
 	/* If the ifnet is just being renamed, don't do anything. */
 	if (ifp->if_flags & IFF_RENAMING)
 		return;
 
 	sc = lp->lp_softc;
 
 	LAGG_XLOCK(sc);
 	lp->lp_detaching = 1;
 	lagg_port_destroy(lp, 1);
 	LAGG_XUNLOCK(sc);
 	VLAN_CAPABILITIES(sc->sc_ifp);
 }
 
 static void
 lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp)
 {
 	struct lagg_softc *sc = lp->lp_softc;
 
 	strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname));
 	strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname));
 	rp->rp_prio = lp->lp_prio;
 	rp->rp_flags = lp->lp_flags;
 	lagg_proto_portreq(sc, lp, &rp->rp_psc);
 
 	/* Add protocol specific flags */
 	switch (sc->sc_proto) {
 		case LAGG_PROTO_FAILOVER:
 			if (lp == sc->sc_primary)
 				rp->rp_flags |= LAGG_PORT_MASTER;
 			if (lp == lagg_link_active(sc, sc->sc_primary))
 				rp->rp_flags |= LAGG_PORT_ACTIVE;
 			break;
 
 		case LAGG_PROTO_ROUNDROBIN:
 		case LAGG_PROTO_LOADBALANCE:
 		case LAGG_PROTO_BROADCAST:
 			if (LAGG_PORTACTIVE(lp))
 				rp->rp_flags |= LAGG_PORT_ACTIVE;
 			break;
 
 		case LAGG_PROTO_LACP:
 			/* LACP has a different definition of active */
 			if (lacp_isactive(lp))
 				rp->rp_flags |= LAGG_PORT_ACTIVE;
 			if (lacp_iscollecting(lp))
 				rp->rp_flags |= LAGG_PORT_COLLECTING;
 			if (lacp_isdistributing(lp))
 				rp->rp_flags |= LAGG_PORT_DISTRIBUTING;
 			break;
 	}
 
 }
 
 static void
 lagg_init(void *xsc)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)xsc;
 	struct ifnet *ifp = sc->sc_ifp;
 	struct lagg_port *lp;
 
 	LAGG_XLOCK(sc);
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 		LAGG_XUNLOCK(sc);
 		return;
 	}
 
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 
 	/*
 	 * Update the port lladdrs if needed.
 	 * This might be if_setlladdr() notification
 	 * that lladdr has been changed.
 	 */
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp->lp_ifp),
 		    ETHER_ADDR_LEN) != 0)
 			if_setlladdr(lp->lp_ifp, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 	}
 
 	lagg_proto_init(sc);
 
 	LAGG_XUNLOCK(sc);
 }
 
 static void
 lagg_stop(struct lagg_softc *sc)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		return;
 
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 
 	lagg_proto_stop(sc);
 }
 
 static int
 lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct lagg_reqall *ra = (struct lagg_reqall *)data;
 	struct lagg_reqopts *ro = (struct lagg_reqopts *)data;
 	struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf;
 	struct lagg_reqflags *rf = (struct lagg_reqflags *)data;
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct lagg_port *lp;
 	struct ifnet *tpif;
 	struct thread *td = curthread;
 	char *buf, *outbuf;
 	int count, buflen, len, error = 0;
 
 	bzero(&rpbuf, sizeof(rpbuf));
 
 	switch (cmd) {
 	case SIOCGLAGG:
 		LAGG_XLOCK(sc);
 		buflen = sc->sc_count * sizeof(struct lagg_reqport);
 		outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
 		ra->ra_proto = sc->sc_proto;
 		lagg_proto_request(sc, &ra->ra_psc);
 		count = 0;
 		buf = outbuf;
 		len = min(ra->ra_size, buflen);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			if (len < sizeof(rpbuf))
 				break;
 
 			lagg_port2req(lp, &rpbuf);
 			memcpy(buf, &rpbuf, sizeof(rpbuf));
 			count++;
 			buf += sizeof(rpbuf);
 			len -= sizeof(rpbuf);
 		}
 		LAGG_XUNLOCK(sc);
 		ra->ra_ports = count;
 		ra->ra_size = count * sizeof(rpbuf);
 		error = copyout(outbuf, ra->ra_port, ra->ra_size);
 		free(outbuf, M_TEMP);
 		break;
 	case SIOCSLAGG:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if (ra->ra_proto >= LAGG_PROTO_MAX) {
 			error = EPROTONOSUPPORT;
 			break;
 		}
 
 		LAGG_XLOCK(sc);
 		lagg_proto_detach(sc);
 		LAGG_UNLOCK_ASSERT();
 		lagg_proto_attach(sc, ra->ra_proto);
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCGLAGGOPTS:
 		LAGG_XLOCK(sc);
 		ro->ro_opts = sc->sc_opts;
 		if (sc->sc_proto == LAGG_PROTO_LACP) {
 			struct lacp_softc *lsc;
 
 			lsc = (struct lacp_softc *)sc->sc_psc;
 			if (lsc->lsc_debug.lsc_tx_test != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_TXTEST;
 			if (lsc->lsc_debug.lsc_rx_test != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_RXTEST;
 			if (lsc->lsc_strict_mode != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_STRICT;
 			if (lsc->lsc_fast_timeout != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_TIMEOUT;
 
 			ro->ro_active = sc->sc_active;
 		} else {
 			ro->ro_active = 0;
 			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 				ro->ro_active += LAGG_PORTACTIVE(lp);
 		}
 		ro->ro_bkt = sc->sc_bkt;
 		ro->ro_flapping = sc->sc_flapping;
 		ro->ro_flowid_shift = sc->flowid_shift;
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCSLAGGOPTS:
 		if (sc->sc_proto == LAGG_PROTO_ROUNDROBIN) {
 			if (ro->ro_bkt == 0)
 				sc->sc_bkt = 1; // Minimum 1 packet per iface.
 			else
 				sc->sc_bkt = ro->ro_bkt;
 		}
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if (ro->ro_opts == 0)
 			break;
 		/*
 		 * Set options.  LACP options are stored in sc->sc_psc,
 		 * not in sc_opts.
 		 */
 		int valid, lacp;
 
 		switch (ro->ro_opts) {
 		case LAGG_OPT_USE_FLOWID:
 		case -LAGG_OPT_USE_FLOWID:
 		case LAGG_OPT_FLOWIDSHIFT:
 			valid = 1;
 			lacp = 0;
 			break;
 		case LAGG_OPT_LACP_TXTEST:
 		case -LAGG_OPT_LACP_TXTEST:
 		case LAGG_OPT_LACP_RXTEST:
 		case -LAGG_OPT_LACP_RXTEST:
 		case LAGG_OPT_LACP_STRICT:
 		case -LAGG_OPT_LACP_STRICT:
 		case LAGG_OPT_LACP_TIMEOUT:
 		case -LAGG_OPT_LACP_TIMEOUT:
 			valid = lacp = 1;
 			break;
 		default:
 			valid = lacp = 0;
 			break;
 		}
 
 		LAGG_XLOCK(sc);
 
 		if (valid == 0 ||
 		    (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) {
 			/* Invalid combination of options specified. */
 			error = EINVAL;
 			LAGG_XUNLOCK(sc);
 			break;	/* Return from SIOCSLAGGOPTS. */ 
 		}
 		/*
 		 * Store new options into sc->sc_opts except for
 		 * FLOWIDSHIFT and LACP options.
 		 */
 		if (lacp == 0) {
 			if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT)
 				sc->flowid_shift = ro->ro_flowid_shift;
 			else if (ro->ro_opts > 0)
 				sc->sc_opts |= ro->ro_opts;
 			else
 				sc->sc_opts &= ~ro->ro_opts;
 		} else {
 			struct lacp_softc *lsc;
 			struct lacp_port *lp;
 
 			lsc = (struct lacp_softc *)sc->sc_psc;
 
 			switch (ro->ro_opts) {
 			case LAGG_OPT_LACP_TXTEST:
 				lsc->lsc_debug.lsc_tx_test = 1;
 				break;
 			case -LAGG_OPT_LACP_TXTEST:
 				lsc->lsc_debug.lsc_tx_test = 0;
 				break;
 			case LAGG_OPT_LACP_RXTEST:
 				lsc->lsc_debug.lsc_rx_test = 1;
 				break;
 			case -LAGG_OPT_LACP_RXTEST:
 				lsc->lsc_debug.lsc_rx_test = 0;
 				break;
 			case LAGG_OPT_LACP_STRICT:
 				lsc->lsc_strict_mode = 1;
 				break;
 			case -LAGG_OPT_LACP_STRICT:
 				lsc->lsc_strict_mode = 0;
 				break;
 			case LAGG_OPT_LACP_TIMEOUT:
 				LACP_LOCK(lsc);
         			LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
                         		lp->lp_state |= LACP_STATE_TIMEOUT;
 				LACP_UNLOCK(lsc);
 				lsc->lsc_fast_timeout = 1;
 				break;
 			case -LAGG_OPT_LACP_TIMEOUT:
 				LACP_LOCK(lsc);
         			LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
                         		lp->lp_state &= ~LACP_STATE_TIMEOUT;
 				LACP_UNLOCK(lsc);
 				lsc->lsc_fast_timeout = 0;
 				break;
 			}
 		}
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCGLAGGFLAGS:
 		rf->rf_flags = 0;
 		LAGG_XLOCK(sc);
 		if (sc->sc_flags & MBUF_HASHFLAG_L2)
 			rf->rf_flags |= LAGG_F_HASHL2;
 		if (sc->sc_flags & MBUF_HASHFLAG_L3)
 			rf->rf_flags |= LAGG_F_HASHL3;
 		if (sc->sc_flags & MBUF_HASHFLAG_L4)
 			rf->rf_flags |= LAGG_F_HASHL4;
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCSLAGGHASH:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) {
 			error = EINVAL;
 			break;
 		}
 		LAGG_XLOCK(sc);
 		sc->sc_flags = 0;
 		if (rf->rf_flags & LAGG_F_HASHL2)
 			sc->sc_flags |= MBUF_HASHFLAG_L2;
 		if (rf->rf_flags & LAGG_F_HASHL3)
 			sc->sc_flags |= MBUF_HASHFLAG_L3;
 		if (rf->rf_flags & LAGG_F_HASHL4)
 			sc->sc_flags |= MBUF_HASHFLAG_L4;
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCGLAGGPORT:
 		if (rp->rp_portname[0] == '\0' ||
 		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
 			error = EINVAL;
 			break;
 		}
 
 		LAGG_RLOCK();
 		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
 		    lp->lp_softc != sc) {
 			error = ENOENT;
 			LAGG_RUNLOCK();
 			if_rele(tpif);
 			break;
 		}
 
 		lagg_port2req(lp, rp);
 		LAGG_RUNLOCK();
 		if_rele(tpif);
 		break;
 	case SIOCSLAGGPORT:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if (rp->rp_portname[0] == '\0' ||
 		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
 			error = EINVAL;
 			break;
 		}
 #ifdef INET6
 		/*
 		 * A laggport interface should not have inet6 address
 		 * because two interfaces with a valid link-local
 		 * scope zone must not be merged in any form.  This
 		 * restriction is needed to prevent violation of
 		 * link-local scope zone.  Attempts to add a laggport
 		 * interface which has inet6 addresses triggers
 		 * removal of all inet6 addresses on the member
 		 * interface.
 		 */
 		if (in6ifa_llaonifp(tpif)) {
 			in6_ifdetach(tpif);
 				if_printf(sc->sc_ifp,
 				    "IPv6 addresses on %s have been removed "
 				    "before adding it as a member to prevent "
 				    "IPv6 address scope violation.\n",
 				    tpif->if_xname);
 		}
 #endif
 		LAGG_XLOCK(sc);
 		error = lagg_port_create(sc, tpif);
 		LAGG_XUNLOCK(sc);
 		if_rele(tpif);
 		VLAN_CAPABILITIES(ifp);
 		break;
 	case SIOCSLAGGDELPORT:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if (rp->rp_portname[0] == '\0' ||
 		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
 			error = EINVAL;
 			break;
 		}
 
 		LAGG_XLOCK(sc);
 		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
 		    lp->lp_softc != sc) {
 			error = ENOENT;
 			LAGG_XUNLOCK(sc);
 			if_rele(tpif);
 			break;
 		}
 
 		error = lagg_port_destroy(lp, 1);
 		LAGG_XUNLOCK(sc);
 		if_rele(tpif);
 		VLAN_CAPABILITIES(ifp);
 		break;
 	case SIOCSIFFLAGS:
 		/* Set flags on ports too */
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			lagg_setflags(lp, 1);
 		}
 
 		if (!(ifp->if_flags & IFF_UP) &&
 		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			/*
 			 * If interface is marked down and it is running,
 			 * then stop and disable it.
 			 */
 			lagg_stop(sc);
 			LAGG_XUNLOCK(sc);
 		} else if ((ifp->if_flags & IFF_UP) &&
 		    !(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			/*
 			 * If interface is marked up and it is stopped, then
 			 * start it.
 			 */
 			LAGG_XUNLOCK(sc);
 			(*ifp->if_init)(sc);
 		} else
 			LAGG_XUNLOCK(sc);
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			lagg_clrmulti(lp);
 			lagg_setmulti(lp);
 		}
 		LAGG_XUNLOCK(sc);
 		error = 0;
 		break;
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
 		break;
 
 	case SIOCSIFCAP:
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			if (lp->lp_ioctl != NULL)
 				(*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
 		}
 		lagg_capabilities(sc);
 		LAGG_XUNLOCK(sc);
 		VLAN_CAPABILITIES(ifp);
 		error = 0;
 		break;
 
 	case SIOCSIFMTU:
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			if (lp->lp_ioctl != NULL)
 				error = (*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
 			else
 				error = EINVAL;
 			if (error != 0) {
 				if_printf(ifp,
 				    "failed to change MTU to %d on port %s, "
 				    "reverting all ports to original MTU (%d)\n",
 				    ifr->ifr_mtu, lp->lp_ifp->if_xname, ifp->if_mtu);
 				break;
 			}
 		}
 		if (error == 0) {
 			ifp->if_mtu = ifr->ifr_mtu;
 		} else {
 			/* set every port back to the original MTU */
 			ifr->ifr_mtu = ifp->if_mtu;
 			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 				if (lp->lp_ioctl != NULL)
 					(*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
 			}
 		}
 		LAGG_XUNLOCK(sc);
 		break;
 
 	default:
 		error = ether_ioctl(ifp, cmd, data);
 		break;
 	}
 	return (error);
 }
 
 #ifdef RATELIMIT
 static int
 lagg_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct lagg_port *lp;
 	struct lagg_lb *lb;
 	uint32_t p;
 
+	LAGG_RLOCK();
 	switch (sc->sc_proto) {
 	case LAGG_PROTO_FAILOVER:
 		lp = lagg_link_active(sc, sc->sc_primary);
 		break;
 	case LAGG_PROTO_LOADBALANCE:
 		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
-		    params->hdr.flowtype == M_HASHTYPE_NONE)
+		    params->hdr.flowtype == M_HASHTYPE_NONE) {
+			LAGG_RUNLOCK();
 			return (EOPNOTSUPP);
+		}
 		p = params->hdr.flowid >> sc->flowid_shift;
 		p %= sc->sc_count;
 		lb = (struct lagg_lb *)sc->sc_psc;
 		lp = lb->lb_ports[p];
 		lp = lagg_link_active(sc, lp);
 		break;
 	case LAGG_PROTO_LACP:
 		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
-		    params->hdr.flowtype == M_HASHTYPE_NONE)
+		    params->hdr.flowtype == M_HASHTYPE_NONE) {
+			LAGG_RUNLOCK();
 			return (EOPNOTSUPP);
+		}
 		lp = lacp_select_tx_port_by_hash(sc, params->hdr.flowid);
 		break;
 	default:
+		LAGG_RUNLOCK();
 		return (EOPNOTSUPP);
 	}
-	if (lp == NULL)
+	if (lp == NULL) {
+		LAGG_RUNLOCK();
 		return (EOPNOTSUPP);
+	}
 	ifp = lp->lp_ifp;
+	LAGG_RUNLOCK();
 	if (ifp == NULL || ifp->if_snd_tag_alloc == NULL ||
 	    (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
 		return (EOPNOTSUPP);
 
 	/* forward allocation request */
 	return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
 }
 
 static void
 lagg_snd_tag_free(struct m_snd_tag *tag)
 {
 	tag->ifp->if_snd_tag_free(tag);
 }
 
 #endif
 
 static int
 lagg_setmulti(struct lagg_port *lp)
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	struct ifnet *ifp = lp->lp_ifp;
 	struct ifnet *scifp = sc->sc_ifp;
 	struct lagg_mc *mc;
 	struct ifmultiaddr *ifma;
 	int error;
 
 	IF_ADDR_WLOCK(scifp);
 	CK_STAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
-		mc = malloc(sizeof(struct lagg_mc), M_DEVBUF, M_NOWAIT);
+		mc = malloc(sizeof(struct lagg_mc), M_LAGG, M_NOWAIT);
 		if (mc == NULL) {
 			IF_ADDR_WUNLOCK(scifp);
 			return (ENOMEM);
 		}
 		bcopy(ifma->ifma_addr, &mc->mc_addr,
 		    ifma->ifma_addr->sa_len);
 		mc->mc_addr.sdl_index = ifp->if_index;
 		mc->mc_ifma = NULL;
 		SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries);
 	}
 	IF_ADDR_WUNLOCK(scifp);
 	SLIST_FOREACH (mc, &lp->lp_mc_head, mc_entries) {
 		error = if_addmulti(ifp,
 		    (struct sockaddr *)&mc->mc_addr, &mc->mc_ifma);
 		if (error)
 			return (error);
 	}
 	return (0);
 }
 
 static int
 lagg_clrmulti(struct lagg_port *lp)
 {
 	struct lagg_mc *mc;
 
 	LAGG_XLOCK_ASSERT(lp->lp_softc);
 	while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) {
 		SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries);
 		if (mc->mc_ifma && lp->lp_detaching == 0)
 			if_delmulti_ifma(mc->mc_ifma);
-		free(mc, M_DEVBUF);
+		free(mc, M_LAGG);
 	}
 	return (0);
 }
 
 static int
 lagg_setcaps(struct lagg_port *lp, int cap)
 {
 	struct ifreq ifr;
 
 	if (lp->lp_ifp->if_capenable == cap)
 		return (0);
 	if (lp->lp_ioctl == NULL)
 		return (ENXIO);
 	ifr.ifr_reqcap = cap;
 	return ((*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAP, (caddr_t)&ifr));
 }
 
 /* Handle a ref counted flag that should be set on the lagg port as well */
 static int
 lagg_setflag(struct lagg_port *lp, int flag, int status,
     int (*func)(struct ifnet *, int))
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	struct ifnet *scifp = sc->sc_ifp;
 	struct ifnet *ifp = lp->lp_ifp;
 	int error;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	status = status ? (scifp->if_flags & flag) : 0;
 	/* Now "status" contains the flag value or 0 */
 
 	/*
 	 * See if recorded ports status is different from what
 	 * we want it to be.  If it is, flip it.  We record ports
 	 * status in lp_ifflags so that we won't clear ports flag
 	 * we haven't set.  In fact, we don't clear or set ports
 	 * flags directly, but get or release references to them.
 	 * That's why we can be sure that recorded flags still are
 	 * in accord with actual ports flags.
 	 */
 	if (status != (lp->lp_ifflags & flag)) {
 		error = (*func)(ifp, status);
 		if (error)
 			return (error);
 		lp->lp_ifflags &= ~flag;
 		lp->lp_ifflags |= status;
 	}
 	return (0);
 }
 
 /*
  * Handle IFF_* flags that require certain changes on the lagg port
  * if "status" is true, update ports flags respective to the lagg
  * if "status" is false, forcedly clear the flags set on port.
  */
 static int
 lagg_setflags(struct lagg_port *lp, int status)
 {
 	int error, i;
 
 	for (i = 0; lagg_pflags[i].flag; i++) {
 		error = lagg_setflag(lp, lagg_pflags[i].flag,
 		    status, lagg_pflags[i].func);
 		if (error)
 			return (error);
 	}
 	return (0);
 }
 
 static int
 lagg_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	int error;
 
 	LAGG_RLOCK();
 	/* We need a Tx algorithm and at least one port */
 	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
 		LAGG_RUNLOCK();
 		m_freem(m);
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENXIO);
 	}
 
 	ETHER_BPF_MTAP(ifp, m);
 
 	error = lagg_proto_start(sc, m);
 	LAGG_RUNLOCK();
 
 	if (error != 0)
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 
 	return (error);
 }
 
 /*
  * The ifp->if_qflush entry point for lagg(4) is no-op.
  */
 static void
 lagg_qflush(struct ifnet *ifp __unused)
 {
 }
 
 static struct mbuf *
 lagg_input(struct ifnet *ifp, struct mbuf *m)
 {
 	struct lagg_port *lp = ifp->if_lagg;
 	struct lagg_softc *sc = lp->lp_softc;
 	struct ifnet *scifp = sc->sc_ifp;
 
 	LAGG_RLOCK();
 	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    lp->lp_detaching != 0 ||
 	    sc->sc_proto == LAGG_PROTO_NONE) {
 		LAGG_RUNLOCK();
 		m_freem(m);
 		return (NULL);
 	}
 
 	ETHER_BPF_MTAP(scifp, m);
 
 	m = lagg_proto_input(sc, lp, m);
 	if (m != NULL && (scifp->if_flags & IFF_MONITOR) != 0) {
 		m_freem(m);
 		m = NULL;
 	}
 
 	LAGG_RUNLOCK();
 	return (m);
 }
 
 static int
 lagg_media_change(struct ifnet *ifp)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 
 	if (sc->sc_ifflags & IFF_DEBUG)
 		printf("%s\n", __func__);
 
 	/* Ignore */
 	return (0);
 }
 
 static void
 lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct lagg_port *lp;
 
 	imr->ifm_status = IFM_AVALID;
 	imr->ifm_active = IFM_ETHER | IFM_AUTO;
 
 	LAGG_RLOCK();
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (LAGG_PORTACTIVE(lp))
 			imr->ifm_status |= IFM_ACTIVE;
 	}
 	LAGG_RUNLOCK();
 }
 
 static void
 lagg_linkstate(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 	int new_link = LINK_STATE_DOWN;
 	uint64_t speed;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	/* LACP handles link state itself */
 	if (sc->sc_proto == LAGG_PROTO_LACP)
 		return;
 
 	/* Our link is considered up if at least one of our ports is active */
 	LAGG_RLOCK();
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (lp->lp_ifp->if_link_state == LINK_STATE_UP) {
 			new_link = LINK_STATE_UP;
 			break;
 		}
 	}
 	LAGG_RUNLOCK();
 	if_link_state_change(sc->sc_ifp, new_link);
 
 	/* Update if_baudrate to reflect the max possible speed */
 	switch (sc->sc_proto) {
 		case LAGG_PROTO_FAILOVER:
 			sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ?
 			    sc->sc_primary->lp_ifp->if_baudrate : 0;
 			break;
 		case LAGG_PROTO_ROUNDROBIN:
 		case LAGG_PROTO_LOADBALANCE:
 		case LAGG_PROTO_BROADCAST:
 			speed = 0;
 			LAGG_RLOCK();
 			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 				speed += lp->lp_ifp->if_baudrate;
 			LAGG_RUNLOCK();
 			sc->sc_ifp->if_baudrate = speed;
 			break;
 		case LAGG_PROTO_LACP:
 			/* LACP updates if_baudrate itself */
 			break;
 	}
 }
 
 static void
 lagg_port_state(struct ifnet *ifp, int state)
 {
 	struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg;
 	struct lagg_softc *sc = NULL;
 
 	if (lp != NULL)
 		sc = lp->lp_softc;
 	if (sc == NULL)
 		return;
 
 	LAGG_XLOCK(sc);
 	lagg_linkstate(sc);
 	lagg_proto_linkstate(sc, lp);
 	LAGG_XUNLOCK(sc);
 }
 
 struct lagg_port *
 lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp)
 {
 	struct lagg_port *lp_next, *rval = NULL;
-	struct epoch_tracker net_et;
 
 	/*
 	 * Search a port which reports an active link state.
 	 */
 
+	/*
+	 * This is called with either LAGG_RLOCK() held or
+	 * LAGG_XLOCK(sc) held.
+	 */
+	if (!in_epoch(net_epoch_preempt))
+		LAGG_XLOCK_ASSERT(sc);
+
 	if (lp == NULL)
 		goto search;
 	if (LAGG_PORTACTIVE(lp)) {
 		rval = lp;
 		goto found;
 	}
 	if ((lp_next = CK_SLIST_NEXT(lp, lp_entries)) != NULL &&
 	    LAGG_PORTACTIVE(lp_next)) {
 		rval = lp_next;
 		goto found;
 	}
 
- search:
-	epoch_enter_preempt(net_epoch_preempt, &net_et);
+search:
 	CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
 		if (LAGG_PORTACTIVE(lp_next)) {
-			epoch_exit_preempt(net_epoch_preempt, &net_et);
 			return (lp_next);
 		}
 	}
-	epoch_exit_preempt(net_epoch_preempt, &net_et);
 found:
 	return (rval);
 }
 
 int
 lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
 {
 
 	return (ifp->if_transmit)(ifp, m);
 }
 
 /*
  * Simple round robin aggregation
  */
 static void
 lagg_rr_attach(struct lagg_softc *sc)
 {
 	sc->sc_seq = 0;
 	sc->sc_bkt_count = sc->sc_bkt;
 }
 
 static int
 lagg_rr_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_port *lp;
 	uint32_t p;
 
 	if (sc->sc_bkt_count == 0 && sc->sc_bkt > 0)
 		sc->sc_bkt_count = sc->sc_bkt;
 
 	if (sc->sc_bkt > 0) {
 		atomic_subtract_int(&sc->sc_bkt_count, 1);
 	if (atomic_cmpset_int(&sc->sc_bkt_count, 0, sc->sc_bkt))
 		p = atomic_fetchadd_32(&sc->sc_seq, 1);
 	else
 		p = sc->sc_seq; 
 	} else
 		p = atomic_fetchadd_32(&sc->sc_seq, 1);
 
 	p %= sc->sc_count;
 	lp = CK_SLIST_FIRST(&sc->sc_ports);
 
 	while (p--)
 		lp = CK_SLIST_NEXT(lp, lp_entries);
 
 	/*
 	 * Check the port's link state. This will return the next active
 	 * port if the link is down or the port is NULL.
 	 */
 	if ((lp = lagg_link_active(sc, lp)) == NULL) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Send mbuf */
 	return (lagg_enqueue(lp->lp_ifp, m));
 }
 
 static struct mbuf *
 lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 
 	/* Just pass in the packet to our lagg device */
 	m->m_pkthdr.rcvif = ifp;
 
 	return (m);
 }
 
 /*
  * Broadcast mode
  */
 static int
 lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	int active_ports = 0;
 	int errors = 0;
 	int ret;
 	struct lagg_port *lp, *last = NULL;
 	struct mbuf *m0;
 
-	LAGG_RLOCK();
+	LAGG_RLOCK_ASSERT();
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (!LAGG_PORTACTIVE(lp))
 			continue;
 
 		active_ports++;
 
 		if (last != NULL) {
 			m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 			if (m0 == NULL) {
 				ret = ENOBUFS;
 				errors++;
 				break;
 			}
 
 			ret = lagg_enqueue(last->lp_ifp, m0);
 			if (ret != 0)
 				errors++;
 		}
 		last = lp;
 	}
-	LAGG_RUNLOCK();
 
 	if (last == NULL) {
 		m_freem(m);
 		return (ENOENT);
 	}
 	if ((last = lagg_link_active(sc, last)) == NULL) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	ret = lagg_enqueue(last->lp_ifp, m);
 	if (ret != 0)
 		errors++;
 
 	if (errors == 0)
 		return (ret);
 
 	return (0);
 }
 
 static struct mbuf*
 lagg_bcast_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 
 	/* Just pass in the packet to our lagg device */
 	m->m_pkthdr.rcvif = ifp;
 	return (m);
 }
 
 /*
  * Active failover
  */
 static int
 lagg_fail_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_port *lp;
 
 	/* Use the master port if active or the next available port */
 	if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Send mbuf */
 	return (lagg_enqueue(lp->lp_ifp, m));
 }
 
 static struct mbuf *
 lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 	struct lagg_port *tmp_tp;
 
 	if (lp == sc->sc_primary || V_lagg_failover_rx_all) {
 		m->m_pkthdr.rcvif = ifp;
 		return (m);
 	}
 
 	if (!LAGG_PORTACTIVE(sc->sc_primary)) {
 		tmp_tp = lagg_link_active(sc, sc->sc_primary);
 		/*
 		 * If tmp_tp is null, we've received a packet when all
 		 * our links are down. Weird, but process it anyways.
 		 */
 		if ((tmp_tp == NULL || tmp_tp == lp)) {
 			m->m_pkthdr.rcvif = ifp;
 			return (m);
 		}
 	}
 
 	m_freem(m);
 	return (NULL);
 }
 
 /*
  * Loadbalancing
  */
 static void
 lagg_lb_attach(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 	struct lagg_lb *lb;
 
 	LAGG_XLOCK_ASSERT(sc);
-	lb = malloc(sizeof(struct lagg_lb), M_DEVBUF, M_WAITOK | M_ZERO);
+	lb = malloc(sizeof(struct lagg_lb), M_LAGG, M_WAITOK | M_ZERO);
 	lb->lb_key = m_ether_tcpip_hash_init();
 	sc->sc_psc = lb;
 
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lagg_lb_port_create(lp);
 }
 
 static void
 lagg_lb_detach(struct lagg_softc *sc)
 {
 	struct lagg_lb *lb;
 
 	lb = (struct lagg_lb *)sc->sc_psc;
 	if (lb != NULL)
-		free(lb, M_DEVBUF);
+		free(lb, M_LAGG);
 }
 
 static int
 lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp)
 {
 	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
 	struct lagg_port *lp_next;
 	int i = 0, rv;
 
 	rv = 0;
 	bzero(&lb->lb_ports, sizeof(lb->lb_ports));
-	LAGG_RLOCK();
+	LAGG_XLOCK_ASSERT(sc);
 	CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
 		if (lp_next == lp)
 			continue;
 		if (i >= LAGG_MAX_PORTS) {
 			rv = EINVAL;
 			break;
 		}
 		if (sc->sc_ifflags & IFF_DEBUG)
 			printf("%s: port %s at index %d\n",
 			    sc->sc_ifname, lp_next->lp_ifp->if_xname, i);
 		lb->lb_ports[i++] = lp_next;
 	}
-	LAGG_RUNLOCK();
 
 	return (rv);
 }
 
 static int
 lagg_lb_port_create(struct lagg_port *lp)
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	return (lagg_lb_porttable(sc, NULL));
 }
 
 static void
 lagg_lb_port_destroy(struct lagg_port *lp)
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	lagg_lb_porttable(sc, lp);
 }
 
 static int
 lagg_lb_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
 	struct lagg_port *lp = NULL;
 	uint32_t p = 0;
 
 	if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) &&
 	    M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
 		p = m->m_pkthdr.flowid >> sc->flowid_shift;
 	else
 		p = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key);
 	p %= sc->sc_count;
 	lp = lb->lb_ports[p];
 
 	/*
 	 * Check the port's link state. This will return the next active
 	 * port if the link is down or the port is NULL.
 	 */
 	if ((lp = lagg_link_active(sc, lp)) == NULL) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Send mbuf */
 	return (lagg_enqueue(lp->lp_ifp, m));
 }
 
 static struct mbuf *
 lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 
 	/* Just pass in the packet to our lagg device */
 	m->m_pkthdr.rcvif = ifp;
 
 	return (m);
 }
 
 /*
  * 802.3ad LACP
  */
 static void
 lagg_lacp_attach(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 
 	lacp_attach(sc);
 	LAGG_XLOCK_ASSERT(sc);
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_create(lp);
 }
 
 static void
 lagg_lacp_detach(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 	void *psc;
 
 	LAGG_XLOCK_ASSERT(sc);
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_destroy(lp);
 
 	psc = sc->sc_psc;
 	sc->sc_psc = NULL;
 	lacp_detach(psc);
 }
 
 static void
 lagg_lacp_lladdr(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 
 	LAGG_SXLOCK_ASSERT(sc);
 
 	/* purge all the lacp ports */
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_destroy(lp);
 
 	/* add them back in */
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_create(lp);
 }
 
 static int
 lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_port *lp;
 
 	lp = lacp_select_tx_port(sc, m);
 	if (lp == NULL) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Send mbuf */
 	return (lagg_enqueue(lp->lp_ifp, m));
 }
 
 static struct mbuf *
 lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 	struct ether_header *eh;
 	u_short etype;
 
 	eh = mtod(m, struct ether_header *);
 	etype = ntohs(eh->ether_type);
 
 	/* Tap off LACP control messages */
 	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) {
 		m = lacp_input(lp, m);
 		if (m == NULL)
 			return (NULL);
 	}
 
 	/*
 	 * If the port is not collecting or not in the active aggregator then
 	 * free and return.
 	 */
 	if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) {
 		m_freem(m);
 		return (NULL);
 	}
 
 	m->m_pkthdr.rcvif = ifp;
 	return (m);
 }
 
Index: projects/capsicum-test/sys/net/iflib.c
===================================================================
--- projects/capsicum-test/sys/net/iflib.c	(revision 345709)
+++ projects/capsicum-test/sys/net/iflib.c	(revision 345710)
@@ -1,6569 +1,6572 @@
 /*-
  * Copyright (c) 2014-2018, Matthew Macy <mmacy@mattmacy.io>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  *  1. Redistributions of source code must retain the above copyright notice,
  *     this list of conditions and the following disclaimer.
  *
  *  2. Neither the name of Matthew Macy nor the names of its
  *     contributors may be used to endorse or promote products derived from
  *     this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_acpi.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/bus.h>
 #include <sys/eventhandler.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/md5.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/kobj.h>
 #include <sys/rman.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/taskqueue.h>
 #include <sys/limits.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_media.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/mp_ring.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_lro.h>
 #include <netinet/in_systm.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <netinet/ip_var.h>
 #include <netinet/netdump/netdump.h>
 #include <netinet6/ip6_var.h>
 
 #include <machine/bus.h>
 #include <machine/in_cksum.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <dev/led/led.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pci_private.h>
 
 #include <net/iflib.h>
 #include <net/iflib_private.h>
 
 #include "ifdi_if.h"
 
 #ifdef PCI_IOV
 #include <dev/pci/pci_iov.h>
 #endif
 
 #include <sys/bitstring.h>
 /*
  * enable accounting of every mbuf as it comes in to and goes out of
  * iflib's software descriptor references
  */
 #define MEMORY_LOGGING 0
 /*
  * Enable mbuf vectors for compressing long mbuf chains
  */
 
 /*
  * NB:
  * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead
  *   we prefetch needs to be determined by the time spent in m_free vis a vis
  *   the cost of a prefetch. This will of course vary based on the workload:
  *      - NFLX's m_free path is dominated by vm-based M_EXT manipulation which
  *        is quite expensive, thus suggesting very little prefetch.
  *      - small packet forwarding which is just returning a single mbuf to
  *        UMA will typically be very fast vis a vis the cost of a memory
  *        access.
  */
 
 
 /*
  * File organization:
  *  - private structures
  *  - iflib private utility functions
  *  - ifnet functions
  *  - vlan registry and other exported functions
  *  - iflib public core functions
  *
  *
  */
 MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library");
 
 struct iflib_txq;
 typedef struct iflib_txq *iflib_txq_t;
 struct iflib_rxq;
 typedef struct iflib_rxq *iflib_rxq_t;
 struct iflib_fl;
 typedef struct iflib_fl *iflib_fl_t;
 
 struct iflib_ctx;
 
 static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid);
 static void iflib_timer(void *arg);
 
 typedef struct iflib_filter_info {
 	driver_filter_t *ifi_filter;
 	void *ifi_filter_arg;
 	struct grouptask *ifi_task;
 	void *ifi_ctx;
 } *iflib_filter_info_t;
 
 struct iflib_ctx {
 	KOBJ_FIELDS;
 	/*
 	 * Pointer to hardware driver's softc
 	 */
 	void *ifc_softc;
 	device_t ifc_dev;
 	if_t ifc_ifp;
 
 	cpuset_t ifc_cpus;
 	if_shared_ctx_t ifc_sctx;
 	struct if_softc_ctx ifc_softc_ctx;
 
 	struct sx ifc_ctx_sx;
 	struct mtx ifc_state_mtx;
 
 	iflib_txq_t ifc_txqs;
 	iflib_rxq_t ifc_rxqs;
 	uint32_t ifc_if_flags;
 	uint32_t ifc_flags;
 	uint32_t ifc_max_fl_buf_size;
 	uint32_t ifc_rx_mbuf_sz;
 
 	int ifc_link_state;
 	int ifc_link_irq;
 	int ifc_watchdog_events;
 	struct cdev *ifc_led_dev;
 	struct resource *ifc_msix_mem;
 
 	struct if_irq ifc_legacy_irq;
 	struct grouptask ifc_admin_task;
 	struct grouptask ifc_vflr_task;
 	struct iflib_filter_info ifc_filter_info;
 	struct ifmedia	ifc_media;
 
 	struct sysctl_oid *ifc_sysctl_node;
 	uint16_t ifc_sysctl_ntxqs;
 	uint16_t ifc_sysctl_nrxqs;
 	uint16_t ifc_sysctl_qs_eq_override;
 	uint16_t ifc_sysctl_rx_budget;
 	uint16_t ifc_sysctl_tx_abdicate;
 
 	qidx_t ifc_sysctl_ntxds[8];
 	qidx_t ifc_sysctl_nrxds[8];
 	struct if_txrx ifc_txrx;
 #define isc_txd_encap  ifc_txrx.ift_txd_encap
 #define isc_txd_flush  ifc_txrx.ift_txd_flush
 #define isc_txd_credits_update  ifc_txrx.ift_txd_credits_update
 #define isc_rxd_available ifc_txrx.ift_rxd_available
 #define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get
 #define isc_rxd_refill ifc_txrx.ift_rxd_refill
 #define isc_rxd_flush ifc_txrx.ift_rxd_flush
 #define isc_rxd_refill ifc_txrx.ift_rxd_refill
 #define isc_rxd_refill ifc_txrx.ift_rxd_refill
 #define isc_legacy_intr ifc_txrx.ift_legacy_intr
 	eventhandler_tag ifc_vlan_attach_event;
 	eventhandler_tag ifc_vlan_detach_event;
 	uint8_t ifc_mac[ETHER_ADDR_LEN];
 	char ifc_mtx_name[16];
 };
 
 
 void *
 iflib_get_softc(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_softc);
 }
 
 device_t
 iflib_get_dev(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_dev);
 }
 
 if_t
 iflib_get_ifp(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_ifp);
 }
 
 struct ifmedia *
 iflib_get_media(if_ctx_t ctx)
 {
 
 	return (&ctx->ifc_media);
 }
 
 uint32_t
 iflib_get_flags(if_ctx_t ctx)
 {
 	return (ctx->ifc_flags);
 }
 
 void
 iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN])
 {
 
 	bcopy(mac, ctx->ifc_mac, ETHER_ADDR_LEN);
 }
 
 if_softc_ctx_t
 iflib_get_softc_ctx(if_ctx_t ctx)
 {
 
 	return (&ctx->ifc_softc_ctx);
 }
 
 if_shared_ctx_t
 iflib_get_sctx(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_sctx);
 }
 
 #define IP_ALIGNED(m) ((((uintptr_t)(m)->m_data) & 0x3) == 0x2)
 #define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*))
 #define CACHE_PTR_NEXT(ptr) ((void *)(((uintptr_t)(ptr)+CACHE_LINE_SIZE-1) & (CACHE_LINE_SIZE-1)))
 
 #define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP)
 #define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
 
 typedef struct iflib_sw_rx_desc_array {
 	bus_dmamap_t	*ifsd_map;         /* bus_dma maps for packet */
 	struct mbuf	**ifsd_m;           /* pkthdr mbufs */
 	caddr_t		*ifsd_cl;          /* direct cluster pointer for rx */
 	bus_addr_t	*ifsd_ba;          /* bus addr of cluster for rx */
 } iflib_rxsd_array_t;
 
 typedef struct iflib_sw_tx_desc_array {
 	bus_dmamap_t    *ifsd_map;         /* bus_dma maps for packet */
 	bus_dmamap_t	*ifsd_tso_map;     /* bus_dma maps for TSO packet */
 	struct mbuf    **ifsd_m;           /* pkthdr mbufs */
 } if_txsd_vec_t;
 
 
 /* magic number that should be high enough for any hardware */
 #define IFLIB_MAX_TX_SEGS		128
 #define IFLIB_RX_COPY_THRESH		128
 #define IFLIB_MAX_RX_REFRESH		32
 /* The minimum descriptors per second before we start coalescing */
 #define IFLIB_MIN_DESC_SEC		16384
 #define IFLIB_DEFAULT_TX_UPDATE_FREQ	16
 #define IFLIB_QUEUE_IDLE		0
 #define IFLIB_QUEUE_HUNG		1
 #define IFLIB_QUEUE_WORKING		2
 /* maximum number of txqs that can share an rx interrupt */
 #define IFLIB_MAX_TX_SHARED_INTR	4
 
 /* this should really scale with ring size - this is a fairly arbitrary value */
 #define TX_BATCH_SIZE			32
 
 #define IFLIB_RESTART_BUDGET		8
 
 
 #define CSUM_OFFLOAD		(CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \
 				 CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \
 				 CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP)
 struct iflib_txq {
 	qidx_t		ift_in_use;
 	qidx_t		ift_cidx;
 	qidx_t		ift_cidx_processed;
 	qidx_t		ift_pidx;
 	uint8_t		ift_gen;
 	uint8_t		ift_br_offset;
 	uint16_t	ift_npending;
 	uint16_t	ift_db_pending;
 	uint16_t	ift_rs_pending;
 	/* implicit pad */
 	uint8_t		ift_txd_size[8];
 	uint64_t	ift_processed;
 	uint64_t	ift_cleaned;
 	uint64_t	ift_cleaned_prev;
 #if MEMORY_LOGGING
 	uint64_t	ift_enqueued;
 	uint64_t	ift_dequeued;
 #endif
 	uint64_t	ift_no_tx_dma_setup;
 	uint64_t	ift_no_desc_avail;
 	uint64_t	ift_mbuf_defrag_failed;
 	uint64_t	ift_mbuf_defrag;
 	uint64_t	ift_map_failed;
 	uint64_t	ift_txd_encap_efbig;
 	uint64_t	ift_pullups;
 	uint64_t	ift_last_timer_tick;
 
 	struct mtx	ift_mtx;
 	struct mtx	ift_db_mtx;
 
 	/* constant values */
 	if_ctx_t	ift_ctx;
 	struct ifmp_ring        *ift_br;
 	struct grouptask	ift_task;
 	qidx_t		ift_size;
 	uint16_t	ift_id;
 	struct callout	ift_timer;
 
 	if_txsd_vec_t	ift_sds;
 	uint8_t		ift_qstatus;
 	uint8_t		ift_closed;
 	uint8_t		ift_update_freq;
 	struct iflib_filter_info ift_filter_info;
 	bus_dma_tag_t	ift_buf_tag;
 	bus_dma_tag_t	ift_tso_buf_tag;
 	iflib_dma_info_t	ift_ifdi;
 #define MTX_NAME_LEN 16
 	char                    ift_mtx_name[MTX_NAME_LEN];
 	char                    ift_db_mtx_name[MTX_NAME_LEN];
 	bus_dma_segment_t	ift_segs[IFLIB_MAX_TX_SEGS]  __aligned(CACHE_LINE_SIZE);
 #ifdef IFLIB_DIAGNOSTICS
 	uint64_t ift_cpu_exec_count[256];
 #endif
 } __aligned(CACHE_LINE_SIZE);
 
 struct iflib_fl {
 	qidx_t		ifl_cidx;
 	qidx_t		ifl_pidx;
 	qidx_t		ifl_credits;
 	uint8_t		ifl_gen;
 	uint8_t		ifl_rxd_size;
 #if MEMORY_LOGGING
 	uint64_t	ifl_m_enqueued;
 	uint64_t	ifl_m_dequeued;
 	uint64_t	ifl_cl_enqueued;
 	uint64_t	ifl_cl_dequeued;
 #endif
 	/* implicit pad */
 
 	bitstr_t 	*ifl_rx_bitmap;
 	qidx_t		ifl_fragidx;
 	/* constant */
 	qidx_t		ifl_size;
 	uint16_t	ifl_buf_size;
 	uint16_t	ifl_cltype;
 	uma_zone_t	ifl_zone;
 	iflib_rxsd_array_t	ifl_sds;
 	iflib_rxq_t	ifl_rxq;
 	uint8_t		ifl_id;
 	bus_dma_tag_t	ifl_buf_tag;
 	iflib_dma_info_t	ifl_ifdi;
 	uint64_t	ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE);
 	caddr_t		ifl_vm_addrs[IFLIB_MAX_RX_REFRESH];
 	qidx_t	ifl_rxd_idxs[IFLIB_MAX_RX_REFRESH];
 }  __aligned(CACHE_LINE_SIZE);
 
 static inline qidx_t
 get_inuse(int size, qidx_t cidx, qidx_t pidx, uint8_t gen)
 {
 	qidx_t used;
 
 	if (pidx > cidx)
 		used = pidx - cidx;
 	else if (pidx < cidx)
 		used = size - cidx + pidx;
 	else if (gen == 0 && pidx == cidx)
 		used = 0;
 	else if (gen == 1 && pidx == cidx)
 		used = size;
 	else
 		panic("bad state");
 
 	return (used);
 }
 
 #define TXQ_AVAIL(txq) (txq->ift_size - get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen))
 
 #define IDXDIFF(head, tail, wrap) \
 	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
 
 struct iflib_rxq {
 	/* If there is a separate completion queue -
 	 * these are the cq cidx and pidx. Otherwise
 	 * these are unused.
 	 */
 	qidx_t		ifr_size;
 	qidx_t		ifr_cq_cidx;
 	qidx_t		ifr_cq_pidx;
 	uint8_t		ifr_cq_gen;
 	uint8_t		ifr_fl_offset;
 
 	if_ctx_t	ifr_ctx;
 	iflib_fl_t	ifr_fl;
 	uint64_t	ifr_rx_irq;
 	uint16_t	ifr_id;
 	uint8_t		ifr_lro_enabled;
 	uint8_t		ifr_nfl;
 	uint8_t		ifr_ntxqirq;
 	uint8_t		ifr_txqid[IFLIB_MAX_TX_SHARED_INTR];
 	struct lro_ctrl			ifr_lc;
 	struct grouptask        ifr_task;
 	struct iflib_filter_info ifr_filter_info;
 	iflib_dma_info_t		ifr_ifdi;
 
 	/* dynamically allocate if any drivers need a value substantially larger than this */
 	struct if_rxd_frag	ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE);
 #ifdef IFLIB_DIAGNOSTICS
 	uint64_t ifr_cpu_exec_count[256];
 #endif
 }  __aligned(CACHE_LINE_SIZE);
 
 typedef struct if_rxsd {
 	caddr_t *ifsd_cl;
 	struct mbuf **ifsd_m;
 	iflib_fl_t ifsd_fl;
 	qidx_t ifsd_cidx;
 } *if_rxsd_t;
 
 /* multiple of word size */
 #ifdef __LP64__
 #define PKT_INFO_SIZE	6
 #define RXD_INFO_SIZE	5
 #define PKT_TYPE uint64_t
 #else
 #define PKT_INFO_SIZE	11
 #define RXD_INFO_SIZE	8
 #define PKT_TYPE uint32_t
 #endif
 #define PKT_LOOP_BOUND  ((PKT_INFO_SIZE/3)*3)
 #define RXD_LOOP_BOUND  ((RXD_INFO_SIZE/4)*4)
 
 typedef struct if_pkt_info_pad {
 	PKT_TYPE pkt_val[PKT_INFO_SIZE];
 } *if_pkt_info_pad_t;
 typedef struct if_rxd_info_pad {
 	PKT_TYPE rxd_val[RXD_INFO_SIZE];
 } *if_rxd_info_pad_t;
 
 CTASSERT(sizeof(struct if_pkt_info_pad) == sizeof(struct if_pkt_info));
 CTASSERT(sizeof(struct if_rxd_info_pad) == sizeof(struct if_rxd_info));
 
 
 static inline void
 pkt_info_zero(if_pkt_info_t pi)
 {
 	if_pkt_info_pad_t pi_pad;
 
 	pi_pad = (if_pkt_info_pad_t)pi;
 	pi_pad->pkt_val[0] = 0; pi_pad->pkt_val[1] = 0; pi_pad->pkt_val[2] = 0;
 	pi_pad->pkt_val[3] = 0; pi_pad->pkt_val[4] = 0; pi_pad->pkt_val[5] = 0;
 #ifndef __LP64__
 	pi_pad->pkt_val[6] = 0; pi_pad->pkt_val[7] = 0; pi_pad->pkt_val[8] = 0;
 	pi_pad->pkt_val[9] = 0; pi_pad->pkt_val[10] = 0;
 #endif	
 }
 
 static device_method_t iflib_pseudo_methods[] = {
 	DEVMETHOD(device_attach, noop_attach),
 	DEVMETHOD(device_detach, iflib_pseudo_detach),
 	DEVMETHOD_END
 };
 
 driver_t iflib_pseudodriver = {
 	"iflib_pseudo", iflib_pseudo_methods, sizeof(struct iflib_ctx),
 };
 
 static inline void
 rxd_info_zero(if_rxd_info_t ri)
 {
 	if_rxd_info_pad_t ri_pad;
 	int i;
 
 	ri_pad = (if_rxd_info_pad_t)ri;
 	for (i = 0; i < RXD_LOOP_BOUND; i += 4) {
 		ri_pad->rxd_val[i] = 0;
 		ri_pad->rxd_val[i+1] = 0;
 		ri_pad->rxd_val[i+2] = 0;
 		ri_pad->rxd_val[i+3] = 0;
 	}
 #ifdef __LP64__
 	ri_pad->rxd_val[RXD_INFO_SIZE-1] = 0;
 #endif
 }
 
 /*
  * Only allow a single packet to take up most 1/nth of the tx ring
  */
 #define MAX_SINGLE_PACKET_FRACTION 12
 #define IF_BAD_DMA (bus_addr_t)-1
 
 #define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING))
 
 #define CTX_LOCK_INIT(_sc)  sx_init(&(_sc)->ifc_ctx_sx, "iflib ctx lock")
 #define CTX_LOCK(ctx) sx_xlock(&(ctx)->ifc_ctx_sx)
 #define CTX_UNLOCK(ctx) sx_xunlock(&(ctx)->ifc_ctx_sx)
 #define CTX_LOCK_DESTROY(ctx) sx_destroy(&(ctx)->ifc_ctx_sx)
 
 
 #define STATE_LOCK_INIT(_sc, _name)  mtx_init(&(_sc)->ifc_state_mtx, _name, "iflib state lock", MTX_DEF)
 #define STATE_LOCK(ctx) mtx_lock(&(ctx)->ifc_state_mtx)
 #define STATE_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_state_mtx)
 #define STATE_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_state_mtx)
 
 
 
 #define CALLOUT_LOCK(txq)	mtx_lock(&txq->ift_mtx)
 #define CALLOUT_UNLOCK(txq) 	mtx_unlock(&txq->ift_mtx)
 
 void
 iflib_set_detach(if_ctx_t ctx)
 {
 	STATE_LOCK(ctx);
 	ctx->ifc_flags |= IFC_IN_DETACH;
 	STATE_UNLOCK(ctx);
 }
 
 /* Our boot-time initialization hook */
 static int	iflib_module_event_handler(module_t, int, void *);
 
 static moduledata_t iflib_moduledata = {
 	"iflib",
 	iflib_module_event_handler,
 	NULL
 };
 
 DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY);
 MODULE_VERSION(iflib, 1);
 
 MODULE_DEPEND(iflib, pci, 1, 1, 1);
 MODULE_DEPEND(iflib, ether, 1, 1, 1);
 
 TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
 TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
 
 #ifndef IFLIB_DEBUG_COUNTERS
 #ifdef INVARIANTS
 #define IFLIB_DEBUG_COUNTERS 1
 #else
 #define IFLIB_DEBUG_COUNTERS 0
 #endif /* !INVARIANTS */
 #endif
 
 static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD, 0,
                    "iflib driver parameters");
 
 /*
  * XXX need to ensure that this can't accidentally cause the head to be moved backwards 
  */
 static int iflib_min_tx_latency = 0;
 SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW,
 		   &iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput");
 static int iflib_no_tx_batch = 0;
 SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW,
 		   &iflib_no_tx_batch, 0, "minimize transmit latency at the possible expense of throughput");
 
 
 #if IFLIB_DEBUG_COUNTERS
 
 static int iflib_tx_seen;
 static int iflib_tx_sent;
 static int iflib_tx_encap;
 static int iflib_rx_allocs;
 static int iflib_fl_refills;
 static int iflib_fl_refills_large;
 static int iflib_tx_frees;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD,
 		   &iflib_tx_seen, 0, "# tx mbufs seen");
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD,
 		   &iflib_tx_sent, 0, "# tx mbufs sent");
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD,
 		   &iflib_tx_encap, 0, "# tx mbufs encapped");
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD,
 		   &iflib_tx_frees, 0, "# tx frees");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD,
 		   &iflib_rx_allocs, 0, "# rx allocations");
 SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD,
 		   &iflib_fl_refills, 0, "# refills");
 SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD,
 		   &iflib_fl_refills_large, 0, "# large refills");
 
 
 static int iflib_txq_drain_flushing;
 static int iflib_txq_drain_oactive;
 static int iflib_txq_drain_notready;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD,
 		   &iflib_txq_drain_flushing, 0, "# drain flushes");
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD,
 		   &iflib_txq_drain_oactive, 0, "# drain oactives");
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD,
 		   &iflib_txq_drain_notready, 0, "# drain notready");
 
 
 static int iflib_encap_load_mbuf_fail;
 static int iflib_encap_pad_mbuf_fail;
 static int iflib_encap_txq_avail_fail;
 static int iflib_encap_txd_encap_fail;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD,
 		   &iflib_encap_load_mbuf_fail, 0, "# busdma load failures");
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_pad_mbuf_fail, CTLFLAG_RD,
 		   &iflib_encap_pad_mbuf_fail, 0, "# runt frame pad failures");
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD,
 		   &iflib_encap_txq_avail_fail, 0, "# txq avail failures");
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD,
 		   &iflib_encap_txd_encap_fail, 0, "# driver encap failures");
 
 static int iflib_task_fn_rxs;
 static int iflib_rx_intr_enables;
 static int iflib_fast_intrs;
 static int iflib_rx_unavail;
 static int iflib_rx_ctx_inactive;
 static int iflib_rx_if_input;
 static int iflib_rx_mbuf_null;
 static int iflib_rxd_flush;
 
 static int iflib_verbose_debug;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD,
 		   &iflib_task_fn_rxs, 0, "# task_fn_rx calls");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD,
 		   &iflib_rx_intr_enables, 0, "# rx intr enables");
 SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD,
 		   &iflib_fast_intrs, 0, "# fast_intr calls");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD,
 		   &iflib_rx_unavail, 0, "# times rxeof called with no available data");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD,
 		   &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD,
 		   &iflib_rx_if_input, 0, "# times rxeof called if_input");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_mbuf_null, CTLFLAG_RD,
 		   &iflib_rx_mbuf_null, 0, "# times rxeof got null mbuf");
 SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD,
 	         &iflib_rxd_flush, 0, "# times rxd_flush called");
 SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
 		   &iflib_verbose_debug, 0, "enable verbose debugging");
 
 #define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1)
 static void
 iflib_debug_reset(void)
 {
 	iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs =
 		iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees =
 		iflib_txq_drain_flushing = iflib_txq_drain_oactive =
 		iflib_txq_drain_notready =
 		iflib_encap_load_mbuf_fail = iflib_encap_pad_mbuf_fail =
 		iflib_encap_txq_avail_fail = iflib_encap_txd_encap_fail =
 		iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs =
 		iflib_rx_unavail =
 		iflib_rx_ctx_inactive = iflib_rx_if_input =
 		iflib_rx_mbuf_null = iflib_rxd_flush = 0;
 }
 
 #else
 #define DBG_COUNTER_INC(name)
 static void iflib_debug_reset(void) {}
 #endif
 
 #define IFLIB_DEBUG 0
 
 static void iflib_tx_structures_free(if_ctx_t ctx);
 static void iflib_rx_structures_free(if_ctx_t ctx);
 static int iflib_queues_alloc(if_ctx_t ctx);
 static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq);
 static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget);
 static int iflib_qset_structures_setup(if_ctx_t ctx);
 static int iflib_msix_init(if_ctx_t ctx);
 static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, const char *str);
 static void iflib_txq_check_drain(iflib_txq_t txq, int budget);
 static uint32_t iflib_txq_can_drain(struct ifmp_ring *);
 #ifdef ALTQ
 static void iflib_altq_if_start(if_t ifp);
 static int iflib_altq_if_transmit(if_t ifp, struct mbuf *m);
 #endif
 static int iflib_register(if_ctx_t);
 static void iflib_init_locked(if_ctx_t ctx);
 static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
 static void iflib_add_device_sysctl_post(if_ctx_t ctx);
 static void iflib_ifmp_purge(iflib_txq_t txq);
 static void _iflib_pre_assert(if_softc_ctx_t scctx);
 static void iflib_if_init_locked(if_ctx_t ctx);
 static void iflib_free_intr_mem(if_ctx_t ctx);
 #ifndef __NO_STRICT_ALIGNMENT
 static struct mbuf * iflib_fixup_rx(struct mbuf *m);
 #endif
 
 NETDUMP_DEFINE(iflib);
 
 #ifdef DEV_NETMAP
 #include <sys/selinfo.h>
 #include <net/netmap.h>
 #include <dev/netmap/netmap_kern.h>
 
 MODULE_DEPEND(iflib, netmap, 1, 1, 1);
 
 static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, uint32_t nm_i, bool init);
 
 /*
  * device-specific sysctl variables:
  *
  * iflib_crcstrip: 0: keep CRC in rx frames (default), 1: strip it.
  *	During regular operations the CRC is stripped, but on some
  *	hardware reception of frames not multiple of 64 is slower,
  *	so using crcstrip=0 helps in benchmarks.
  *
  * iflib_rx_miss, iflib_rx_miss_bufs:
  *	count packets that might be missed due to lost interrupts.
  */
 SYSCTL_DECL(_dev_netmap);
 /*
  * The xl driver by default strips CRCs and we do not override it.
  */
 
 int iflib_crcstrip = 1;
 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip,
     CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on rx frames");
 
 int iflib_rx_miss, iflib_rx_miss_bufs;
 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss,
     CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed rx intr");
 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs,
     CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed rx intr bufs");
 
 /*
  * Register/unregister. We are already under netmap lock.
  * Only called on the first register or the last unregister.
  */
 static int
 iflib_netmap_register(struct netmap_adapter *na, int onoff)
 {
 	struct ifnet *ifp = na->ifp;
 	if_ctx_t ctx = ifp->if_softc;
 	int status;
 
 	CTX_LOCK(ctx);
 	IFDI_INTR_DISABLE(ctx);
 
 	/* Tell the stack that the interface is no longer active */
 	ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
 
 	if (!CTX_IS_VF(ctx))
 		IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip);
 
 	/* enable or disable flags and callbacks in na and ifp */
 	if (onoff) {
 		nm_set_native_flags(na);
 	} else {
 		nm_clear_native_flags(na);
 	}
 	iflib_stop(ctx);
 	iflib_init_locked(ctx);
 	IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ?
 	status = ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1;
 	if (status)
 		nm_clear_native_flags(na);
 	CTX_UNLOCK(ctx);
 	return (status);
 }
 
 static int
 netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, uint32_t nm_i, bool init)
 {
 	struct netmap_adapter *na = kring->na;
 	u_int const lim = kring->nkr_num_slots - 1;
 	u_int head = kring->rhead;
 	struct netmap_ring *ring = kring->ring;
 	bus_dmamap_t *map;
 	struct if_rxd_update iru;
 	if_ctx_t ctx = rxq->ifr_ctx;
 	iflib_fl_t fl = &rxq->ifr_fl[0];
 	uint32_t refill_pidx, nic_i;
 #if IFLIB_DEBUG_COUNTERS
 	int rf_count = 0;
 #endif
 
 	if (nm_i == head && __predict_true(!init))
 		return 0;
 	iru_init(&iru, rxq, 0 /* flid */);
 	map = fl->ifl_sds.ifsd_map;
 	refill_pidx = netmap_idx_k2n(kring, nm_i);
 	/*
 	 * IMPORTANT: we must leave one free slot in the ring,
 	 * so move head back by one unit
 	 */
 	head = nm_prev(head, lim);
 	nic_i = UINT_MAX;
 	DBG_COUNTER_INC(fl_refills);
 	while (nm_i != head) {
 #if IFLIB_DEBUG_COUNTERS
 		if (++rf_count == 9)
 			DBG_COUNTER_INC(fl_refills_large);
 #endif
 		for (int tmp_pidx = 0; tmp_pidx < IFLIB_MAX_RX_REFRESH && nm_i != head; tmp_pidx++) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			void *addr = PNMB(na, slot, &fl->ifl_bus_addrs[tmp_pidx]);
 			uint32_t nic_i_dma = refill_pidx;
 			nic_i = netmap_idx_k2n(kring, nm_i);
 
 			MPASS(tmp_pidx < IFLIB_MAX_RX_REFRESH);
 
 			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
 			        return netmap_ring_reinit(kring);
 
 			fl->ifl_vm_addrs[tmp_pidx] = addr;
 			if (__predict_false(init)) {
 				netmap_load_map(na, fl->ifl_buf_tag,
 				    map[nic_i], addr);
 			} else if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
 				netmap_reload_map(na, fl->ifl_buf_tag,
 				    map[nic_i], addr);
 			}
 			slot->flags &= ~NS_BUF_CHANGED;
 
 			nm_i = nm_next(nm_i, lim);
 			fl->ifl_rxd_idxs[tmp_pidx] = nic_i = nm_next(nic_i, lim);
 			if (nm_i != head && tmp_pidx < IFLIB_MAX_RX_REFRESH-1)
 				continue;
 
 			iru.iru_pidx = refill_pidx;
 			iru.iru_count = tmp_pidx+1;
 			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
 			refill_pidx = nic_i;
 			for (int n = 0; n < iru.iru_count; n++) {
 				bus_dmamap_sync(fl->ifl_buf_tag, map[nic_i_dma],
 						BUS_DMASYNC_PREREAD);
 				/* XXX - change this to not use the netmap func*/
 				nic_i_dma = nm_next(nic_i_dma, lim);
 			}
 		}
 	}
 	kring->nr_hwcur = head;
 
 	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	if (__predict_true(nic_i != UINT_MAX)) {
 		ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i);
 		DBG_COUNTER_INC(rxd_flush);
 	}
 	return (0);
 }
 
 /*
  * Reconcile kernel and user view of the transmit ring.
  *
  * All information is in the kring.
  * Userspace wants to send packets up to the one before kring->rhead,
  * kernel knows kring->nr_hwcur is the first unsent packet.
  *
  * Here we push packets out (as many as possible), and possibly
  * reclaim buffers from previously completed transmission.
  *
  * The caller (netmap) guarantees that there is only one instance
  * running at any time. Any interference with other driver
  * methods should be handled by the individual drivers.
  */
 static int
 iflib_netmap_txsync(struct netmap_kring *kring, int flags)
 {
 	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap kring */
 	u_int nic_i;	/* index into the NIC ring */
 	u_int n;
 	u_int const lim = kring->nkr_num_slots - 1;
 	u_int const head = kring->rhead;
 	struct if_pkt_info pi;
 
 	/*
 	 * interrupts on every tx packet are expensive so request
 	 * them every half ring, or where NS_REPORT is set
 	 */
 	u_int report_frequency = kring->nkr_num_slots >> 1;
 	/* device-specific */
 	if_ctx_t ctx = ifp->if_softc;
 	iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id];
 
 	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
 	/*
 	 * First part: process new packets to send.
 	 * nm_i is the current index in the netmap kring,
 	 * nic_i is the corresponding index in the NIC ring.
 	 *
 	 * If we have packets to send (nm_i != head)
 	 * iterate over the netmap ring, fetch length and update
 	 * the corresponding slot in the NIC ring. Some drivers also
 	 * need to update the buffer's physical address in the NIC slot
 	 * even NS_BUF_CHANGED is not set (PNMB computes the addresses).
 	 *
 	 * The netmap_reload_map() calls is especially expensive,
 	 * even when (as in this case) the tag is 0, so do only
 	 * when the buffer has actually changed.
 	 *
 	 * If possible do not set the report/intr bit on all slots,
 	 * but only a few times per ring or when NS_REPORT is set.
 	 *
 	 * Finally, on 10G and faster drivers, it might be useful
 	 * to prefetch the next slot and txr entry.
 	 */
 
 	nm_i = kring->nr_hwcur;
 	if (nm_i != head) {	/* we have new packets to send */
 		pkt_info_zero(&pi);
 		pi.ipi_segs = txq->ift_segs;
 		pi.ipi_qsidx = kring->ring_id;
 		nic_i = netmap_idx_k2n(kring, nm_i);
 
 		__builtin_prefetch(&ring->slot[nm_i]);
 		__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]);
 		__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]);
 
 		for (n = 0; nm_i != head; n++) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			u_int len = slot->len;
 			uint64_t paddr;
 			void *addr = PNMB(na, slot, &paddr);
 			int flags = (slot->flags & NS_REPORT ||
 				nic_i == 0 || nic_i == report_frequency) ?
 				IPI_TX_INTR : 0;
 
 			/* device-specific */
 			pi.ipi_len = len;
 			pi.ipi_segs[0].ds_addr = paddr;
 			pi.ipi_segs[0].ds_len = len;
 			pi.ipi_nsegs = 1;
 			pi.ipi_ndescs = 0;
 			pi.ipi_pidx = nic_i;
 			pi.ipi_flags = flags;
 
 			/* Fill the slot in the NIC ring. */
 			ctx->isc_txd_encap(ctx->ifc_softc, &pi);
 			DBG_COUNTER_INC(tx_encap);
 
 			/* prefetch for next round */
 			__builtin_prefetch(&ring->slot[nm_i + 1]);
 			__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]);
 			__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]);
 
 			NM_CHECK_ADDR_LEN(na, addr, len);
 
 			if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
 				netmap_reload_map(na, txq->ift_buf_tag,
 				    txq->ift_sds.ifsd_map[nic_i], addr);
 			}
 			/* make sure changes to the buffer are synced */
 			bus_dmamap_sync(txq->ift_buf_tag,
 			    txq->ift_sds.ifsd_map[nic_i],
 			    BUS_DMASYNC_PREWRITE);
 
 			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
 			nm_i = nm_next(nm_i, lim);
 			nic_i = nm_next(nic_i, lim);
 		}
 		kring->nr_hwcur = nm_i;
 
 		/* synchronize the NIC ring */
 		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
 		/* (re)start the tx unit up to slot nic_i (excluded) */
 		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i);
 	}
 
 	/*
 	 * Second part: reclaim buffers for completed transmissions.
 	 *
 	 * If there are unclaimed buffers, attempt to reclaim them.
 	 * If none are reclaimed, and TX IRQs are not in use, do an initial
 	 * minimal delay, then trigger the tx handler which will spin in the
 	 * group task queue.
 	 */
 	if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
 		if (iflib_tx_credits_update(ctx, txq)) {
 			/* some tx completed, increment avail */
 			nic_i = txq->ift_cidx_processed;
 			kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
 		}
 	}
 	if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ))
 		if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
 			callout_reset_on(&txq->ift_timer, hz < 2000 ? 1 : hz / 1000,
 			    iflib_timer, txq, txq->ift_timer.c_cpu);
 	}
 	return (0);
 }
 
 /*
  * Reconcile kernel and user view of the receive ring.
  * Same as for the txsync, this routine must be efficient.
  * The caller guarantees a single invocations, but races against
  * the rest of the driver should be handled here.
  *
  * On call, kring->rhead is the first packet that userspace wants
  * to keep, and kring->rcur is the wakeup point.
  * The kernel has previously reported packets up to kring->rtail.
  *
  * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
  * of whether or not we received an interrupt.
  */
 static int
 iflib_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
 	struct netmap_adapter *na = kring->na;
 	struct netmap_ring *ring = kring->ring;
 	iflib_fl_t fl;
 	uint32_t nm_i;	/* index into the netmap ring */
 	uint32_t nic_i;	/* index into the NIC ring */
 	u_int i, n;
 	u_int const lim = kring->nkr_num_slots - 1;
 	u_int const head = kring->rhead;
 	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
 	struct if_rxd_info ri;
 
 	struct ifnet *ifp = na->ifp;
 	if_ctx_t ctx = ifp->if_softc;
 	iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id];
 	if (head > lim)
 		return netmap_ring_reinit(kring);
 
 	/*
 	 * XXX netmap_fl_refill() only ever (re)fills free list 0 so far.
 	 */
 
 	for (i = 0, fl = rxq->ifr_fl; i < rxq->ifr_nfl; i++, fl++) {
 		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 	}
 
 	/*
 	 * First part: import newly received packets.
 	 *
 	 * nm_i is the index of the next free slot in the netmap ring,
 	 * nic_i is the index of the next received packet in the NIC ring,
 	 * and they may differ in case if_init() has been called while
 	 * in netmap mode. For the receive ring we have
 	 *
 	 *	nic_i = rxr->next_check;
 	 *	nm_i = kring->nr_hwtail (previous)
 	 * and
 	 *	nm_i == (nic_i + kring->nkr_hwofs) % ring_size
 	 *
 	 * rxr->next_check is set to 0 on a ring reinit
 	 */
 	if (netmap_no_pendintr || force_update) {
 		int crclen = iflib_crcstrip ? 0 : 4;
 		int error, avail;
 
 		for (i = 0; i < rxq->ifr_nfl; i++) {
 			fl = &rxq->ifr_fl[i];
 			nic_i = fl->ifl_cidx;
 			nm_i = netmap_idx_n2k(kring, nic_i);
 			avail = ctx->isc_rxd_available(ctx->ifc_softc,
 			    rxq->ifr_id, nic_i, USHRT_MAX);
 			for (n = 0; avail > 0; n++, avail--) {
 				rxd_info_zero(&ri);
 				ri.iri_frags = rxq->ifr_frags;
 				ri.iri_qsidx = kring->ring_id;
 				ri.iri_ifp = ctx->ifc_ifp;
 				ri.iri_cidx = nic_i;
 
 				error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
 				ring->slot[nm_i].len = error ? 0 : ri.iri_len - crclen;
 				ring->slot[nm_i].flags = 0;
 				bus_dmamap_sync(fl->ifl_buf_tag,
 				    fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD);
 				nm_i = nm_next(nm_i, lim);
 				nic_i = nm_next(nic_i, lim);
 			}
 			if (n) { /* update the state variables */
 				if (netmap_no_pendintr && !force_update) {
 					/* diagnostics */
 					iflib_rx_miss ++;
 					iflib_rx_miss_bufs += n;
 				}
 				fl->ifl_cidx = nic_i;
 				kring->nr_hwtail = nm_i;
 			}
 			kring->nr_kflags &= ~NKR_PENDINTR;
 		}
 	}
 	/*
 	 * Second part: skip past packets that userspace has released.
 	 * (kring->nr_hwcur to head excluded),
 	 * and make the buffers available for reception.
 	 * As usual nm_i is the index in the netmap ring,
 	 * nic_i is the index in the NIC ring, and
 	 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size
 	 */
 	/* XXX not sure how this will work with multiple free lists */
 	nm_i = kring->nr_hwcur;
 
 	return (netmap_fl_refill(rxq, kring, nm_i, false));
 }
 
 static void
 iflib_netmap_intr(struct netmap_adapter *na, int onoff)
 {
 	struct ifnet *ifp = na->ifp;
 	if_ctx_t ctx = ifp->if_softc;
 
 	CTX_LOCK(ctx);
 	if (onoff) {
 		IFDI_INTR_ENABLE(ctx);
 	} else {
 		IFDI_INTR_DISABLE(ctx);
 	}
 	CTX_UNLOCK(ctx);
 }
 
 
 static int
 iflib_netmap_attach(if_ctx_t ctx)
 {
 	struct netmap_adapter na;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 
 	bzero(&na, sizeof(na));
 
 	na.ifp = ctx->ifc_ifp;
 	na.na_flags = NAF_BDG_MAYSLEEP;
 	MPASS(ctx->ifc_softc_ctx.isc_ntxqsets);
 	MPASS(ctx->ifc_softc_ctx.isc_nrxqsets);
 
 	na.num_tx_desc = scctx->isc_ntxd[0];
 	na.num_rx_desc = scctx->isc_nrxd[0];
 	na.nm_txsync = iflib_netmap_txsync;
 	na.nm_rxsync = iflib_netmap_rxsync;
 	na.nm_register = iflib_netmap_register;
 	na.nm_intr = iflib_netmap_intr;
 	na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
 	na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
 	return (netmap_attach(&na));
 }
 
 static void
 iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq)
 {
 	struct netmap_adapter *na = NA(ctx->ifc_ifp);
 	struct netmap_slot *slot;
 
 	slot = netmap_reset(na, NR_TX, txq->ift_id, 0);
 	if (slot == NULL)
 		return;
 	for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) {
 
 		/*
 		 * In netmap mode, set the map for the packet buffer.
 		 * NOTE: Some drivers (not this one) also need to set
 		 * the physical buffer address in the NIC ring.
 		 * netmap_idx_n2k() maps a nic index, i, into the corresponding
 		 * netmap slot index, si
 		 */
 		int si = netmap_idx_n2k(na->tx_rings[txq->ift_id], i);
 		netmap_load_map(na, txq->ift_buf_tag, txq->ift_sds.ifsd_map[i],
 		    NMB(na, slot + si));
 	}
 }
 
 static void
 iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq)
 {
 	struct netmap_adapter *na = NA(ctx->ifc_ifp);
 	struct netmap_kring *kring = na->rx_rings[rxq->ifr_id];
 	struct netmap_slot *slot;
 	uint32_t nm_i;
 
 	slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0);
 	if (slot == NULL)
 		return;
 	nm_i = netmap_idx_n2k(kring, 0);
 	netmap_fl_refill(rxq, kring, nm_i, true);
 }
 
 static void
 iflib_netmap_timer_adjust(if_ctx_t ctx, iflib_txq_t txq, uint32_t *reset_on)
 {
 	struct netmap_kring *kring;
 	uint16_t txqid;
 
 	txqid = txq->ift_id;
 	kring = NA(ctx->ifc_ifp)->tx_rings[txqid];
 
 	if (kring->nr_hwcur != nm_next(kring->nr_hwtail, kring->nkr_num_slots - 1)) {
 		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 		    BUS_DMASYNC_POSTREAD);
 		if (ctx->isc_txd_credits_update(ctx->ifc_softc, txqid, false))
 			netmap_tx_irq(ctx->ifc_ifp, txqid);
 		if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ)) {
 			if (hz < 2000)
 				*reset_on = 1;
 			else
 				*reset_on = hz / 1000;
 		}
 	}
 }
 
 #define iflib_netmap_detach(ifp) netmap_detach(ifp)
 
 #else
 #define iflib_netmap_txq_init(ctx, txq)
 #define iflib_netmap_rxq_init(ctx, rxq)
 #define iflib_netmap_detach(ifp)
 
 #define iflib_netmap_attach(ctx) (0)
 #define netmap_rx_irq(ifp, qid, budget) (0)
 #define netmap_tx_irq(ifp, qid) do {} while (0)
 #define iflib_netmap_timer_adjust(ctx, txq, reset_on)
 
 #endif
 
 #if defined(__i386__) || defined(__amd64__)
 static __inline void
 prefetch(void *x)
 {
 	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
 }
 static __inline void
 prefetch2cachelines(void *x)
 {
 	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
 #if (CACHE_LINE_SIZE < 128)
 	__asm volatile("prefetcht0 %0" :: "m" (*(((unsigned long *)x)+CACHE_LINE_SIZE/(sizeof(unsigned long)))));
 #endif
 }
 #else
 #define prefetch(x)
 #define prefetch2cachelines(x)
 #endif
 
 static void
 iflib_gen_mac(if_ctx_t ctx)
 {
 	struct thread *td;
 	MD5_CTX mdctx;
 	char uuid[HOSTUUIDLEN+1];
 	char buf[HOSTUUIDLEN+16];
 	uint8_t *mac;
 	unsigned char digest[16];
 
 	td = curthread;
 	mac = ctx->ifc_mac;
 	uuid[HOSTUUIDLEN] = 0;
 	bcopy(td->td_ucred->cr_prison->pr_hostuuid, uuid, HOSTUUIDLEN);
 	snprintf(buf, HOSTUUIDLEN+16, "%s-%s", uuid, device_get_nameunit(ctx->ifc_dev));
 	/*
 	 * Generate a pseudo-random, deterministic MAC
 	 * address based on the UUID and unit number.
 	 * The FreeBSD Foundation OUI of 58-9C-FC is used.
 	 */
 	MD5Init(&mdctx);
 	MD5Update(&mdctx, buf, strlen(buf));
 	MD5Final(digest, &mdctx);
 
 	mac[0] = 0x58;
 	mac[1] = 0x9C;
 	mac[2] = 0xFC;
 	mac[3] = digest[0];
 	mac[4] = digest[1];
 	mac[5] = digest[2];
 }
 
 static void
 iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid)
 {
 	iflib_fl_t fl;
 
 	fl = &rxq->ifr_fl[flid];
 	iru->iru_paddrs = fl->ifl_bus_addrs;
 	iru->iru_vaddrs = &fl->ifl_vm_addrs[0];
 	iru->iru_idxs = fl->ifl_rxd_idxs;
 	iru->iru_qsidx = rxq->ifr_id;
 	iru->iru_buf_size = fl->ifl_buf_size;
 	iru->iru_flidx = fl->ifl_id;
 }
 
 static void
 _iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
 {
 	if (err)
 		return;
 	*(bus_addr_t *) arg = segs[0].ds_addr;
 }
 
 int
 iflib_dma_alloc_align(if_ctx_t ctx, int size, int align, iflib_dma_info_t dma, int mapflags)
 {
 	int err;
 	device_t dev = ctx->ifc_dev;
 
 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
 				align, 0,		/* alignment, bounds */
 				BUS_SPACE_MAXADDR,	/* lowaddr */
 				BUS_SPACE_MAXADDR,	/* highaddr */
 				NULL, NULL,		/* filter, filterarg */
 				size,			/* maxsize */
 				1,			/* nsegments */
 				size,			/* maxsegsize */
 				BUS_DMA_ALLOCNOW,	/* flags */
 				NULL,			/* lockfunc */
 				NULL,			/* lockarg */
 				&dma->idi_tag);
 	if (err) {
 		device_printf(dev,
 		    "%s: bus_dma_tag_create failed: %d\n",
 		    __func__, err);
 		goto fail_0;
 	}
 
 	err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr,
 	    BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map);
 	if (err) {
 		device_printf(dev,
 		    "%s: bus_dmamem_alloc(%ju) failed: %d\n",
 		    __func__, (uintmax_t)size, err);
 		goto fail_1;
 	}
 
 	dma->idi_paddr = IF_BAD_DMA;
 	err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr,
 	    size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT);
 	if (err || dma->idi_paddr == IF_BAD_DMA) {
 		device_printf(dev,
 		    "%s: bus_dmamap_load failed: %d\n",
 		    __func__, err);
 		goto fail_2;
 	}
 
 	dma->idi_size = size;
 	return (0);
 
 fail_2:
 	bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
 fail_1:
 	bus_dma_tag_destroy(dma->idi_tag);
 fail_0:
 	dma->idi_tag = NULL;
 
 	return (err);
 }
 
 int
 iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 
 	KASSERT(sctx->isc_q_align != 0, ("alignment value not initialized"));
 
 	return (iflib_dma_alloc_align(ctx, size, sctx->isc_q_align, dma, mapflags));
 }
 
 int
 iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count)
 {
 	int i, err;
 	iflib_dma_info_t *dmaiter;
 
 	dmaiter = dmalist;
 	for (i = 0; i < count; i++, dmaiter++) {
 		if ((err = iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0)
 			break;
 	}
 	if (err)
 		iflib_dma_free_multi(dmalist, i);
 	return (err);
 }
 
 void
 iflib_dma_free(iflib_dma_info_t dma)
 {
 	if (dma->idi_tag == NULL)
 		return;
 	if (dma->idi_paddr != IF_BAD_DMA) {
 		bus_dmamap_sync(dma->idi_tag, dma->idi_map,
 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(dma->idi_tag, dma->idi_map);
 		dma->idi_paddr = IF_BAD_DMA;
 	}
 	if (dma->idi_vaddr != NULL) {
 		bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
 		dma->idi_vaddr = NULL;
 	}
 	bus_dma_tag_destroy(dma->idi_tag);
 	dma->idi_tag = NULL;
 }
 
 void
 iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count)
 {
 	int i;
 	iflib_dma_info_t *dmaiter = dmalist;
 
 	for (i = 0; i < count; i++, dmaiter++)
 		iflib_dma_free(*dmaiter);
 }
 
 #ifdef EARLY_AP_STARTUP
 static const int iflib_started = 1;
 #else
 /*
  * We used to abuse the smp_started flag to decide if the queues have been
  * fully initialized (by late taskqgroup_adjust() calls in a SYSINIT()).
  * That gave bad races, since the SYSINIT() runs strictly after smp_started
  * is set.  Run a SYSINIT() strictly after that to just set a usable
  * completion flag.
  */
 
 static int iflib_started;
 
 static void
 iflib_record_started(void *arg)
 {
 	iflib_started = 1;
 }
 
 SYSINIT(iflib_record_started, SI_SUB_SMP + 1, SI_ORDER_FIRST,
 	iflib_record_started, NULL);
 #endif
 
 static int
 iflib_fast_intr(void *arg)
 {
 	iflib_filter_info_t info = arg;
 	struct grouptask *gtask = info->ifi_task;
 	int result;
 
 	if (!iflib_started)
 		return (FILTER_STRAY);
 
 	DBG_COUNTER_INC(fast_intrs);
 	if (info->ifi_filter != NULL) {
 		result = info->ifi_filter(info->ifi_filter_arg);
 		if ((result & FILTER_SCHEDULE_THREAD) == 0)
 			return (result);
 	}
 
 	GROUPTASK_ENQUEUE(gtask);
 	return (FILTER_HANDLED);
 }
 
 static int
 iflib_fast_intr_rxtx(void *arg)
 {
 	iflib_filter_info_t info = arg;
 	struct grouptask *gtask = info->ifi_task;
 	if_ctx_t ctx;
 	iflib_rxq_t rxq = (iflib_rxq_t)info->ifi_ctx;
 	iflib_txq_t txq;
 	void *sc;
 	int i, cidx, result;
 	qidx_t txqid;
 
 	if (!iflib_started)
 		return (FILTER_STRAY);
 
 	DBG_COUNTER_INC(fast_intrs);
 	if (info->ifi_filter != NULL) {
 		result = info->ifi_filter(info->ifi_filter_arg);
 		if ((result & FILTER_SCHEDULE_THREAD) == 0)
 			return (result);
 	}
 
 	ctx = rxq->ifr_ctx;
 	sc = ctx->ifc_softc;
 	MPASS(rxq->ifr_ntxqirq);
 	for (i = 0; i < rxq->ifr_ntxqirq; i++) {
 		txqid = rxq->ifr_txqid[i];
 		txq = &ctx->ifc_txqs[txqid];
 		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 		    BUS_DMASYNC_POSTREAD);
 		if (!ctx->isc_txd_credits_update(sc, txqid, false)) {
 			IFDI_TX_QUEUE_INTR_ENABLE(ctx, txqid);
 			continue;
 		}
 		GROUPTASK_ENQUEUE(&txq->ift_task);
 	}
 	if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_RXCQ)
 		cidx = rxq->ifr_cq_cidx;
 	else
 		cidx = rxq->ifr_fl[0].ifl_cidx;
 	if (iflib_rxd_avail(ctx, rxq, cidx, 1))
 		GROUPTASK_ENQUEUE(gtask);
 	else {
 		IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
 		DBG_COUNTER_INC(rx_intr_enables);
 	}
 	return (FILTER_HANDLED);
 }
 
 
 static int
 iflib_fast_intr_ctx(void *arg)
 {
 	iflib_filter_info_t info = arg;
 	struct grouptask *gtask = info->ifi_task;
 	int result;
 
 	if (!iflib_started)
 		return (FILTER_STRAY);
 
 	DBG_COUNTER_INC(fast_intrs);
 	if (info->ifi_filter != NULL) {
 		result = info->ifi_filter(info->ifi_filter_arg);
 		if ((result & FILTER_SCHEDULE_THREAD) == 0)
 			return (result);
 	}
 
 	GROUPTASK_ENQUEUE(gtask);
 	return (FILTER_HANDLED);
 }
 
 static int
 _iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
 		 driver_filter_t filter, driver_intr_t handler, void *arg,
 		 const char *name)
 {
 	int rc, flags;
 	struct resource *res;
 	void *tag = NULL;
 	device_t dev = ctx->ifc_dev;
 
 	flags = RF_ACTIVE;
 	if (ctx->ifc_flags & IFC_LEGACY)
 		flags |= RF_SHAREABLE;
 	MPASS(rid < 512);
 	irq->ii_rid = rid;
 	res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &irq->ii_rid, flags);
 	if (res == NULL) {
 		device_printf(dev,
 		    "failed to allocate IRQ for rid %d, name %s.\n", rid, name);
 		return (ENOMEM);
 	}
 	irq->ii_res = res;
 	KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL"));
 	rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET,
 						filter, handler, arg, &tag);
 	if (rc != 0) {
 		device_printf(dev,
 		    "failed to setup interrupt for rid %d, name %s: %d\n",
 					  rid, name ? name : "unknown", rc);
 		return (rc);
 	} else if (name)
 		bus_describe_intr(dev, res, tag, "%s", name);
 
 	irq->ii_tag = tag;
 	return (0);
 }
 
 
 /*********************************************************************
  *
  *  Allocate DMA resources for TX buffers as well as memory for the TX
  *  mbuf map.  TX DMA maps (non-TSO/TSO) and TX mbuf map are kept in a
  *  iflib_sw_tx_desc_array structure, storing all the information that
  *  is needed to transmit a packet on the wire.  This is called only
  *  once at attach, setup is done every reset.
  *
  **********************************************************************/
 static int
 iflib_txsd_alloc(iflib_txq_t txq)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	bus_size_t tsomaxsize;
 	int err, nsegments, ntsosegments;
 	bool tso;
 
 	nsegments = scctx->isc_tx_nsegments;
 	ntsosegments = scctx->isc_tx_tso_segments_max;
 	tsomaxsize = scctx->isc_tx_tso_size_max;
 	if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_VLAN_MTU)
 		tsomaxsize += sizeof(struct ether_vlan_header);
 	MPASS(scctx->isc_ntxd[0] > 0);
 	MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0);
 	MPASS(nsegments > 0);
 	if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) {
 		MPASS(ntsosegments > 0);
 		MPASS(sctx->isc_tso_maxsize >= tsomaxsize);
 	}
 
 	/*
 	 * Set up DMA tags for TX buffers.
 	 */
 	if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
 			       1, 0,			/* alignment, bounds */
 			       BUS_SPACE_MAXADDR,	/* lowaddr */
 			       BUS_SPACE_MAXADDR,	/* highaddr */
 			       NULL, NULL,		/* filter, filterarg */
 			       sctx->isc_tx_maxsize,		/* maxsize */
 			       nsegments,	/* nsegments */
 			       sctx->isc_tx_maxsegsize,	/* maxsegsize */
 			       0,			/* flags */
 			       NULL,			/* lockfunc */
 			       NULL,			/* lockfuncarg */
 			       &txq->ift_buf_tag))) {
 		device_printf(dev,"Unable to allocate TX DMA tag: %d\n", err);
 		device_printf(dev,"maxsize: %ju nsegments: %d maxsegsize: %ju\n",
 		    (uintmax_t)sctx->isc_tx_maxsize, nsegments, (uintmax_t)sctx->isc_tx_maxsegsize);
 		goto fail;
 	}
 	tso = (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) != 0;
 	if (tso && (err = bus_dma_tag_create(bus_get_dma_tag(dev),
 			       1, 0,			/* alignment, bounds */
 			       BUS_SPACE_MAXADDR,	/* lowaddr */
 			       BUS_SPACE_MAXADDR,	/* highaddr */
 			       NULL, NULL,		/* filter, filterarg */
 			       tsomaxsize,		/* maxsize */
 			       ntsosegments,	/* nsegments */
 			       sctx->isc_tso_maxsegsize,/* maxsegsize */
 			       0,			/* flags */
 			       NULL,			/* lockfunc */
 			       NULL,			/* lockfuncarg */
 			       &txq->ift_tso_buf_tag))) {
 		device_printf(dev, "Unable to allocate TSO TX DMA tag: %d\n",
 		    err);
 		goto fail;
 	}
 
 	/* Allocate memory for the TX mbuf map. */
 	if (!(txq->ift_sds.ifsd_m =
 	    (struct mbuf **) malloc(sizeof(struct mbuf *) *
 	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate TX mbuf map memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 
 	/*
 	 * Create the DMA maps for TX buffers.
 	 */
 	if ((txq->ift_sds.ifsd_map = (bus_dmamap_t *)malloc(
 	    sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
 	    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
 		device_printf(dev,
 		    "Unable to allocate TX buffer DMA map memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 	if (tso && (txq->ift_sds.ifsd_tso_map = (bus_dmamap_t *)malloc(
 	    sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
 	    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
 		device_printf(dev,
 		    "Unable to allocate TSO TX buffer map memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 	for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) {
 		err = bus_dmamap_create(txq->ift_buf_tag, 0,
 		    &txq->ift_sds.ifsd_map[i]);
 		if (err != 0) {
 			device_printf(dev, "Unable to create TX DMA map\n");
 			goto fail;
 		}
 		if (!tso)
 			continue;
 		err = bus_dmamap_create(txq->ift_tso_buf_tag, 0,
 		    &txq->ift_sds.ifsd_tso_map[i]);
 		if (err != 0) {
 			device_printf(dev, "Unable to create TSO TX DMA map\n");
 			goto fail;
 		}
 	}
 	return (0);
 fail:
 	/* We free all, it handles case where we are in the middle */
 	iflib_tx_structures_free(ctx);
 	return (err);
 }
 
 static void
 iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i)
 {
 	bus_dmamap_t map;
 
 	map = NULL;
 	if (txq->ift_sds.ifsd_map != NULL)
 		map = txq->ift_sds.ifsd_map[i];
 	if (map != NULL) {
 		bus_dmamap_sync(txq->ift_buf_tag, map, BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(txq->ift_buf_tag, map);
 		bus_dmamap_destroy(txq->ift_buf_tag, map);
 		txq->ift_sds.ifsd_map[i] = NULL;
 	}
 
 	map = NULL;
 	if (txq->ift_sds.ifsd_tso_map != NULL)
 		map = txq->ift_sds.ifsd_tso_map[i];
 	if (map != NULL) {
 		bus_dmamap_sync(txq->ift_tso_buf_tag, map,
 		    BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(txq->ift_tso_buf_tag, map);
 		bus_dmamap_destroy(txq->ift_tso_buf_tag, map);
 		txq->ift_sds.ifsd_tso_map[i] = NULL;
 	}
 }
 
 static void
 iflib_txq_destroy(iflib_txq_t txq)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 
 	for (int i = 0; i < txq->ift_size; i++)
 		iflib_txsd_destroy(ctx, txq, i);
 	if (txq->ift_sds.ifsd_map != NULL) {
 		free(txq->ift_sds.ifsd_map, M_IFLIB);
 		txq->ift_sds.ifsd_map = NULL;
 	}
 	if (txq->ift_sds.ifsd_tso_map != NULL) {
 		free(txq->ift_sds.ifsd_tso_map, M_IFLIB);
 		txq->ift_sds.ifsd_tso_map = NULL;
 	}
 	if (txq->ift_sds.ifsd_m != NULL) {
 		free(txq->ift_sds.ifsd_m, M_IFLIB);
 		txq->ift_sds.ifsd_m = NULL;
 	}
 	if (txq->ift_buf_tag != NULL) {
 		bus_dma_tag_destroy(txq->ift_buf_tag);
 		txq->ift_buf_tag = NULL;
 	}
 	if (txq->ift_tso_buf_tag != NULL) {
 		bus_dma_tag_destroy(txq->ift_tso_buf_tag);
 		txq->ift_tso_buf_tag = NULL;
 	}
 }
 
 static void
 iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i)
 {
 	struct mbuf **mp;
 
 	mp = &txq->ift_sds.ifsd_m[i];
 	if (*mp == NULL)
 		return;
 
 	if (txq->ift_sds.ifsd_map != NULL) {
 		bus_dmamap_sync(txq->ift_buf_tag,
 		    txq->ift_sds.ifsd_map[i], BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[i]);
 	}
 	if (txq->ift_sds.ifsd_tso_map != NULL) {
 		bus_dmamap_sync(txq->ift_tso_buf_tag,
 		    txq->ift_sds.ifsd_tso_map[i], BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(txq->ift_tso_buf_tag,
 		    txq->ift_sds.ifsd_tso_map[i]);
 	}
 	m_free(*mp);
 	DBG_COUNTER_INC(tx_frees);
 	*mp = NULL;
 }
 
 static int
 iflib_txq_setup(iflib_txq_t txq)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	iflib_dma_info_t di;
 	int i;
 
 	/* Set number of descriptors available */
 	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 	/* XXX make configurable */
 	txq->ift_update_freq = IFLIB_DEFAULT_TX_UPDATE_FREQ;
 
 	/* Reset indices */
 	txq->ift_cidx_processed = 0;
 	txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0;
 	txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset];
 
 	for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++)
 		bzero((void *)di->idi_vaddr, di->idi_size);
 
 	IFDI_TXQ_SETUP(ctx, txq->ift_id);
 	for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++)
 		bus_dmamap_sync(di->idi_tag, di->idi_map,
 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	return (0);
 }
 
 /*********************************************************************
  *
  *  Allocate DMA resources for RX buffers as well as memory for the RX
  *  mbuf map, direct RX cluster pointer map and RX cluster bus address
  *  map.  RX DMA map, RX mbuf map, direct RX cluster pointer map and
  *  RX cluster map are kept in a iflib_sw_rx_desc_array structure.
  *  Since we use use one entry in iflib_sw_rx_desc_array per received
  *  packet, the maximum number of entries we'll need is equal to the
  *  number of hardware receive descriptors that we've allocated.
  *
  **********************************************************************/
 static int
 iflib_rxsd_alloc(iflib_rxq_t rxq)
 {
 	if_ctx_t ctx = rxq->ifr_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	iflib_fl_t fl;
 	int			err;
 
 	MPASS(scctx->isc_nrxd[0] > 0);
 	MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0);
 
 	fl = rxq->ifr_fl;
 	for (int i = 0; i <  rxq->ifr_nfl; i++, fl++) {
 		fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */
 		/* Set up DMA tag for RX buffers. */
 		err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
 					 1, 0,			/* alignment, bounds */
 					 BUS_SPACE_MAXADDR,	/* lowaddr */
 					 BUS_SPACE_MAXADDR,	/* highaddr */
 					 NULL, NULL,		/* filter, filterarg */
 					 sctx->isc_rx_maxsize,	/* maxsize */
 					 sctx->isc_rx_nsegments,	/* nsegments */
 					 sctx->isc_rx_maxsegsize,	/* maxsegsize */
 					 0,			/* flags */
 					 NULL,			/* lockfunc */
 					 NULL,			/* lockarg */
 					 &fl->ifl_buf_tag);
 		if (err) {
 			device_printf(dev,
 			    "Unable to allocate RX DMA tag: %d\n", err);
 			goto fail;
 		}
 
 		/* Allocate memory for the RX mbuf map. */
 		if (!(fl->ifl_sds.ifsd_m =
 		      (struct mbuf **) malloc(sizeof(struct mbuf *) *
 					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 			device_printf(dev,
 			    "Unable to allocate RX mbuf map memory\n");
 			err = ENOMEM;
 			goto fail;
 		}
 
 		/* Allocate memory for the direct RX cluster pointer map. */
 		if (!(fl->ifl_sds.ifsd_cl =
 		      (caddr_t *) malloc(sizeof(caddr_t) *
 					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 			device_printf(dev,
 			    "Unable to allocate RX cluster map memory\n");
 			err = ENOMEM;
 			goto fail;
 		}
 
 		/* Allocate memory for the RX cluster bus address map. */
 		if (!(fl->ifl_sds.ifsd_ba =
 		      (bus_addr_t *) malloc(sizeof(bus_addr_t) *
 					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 			device_printf(dev,
 			    "Unable to allocate RX bus address map memory\n");
 			err = ENOMEM;
 			goto fail;
 		}
 
 		/*
 		 * Create the DMA maps for RX buffers.
 		 */
 		if (!(fl->ifl_sds.ifsd_map =
 		      (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 			device_printf(dev,
 			    "Unable to allocate RX buffer DMA map memory\n");
 			err = ENOMEM;
 			goto fail;
 		}
 		for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++) {
 			err = bus_dmamap_create(fl->ifl_buf_tag, 0,
 			    &fl->ifl_sds.ifsd_map[i]);
 			if (err != 0) {
 				device_printf(dev, "Unable to create RX buffer DMA map\n");
 				goto fail;
 			}
 		}
 	}
 	return (0);
 
 fail:
 	iflib_rx_structures_free(ctx);
 	return (err);
 }
 
 
 /*
  * Internal service routines
  */
 
 struct rxq_refill_cb_arg {
 	int               error;
 	bus_dma_segment_t seg;
 	int               nseg;
 };
 
 static void
 _rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 {
 	struct rxq_refill_cb_arg *cb_arg = arg;
 
 	cb_arg->error = error;
 	cb_arg->seg = segs[0];
 	cb_arg->nseg = nseg;
 }
 
 /**
  *	rxq_refill - refill an rxq  free-buffer list
  *	@ctx: the iflib context
  *	@rxq: the free-list to refill
  *	@n: the number of new buffers to allocate
  *
  *	(Re)populate an rxq free-buffer list with up to @n new packet buffers.
  *	The caller must assure that @n does not exceed the queue's capacity.
  */
 static void
 _iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count)
 {
 	struct if_rxd_update iru;
 	struct rxq_refill_cb_arg cb_arg;
 	struct mbuf *m;
 	caddr_t cl, *sd_cl;
 	struct mbuf **sd_m;
 	bus_dmamap_t *sd_map;
 	bus_addr_t bus_addr, *sd_ba;
 	int err, frag_idx, i, idx, n, pidx;
 	qidx_t credits;
 
 	sd_m = fl->ifl_sds.ifsd_m;
 	sd_map = fl->ifl_sds.ifsd_map;
 	sd_cl = fl->ifl_sds.ifsd_cl;
 	sd_ba = fl->ifl_sds.ifsd_ba;
 	pidx = fl->ifl_pidx;
 	idx = pidx;
 	frag_idx = fl->ifl_fragidx;
 	credits = fl->ifl_credits;
 
 	i = 0;
 	n = count;
 	MPASS(n > 0);
 	MPASS(credits + n <= fl->ifl_size);
 
 	if (pidx < fl->ifl_cidx)
 		MPASS(pidx + n <= fl->ifl_cidx);
 	if (pidx == fl->ifl_cidx && (credits < fl->ifl_size))
 		MPASS(fl->ifl_gen == 0);
 	if (pidx > fl->ifl_cidx)
 		MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx);
 
 	DBG_COUNTER_INC(fl_refills);
 	if (n > 8)
 		DBG_COUNTER_INC(fl_refills_large);
 	iru_init(&iru, fl->ifl_rxq, fl->ifl_id);
 	while (n--) {
 		/*
 		 * We allocate an uninitialized mbuf + cluster, mbuf is
 		 * initialized after rx.
 		 *
 		 * If the cluster is still set then we know a minimum sized packet was received
 		 */
 		bit_ffc_at(fl->ifl_rx_bitmap, frag_idx, fl->ifl_size,
 		    &frag_idx);
 		if (frag_idx < 0)
 			bit_ffc(fl->ifl_rx_bitmap, fl->ifl_size, &frag_idx);
 		MPASS(frag_idx >= 0);
 		if ((cl = sd_cl[frag_idx]) == NULL) {
 			if ((cl = m_cljget(NULL, M_NOWAIT, fl->ifl_buf_size)) == NULL)
 				break;
 
 			cb_arg.error = 0;
 			MPASS(sd_map != NULL);
 			err = bus_dmamap_load(fl->ifl_buf_tag, sd_map[frag_idx],
 			    cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg,
 			    BUS_DMA_NOWAIT);
 			if (err != 0 || cb_arg.error) {
 				/*
 				 * !zone_pack ?
 				 */
 				if (fl->ifl_zone == zone_pack)
 					uma_zfree(fl->ifl_zone, cl);
 				break;
 			}
 
 			sd_ba[frag_idx] =  bus_addr = cb_arg.seg.ds_addr;
 			sd_cl[frag_idx] = cl;
 #if MEMORY_LOGGING
 			fl->ifl_cl_enqueued++;
 #endif
 		} else {
 			bus_addr = sd_ba[frag_idx];
 		}
 		bus_dmamap_sync(fl->ifl_buf_tag, sd_map[frag_idx],
 		    BUS_DMASYNC_PREREAD);
 
 		MPASS(sd_m[frag_idx] == NULL);
 		if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
 			break;
 		}
 		sd_m[frag_idx] = m;
 		bit_set(fl->ifl_rx_bitmap, frag_idx);
 #if MEMORY_LOGGING
 		fl->ifl_m_enqueued++;
 #endif
 
 		DBG_COUNTER_INC(rx_allocs);
 		fl->ifl_rxd_idxs[i] = frag_idx;
 		fl->ifl_bus_addrs[i] = bus_addr;
 		fl->ifl_vm_addrs[i] = cl;
 		credits++;
 		i++;
 		MPASS(credits <= fl->ifl_size);
 		if (++idx == fl->ifl_size) {
 			fl->ifl_gen = 1;
 			idx = 0;
 		}
 		if (n == 0 || i == IFLIB_MAX_RX_REFRESH) {
 			iru.iru_pidx = pidx;
 			iru.iru_count = i;
 			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
 			i = 0;
 			pidx = idx;
 			fl->ifl_pidx = idx;
 			fl->ifl_credits = credits;
 		}
 	}
 
 	if (i) {
 		iru.iru_pidx = pidx;
 		iru.iru_count = i;
 		ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
 		fl->ifl_pidx = idx;
 		fl->ifl_credits = credits;
 	}
 	DBG_COUNTER_INC(rxd_flush);
 	if (fl->ifl_pidx == 0)
 		pidx = fl->ifl_size - 1;
 	else
 		pidx = fl->ifl_pidx - 1;
 
 	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx);
 	fl->ifl_fragidx = frag_idx;
 }
 
 static __inline void
 __iflib_fl_refill_lt(if_ctx_t ctx, iflib_fl_t fl, int max)
 {
 	/* we avoid allowing pidx to catch up with cidx as it confuses ixl */
 	int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1;
 #ifdef INVARIANTS
 	int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1;
 #endif
 
 	MPASS(fl->ifl_credits <= fl->ifl_size);
 	MPASS(reclaimable == delta);
 
 	if (reclaimable > 0)
 		_iflib_fl_refill(ctx, fl, min(max, reclaimable));
 }
 
 uint8_t
 iflib_in_detach(if_ctx_t ctx)
 {
 	bool in_detach;
 	STATE_LOCK(ctx);
 	in_detach = !!(ctx->ifc_flags & IFC_IN_DETACH);
 	STATE_UNLOCK(ctx);
 	return (in_detach);
 }
 
 static void
 iflib_fl_bufs_free(iflib_fl_t fl)
 {
 	iflib_dma_info_t idi = fl->ifl_ifdi;
 	bus_dmamap_t sd_map;
 	uint32_t i;
 
 	for (i = 0; i < fl->ifl_size; i++) {
 		struct mbuf **sd_m = &fl->ifl_sds.ifsd_m[i];
 		caddr_t *sd_cl = &fl->ifl_sds.ifsd_cl[i];
 
 		if (*sd_cl != NULL) {
 			sd_map = fl->ifl_sds.ifsd_map[i];
 			bus_dmamap_sync(fl->ifl_buf_tag, sd_map,
 			    BUS_DMASYNC_POSTREAD);
 			bus_dmamap_unload(fl->ifl_buf_tag, sd_map);
 			if (*sd_cl != NULL)
 				uma_zfree(fl->ifl_zone, *sd_cl);
 			// XXX: Should this get moved out?
 			if (iflib_in_detach(fl->ifl_rxq->ifr_ctx))
 				bus_dmamap_destroy(fl->ifl_buf_tag, sd_map);
 			if (*sd_m != NULL) {
 				m_init(*sd_m, M_NOWAIT, MT_DATA, 0);
 				uma_zfree(zone_mbuf, *sd_m);
 			}
 		} else {
 			MPASS(*sd_cl == NULL);
 			MPASS(*sd_m == NULL);
 		}
 #if MEMORY_LOGGING
 		fl->ifl_m_dequeued++;
 		fl->ifl_cl_dequeued++;
 #endif
 		*sd_cl = NULL;
 		*sd_m = NULL;
 	}
 #ifdef INVARIANTS
 	for (i = 0; i < fl->ifl_size; i++) {
 		MPASS(fl->ifl_sds.ifsd_cl[i] == NULL);
 		MPASS(fl->ifl_sds.ifsd_m[i] == NULL);
 	}
 #endif
 	/*
 	 * Reset free list values
 	 */
 	fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = fl->ifl_fragidx = 0;
 	bzero(idi->idi_vaddr, idi->idi_size);
 }
 
 /*********************************************************************
  *
  *  Initialize a receive ring and its buffers.
  *
  **********************************************************************/
 static int
 iflib_fl_setup(iflib_fl_t fl)
 {
 	iflib_rxq_t rxq = fl->ifl_rxq;
 	if_ctx_t ctx = rxq->ifr_ctx;
 
 	bit_nclear(fl->ifl_rx_bitmap, 0, fl->ifl_size - 1);
 	/*
 	** Free current RX buffer structs and their mbufs
 	*/
 	iflib_fl_bufs_free(fl);
 	/* Now replenish the mbufs */
 	MPASS(fl->ifl_credits == 0);
 	fl->ifl_buf_size = ctx->ifc_rx_mbuf_sz;
 	if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size)
 		ctx->ifc_max_fl_buf_size = fl->ifl_buf_size;
 	fl->ifl_cltype = m_gettype(fl->ifl_buf_size);
 	fl->ifl_zone = m_getzone(fl->ifl_buf_size);
 
 
 	/* avoid pre-allocating zillions of clusters to an idle card
 	 * potentially speeding up attach
 	 */
 	_iflib_fl_refill(ctx, fl, min(128, fl->ifl_size));
 	MPASS(min(128, fl->ifl_size) == fl->ifl_credits);
 	if (min(128, fl->ifl_size) != fl->ifl_credits)
 		return (ENOBUFS);
 	/*
 	 * handle failure
 	 */
 	MPASS(rxq != NULL);
 	MPASS(fl->ifl_ifdi != NULL);
 	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	return (0);
 }
 
 /*********************************************************************
  *
  *  Free receive ring data structures
  *
  **********************************************************************/
 static void
 iflib_rx_sds_free(iflib_rxq_t rxq)
 {
 	iflib_fl_t fl;
 	int i, j;
 
 	if (rxq->ifr_fl != NULL) {
 		for (i = 0; i < rxq->ifr_nfl; i++) {
 			fl = &rxq->ifr_fl[i];
 			if (fl->ifl_buf_tag != NULL) {
 				if (fl->ifl_sds.ifsd_map != NULL) {
 					for (j = 0; j < fl->ifl_size; j++) {
 						if (fl->ifl_sds.ifsd_map[j] ==
 						    NULL)
 							continue;
 						bus_dmamap_sync(
 						    fl->ifl_buf_tag,
 						    fl->ifl_sds.ifsd_map[j],
 						    BUS_DMASYNC_POSTREAD);
 						bus_dmamap_unload(
 						    fl->ifl_buf_tag,
 						    fl->ifl_sds.ifsd_map[j]);
 					}
 				}
 				bus_dma_tag_destroy(fl->ifl_buf_tag);
 				fl->ifl_buf_tag = NULL;
 			}
 			free(fl->ifl_sds.ifsd_m, M_IFLIB);
 			free(fl->ifl_sds.ifsd_cl, M_IFLIB);
 			free(fl->ifl_sds.ifsd_ba, M_IFLIB);
 			free(fl->ifl_sds.ifsd_map, M_IFLIB);
 			fl->ifl_sds.ifsd_m = NULL;
 			fl->ifl_sds.ifsd_cl = NULL;
 			fl->ifl_sds.ifsd_ba = NULL;
 			fl->ifl_sds.ifsd_map = NULL;
 		}
 		free(rxq->ifr_fl, M_IFLIB);
 		rxq->ifr_fl = NULL;
 		rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0;
 	}
 }
 
 /*
  * MI independent logic
  *
  */
 static void
 iflib_timer(void *arg)
 {
 	iflib_txq_t txq = arg;
 	if_ctx_t ctx = txq->ift_ctx;
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 	uint64_t this_tick = ticks;
 	uint32_t reset_on = hz / 2;
 
 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
 		return;
 	/*
 	** Check on the state of the TX queue(s), this
 	** can be done without the lock because its RO
 	** and the HUNG state will be static if set.
 	*/
 	if (this_tick - txq->ift_last_timer_tick >= hz / 2) {
 		txq->ift_last_timer_tick = this_tick;
 		IFDI_TIMER(ctx, txq->ift_id);
 		if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) &&
 		    ((txq->ift_cleaned_prev == txq->ift_cleaned) ||
 		     (sctx->isc_pause_frames == 0)))
 			goto hung;
 
 		if (ifmp_ring_is_stalled(txq->ift_br))
 			txq->ift_qstatus = IFLIB_QUEUE_HUNG;
 		txq->ift_cleaned_prev = txq->ift_cleaned;
 	}
 #ifdef DEV_NETMAP
 	if (if_getcapenable(ctx->ifc_ifp) & IFCAP_NETMAP)
 		iflib_netmap_timer_adjust(ctx, txq, &reset_on);
 #endif
 	/* handle any laggards */
 	if (txq->ift_db_pending)
 		GROUPTASK_ENQUEUE(&txq->ift_task);
 
 	sctx->isc_pause_frames = 0;
 	if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) 
 		callout_reset_on(&txq->ift_timer, reset_on, iflib_timer, txq, txq->ift_timer.c_cpu);
 	return;
  hung:
 	device_printf(ctx->ifc_dev,  "TX(%d) desc avail = %d, pidx = %d\n",
 				  txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx);
 	STATE_LOCK(ctx);
 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 	ctx->ifc_flags |= (IFC_DO_WATCHDOG|IFC_DO_RESET);
 	iflib_admin_intr_deferred(ctx);
 	STATE_UNLOCK(ctx);
 }
 
 static void
 iflib_calc_rx_mbuf_sz(if_ctx_t ctx)
 {
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 
 	/*
 	 * XXX don't set the max_frame_size to larger
 	 * than the hardware can handle
 	 */
 	if (sctx->isc_max_frame_size <= MCLBYTES)
 		ctx->ifc_rx_mbuf_sz = MCLBYTES;
 	else
 		ctx->ifc_rx_mbuf_sz = MJUMPAGESIZE;
 }
 
 uint32_t
 iflib_get_rx_mbuf_sz(if_ctx_t ctx)
 {
 	return (ctx->ifc_rx_mbuf_sz);
 }
 
 static void
 iflib_init_locked(if_ctx_t ctx)
 {
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	if_t ifp = ctx->ifc_ifp;
 	iflib_fl_t fl;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	int i, j, tx_ip_csum_flags, tx_ip6_csum_flags;
 
 
 	if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 	IFDI_INTR_DISABLE(ctx);
 
 	tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP);
 	tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP);
 	/* Set hardware offload abilities */
 	if_clearhwassist(ifp);
 	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
 		if_sethwassistbits(ifp, tx_ip_csum_flags, 0);
 	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
 		if_sethwassistbits(ifp,  tx_ip6_csum_flags, 0);
 	if (if_getcapenable(ifp) & IFCAP_TSO4)
 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
 	if (if_getcapenable(ifp) & IFCAP_TSO6)
 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
 
 	for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) {
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 		CALLOUT_UNLOCK(txq);
 		iflib_netmap_txq_init(ctx, txq);
 	}
 
 	/*
 	 * Calculate a suitable Rx mbuf size prior to calling IFDI_INIT, so
 	 * that drivers can use the value when setting up the hardware receive
 	 * buffers.
 	 */
 	iflib_calc_rx_mbuf_sz(ctx);
 
 #ifdef INVARIANTS
 	i = if_getdrvflags(ifp);
 #endif
 	IFDI_INIT(ctx);
 	MPASS(if_getdrvflags(ifp) == i);
 	for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) {
 		/* XXX this should really be done on a per-queue basis */
 		if (if_getcapenable(ifp) & IFCAP_NETMAP) {
 			MPASS(rxq->ifr_id == i);
 			iflib_netmap_rxq_init(ctx, rxq);
 			continue;
 		}
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
 			if (iflib_fl_setup(fl)) {
 				device_printf(ctx->ifc_dev, "freelist setup failed - check cluster settings\n");
 				goto done;
 			}
 		}
 	}
 done:
 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
 	IFDI_INTR_ENABLE(ctx);
 	txq = ctx->ifc_txqs;
 	for (i = 0; i < sctx->isc_ntxqsets; i++, txq++)
 		callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq,
 			txq->ift_timer.c_cpu);
 }
 
 static int
 iflib_media_change(if_t ifp)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	int err;
 
 	CTX_LOCK(ctx);
 	if ((err = IFDI_MEDIA_CHANGE(ctx)) == 0)
 		iflib_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 	return (err);
 }
 
 static void
 iflib_media_status(if_t ifp, struct ifmediareq *ifmr)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 
 	CTX_LOCK(ctx);
 	IFDI_UPDATE_ADMIN_STATUS(ctx);
 	IFDI_MEDIA_STATUS(ctx, ifmr);
 	CTX_UNLOCK(ctx);
 }
 
 void
 iflib_stop(if_ctx_t ctx)
 {
 	iflib_txq_t txq = ctx->ifc_txqs;
 	iflib_rxq_t rxq = ctx->ifc_rxqs;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	iflib_dma_info_t di;
 	iflib_fl_t fl;
 	int i, j;
 
 	/* Tell the stack that the interface is no longer active */
 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 
 	IFDI_INTR_DISABLE(ctx);
 	DELAY(1000);
 	IFDI_STOP(ctx);
 	DELAY(1000);
 
 	iflib_debug_reset();
 	/* Wait for current tx queue users to exit to disarm watchdog timer. */
 	for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) {
 		/* make sure all transmitters have completed before proceeding XXX */
 
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 		CALLOUT_UNLOCK(txq);
 
 		/* clean any enqueued buffers */
 		iflib_ifmp_purge(txq);
 		/* Free any existing tx buffers. */
 		for (j = 0; j < txq->ift_size; j++) {
 			iflib_txsd_free(ctx, txq, j);
 		}
 		txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0;
 		txq->ift_in_use = txq->ift_gen = txq->ift_cidx = txq->ift_pidx = txq->ift_no_desc_avail = 0;
 		txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0;
 		txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0;
 		txq->ift_pullups = 0;
 		ifmp_ring_reset_stats(txq->ift_br);
 		for (j = 0, di = txq->ift_ifdi; j < sctx->isc_ntxqs; j++, di++)
 			bzero((void *)di->idi_vaddr, di->idi_size);
 	}
 	for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) {
 		/* make sure all transmitters have completed before proceeding XXX */
 
 		rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0;
 		for (j = 0, di = rxq->ifr_ifdi; j < sctx->isc_nrxqs; j++, di++)
 			bzero((void *)di->idi_vaddr, di->idi_size);
 		/* also resets the free lists pidx/cidx */
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
 			iflib_fl_bufs_free(fl);
 	}
 }
 
 static inline caddr_t
 calc_next_rxd(iflib_fl_t fl, int cidx)
 {
 	qidx_t size;
 	int nrxd;
 	caddr_t start, end, cur, next;
 
 	nrxd = fl->ifl_size;
 	size = fl->ifl_rxd_size;
 	start = fl->ifl_ifdi->idi_vaddr;
 
 	if (__predict_false(size == 0))
 		return (start);
 	cur = start + size*cidx;
 	end = start + size*nrxd;
 	next = CACHE_PTR_NEXT(cur);
 	return (next < end ? next : start);
 }
 
 static inline void
 prefetch_pkts(iflib_fl_t fl, int cidx)
 {
 	int nextptr;
 	int nrxd = fl->ifl_size;
 	caddr_t next_rxd;
 
 
 	nextptr = (cidx + CACHE_PTR_INCREMENT) & (nrxd-1);
 	prefetch(&fl->ifl_sds.ifsd_m[nextptr]);
 	prefetch(&fl->ifl_sds.ifsd_cl[nextptr]);
 	next_rxd = calc_next_rxd(fl, cidx);
 	prefetch(next_rxd);
 	prefetch(fl->ifl_sds.ifsd_m[(cidx + 1) & (nrxd-1)]);
 	prefetch(fl->ifl_sds.ifsd_m[(cidx + 2) & (nrxd-1)]);
 	prefetch(fl->ifl_sds.ifsd_m[(cidx + 3) & (nrxd-1)]);
 	prefetch(fl->ifl_sds.ifsd_m[(cidx + 4) & (nrxd-1)]);
 	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 1) & (nrxd-1)]);
 	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 2) & (nrxd-1)]);
 	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 3) & (nrxd-1)]);
 	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]);
 }
 
 static void
 rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int unload, if_rxsd_t sd)
 {
 	int flid, cidx;
 	bus_dmamap_t map;
 	iflib_fl_t fl;
 	int next;
 
 	map = NULL;
 	flid = irf->irf_flid;
 	cidx = irf->irf_idx;
 	fl = &rxq->ifr_fl[flid];
 	sd->ifsd_fl = fl;
 	sd->ifsd_cidx = cidx;
 	sd->ifsd_m = &fl->ifl_sds.ifsd_m[cidx];
 	sd->ifsd_cl = &fl->ifl_sds.ifsd_cl[cidx];
 	fl->ifl_credits--;
 #if MEMORY_LOGGING
 	fl->ifl_m_dequeued++;
 #endif
 	if (rxq->ifr_ctx->ifc_flags & IFC_PREFETCH)
 		prefetch_pkts(fl, cidx);
 	next = (cidx + CACHE_PTR_INCREMENT) & (fl->ifl_size-1);
 	prefetch(&fl->ifl_sds.ifsd_map[next]);
 	map = fl->ifl_sds.ifsd_map[cidx];
 	next = (cidx + CACHE_LINE_SIZE) & (fl->ifl_size-1);
 
 	/* not valid assert if bxe really does SGE from non-contiguous elements */
 	MPASS(fl->ifl_cidx == cidx);
 	bus_dmamap_sync(fl->ifl_buf_tag, map, BUS_DMASYNC_POSTREAD);
 	if (unload)
 		bus_dmamap_unload(fl->ifl_buf_tag, map);
 	fl->ifl_cidx = (fl->ifl_cidx + 1) & (fl->ifl_size-1);
 	if (__predict_false(fl->ifl_cidx == 0))
 		fl->ifl_gen = 0;
 	bit_clear(fl->ifl_rx_bitmap, cidx);
 }
 
 static struct mbuf *
 assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri, if_rxsd_t sd)
 {
 	int i, padlen , flags;
 	struct mbuf *m, *mh, *mt;
 	caddr_t cl;
 
 	i = 0;
 	mh = NULL;
 	do {
 		rxd_frag_to_sd(rxq, &ri->iri_frags[i], TRUE, sd);
 
 		MPASS(*sd->ifsd_cl != NULL);
 		MPASS(*sd->ifsd_m != NULL);
 
 		/* Don't include zero-length frags */
 		if (ri->iri_frags[i].irf_len == 0) {
 			/* XXX we can save the cluster here, but not the mbuf */
 			m_init(*sd->ifsd_m, M_NOWAIT, MT_DATA, 0);
 			m_free(*sd->ifsd_m);
 			*sd->ifsd_m = NULL;
 			continue;
 		}
 		m = *sd->ifsd_m;
 		*sd->ifsd_m = NULL;
 		if (mh == NULL) {
 			flags = M_PKTHDR|M_EXT;
 			mh = mt = m;
 			padlen = ri->iri_pad;
 		} else {
 			flags = M_EXT;
 			mt->m_next = m;
 			mt = m;
 			/* assuming padding is only on the first fragment */
 			padlen = 0;
 		}
 		cl = *sd->ifsd_cl;
 		*sd->ifsd_cl = NULL;
 
 		/* Can these two be made one ? */
 		m_init(m, M_NOWAIT, MT_DATA, flags);
 		m_cljset(m, cl, sd->ifsd_fl->ifl_cltype);
 		/*
 		 * These must follow m_init and m_cljset
 		 */
 		m->m_data += padlen;
 		ri->iri_len -= padlen;
 		m->m_len = ri->iri_frags[i].irf_len;
 	} while (++i < ri->iri_nfrags);
 
 	return (mh);
 }
 
 /*
  * Process one software descriptor
  */
 static struct mbuf *
 iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri)
 {
 	struct if_rxsd sd;
 	struct mbuf *m;
 
 	/* should I merge this back in now that the two paths are basically duplicated? */
 	if (ri->iri_nfrags == 1 &&
 	    ri->iri_frags[0].irf_len <= MIN(IFLIB_RX_COPY_THRESH, MHLEN)) {
 		rxd_frag_to_sd(rxq, &ri->iri_frags[0], FALSE, &sd);
 		m = *sd.ifsd_m;
 		*sd.ifsd_m = NULL;
 		m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);
 #ifndef __NO_STRICT_ALIGNMENT
 		if (!IP_ALIGNED(m))
 			m->m_data += 2;
 #endif
 		memcpy(m->m_data, *sd.ifsd_cl, ri->iri_len);
 		m->m_len = ri->iri_frags[0].irf_len;
        } else {
 		m = assemble_segments(rxq, ri, &sd);
 	}
 	m->m_pkthdr.len = ri->iri_len;
 	m->m_pkthdr.rcvif = ri->iri_ifp;
 	m->m_flags |= ri->iri_flags;
 	m->m_pkthdr.ether_vtag = ri->iri_vtag;
 	m->m_pkthdr.flowid = ri->iri_flowid;
 	M_HASHTYPE_SET(m, ri->iri_rsstype);
 	m->m_pkthdr.csum_flags = ri->iri_csum_flags;
 	m->m_pkthdr.csum_data = ri->iri_csum_data;
 	return (m);
 }
 
 #if defined(INET6) || defined(INET)
 static void
 iflib_get_ip_forwarding(struct lro_ctrl *lc, bool *v4, bool *v6)
 {
 	CURVNET_SET(lc->ifp->if_vnet);
 #if defined(INET6)
 	*v6 = VNET(ip6_forwarding);
 #endif
 #if defined(INET)
 	*v4 = VNET(ipforwarding);
 #endif
 	CURVNET_RESTORE();
 }
 
 /*
  * Returns true if it's possible this packet could be LROed.
  * if it returns false, it is guaranteed that tcp_lro_rx()
  * would not return zero.
  */
 static bool
 iflib_check_lro_possible(struct mbuf *m, bool v4_forwarding, bool v6_forwarding)
 {
 	struct ether_header *eh;
 	uint16_t eh_type;
 
 	eh = mtod(m, struct ether_header *);
 	eh_type = ntohs(eh->ether_type);
 	switch (eh_type) {
 #if defined(INET6)
 		case ETHERTYPE_IPV6:
 			return !v6_forwarding;
 #endif
 #if defined (INET)
 		case ETHERTYPE_IP:
 			return !v4_forwarding;
 #endif
 	}
 
 	return false;
 }
 #else
 static void
 iflib_get_ip_forwarding(struct lro_ctrl *lc __unused, bool *v4 __unused, bool *v6 __unused)
 {
 }
 #endif
 
 static bool
 iflib_rxeof(iflib_rxq_t rxq, qidx_t budget)
 {
 	if_ctx_t ctx = rxq->ifr_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	int avail, i;
 	qidx_t *cidxp;
 	struct if_rxd_info ri;
 	int err, budget_left, rx_bytes, rx_pkts;
 	iflib_fl_t fl;
 	struct ifnet *ifp;
 	int lro_enabled;
 	bool v4_forwarding, v6_forwarding, lro_possible;
 
 	/*
 	 * XXX early demux data packets so that if_input processing only handles
 	 * acks in interrupt context
 	 */
 	struct mbuf *m, *mh, *mt, *mf;
 
 	lro_possible = v4_forwarding = v6_forwarding = false;
 	ifp = ctx->ifc_ifp;
 	mh = mt = NULL;
 	MPASS(budget > 0);
 	rx_pkts	= rx_bytes = 0;
 	if (sctx->isc_flags & IFLIB_HAS_RXCQ)
 		cidxp = &rxq->ifr_cq_cidx;
 	else
 		cidxp = &rxq->ifr_fl[0].ifl_cidx;
 	if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) {
 		for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
 			__iflib_fl_refill_lt(ctx, fl, budget + 8);
 		DBG_COUNTER_INC(rx_unavail);
 		return (false);
 	}
 
 	for (budget_left = budget; budget_left > 0 && avail > 0;) {
 		if (__predict_false(!CTX_ACTIVE(ctx))) {
 			DBG_COUNTER_INC(rx_ctx_inactive);
 			break;
 		}
 		/*
 		 * Reset client set fields to their default values
 		 */
 		rxd_info_zero(&ri);
 		ri.iri_qsidx = rxq->ifr_id;
 		ri.iri_cidx = *cidxp;
 		ri.iri_ifp = ifp;
 		ri.iri_frags = rxq->ifr_frags;
 		err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
 
 		if (err)
 			goto err;
 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 			*cidxp = ri.iri_cidx;
 			/* Update our consumer index */
 			/* XXX NB: shurd - check if this is still safe */
 			while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0]) {
 				rxq->ifr_cq_cidx -= scctx->isc_nrxd[0];
 				rxq->ifr_cq_gen = 0;
 			}
 			/* was this only a completion queue message? */
 			if (__predict_false(ri.iri_nfrags == 0))
 				continue;
 		}
 		MPASS(ri.iri_nfrags != 0);
 		MPASS(ri.iri_len != 0);
 
 		/* will advance the cidx on the corresponding free lists */
 		m = iflib_rxd_pkt_get(rxq, &ri);
 		avail--;
 		budget_left--;
 		if (avail == 0 && budget_left)
 			avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left);
 
 		if (__predict_false(m == NULL)) {
 			DBG_COUNTER_INC(rx_mbuf_null);
 			continue;
 		}
 		/* imm_pkt: -- cxgb */
 		if (mh == NULL)
 			mh = mt = m;
 		else {
 			mt->m_nextpkt = m;
 			mt = m;
 		}
 	}
 	/* make sure that we can refill faster than drain */
 	for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
 		__iflib_fl_refill_lt(ctx, fl, budget + 8);
 
 	lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO);
 	if (lro_enabled)
 		iflib_get_ip_forwarding(&rxq->ifr_lc, &v4_forwarding, &v6_forwarding);
 	mt = mf = NULL;
 	while (mh != NULL) {
 		m = mh;
 		mh = mh->m_nextpkt;
 		m->m_nextpkt = NULL;
 #ifndef __NO_STRICT_ALIGNMENT
 		if (!IP_ALIGNED(m) && (m = iflib_fixup_rx(m)) == NULL)
 			continue;
 #endif
 		rx_bytes += m->m_pkthdr.len;
 		rx_pkts++;
 #if defined(INET6) || defined(INET)
 		if (lro_enabled) {
 			if (!lro_possible) {
 				lro_possible = iflib_check_lro_possible(m, v4_forwarding, v6_forwarding);
 				if (lro_possible && mf != NULL) {
 					ifp->if_input(ifp, mf);
 					DBG_COUNTER_INC(rx_if_input);
 					mt = mf = NULL;
 				}
 			}
 			if ((m->m_pkthdr.csum_flags & (CSUM_L4_CALC|CSUM_L4_VALID)) ==
 			    (CSUM_L4_CALC|CSUM_L4_VALID)) {
 				if (lro_possible && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0)
 					continue;
 			}
 		}
 #endif
 		if (lro_possible) {
 			ifp->if_input(ifp, m);
 			DBG_COUNTER_INC(rx_if_input);
 			continue;
 		}
 
 		if (mf == NULL)
 			mf = m;
 		if (mt != NULL)
 			mt->m_nextpkt = m;
 		mt = m;
 	}
 	if (mf != NULL) {
 		ifp->if_input(ifp, mf);
 		DBG_COUNTER_INC(rx_if_input);
 	}
 
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
 
 	/*
 	 * Flush any outstanding LRO work
 	 */
 #if defined(INET6) || defined(INET)
 	tcp_lro_flush_all(&rxq->ifr_lc);
 #endif
 	if (avail)
 		return true;
 	return (iflib_rxd_avail(ctx, rxq, *cidxp, 1));
 err:
 	STATE_LOCK(ctx);
 	ctx->ifc_flags |= IFC_DO_RESET;
 	iflib_admin_intr_deferred(ctx);
 	STATE_UNLOCK(ctx);
 	return (false);
 }
 
 #define TXD_NOTIFY_COUNT(txq) (((txq)->ift_size / (txq)->ift_update_freq)-1)
 static inline qidx_t
 txq_max_db_deferred(iflib_txq_t txq, qidx_t in_use)
 {
 	qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
 	qidx_t minthresh = txq->ift_size / 8;
 	if (in_use > 4*minthresh)
 		return (notify_count);
 	if (in_use > 2*minthresh)
 		return (notify_count >> 1);
 	if (in_use > minthresh)
 		return (notify_count >> 3);
 	return (0);
 }
 
 static inline qidx_t
 txq_max_rs_deferred(iflib_txq_t txq)
 {
 	qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
 	qidx_t minthresh = txq->ift_size / 8;
 	if (txq->ift_in_use > 4*minthresh)
 		return (notify_count);
 	if (txq->ift_in_use > 2*minthresh)
 		return (notify_count >> 1);
 	if (txq->ift_in_use > minthresh)
 		return (notify_count >> 2);
 	return (2);
 }
 
 #define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags)
 #define M_HAS_VLANTAG(m) (m->m_flags & M_VLANTAG)
 
 #define TXQ_MAX_DB_DEFERRED(txq, in_use) txq_max_db_deferred((txq), (in_use))
 #define TXQ_MAX_RS_DEFERRED(txq) txq_max_rs_deferred(txq)
 #define TXQ_MAX_DB_CONSUMED(size) (size >> 4)
 
 /* forward compatibility for cxgb */
 #define FIRST_QSET(ctx) 0
 #define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets)
 #define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets)
 #define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
 #define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))
 
 /* XXX we should be setting this to something other than zero */
 #define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
 #define	MAX_TX_DESC(ctx) max((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max, \
     (ctx)->ifc_softc_ctx.isc_tx_nsegments)
 
 static inline bool
 iflib_txd_db_check(if_ctx_t ctx, iflib_txq_t txq, int ring, qidx_t in_use)
 {
 	qidx_t dbval, max;
 	bool rang;
 
 	rang = false;
 	max = TXQ_MAX_DB_DEFERRED(txq, in_use);
 	if (ring || txq->ift_db_pending >= max) {
 		dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx;
 		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval);
 		txq->ift_db_pending = txq->ift_npending = 0;
 		rang = true;
 	}
 	return (rang);
 }
 
 #ifdef PKT_DEBUG
 static void
 print_pkt(if_pkt_info_t pi)
 {
 	printf("pi len:  %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n",
 	       pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs, pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx);
 	printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n",
 	       pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz, pi->ipi_mflags, pi->ipi_vtag);
 	printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n",
 	       pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto);
 }
 #endif
 
 #define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO)
 #define IS_TX_OFFLOAD4(pi) ((pi)->ipi_csum_flags & (CSUM_IP_TCP | CSUM_IP_TSO))
 #define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO)
 #define IS_TX_OFFLOAD6(pi) ((pi)->ipi_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_TSO))
 
 static int
 iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
 {
 	if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx;
 	struct ether_vlan_header *eh;
 	struct mbuf *m;
 
 	m = *mp;
 	if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) &&
 	    M_WRITABLE(m) == 0) {
 		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
 			return (ENOMEM);
 		} else {
 			m_freem(*mp);
 			DBG_COUNTER_INC(tx_frees);
 			*mp = m;
 		}
 	}
 
 	/*
 	 * Determine where frame payload starts.
 	 * Jump over vlan headers if already present,
 	 * helpful for QinQ too.
 	 */
 	if (__predict_false(m->m_len < sizeof(*eh))) {
 		txq->ift_pullups++;
 		if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL))
 			return (ENOMEM);
 	}
 	eh = mtod(m, struct ether_vlan_header *);
 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 		pi->ipi_etype = ntohs(eh->evl_proto);
 		pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 	} else {
 		pi->ipi_etype = ntohs(eh->evl_encap_proto);
 		pi->ipi_ehdrlen = ETHER_HDR_LEN;
 	}
 
 	switch (pi->ipi_etype) {
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		struct mbuf *n;
 		struct ip *ip = NULL;
 		struct tcphdr *th = NULL;
 		int minthlen;
 
 		minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th));
 		if (__predict_false(m->m_len < minthlen)) {
 			/*
 			 * if this code bloat is causing too much of a hit
 			 * move it to a separate function and mark it noinline
 			 */
 			if (m->m_len == pi->ipi_ehdrlen) {
 				n = m->m_next;
 				MPASS(n);
 				if (n->m_len >= sizeof(*ip))  {
 					ip = (struct ip *)n->m_data;
 					if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th))
 						th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 				} else {
 					txq->ift_pullups++;
 					if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
 						return (ENOMEM);
 					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 				}
 			} else {
 				txq->ift_pullups++;
 				if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
 					return (ENOMEM);
 				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 				if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
 					th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 			}
 		} else {
 			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 			if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
 				th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 		}
 		pi->ipi_ip_hlen = ip->ip_hl << 2;
 		pi->ipi_ipproto = ip->ip_p;
 		pi->ipi_flags |= IPI_TX_IPV4;
 
 		/* TCP checksum offload may require TCP header length */
 		if (IS_TX_OFFLOAD4(pi)) {
 			if (__predict_true(pi->ipi_ipproto == IPPROTO_TCP)) {
 				if (__predict_false(th == NULL)) {
 					txq->ift_pullups++;
 					if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL))
 						return (ENOMEM);
 					th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen);
 				}
 				pi->ipi_tcp_hflags = th->th_flags;
 				pi->ipi_tcp_hlen = th->th_off << 2;
 				pi->ipi_tcp_seq = th->th_seq;
 			}
 			if (IS_TSO4(pi)) {
 				if (__predict_false(ip->ip_p != IPPROTO_TCP))
 					return (ENXIO);
 				/*
 				 * TSO always requires hardware checksum offload.
 				 */
 				pi->ipi_csum_flags |= (CSUM_IP_TCP | CSUM_IP);
 				th->th_sum = in_pseudo(ip->ip_src.s_addr,
 						       ip->ip_dst.s_addr, htons(IPPROTO_TCP));
 				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
 				if (sctx->isc_flags & IFLIB_TSO_INIT_IP) {
 					ip->ip_sum = 0;
 					ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz);
 				}
 			}
 		}
 		if ((sctx->isc_flags & IFLIB_NEED_ZERO_CSUM) && (pi->ipi_csum_flags & CSUM_IP))
                        ip->ip_sum = 0;
 
 		break;
 	}
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 	{
 		struct ip6_hdr *ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
 		struct tcphdr *th;
 		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
 
 		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
 			txq->ift_pullups++;
 			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
 				return (ENOMEM);
 		}
 		th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
 
 		/* XXX-BZ this will go badly in case of ext hdrs. */
 		pi->ipi_ipproto = ip6->ip6_nxt;
 		pi->ipi_flags |= IPI_TX_IPV6;
 
 		/* TCP checksum offload may require TCP header length */
 		if (IS_TX_OFFLOAD6(pi)) {
 			if (pi->ipi_ipproto == IPPROTO_TCP) {
 				if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) {
 					txq->ift_pullups++;
 					if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL))
 						return (ENOMEM);
 				}
 				pi->ipi_tcp_hflags = th->th_flags;
 				pi->ipi_tcp_hlen = th->th_off << 2;
 				pi->ipi_tcp_seq = th->th_seq;
 			}
 			if (IS_TSO6(pi)) {
 				if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP))
 					return (ENXIO);
 				/*
 				 * TSO always requires hardware checksum offload.
 				 */
 				pi->ipi_csum_flags |= CSUM_IP6_TCP;
 				th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
 				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
 			}
 		}
 		break;
 	}
 #endif
 	default:
 		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
 		pi->ipi_ip_hlen = 0;
 		break;
 	}
 	*mp = m;
 
 	return (0);
 }
 
 /*
  * If dodgy hardware rejects the scatter gather chain we've handed it
  * we'll need to remove the mbuf chain from ifsg_m[] before we can add the
  * m_defrag'd mbufs
  */
 static __noinline struct mbuf *
 iflib_remove_mbuf(iflib_txq_t txq)
 {
 	int ntxd, pidx;
 	struct mbuf *m, **ifsd_m;
 
 	ifsd_m = txq->ift_sds.ifsd_m;
 	ntxd = txq->ift_size;
 	pidx = txq->ift_pidx & (ntxd - 1);
 	ifsd_m = txq->ift_sds.ifsd_m;
 	m = ifsd_m[pidx];
 	ifsd_m[pidx] = NULL;
 	bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[pidx]);
 	if (txq->ift_sds.ifsd_tso_map != NULL)
 		bus_dmamap_unload(txq->ift_tso_buf_tag,
 		    txq->ift_sds.ifsd_tso_map[pidx]);
 #if MEMORY_LOGGING
 	txq->ift_dequeued++;
 #endif
 	return (m);
 }
 
 static inline caddr_t
 calc_next_txd(iflib_txq_t txq, int cidx, uint8_t qid)
 {
 	qidx_t size;
 	int ntxd;
 	caddr_t start, end, cur, next;
 
 	ntxd = txq->ift_size;
 	size = txq->ift_txd_size[qid];
 	start = txq->ift_ifdi[qid].idi_vaddr;
 
 	if (__predict_false(size == 0))
 		return (start);
 	cur = start + size*cidx;
 	end = start + size*ntxd;
 	next = CACHE_PTR_NEXT(cur);
 	return (next < end ? next : start);
 }
 
 /*
  * Pad an mbuf to ensure a minimum ethernet frame size.
  * min_frame_size is the frame size (less CRC) to pad the mbuf to
  */
 static __noinline int
 iflib_ether_pad(device_t dev, struct mbuf **m_head, uint16_t min_frame_size)
 {
 	/*
 	 * 18 is enough bytes to pad an ARP packet to 46 bytes, and
 	 * and ARP message is the smallest common payload I can think of
 	 */
 	static char pad[18];	/* just zeros */
 	int n;
 	struct mbuf *new_head;
 
 	if (!M_WRITABLE(*m_head)) {
 		new_head = m_dup(*m_head, M_NOWAIT);
 		if (new_head == NULL) {
 			m_freem(*m_head);
 			device_printf(dev, "cannot pad short frame, m_dup() failed");
 			DBG_COUNTER_INC(encap_pad_mbuf_fail);
 			DBG_COUNTER_INC(tx_frees);
 			return ENOMEM;
 		}
 		m_freem(*m_head);
 		*m_head = new_head;
 	}
 
 	for (n = min_frame_size - (*m_head)->m_pkthdr.len;
 	     n > 0; n -= sizeof(pad))
 		if (!m_append(*m_head, min(n, sizeof(pad)), pad))
 			break;
 
 	if (n > 0) {
 		m_freem(*m_head);
 		device_printf(dev, "cannot pad short frame\n");
 		DBG_COUNTER_INC(encap_pad_mbuf_fail);
 		DBG_COUNTER_INC(tx_frees);
 		return (ENOBUFS);
 	}
 
 	return 0;
 }
 
 static int
 iflib_encap(iflib_txq_t txq, struct mbuf **m_headp)
 {
 	if_ctx_t		ctx;
 	if_shared_ctx_t		sctx;
 	if_softc_ctx_t		scctx;
 	bus_dma_tag_t		buf_tag;
 	bus_dma_segment_t	*segs;
 	struct mbuf		*m_head, **ifsd_m;
 	void			*next_txd;
 	bus_dmamap_t		map;
 	struct if_pkt_info	pi;
 	int remap = 0;
 	int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd;
 
 	ctx = txq->ift_ctx;
 	sctx = ctx->ifc_sctx;
 	scctx = &ctx->ifc_softc_ctx;
 	segs = txq->ift_segs;
 	ntxd = txq->ift_size;
 	m_head = *m_headp;
 	map = NULL;
 
 	/*
 	 * If we're doing TSO the next descriptor to clean may be quite far ahead
 	 */
 	cidx = txq->ift_cidx;
 	pidx = txq->ift_pidx;
 	if (ctx->ifc_flags & IFC_PREFETCH) {
 		next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1);
 		if (!(ctx->ifc_flags & IFLIB_HAS_TXCQ)) {
 			next_txd = calc_next_txd(txq, cidx, 0);
 			prefetch(next_txd);
 		}
 
 		/* prefetch the next cache line of mbuf pointers and flags */
 		prefetch(&txq->ift_sds.ifsd_m[next]);
 		prefetch(&txq->ift_sds.ifsd_map[next]);
 		next = (cidx + CACHE_LINE_SIZE) & (ntxd-1);
 	}
 	map = txq->ift_sds.ifsd_map[pidx];
 	ifsd_m = txq->ift_sds.ifsd_m;
 
 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
 		buf_tag = txq->ift_tso_buf_tag;
 		max_segs = scctx->isc_tx_tso_segments_max;
 		map = txq->ift_sds.ifsd_tso_map[pidx];
 		MPASS(buf_tag != NULL);
 		MPASS(max_segs > 0);
 	} else {
 		buf_tag = txq->ift_buf_tag;
 		max_segs = scctx->isc_tx_nsegments;
 		map = txq->ift_sds.ifsd_map[pidx];
 	}
 	if ((sctx->isc_flags & IFLIB_NEED_ETHER_PAD) &&
 	    __predict_false(m_head->m_pkthdr.len < scctx->isc_min_frame_size)) {
 		err = iflib_ether_pad(ctx->ifc_dev, m_headp, scctx->isc_min_frame_size);
 		if (err) {
 			DBG_COUNTER_INC(encap_txd_encap_fail);
 			return err;
 		}
 	}
 	m_head = *m_headp;
 
 	pkt_info_zero(&pi);
 	pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST));
 	pi.ipi_pidx = pidx;
 	pi.ipi_qsidx = txq->ift_id;
 	pi.ipi_len = m_head->m_pkthdr.len;
 	pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags;
 	pi.ipi_vtag = (m_head->m_flags & M_VLANTAG) ? m_head->m_pkthdr.ether_vtag : 0;
 
 	/* deliberate bitwise OR to make one condition */
 	if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) {
 		if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0)) {
 			DBG_COUNTER_INC(encap_txd_encap_fail);
 			return (err);
 		}
 		m_head = *m_headp;
 	}
 
 retry:
 	err = bus_dmamap_load_mbuf_sg(buf_tag, map, m_head, segs, &nsegs,
 	    BUS_DMA_NOWAIT);
 defrag:
 	if (__predict_false(err)) {
 		switch (err) {
 		case EFBIG:
 			/* try collapse once and defrag once */
 			if (remap == 0) {
 				m_head = m_collapse(*m_headp, M_NOWAIT, max_segs);
 				/* try defrag if collapsing fails */
 				if (m_head == NULL)
 					remap++;
 			}
 			if (remap == 1) {
 				txq->ift_mbuf_defrag++;
 				m_head = m_defrag(*m_headp, M_NOWAIT);
 			}
 			/*
 			 * remap should never be >1 unless bus_dmamap_load_mbuf_sg
 			 * failed to map an mbuf that was run through m_defrag
 			 */
 			MPASS(remap <= 1);
 			if (__predict_false(m_head == NULL || remap > 1))
 				goto defrag_failed;
 			remap++;
 			*m_headp = m_head;
 			goto retry;
 			break;
 		case ENOMEM:
 			txq->ift_no_tx_dma_setup++;
 			break;
 		default:
 			txq->ift_no_tx_dma_setup++;
 			m_freem(*m_headp);
 			DBG_COUNTER_INC(tx_frees);
 			*m_headp = NULL;
 			break;
 		}
 		txq->ift_map_failed++;
 		DBG_COUNTER_INC(encap_load_mbuf_fail);
 		DBG_COUNTER_INC(encap_txd_encap_fail);
 		return (err);
 	}
 	ifsd_m[pidx] = m_head;
 	/*
 	 * XXX assumes a 1 to 1 relationship between segments and
 	 *        descriptors - this does not hold true on all drivers, e.g.
 	 *        cxgb
 	 */
 	if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
 		txq->ift_no_desc_avail++;
 		bus_dmamap_unload(buf_tag, map);
 		DBG_COUNTER_INC(encap_txq_avail_fail);
 		DBG_COUNTER_INC(encap_txd_encap_fail);
 		if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
 			GROUPTASK_ENQUEUE(&txq->ift_task);
 		return (ENOBUFS);
 	}
 	/*
 	 * On Intel cards we can greatly reduce the number of TX interrupts
 	 * we see by only setting report status on every Nth descriptor.
 	 * However, this also means that the driver will need to keep track
 	 * of the descriptors that RS was set on to check them for the DD bit.
 	 */
 	txq->ift_rs_pending += nsegs + 1;
 	if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) ||
 	     iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs) <= MAX_TX_DESC(ctx) + 2) {
 		pi.ipi_flags |= IPI_TX_INTR;
 		txq->ift_rs_pending = 0;
 	}
 
 	pi.ipi_segs = segs;
 	pi.ipi_nsegs = nsegs;
 
 	MPASS(pidx >= 0 && pidx < txq->ift_size);
 #ifdef PKT_DEBUG
 	print_pkt(&pi);
 #endif
 	if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) {
 		bus_dmamap_sync(buf_tag, map, BUS_DMASYNC_PREWRITE);
 		DBG_COUNTER_INC(tx_encap);
 		MPASS(pi.ipi_new_pidx < txq->ift_size);
 
 		ndesc = pi.ipi_new_pidx - pi.ipi_pidx;
 		if (pi.ipi_new_pidx < pi.ipi_pidx) {
 			ndesc += txq->ift_size;
 			txq->ift_gen = 1;
 		}
 		/*
 		 * drivers can need as many as 
 		 * two sentinels
 		 */
 		MPASS(ndesc <= pi.ipi_nsegs + 2);
 		MPASS(pi.ipi_new_pidx != pidx);
 		MPASS(ndesc > 0);
 		txq->ift_in_use += ndesc;
 
 		/*
 		 * We update the last software descriptor again here because there may
 		 * be a sentinel and/or there may be more mbufs than segments
 		 */
 		txq->ift_pidx = pi.ipi_new_pidx;
 		txq->ift_npending += pi.ipi_ndescs;
 	} else {
 		*m_headp = m_head = iflib_remove_mbuf(txq);
 		if (err == EFBIG) {
 			txq->ift_txd_encap_efbig++;
 			if (remap < 2) {
 				remap = 1;
 				goto defrag;
 			}
 		}
 		goto defrag_failed;
 	}
 	/*
 	 * err can't possibly be non-zero here, so we don't neet to test it
 	 * to see if we need to DBG_COUNTER_INC(encap_txd_encap_fail).
 	 */
 	return (err);
 
 defrag_failed:
 	txq->ift_mbuf_defrag_failed++;
 	txq->ift_map_failed++;
 	m_freem(*m_headp);
 	DBG_COUNTER_INC(tx_frees);
 	*m_headp = NULL;
 	DBG_COUNTER_INC(encap_txd_encap_fail);
 	return (ENOMEM);
 }
 
 static void
 iflib_tx_desc_free(iflib_txq_t txq, int n)
 {
 	uint32_t qsize, cidx, mask, gen;
 	struct mbuf *m, **ifsd_m;
 	bool do_prefetch;
 
 	cidx = txq->ift_cidx;
 	gen = txq->ift_gen;
 	qsize = txq->ift_size;
 	mask = qsize-1;
 	ifsd_m = txq->ift_sds.ifsd_m;
 	do_prefetch = (txq->ift_ctx->ifc_flags & IFC_PREFETCH);
 
 	while (n-- > 0) {
 		if (do_prefetch) {
 			prefetch(ifsd_m[(cidx + 3) & mask]);
 			prefetch(ifsd_m[(cidx + 4) & mask]);
 		}
 		if ((m = ifsd_m[cidx]) != NULL) {
 			prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]);
 			if (m->m_pkthdr.csum_flags & CSUM_TSO) {
 				bus_dmamap_sync(txq->ift_tso_buf_tag,
 				    txq->ift_sds.ifsd_tso_map[cidx],
 				    BUS_DMASYNC_POSTWRITE);
 				bus_dmamap_unload(txq->ift_tso_buf_tag,
 				    txq->ift_sds.ifsd_tso_map[cidx]);
 			} else {
 				bus_dmamap_sync(txq->ift_buf_tag,
 				    txq->ift_sds.ifsd_map[cidx],
 				    BUS_DMASYNC_POSTWRITE);
 				bus_dmamap_unload(txq->ift_buf_tag,
 				    txq->ift_sds.ifsd_map[cidx]);
 			}
 			/* XXX we don't support any drivers that batch packets yet */
 			MPASS(m->m_nextpkt == NULL);
 			m_freem(m);
 			ifsd_m[cidx] = NULL;
 #if MEMORY_LOGGING
 			txq->ift_dequeued++;
 #endif
 			DBG_COUNTER_INC(tx_frees);
 		}
 		if (__predict_false(++cidx == qsize)) {
 			cidx = 0;
 			gen = 0;
 		}
 	}
 	txq->ift_cidx = cidx;
 	txq->ift_gen = gen;
 }
 
 static __inline int
 iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh)
 {
 	int reclaim;
 	if_ctx_t ctx = txq->ift_ctx;
 
 	KASSERT(thresh >= 0, ("invalid threshold to reclaim"));
 	MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size);
 
 	/*
 	 * Need a rate-limiting check so that this isn't called every time
 	 */
 	iflib_tx_credits_update(ctx, txq);
 	reclaim = DESC_RECLAIMABLE(txq);
 
 	if (reclaim <= thresh /* + MAX_TX_DESC(txq->ift_ctx) */) {
 #ifdef INVARIANTS
 		if (iflib_verbose_debug) {
 			printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__,
 			       txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments,
 			       reclaim, thresh);
 
 		}
 #endif
 		return (0);
 	}
 	iflib_tx_desc_free(txq, reclaim);
 	txq->ift_cleaned += reclaim;
 	txq->ift_in_use -= reclaim;
 
 	return (reclaim);
 }
 
 static struct mbuf **
 _ring_peek_one(struct ifmp_ring *r, int cidx, int offset, int remaining)
 {
 	int next, size;
 	struct mbuf **items;
 
 	size = r->size;
 	next = (cidx + CACHE_PTR_INCREMENT) & (size-1);
 	items = __DEVOLATILE(struct mbuf **, &r->items[0]);
 
 	prefetch(items[(cidx + offset) & (size-1)]);
 	if (remaining > 1) {
 		prefetch2cachelines(&items[next]);
 		prefetch2cachelines(items[(cidx + offset + 1) & (size-1)]);
 		prefetch2cachelines(items[(cidx + offset + 2) & (size-1)]);
 		prefetch2cachelines(items[(cidx + offset + 3) & (size-1)]);
 	}
 	return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (size-1)]));
 }
 
 static void
 iflib_txq_check_drain(iflib_txq_t txq, int budget)
 {
 
 	ifmp_ring_check_drainage(txq->ift_br, budget);
 }
 
 static uint32_t
 iflib_txq_can_drain(struct ifmp_ring *r)
 {
 	iflib_txq_t txq = r->cookie;
 	if_ctx_t ctx = txq->ift_ctx;
 
 	if (TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2)
 		return (1);
 	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 	    BUS_DMASYNC_POSTREAD);
 	return (ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id,
 	    false));
 }
 
 static uint32_t
 iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
 {
 	iflib_txq_t txq = r->cookie;
 	if_ctx_t ctx = txq->ift_ctx;
 	struct ifnet *ifp = ctx->ifc_ifp;
 	struct mbuf **mp, *m;
 	int i, count, consumed, pkt_sent, bytes_sent, mcast_sent, avail;
 	int reclaimed, err, in_use_prev, desc_used;
 	bool do_prefetch, ring, rang;
 
 	if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) ||
 			    !LINK_ACTIVE(ctx))) {
 		DBG_COUNTER_INC(txq_drain_notready);
 		return (0);
 	}
 	reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
 	rang = iflib_txd_db_check(ctx, txq, reclaimed, txq->ift_in_use);
 	avail = IDXDIFF(pidx, cidx, r->size);
 	if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) {
 		DBG_COUNTER_INC(txq_drain_flushing);
 		for (i = 0; i < avail; i++) {
 			if (__predict_true(r->items[(cidx + i) & (r->size-1)] != (void *)txq))
 				m_free(r->items[(cidx + i) & (r->size-1)]);
 			r->items[(cidx + i) & (r->size-1)] = NULL;
 		}
 		return (avail);
 	}
 
 	if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
 		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 		CALLOUT_UNLOCK(txq);
 		DBG_COUNTER_INC(txq_drain_oactive);
 		return (0);
 	}
 	if (reclaimed)
 		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 	consumed = mcast_sent = bytes_sent = pkt_sent = 0;
 	count = MIN(avail, TX_BATCH_SIZE);
 #ifdef INVARIANTS
 	if (iflib_verbose_debug)
 		printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__,
 		       avail, ctx->ifc_flags, TXQ_AVAIL(txq));
 #endif
 	do_prefetch = (ctx->ifc_flags & IFC_PREFETCH);
 	avail = TXQ_AVAIL(txq);
 	err = 0;
 	for (desc_used = i = 0; i < count && avail > MAX_TX_DESC(ctx) + 2; i++) {
 		int rem = do_prefetch ? count - i : 0;
 
 		mp = _ring_peek_one(r, cidx, i, rem);
 		MPASS(mp != NULL && *mp != NULL);
 		if (__predict_false(*mp == (struct mbuf *)txq)) {
 			consumed++;
 			reclaimed++;
 			continue;
 		}
 		in_use_prev = txq->ift_in_use;
 		err = iflib_encap(txq, mp);
 		if (__predict_false(err)) {
 			/* no room - bail out */
 			if (err == ENOBUFS)
 				break;
 			consumed++;
 			/* we can't send this packet - skip it */
 			continue;
 		}
 		consumed++;
 		pkt_sent++;
 		m = *mp;
 		DBG_COUNTER_INC(tx_sent);
 		bytes_sent += m->m_pkthdr.len;
 		mcast_sent += !!(m->m_flags & M_MCAST);
 		avail = TXQ_AVAIL(txq);
 
 		txq->ift_db_pending += (txq->ift_in_use - in_use_prev);
 		desc_used += (txq->ift_in_use - in_use_prev);
 		ETHER_BPF_MTAP(ifp, m);
 		if (__predict_false(!(ifp->if_drv_flags & IFF_DRV_RUNNING)))
 			break;
 		rang = iflib_txd_db_check(ctx, txq, false, in_use_prev);
 	}
 
 	/* deliberate use of bitwise or to avoid gratuitous short-circuit */
 	ring = rang ? false  : (iflib_min_tx_latency | err) || (TXQ_AVAIL(txq) < MAX_TX_DESC(ctx));
 	iflib_txd_db_check(ctx, txq, ring, txq->ift_in_use);
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
 	if (mcast_sent)
 		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent);
 #ifdef INVARIANTS
 	if (iflib_verbose_debug)
 		printf("consumed=%d\n", consumed);
 #endif
 	return (consumed);
 }
 
 static uint32_t
 iflib_txq_drain_always(struct ifmp_ring *r)
 {
 	return (1);
 }
 
 static uint32_t
 iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
 {
 	int i, avail;
 	struct mbuf **mp;
 	iflib_txq_t txq;
 
 	txq = r->cookie;
 
 	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 	CALLOUT_LOCK(txq);
 	callout_stop(&txq->ift_timer);
 	CALLOUT_UNLOCK(txq);
 
 	avail = IDXDIFF(pidx, cidx, r->size);
 	for (i = 0; i < avail; i++) {
 		mp = _ring_peek_one(r, cidx, i, avail - i);
 		if (__predict_false(*mp == (struct mbuf *)txq))
 			continue;
 		m_freem(*mp);
 		DBG_COUNTER_INC(tx_frees);
 	}
 	MPASS(ifmp_ring_is_stalled(r) == 0);
 	return (avail);
 }
 
 static void
 iflib_ifmp_purge(iflib_txq_t txq)
 {
 	struct ifmp_ring *r;
 
 	r = txq->ift_br;
 	r->drain = iflib_txq_drain_free;
 	r->can_drain = iflib_txq_drain_always;
 
 	ifmp_ring_check_drainage(r, r->size);
 
 	r->drain = iflib_txq_drain;
 	r->can_drain = iflib_txq_can_drain;
 }
 
 static void
 _task_fn_tx(void *context)
 {
 	iflib_txq_t txq = context;
 	if_ctx_t ctx = txq->ift_ctx;
 #if defined(ALTQ) || defined(DEV_NETMAP)
 	if_t ifp = ctx->ifc_ifp;
 #endif
 	int abdicate = ctx->ifc_sysctl_tx_abdicate;
 
 #ifdef IFLIB_DIAGNOSTICS
 	txq->ift_cpu_exec_count[curcpu]++;
 #endif
 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
 		return;
 #ifdef DEV_NETMAP
 	if (if_getcapenable(ifp) & IFCAP_NETMAP) {
 		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 		    BUS_DMASYNC_POSTREAD);
 		if (ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, false))
 			netmap_tx_irq(ifp, txq->ift_id);
 		IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id);
 		return;
 	}
 #endif
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(&ifp->if_snd))
 		iflib_altq_if_start(ifp);
 #endif
 	if (txq->ift_db_pending)
 		ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE, abdicate);
 	else if (!abdicate)
 		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
 	/*
 	 * When abdicating, we always need to check drainage, not just when we don't enqueue
 	 */
 	if (abdicate)
 		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
 	if (ctx->ifc_flags & IFC_LEGACY)
 		IFDI_INTR_ENABLE(ctx);
 	else {
 #ifdef INVARIANTS
 		int rc =
 #endif
 			IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id);
 			KASSERT(rc != ENOTSUP, ("MSI-X support requires queue_intr_enable, but not implemented in driver"));
 	}
 }
 
 static void
 _task_fn_rx(void *context)
 {
 	iflib_rxq_t rxq = context;
 	if_ctx_t ctx = rxq->ifr_ctx;
 	bool more;
 	uint16_t budget;
 
 #ifdef IFLIB_DIAGNOSTICS
 	rxq->ifr_cpu_exec_count[curcpu]++;
 #endif
 	DBG_COUNTER_INC(task_fn_rxs);
 	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
 		return;
 	more = true;
 #ifdef DEV_NETMAP
 	if (if_getcapenable(ctx->ifc_ifp) & IFCAP_NETMAP) {
 		u_int work = 0;
 		if (netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &work)) {
 			more = false;
 		}
 	}
 #endif
 	budget = ctx->ifc_sysctl_rx_budget;
 	if (budget == 0)
 		budget = 16;	/* XXX */
 	if (more == false || (more = iflib_rxeof(rxq, budget)) == false) {
 		if (ctx->ifc_flags & IFC_LEGACY)
 			IFDI_INTR_ENABLE(ctx);
 		else {
 #ifdef INVARIANTS
 			int rc =
 #endif
 				IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
 			KASSERT(rc != ENOTSUP, ("MSI-X support requires queue_intr_enable, but not implemented in driver"));
 			DBG_COUNTER_INC(rx_intr_enables);
 		}
 	}
 	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
 		return;
 	if (more)
 		GROUPTASK_ENQUEUE(&rxq->ifr_task);
 }
 
 static void
 _task_fn_admin(void *context)
 {
 	if_ctx_t ctx = context;
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 	iflib_txq_t txq;
 	int i;
 	bool oactive, running, do_reset, do_watchdog, in_detach;
 	uint32_t reset_on = hz / 2;
 
 	STATE_LOCK(ctx);
 	running = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING);
 	oactive = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE);
 	do_reset = (ctx->ifc_flags & IFC_DO_RESET);
 	do_watchdog = (ctx->ifc_flags & IFC_DO_WATCHDOG);
 	in_detach = (ctx->ifc_flags & IFC_IN_DETACH);
 	ctx->ifc_flags &= ~(IFC_DO_RESET|IFC_DO_WATCHDOG);
 	STATE_UNLOCK(ctx);
 
 	if ((!running && !oactive) && !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
 		return;
 	if (in_detach)
 		return;
 
 	CTX_LOCK(ctx);
 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 		CALLOUT_UNLOCK(txq);
 	}
 	if (do_watchdog) {
 		ctx->ifc_watchdog_events++;
 		IFDI_WATCHDOG_RESET(ctx);
 	}
 	IFDI_UPDATE_ADMIN_STATUS(ctx);
 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
 #ifdef DEV_NETMAP
 		reset_on = hz / 2;
 		if (if_getcapenable(ctx->ifc_ifp) & IFCAP_NETMAP)
 			iflib_netmap_timer_adjust(ctx, txq, &reset_on);
 #endif
 		callout_reset_on(&txq->ift_timer, reset_on, iflib_timer, txq, txq->ift_timer.c_cpu);
 	}
 	IFDI_LINK_INTR_ENABLE(ctx);
 	if (do_reset)
 		iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 
 	if (LINK_ACTIVE(ctx) == 0)
 		return;
 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
 		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
 }
 
 
 static void
 _task_fn_iov(void *context)
 {
 	if_ctx_t ctx = context;
 
 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) &&
 	    !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
 		return;
 
 	CTX_LOCK(ctx);
 	IFDI_VFLR_HANDLE(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 static int
 iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
 {
 	int err;
 	if_int_delay_info_t info;
 	if_ctx_t ctx;
 
 	info = (if_int_delay_info_t)arg1;
 	ctx = info->iidi_ctx;
 	info->iidi_req = req;
 	info->iidi_oidp = oidp;
 	CTX_LOCK(ctx);
 	err = IFDI_SYSCTL_INT_DELAY(ctx, info);
 	CTX_UNLOCK(ctx);
 	return (err);
 }
 
 /*********************************************************************
  *
  *  IFNET FUNCTIONS
  *
  **********************************************************************/
 
 static void
 iflib_if_init_locked(if_ctx_t ctx)
 {
 	iflib_stop(ctx);
 	iflib_init_locked(ctx);
 }
 
 
 static void
 iflib_if_init(void *arg)
 {
 	if_ctx_t ctx = arg;
 
 	CTX_LOCK(ctx);
 	iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 static int
 iflib_if_transmit(if_t ifp, struct mbuf *m)
 {
 	if_ctx_t	ctx = if_getsoftc(ifp);
 
 	iflib_txq_t txq;
 	int err, qidx;
 	int abdicate = ctx->ifc_sysctl_tx_abdicate;
 
 	if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) {
 		DBG_COUNTER_INC(tx_frees);
 		m_freem(m);
-		return (ENOBUFS);
+		return (ENETDOWN);
 	}
 
 	MPASS(m->m_nextpkt == NULL);
 	/* ALTQ-enabled interfaces always use queue 0. */
 	qidx = 0;
 	if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m) && !ALTQ_IS_ENABLED(&ifp->if_snd))
 		qidx = QIDX(ctx, m);
 	/*
 	 * XXX calculate buf_ring based on flowid (divvy up bits?)
 	 */
 	txq = &ctx->ifc_txqs[qidx];
 
 #ifdef DRIVER_BACKPRESSURE
 	if (txq->ift_closed) {
 		while (m != NULL) {
 			next = m->m_nextpkt;
 			m->m_nextpkt = NULL;
 			m_freem(m);
 			DBG_COUNTER_INC(tx_frees);
 			m = next;
 		}
 		return (ENOBUFS);
 	}
 #endif
 #ifdef notyet
 	qidx = count = 0;
 	mp = marr;
 	next = m;
 	do {
 		count++;
 		next = next->m_nextpkt;
 	} while (next != NULL);
 
 	if (count > nitems(marr))
 		if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) {
 			/* XXX check nextpkt */
 			m_freem(m);
 			/* XXX simplify for now */
 			DBG_COUNTER_INC(tx_frees);
 			return (ENOBUFS);
 		}
 	for (next = m, i = 0; next != NULL; i++) {
 		mp[i] = next;
 		next = next->m_nextpkt;
 		mp[i]->m_nextpkt = NULL;
 	}
 #endif
 	DBG_COUNTER_INC(tx_seen);
 	err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE, abdicate);
 
 	if (abdicate)
 		GROUPTASK_ENQUEUE(&txq->ift_task);
  	if (err) {
 		if (!abdicate)
 			GROUPTASK_ENQUEUE(&txq->ift_task);
 		/* support forthcoming later */
 #ifdef DRIVER_BACKPRESSURE
 		txq->ift_closed = TRUE;
 #endif
 		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
 		m_freem(m);
 		DBG_COUNTER_INC(tx_frees);
 	}
 
 	return (err);
 }
 
 #ifdef ALTQ
 /*
  * The overall approach to integrating iflib with ALTQ is to continue to use
  * the iflib mp_ring machinery between the ALTQ queue(s) and the hardware
  * ring.  Technically, when using ALTQ, queueing to an intermediate mp_ring
  * is redundant/unnecessary, but doing so minimizes the amount of
  * ALTQ-specific code required in iflib.  It is assumed that the overhead of
  * redundantly queueing to an intermediate mp_ring is swamped by the
  * performance limitations inherent in using ALTQ.
  *
  * When ALTQ support is compiled in, all iflib drivers will use a transmit
  * routine, iflib_altq_if_transmit(), that checks if ALTQ is enabled for the
  * given interface.  If ALTQ is enabled for an interface, then all
  * transmitted packets for that interface will be submitted to the ALTQ
  * subsystem via IFQ_ENQUEUE().  We don't use the legacy if_transmit()
  * implementation because it uses IFQ_HANDOFF(), which will duplicatively
  * update stats that the iflib machinery handles, and which is sensitve to
  * the disused IFF_DRV_OACTIVE flag.  Additionally, iflib_altq_if_start()
  * will be installed as the start routine for use by ALTQ facilities that
  * need to trigger queue drains on a scheduled basis.
  *
  */
 static void
 iflib_altq_if_start(if_t ifp)
 {
 	struct ifaltq *ifq = &ifp->if_snd;
 	struct mbuf *m;
 	
 	IFQ_LOCK(ifq);
 	IFQ_DEQUEUE_NOLOCK(ifq, m);
 	while (m != NULL) {
 		iflib_if_transmit(ifp, m);
 		IFQ_DEQUEUE_NOLOCK(ifq, m);
 	}
 	IFQ_UNLOCK(ifq);
 }
 
 static int
 iflib_altq_if_transmit(if_t ifp, struct mbuf *m)
 {
 	int err;
 
 	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
 		IFQ_ENQUEUE(&ifp->if_snd, m, err);
 		if (err == 0)
 			iflib_altq_if_start(ifp);
 	} else
 		err = iflib_if_transmit(ifp, m);
 
 	return (err);
 }
 #endif /* ALTQ */
 
 static void
 iflib_if_qflush(if_t ifp)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	iflib_txq_t txq = ctx->ifc_txqs;
 	int i;
 
 	STATE_LOCK(ctx);
 	ctx->ifc_flags |= IFC_QFLUSH;
 	STATE_UNLOCK(ctx);
 	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
 		while (!(ifmp_ring_is_idle(txq->ift_br) || ifmp_ring_is_stalled(txq->ift_br)))
 			iflib_txq_check_drain(txq, 0);
 	STATE_LOCK(ctx);
 	ctx->ifc_flags &= ~IFC_QFLUSH;
 	STATE_UNLOCK(ctx);
 
 	/*
 	 * When ALTQ is enabled, this will also take care of purging the
 	 * ALTQ queue(s).
 	 */
 	if_qflush(ifp);
 }
 
 
 #define IFCAP_FLAGS (IFCAP_HWCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \
 		     IFCAP_TSO | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \
 		     IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | \
 		     IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWCSUM)
 
 static int
 iflib_if_ioctl(if_t ifp, u_long command, caddr_t data)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	struct ifreq	*ifr = (struct ifreq *)data;
 #if defined(INET) || defined(INET6)
 	struct ifaddr	*ifa = (struct ifaddr *)data;
 #endif
 	bool		avoid_reset = FALSE;
 	int		err = 0, reinit = 0, bits;
 
 	switch (command) {
 	case SIOCSIFADDR:
 #ifdef INET
 		if (ifa->ifa_addr->sa_family == AF_INET)
 			avoid_reset = TRUE;
 #endif
 #ifdef INET6
 		if (ifa->ifa_addr->sa_family == AF_INET6)
 			avoid_reset = TRUE;
 #endif
 		/*
 		** Calling init results in link renegotiation,
 		** so we avoid doing it when possible.
 		*/
 		if (avoid_reset) {
 			if_setflagbits(ifp, IFF_UP,0);
 			if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
 				reinit = 1;
 #ifdef INET
 			if (!(if_getflags(ifp) & IFF_NOARP))
 				arp_ifinit(ifp, ifa);
 #endif
 		} else
 			err = ether_ioctl(ifp, command, data);
 		break;
 	case SIOCSIFMTU:
 		CTX_LOCK(ctx);
 		if (ifr->ifr_mtu == if_getmtu(ifp)) {
 			CTX_UNLOCK(ctx);
 			break;
 		}
 		bits = if_getdrvflags(ifp);
 		/* stop the driver and free any clusters before proceeding */
 		iflib_stop(ctx);
 
 		if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) {
 			STATE_LOCK(ctx);
 			if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size)
 				ctx->ifc_flags |= IFC_MULTISEG;
 			else
 				ctx->ifc_flags &= ~IFC_MULTISEG;
 			STATE_UNLOCK(ctx);
 			err = if_setmtu(ifp, ifr->ifr_mtu);
 		}
 		iflib_init_locked(ctx);
 		STATE_LOCK(ctx);
 		if_setdrvflags(ifp, bits);
 		STATE_UNLOCK(ctx);
 		CTX_UNLOCK(ctx);
 		break;
 	case SIOCSIFFLAGS:
 		CTX_LOCK(ctx);
 		if (if_getflags(ifp) & IFF_UP) {
 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 				if ((if_getflags(ifp) ^ ctx->ifc_if_flags) &
 				    (IFF_PROMISC | IFF_ALLMULTI)) {
 					err = IFDI_PROMISC_SET(ctx, if_getflags(ifp));
 				}
 			} else
 				reinit = 1;
 		} else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 			iflib_stop(ctx);
 		}
 		ctx->ifc_if_flags = if_getflags(ifp);
 		CTX_UNLOCK(ctx);
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 			CTX_LOCK(ctx);
 			IFDI_INTR_DISABLE(ctx);
 			IFDI_MULTI_SET(ctx);
 			IFDI_INTR_ENABLE(ctx);
 			CTX_UNLOCK(ctx);
 		}
 		break;
 	case SIOCSIFMEDIA:
 		CTX_LOCK(ctx);
 		IFDI_MEDIA_SET(ctx);
 		CTX_UNLOCK(ctx);
 		/* falls thru */
 	case SIOCGIFMEDIA:
 	case SIOCGIFXMEDIA:
 		err = ifmedia_ioctl(ifp, ifr, &ctx->ifc_media, command);
 		break;
 	case SIOCGI2C:
 	{
 		struct ifi2creq i2c;
 
 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
 		if (err != 0)
 			break;
 		if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
 			err = EINVAL;
 			break;
 		}
 		if (i2c.len > sizeof(i2c.data)) {
 			err = EINVAL;
 			break;
 		}
 
 		if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0)
 			err = copyout(&i2c, ifr_data_get_ptr(ifr),
 			    sizeof(i2c));
 		break;
 	}
 	case SIOCSIFCAP:
 	{
 		int mask, setmask, oldmask;
 
 		oldmask = if_getcapenable(ifp);
 		mask = ifr->ifr_reqcap ^ oldmask;
 		mask &= ctx->ifc_softc_ctx.isc_capabilities;
 		setmask = 0;
 #ifdef TCP_OFFLOAD
 		setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6);
 #endif
 		setmask |= (mask & IFCAP_FLAGS);
 		setmask |= (mask & IFCAP_WOL);
 
 		/*
 		 * If any RX csum has changed, change all the ones that
 		 * are supported by the driver.
 		 */
 		if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
 			setmask |= ctx->ifc_softc_ctx.isc_capabilities &
 			    (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6);
 		}
 
 		/*
 		 * want to ensure that traffic has stopped before we change any of the flags
 		 */
 		if (setmask) {
 			CTX_LOCK(ctx);
 			bits = if_getdrvflags(ifp);
 			if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
 				iflib_stop(ctx);
 			STATE_LOCK(ctx);
 			if_togglecapenable(ifp, setmask);
 			STATE_UNLOCK(ctx);
 			if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
 				iflib_init_locked(ctx);
 			STATE_LOCK(ctx);
 			if_setdrvflags(ifp, bits);
 			STATE_UNLOCK(ctx);
 			CTX_UNLOCK(ctx);
 		}
 		if_vlancap(ifp);
 		break;
 	}
 	case SIOCGPRIVATE_0:
 	case SIOCSDRVSPEC:
 	case SIOCGDRVSPEC:
 		CTX_LOCK(ctx);
 		err = IFDI_PRIV_IOCTL(ctx, command, data);
 		CTX_UNLOCK(ctx);
 		break;
 	default:
 		err = ether_ioctl(ifp, command, data);
 		break;
 	}
 	if (reinit)
 		iflib_if_init(ctx);
 	return (err);
 }
 
 static uint64_t
 iflib_if_get_counter(if_t ifp, ift_counter cnt)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 
 	return (IFDI_GET_COUNTER(ctx, cnt));
 }
 
 /*********************************************************************
  *
  *  OTHER FUNCTIONS EXPORTED TO THE STACK
  *
  **********************************************************************/
 
 static void
 iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 
 	if ((void *)ctx != arg)
 		return;
 
 	if ((vtag == 0) || (vtag > 4095))
 		return;
 
 	CTX_LOCK(ctx);
 	IFDI_VLAN_REGISTER(ctx, vtag);
 	/* Re-init to load the changes */
 	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER)
 		iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 static void
 iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 
 	if ((void *)ctx != arg)
 		return;
 
 	if ((vtag == 0) || (vtag > 4095))
 		return;
 
 	CTX_LOCK(ctx);
 	IFDI_VLAN_UNREGISTER(ctx, vtag);
 	/* Re-init to load the changes */
 	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER)
 		iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 static void
 iflib_led_func(void *arg, int onoff)
 {
 	if_ctx_t ctx = arg;
 
 	CTX_LOCK(ctx);
 	IFDI_LED_FUNC(ctx, onoff);
 	CTX_UNLOCK(ctx);
 }
 
 /*********************************************************************
  *
  *  BUS FUNCTION DEFINITIONS
  *
  **********************************************************************/
 
 int
 iflib_device_probe(device_t dev)
 {
 	pci_vendor_info_t *ent;
 
 	uint16_t	pci_vendor_id, pci_device_id;
 	uint16_t	pci_subvendor_id, pci_subdevice_id;
 	uint16_t	pci_rev_id;
 	if_shared_ctx_t sctx;
 
 	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
 		return (ENOTSUP);
 
 	pci_vendor_id = pci_get_vendor(dev);
 	pci_device_id = pci_get_device(dev);
 	pci_subvendor_id = pci_get_subvendor(dev);
 	pci_subdevice_id = pci_get_subdevice(dev);
 	pci_rev_id = pci_get_revid(dev);
 	if (sctx->isc_parse_devinfo != NULL)
 		sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id);
 
 	ent = sctx->isc_vendor_info;
 	while (ent->pvi_vendor_id != 0) {
 		if (pci_vendor_id != ent->pvi_vendor_id) {
 			ent++;
 			continue;
 		}
 		if ((pci_device_id == ent->pvi_device_id) &&
 		    ((pci_subvendor_id == ent->pvi_subvendor_id) ||
 		     (ent->pvi_subvendor_id == 0)) &&
 		    ((pci_subdevice_id == ent->pvi_subdevice_id) ||
 		     (ent->pvi_subdevice_id == 0)) &&
 		    ((pci_rev_id == ent->pvi_rev_id) ||
 		     (ent->pvi_rev_id == 0))) {
 
 			device_set_desc_copy(dev, ent->pvi_name);
 			/* this needs to be changed to zero if the bus probing code
 			 * ever stops re-probing on best match because the sctx
 			 * may have its values over written by register calls
 			 * in subsequent probes
 			 */
 			return (BUS_PROBE_DEFAULT);
 		}
 		ent++;
 	}
 	return (ENXIO);
 }
 
 static void
 iflib_reset_qvalues(if_ctx_t ctx)
 {
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	device_t dev = ctx->ifc_dev;
 	int i;
 
 	scctx->isc_txrx_budget_bytes_max = IFLIB_MAX_TX_BYTES;
 	scctx->isc_tx_qdepth = IFLIB_DEFAULT_TX_QDEPTH;
 	/*
 	 * XXX sanity check that ntxd & nrxd are a power of 2
 	 */
 	if (ctx->ifc_sysctl_ntxqs != 0)
 		scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs;
 	if (ctx->ifc_sysctl_nrxqs != 0)
 		scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs;
 
 	for (i = 0; i < sctx->isc_ntxqs; i++) {
 		if (ctx->ifc_sysctl_ntxds[i] != 0)
 			scctx->isc_ntxd[i] = ctx->ifc_sysctl_ntxds[i];
 		else
 			scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
 	}
 
 	for (i = 0; i < sctx->isc_nrxqs; i++) {
 		if (ctx->ifc_sysctl_nrxds[i] != 0)
 			scctx->isc_nrxd[i] = ctx->ifc_sysctl_nrxds[i];
 		else
 			scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
 	}
 
 	for (i = 0; i < sctx->isc_nrxqs; i++) {
 		if (scctx->isc_nrxd[i] < sctx->isc_nrxd_min[i]) {
 			device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n",
 				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_min[i]);
 			scctx->isc_nrxd[i] = sctx->isc_nrxd_min[i];
 		}
 		if (scctx->isc_nrxd[i] > sctx->isc_nrxd_max[i]) {
 			device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n",
 				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_max[i]);
 			scctx->isc_nrxd[i] = sctx->isc_nrxd_max[i];
 		}
 	}
 
 	for (i = 0; i < sctx->isc_ntxqs; i++) {
 		if (scctx->isc_ntxd[i] < sctx->isc_ntxd_min[i]) {
 			device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n",
 				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_min[i]);
 			scctx->isc_ntxd[i] = sctx->isc_ntxd_min[i];
 		}
 		if (scctx->isc_ntxd[i] > sctx->isc_ntxd_max[i]) {
 			device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n",
 				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_max[i]);
 			scctx->isc_ntxd[i] = sctx->isc_ntxd_max[i];
 		}
 	}
 }
 
 int
 iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp)
 {
 	int err, rid, msix;
 	if_ctx_t ctx;
 	if_t ifp;
 	if_softc_ctx_t scctx;
 	int i;
 	uint16_t main_txq;
 	uint16_t main_rxq;
 
 
 	ctx = malloc(sizeof(* ctx), M_IFLIB, M_WAITOK|M_ZERO);
 
 	if (sc == NULL) {
 		sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
 		device_set_softc(dev, ctx);
 		ctx->ifc_flags |= IFC_SC_ALLOCATED;
 	}
 
 	ctx->ifc_sctx = sctx;
 	ctx->ifc_dev = dev;
 	ctx->ifc_softc = sc;
 
 	if ((err = iflib_register(ctx)) != 0) {
 		device_printf(dev, "iflib_register failed %d\n", err);
 		goto fail_ctx_free;
 	}
 	iflib_add_device_sysctl_pre(ctx);
 
 	scctx = &ctx->ifc_softc_ctx;
 	ifp = ctx->ifc_ifp;
 
 	iflib_reset_qvalues(ctx);
 	CTX_LOCK(ctx);
 	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
 		goto fail_unlock;
 	}
 	_iflib_pre_assert(scctx);
 	ctx->ifc_txrx = *scctx->isc_txrx;
 
 #ifdef INVARIANTS
 	MPASS(scctx->isc_capabilities);
 	if (scctx->isc_capabilities & IFCAP_TXCSUM)
 		MPASS(scctx->isc_tx_csum_flags);
 #endif
 
 	if_setcapabilities(ifp, scctx->isc_capabilities | IFCAP_HWSTATS);
 	if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS);
 
 	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
 		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
 	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
 		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
 
 	main_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0;
 	main_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0;
 
 	/* XXX change for per-queue sizes */
 	device_printf(dev, "Using %d tx descriptors and %d rx descriptors\n",
 	    scctx->isc_ntxd[main_txq], scctx->isc_nrxd[main_rxq]);
 	for (i = 0; i < sctx->isc_nrxqs; i++) {
 		if (!powerof2(scctx->isc_nrxd[i])) {
 			/* round down instead? */
 			device_printf(dev, "# rx descriptors must be a power of 2\n");
 			err = EINVAL;
 			goto fail_iflib_detach;
 		}
 	}
 	for (i = 0; i < sctx->isc_ntxqs; i++) {
 		if (!powerof2(scctx->isc_ntxd[i])) {
 			device_printf(dev,
 			    "# tx descriptors must be a power of 2");
 			err = EINVAL;
 			goto fail_iflib_detach;
 		}
 	}
 
 	if (scctx->isc_tx_nsegments > scctx->isc_ntxd[main_txq] /
 	    MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_nsegments = max(1, scctx->isc_ntxd[main_txq] /
 		    MAX_SINGLE_PACKET_FRACTION);
 	if (scctx->isc_tx_tso_segments_max > scctx->isc_ntxd[main_txq] /
 	    MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_tso_segments_max = max(1,
 		    scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION);
 
 	/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
 	if (if_getcapabilities(ifp) & IFCAP_TSO) {
 		/*
 		 * The stack can't handle a TSO size larger than IP_MAXPACKET,
 		 * but some MACs do.
 		 */
 		if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
 		    IP_MAXPACKET));
 		/*
 		 * Take maximum number of m_pullup(9)'s in iflib_parse_header()
 		 * into account.  In the worst case, each of these calls will
 		 * add another mbuf and, thus, the requirement for another DMA
 		 * segment.  So for best performance, it doesn't make sense to
 		 * advertize a maximum of TSO segments that typically will
 		 * require defragmentation in iflib_encap().
 		 */
 		if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
 		if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
 	}
 	if (scctx->isc_rss_table_size == 0)
 		scctx->isc_rss_table_size = 64;
 	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
 
 	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
 	/* XXX format name */
 	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx,
 	    NULL, NULL, "admin");
 
 	/* Set up cpu set.  If it fails, use the set of all CPUs. */
 	if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) {
 		device_printf(dev, "Unable to fetch CPU list\n");
 		CPU_COPY(&all_cpus, &ctx->ifc_cpus);
 	}
 	MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0);
 
 	/*
 	** Now set up MSI or MSI-X, should return us the number of supported
 	** vectors (will be 1 for a legacy interrupt and MSI).
 	*/
 	if (sctx->isc_flags & IFLIB_SKIP_MSIX) {
 		msix = scctx->isc_vectors;
 	} else if (scctx->isc_msix_bar != 0)
 	       /*
 		* The simple fact that isc_msix_bar is not 0 does not mean we
 		* we have a good value there that is known to work.
 		*/
 		msix = iflib_msix_init(ctx);
 	else {
 		scctx->isc_vectors = 1;
 		scctx->isc_ntxqsets = 1;
 		scctx->isc_nrxqsets = 1;
 		scctx->isc_intr = IFLIB_INTR_LEGACY;
 		msix = 0;
 	}
 	/* Get memory for the station queues */
 	if ((err = iflib_queues_alloc(ctx))) {
 		device_printf(dev, "Unable to allocate queue memory\n");
 		goto fail_intr_free;
 	}
 
 	if ((err = iflib_qset_structures_setup(ctx)))
 		goto fail_queues;
 
 	/*
 	 * Group taskqueues aren't properly set up until SMP is started,
 	 * so we disable interrupts until we can handle them post
 	 * SI_SUB_SMP.
 	 *
 	 * XXX: disabling interrupts doesn't actually work, at least for
 	 * the non-MSI case.  When they occur before SI_SUB_SMP completes,
 	 * we do null handling and depend on this not causing too large an
 	 * interrupt storm.
 	 */
 	IFDI_INTR_DISABLE(ctx);
 	if (msix > 1 && (err = IFDI_MSIX_INTR_ASSIGN(ctx, msix)) != 0) {
 		device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n", err);
 		goto fail_queues;
 	}
 	if (msix <= 1) {
 		rid = 0;
 		if (scctx->isc_intr == IFLIB_INTR_MSI) {
 			MPASS(msix == 1);
 			rid = 1;
 		}
 		if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) {
 			device_printf(dev, "iflib_legacy_setup failed %d\n", err);
 			goto fail_queues;
 		}
 	}
 
 	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac);
 
 	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
 		goto fail_detach;
 	}
 
 	/*
 	 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
 	 * This must appear after the call to ether_ifattach() because
 	 * ether_ifattach() sets if_hdrlen to the default value.
 	 */
 	if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
 		if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
 
 	if ((err = iflib_netmap_attach(ctx))) {
 		device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err);
 		goto fail_detach;
 	}
 	*ctxp = ctx;
 
 	NETDUMP_SET(ctx->ifc_ifp, iflib);
 
 	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
 	iflib_add_device_sysctl_post(ctx);
 	ctx->ifc_flags |= IFC_INIT_DONE;
 	CTX_UNLOCK(ctx);
 	return (0);
 
 fail_detach:
 	ether_ifdetach(ctx->ifc_ifp);
 fail_intr_free:
 	iflib_free_intr_mem(ctx);
 fail_queues:
 	iflib_tx_structures_free(ctx);
 	iflib_rx_structures_free(ctx);
 fail_iflib_detach:
 	IFDI_DETACH(ctx);
 fail_unlock:
 	CTX_UNLOCK(ctx);
 fail_ctx_free:
         if (ctx->ifc_flags & IFC_SC_ALLOCATED)
                 free(ctx->ifc_softc, M_IFLIB);
         free(ctx, M_IFLIB);
 	return (err);
 }
 
 int
 iflib_pseudo_register(device_t dev, if_shared_ctx_t sctx, if_ctx_t *ctxp,
 					  struct iflib_cloneattach_ctx *clctx)
 {
 	int err;
 	if_ctx_t ctx;
 	if_t ifp;
 	if_softc_ctx_t scctx;
 	int i;
 	void *sc;
 	uint16_t main_txq;
 	uint16_t main_rxq;
 
 	ctx = malloc(sizeof(*ctx), M_IFLIB, M_WAITOK|M_ZERO);
 	sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
 	ctx->ifc_flags |= IFC_SC_ALLOCATED;
 	if (sctx->isc_flags & (IFLIB_PSEUDO|IFLIB_VIRTUAL))
 		ctx->ifc_flags |= IFC_PSEUDO;
 
 	ctx->ifc_sctx = sctx;
 	ctx->ifc_softc = sc;
 	ctx->ifc_dev = dev;
 
 	if ((err = iflib_register(ctx)) != 0) {
 		device_printf(dev, "%s: iflib_register failed %d\n", __func__, err);
 		goto fail_ctx_free;
 	}
 	iflib_add_device_sysctl_pre(ctx);
 
 	scctx = &ctx->ifc_softc_ctx;
 	ifp = ctx->ifc_ifp;
 
 	/*
 	 * XXX sanity check that ntxd & nrxd are a power of 2
 	 */
 	iflib_reset_qvalues(ctx);
-
+	CTX_LOCK(ctx);
 	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
-		goto fail_ctx_free;
+		goto fail_unlock;
 	}
 	if (sctx->isc_flags & IFLIB_GEN_MAC)
 		iflib_gen_mac(ctx);
 	if ((err = IFDI_CLONEATTACH(ctx, clctx->cc_ifc, clctx->cc_name,
 								clctx->cc_params)) != 0) {
 		device_printf(dev, "IFDI_CLONEATTACH failed %d\n", err);
 		goto fail_ctx_free;
 	}
 	ifmedia_add(&ctx->ifc_media, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL);
 	ifmedia_add(&ctx->ifc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
 	ifmedia_set(&ctx->ifc_media, IFM_ETHER | IFM_AUTO);
 
 #ifdef INVARIANTS
 	MPASS(scctx->isc_capabilities);
 	if (scctx->isc_capabilities & IFCAP_TXCSUM)
 		MPASS(scctx->isc_tx_csum_flags);
 #endif
 
 	if_setcapabilities(ifp, scctx->isc_capabilities | IFCAP_HWSTATS | IFCAP_LINKSTATE);
 	if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_LINKSTATE);
 
 	ifp->if_flags |= IFF_NOGROUP;
 	if (sctx->isc_flags & IFLIB_PSEUDO) {
 		ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac);
 
 		if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
 			device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
 			goto fail_detach;
 		}
 		*ctxp = ctx;
 
 		/*
 		 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
 		 * This must appear after the call to ether_ifattach() because
 		 * ether_ifattach() sets if_hdrlen to the default value.
 		 */
 		if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
 			if_setifheaderlen(ifp,
 			    sizeof(struct ether_vlan_header));
 
 		if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
 		iflib_add_device_sysctl_post(ctx);
 		ctx->ifc_flags |= IFC_INIT_DONE;
 		return (0);
 	}
 	_iflib_pre_assert(scctx);
 	ctx->ifc_txrx = *scctx->isc_txrx;
 
 	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
 		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
 	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
 		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
 
 	main_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0;
 	main_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0;
 
 	/* XXX change for per-queue sizes */
 	device_printf(dev, "Using %d tx descriptors and %d rx descriptors\n",
 	    scctx->isc_ntxd[main_txq], scctx->isc_nrxd[main_rxq]);
 	for (i = 0; i < sctx->isc_nrxqs; i++) {
 		if (!powerof2(scctx->isc_nrxd[i])) {
 			/* round down instead? */
 			device_printf(dev, "# rx descriptors must be a power of 2\n");
 			err = EINVAL;
 			goto fail_iflib_detach;
 		}
 	}
 	for (i = 0; i < sctx->isc_ntxqs; i++) {
 		if (!powerof2(scctx->isc_ntxd[i])) {
 			device_printf(dev,
 			    "# tx descriptors must be a power of 2");
 			err = EINVAL;
 			goto fail_iflib_detach;
 		}
 	}
 
 	if (scctx->isc_tx_nsegments > scctx->isc_ntxd[main_txq] /
 	    MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_nsegments = max(1, scctx->isc_ntxd[main_txq] /
 		    MAX_SINGLE_PACKET_FRACTION);
 	if (scctx->isc_tx_tso_segments_max > scctx->isc_ntxd[main_txq] /
 	    MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_tso_segments_max = max(1,
 		    scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION);
 
 	/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
 	if (if_getcapabilities(ifp) & IFCAP_TSO) {
 		/*
 		 * The stack can't handle a TSO size larger than IP_MAXPACKET,
 		 * but some MACs do.
 		 */
 		if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
 		    IP_MAXPACKET));
 		/*
 		 * Take maximum number of m_pullup(9)'s in iflib_parse_header()
 		 * into account.  In the worst case, each of these calls will
 		 * add another mbuf and, thus, the requirement for another DMA
 		 * segment.  So for best performance, it doesn't make sense to
 		 * advertize a maximum of TSO segments that typically will
 		 * require defragmentation in iflib_encap().
 		 */
 		if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
 		if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
 	}
 	if (scctx->isc_rss_table_size == 0)
 		scctx->isc_rss_table_size = 64;
 	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
 
 	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
 	/* XXX format name */
 	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx,
 	    NULL, NULL, "admin");
 
 	/* XXX --- can support > 1 -- but keep it simple for now */
 	scctx->isc_intr = IFLIB_INTR_LEGACY;
 
 	/* Get memory for the station queues */
 	if ((err = iflib_queues_alloc(ctx))) {
 		device_printf(dev, "Unable to allocate queue memory\n");
 		goto fail_iflib_detach;
 	}
 
 	if ((err = iflib_qset_structures_setup(ctx))) {
 		device_printf(dev, "qset structure setup failed %d\n", err);
 		goto fail_queues;
 	}
 
 	/*
 	 * XXX What if anything do we want to do about interrupts?
 	 */
 	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac);
 	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
 		goto fail_detach;
 	}
 
 	/*
 	 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
 	 * This must appear after the call to ether_ifattach() because
 	 * ether_ifattach() sets if_hdrlen to the default value.
 	 */
 	if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
 		if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
 
 	/* XXX handle more than one queue */
 	for (i = 0; i < scctx->isc_nrxqsets; i++)
 		IFDI_RX_CLSET(ctx, 0, i, ctx->ifc_rxqs[i].ifr_fl[0].ifl_sds.ifsd_cl);
 
 	*ctxp = ctx;
 
 	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
 	iflib_add_device_sysctl_post(ctx);
 	ctx->ifc_flags |= IFC_INIT_DONE;
+	CTX_UNLOCK(ctx);
 	return (0);
 fail_detach:
 	ether_ifdetach(ctx->ifc_ifp);
 fail_queues:
 	iflib_tx_structures_free(ctx);
 	iflib_rx_structures_free(ctx);
 fail_iflib_detach:
 	IFDI_DETACH(ctx);
+fail_unlock:
+	CTX_UNLOCK(ctx);
 fail_ctx_free:
 	free(ctx->ifc_softc, M_IFLIB);
 	free(ctx, M_IFLIB);
 	return (err);
 }
 
 int
 iflib_pseudo_deregister(if_ctx_t ctx)
 {
 	if_t ifp = ctx->ifc_ifp;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	int i, j;
 	struct taskqgroup *tqg;
 	iflib_fl_t fl;
 
 	/* Unregister VLAN events */
 	if (ctx->ifc_vlan_attach_event != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event);
 	if (ctx->ifc_vlan_detach_event != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event);
 
 	ether_ifdetach(ifp);
 	/* ether_ifdetach calls if_qflush - lock must be destroy afterwards*/
 	CTX_LOCK_DESTROY(ctx);
 	/* XXX drain any dependent tasks */
 	tqg = qgroup_if_io_tqg;
 	for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
 		callout_drain(&txq->ift_timer);
 		if (txq->ift_task.gt_uniq != NULL)
 			taskqgroup_detach(tqg, &txq->ift_task);
 	}
 	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
 		if (rxq->ifr_task.gt_uniq != NULL)
 			taskqgroup_detach(tqg, &rxq->ifr_task);
 
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
 			free(fl->ifl_rx_bitmap, M_IFLIB);
 	}
 	tqg = qgroup_if_config_tqg;
 	if (ctx->ifc_admin_task.gt_uniq != NULL)
 		taskqgroup_detach(tqg, &ctx->ifc_admin_task);
 	if (ctx->ifc_vflr_task.gt_uniq != NULL)
 		taskqgroup_detach(tqg, &ctx->ifc_vflr_task);
 
 	if_free(ifp);
 
 	iflib_tx_structures_free(ctx);
 	iflib_rx_structures_free(ctx);
 	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
 		free(ctx->ifc_softc, M_IFLIB);
 	free(ctx, M_IFLIB);
 	return (0);
 }
 
 int
 iflib_device_attach(device_t dev)
 {
 	if_ctx_t ctx;
 	if_shared_ctx_t sctx;
 
 	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
 		return (ENOTSUP);
 
 	pci_enable_busmaster(dev);
 
 	return (iflib_device_register(dev, NULL, sctx, &ctx));
 }
 
 int
 iflib_device_deregister(if_ctx_t ctx)
 {
 	if_t ifp = ctx->ifc_ifp;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	device_t dev = ctx->ifc_dev;
 	int i, j;
 	struct taskqgroup *tqg;
 	iflib_fl_t fl;
 
 	/* Make sure VLANS are not using driver */
 	if (if_vlantrunkinuse(ifp)) {
 		device_printf(dev, "Vlan in use, detach first\n");
 		return (EBUSY);
 	}
 #ifdef PCI_IOV
 	if (!CTX_IS_VF(ctx) && pci_iov_detach(dev) != 0) {
 		device_printf(dev, "SR-IOV in use; detach first.\n");
 		return (EBUSY);
 	}
 #endif
 
 	STATE_LOCK(ctx);
 	ctx->ifc_flags |= IFC_IN_DETACH;
 	STATE_UNLOCK(ctx);
 
 	CTX_LOCK(ctx);
 	iflib_stop(ctx);
 	CTX_UNLOCK(ctx);
 
 	/* Unregister VLAN events */
 	if (ctx->ifc_vlan_attach_event != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event);
 	if (ctx->ifc_vlan_detach_event != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event);
 
 	iflib_netmap_detach(ifp);
 	ether_ifdetach(ifp);
 	if (ctx->ifc_led_dev != NULL)
 		led_destroy(ctx->ifc_led_dev);
 	/* XXX drain any dependent tasks */
 	tqg = qgroup_if_io_tqg;
 	for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
 		callout_drain(&txq->ift_timer);
 		if (txq->ift_task.gt_uniq != NULL)
 			taskqgroup_detach(tqg, &txq->ift_task);
 	}
 	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
 		if (rxq->ifr_task.gt_uniq != NULL)
 			taskqgroup_detach(tqg, &rxq->ifr_task);
 
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
 			free(fl->ifl_rx_bitmap, M_IFLIB);
 	}
 	tqg = qgroup_if_config_tqg;
 	if (ctx->ifc_admin_task.gt_uniq != NULL)
 		taskqgroup_detach(tqg, &ctx->ifc_admin_task);
 	if (ctx->ifc_vflr_task.gt_uniq != NULL)
 		taskqgroup_detach(tqg, &ctx->ifc_vflr_task);
 	CTX_LOCK(ctx);
 	IFDI_DETACH(ctx);
 	CTX_UNLOCK(ctx);
 
 	/* ether_ifdetach calls if_qflush - lock must be destroy afterwards*/
 	CTX_LOCK_DESTROY(ctx);
 	device_set_softc(ctx->ifc_dev, NULL);
 	iflib_free_intr_mem(ctx);
 
 	bus_generic_detach(dev);
 	if_free(ifp);
 
 	iflib_tx_structures_free(ctx);
 	iflib_rx_structures_free(ctx);
 	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
 		free(ctx->ifc_softc, M_IFLIB);
 	STATE_LOCK_DESTROY(ctx);
 	free(ctx, M_IFLIB);
 	return (0);
 }
 
 static void
 iflib_free_intr_mem(if_ctx_t ctx)
 {
 
 	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_MSIX) {
 		iflib_irq_free(ctx, &ctx->ifc_legacy_irq);
 	}
 	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_LEGACY) {
 		pci_release_msi(ctx->ifc_dev);
 	}
 	if (ctx->ifc_msix_mem != NULL) {
 		bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY,
 		    rman_get_rid(ctx->ifc_msix_mem), ctx->ifc_msix_mem);
 		ctx->ifc_msix_mem = NULL;
 	}
 }
 
 int
 iflib_device_detach(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 
 	return (iflib_device_deregister(ctx));
 }
 
 int
 iflib_device_suspend(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	IFDI_SUSPEND(ctx);
 	CTX_UNLOCK(ctx);
 
 	return bus_generic_suspend(dev);
 }
 int
 iflib_device_shutdown(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	IFDI_SHUTDOWN(ctx);
 	CTX_UNLOCK(ctx);
 
 	return bus_generic_suspend(dev);
 }
 
 
 int
 iflib_device_resume(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 	iflib_txq_t txq = ctx->ifc_txqs;
 
 	CTX_LOCK(ctx);
 	IFDI_RESUME(ctx);
 	iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 	for (int i = 0; i < NTXQSETS(ctx); i++, txq++)
 		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
 
 	return (bus_generic_resume(dev));
 }
 
 int
 iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params)
 {
 	int error;
 	if_ctx_t ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	error = IFDI_IOV_INIT(ctx, num_vfs, params);
 	CTX_UNLOCK(ctx);
 
 	return (error);
 }
 
 void
 iflib_device_iov_uninit(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	IFDI_IOV_UNINIT(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 int
 iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params)
 {
 	int error;
 	if_ctx_t ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	error = IFDI_IOV_VF_ADD(ctx, vfnum, params);
 	CTX_UNLOCK(ctx);
 
 	return (error);
 }
 
 /*********************************************************************
  *
  *  MODULE FUNCTION DEFINITIONS
  *
  **********************************************************************/
 
 /*
  * - Start a fast taskqueue thread for each core
  * - Start a taskqueue for control operations
  */
 static int
 iflib_module_init(void)
 {
 	return (0);
 }
 
 static int
 iflib_module_event_handler(module_t mod, int what, void *arg)
 {
 	int err;
 
 	switch (what) {
 	case MOD_LOAD:
 		if ((err = iflib_module_init()) != 0)
 			return (err);
 		break;
 	case MOD_UNLOAD:
 		return (EBUSY);
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	return (0);
 }
 
 /*********************************************************************
  *
  *  PUBLIC FUNCTION DEFINITIONS
  *     ordered as in iflib.h
  *
  **********************************************************************/
 
 
 static void
 _iflib_assert(if_shared_ctx_t sctx)
 {
 	MPASS(sctx->isc_tx_maxsize);
 	MPASS(sctx->isc_tx_maxsegsize);
 
 	MPASS(sctx->isc_rx_maxsize);
 	MPASS(sctx->isc_rx_nsegments);
 	MPASS(sctx->isc_rx_maxsegsize);
 
 	MPASS(sctx->isc_nrxd_min[0]);
 	MPASS(sctx->isc_nrxd_max[0]);
 	MPASS(sctx->isc_nrxd_default[0]);
 	MPASS(sctx->isc_ntxd_min[0]);
 	MPASS(sctx->isc_ntxd_max[0]);
 	MPASS(sctx->isc_ntxd_default[0]);
 }
 
 static void
 _iflib_pre_assert(if_softc_ctx_t scctx)
 {
 
 	MPASS(scctx->isc_txrx->ift_txd_encap);
 	MPASS(scctx->isc_txrx->ift_txd_flush);
 	MPASS(scctx->isc_txrx->ift_txd_credits_update);
 	MPASS(scctx->isc_txrx->ift_rxd_available);
 	MPASS(scctx->isc_txrx->ift_rxd_pkt_get);
 	MPASS(scctx->isc_txrx->ift_rxd_refill);
 	MPASS(scctx->isc_txrx->ift_rxd_flush);
 }
 
 static int
 iflib_register(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	driver_t *driver = sctx->isc_driver;
 	device_t dev = ctx->ifc_dev;
 	if_t ifp;
 
 	_iflib_assert(sctx);
 
 	CTX_LOCK_INIT(ctx);
 	STATE_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev));
 	ifp = ctx->ifc_ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		device_printf(dev, "can not allocate ifnet structure\n");
 		return (ENOMEM);
 	}
 
 	/*
 	 * Initialize our context's device specific methods
 	 */
 	kobj_init((kobj_t) ctx, (kobj_class_t) driver);
 	kobj_class_compile((kobj_class_t) driver);
 	driver->refs++;
 
 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 	if_setsoftc(ifp, ctx);
 	if_setdev(ifp, dev);
 	if_setinitfn(ifp, iflib_if_init);
 	if_setioctlfn(ifp, iflib_if_ioctl);
 #ifdef ALTQ
 	if_setstartfn(ifp, iflib_altq_if_start);
 	if_settransmitfn(ifp, iflib_altq_if_transmit);
 	if_setsendqready(ifp);
 #else
 	if_settransmitfn(ifp, iflib_if_transmit);
 #endif
 	if_setqflushfn(ifp, iflib_if_qflush);
 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
 
 	ctx->ifc_vlan_attach_event =
 		EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx,
 							  EVENTHANDLER_PRI_FIRST);
 	ctx->ifc_vlan_detach_event =
 		EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx,
 							  EVENTHANDLER_PRI_FIRST);
 
 	ifmedia_init(&ctx->ifc_media, IFM_IMASK,
 					 iflib_media_change, iflib_media_status);
 
 	return (0);
 }
 
 
 static int
 iflib_queues_alloc(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	int nrxqsets = scctx->isc_nrxqsets;
 	int ntxqsets = scctx->isc_ntxqsets;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	iflib_fl_t fl = NULL;
 	int i, j, cpu, err, txconf, rxconf;
 	iflib_dma_info_t ifdip;
 	uint32_t *rxqsizes = scctx->isc_rxqsizes;
 	uint32_t *txqsizes = scctx->isc_txqsizes;
 	uint8_t nrxqs = sctx->isc_nrxqs;
 	uint8_t ntxqs = sctx->isc_ntxqs;
 	int nfree_lists = sctx->isc_nfl ? sctx->isc_nfl : 1;
 	caddr_t *vaddrs;
 	uint64_t *paddrs;
 
 	KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1"));
 	KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1"));
 
 	/* Allocate the TX ring struct memory */
 	if (!(ctx->ifc_txqs =
 	    (iflib_txq_t) malloc(sizeof(struct iflib_txq) *
 	    ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate TX ring memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 
 	/* Now allocate the RX */
 	if (!(ctx->ifc_rxqs =
 	    (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) *
 	    nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate RX ring memory\n");
 		err = ENOMEM;
 		goto rx_fail;
 	}
 
 	txq = ctx->ifc_txqs;
 	rxq = ctx->ifc_rxqs;
 
 	/*
 	 * XXX handle allocation failure
 	 */
 	for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) {
 		/* Set up some basics */
 
 		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs,
 		    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
 			device_printf(dev,
 			    "Unable to allocate TX DMA info memory\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 		txq->ift_ifdi = ifdip;
 		for (j = 0; j < ntxqs; j++, ifdip++) {
 			if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, 0)) {
 				device_printf(dev,
 				    "Unable to allocate TX descriptors\n");
 				err = ENOMEM;
 				goto err_tx_desc;
 			}
 			txq->ift_txd_size[j] = scctx->isc_txd_size[j];
 			bzero((void *)ifdip->idi_vaddr, txqsizes[j]);
 		}
 		txq->ift_ctx = ctx;
 		txq->ift_id = i;
 		if (sctx->isc_flags & IFLIB_HAS_TXCQ) {
 			txq->ift_br_offset = 1;
 		} else {
 			txq->ift_br_offset = 0;
 		}
 		/* XXX fix this */
 		txq->ift_timer.c_cpu = cpu;
 
 		if (iflib_txsd_alloc(txq)) {
 			device_printf(dev, "Critical Failure setting up TX buffers\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 
 		/* Initialize the TX lock */
 		snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:tx(%d):callout",
 		    device_get_nameunit(dev), txq->ift_id);
 		mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF);
 		callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0);
 
 		snprintf(txq->ift_db_mtx_name, MTX_NAME_LEN, "%s:tx(%d):db",
 			 device_get_nameunit(dev), txq->ift_id);
 
 		err = ifmp_ring_alloc(&txq->ift_br, 2048, txq, iflib_txq_drain,
 				      iflib_txq_can_drain, M_IFLIB, M_WAITOK);
 		if (err) {
 			/* XXX free any allocated rings */
 			device_printf(dev, "Unable to allocate buf_ring\n");
 			goto err_tx_desc;
 		}
 	}
 
 	for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) {
 		/* Set up some basics */
 
 		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs,
 		   M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
 			device_printf(dev,
 			    "Unable to allocate RX DMA info memory\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 
 		rxq->ifr_ifdi = ifdip;
 		/* XXX this needs to be changed if #rx queues != #tx queues */
 		rxq->ifr_ntxqirq = 1;
 		rxq->ifr_txqid[0] = i;
 		for (j = 0; j < nrxqs; j++, ifdip++) {
 			if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, 0)) {
 				device_printf(dev,
 				    "Unable to allocate RX descriptors\n");
 				err = ENOMEM;
 				goto err_tx_desc;
 			}
 			bzero((void *)ifdip->idi_vaddr, rxqsizes[j]);
 		}
 		rxq->ifr_ctx = ctx;
 		rxq->ifr_id = i;
 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 			rxq->ifr_fl_offset = 1;
 		} else {
 			rxq->ifr_fl_offset = 0;
 		}
 		rxq->ifr_nfl = nfree_lists;
 		if (!(fl =
 			  (iflib_fl_t) malloc(sizeof(struct iflib_fl) * nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) {
 			device_printf(dev, "Unable to allocate free list memory\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 		rxq->ifr_fl = fl;
 		for (j = 0; j < nfree_lists; j++) {
 			fl[j].ifl_rxq = rxq;
 			fl[j].ifl_id = j;
 			fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
 			fl[j].ifl_rxd_size = scctx->isc_rxd_size[j];
 		}
 		/* Allocate receive buffers for the ring */
 		if (iflib_rxsd_alloc(rxq)) {
 			device_printf(dev,
 			    "Critical Failure setting up receive buffers\n");
 			err = ENOMEM;
 			goto err_rx_desc;
 		}
 
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) 
 			fl->ifl_rx_bitmap = bit_alloc(fl->ifl_size, M_IFLIB,
 			    M_WAITOK);
 	}
 
 	/* TXQs */
 	vaddrs = malloc(sizeof(caddr_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
 	paddrs = malloc(sizeof(uint64_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
 	for (i = 0; i < ntxqsets; i++) {
 		iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi;
 
 		for (j = 0; j < ntxqs; j++, di++) {
 			vaddrs[i*ntxqs + j] = di->idi_vaddr;
 			paddrs[i*ntxqs + j] = di->idi_paddr;
 		}
 	}
 	if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) {
 		device_printf(ctx->ifc_dev,
 		    "Unable to allocate device TX queue\n");
 		iflib_tx_structures_free(ctx);
 		free(vaddrs, M_IFLIB);
 		free(paddrs, M_IFLIB);
 		goto err_rx_desc;
 	}
 	free(vaddrs, M_IFLIB);
 	free(paddrs, M_IFLIB);
 
 	/* RXQs */
 	vaddrs = malloc(sizeof(caddr_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
 	paddrs = malloc(sizeof(uint64_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
 	for (i = 0; i < nrxqsets; i++) {
 		iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi;
 
 		for (j = 0; j < nrxqs; j++, di++) {
 			vaddrs[i*nrxqs + j] = di->idi_vaddr;
 			paddrs[i*nrxqs + j] = di->idi_paddr;
 		}
 	}
 	if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) {
 		device_printf(ctx->ifc_dev,
 		    "Unable to allocate device RX queue\n");
 		iflib_tx_structures_free(ctx);
 		free(vaddrs, M_IFLIB);
 		free(paddrs, M_IFLIB);
 		goto err_rx_desc;
 	}
 	free(vaddrs, M_IFLIB);
 	free(paddrs, M_IFLIB);
 
 	return (0);
 
 /* XXX handle allocation failure changes */
 err_rx_desc:
 err_tx_desc:
 rx_fail:
 	if (ctx->ifc_rxqs != NULL)
 		free(ctx->ifc_rxqs, M_IFLIB);
 	ctx->ifc_rxqs = NULL;
 	if (ctx->ifc_txqs != NULL)
 		free(ctx->ifc_txqs, M_IFLIB);
 	ctx->ifc_txqs = NULL;
 fail:
 	return (err);
 }
 
 static int
 iflib_tx_structures_setup(if_ctx_t ctx)
 {
 	iflib_txq_t txq = ctx->ifc_txqs;
 	int i;
 
 	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
 		iflib_txq_setup(txq);
 
 	return (0);
 }
 
 static void
 iflib_tx_structures_free(if_ctx_t ctx)
 {
 	iflib_txq_t txq = ctx->ifc_txqs;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	int i, j;
 
 	for (i = 0; i < NTXQSETS(ctx); i++, txq++) {
 		iflib_txq_destroy(txq);
 		for (j = 0; j < sctx->isc_ntxqs; j++)
 			iflib_dma_free(&txq->ift_ifdi[j]);
 	}
 	free(ctx->ifc_txqs, M_IFLIB);
 	ctx->ifc_txqs = NULL;
 	IFDI_QUEUES_FREE(ctx);
 }
 
 /*********************************************************************
  *
  *  Initialize all receive rings.
  *
  **********************************************************************/
 static int
 iflib_rx_structures_setup(if_ctx_t ctx)
 {
 	iflib_rxq_t rxq = ctx->ifc_rxqs;
 	int q;
 #if defined(INET6) || defined(INET)
 	int i, err;
 #endif
 
 	for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++, rxq++) {
 #if defined(INET6) || defined(INET)
 		tcp_lro_free(&rxq->ifr_lc);
 		if ((err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp,
 		    TCP_LRO_ENTRIES, min(1024,
 		    ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]))) != 0) {
 			device_printf(ctx->ifc_dev, "LRO Initialization failed!\n");
 			goto fail;
 		}
 		rxq->ifr_lro_enabled = TRUE;
 #endif
 		IFDI_RXQ_SETUP(ctx, rxq->ifr_id);
 	}
 	return (0);
 #if defined(INET6) || defined(INET)
 fail:
 	/*
 	 * Free RX software descriptors allocated so far, we will only handle
 	 * the rings that completed, the failing case will have
 	 * cleaned up for itself. 'q' failed, so its the terminus.
 	 */
 	rxq = ctx->ifc_rxqs;
 	for (i = 0; i < q; ++i, rxq++) {
 		iflib_rx_sds_free(rxq);
 		rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0;
 	}
 	return (err);
 #endif
 }
 
 /*********************************************************************
  *
  *  Free all receive rings.
  *
  **********************************************************************/
 static void
 iflib_rx_structures_free(if_ctx_t ctx)
 {
 	iflib_rxq_t rxq = ctx->ifc_rxqs;
 
 	for (int i = 0; i < ctx->ifc_softc_ctx.isc_nrxqsets; i++, rxq++) {
 		iflib_rx_sds_free(rxq);
 	}
 	free(ctx->ifc_rxqs, M_IFLIB);
 	ctx->ifc_rxqs = NULL;
 }
 
 static int
 iflib_qset_structures_setup(if_ctx_t ctx)
 {
 	int err;
 
 	/*
 	 * It is expected that the caller takes care of freeing queues if this
 	 * fails.
 	 */
 	if ((err = iflib_tx_structures_setup(ctx)) != 0) {
 		device_printf(ctx->ifc_dev, "iflib_tx_structures_setup failed: %d\n", err);
 		return (err);
 	}
 
 	if ((err = iflib_rx_structures_setup(ctx)) != 0)
 		device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err);
 
 	return (err);
 }
 
 int
 iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
 		driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, const char *name)
 {
 
 	return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
 }
 
 #ifdef SMP
 static int
 find_nth(if_ctx_t ctx, int qid)
 {
 	cpuset_t cpus;
 	int i, cpuid, eqid, count;
 
 	CPU_COPY(&ctx->ifc_cpus, &cpus);
 	count = CPU_COUNT(&cpus);
 	eqid = qid % count;
 	/* clear up to the qid'th bit */
 	for (i = 0; i < eqid; i++) {
 		cpuid = CPU_FFS(&cpus);
 		MPASS(cpuid != 0);
 		CPU_CLR(cpuid-1, &cpus);
 	}
 	cpuid = CPU_FFS(&cpus);
 	MPASS(cpuid != 0);
 	return (cpuid-1);
 }
 
 #ifdef SCHED_ULE
 extern struct cpu_group *cpu_top;              /* CPU topology */
 
 static int
 find_child_with_core(int cpu, struct cpu_group *grp)
 {
 	int i;
 
 	if (grp->cg_children == 0)
 		return -1;
 
 	MPASS(grp->cg_child);
 	for (i = 0; i < grp->cg_children; i++) {
 		if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask))
 			return i;
 	}
 
 	return -1;
 }
 
 /*
  * Find the nth "close" core to the specified core
  * "close" is defined as the deepest level that shares
  * at least an L2 cache.  With threads, this will be
  * threads on the same core.  If the sahred cache is L3
  * or higher, simply returns the same core.
  */
 static int
 find_close_core(int cpu, int core_offset)
 {
 	struct cpu_group *grp;
 	int i;
 	int fcpu;
 	cpuset_t cs;
 
 	grp = cpu_top;
 	if (grp == NULL)
 		return cpu;
 	i = 0;
 	while ((i = find_child_with_core(cpu, grp)) != -1) {
 		/* If the child only has one cpu, don't descend */
 		if (grp->cg_child[i].cg_count <= 1)
 			break;
 		grp = &grp->cg_child[i];
 	}
 
 	/* If they don't share at least an L2 cache, use the same CPU */
 	if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE)
 		return cpu;
 
 	/* Now pick one */
 	CPU_COPY(&grp->cg_mask, &cs);
 
 	/* Add the selected CPU offset to core offset. */
 	for (i = 0; (fcpu = CPU_FFS(&cs)) != 0; i++) {
 		if (fcpu - 1 == cpu)
 			break;
 		CPU_CLR(fcpu - 1, &cs);
 	}
 	MPASS(fcpu);
 
 	core_offset += i;
 
 	CPU_COPY(&grp->cg_mask, &cs);
 	for (i = core_offset % grp->cg_count; i > 0; i--) {
 		MPASS(CPU_FFS(&cs));
 		CPU_CLR(CPU_FFS(&cs) - 1, &cs);
 	}
 	MPASS(CPU_FFS(&cs));
 	return CPU_FFS(&cs) - 1;
 }
 #else
 static int
 find_close_core(int cpu, int core_offset __unused)
 {
 	return cpu;
 }
 #endif
 
 static int
 get_core_offset(if_ctx_t ctx, iflib_intr_type_t type, int qid)
 {
 	switch (type) {
 	case IFLIB_INTR_TX:
 		/* TX queues get cores which share at least an L2 cache with the corresponding RX queue */
 		/* XXX handle multiple RX threads per core and more than two core per L2 group */
 		return qid / CPU_COUNT(&ctx->ifc_cpus) + 1;
 	case IFLIB_INTR_RX:
 	case IFLIB_INTR_RXTX:
 		/* RX queues get the specified core */
 		return qid / CPU_COUNT(&ctx->ifc_cpus);
 	default:
 		return -1;
 	}
 }
 #else
 #define get_core_offset(ctx, type, qid)	CPU_FIRST()
 #define find_close_core(cpuid, tid)	CPU_FIRST()
 #define find_nth(ctx, gid)		CPU_FIRST()
 #endif
 
 /* Just to avoid copy/paste */
 static inline int
 iflib_irq_set_affinity(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
     int qid, struct grouptask *gtask, struct taskqgroup *tqg, void *uniq,
     const char *name)
 {
 	device_t dev;
 	int err, cpuid, tid;
 
 	dev = ctx->ifc_dev;
 	cpuid = find_nth(ctx, qid);
 	tid = get_core_offset(ctx, type, qid);
 	MPASS(tid >= 0);
 	cpuid = find_close_core(cpuid, tid);
 	err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, dev, irq->ii_res,
 	    name);
 	if (err) {
 		device_printf(dev, "taskqgroup_attach_cpu failed %d\n", err);
 		return (err);
 	}
 #ifdef notyet
 	if (cpuid > ctx->ifc_cpuid_highest)
 		ctx->ifc_cpuid_highest = cpuid;
 #endif
 	return 0;
 }
 
 int
 iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
 			iflib_intr_type_t type, driver_filter_t *filter,
 			void *filter_arg, int qid, const char *name)
 {
 	device_t dev;
 	struct grouptask *gtask;
 	struct taskqgroup *tqg;
 	iflib_filter_info_t info;
 	gtask_fn_t *fn;
 	int tqrid, err;
 	driver_filter_t *intr_fast;
 	void *q;
 
 	info = &ctx->ifc_filter_info;
 	tqrid = rid;
 
 	switch (type) {
 	/* XXX merge tx/rx for netmap? */
 	case IFLIB_INTR_TX:
 		q = &ctx->ifc_txqs[qid];
 		info = &ctx->ifc_txqs[qid].ift_filter_info;
 		gtask = &ctx->ifc_txqs[qid].ift_task;
 		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_tx;
 		intr_fast = iflib_fast_intr;
 		GROUPTASK_INIT(gtask, 0, fn, q);
 		ctx->ifc_flags |= IFC_NETMAP_TX_IRQ;
 		break;
 	case IFLIB_INTR_RX:
 		q = &ctx->ifc_rxqs[qid];
 		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
 		gtask = &ctx->ifc_rxqs[qid].ifr_task;
 		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_rx;
 		intr_fast = iflib_fast_intr;
 		GROUPTASK_INIT(gtask, 0, fn, q);
 		break;
 	case IFLIB_INTR_RXTX:
 		q = &ctx->ifc_rxqs[qid];
 		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
 		gtask = &ctx->ifc_rxqs[qid].ifr_task;
 		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_rx;
 		intr_fast = iflib_fast_intr_rxtx;
 		GROUPTASK_INIT(gtask, 0, fn, q);
 		break;
 	case IFLIB_INTR_ADMIN:
 		q = ctx;
 		tqrid = -1;
 		info = &ctx->ifc_filter_info;
 		gtask = &ctx->ifc_admin_task;
 		tqg = qgroup_if_config_tqg;
 		fn = _task_fn_admin;
 		intr_fast = iflib_fast_intr_ctx;
 		break;
 	default:
 		panic("unknown net intr type");
 	}
 
 	info->ifi_filter = filter;
 	info->ifi_filter_arg = filter_arg;
 	info->ifi_task = gtask;
 	info->ifi_ctx = q;
 
 	dev = ctx->ifc_dev;
 	err = _iflib_irq_alloc(ctx, irq, rid, intr_fast, NULL, info,  name);
 	if (err != 0) {
 		device_printf(dev, "_iflib_irq_alloc failed %d\n", err);
 		return (err);
 	}
 	if (type == IFLIB_INTR_ADMIN)
 		return (0);
 
 	if (tqrid != -1) {
 		err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg,
 		    q, name);
 		if (err)
 			return (err);
 	} else {
 		taskqgroup_attach(tqg, gtask, q, dev, irq->ii_res, name);
 	}
 
 	return (0);
 }
 
 void
 iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type, void *arg, int qid, const char *name)
 {
 	struct grouptask *gtask;
 	struct taskqgroup *tqg;
 	gtask_fn_t *fn;
 	void *q;
 	int err;
 
 	switch (type) {
 	case IFLIB_INTR_TX:
 		q = &ctx->ifc_txqs[qid];
 		gtask = &ctx->ifc_txqs[qid].ift_task;
 		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_tx;
 		break;
 	case IFLIB_INTR_RX:
 		q = &ctx->ifc_rxqs[qid];
 		gtask = &ctx->ifc_rxqs[qid].ifr_task;
 		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_rx;
 		break;
 	case IFLIB_INTR_IOV:
 		q = ctx;
 		gtask = &ctx->ifc_vflr_task;
 		tqg = qgroup_if_config_tqg;
 		fn = _task_fn_iov;
 		break;
 	default:
 		panic("unknown net intr type");
 	}
 	GROUPTASK_INIT(gtask, 0, fn, q);
 	if (irq != NULL) {
 		err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg,
 		    q, name);
 		if (err)
 			taskqgroup_attach(tqg, gtask, q, ctx->ifc_dev,
 			    irq->ii_res, name);
 	} else {
 		taskqgroup_attach(tqg, gtask, q, NULL, NULL, name);
 	}
 }
 
 void
 iflib_irq_free(if_ctx_t ctx, if_irq_t irq)
 {
 
 	if (irq->ii_tag)
 		bus_teardown_intr(ctx->ifc_dev, irq->ii_res, irq->ii_tag);
 
 	if (irq->ii_res)
 		bus_release_resource(ctx->ifc_dev, SYS_RES_IRQ,
 		    rman_get_rid(irq->ii_res), irq->ii_res);
 }
 
 static int
 iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, const char *name)
 {
 	iflib_txq_t txq = ctx->ifc_txqs;
 	iflib_rxq_t rxq = ctx->ifc_rxqs;
 	if_irq_t irq = &ctx->ifc_legacy_irq;
 	iflib_filter_info_t info;
 	device_t dev;
 	struct grouptask *gtask;
 	struct resource *res;
 	struct taskqgroup *tqg;
 	gtask_fn_t *fn;
 	int tqrid;
 	void *q;
 	int err;
 
 	q = &ctx->ifc_rxqs[0];
 	info = &rxq[0].ifr_filter_info;
 	gtask = &rxq[0].ifr_task;
 	tqg = qgroup_if_io_tqg;
 	tqrid = irq->ii_rid = *rid;
 	fn = _task_fn_rx;
 
 	ctx->ifc_flags |= IFC_LEGACY;
 	info->ifi_filter = filter;
 	info->ifi_filter_arg = filter_arg;
 	info->ifi_task = gtask;
 	info->ifi_ctx = ctx;
 
 	dev = ctx->ifc_dev;
 	/* We allocate a single interrupt resource */
 	if ((err = _iflib_irq_alloc(ctx, irq, tqrid, iflib_fast_intr_ctx, NULL, info, name)) != 0)
 		return (err);
 	GROUPTASK_INIT(gtask, 0, fn, q);
 	res = irq->ii_res;
 	taskqgroup_attach(tqg, gtask, q, dev, res, name);
 
 	GROUPTASK_INIT(&txq->ift_task, 0, _task_fn_tx, txq);
 	taskqgroup_attach(qgroup_if_io_tqg, &txq->ift_task, txq, dev, res,
 	    "tx");
 	return (0);
 }
 
 void
 iflib_led_create(if_ctx_t ctx)
 {
 
 	ctx->ifc_led_dev = led_create(iflib_led_func, ctx,
 	    device_get_nameunit(ctx->ifc_dev));
 }
 
 void
 iflib_tx_intr_deferred(if_ctx_t ctx, int txqid)
 {
 
 	GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task);
 }
 
 void
 iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid)
 {
 
 	GROUPTASK_ENQUEUE(&ctx->ifc_rxqs[rxqid].ifr_task);
 }
 
 void
 iflib_admin_intr_deferred(if_ctx_t ctx)
 {
 #ifdef INVARIANTS
 	struct grouptask *gtask;
 
 	gtask = &ctx->ifc_admin_task;
 	MPASS(gtask != NULL && gtask->gt_taskqueue != NULL);
 #endif
 
 	GROUPTASK_ENQUEUE(&ctx->ifc_admin_task);
 }
 
 void
 iflib_iov_intr_deferred(if_ctx_t ctx)
 {
 
 	GROUPTASK_ENQUEUE(&ctx->ifc_vflr_task);
 }
 
 void
 iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, char *name)
 {
 
 	taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, NULL, NULL,
 	    name);
 }
 
 void
 iflib_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn,
 	const char *name)
 {
 
 	GROUPTASK_INIT(gtask, 0, fn, ctx);
 	taskqgroup_attach(qgroup_if_config_tqg, gtask, gtask, NULL, NULL,
 	    name);
 }
 
 void
 iflib_config_gtask_deinit(struct grouptask *gtask)
 {
 
 	taskqgroup_detach(qgroup_if_config_tqg, gtask);	
 }
 
 void
 iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate)
 {
 	if_t ifp = ctx->ifc_ifp;
 	iflib_txq_t txq = ctx->ifc_txqs;
 
 	if_setbaudrate(ifp, baudrate);
 	if (baudrate >= IF_Gbps(10)) {
 		STATE_LOCK(ctx);
 		ctx->ifc_flags |= IFC_PREFETCH;
 		STATE_UNLOCK(ctx);
 	}
 	/* If link down, disable watchdog */
 	if ((ctx->ifc_link_state == LINK_STATE_UP) && (link_state == LINK_STATE_DOWN)) {
 		for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxqsets; i++, txq++)
 			txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 	}
 	ctx->ifc_link_state = link_state;
 	if_link_state_change(ifp, link_state);
 }
 
 static int
 iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq)
 {
 	int credits;
 #ifdef INVARIANTS
 	int credits_pre = txq->ift_cidx_processed;
 #endif
 
 	if (ctx->isc_txd_credits_update == NULL)
 		return (0);
 
 	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 	    BUS_DMASYNC_POSTREAD);
 	if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, true)) == 0)
 		return (0);
 
 	txq->ift_processed += credits;
 	txq->ift_cidx_processed += credits;
 
 	MPASS(credits_pre + credits == txq->ift_cidx_processed);
 	if (txq->ift_cidx_processed >= txq->ift_size)
 		txq->ift_cidx_processed -= txq->ift_size;
 	return (credits);
 }
 
 static int
 iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget)
 {
 	iflib_fl_t fl;
 	u_int i;
 
 	for (i = 0, fl = &rxq->ifr_fl[0]; i < rxq->ifr_nfl; i++, fl++)
 		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 	return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx,
 	    budget));
 }
 
 void
 iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name,
 	const char *description, if_int_delay_info_t info,
 	int offset, int value)
 {
 	info->iidi_ctx = ctx;
 	info->iidi_offset = offset;
 	info->iidi_value = value;
 	SYSCTL_ADD_PROC(device_get_sysctl_ctx(ctx->ifc_dev),
 	    SYSCTL_CHILDREN(device_get_sysctl_tree(ctx->ifc_dev)),
 	    OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW,
 	    info, 0, iflib_sysctl_int_delay, "I", description);
 }
 
 struct sx *
 iflib_ctx_lock_get(if_ctx_t ctx)
 {
 
 	return (&ctx->ifc_ctx_sx);
 }
 
 static int
 iflib_msix_init(if_ctx_t ctx)
 {
 	device_t dev = ctx->ifc_dev;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	int vectors, queues, rx_queues, tx_queues, queuemsgs, msgs;
 	int iflib_num_tx_queues, iflib_num_rx_queues;
 	int err, admincnt, bar;
 
 	iflib_num_tx_queues = ctx->ifc_sysctl_ntxqs;
 	iflib_num_rx_queues = ctx->ifc_sysctl_nrxqs;
 
 	if (bootverbose)
 		device_printf(dev, "msix_init qsets capped at %d\n",
 		    imax(scctx->isc_ntxqsets, scctx->isc_nrxqsets));
 
 	bar = ctx->ifc_softc_ctx.isc_msix_bar;
 	admincnt = sctx->isc_admin_intrcnt;
 	/* Override by tuneable */
 	if (scctx->isc_disable_msix)
 		goto msi;
 
 	/* First try MSI-X */
 	if ((msgs = pci_msix_count(dev)) == 0) {
 		if (bootverbose)
 			device_printf(dev, "MSI-X not supported or disabled\n");
 		goto msi;
 	}
 	/*
 	 * bar == -1 => "trust me I know what I'm doing"
 	 * Some drivers are for hardware that is so shoddily
 	 * documented that no one knows which bars are which
 	 * so the developer has to map all bars. This hack
 	 * allows shoddy garbage to use MSI-X in this framework.
 	 */
 	if (bar != -1) {
 		ctx->ifc_msix_mem = bus_alloc_resource_any(dev,
 	            SYS_RES_MEMORY, &bar, RF_ACTIVE);
 		if (ctx->ifc_msix_mem == NULL) {
 			device_printf(dev, "Unable to map MSI-X table\n");
 			goto msi;
 		}
 	}
 #if IFLIB_DEBUG
 	/* use only 1 qset in debug mode */
 	queuemsgs = min(msgs - admincnt, 1);
 #else
 	queuemsgs = msgs - admincnt;
 #endif
 #ifdef RSS
 	queues = imin(queuemsgs, rss_getnumbuckets());
 #else
 	queues = queuemsgs;
 #endif
 	queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues);
 	if (bootverbose)
 		device_printf(dev,
 		    "intr CPUs: %d queue msgs: %d admincnt: %d\n",
 		    CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt);
 #ifdef  RSS
 	/* If we're doing RSS, clamp at the number of RSS buckets */
 	if (queues > rss_getnumbuckets())
 		queues = rss_getnumbuckets();
 #endif
 	if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt)
 		rx_queues = iflib_num_rx_queues;
 	else
 		rx_queues = queues;
 
 	if (rx_queues > scctx->isc_nrxqsets)
 		rx_queues = scctx->isc_nrxqsets;
 
 	/*
 	 * We want this to be all logical CPUs by default
 	 */
 	if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues)
 		tx_queues = iflib_num_tx_queues;
 	else
 		tx_queues = mp_ncpus;
 
 	if (tx_queues > scctx->isc_ntxqsets)
 		tx_queues = scctx->isc_ntxqsets;
 
 	if (ctx->ifc_sysctl_qs_eq_override == 0) {
 #ifdef INVARIANTS
 		if (tx_queues != rx_queues)
 			device_printf(dev,
 			    "queue equality override not set, capping rx_queues at %d and tx_queues at %d\n",
 			    min(rx_queues, tx_queues), min(rx_queues, tx_queues));
 #endif
 		tx_queues = min(rx_queues, tx_queues);
 		rx_queues = min(rx_queues, tx_queues);
 	}
 
 	device_printf(dev, "Using %d rx queues %d tx queues\n",
 	    rx_queues, tx_queues);
 
 	vectors = rx_queues + admincnt;
 	if ((err = pci_alloc_msix(dev, &vectors)) == 0) {
 		device_printf(dev, "Using MSI-X interrupts with %d vectors\n",
 		    vectors);
 		scctx->isc_vectors = vectors;
 		scctx->isc_nrxqsets = rx_queues;
 		scctx->isc_ntxqsets = tx_queues;
 		scctx->isc_intr = IFLIB_INTR_MSIX;
 
 		return (vectors);
 	} else {
 		device_printf(dev,
 		    "failed to allocate %d MSI-X vectors, err: %d - using MSI\n",
 		    vectors, err);
 		bus_release_resource(dev, SYS_RES_MEMORY, bar,
 		    ctx->ifc_msix_mem);
 		ctx->ifc_msix_mem = NULL;
 	}
 msi:
 	vectors = pci_msi_count(dev);
 	scctx->isc_nrxqsets = 1;
 	scctx->isc_ntxqsets = 1;
 	scctx->isc_vectors = vectors;
 	if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) {
 		device_printf(dev,"Using an MSI interrupt\n");
 		scctx->isc_intr = IFLIB_INTR_MSI;
 	} else {
 		scctx->isc_vectors = 1;
 		device_printf(dev,"Using a Legacy interrupt\n");
 		scctx->isc_intr = IFLIB_INTR_LEGACY;
 	}
 
 	return (vectors);
 }
 
 static const char *ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" };
 
 static int
 mp_ring_state_handler(SYSCTL_HANDLER_ARGS)
 {
 	int rc;
 	uint16_t *state = ((uint16_t *)oidp->oid_arg1);
 	struct sbuf *sb;
 	const char *ring_state = "UNKNOWN";
 
 	/* XXX needed ? */
 	rc = sysctl_wire_old_buffer(req, 0);
 	MPASS(rc == 0);
 	if (rc != 0)
 		return (rc);
 	sb = sbuf_new_for_sysctl(NULL, NULL, 80, req);
 	MPASS(sb != NULL);
 	if (sb == NULL)
 		return (ENOMEM);
 	if (state[3] <= 3)
 		ring_state = ring_states[state[3]];
 
 	sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s",
 		    state[0], state[1], state[2], ring_state);
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
         return(rc);
 }
 
 enum iflib_ndesc_handler {
 	IFLIB_NTXD_HANDLER,
 	IFLIB_NRXD_HANDLER,
 };
 
 static int
 mp_ndesc_handler(SYSCTL_HANDLER_ARGS)
 {
 	if_ctx_t ctx = (void *)arg1;
 	enum iflib_ndesc_handler type = arg2;
 	char buf[256] = {0};
 	qidx_t *ndesc;
 	char *p, *next;
 	int nqs, rc, i;
 
 	MPASS(type == IFLIB_NTXD_HANDLER || type == IFLIB_NRXD_HANDLER);
 
 	nqs = 8;
 	switch(type) {
 	case IFLIB_NTXD_HANDLER:
 		ndesc = ctx->ifc_sysctl_ntxds;
 		if (ctx->ifc_sctx)
 			nqs = ctx->ifc_sctx->isc_ntxqs;
 		break;
 	case IFLIB_NRXD_HANDLER:
 		ndesc = ctx->ifc_sysctl_nrxds;
 		if (ctx->ifc_sctx)
 			nqs = ctx->ifc_sctx->isc_nrxqs;
 		break;
 	default:
 			panic("unhandled type");
 	}
 	if (nqs == 0)
 		nqs = 8;
 
 	for (i=0; i<8; i++) {
 		if (i >= nqs)
 			break;
 		if (i)
 			strcat(buf, ",");
 		sprintf(strchr(buf, 0), "%d", ndesc[i]);
 	}
 
 	rc = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (rc || req->newptr == NULL)
 		return rc;
 
 	for (i = 0, next = buf, p = strsep(&next, " ,"); i < 8 && p;
 	    i++, p = strsep(&next, " ,")) {
 		ndesc[i] = strtoul(p, NULL, 10);
 	}
 
 	return(rc);
 }
 
 #define NAME_BUFLEN 32
 static void
 iflib_add_device_sysctl_pre(if_ctx_t ctx)
 {
         device_t dev = iflib_get_dev(ctx);
 	struct sysctl_oid_list *child, *oid_list;
 	struct sysctl_ctx_list *ctx_list;
 	struct sysctl_oid *node;
 
 	ctx_list = device_get_sysctl_ctx(dev);
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
 	ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, "iflib",
 						      CTLFLAG_RD, NULL, "IFLIB fields");
 	oid_list = SYSCTL_CHILDREN(node);
 
 	SYSCTL_ADD_CONST_STRING(ctx_list, oid_list, OID_AUTO, "driver_version",
 		       CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version,
 		       "driver version");
 
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0,
 			"# of txqs to use, 0 => use default #");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0,
 			"# of rxqs to use, 0 => use default #");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0,
                        "permit #txq != #rxq");
 	SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "disable_msix",
                       CTLFLAG_RWTUN, &ctx->ifc_softc_ctx.isc_disable_msix, 0,
                       "disable MSI-X (default 0)");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "rx_budget",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_rx_budget, 0,
                        "set the rx budget");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "tx_abdicate",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_tx_abdicate, 0,
 		       "cause tx to abdicate instead of running to completion");
 
 	/* XXX change for per-queue sizes */
 	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
 		       CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NTXD_HANDLER,
                        mp_ndesc_handler, "A",
                        "list of # of tx descriptors to use, 0 = use default #");
 	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds",
 		       CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NRXD_HANDLER,
                        mp_ndesc_handler, "A",
                        "list of # of rx descriptors to use, 0 = use default #");
 }
 
 static void
 iflib_add_device_sysctl_post(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
         device_t dev = iflib_get_dev(ctx);
 	struct sysctl_oid_list *child;
 	struct sysctl_ctx_list *ctx_list;
 	iflib_fl_t fl;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	int i, j;
 	char namebuf[NAME_BUFLEN];
 	char *qfmt;
 	struct sysctl_oid *queue_node, *fl_node, *node;
 	struct sysctl_oid_list *queue_list, *fl_list;
 	ctx_list = device_get_sysctl_ctx(dev);
 
 	node = ctx->ifc_sysctl_node;
 	child = SYSCTL_CHILDREN(node);
 
 	if (scctx->isc_ntxqsets > 100)
 		qfmt = "txq%03d";
 	else if (scctx->isc_ntxqsets > 10)
 		qfmt = "txq%02d";
 	else
 		qfmt = "txq%d";
 	for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) {
 		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
 		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
 					     CTLFLAG_RD, NULL, "Queue Name");
 		queue_list = SYSCTL_CHILDREN(queue_node);
 #if MEMORY_LOGGING
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued",
 				CTLFLAG_RD,
 				&txq->ift_dequeued, "total mbufs freed");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued",
 				CTLFLAG_RD,
 				&txq->ift_enqueued, "total mbufs enqueued");
 #endif
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag",
 				   CTLFLAG_RD,
 				   &txq->ift_mbuf_defrag, "# of times m_defrag was called");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "m_pullups",
 				   CTLFLAG_RD,
 				   &txq->ift_pullups, "# of times m_pullup was called");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag_failed",
 				   CTLFLAG_RD,
 				   &txq->ift_mbuf_defrag_failed, "# of times m_defrag failed");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_desc_avail",
 				   CTLFLAG_RD,
 				   &txq->ift_no_desc_avail, "# of times no descriptors were available");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "tx_map_failed",
 				   CTLFLAG_RD,
 				   &txq->ift_map_failed, "# of times dma map failed");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txd_encap_efbig",
 				   CTLFLAG_RD,
 				   &txq->ift_txd_encap_efbig, "# of times txd_encap returned EFBIG");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_tx_dma_setup",
 				   CTLFLAG_RD,
 				   &txq->ift_no_tx_dma_setup, "# of times map failed for other than EFBIG");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx",
 				   CTLFLAG_RD,
 				   &txq->ift_pidx, 1, "Producer Index");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx",
 				   CTLFLAG_RD,
 				   &txq->ift_cidx, 1, "Consumer Index");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx_processed",
 				   CTLFLAG_RD,
 				   &txq->ift_cidx_processed, 1, "Consumer Index seen by credit update");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use",
 				   CTLFLAG_RD,
 				   &txq->ift_in_use, 1, "descriptors in use");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_processed",
 				   CTLFLAG_RD,
 				   &txq->ift_processed, "descriptors procesed for clean");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned",
 				   CTLFLAG_RD,
 				   &txq->ift_cleaned, "total cleaned");
 		SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state",
 				CTLTYPE_STRING | CTLFLAG_RD, __DEVOLATILE(uint64_t *, &txq->ift_br->state),
 				0, mp_ring_state_handler, "A", "soft ring state");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_enqueues",
 				       CTLFLAG_RD, &txq->ift_br->enqueues,
 				       "# of enqueues to the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_drops",
 				       CTLFLAG_RD, &txq->ift_br->drops,
 				       "# of drops in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_starts",
 				       CTLFLAG_RD, &txq->ift_br->starts,
 				       "# of normal consumer starts in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_stalls",
 				       CTLFLAG_RD, &txq->ift_br->stalls,
 					       "# of consumer stalls in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_restarts",
 			       CTLFLAG_RD, &txq->ift_br->restarts,
 				       "# of consumer restarts in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_abdications",
 				       CTLFLAG_RD, &txq->ift_br->abdications,
 				       "# of consumer abdications in the mp_ring for this queue");
 	}
 
 	if (scctx->isc_nrxqsets > 100)
 		qfmt = "rxq%03d";
 	else if (scctx->isc_nrxqsets > 10)
 		qfmt = "rxq%02d";
 	else
 		qfmt = "rxq%d";
 	for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) {
 		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
 		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
 					     CTLFLAG_RD, NULL, "Queue Name");
 		queue_list = SYSCTL_CHILDREN(queue_node);
 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_pidx",
 				       CTLFLAG_RD,
 				       &rxq->ifr_cq_pidx, 1, "Producer Index");
 			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx",
 				       CTLFLAG_RD,
 				       &rxq->ifr_cq_cidx, 1, "Consumer Index");
 		}
 
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
 			snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j);
 			fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list, OID_AUTO, namebuf,
 						     CTLFLAG_RD, NULL, "freelist Name");
 			fl_list = SYSCTL_CHILDREN(fl_node);
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx",
 				       CTLFLAG_RD,
 				       &fl->ifl_pidx, 1, "Producer Index");
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx",
 				       CTLFLAG_RD,
 				       &fl->ifl_cidx, 1, "Consumer Index");
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits",
 				       CTLFLAG_RD,
 				       &fl->ifl_credits, 1, "credits available");
 #if MEMORY_LOGGING
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_enqueued",
 					CTLFLAG_RD,
 					&fl->ifl_m_enqueued, "mbufs allocated");
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_dequeued",
 					CTLFLAG_RD,
 					&fl->ifl_m_dequeued, "mbufs freed");
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_enqueued",
 					CTLFLAG_RD,
 					&fl->ifl_cl_enqueued, "clusters allocated");
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_dequeued",
 					CTLFLAG_RD,
 					&fl->ifl_cl_dequeued, "clusters freed");
 #endif
 
 		}
 	}
 
 }
 
 void
 iflib_request_reset(if_ctx_t ctx)
 {
 
 	STATE_LOCK(ctx);
 	ctx->ifc_flags |= IFC_DO_RESET;
 	STATE_UNLOCK(ctx);
 }
 
 #ifndef __NO_STRICT_ALIGNMENT
 static struct mbuf *
 iflib_fixup_rx(struct mbuf *m)
 {
 	struct mbuf *n;
 
 	if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) {
 		bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len);
 		m->m_data += ETHER_HDR_LEN;
 		n = m;
 	} else {
 		MGETHDR(n, M_NOWAIT, MT_DATA);
 		if (n == NULL) {
 			m_freem(m);
 			return (NULL);
 		}
 		bcopy(m->m_data, n->m_data, ETHER_HDR_LEN);
 		m->m_data += ETHER_HDR_LEN;
 		m->m_len -= ETHER_HDR_LEN;
 		n->m_len = ETHER_HDR_LEN;
 		M_MOVE_PKTHDR(n, m);
 		n->m_next = m;
 	}
 	return (n);
 }
 #endif
 
 #ifdef NETDUMP
 static void
 iflib_netdump_init(struct ifnet *ifp, int *nrxr, int *ncl, int *clsize)
 {
 	if_ctx_t ctx;
 
 	ctx = if_getsoftc(ifp);
 	CTX_LOCK(ctx);
 	*nrxr = NRXQSETS(ctx);
 	*ncl = ctx->ifc_rxqs[0].ifr_fl->ifl_size;
 	*clsize = ctx->ifc_rxqs[0].ifr_fl->ifl_buf_size;
 	CTX_UNLOCK(ctx);
 }
 
 static void
 iflib_netdump_event(struct ifnet *ifp, enum netdump_ev event)
 {
 	if_ctx_t ctx;
 	if_softc_ctx_t scctx;
 	iflib_fl_t fl;
 	iflib_rxq_t rxq;
 	int i, j;
 
 	ctx = if_getsoftc(ifp);
 	scctx = &ctx->ifc_softc_ctx;
 
 	switch (event) {
 	case NETDUMP_START:
 		for (i = 0; i < scctx->isc_nrxqsets; i++) {
 			rxq = &ctx->ifc_rxqs[i];
 			for (j = 0; j < rxq->ifr_nfl; j++) {
 				fl = rxq->ifr_fl;
 				fl->ifl_zone = m_getzone(fl->ifl_buf_size);
 			}
 		}
 		iflib_no_tx_batch = 1;
 		break;
 	default:
 		break;
 	}
 }
 
 static int
 iflib_netdump_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	if_ctx_t ctx;
 	iflib_txq_t txq;
 	int error;
 
 	ctx = if_getsoftc(ifp);
 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 	    IFF_DRV_RUNNING)
 		return (EBUSY);
 
 	txq = &ctx->ifc_txqs[0];
 	error = iflib_encap(txq, &m);
 	if (error == 0)
 		(void)iflib_txd_db_check(ctx, txq, true, txq->ift_in_use);
 	return (error);
 }
 
 static int
 iflib_netdump_poll(struct ifnet *ifp, int count)
 {
 	if_ctx_t ctx;
 	if_softc_ctx_t scctx;
 	iflib_txq_t txq;
 	int i;
 
 	ctx = if_getsoftc(ifp);
 	scctx = &ctx->ifc_softc_ctx;
 
 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 	    IFF_DRV_RUNNING)
 		return (EBUSY);
 
 	txq = &ctx->ifc_txqs[0];
 	(void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
 
 	for (i = 0; i < scctx->isc_nrxqsets; i++)
 		(void)iflib_rxeof(&ctx->ifc_rxqs[i], 16 /* XXX */);
 	return (0);
 }
 #endif /* NETDUMP */
Index: projects/capsicum-test/sys/powerpc/aim/trap_subr64.S
===================================================================
--- projects/capsicum-test/sys/powerpc/aim/trap_subr64.S	(revision 345709)
+++ projects/capsicum-test/sys/powerpc/aim/trap_subr64.S	(revision 345710)
@@ -1,974 +1,977 @@
 /* $FreeBSD$ */
 /* $NetBSD: trap_subr.S,v 1.20 2002/04/22 23:20:08 kleink Exp $	*/
 
 /*-
  * Copyright (C) 1995, 1996 Wolfgang Solfrank.
  * Copyright (C) 1995, 1996 TooLs GmbH.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * NOTICE: This is not a standalone file.  to use it, #include it in
  * your port's locore.S, like so:
  *
  *	#include <powerpc/aim/trap_subr.S>
  */
 
 /* Locate the per-CPU data structure */
 #define GET_CPUINFO(r)  \
         mfsprg0  r
 #define GET_TOCBASE(r)  \
 	lis	r,DMAP_BASE_ADDRESS@highesta;	/* To real-mode alias/dmap */ \
 	sldi	r,r,32;							\
 	ori	r,r,TRAP_TOCBASE;	/* Magic address for TOC */	\
 	ld	r,0(r)
 
 /*
  * Restore SRs for a pmap
  *
  * Requires that r28-r31 be scratch, with r28 initialized to the SLB cache
  */
 
 /*
  * User SRs are loaded through a pointer to the current pmap.
  */
 restore_usersrs:
 	GET_CPUINFO(%r28)
 	ld	%r28,PC_USERSLB(%r28)
 	cmpdi	%r28, 0			/* If user SLB pointer NULL, exit */
 	beqlr
 
 	li	%r29, 0			/* Set the counter to zero */
 
 	slbia
 	slbmfee	%r31,%r29		
 	clrrdi	%r31,%r31,28
 	slbie	%r31
 1:	ld	%r31, 0(%r28)		/* Load SLB entry pointer */
 	cmpdi	%r31, 0			/* If NULL, stop */
 	beqlr
 
 	ld	%r30, 0(%r31)		/* Load SLBV */
 	ld	%r31, 8(%r31)		/* Load SLBE */
 	or	%r31, %r31, %r29	/*  Set SLBE slot */
 	slbmte	%r30, %r31		/* Install SLB entry */
 
 	addi	%r28, %r28, 8		/* Advance pointer */
 	addi	%r29, %r29, 1
 	b	1b			/* Repeat */
 
 /*
  * Kernel SRs are loaded directly from the PCPU fields
  */
 restore_kernsrs:
 	GET_CPUINFO(%r28)
 	addi	%r28,%r28,PC_KERNSLB
 	ld	%r29,16(%r28)		/* One past USER_SLB_SLOT */
 	cmpdi	%r29,0
 	beqlr				/* If first kernel entry is invalid,
 					 * SLBs not in use, so exit early */
 
 	/* Otherwise, set up SLBs */
 	li	%r29, 0			/* Set the counter to zero */
 
 	slbia
 	slbmfee	%r31,%r29		
 	clrrdi	%r31,%r31,28
 	slbie	%r31
 1:	cmpdi	%r29, USER_SLB_SLOT	/* Skip the user slot */
 	beq-	2f
 
 	ld	%r31, 8(%r28)		/* Load SLBE */
 	cmpdi	%r31, 0			/* If SLBE is not valid, stop */
 	beqlr
 	ld	%r30, 0(%r28)		/* Load SLBV  */
 	slbmte	%r30, %r31		/* Install SLB entry */
 
 2:	addi	%r28, %r28, 16		/* Advance pointer */
 	addi	%r29, %r29, 1
 	cmpdi	%r29, 64		/* Repeat if we are not at the end */
 	blt	1b 
 	blr
 
 /*
  * FRAME_SETUP assumes:
  *	SPRG1		SP (1)
  * 	SPRG3		trap type
  *	savearea	r27-r31,DAR,DSISR   (DAR & DSISR only for DSI traps)
  *	r28		LR
  *	r29		CR
  *	r30		scratch
  *	r31		scratch
  *	r1		kernel stack
  *	SRR0/1		as at start of trap
  *
  * NOTE: SPRG1 is never used while the MMU is on, making it safe to reuse
  * in any real-mode fault handler, including those handling double faults.
  */
 #define	FRAME_SETUP(savearea)						\
 /* Have to enable translation to allow access of kernel stack: */	\
 	GET_CPUINFO(%r31);						\
 	mfsrr0	%r30;							\
 	std	%r30,(savearea+CPUSAVE_SRR0)(%r31);	/* save SRR0 */	\
 	mfsrr1	%r30;							\
 	std	%r30,(savearea+CPUSAVE_SRR1)(%r31);	/* save SRR1 */	\
 	mfsprg1	%r31;			/* get saved SP (clears SPRG1) */ \
 	mfmsr	%r30;							\
 	ori	%r30,%r30,(PSL_DR|PSL_IR|PSL_RI)@l; /* relocation on */	\
 	mtmsr	%r30;			/* stack can now be accessed */	\
 	isync;								\
 	stdu	%r31,-(FRAMELEN+288)(%r1); /* save it in the callframe */ \
 	std	%r0, FRAME_0+48(%r1);	/* save r0 in the trapframe */	\
 	std	%r31,FRAME_1+48(%r1);	/* save SP   "      "       */	\
 	std	%r2, FRAME_2+48(%r1);	/* save r2   "      "       */	\
 	std	%r28,FRAME_LR+48(%r1);	/* save LR   "      "       */	\
 	std	%r29,FRAME_CR+48(%r1);	/* save CR   "      "       */	\
 	GET_CPUINFO(%r2);						\
 	ld	%r27,(savearea+CPUSAVE_R27)(%r2); /* get saved r27 */	\
 	ld	%r28,(savearea+CPUSAVE_R28)(%r2); /* get saved r28 */	\
 	ld	%r29,(savearea+CPUSAVE_R29)(%r2); /* get saved r29 */	\
 	ld	%r30,(savearea+CPUSAVE_R30)(%r2); /* get saved r30 */	\
 	ld	%r31,(savearea+CPUSAVE_R31)(%r2); /* get saved r31 */	\
 	std	%r3,  FRAME_3+48(%r1);	/* save r3-r31 */		\
 	std	%r4,  FRAME_4+48(%r1);					\
 	std	%r5,  FRAME_5+48(%r1);					\
 	std	%r6,  FRAME_6+48(%r1);					\
 	std	%r7,  FRAME_7+48(%r1);					\
 	std	%r8,  FRAME_8+48(%r1);					\
 	std	%r9,  FRAME_9+48(%r1);					\
 	std	%r10, FRAME_10+48(%r1);					\
 	std	%r11, FRAME_11+48(%r1);					\
 	std	%r12, FRAME_12+48(%r1);					\
 	std	%r13, FRAME_13+48(%r1);					\
 	std	%r14, FRAME_14+48(%r1);					\
 	std	%r15, FRAME_15+48(%r1);					\
 	std	%r16, FRAME_16+48(%r1);					\
 	std	%r17, FRAME_17+48(%r1);					\
 	std	%r18, FRAME_18+48(%r1);					\
 	std	%r19, FRAME_19+48(%r1);					\
 	std	%r20, FRAME_20+48(%r1);					\
 	std	%r21, FRAME_21+48(%r1);					\
 	std	%r22, FRAME_22+48(%r1);					\
 	std	%r23, FRAME_23+48(%r1);					\
 	std	%r24, FRAME_24+48(%r1);					\
 	std	%r25, FRAME_25+48(%r1);					\
 	std	%r26, FRAME_26+48(%r1);					\
 	std	%r27, FRAME_27+48(%r1);					\
 	std	%r28, FRAME_28+48(%r1);					\
 	std	%r29, FRAME_29+48(%r1);					\
 	std	%r30, FRAME_30+48(%r1);					\
 	std	%r31, FRAME_31+48(%r1);					\
 	ld	%r28,(savearea+CPUSAVE_AIM_DAR)(%r2);  /* saved DAR */	\
 	ld	%r29,(savearea+CPUSAVE_AIM_DSISR)(%r2);/* saved DSISR */\
 	ld	%r30,(savearea+CPUSAVE_SRR0)(%r2); /* saved SRR0 */	\
 	ld	%r31,(savearea+CPUSAVE_SRR1)(%r2); /* saved SRR1 */	\
 	mfxer	%r3;							\
 	mfctr	%r4;							\
 	mfsprg3	%r5;							\
 	std	%r3, FRAME_XER+48(1);	/* save xer/ctr/exc */		\
 	std	%r4, FRAME_CTR+48(1);					\
 	std	%r5, FRAME_EXC+48(1);					\
 	std	%r28,FRAME_AIM_DAR+48(1);				\
 	std	%r29,FRAME_AIM_DSISR+48(1); /* save dsisr/srr0/srr1 */	\
 	std	%r30,FRAME_SRR0+48(1);					\
 	std	%r31,FRAME_SRR1+48(1);					\
 	ld	%r13,PC_CURTHREAD(%r2)	/* set kernel curthread */
 
 #define	FRAME_LEAVE(savearea)						\
 /* Disable exceptions: */						\
 	mfmsr	%r2;							\
 	andi.	%r2,%r2,~PSL_EE@l;					\
 	mtmsr	%r2;							\
 	isync;								\
 /* Now restore regs: */							\
 	ld	%r2,FRAME_SRR0+48(%r1);					\
 	ld	%r3,FRAME_SRR1+48(%r1);					\
 	ld	%r4,FRAME_CTR+48(%r1);					\
 	ld	%r5,FRAME_XER+48(%r1);					\
 	ld	%r6,FRAME_LR+48(%r1);					\
 	GET_CPUINFO(%r7);						\
 	std	%r2,(savearea+CPUSAVE_SRR0)(%r7); /* save SRR0 */	\
 	std	%r3,(savearea+CPUSAVE_SRR1)(%r7); /* save SRR1 */	\
 	ld	%r7,FRAME_CR+48(%r1);					\
 	mtctr	%r4;							\
 	mtxer	%r5;							\
 	mtlr	%r6;							\
 	mtsprg2	%r7;			/* save cr */			\
 	ld	%r31,FRAME_31+48(%r1);   /* restore r0-31 */		\
 	ld	%r30,FRAME_30+48(%r1);					\
 	ld	%r29,FRAME_29+48(%r1);					\
 	ld	%r28,FRAME_28+48(%r1);					\
 	ld	%r27,FRAME_27+48(%r1);					\
 	ld	%r26,FRAME_26+48(%r1);					\
 	ld	%r25,FRAME_25+48(%r1);					\
 	ld	%r24,FRAME_24+48(%r1);					\
 	ld	%r23,FRAME_23+48(%r1);					\
 	ld	%r22,FRAME_22+48(%r1);					\
 	ld	%r21,FRAME_21+48(%r1);					\
 	ld	%r20,FRAME_20+48(%r1);					\
 	ld	%r19,FRAME_19+48(%r1);					\
 	ld	%r18,FRAME_18+48(%r1);					\
 	ld	%r17,FRAME_17+48(%r1);					\
 	ld	%r16,FRAME_16+48(%r1);					\
 	ld	%r15,FRAME_15+48(%r1);					\
 	ld	%r14,FRAME_14+48(%r1);					\
 	ld	%r13,FRAME_13+48(%r1);					\
 	ld	%r12,FRAME_12+48(%r1);					\
 	ld	%r11,FRAME_11+48(%r1);					\
 	ld	%r10,FRAME_10+48(%r1);					\
 	ld	%r9, FRAME_9+48(%r1);					\
 	ld	%r8, FRAME_8+48(%r1);					\
 	ld	%r7, FRAME_7+48(%r1);					\
 	ld	%r6, FRAME_6+48(%r1);					\
 	ld	%r5, FRAME_5+48(%r1);					\
 	ld	%r4, FRAME_4+48(%r1);					\
 	ld	%r3, FRAME_3+48(%r1);					\
 	ld	%r2, FRAME_2+48(%r1);					\
 	ld	%r0, FRAME_0+48(%r1);					\
 	ld	%r1, FRAME_1+48(%r1);					\
 /* Can't touch %r1 from here on */					\
 	mtsprg3	%r3;			/* save r3 */			\
 /* Disable translation, machine check and recoverability: */		\
 	mfmsr	%r3;							\
 	andi.	%r3,%r3,~(PSL_DR|PSL_IR|PSL_ME|PSL_RI)@l;		\
 	mtmsr	%r3;							\
 	isync;								\
 /* Decide whether we return to user mode: */				\
 	GET_CPUINFO(%r3);						\
 	ld	%r3,(savearea+CPUSAVE_SRR1)(%r3);			\
 	mtcr	%r3;							\
 	bf	17,1f;			/* branch if PSL_PR is false */	\
 /* Restore user SRs */							\
 	GET_CPUINFO(%r3);						\
 	std	%r27,(savearea+CPUSAVE_R27)(%r3);			\
 	std	%r28,(savearea+CPUSAVE_R28)(%r3);			\
 	std	%r29,(savearea+CPUSAVE_R29)(%r3);			\
 	std	%r30,(savearea+CPUSAVE_R30)(%r3);			\
 	std	%r31,(savearea+CPUSAVE_R31)(%r3);			\
 	mflr	%r27;			/* preserve LR */		\
 	bl	restore_usersrs;	/* uses r28-r31 */		\
 	mtlr	%r27;							\
 	ld	%r31,(savearea+CPUSAVE_R31)(%r3);			\
 	ld	%r30,(savearea+CPUSAVE_R30)(%r3);			\
 	ld	%r29,(savearea+CPUSAVE_R29)(%r3);			\
 	ld	%r28,(savearea+CPUSAVE_R28)(%r3);			\
 	ld	%r27,(savearea+CPUSAVE_R27)(%r3);			\
 1:	mfsprg2	%r3;			/* restore cr */		\
 	mtcr	%r3;							\
 	GET_CPUINFO(%r3);						\
 	ld	%r3,(savearea+CPUSAVE_SRR0)(%r3); /* restore srr0 */	\
 	mtsrr0	%r3;							\
 	GET_CPUINFO(%r3);						\
 	ld	%r3,(savearea+CPUSAVE_SRR1)(%r3); /* restore srr1 */	\
 	mtsrr1	%r3;							\
 	mfsprg3	%r3			/* restore r3 */
 
 #ifdef KDTRACE_HOOKS
 	.data
 	.globl	dtrace_invop_calltrap_addr
 	.align	8
 	.type	dtrace_invop_calltrap_addr, @object
         .size	dtrace_invop_calltrap_addr, 8
 dtrace_invop_calltrap_addr:
 	.word	0
 	.word	0
 
 	.text
 #endif
 
 /*
  * Processor reset exception handler. These are typically
  * the first instructions the processor executes after a
  * software reset. We do this in two bits so that we are
  * not still hanging around in the trap handling region
  * once the MMU is turned on.
  */
 	.globl	CNAME(rstcode), CNAME(rstcodeend), CNAME(cpu_reset_handler)
 	.globl	CNAME(cpu_wakeup_handler)
 	.p2align 3
 CNAME(rstcode):
 	/*
 	 * Check if this is software reset or
 	 * processor is waking up from power saving mode
 	 * It is software reset when 46:47 = 0b00
 	 */
 	mfsrr1	%r9			/* Load SRR1 into r9 */
 	andis.	%r9,%r9,0x3		/* Logic AND with 46:47 bits */
 	beq	2f			/* Branch if software reset */
 	bl	1f
 	.llong	cpu_wakeup_handler
 
 	/* It is software reset */
 
 	/* Explicitly set MSR[SF] */
 2:	mfmsr	%r9
 	li	%r8,1
 	insrdi	%r9,%r8,1,0
 	mtmsrd	%r9
 	isync
 
 	bl	1f
 	.llong	cpu_reset_handler /* Make sure to maintain 8-byte alignment */
 
 1:	mflr	%r9
 	ld	%r9,0(%r9)
 	mtlr	%r9
 
 	blr
 CNAME(rstcodeend):
 
 cpu_reset_handler:
 	GET_TOCBASE(%r2)
 
-	ld	%r1,TOC_REF(tmpstk)(%r2)	/* get new SP */
+	addis	%r1,%r2,TOC_REF(tmpstk)@ha
+	ld	%r1,TOC_REF(tmpstk)@l(%r1)	/* get new SP */
 	addi	%r1,%r1,(TMPSTKSZ-48)
 
 	bl	CNAME(cpudep_ap_early_bootstrap) /* Set PCPU */
 	nop
 	lis	%r3,1@l
 	bl	CNAME(pmap_cpu_bootstrap)	/* Turn on virtual memory */
 	nop
 	bl	CNAME(cpudep_ap_bootstrap)	/* Set up PCPU and stack */
 	nop
 	mr	%r1,%r3				/* Use new stack */
 	bl	CNAME(cpudep_ap_setup)
 	nop
 	GET_CPUINFO(%r5)
 	ld	%r3,(PC_RESTORE)(%r5)
 	cmpldi	%cr0,%r3,0
 	beq	%cr0,2f
 	nop
 	li	%r4,1
 	bl	CNAME(longjmp)
 	nop
 2:
 #ifdef SMP
 	bl	CNAME(machdep_ap_bootstrap)	/* And away! */
 	nop
 #endif
 
 	/* Should not be reached */
 9:
 	b	9b
 
 cpu_wakeup_handler:
 	GET_TOCBASE(%r2)
 
 	/* Check for false wake up due to badly SRR1 set (eg. by OPAL) */
-	ld	%r3,TOC_REF(can_wakeup)(%r2)
+	addis	%r3,%r2,TOC_REF(can_wakeup)@ha
+	ld	%r3,TOC_REF(can_wakeup)@l(%r3)
 	ld	%r3,0(%r3)
 	cmpdi	%r3,0
 	beq	cpu_reset_handler
 
 	/* Turn on MMU after return from interrupt */
 	mfsrr1	%r3
 	ori	%r3,%r3,(PSL_IR | PSL_DR)
 	mtsrr1	%r3
 
 	/* Turn on MMU (needed to access PCB) */
 	mfmsr	%r3
 	ori	%r3,%r3,(PSL_IR | PSL_DR)
 	mtmsr	%r3
 	isync
 
 	mfsprg0	%r3
 
 	ld	%r3,PC_CURTHREAD(%r3)	/* Get current thread */
 	ld	%r3,TD_PCB(%r3)		/* Get PCB of current thread */
 	ld	%r12,PCB_CONTEXT(%r3)	/* Load the non-volatile GP regs. */
 	ld	%r13,PCB_CONTEXT+1*8(%r3)
 	ld	%r14,PCB_CONTEXT+2*8(%r3)
 	ld	%r15,PCB_CONTEXT+3*8(%r3)
 	ld	%r16,PCB_CONTEXT+4*8(%r3)
 	ld	%r17,PCB_CONTEXT+5*8(%r3)
 	ld	%r18,PCB_CONTEXT+6*8(%r3)
 	ld	%r19,PCB_CONTEXT+7*8(%r3)
 	ld	%r20,PCB_CONTEXT+8*8(%r3)
 	ld	%r21,PCB_CONTEXT+9*8(%r3)
 	ld	%r22,PCB_CONTEXT+10*8(%r3)
 	ld	%r23,PCB_CONTEXT+11*8(%r3)
 	ld	%r24,PCB_CONTEXT+12*8(%r3)
 	ld	%r25,PCB_CONTEXT+13*8(%r3)
 	ld	%r26,PCB_CONTEXT+14*8(%r3)
 	ld	%r27,PCB_CONTEXT+15*8(%r3)
 	ld	%r28,PCB_CONTEXT+16*8(%r3)
 	ld	%r29,PCB_CONTEXT+17*8(%r3)
 	ld	%r30,PCB_CONTEXT+18*8(%r3)
 	ld	%r31,PCB_CONTEXT+19*8(%r3)
 	ld	%r5,PCB_CR(%r3)		/* Load the condition register */
 	mtcr	%r5
 	ld	%r5,PCB_LR(%r3)		/* Load the link register */
 	mtsrr0	%r5
 	ld	%r1,PCB_SP(%r3)		/* Load the stack pointer */
 	ld	%r2,PCB_TOC(%r3)	/* Load the TOC pointer */
 
 	rfid
 
 /*
  * This code gets copied to all the trap vectors
  * (except ISI/DSI, ALI, and the interrupts). Has to fit in 8 instructions!
  */
 
 	.globl	CNAME(trapcode),CNAME(trapcodeend)
 	.p2align 3
 CNAME(trapcode):
 	mtsprg1	%r1			/* save SP */
 	mflr	%r1			/* Save the old LR in r1 */
 	mtsprg2 %r1			/* And then in SPRG2 */
 	ld	%r1,TRAP_GENTRAP(0)
 	mtlr	%r1
 	li	%r1, 0xe0		/* How to get the vector from LR */
 	blrl				/* Branch to generictrap */
 CNAME(trapcodeend):
 
 /* Same thing for traps setting HSRR0/HSS1 */
 	.globl	CNAME(hypertrapcode),CNAME(hypertrapcodeend)
 	.p2align 3
 CNAME(hypertrapcode):
 	mtsprg1	%r1			/* save SP */
 	mflr	%r1			/* Save the old LR in r1 */
 	mtsprg2 %r1			/* And then in SPRG2 */
 	ld	%r1,TRAP_GENTRAP(0)
 	addi	%r1,%r1,(generichypertrap-generictrap)
 	mtlr	%r1
 	li	%r1, 0xe0		/* How to get the vector from LR */
 	blrl				/* Branch to generictrap */
 CNAME(hypertrapcodeend):
 
 /*
  * For SLB misses: do special things for the kernel
  *
  * Note: SPRG1 is always safe to overwrite any time the MMU is on, which is
  * the only time this can be called.
  */
 	.globl	CNAME(slbtrap),CNAME(slbtrapend)
 	.p2align 3
 CNAME(slbtrap):
 	mtsprg1	%r1			/* save SP */
 	GET_CPUINFO(%r1)
 	std	%r2,(PC_SLBSAVE+16)(%r1)
 	mfcr	%r2			/* save CR */
 	std	%r2,(PC_SLBSAVE+104)(%r1)
 	mfsrr1	%r2			/* test kernel mode */
 	mtcr	%r2
 	bf	17,2f			/* branch if PSL_PR is false */
 	/* User mode */
 	ld	%r2,(PC_SLBSAVE+104)(%r1) /* Restore CR */
 	mtcr	%r2
 	ld	%r2,(PC_SLBSAVE+16)(%r1) /* Restore R2 */
 	mflr	%r1			/* Save the old LR in r1 */
 	mtsprg2 %r1			/* And then in SPRG2 */
 					/* 52 bytes so far */
 	bl	1f
 	.llong	generictrap
 1:	mflr	%r1
 	ld	%r1,0(%r1)
 	mtlr	%r1
 	li	%r1, 0x80		/* How to get the vector from LR */
 	blrl				/* Branch to generictrap */
 					/* 84 bytes */
 2:	mflr	%r2			/* Save the old LR in r2 */
 	nop
 	bl	3f			/* Begin dance to jump to kern_slbtrap*/
 	.llong	kern_slbtrap
 3:	mflr	%r1
 	ld	%r1,0(%r1)
 	mtlr	%r1
 	GET_CPUINFO(%r1)
 	blrl				/* 124 bytes -- 4 to spare */
 CNAME(slbtrapend):
 
 kern_slbtrap:
 	std	%r2,(PC_SLBSAVE+136)(%r1) /* old LR */
 	std	%r3,(PC_SLBSAVE+24)(%r1) /* save R3 */
 
 	/* Check if this needs to be handled as a regular trap (userseg miss) */
 	mflr	%r2
 	andi.	%r2,%r2,0xff80
 	cmpwi	%r2,0x380
 	bne	1f
 	mfdar	%r2
 	b	2f
 1:	mfsrr0	%r2
 2:	/* r2 now contains the fault address */
 	lis	%r3,SEGMENT_MASK@highesta
 	ori	%r3,%r3,SEGMENT_MASK@highera
 	sldi	%r3,%r3,32
 	oris	%r3,%r3,SEGMENT_MASK@ha
 	ori	%r3,%r3,SEGMENT_MASK@l
 	and	%r2,%r2,%r3	/* R2 = segment base address */
 	lis	%r3,USER_ADDR@highesta
 	ori	%r3,%r3,USER_ADDR@highera
 	sldi	%r3,%r3,32
 	oris	%r3,%r3,USER_ADDR@ha
 	ori	%r3,%r3,USER_ADDR@l
 	cmpd	%r2,%r3		/* Compare fault base to USER_ADDR */
 	bne	3f
 
 	/* User seg miss, handle as a regular trap */
 	ld	%r2,(PC_SLBSAVE+104)(%r1) /* Restore CR */
 	mtcr	%r2
 	ld	%r2,(PC_SLBSAVE+16)(%r1) /* Restore R2,R3 */
 	ld	%r3,(PC_SLBSAVE+24)(%r1)
 	ld	%r1,(PC_SLBSAVE+136)(%r1) /* Save the old LR in r1 */
 	mtsprg2 %r1			/* And then in SPRG2 */
 	li	%r1, 0x80		/* How to get the vector from LR */
 	b	generictrap		/* Retain old LR using b */
 	
 3:	/* Real kernel SLB miss */
 	std	%r0,(PC_SLBSAVE+0)(%r1)	/* free all volatile regs */
 	mfsprg1	%r2			/* Old R1 */
 	std	%r2,(PC_SLBSAVE+8)(%r1)
 	/* R2,R3 already saved */
 	std	%r4,(PC_SLBSAVE+32)(%r1)
 	std	%r5,(PC_SLBSAVE+40)(%r1)
 	std	%r6,(PC_SLBSAVE+48)(%r1)
 	std	%r7,(PC_SLBSAVE+56)(%r1)
 	std	%r8,(PC_SLBSAVE+64)(%r1)
 	std	%r9,(PC_SLBSAVE+72)(%r1)
 	std	%r10,(PC_SLBSAVE+80)(%r1)
 	std	%r11,(PC_SLBSAVE+88)(%r1)
 	std	%r12,(PC_SLBSAVE+96)(%r1)
 	/* CR already saved */
 	mfxer	%r2			/* save XER */
 	std	%r2,(PC_SLBSAVE+112)(%r1)
 	mflr	%r2			/* save LR (SP already saved) */
 	std	%r2,(PC_SLBSAVE+120)(%r1)
 	mfctr	%r2			/* save CTR */
 	std	%r2,(PC_SLBSAVE+128)(%r1)
 
 	/* Call handler */
 	addi	%r1,%r1,PC_SLBSTACK-48+1024
 	li	%r2,~15
 	and	%r1,%r1,%r2
 	GET_TOCBASE(%r2)
 	mflr	%r3
 	andi.	%r3,%r3,0xff80
 	mfdar	%r4
 	mfsrr0	%r5
 	bl	handle_kernel_slb_spill
 	nop
 
 	/* Save r28-31, restore r4-r12 */
 	GET_CPUINFO(%r1)
 	ld	%r4,(PC_SLBSAVE+32)(%r1)
 	ld	%r5,(PC_SLBSAVE+40)(%r1)
 	ld	%r6,(PC_SLBSAVE+48)(%r1)
 	ld	%r7,(PC_SLBSAVE+56)(%r1)
 	ld	%r8,(PC_SLBSAVE+64)(%r1)
 	ld	%r9,(PC_SLBSAVE+72)(%r1)
 	ld	%r10,(PC_SLBSAVE+80)(%r1)
 	ld	%r11,(PC_SLBSAVE+88)(%r1)
 	ld	%r12,(PC_SLBSAVE+96)(%r1)
 	std	%r28,(PC_SLBSAVE+64)(%r1)
 	std	%r29,(PC_SLBSAVE+72)(%r1)
 	std	%r30,(PC_SLBSAVE+80)(%r1)
 	std	%r31,(PC_SLBSAVE+88)(%r1)
 
 	/* Restore kernel mapping */
 	bl	restore_kernsrs
 
 	/* Restore remaining registers */
 	ld	%r28,(PC_SLBSAVE+64)(%r1)
 	ld	%r29,(PC_SLBSAVE+72)(%r1)
 	ld	%r30,(PC_SLBSAVE+80)(%r1)
 	ld	%r31,(PC_SLBSAVE+88)(%r1)
 
 	ld	%r2,(PC_SLBSAVE+104)(%r1)
 	mtcr	%r2
 	ld	%r2,(PC_SLBSAVE+112)(%r1)
 	mtxer	%r2
 	ld	%r2,(PC_SLBSAVE+120)(%r1)
 	mtlr	%r2
 	ld	%r2,(PC_SLBSAVE+128)(%r1)
 	mtctr	%r2
 	ld	%r2,(PC_SLBSAVE+136)(%r1)
 	mtlr	%r2
 
 	/* Restore r0-r3 */
 	ld	%r0,(PC_SLBSAVE+0)(%r1)
 	ld	%r2,(PC_SLBSAVE+16)(%r1)
 	ld	%r3,(PC_SLBSAVE+24)(%r1)
 	mfsprg1	%r1
 
 	/* Back to whatever we were doing */
 	rfid
 
 /*
  * For ALI: has to save DSISR and DAR
  */
 	.globl	CNAME(alitrap),CNAME(aliend)
 CNAME(alitrap):
 	mtsprg1	%r1			/* save SP */
 	GET_CPUINFO(%r1)
 	std	%r27,(PC_TEMPSAVE+CPUSAVE_R27)(%r1)	/* free r27-r31 */
 	std	%r28,(PC_TEMPSAVE+CPUSAVE_R28)(%r1)
 	std	%r29,(PC_TEMPSAVE+CPUSAVE_R29)(%r1)
 	std	%r30,(PC_TEMPSAVE+CPUSAVE_R30)(%r1)
 	std	%r31,(PC_TEMPSAVE+CPUSAVE_R31)(%r1)
 	mfdar	%r30
 	mfdsisr	%r31
 	std	%r30,(PC_TEMPSAVE+CPUSAVE_AIM_DAR)(%r1)
 	std	%r31,(PC_TEMPSAVE+CPUSAVE_AIM_DSISR)(%r1)
 	mfsprg1	%r1			/* restore SP, in case of branch */
 	mflr	%r28			/* save LR */
 	mfcr	%r29			/* save CR */
 
 	/* Begin dance to branch to s_trap in a bit */
 	b	1f
 	.p2align 3
 1:	nop
 	bl	1f
 	.llong	s_trap
 1:	mflr	%r31
 	ld	%r31,0(%r31)
 	mtlr	%r31
 
 	/* Put our exception vector in SPRG3 */
 	li	%r31, EXC_ALI
 	mtsprg3	%r31
 
 	/* Test whether we already had PR set */
 	mfsrr1	%r31
 	mtcr	%r31
 	blrl
 CNAME(aliend):
 
 /*
  * Similar to the above for DSI
  * Has to handle standard pagetable spills
  */
 	.globl	CNAME(dsitrap),CNAME(dsiend)
 	.p2align 3
 CNAME(dsitrap):
 	mtsprg1	%r1			/* save SP */
 	GET_CPUINFO(%r1)
 	std	%r27,(PC_DISISAVE+CPUSAVE_R27)(%r1)	/* free r27-r31 */
 	std	%r28,(PC_DISISAVE+CPUSAVE_R28)(%r1)
 	std	%r29,(PC_DISISAVE+CPUSAVE_R29)(%r1)
 	std	%r30,(PC_DISISAVE+CPUSAVE_R30)(%r1)
 	std	%r31,(PC_DISISAVE+CPUSAVE_R31)(%r1)
 	mfcr	%r29			/* save CR */
 	mfxer	%r30			/* save XER */
 	mtsprg2	%r30			/* in SPRG2 */
 	mfsrr1	%r31			/* test kernel mode */
 	mtcr	%r31
 	mflr	%r28			/* save LR (SP already saved) */
 	bl	1f			/* Begin branching to disitrap */
 	.llong	disitrap
 1:	mflr	%r1
 	ld	%r1,0(%r1)
 	mtlr	%r1
 	blrl				/* Branch to generictrap */
 CNAME(dsiend):
 
 /*
  * Preamble code for DSI/ISI traps
  */
 disitrap:
 	/* Write the trap vector to SPRG3 by computing LR & 0xff00 */
 	mflr	%r1
 	andi.	%r1,%r1,0xff00
 	mtsprg3	%r1
 	
 	GET_CPUINFO(%r1)
 	ld	%r31,(PC_DISISAVE+CPUSAVE_R27)(%r1)
 	std	%r31,(PC_TEMPSAVE+CPUSAVE_R27)(%r1)
 	ld	%r30,(PC_DISISAVE+CPUSAVE_R28)(%r1)
 	std	%r30,(PC_TEMPSAVE+CPUSAVE_R28)(%r1)
 	ld	%r31,(PC_DISISAVE+CPUSAVE_R29)(%r1)
 	std	%r31,(PC_TEMPSAVE+CPUSAVE_R29)(%r1)
 	ld	%r30,(PC_DISISAVE+CPUSAVE_R30)(%r1)
 	std	%r30,(PC_TEMPSAVE+CPUSAVE_R30)(%r1)
 	ld	%r31,(PC_DISISAVE+CPUSAVE_R31)(%r1)
 	std	%r31,(PC_TEMPSAVE+CPUSAVE_R31)(%r1)
 	mfdar	%r30
 	mfdsisr	%r31
 	std	%r30,(PC_TEMPSAVE+CPUSAVE_AIM_DAR)(%r1)
 	std	%r31,(PC_TEMPSAVE+CPUSAVE_AIM_DSISR)(%r1)
 
 #ifdef KDB
 	/* Try to detect a kernel stack overflow */
 	mfsrr1	%r31
 	mtcr	%r31
 	bt	17,realtrap		/* branch is user mode */
 	mfsprg1	%r31			/* get old SP */
 	clrrdi	%r31,%r31,12		/* Round SP down to nearest page */
 	sub.	%r30,%r31,%r30		/* SP - DAR */
 	bge	1f
 	neg	%r30,%r30		/* modulo value */
 1:	cmpldi	%cr0,%r30,4096		/* is DAR within a page of SP? */
 	bge	%cr0,realtrap		/* no, too far away. */
 
 	/* Now convert this DSI into a DDB trap.  */
 	GET_CPUINFO(%r1)
 	ld	%r30,(PC_TEMPSAVE+CPUSAVE_AIM_DAR)(%r1) /* get DAR */
 	std	%r30,(PC_DBSAVE  +CPUSAVE_AIM_DAR)(%r1) /* save DAR */
 	ld	%r30,(PC_TEMPSAVE+CPUSAVE_AIM_DSISR)(%r1) /* get DSISR */
 	std	%r30,(PC_DBSAVE  +CPUSAVE_AIM_DSISR)(%r1) /* save DSISR */
 	ld	%r31,(PC_DISISAVE+CPUSAVE_R27)(%r1) /* get  r27 */
 	std	%r31,(PC_DBSAVE  +CPUSAVE_R27)(%r1) /* save r27 */
 	ld	%r30,(PC_DISISAVE+CPUSAVE_R28)(%r1) /* get  r28 */
 	std	%r30,(PC_DBSAVE  +CPUSAVE_R28)(%r1) /* save r28 */
 	ld	%r31,(PC_DISISAVE+CPUSAVE_R29)(%r1) /* get  r29 */
 	std	%r31,(PC_DBSAVE  +CPUSAVE_R29)(%r1) /* save r29 */
 	ld	%r30,(PC_DISISAVE+CPUSAVE_R30)(%r1) /* get  r30 */
 	std	%r30,(PC_DBSAVE  +CPUSAVE_R30)(%r1) /* save r30 */
 	ld	%r31,(PC_DISISAVE+CPUSAVE_R31)(%r1) /* get  r31 */
 	std	%r31,(PC_DBSAVE  +CPUSAVE_R31)(%r1) /* save r31 */
 	b	dbtrap
 #endif
 
 	/* XXX need stack probe here */
 realtrap:
 /* Test whether we already had PR set */
 	mfsrr1	%r1
 	mtcr	%r1
 	mfsprg1	%r1			/* restore SP (might have been
 					   overwritten) */
 	bf	17,k_trap		/* branch if PSL_PR is false */
 	GET_CPUINFO(%r1)
 	ld	%r1,PC_CURPCB(%r1)
 	mr	%r27,%r28		/* Save LR, r29 */
 	mtsprg2	%r29
 	bl	restore_kernsrs		/* enable kernel mapping */
 	mfsprg2	%r29
 	mr	%r28,%r27
 	b	s_trap
 
 /*
  * generictrap does some standard setup for trap handling to minimize
  * the code that need be installed in the actual vectors. It expects
  * the following conditions.
  * 
  * R1 - Trap vector = LR & (0xff00 | R1)
  * SPRG1 - Original R1 contents
  * SPRG2 - Original LR
  */
 
 generichypertrap:
 	mtsprg3 %r1
 	mfspr	%r1, SPR_HSRR0
 	mtsrr0	%r1
 	mfspr	%r1, SPR_HSRR1
 	mtsrr1	%r1
 	mfsprg3	%r1
 	.globl	CNAME(generictrap)
 generictrap:
 	/* Save R1 for computing the exception vector */
 	mtsprg3 %r1
 
 	/* Save interesting registers */
 	GET_CPUINFO(%r1)
 	std	%r27,(PC_TEMPSAVE+CPUSAVE_R27)(%r1)	/* free r27-r31 */
 	std	%r28,(PC_TEMPSAVE+CPUSAVE_R28)(%r1)
 	std	%r29,(PC_TEMPSAVE+CPUSAVE_R29)(%r1)
 	std	%r30,(PC_TEMPSAVE+CPUSAVE_R30)(%r1)
 	std	%r31,(PC_TEMPSAVE+CPUSAVE_R31)(%r1)
 	mfdar	%r30
 	std	%r30,(PC_TEMPSAVE+CPUSAVE_AIM_DAR)(%r1)
 	mfsprg1	%r1			/* restore SP, in case of branch */
 	mfsprg2	%r28			/* save LR */
 	mfcr	%r29			/* save CR */
 
 	/* Compute the exception vector from the link register */
 	mfsprg3 %r31
 	ori	%r31,%r31,0xff00
 	mflr	%r30
 	addi	%r30,%r30,-4 /* The branch instruction, not the next */
 	and	%r30,%r30,%r31
 	mtsprg3	%r30
 
 	/* Test whether we already had PR set */
 	mfsrr1	%r31
 	mtcr	%r31
 
 s_trap:
 	bf	17,k_trap		/* branch if PSL_PR is false */
 	GET_CPUINFO(%r1)
 u_trap:
 	ld	%r1,PC_CURPCB(%r1)
 	mr	%r27,%r28		/* Save LR, r29 */
 	mtsprg2	%r29
 	bl	restore_kernsrs		/* enable kernel mapping */
 	mfsprg2	%r29
 	mr	%r28,%r27
 
 /*
  * Now the common trap catching code.
  */
 k_trap:
 	FRAME_SETUP(PC_TEMPSAVE)
 /* Call C interrupt dispatcher: */
 trapagain:
 	GET_TOCBASE(%r2)
 	addi	%r3,%r1,48
 	bl	CNAME(powerpc_interrupt)
 	nop
 
 	.globl	CNAME(trapexit)	/* backtrace code sentinel */
 CNAME(trapexit):
 /* Disable interrupts: */
 	mfmsr	%r3
 	andi.	%r3,%r3,~PSL_EE@l
 	mtmsr	%r3
 	isync
 /* Test AST pending: */
 	ld	%r5,FRAME_SRR1+48(%r1)
 	mtcr	%r5
 	bf	17,1f			/* branch if PSL_PR is false */
 
 	GET_CPUINFO(%r3)		/* get per-CPU pointer */
 	lwz	%r4, TD_FLAGS(%r13)	/* get thread flags value */
 	lis	%r5, (TDF_ASTPENDING|TDF_NEEDRESCHED)@h
 	ori	%r5,%r5, (TDF_ASTPENDING|TDF_NEEDRESCHED)@l
 	and.	%r4,%r4,%r5
 	beq	1f
 	mfmsr	%r3			/* re-enable interrupts */
 	ori	%r3,%r3,PSL_EE@l
 	mtmsr	%r3
 	isync
 	GET_TOCBASE(%r2)
 	addi	%r3,%r1,48
 	bl	CNAME(ast)
 	nop
 	.globl	CNAME(asttrapexit)	/* backtrace code sentinel #2 */
 CNAME(asttrapexit):
 	b	trapexit		/* test ast ret value ? */
 1:
 	FRAME_LEAVE(PC_TEMPSAVE)
 	rfid
 
 #if defined(KDB)
 /*
  * Deliberate entry to dbtrap
  */
 ASENTRY_NOPROF(breakpoint)
 	mtsprg1	%r1
 	mfmsr	%r3
 	mtsrr1	%r3
 	andi.	%r3,%r3,~(PSL_EE|PSL_ME)@l
 	mtmsr	%r3			/* disable interrupts */
 	isync
 	GET_CPUINFO(%r3)
 	std	%r27,(PC_DBSAVE+CPUSAVE_R27)(%r3)
 	std	%r28,(PC_DBSAVE+CPUSAVE_R28)(%r3)
 	std	%r29,(PC_DBSAVE+CPUSAVE_R29)(%r3)
 	std	%r30,(PC_DBSAVE+CPUSAVE_R30)(%r3)
 	std	%r31,(PC_DBSAVE+CPUSAVE_R31)(%r3)
 	mflr	%r28
 	li	%r29,EXC_BPT
 	mtlr	%r29
 	mfcr	%r29
 	mtsrr0	%r28
 
 /*
  * Now the kdb trap catching code.
  */
 dbtrap:
 	/* Write the trap vector to SPRG3 by computing LR & 0xff00 */
 	mflr	%r1
 	andi.	%r1,%r1,0xff00
 	mtsprg3	%r1
 
 	GET_TOCBASE(%r1)			/* get new SP */
-	ld	%r1,TOC_REF(trapstk)(%r1)
+	addis	%r1,%r1,TOC_REF(trapstk)@ha
+	ld	%r1,TOC_REF(trapstk)@l(%r1)
 	addi	%r1,%r1,(TRAPSTKSZ-48)
 
 	FRAME_SETUP(PC_DBSAVE)
 /* Call C trap code: */
 	GET_TOCBASE(%r2)
 	addi	%r3,%r1,48
 	bl	CNAME(db_trap_glue)
 	nop
 	or.	%r3,%r3,%r3
 	bne	dbleave
 /* This wasn't for KDB, so switch to real trap: */
 	ld	%r3,FRAME_EXC+48(%r1)	/* save exception */
 	GET_CPUINFO(%r4)
 	std	%r3,(PC_DBSAVE+CPUSAVE_R31)(%r4)
 	FRAME_LEAVE(PC_DBSAVE)
 	mtsprg1	%r1			/* prepare for entrance to realtrap */
 	GET_CPUINFO(%r1)
 	std	%r27,(PC_TEMPSAVE+CPUSAVE_R27)(%r1)
 	std	%r28,(PC_TEMPSAVE+CPUSAVE_R28)(%r1)
 	std	%r29,(PC_TEMPSAVE+CPUSAVE_R29)(%r1)
 	std	%r30,(PC_TEMPSAVE+CPUSAVE_R30)(%r1)
 	std	%r31,(PC_TEMPSAVE+CPUSAVE_R31)(%r1)
 	mflr	%r28
 	mfcr	%r29
 	ld	%r31,(PC_DBSAVE+CPUSAVE_R31)(%r1)
 	mtsprg3	%r31			/* SPRG3 was clobbered by FRAME_LEAVE */
 	mfsprg1	%r1
 	b	realtrap
 dbleave:
 	FRAME_LEAVE(PC_DBSAVE)
 	rfid
 
 /*
  * In case of KDB we want a separate trap catcher for it
  */
 	.globl	CNAME(dblow),CNAME(dbend)
 	.p2align 3
 CNAME(dblow):
 	mtsprg1	%r1			/* save SP */
 	mtsprg2	%r29			/* save r29 */
 	mfcr	%r29			/* save CR in r29 */
 	mfsrr1	%r1
 	mtcr	%r1
 	bf	17,1f			/* branch if privileged */
 
 	/* Unprivileged case */
 	mtcr	%r29			/* put the condition register back */
         mfsprg2	%r29			/* ... and r29 */
         mflr	%r1			/* save LR */
 	mtsprg2 %r1			/* And then in SPRG2 */
 
 	ld	%r1, TRAP_GENTRAP(0)	/* Get branch address */
 	mtlr	%r1
 	li	%r1, 0	 		/* How to get the vector from LR */
 	blrl				/* Branch to generictrap */
 
 1:
 	GET_CPUINFO(%r1)
 	std	%r27,(PC_DBSAVE+CPUSAVE_R27)(%r1)	/* free r27 */
 	std	%r28,(PC_DBSAVE+CPUSAVE_R28)(%r1)	/* free r28 */
         mfsprg2	%r28				/* r29 holds cr...  */
         std	%r28,(PC_DBSAVE+CPUSAVE_R29)(%r1)	/* free r29 */
         std	%r30,(PC_DBSAVE+CPUSAVE_R30)(%r1)	/* free r30 */
         std	%r31,(PC_DBSAVE+CPUSAVE_R31)(%r1)	/* free r31 */
         mflr	%r28					/* save LR */
 	nop						/* alignment */
 	bl	9f					/* Begin branch */
 	.llong	dbtrap
 9:	mflr	%r1
 	ld	%r1,0(%r1)
 	mtlr	%r1
 	blrl				/* Branch to generictrap */
 CNAME(dbend):
 #endif /* KDB */
Index: projects/capsicum-test/sys/powerpc/booke/locore.S
===================================================================
--- projects/capsicum-test/sys/powerpc/booke/locore.S	(revision 345709)
+++ projects/capsicum-test/sys/powerpc/booke/locore.S	(revision 345710)
@@ -1,938 +1,940 @@
 /*-
  * Copyright (C) 2007-2009 Semihalf, Rafal Jaworowski <raj@semihalf.com>
  * Copyright (C) 2006 Semihalf, Marian Balakowicz <m8@semihalf.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN
  * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "assym.inc"
 
 #include "opt_hwpmc_hooks.h"
 
 #include <machine/asm.h>
 #include <machine/hid.h>
 #include <machine/param.h>
 #include <machine/spr.h>
 #include <machine/pte.h>
 #include <machine/trap.h>
 #include <machine/vmparam.h>
 #include <machine/tlb.h>
 
 #define TMPSTACKSZ	16384
 
 #ifdef __powerpc64__
 #define GET_TOCBASE(r)  \
 	mfspr	r, SPR_SPRG8
 #define	TOC_RESTORE	nop
 #define	CMPI	cmpdi
 #define	CMPL	cmpld
 #define	LOAD	ld
 #define	LOADX	ldarx
 #define	STORE	std
 #define	STOREX	stdcx.
 #define	STU	stdu
 #define	CALLSIZE	48
 #define	REDZONE		288
 #define	THREAD_REG	%r13
 #define	ADDR(x)	\
 	.llong	x
 #define	WORD_SIZE	8
 #else
 #define	GET_TOCBASE(r)
 #define	TOC_RESTORE
 #define	CMPI	cmpwi
 #define	CMPL	cmplw
 #define	LOAD	lwz
 #define	LOADX	lwarx
 #define	STOREX	stwcx.
 #define	STORE	stw
 #define	STU	stwu
 #define	CALLSIZE	8
 #define	REDZONE		0
 #define	THREAD_REG	%r2
 #define	ADDR(x)	\
 	.long	x
 #define	WORD_SIZE	4
 #endif
 
 	.text
 	.globl	btext
 btext:
 
 /*
  * This symbol is here for the benefit of kvm_mkdb, and is supposed to
  * mark the start of kernel text.
  */
 	.globl	kernel_text
 kernel_text:
 
 /*
  * Startup entry.  Note, this must be the first thing in the text segment!
  */
 	.text
 	.globl	__start
 __start:
 
 /*
  * Assumptions on the boot loader:
  *  - System memory starts from physical address 0
  *  - It's mapped by a single TLB1 entry
  *  - TLB1 mapping is 1:1 pa to va
  *  - Kernel is loaded at 64MB boundary
  *  - All PID registers are set to the same value
  *  - CPU is running in AS=0
  *
  * Registers contents provided by the loader(8):
  *	r1	: stack pointer
  *	r3	: metadata pointer
  *
  * We rearrange the TLB1 layout as follows:
  *  - Find TLB1 entry we started in
  *  - Make sure it's protected, invalidate other entries
  *  - Create temp entry in the second AS (make sure it's not TLB[1])
  *  - Switch to temp mapping
  *  - Map 64MB of RAM in TLB1[1]
  *  - Use AS=0, set EPN to VM_MIN_KERNEL_ADDRESS and RPN to kernel load address
  *  - Switch to TLB1[1] mapping
  *  - Invalidate temp mapping
  *
  * locore registers use:
  *	r1	: stack pointer
  *	r2	: trace pointer (AP only, for early diagnostics)
  *	r3-r27	: scratch registers
  *	r28	: temp TLB1 entry
  *	r29	: initial TLB1 entry we started in
  *	r30-r31	: arguments (metadata pointer)
  */
 
 /*
  * Keep arguments in r30 & r31 for later use.
  */
 	mr	%r30, %r3
 	mr	%r31, %r4
 
 /*
  * Initial cleanup
  */
 	li	%r3, PSL_DE	/* Keep debug exceptions for CodeWarrior. */
 #ifdef __powerpc64__
 	oris	%r3, %r3, PSL_CM@h
 #endif
 	mtmsr	%r3
 	isync
 
 /*
  * Initial HIDs configuration
  */
 1:
 	mfpvr	%r3
 	rlwinm	%r3, %r3, 16, 16, 31
 
 	lis	%r4, HID0_E500_DEFAULT_SET@h
 	ori	%r4, %r4, HID0_E500_DEFAULT_SET@l
 
 	/* Check for e500mc and e5500 */
 	cmpli	0, 0, %r3, FSL_E500mc
 	bne	2f
 
 	lis	%r4, HID0_E500MC_DEFAULT_SET@h
 	ori	%r4, %r4, HID0_E500MC_DEFAULT_SET@l
 	b	3f
 2:
 	cmpli	0, 0, %r3, FSL_E5500
 	bne	3f
 
 	lis	%r4, HID0_E5500_DEFAULT_SET@h
 	ori	%r4, %r4, HID0_E5500_DEFAULT_SET@l
 
 3:
 	mtspr	SPR_HID0, %r4
 	isync
 
 /*
  * E500mc and E5500 do not have HID1 register, so skip HID1 setup on
  * this core.
  */
 	cmpli	0, 0, %r3, FSL_E500mc
 	beq	1f
 	cmpli	0, 0, %r3, FSL_E5500
 	beq	1f
 	cmpli	0, 0, %r3, FSL_E6500
 	beq	1f
 
 	lis	%r3, HID1_E500_DEFAULT_SET@h
 	ori	%r3, %r3, HID1_E500_DEFAULT_SET@l
 	mtspr	SPR_HID1, %r3
 	isync
 1:
 	/* Invalidate all entries in TLB0 */
 	li	%r3, 0
 	bl	tlb_inval_all
 
 	cmpwi	%r30, 0
 	beq	done_mapping
 
 /*
  * Locate the TLB1 entry that maps this code
  */
 	bl	1f
 1:	mflr	%r3
 	bl	tlb1_find_current	/* the entry found is returned in r29 */
 
 	bl	tlb1_inval_all_but_current
 
 /*
  * Create temporary mapping in AS=1 and switch to it
  */
 	bl	tlb1_temp_mapping_as1
 
 	mfmsr	%r3
 	ori	%r3, %r3, (PSL_IS | PSL_DS)
 	bl	2f
 2:	mflr	%r4
 	addi	%r4, %r4, (3f - 2b)
 	mtspr	SPR_SRR0, %r4
 	mtspr	SPR_SRR1, %r3
 	rfi				/* Switch context */
 
 /*
  * Invalidate initial entry
  */
 3:
 	mr	%r3, %r29
 	bl	tlb1_inval_entry
 
 /*
  * Setup final mapping in TLB1[1] and switch to it
  */
 	/* Final kernel mapping, map in 64 MB of RAM */
 	lis	%r3, MAS0_TLBSEL1@h	/* Select TLB1 */
 	li	%r4, 0			/* Entry 0 */
 	rlwimi	%r3, %r4, 16, 10, 15
 	mtspr	SPR_MAS0, %r3
 	isync
 
 	li	%r3, (TLB_SIZE_64M << MAS1_TSIZE_SHIFT)@l
 	oris	%r3, %r3, (MAS1_VALID | MAS1_IPROT)@h
 	mtspr	SPR_MAS1, %r3		/* note TS was not filled, so it's TS=0 */
 	isync
 
 	LOAD_ADDR(%r3, VM_MIN_KERNEL_ADDRESS)
 	ori	%r3, %r3, (_TLB_ENTRY_SHARED | MAS2_M)@l /* WIMGE = 0b00100 */
 	mtspr	SPR_MAS2, %r3
 	isync
 
 	/* Discover phys load address */
 	bl	3f
 3:	mflr	%r4			/* Use current address */
 	rlwinm	%r4, %r4, 0, 0, 5	/* 64MB alignment mask */
 	ori	%r4, %r4, (MAS3_SX | MAS3_SW | MAS3_SR)@l
 	mtspr	SPR_MAS3, %r4		/* Set RPN and protection */
 	isync
 	bl	zero_mas7
 	bl	zero_mas8
 	isync
 	tlbwe
 	isync
 	msync
 
 	/* Switch to the above TLB1[1] mapping */
 	bl	4f
 4:	mflr	%r4
 #ifdef __powerpc64__
 	clrldi	%r4, %r4, 38
 	clrrdi	%r3, %r3, 12
 #else
 	rlwinm	%r4, %r4, 0, 6, 31	/* Current offset from kernel load address */
 	rlwinm	%r3, %r3, 0, 0, 19
 #endif
 	add	%r4, %r4, %r3		/* Convert to kernel virtual address */
 	addi	%r4, %r4, (5f - 4b)
 	li	%r3, PSL_DE		/* Note AS=0 */
 #ifdef __powerpc64__
 	oris	%r3, %r3, PSL_CM@h
 #endif
 	mtspr   SPR_SRR0, %r4
 	mtspr   SPR_SRR1, %r3
 	rfi
 
 /*
  * Invalidate temp mapping
  */
 5:
 	mr	%r3, %r28
 	bl	tlb1_inval_entry
 
 done_mapping:
 
 #ifdef __powerpc64__
 	/* Set up the TOC pointer */
 	b	0f
 	.align 3
 0:	nop
 	bl	1f
 	.llong	__tocbase + 0x8000 - .
 1:	mflr	%r2
 	ld	%r1,0(%r2)
 	add	%r2,%r1,%r2
 	mtspr	SPR_SPRG8, %r2
 
 	/* Get load offset */
 	ld	%r31,-0x8000(%r2) /* First TOC entry is TOC base */
 	subf    %r31,%r31,%r2	/* Subtract from real TOC base to get base */
 
 	/* Set up the stack pointer */
-	ld	%r1,TOC_REF(tmpstack)(%r2)
+	addis	%r1,%r2,TOC_REF(tmpstack)@ha
+	ld	%r1,TOC_REF(tmpstack)@l(%r1)
 	addi	%r1,%r1,TMPSTACKSZ-96
 	add	%r1,%r1,%r31
 	bl	1f
 	.llong _DYNAMIC-.
 1:	mflr	%r3
 	ld	%r4,0(%r3)
 	add	%r3,%r4,%r3
 	mr	%r4,%r31
 #else
 /*
  * Setup a temporary stack
  */
 	bl	1f
 	.long tmpstack-.
 1:	mflr	%r1
 	lwz	%r2,0(%r1)
 	add	%r1,%r1,%r2
 	addi	%r1, %r1, (TMPSTACKSZ - 16)
 
 /*
  * Relocate kernel
  */
 	bl      1f
 	.long   _DYNAMIC-.
 	.long   _GLOBAL_OFFSET_TABLE_-.
 1:	mflr    %r5
 	lwz	%r3,0(%r5)	/* _DYNAMIC in %r3 */
 	add	%r3,%r3,%r5
 	lwz	%r4,4(%r5)	/* GOT pointer */
 	add	%r4,%r4,%r5
 	lwz	%r4,4(%r4)	/* got[0] is _DYNAMIC link addr */
 	subf	%r4,%r4,%r3	/* subtract to calculate relocbase */
 #endif
 	bl	CNAME(elf_reloc_self)
 	TOC_RESTORE
 
 /*
  * Initialise exception vector offsets
  */
 	bl	CNAME(ivor_setup)
 	TOC_RESTORE
 
 /*
  * Set up arguments and jump to system initialization code
  */
 	mr	%r3, %r30
 	mr	%r4, %r31
 
 	/* Prepare core */
 	bl	CNAME(booke_init)
 	TOC_RESTORE
 
 	/* Switch to thread0.td_kstack now */
 	mr	%r1, %r3
 	li	%r3, 0
 	STORE	%r3, 0(%r1)
 
 	/* Machine independet part, does not return */
 	bl	CNAME(mi_startup)
 	TOC_RESTORE
 	/* NOT REACHED */
 5:	b	5b
 
 
 #ifdef SMP
 /************************************************************************/
 /* AP Boot page */
 /************************************************************************/
 	.text
 	.globl	__boot_page
 	.align	12
 __boot_page:
 	bl	1f
 
 	.globl	bp_trace
 bp_trace:
 	.long	0
 
 	.globl	bp_kernload
 bp_kernload:
 	.long	0
 
 /*
  * Initial configuration
  */
 1:
 	mflr    %r31		/* r31 hold the address of bp_trace */
 
 	/* Set HIDs */
 	mfpvr	%r3
 	rlwinm	%r3, %r3, 16, 16, 31
 
 	/* HID0 for E500 is default */
 	lis	%r4, HID0_E500_DEFAULT_SET@h
 	ori	%r4, %r4, HID0_E500_DEFAULT_SET@l
 
 	cmpli	0, 0, %r3, FSL_E500mc
 	bne	2f
 	lis	%r4, HID0_E500MC_DEFAULT_SET@h
 	ori	%r4, %r4, HID0_E500MC_DEFAULT_SET@l
 	b	3f
 2:
 	cmpli	0, 0, %r3, FSL_E5500
 	bne	3f
 	lis	%r4, HID0_E5500_DEFAULT_SET@h
 	ori	%r4, %r4, HID0_E5500_DEFAULT_SET@l
 3:
 	mtspr	SPR_HID0, %r4
 	isync
 
 	/* Enable branch prediction */
 	li	%r3, BUCSR_BPEN
 	mtspr	SPR_BUCSR, %r3
 	isync
 
 	/* Invalidate all entries in TLB0 */
 	li	%r3, 0
 	bl	tlb_inval_all
 
 /*
  * Find TLB1 entry which is translating us now
  */
 	bl	2f
 2:	mflr	%r3
 	bl	tlb1_find_current	/* the entry number found is in r29 */
 
 	bl	tlb1_inval_all_but_current
 
 /*
  * Create temporary translation in AS=1 and switch to it
  */
 
 	bl	tlb1_temp_mapping_as1
 
 	mfmsr	%r3
 	ori	%r3, %r3, (PSL_IS | PSL_DS)
 #ifdef __powerpc64__
 	oris	%r3, %r3, PSL_CM@h
 #endif
 	bl	3f
 3:	mflr	%r4
 	addi	%r4, %r4, (4f - 3b)
 	mtspr	SPR_SRR0, %r4
 	mtspr	SPR_SRR1, %r3
 	rfi				/* Switch context */
 
 /*
  * Invalidate initial entry
  */
 4:
 	mr	%r3, %r29
 	bl	tlb1_inval_entry
 
 /*
  * Setup final mapping in TLB1[1] and switch to it
  */
 	/* Final kernel mapping, map in 64 MB of RAM */
 	lis	%r3, MAS0_TLBSEL1@h	/* Select TLB1 */
 	li	%r4, 0			/* Entry 0 */
 	rlwimi	%r3, %r4, 16, 4, 15
 	mtspr	SPR_MAS0, %r3
 	isync
 
 	li	%r3, (TLB_SIZE_64M << MAS1_TSIZE_SHIFT)@l
 	oris	%r3, %r3, (MAS1_VALID | MAS1_IPROT)@h
 	mtspr	SPR_MAS1, %r3		/* note TS was not filled, so it's TS=0 */
 	isync
 
 	LOAD_ADDR(%r3, VM_MIN_KERNEL_ADDRESS)
 	ori	%r3, %r3, (_TLB_ENTRY_SHARED | MAS2_M)@l /* WIMGE = 0b00100 */
 	mtspr	SPR_MAS2, %r3
 	isync
 
 	/* Retrieve kernel load [physical] address from bp_kernload */
 #ifdef __powerpc64__
 	b	0f
 	.align	3
 0:
 	nop
 #endif
 	bl 5f
 	ADDR(bp_kernload)
 	ADDR(__boot_page)
 5:	mflr	%r3
 #ifdef __powerpc64__
 	ld	%r4, 0(%r3)
 	ld	%r5, 8(%r3)
 	clrrdi	%r3, %r3, 12
 #else
 	lwz	%r4, 0(%r3)
 	lwz	%r5, 4(%r3)
 	rlwinm	%r3, %r3, 0, 0, 19
 #endif
 	sub	%r4, %r4, %r5	/* offset of bp_kernload within __boot_page */
 	lwzx	%r3, %r4, %r3
 
 	/* Set RPN and protection */
 	ori	%r3, %r3, (MAS3_SX | MAS3_SW | MAS3_SR)@l
 	mtspr	SPR_MAS3, %r3
 	isync
 	bl	zero_mas7
 	bl	zero_mas8
 	isync
 	tlbwe
 	isync
 	msync
 
 	/* Switch to the final mapping */
 	bl	6f
 6:	mflr	%r3
 	rlwinm	%r3, %r3, 0, 0xfff	/* Offset from boot page start */
 	add	%r3, %r3, %r5		/* Make this virtual address */
 	addi	%r3, %r3, (7f - 6b)
 #ifdef __powerpc64__
 	lis	%r4, PSL_CM@h		/* Note AS=0 */
 #else
 	li	%r4, 0			/* Note AS=0 */
 #endif
 	mtspr	SPR_SRR0, %r3
 	mtspr	SPR_SRR1, %r4
 	rfi
 7:
 
 /*
  * At this point we're running at virtual addresses VM_MIN_KERNEL_ADDRESS and
  * beyond so it's allowed to directly access all locations the kernel was linked
  * against.
  */
 
 /*
  * Invalidate temp mapping
  */
 	mr	%r3, %r28
 	bl	tlb1_inval_entry
 
 #ifdef __powerpc64__
 	/* Set up the TOC pointer */
 	b	0f
 	.align 3
 0:	nop
 	bl	1f
 	.llong	__tocbase + 0x8000 - .
 1:	mflr	%r2
 	ld	%r1,0(%r2)
 	add	%r2,%r1,%r2
 	mtspr	SPR_SPRG8, %r2
 
 	/* Set up the stack pointer */
-	ld	%r1,TOC_REF(tmpstack)(%r2)
+	addis	%r1,%r2,TOC_REF(tmpstack)@ha
+	ld	%r1,TOC_REF(tmpstack)@l(%r1)
 	addi	%r1,%r1,TMPSTACKSZ-96
 #else
 /*
  * Setup a temporary stack
  */
 	bl	1f
 	.long tmpstack-.
 1:	mflr	%r1
 	lwz	%r2,0(%r1)
 	add	%r1,%r1,%r2
 	stw	%r1, 0(%r1)
 	addi	%r1, %r1, (TMPSTACKSZ - 16)
 #endif
 
 /*
  * Initialise exception vector offsets
  */
 	bl	CNAME(ivor_setup)
 	TOC_RESTORE
 
 	/*
 	 * Assign our pcpu instance
 	 */
 	bl	1f
 	.long ap_pcpu-.
 1:	mflr	%r4
 	lwz	%r3, 0(%r4)
 	add	%r3, %r3, %r4
 	LOAD	%r3, 0(%r3)
 	mtsprg0	%r3
 
 	bl	CNAME(pmap_bootstrap_ap)
 	TOC_RESTORE
 
 	bl	CNAME(cpudep_ap_bootstrap)
 	TOC_RESTORE
 	/* Switch to the idle thread's kstack */
 	mr	%r1, %r3
 	
 	bl	CNAME(machdep_ap_bootstrap)
 	TOC_RESTORE
 
 	/* NOT REACHED */
 6:	b	6b
 #endif /* SMP */
 
 #if defined (BOOKE_E500)
 /*
  * Invalidate all entries in the given TLB.
  *
  * r3	TLBSEL
  */
 tlb_inval_all:
 	rlwinm	%r3, %r3, 3, (1 << 3)	/* TLBSEL */
 	ori	%r3, %r3, (1 << 2)	/* INVALL */
 	tlbivax	0, %r3
 	isync
 	msync
 
 	tlbsync
 	msync
 	blr
 
 /*
  * expects address to look up in r3, returns entry number in r29
  *
  * FIXME: the hidden assumption is we are now running in AS=0, but we should
  * retrieve actual AS from MSR[IS|DS] and put it in MAS6[SAS]
  */
 tlb1_find_current:
 	mfspr	%r17, SPR_PID0
 	slwi	%r17, %r17, MAS6_SPID0_SHIFT
 	mtspr	SPR_MAS6, %r17
 	isync
 	tlbsx	0, %r3
 	mfspr	%r17, SPR_MAS0
 	rlwinm	%r29, %r17, 16, 26, 31		/* MAS0[ESEL] -> r29 */
 
 	/* Make sure we have IPROT set on the entry */
 	mfspr	%r17, SPR_MAS1
 	oris	%r17, %r17, MAS1_IPROT@h
 	mtspr	SPR_MAS1, %r17
 	isync
 	tlbwe
 	isync
 	msync
 	blr
 
 /*
  * Invalidates a single entry in TLB1.
  *
  * r3		ESEL
  * r4-r5	scratched
  */
 tlb1_inval_entry:
 	lis	%r4, MAS0_TLBSEL1@h	/* Select TLB1 */
 	rlwimi	%r4, %r3, 16, 10, 15	/* Select our entry */
 	mtspr	SPR_MAS0, %r4
 	isync
 	tlbre
 	li	%r5, 0			/* MAS1[V] = 0 */
 	mtspr	SPR_MAS1, %r5
 	isync
 	tlbwe
 	isync
 	msync
 	blr
 
 /*
  * r29		current entry number
  * r28		returned temp entry
  * r3-r5	scratched
  */
 tlb1_temp_mapping_as1:
 	/* Read our current translation */
 	lis	%r3, MAS0_TLBSEL1@h	/* Select TLB1 */
 	rlwimi	%r3, %r29, 16, 10, 15	/* Select our current entry */
 	mtspr	SPR_MAS0, %r3
 	isync
 	tlbre
 
 	/*
 	 * Prepare and write temp entry
 	 *
 	 * FIXME this is not robust against overflow i.e. when the current
 	 * entry is the last in TLB1
 	 */
 	lis	%r3, MAS0_TLBSEL1@h	/* Select TLB1 */
 	addi	%r28, %r29, 1		/* Use next entry. */
 	rlwimi	%r3, %r28, 16, 10, 15	/* Select temp entry */
 	mtspr	SPR_MAS0, %r3
 	isync
 	mfspr	%r5, SPR_MAS1
 	li	%r4, 1			/* AS=1 */
 	rlwimi	%r5, %r4, 12, 19, 19
 	li	%r4, 0			/* Global mapping, TID=0 */
 	rlwimi	%r5, %r4, 16, 8, 15
 	oris	%r5, %r5, (MAS1_VALID | MAS1_IPROT)@h
 	mtspr	SPR_MAS1, %r5
 	isync
 	mflr	%r3
 	bl	zero_mas7
 	bl	zero_mas8
 	mtlr	%r3
 	isync
 	tlbwe
 	isync
 	msync
 	blr
 
 /*
  * Loops over TLB1, invalidates all entries skipping the one which currently
  * maps this code.
  *
  * r29		current entry
  * r3-r5	scratched
  */
 tlb1_inval_all_but_current:
 	mfspr	%r3, SPR_TLB1CFG	/* Get number of entries */
 	andi.	%r3, %r3, TLBCFG_NENTRY_MASK@l
 	li	%r4, 0			/* Start from Entry 0 */
 1:	lis	%r5, MAS0_TLBSEL1@h
 	rlwimi	%r5, %r4, 16, 10, 15
 	mtspr	SPR_MAS0, %r5
 	isync
 	tlbre
 	mfspr	%r5, SPR_MAS1
 	cmpw	%r4, %r29		/* our current entry? */
 	beq	2f
 	rlwinm	%r5, %r5, 0, 2, 31	/* clear VALID and IPROT bits */
 	mtspr	SPR_MAS1, %r5
 	isync
 	tlbwe
 	isync
 	msync
 2:	addi	%r4, %r4, 1
 	cmpw	%r4, %r3		/* Check if this is the last entry */
 	bne	1b
 	blr
 
 /*
  * MAS7 and MAS8 conditional zeroing.
  */
 .globl zero_mas7
 zero_mas7:
 	mfpvr	%r20
 	rlwinm	%r20, %r20, 16, 16, 31
 	cmpli	0, 0, %r20, FSL_E500v1
 	beq	1f
 
 	li	%r20, 0
 	mtspr	SPR_MAS7, %r20
 1:
 	blr
 
 .globl zero_mas8
 zero_mas8:
 	mfpvr	%r20
 	rlwinm	%r20, %r20, 16, 16, 31
 	cmpli	0, 0, %r20, FSL_E500mc
 	beq	1f
 	cmpli	0, 0, %r20, FSL_E5500
 	beq	1f
 
 	blr
 1:
 	li	%r20, 0
 	mtspr	SPR_MAS8, %r20
 	blr
 #endif
 
 #ifdef SMP
 .globl __boot_tlb1
 	/*
 	 * The __boot_tlb1 table is used to hold BSP TLB1 entries
 	 * marked with _TLB_ENTRY_SHARED flag during AP bootstrap.
 	 * The BSP fills in the table in tlb_ap_prep() function. Next,
 	 * AP loads its contents to TLB1 hardware in pmap_bootstrap_ap().
 	 */
 __boot_tlb1:
 	.space TLB1_MAX_ENTRIES * TLB_ENTRY_SIZE
 
 __boot_page_padding:
 	/*
 	 * Boot page needs to be exactly 4K, with the last word of this page
 	 * acting as the reset vector, so we need to stuff the remainder.
 	 * Upon release from holdoff CPU fetches the last word of the boot
 	 * page.
 	 */
 	.space	4092 - (__boot_page_padding - __boot_page)
 	b	__boot_page
 #endif /* SMP */
 
 /************************************************************************/
 /* locore subroutines */
 /************************************************************************/
 
 /*
  * Cache disable/enable/inval sequences according
  * to section 2.16 of E500CORE RM.
  */
 ENTRY(dcache_inval)
 	/* Invalidate d-cache */
 	mfspr	%r3, SPR_L1CSR0
 	ori	%r3, %r3, (L1CSR0_DCFI | L1CSR0_DCLFR)@l
 	msync
 	isync
 	mtspr	SPR_L1CSR0, %r3
 	isync
 1:	mfspr	%r3, SPR_L1CSR0
 	andi.	%r3, %r3, L1CSR0_DCFI
 	bne	1b
 	blr
 
 ENTRY(dcache_disable)
 	/* Disable d-cache */
 	mfspr	%r3, SPR_L1CSR0
 	li	%r4, L1CSR0_DCE@l
 	not	%r4, %r4
 	and	%r3, %r3, %r4
 	msync
 	isync
 	mtspr	SPR_L1CSR0, %r3
 	isync
 	blr
 
 ENTRY(dcache_enable)
 	/* Enable d-cache */
 	mfspr	%r3, SPR_L1CSR0
 	oris	%r3, %r3, (L1CSR0_DCPE | L1CSR0_DCE)@h
 	ori	%r3, %r3, (L1CSR0_DCPE | L1CSR0_DCE)@l
 	msync
 	isync
 	mtspr	SPR_L1CSR0, %r3
 	isync
 	blr
 
 ENTRY(icache_inval)
 	/* Invalidate i-cache */
 	mfspr	%r3, SPR_L1CSR1
 	ori	%r3, %r3, (L1CSR1_ICFI | L1CSR1_ICLFR)@l
 	isync
 	mtspr	SPR_L1CSR1, %r3
 	isync
 1:	mfspr	%r3, SPR_L1CSR1
 	andi.	%r3, %r3, L1CSR1_ICFI
 	bne	1b
 	blr
 
 ENTRY(icache_disable)
 	/* Disable i-cache */
 	mfspr	%r3, SPR_L1CSR1
 	li	%r4, L1CSR1_ICE@l
 	not	%r4, %r4
 	and	%r3, %r3, %r4
 	isync
 	mtspr	SPR_L1CSR1, %r3
 	isync
 	blr
 
 ENTRY(icache_enable)
 	/* Enable i-cache */
 	mfspr	%r3, SPR_L1CSR1
 	oris	%r3, %r3, (L1CSR1_ICPE | L1CSR1_ICE)@h
 	ori	%r3, %r3, (L1CSR1_ICPE | L1CSR1_ICE)@l
 	isync
 	mtspr	SPR_L1CSR1, %r3
 	isync
 	blr
 
 /*
  * L2 cache disable/enable/inval sequences for E500mc.
  */
 
 ENTRY(l2cache_inval)
 	mfspr	%r3, SPR_L2CSR0
 	oris	%r3, %r3, (L2CSR0_L2FI | L2CSR0_L2LFC)@h
 	ori	%r3, %r3, (L2CSR0_L2FI | L2CSR0_L2LFC)@l
 	isync
 	mtspr	SPR_L2CSR0, %r3
 	isync
 1:	mfspr   %r3, SPR_L2CSR0
 	andis.	%r3, %r3, L2CSR0_L2FI@h
 	bne	1b
 	blr
 
 ENTRY(l2cache_enable)
 	mfspr	%r3, SPR_L2CSR0
 	oris	%r3, %r3, (L2CSR0_L2E | L2CSR0_L2PE)@h
 	isync
 	mtspr	SPR_L2CSR0, %r3
 	isync
 	blr
 
 /*
  * Branch predictor setup.
  */
 ENTRY(bpred_enable)
 	mfspr	%r3, SPR_BUCSR
 	ori	%r3, %r3, BUCSR_BBFI
 	isync
 	mtspr	SPR_BUCSR, %r3
 	isync
 	ori	%r3, %r3, BUCSR_BPEN
 	isync
 	mtspr	SPR_BUCSR, %r3
 	isync
 	blr
 
 /*
  * XXX: This should be moved to a shared AIM/booke asm file, if one ever is
  * created.
  */
 ENTRY(get_spr)
 	mfspr	%r3, 0
 	blr
 
 /************************************************************************/
 /* Data section								*/
 /************************************************************************/
 	.data
 	.align 3
 GLOBAL(__startkernel)
 	ADDR(begin)
 GLOBAL(__endkernel)
 	ADDR(end)
 	.align	4
 tmpstack:
 	.space	TMPSTACKSZ
 tmpstackbound:
 	.space 10240	/* XXX: this really should not be necessary */
 #ifdef __powerpc64__
 TOC_ENTRY(tmpstack)
 TOC_ENTRY(bp_kernload)
 #endif
 
 /*
  * Compiled KERNBASE locations
  */
 	.globl	kernbase
 	.set	kernbase, KERNBASE
 
 #include <powerpc/booke/trap_subr.S>
Index: projects/capsicum-test/sys/powerpc/include/mmuvar.h
===================================================================
--- projects/capsicum-test/sys/powerpc/include/mmuvar.h	(revision 345709)
+++ projects/capsicum-test/sys/powerpc/include/mmuvar.h	(revision 345710)
@@ -1,122 +1,121 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2005 Peter Grehan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _MACHINE_MMUVAR_H_
 #define _MACHINE_MMUVAR_H_
 
 /*
  * A PowerPC MMU implementation is declared with a kernel object and
  * an associated method table. The MMU_DEF macro is used to declare
  * the class, and also links it to the global MMU class list.
  *
  * e.g.
  *
  * static mmu_method_t ppc8xx_methods[] = {
  *	MMUMETHOD(mmu_change_wiring,		ppc8xx_mmu_change_wiring),
  *	MMUMETHOD(mmu_clear_modify,		ppc8xx_mmu_clear_modify),
  *	MMUMETHOD(mmu_clear_reference,		ppc8xx_mmu_clear_reference),
  *  ...
  *	MMUMETHOD(mmu_dev_direct_mapped,	ppc8xx_mmu_dev_direct_mapped),
  *	{ 0, 0 }
  * };
  *
  * MMU_DEF(ppc8xx, MMU_TYPE_8xx, ppc8xx_methods, sizeof(ppc8xx_mmu_softc));
  *
  * A single level of inheritance is supported in a similar fashion to
  * kobj inheritance e.g.
  *
  * MMU_DEF_1(ppc860c, MMU_TYPE_860c, ppc860c_methods, 0, ppc8xx);
  */
 
 #include <sys/kobj.h>
 
 struct mmu_kobj {
 	/*
 	 * An MMU instance is a kernel object
 	 */
 	KOBJ_FIELDS;
 
 	/*
 	 * Utility elements that an instance may use
 	 */
 	struct mtx	mmu_mtx;	/* available for instance use */
 	void		*mmu_iptr;	/* instance data pointer */
 
 	/*
 	 * Opaque data that can be overlaid with an instance-private
 	 * structure. MMU code can test that this is large enough at
 	 * compile time with a sizeof() test againt it's softc. There
 	 * is also a run-time test when the MMU kernel object is
 	 * registered.
 	 */
 #define MMU_OPAQUESZ	64
 	u_int		mmu_opaque[MMU_OPAQUESZ];
 };
 
 typedef struct mmu_kobj		*mmu_t;
 typedef struct kobj_class	mmu_def_t;
 #define mmu_method_t		kobj_method_t
 
 #define MMUMETHOD	KOBJMETHOD
 
 #define MMU_DEF(name, ident, methods, size)	\
 						\
 mmu_def_t name = {				\
 	ident, methods, size, NULL		\
 };						\
 DATA_SET(mmu_set, name)
 
 #define MMU_DEF_INHERIT(name, ident, methods, size, base1)	\
 						\
 static kobj_class_t name ## _baseclasses[] =	\
        	{ &base1, NULL };			\
 mmu_def_t name = {                              \
 	ident, methods, size, name ## _baseclasses	\
 };                                              \
 DATA_SET(mmu_set, name)
 
 
 #if 0
 mmu_def_t name = {				\
 	ident, methods, size, name ## _baseclasses	\
 };						
 DATA_SET(mmu_set, name)
 #endif
 
 /*
  * Known MMU names
  */
 #define MMU_TYPE_BOOKE	"mmu_booke"	/* Book-E MMU specification */
 #define MMU_TYPE_OEA	"mmu_oea"	/* 32-bit OEA */
 #define MMU_TYPE_G5	"mmu_g5"	/* 64-bit bridge (ibm 970) */
-#define MMU_TYPE_P9H	"mmu_p9h"	/* 64-bit native ISA 3.0 (POWER9) hash */
 #define MMU_TYPE_8xx	"mmu_8xx"	/* 8xx quicc TLB */
 
 #endif /* _MACHINE_MMUVAR_H_ */
Index: projects/capsicum-test/sys/powerpc/ofw/ofwcall64.S
===================================================================
--- projects/capsicum-test/sys/powerpc/ofw/ofwcall64.S	(revision 345709)
+++ projects/capsicum-test/sys/powerpc/ofw/ofwcall64.S	(revision 345710)
@@ -1,327 +1,333 @@
 /*-
  * Copyright (C) 2009-2011 Nathan Whitehorn
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/syscall.h>
 
 #include <machine/trap.h>
 #include <machine/param.h>
 #include <machine/spr.h>
 #include <machine/asm.h>
 
 #define	OFWSTKSZ	4096		/* 4K Open Firmware stack */
 
 /*
  * Globals
  */
 	.data
 	.align	4
 ofwstk:
 	.space	OFWSTKSZ
 rtas_regsave:
 	.space	32 /* 4 * sizeof(register_t) */
 GLOBAL(ofmsr)
 	.llong  0, 0, 0, 0, 0		/* msr/sprg0-3 used in Open Firmware */
 GLOBAL(rtasmsr)
 	.llong	0
 GLOBAL(openfirmware_entry)
 	.llong	0			/* Open Firmware entry point */
 GLOBAL(rtas_entry)
 	.llong	0			/* RTAS entry point */
 
 TOC_ENTRY(ofmsr)
 TOC_ENTRY(ofwstk)
 TOC_ENTRY(rtasmsr)
 TOC_ENTRY(openfirmware_entry)
 TOC_ENTRY(rtas_entry)
 TOC_ENTRY(rtas_regsave)
 
 /*
  * Open Firmware Real-mode Entry Point. This is a huge pain.
  */
 
 ASENTRY_NOPROF(ofwcall)
 	mflr	%r8
 	std	%r8,16(%r1)
 	stdu	%r1,-208(%r1)
 
 	/*
 	 * We need to save the following, because OF's register save/
 	 * restore code assumes that the contents of registers are
 	 * at most 32 bits wide: lr, cr, r2, r13-r31, the old MSR. These
 	 * get placed in that order in the stack.
 	 */
 
 	mfcr	%r4
 	std	%r4,48(%r1)
 	std	%r13,56(%r1)
 	std	%r14,64(%r1)
 	std	%r15,72(%r1)
 	std	%r16,80(%r1)
 	std	%r17,88(%r1)
 	std	%r18,96(%r1)
 	std	%r19,104(%r1)
 	std	%r20,112(%r1)
 	std	%r21,120(%r1)
 	std	%r22,128(%r1)
 	std	%r23,136(%r1)
 	std	%r24,144(%r1)
 	std	%r25,152(%r1)
 	std	%r26,160(%r1)
 	std	%r27,168(%r1)
 	std	%r28,176(%r1)
 	std	%r29,184(%r1)
 	std	%r30,192(%r1)
 	std	%r31,200(%r1)
 
 	/* Record the old MSR */
 	mfmsr	%r6
 
 	/* read client interface handler */
-	ld	%r4,TOC_REF(openfirmware_entry)(%r2)
+	addis	%r4,%r2,TOC_REF(openfirmware_entry)@ha
+	ld	%r4,TOC_REF(openfirmware_entry)@l(%r4)
 	ld	%r4,0(%r4)
 
 	/* Get OF stack pointer */
-	ld	%r7,TOC_REF(ofwstk)(%r2)
+	addis	%r7,%r2,TOC_REF(ofwstk)@ha
+	ld	%r7,TOC_REF(ofwstk)@l(%r7)
 	addi	%r7,%r7,OFWSTKSZ-40
 
 	/*
 	 * Set the MSR to the OF value. This has the side effect of disabling
 	 * exceptions, which is important for the next few steps.
 	 */
 
-	ld	%r5,TOC_REF(ofmsr)(%r2)
+	addis	%r5,%r2,TOC_REF(ofmsr)@ha
+	ld	%r5,TOC_REF(ofmsr)@l(%r5)
 	ld	%r5,0(%r5)
 	mtmsrd	%r5
 	isync
 
 	/*
 	 * Set up OF stack. This needs to be accessible in real mode and
 	 * use the 32-bit ABI stack frame format. The pointer to the current
 	 * kernel stack is placed at the very top of the stack along with
 	 * the old MSR so we can get them back later.
 	 */
 	mr	%r5,%r1
 	mr	%r1,%r7
 	std	%r5,8(%r1)	/* Save real stack pointer */
 	std	%r2,16(%r1)	/* Save old TOC */
 	std	%r6,24(%r1)	/* Save old MSR */
 	std	%r8,32(%r1)	/* Save high 32-bits of the kernel's PC */
 
 	li	%r5,0
 	stw	%r5,4(%r1)
 	stw	%r5,0(%r1)
 
 	/* Finally, branch to OF */
 	mtctr	%r4
 	bctrl
 
 	/* Reload stack pointer, MSR, and reference PC from the OFW stack */
 	ld	%r7,32(%r1)
 	ld	%r6,24(%r1)
 	ld	%r2,16(%r1)
 	ld	%r1,8(%r1)
 
 	/* Get back to the MSR/PC we want, using the cached high bits of PC */
 	mtsrr1	%r6
 	clrrdi	%r7,%r7,32
 	bl	1f
 1:	mflr	%r8
 	or	%r8,%r8,%r7
 	addi	%r8,%r8,2f-1b
 	mtsrr0	%r8
 	rfid			/* Turn on MMU, exceptions, and 64-bit mode */
 
 2:
 	/* Sign-extend the return value from OF */
 	extsw	%r3,%r3
 
 	/* Restore all the non-volatile registers */
 	ld	%r5,48(%r1)
 	mtcr	%r5
 	ld	%r13,56(%r1)
 	ld	%r14,64(%r1)
 	ld	%r15,72(%r1)
 	ld	%r16,80(%r1)
 	ld	%r17,88(%r1)
 	ld	%r18,96(%r1)
 	ld	%r19,104(%r1)
 	ld	%r20,112(%r1)
 	ld	%r21,120(%r1)
 	ld	%r22,128(%r1)
 	ld	%r23,136(%r1)
 	ld	%r24,144(%r1)
 	ld	%r25,152(%r1)
 	ld	%r26,160(%r1)
 	ld	%r27,168(%r1)
 	ld	%r28,176(%r1)
 	ld	%r29,184(%r1)
 	ld	%r30,192(%r1)
 	ld	%r31,200(%r1)
 
 	/* Restore the stack and link register */
 	ld	%r1,0(%r1)
 	ld	%r0,16(%r1)
 	mtlr 	%r0
 	blr
 
 /*
  * RTAS 32-bit Entry Point. Similar to the OF one, but simpler (no separate
  * stack)
  *
  * C prototype: int rtascall(void *callbuffer, void *rtas_privdat);
  */
 
 ASENTRY_NOPROF(rtascall)
 	mflr	%r9
 	std	%r9,16(%r1)
 	stdu	%r1,-208(%r1)
 
 	/*
 	 * We need to save the following, because RTAS's register save/
 	 * restore code assumes that the contents of registers are
 	 * at most 32 bits wide: lr, cr, r2, r13-r31, the old MSR. These
 	 * get placed in that order in the stack.
 	 */
 
 	mfcr	%r5
 	std	%r5,48(%r1)
 	std	%r13,56(%r1)
 	std	%r14,64(%r1)
 	std	%r15,72(%r1)
 	std	%r16,80(%r1)
 	std	%r17,88(%r1)
 	std	%r18,96(%r1)
 	std	%r19,104(%r1)
 	std	%r20,112(%r1)
 	std	%r21,120(%r1)
 	std	%r22,128(%r1)
 	std	%r23,136(%r1)
 	std	%r24,144(%r1)
 	std	%r25,152(%r1)
 	std	%r26,160(%r1)
 	std	%r27,168(%r1)
 	std	%r28,176(%r1)
 	std	%r29,184(%r1)
 	std	%r30,192(%r1)
 	std	%r31,200(%r1)
 
 	/* Record the old MSR */
 	mfmsr	%r6
 
 	/* Read RTAS entry and reg save area pointers */
-	ld	%r5,TOC_REF(rtas_entry)(%r2)
+	addis	%r5,%r2,TOC_REF(rtas_entry)@ha
+	ld	%r5,TOC_REF(rtas_entry)@l(%r5)
 	ld	%r5,0(%r5)
-	ld	%r8,TOC_REF(rtas_regsave)(%r2)
+	addis	%r8,%r2,TOC_REF(rtas_regsave)@ha
+	ld	%r8,TOC_REF(rtas_regsave)@l(%r8)
 
 	/*
 	 * Set the MSR to the RTAS value. This has the side effect of disabling
 	 * exceptions, which is important for the next few steps.
 	 */
 
-	ld	%r7,TOC_REF(rtasmsr)(%r2)
+	addis	%r7,%r2,TOC_REF(rtasmsr)@ha
+	ld	%r7,TOC_REF(rtasmsr)@l(%r7)
 	ld	%r7,0(%r7)
 	mtmsrd	%r7
 	isync
 
 	/*
 	 * Set up RTAS register save area, so that we can get back all of
 	 * our 64-bit pointers. Save our stack pointer, the TOC, and the MSR.
 	 * Put this in r1, since RTAS is obliged to save it. Kernel globals
 	 * are below 4 GB, so this is safe.
 	 */
 	mr	%r7,%r1
 	mr	%r1,%r8
 	std	%r7,0(%r1)	/* Save 64-bit stack pointer */
 	std	%r2,8(%r1)	/* Save TOC */
 	std	%r6,16(%r1)	/* Save MSR */
 	std	%r9,24(%r1)	/* Save reference PC for high 32 bits */
 
 	/* Finally, branch to RTAS */
 	mtctr	%r5
 	bctrl
 
 	/* 
 	 * Reload stack pointer, MSR, reg PC from the reg save area in r1. We
 	 * are running in 32-bit mode at this point, so it doesn't matter if r1
 	 * has become sign-extended.
 	 */
 	ld	%r7,24(%r1)
 	ld	%r6,16(%r1)
 	ld	%r2,8(%r1)
 	ld	%r1,0(%r1)
 
 	/*
 	 * Get back to the right PC. We need to atomically re-enable
 	 * exceptions, 64-bit mode, and the MMU. One thing that has likely
 	 * happened is that, if we were running in the high-memory direct
 	 * map, we no longer are as a result of LR truncation in RTAS.
 	 * Fix this by copying the high-order bits of the LR at function
 	 * entry onto the current PC and then jumping there while flipping
 	 * all the MSR bits.
 	 */
 	mtsrr1	%r6
 	clrrdi	%r7,%r7,32
 	bl	1f
 1:	mflr	%r8
 	or	%r8,%r8,%r7
 	addi	%r8,%r8,2f-1b
 	mtsrr0	%r8
 	rfid			/* Turn on MMU, exceptions, and 64-bit mode */
 
 2:
 	/* Sign-extend the return value from RTAS */
 	extsw	%r3,%r3
 
 	/* Restore all the non-volatile registers */
 	ld	%r5,48(%r1)
 	mtcr	%r5
 	ld	%r13,56(%r1)
 	ld	%r14,64(%r1)
 	ld	%r15,72(%r1)
 	ld	%r16,80(%r1)
 	ld	%r17,88(%r1)
 	ld	%r18,96(%r1)
 	ld	%r19,104(%r1)
 	ld	%r20,112(%r1)
 	ld	%r21,120(%r1)
 	ld	%r22,128(%r1)
 	ld	%r23,136(%r1)
 	ld	%r24,144(%r1)
 	ld	%r25,152(%r1)
 	ld	%r26,160(%r1)
 	ld	%r27,168(%r1)
 	ld	%r28,176(%r1)
 	ld	%r29,184(%r1)
 	ld	%r30,192(%r1)
 	ld	%r31,200(%r1)
 
 	/* Restore the stack and link register */
 	ld	%r1,0(%r1)
 	ld	%r0,16(%r1)
 	mtlr 	%r0
 	blr
 
Index: projects/capsicum-test/sys/powerpc/powernv/opalcall.S
===================================================================
--- projects/capsicum-test/sys/powerpc/powernv/opalcall.S	(revision 345709)
+++ projects/capsicum-test/sys/powerpc/powernv/opalcall.S	(revision 345710)
@@ -1,99 +1,102 @@
 /*-
  * Copyright (C) 2015 Nathan Whitehorn
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <machine/asm.h>
 
 GLOBAL(opal_entrypoint)
 	.llong	0
 GLOBAL(opal_data)
 	.llong	0
 GLOBAL(opal_msr)
 	.llong	0
 
 TOC_ENTRY(opal_entrypoint)
 TOC_ENTRY(opal_data)
 TOC_ENTRY(opal_msr)
 
 ASENTRY(opal_call)
 	/* Args:
 	 * r3: opal token
 	 * r4-r10 opal arguments
 	 */
 
 	/* Save call stuff on stack */
 	mflr	%r0
 	std	%r0,16(%r1)
 	std	%r2,-16(%r1)
 	mfcr	%r0
 	std	%r0,8(%r1)
 
 	/* Load OPAL entry information */
 	mr	%r0,%r3
-	ld	%r3,TOC_REF(opal_entrypoint)(%r2)
+	addis	%r3,%r2,TOC_REF(opal_entrypoint)@ha
+	ld	%r3,TOC_REF(opal_entrypoint)@l(%r3)
 	ld	%r3,0(%r3)
 	mtctr	%r3
 
 	/* Save MSR in non-volatile scratch register and turn off translation */
 	std	%r31,-8(%r1)
 	mfmsr	%r31
 
 	/* Load last bits from the TOC */
-	ld	%r3,TOC_REF(opal_msr)(%r2)
+	addis	%r3,%r2,TOC_REF(opal_msr)@ha
+	ld	%r3,TOC_REF(opal_msr)@l(%r3)
 	ld	%r3,0(%r3)
-	ld	%r2,TOC_REF(opal_data)(%r2)
+	addis	%r2,%r2,TOC_REF(opal_data)@ha
+	ld	%r2,TOC_REF(opal_data)@l(%r2)
 	ld	%r2,0(%r2)
 
 	mtmsrd	%r3
 	isync
 
 	/* Shift registers over */
 	mr	%r3,%r4
 	mr	%r4,%r5
 	mr	%r5,%r6
 	mr	%r6,%r7
 	mr	%r7,%r8
 	mr	%r8,%r9
 	mr	%r9,%r10
 
 	/* Call OPAL */
 	bctrl
 
 	/* Restore MSR */
 	mtmsrd	%r31
 	isync
 	ld	%r31,-8(%r1)
 	
 	/* Restore call stuff from stack */
 	ld	%r0,16(%r1)
 	mtlr	%r0
 	ld	%r2,-16(%r1)
 	ld	%r0,8(%r1)
 	mtcr	%r0
 
 	/* And return */
 	blr
 
Index: projects/capsicum-test/sys/powerpc/powerpc/swtch64.S
===================================================================
--- projects/capsicum-test/sys/powerpc/powerpc/swtch64.S	(revision 345709)
+++ projects/capsicum-test/sys/powerpc/powerpc/swtch64.S	(revision 345710)
@@ -1,303 +1,304 @@
 /* $FreeBSD$ */
 /* $NetBSD: locore.S,v 1.24 2000/05/31 05:09:17 thorpej Exp $ */
 
 /*-
  * Copyright (C) 2001 Benno Rice
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 /*-
  * Copyright (C) 1995, 1996 Wolfgang Solfrank.
  * Copyright (C) 1995, 1996 TooLs GmbH.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "assym.inc"
 #include "opt_sched.h"
 
 #include <sys/syscall.h>
 
 #include <machine/trap.h>
 #include <machine/spr.h>
 #include <machine/param.h>
 #include <machine/asm.h>
 
 #ifdef _CALL_ELF
 .abiversion _CALL_ELF
 #endif
 
 TOC_ENTRY(blocked_lock)
 
 /*
  * void cpu_throw(struct thread *old, struct thread *new)
  */
 ENTRY(cpu_throw)
 	mr	%r13, %r4
 	li	%r14,0	/* Tell cpu_switchin not to release a thread */
 
 	b	cpu_switchin
 
 /*
  * void cpu_switch(struct thread *old,
  *		   struct thread *new,
  *		   struct mutex *mtx); 
  *
  * Switch to a new thread saving the current state in the old thread.
  */
 ENTRY(cpu_switch)
 	ld	%r6,TD_PCB(%r3)		/* Get the old thread's PCB ptr */
 	std	%r12,PCB_CONTEXT(%r6)	/* Save the non-volatile GP regs.
 					   These can now be used for scratch */
 	std	%r14,PCB_CONTEXT+2*8(%r6)	
 	std	%r15,PCB_CONTEXT+3*8(%r6)	
 	std	%r16,PCB_CONTEXT+4*8(%r6)	
 	std	%r17,PCB_CONTEXT+5*8(%r6)	
 	std	%r18,PCB_CONTEXT+6*8(%r6)	
 	std	%r19,PCB_CONTEXT+7*8(%r6)	
 	std	%r20,PCB_CONTEXT+8*8(%r6)	
 	std	%r21,PCB_CONTEXT+9*8(%r6)	
 	std	%r22,PCB_CONTEXT+10*8(%r6)	
 	std	%r23,PCB_CONTEXT+11*8(%r6)	
 	std	%r24,PCB_CONTEXT+12*8(%r6)	
 	std	%r25,PCB_CONTEXT+13*8(%r6)	
 	std	%r26,PCB_CONTEXT+14*8(%r6)	
 	std	%r27,PCB_CONTEXT+15*8(%r6)	
 	std	%r28,PCB_CONTEXT+16*8(%r6)	
 	std	%r29,PCB_CONTEXT+17*8(%r6)	
 	std	%r30,PCB_CONTEXT+18*8(%r6)	
 	std	%r31,PCB_CONTEXT+19*8(%r6)	
 
 	mfcr	%r16			/* Save the condition register */
 	std	%r16,PCB_CR(%r6)
 	mflr	%r16			/* Save the link register */
 	std	%r16,PCB_LR(%r6)
 	std	%r1,PCB_SP(%r6)		/* Save the stack pointer */
 	std	%r2,PCB_TOC(%r6)	/* Save the TOC pointer */
 	
 	mr	%r14,%r3		/* Copy the old thread ptr... */
 	mr	%r13,%r4		/* and the new thread ptr in curthread*/
 	mr	%r16,%r5		/* and the new lock */
 	mr	%r17,%r6		/* and the PCB */
 	
 	stdu	%r1,-48(%r1)
 
 	lwz	%r7, PCB_FLAGS(%r17)
 	andi.	%r7, %r7, PCB_CDSCR
 	beq	.L0
 	/* Custom DSCR was set. Reseting it to enter kernel */
 	li	%r7, 0x0
 	mtspr   SPR_DSCR, %r7
 
 .L0:
 	lwz	%r7,PCB_FLAGS(%r17)
 	/* Save FPU context if needed */
 	andi.	%r7, %r7, PCB_FPU
 	beq	.L1
 	bl	save_fpu
 	nop
 
 .L1:
 	mr	%r3,%r14		/* restore old thread ptr */
 	lwz	%r7,PCB_FLAGS(%r17)
 	/* Save Altivec context if needed */
 	andi.	%r7, %r7, PCB_VEC
 	beq	.L2
 	bl	save_vec
 	nop
 	
 .L2:
 	mr	%r3,%r14		/* restore old thread ptr */
 	bl	pmap_deactivate	/* Deactivate the current pmap */
 	nop
 
 	sync				/* Make sure all of that finished */
 
 cpu_switchin:
 #if defined(SMP) && defined(SCHED_ULE)
 	/* Wait for the new thread to become unblocked */
-	ld	%r6,TOC_REF(blocked_lock)(%r2)
+	addis	%r6,%r2,TOC_REF(blocked_lock)@ha
+	ld	%r6,TOC_REF(blocked_lock)@l(%r6)
 blocked_loop:
 	ld	%r7,TD_LOCK(%r13)
 	cmpd	%r6,%r7 
 	beq-	blocked_loop
 	isync
 #endif
 	
 	ld	%r17,TD_PCB(%r13)	/* Get new PCB */
 	ld	%r1,PCB_SP(%r17)	/* Load the stack pointer */
 	addi	%r1,%r1,-48		/* Remember about cpu_switch stack frame */
 
 	/* Release old thread now that we have a stack pointer set up */
 	cmpdi	%r14,0
 	beq-	1f
 	std	%r16,TD_LOCK(%r14)	/* ULE:	update old thread's lock */
 
 1:	mfsprg	%r7,0			/* Get the pcpu pointer */
 	std	%r13,PC_CURTHREAD(%r7)	/* Store new current thread */
 	ld	%r17,TD_PCB(%r13)	/* Store new current PCB */
 	std	%r17,PC_CURPCB(%r7)
 
 	mr	%r3,%r13		/* Get new thread ptr */
 	bl	pmap_activate		/* Activate the new address space */
 	nop
 
 	lwz	%r6, PCB_FLAGS(%r17)
 	/* Restore FPU context if needed */
 	andi.	%r6, %r6, PCB_FPU
 	beq	.L3
 	mr	%r3,%r13		/* Pass curthread to enable_fpu */
 	bl	enable_fpu
 	nop
 
 .L3:
 	lwz	%r6, PCB_FLAGS(%r17)
 	/* Restore Altivec context if needed */
 	andi.	%r6, %r6, PCB_VEC
 	beq	.L31
 	mr	%r3,%r13		/* Pass curthread to enable_vec */
 	bl	enable_vec
 	nop
 
 .L31:
 	lwz	%r6, PCB_FLAGS(%r17)
 	/* Restore Custom DSCR if needed */
 	andi.	%r6, %r6, PCB_CDSCR
 	beq	.L4
 	ld	%r6, PCB_DSCR(%r17)	/* Load the DSCR register*/
 	mtspr	SPR_DSCR, %r6
 
 	/* thread to restore is in r3 */
 .L4:
 	addi	%r1,%r1,48
 	mr	%r3,%r17		/* Recover PCB ptr */
 	ld	%r12,PCB_CONTEXT(%r3)	/* Load the non-volatile GP regs. */
 	ld	%r14,PCB_CONTEXT+2*8(%r3)	
 	ld	%r15,PCB_CONTEXT+3*8(%r3)	
 	ld	%r16,PCB_CONTEXT+4*8(%r3)	
 	ld	%r17,PCB_CONTEXT+5*8(%r3)	
 	ld	%r18,PCB_CONTEXT+6*8(%r3)	
 	ld	%r19,PCB_CONTEXT+7*8(%r3)	
 	ld	%r20,PCB_CONTEXT+8*8(%r3)	
 	ld	%r21,PCB_CONTEXT+9*8(%r3)	
 	ld	%r22,PCB_CONTEXT+10*8(%r3)	
 	ld	%r23,PCB_CONTEXT+11*8(%r3)	
 	ld	%r24,PCB_CONTEXT+12*8(%r3)	
 	ld	%r25,PCB_CONTEXT+13*8(%r3)	
 	ld	%r26,PCB_CONTEXT+14*8(%r3)	
 	ld	%r27,PCB_CONTEXT+15*8(%r3)	
 	ld	%r28,PCB_CONTEXT+16*8(%r3)
 	ld	%r29,PCB_CONTEXT+17*8(%r3)	
 	ld	%r30,PCB_CONTEXT+18*8(%r3)	
 	ld	%r31,PCB_CONTEXT+19*8(%r3)	
 	ld	%r5,PCB_CR(%r3)		/* Load the condition register */
 	mtcr	%r5
 	ld	%r5,PCB_LR(%r3)		/* Load the link register */
 	mtlr	%r5
 	ld	%r1,PCB_SP(%r3)		/* Load the stack pointer */
 	ld	%r2,PCB_TOC(%r3)	/* Load the TOC pointer */
 
 	/*
 	 * Perform a dummy stdcx. to clear any reservations we may have
 	 * inherited from the previous thread. It doesn't matter if the
 	 * stdcx succeeds or not. pcb_context[0] can be clobbered.
 	 */
 	stdcx.	%r1, 0, %r3
 	blr
 
 /*
  * savectx(pcb)
  * Update pcb, saving current processor state
  */
 ENTRY(savectx)
 	std	%r12,PCB_CONTEXT(%r3)	/* Save the non-volatile GP regs. */
 	std	%r13,PCB_CONTEXT+1*8(%r3)	
 	std	%r14,PCB_CONTEXT+2*8(%r3)	
 	std	%r15,PCB_CONTEXT+3*8(%r3)	
 	std	%r16,PCB_CONTEXT+4*8(%r3)	
 	std	%r17,PCB_CONTEXT+5*8(%r3)	
 	std	%r18,PCB_CONTEXT+6*8(%r3)	
 	std	%r19,PCB_CONTEXT+7*8(%r3)	
 	std	%r20,PCB_CONTEXT+8*8(%r3)	
 	std	%r21,PCB_CONTEXT+9*8(%r3)	
 	std	%r22,PCB_CONTEXT+10*8(%r3)	
 	std	%r23,PCB_CONTEXT+11*8(%r3)	
 	std	%r24,PCB_CONTEXT+12*8(%r3)	
 	std	%r25,PCB_CONTEXT+13*8(%r3)	
 	std	%r26,PCB_CONTEXT+14*8(%r3)	
 	std	%r27,PCB_CONTEXT+15*8(%r3)	
 	std	%r28,PCB_CONTEXT+16*8(%r3)
 	std	%r29,PCB_CONTEXT+17*8(%r3)	
 	std	%r30,PCB_CONTEXT+18*8(%r3)	
 	std	%r31,PCB_CONTEXT+19*8(%r3)	
 
 	mfcr	%r4			/* Save the condition register */
 	std	%r4,PCB_CR(%r3)
 	std	%r1,PCB_SP(%r3)		/* Save the stack pointer */
 	std	%r2,PCB_TOC(%r3)	/* Save the TOC pointer */
 	mflr	%r4			/* Save the link register */
 	std	%r4,PCB_LR(%r3)
 	blr
 
 /*
  * fork_trampoline()
  * Set up the return from cpu_fork()
  */
 
 ENTRY_NOPROF(fork_trampoline)
 	ld	%r3,CF_FUNC(%r1)
 	ld	%r4,CF_ARG0(%r1)
 	ld	%r5,CF_ARG1(%r1)
 
 	stdu	%r1,-48(%r1)
 	bl	fork_exit
 	nop
 	addi	%r1,%r1,48+CF_SIZE-FSP	/* Allow 8 bytes in front of
 					   trapframe to simulate FRAME_SETUP
 					   does when allocating space for
 					   a frame pointer/saved LR */
 	bl	trapexit
 	nop
Index: projects/capsicum-test/sys/vm/vm_kern.c
===================================================================
--- projects/capsicum-test/sys/vm/vm_kern.c	(revision 345709)
+++ projects/capsicum-test/sys/vm/vm_kern.c	(revision 345710)
@@ -1,864 +1,865 @@
 /*-
  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_kern.c	8.3 (Berkeley) 1/12/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Kernel memory management.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>		/* for ticks and hz */
 #include <sys/domainset.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/malloc.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_domainset.h>
 #include <vm/vm_kern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_pagequeue.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 vm_map_t kernel_map;
 vm_map_t exec_map;
 vm_map_t pipe_map;
 
 const void *zero_region;
 CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0);
 
 /* NB: Used by kernel debuggers. */
 const u_long vm_maxuser_address = VM_MAXUSER_ADDRESS;
 
 u_int exec_map_entry_size;
 u_int exec_map_entries;
 
 SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD,
     SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address");
 
 SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD,
 #if defined(__arm__) || defined(__sparc64__)
     &vm_max_kernel_address, 0,
 #else
     SYSCTL_NULL_ULONG_PTR, VM_MAX_KERNEL_ADDRESS,
 #endif
     "Max kernel address");
 
 #if VM_NRESERVLEVEL > 0
 #define	KVA_QUANTUM_SHIFT	(VM_LEVEL_0_ORDER + PAGE_SHIFT)
 #else
 /* On non-superpage architectures we want large import sizes. */
 #define	KVA_QUANTUM_SHIFT	(8 + PAGE_SHIFT)
 #endif
 #define	KVA_QUANTUM		(1 << KVA_QUANTUM_SHIFT)
 
 /*
  *	kva_alloc:
  *
  *	Allocate a virtual address range with no underlying object and
  *	no initial mapping to physical memory.  Any mapping from this
  *	range to physical memory must be explicitly created prior to
  *	its use, typically with pmap_qenter().  Any attempt to create
  *	a mapping on demand through vm_fault() will result in a panic. 
  */
 vm_offset_t
 kva_alloc(vm_size_t size)
 {
 	vm_offset_t addr;
 
 	size = round_page(size);
 	if (vmem_alloc(kernel_arena, size, M_BESTFIT | M_NOWAIT, &addr))
 		return (0);
 
 	return (addr);
 }
 
 /*
  *	kva_free:
  *
  *	Release a region of kernel virtual memory allocated
  *	with kva_alloc, and return the physical pages
  *	associated with that region.
  *
  *	This routine may not block on kernel maps.
  */
 void
 kva_free(vm_offset_t addr, vm_size_t size)
 {
 
 	size = round_page(size);
 	vmem_free(kernel_arena, addr, size);
 }
 
 /*
  *	Allocates a region from the kernel address map and physical pages
  *	within the specified address range to the kernel object.  Creates a
  *	wired mapping from this region to these pages, and returns the
  *	region's starting virtual address.  The allocated pages are not
  *	necessarily physically contiguous.  If M_ZERO is specified through the
  *	given flags, then the pages are zeroed before they are mapped.
  */
 static vm_offset_t
 kmem_alloc_attr_domain(int domain, vm_size_t size, int flags, vm_paddr_t low,
     vm_paddr_t high, vm_memattr_t memattr)
 {
 	vmem_t *vmem;
 	vm_object_t object = kernel_object;
 	vm_offset_t addr, i, offset;
 	vm_page_t m;
 	int pflags, tries;
 	vm_prot_t prot;
 
 	size = round_page(size);
 	vmem = vm_dom[domain].vmd_kernel_arena;
 	if (vmem_alloc(vmem, size, M_BESTFIT | flags, &addr))
 		return (0);
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
 	pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
 	pflags |= VM_ALLOC_NOWAIT;
 	prot = (flags & M_EXEC) != 0 ? VM_PROT_ALL : VM_PROT_RW;
 	VM_OBJECT_WLOCK(object);
 	for (i = 0; i < size; i += PAGE_SIZE) {
 		tries = 0;
 retry:
 		m = vm_page_alloc_contig_domain(object, atop(offset + i),
 		    domain, pflags, 1, low, high, PAGE_SIZE, 0, memattr);
 		if (m == NULL) {
 			VM_OBJECT_WUNLOCK(object);
 			if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
 				if (!vm_page_reclaim_contig_domain(domain,
 				    pflags, 1, low, high, PAGE_SIZE, 0) &&
 				    (flags & M_WAITOK) != 0)
 					vm_wait_domain(domain);
 				VM_OBJECT_WLOCK(object);
 				tries++;
 				goto retry;
 			}
 			kmem_unback(object, addr, i);
 			vmem_free(vmem, addr, size);
 			return (0);
 		}
 		KASSERT(vm_phys_domain(m) == domain,
 		    ("kmem_alloc_attr_domain: Domain mismatch %d != %d",
 		    vm_phys_domain(m), domain));
 		if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
 			pmap_zero_page(m);
 		m->valid = VM_PAGE_BITS_ALL;
 		pmap_enter(kernel_pmap, addr + i, m, prot,
 		    prot | PMAP_ENTER_WIRED, 0);
 	}
 	VM_OBJECT_WUNLOCK(object);
 	return (addr);
 }
 
 vm_offset_t
 kmem_alloc_attr(vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high,
     vm_memattr_t memattr)
 {
 
 	return (kmem_alloc_attr_domainset(DOMAINSET_RR(), size, flags, low,
 	    high, memattr));
 }
 
 vm_offset_t
 kmem_alloc_attr_domainset(struct domainset *ds, vm_size_t size, int flags,
     vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr)
 {
 	struct vm_domainset_iter di;
 	vm_offset_t addr;
 	int domain;
 
 	vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
 	do {
 		addr = kmem_alloc_attr_domain(domain, size, flags, low, high,
 		    memattr);
 		if (addr != 0)
 			break;
 	} while (vm_domainset_iter_policy(&di, &domain) == 0);
 
 	return (addr);
 }
 
 /*
  *	Allocates a region from the kernel address map and physically
  *	contiguous pages within the specified address range to the kernel
  *	object.  Creates a wired mapping from this region to these pages, and
  *	returns the region's starting virtual address.  If M_ZERO is specified
  *	through the given flags, then the pages are zeroed before they are
  *	mapped.
  */
 static vm_offset_t
 kmem_alloc_contig_domain(int domain, vm_size_t size, int flags, vm_paddr_t low,
     vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
     vm_memattr_t memattr)
 {
 	vmem_t *vmem;
 	vm_object_t object = kernel_object;
 	vm_offset_t addr, offset, tmp;
 	vm_page_t end_m, m;
 	u_long npages;
 	int pflags, tries;
  
 	size = round_page(size);
 	vmem = vm_dom[domain].vmd_kernel_arena;
 	if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr))
 		return (0);
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
 	pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
 	pflags |= VM_ALLOC_NOWAIT;
 	npages = atop(size);
 	VM_OBJECT_WLOCK(object);
 	tries = 0;
 retry:
 	m = vm_page_alloc_contig_domain(object, atop(offset), domain, pflags,
 	    npages, low, high, alignment, boundary, memattr);
 	if (m == NULL) {
 		VM_OBJECT_WUNLOCK(object);
 		if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
 			if (!vm_page_reclaim_contig_domain(domain, pflags,
 			    npages, low, high, alignment, boundary) &&
 			    (flags & M_WAITOK) != 0)
 				vm_wait_domain(domain);
 			VM_OBJECT_WLOCK(object);
 			tries++;
 			goto retry;
 		}
 		vmem_free(vmem, addr, size);
 		return (0);
 	}
 	KASSERT(vm_phys_domain(m) == domain,
 	    ("kmem_alloc_contig_domain: Domain mismatch %d != %d",
 	    vm_phys_domain(m), domain));
 	end_m = m + npages;
 	tmp = addr;
 	for (; m < end_m; m++) {
 		if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
 			pmap_zero_page(m);
 		m->valid = VM_PAGE_BITS_ALL;
 		pmap_enter(kernel_pmap, tmp, m, VM_PROT_RW,
 		    VM_PROT_RW | PMAP_ENTER_WIRED, 0);
 		tmp += PAGE_SIZE;
 	}
 	VM_OBJECT_WUNLOCK(object);
 	return (addr);
 }
 
 vm_offset_t
 kmem_alloc_contig(vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr)
 {
 
 	return (kmem_alloc_contig_domainset(DOMAINSET_RR(), size, flags, low,
 	    high, alignment, boundary, memattr));
 }
 
 vm_offset_t
 kmem_alloc_contig_domainset(struct domainset *ds, vm_size_t size, int flags,
     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
     vm_memattr_t memattr)
 {
 	struct vm_domainset_iter di;
 	vm_offset_t addr;
 	int domain;
 
 	vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
 	do {
 		addr = kmem_alloc_contig_domain(domain, size, flags, low, high,
 		    alignment, boundary, memattr);
 		if (addr != 0)
 			break;
 	} while (vm_domainset_iter_policy(&di, &domain) == 0);
 
 	return (addr);
 }
 
 /*
  *	kmem_suballoc:
  *
  *	Allocates a map to manage a subrange
  *	of the kernel virtual address space.
  *
  *	Arguments are as follows:
  *
  *	parent		Map to take range from
  *	min, max	Returned endpoints of map
  *	size		Size of range to find
  *	superpage_align	Request that min is superpage aligned
  */
 vm_map_t
 kmem_suballoc(vm_map_t parent, vm_offset_t *min, vm_offset_t *max,
     vm_size_t size, boolean_t superpage_align)
 {
 	int ret;
 	vm_map_t result;
 
 	size = round_page(size);
 
 	*min = vm_map_min(parent);
 	ret = vm_map_find(parent, NULL, 0, min, size, 0, superpage_align ?
 	    VMFS_SUPER_SPACE : VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL,
 	    MAP_ACC_NO_CHARGE);
 	if (ret != KERN_SUCCESS)
 		panic("kmem_suballoc: bad status return of %d", ret);
 	*max = *min + size;
 	result = vm_map_create(vm_map_pmap(parent), *min, *max);
 	if (result == NULL)
 		panic("kmem_suballoc: cannot create submap");
 	if (vm_map_submap(parent, *min, *max, result) != KERN_SUCCESS)
 		panic("kmem_suballoc: unable to change range to submap");
 	return (result);
 }
 
 /*
  *	kmem_malloc_domain:
  *
  *	Allocate wired-down pages in the kernel's address space.
  */
 static vm_offset_t
 kmem_malloc_domain(int domain, vm_size_t size, int flags)
 {
 	vmem_t *arena;
 	vm_offset_t addr;
 	int rv;
 
 #if VM_NRESERVLEVEL > 0
 	if (__predict_true((flags & M_EXEC) == 0))
 		arena = vm_dom[domain].vmd_kernel_arena;
 	else
 		arena = vm_dom[domain].vmd_kernel_rwx_arena;
 #else
 	arena = vm_dom[domain].vmd_kernel_arena;
 #endif
 	size = round_page(size);
 	if (vmem_alloc(arena, size, flags | M_BESTFIT, &addr))
 		return (0);
 
 	rv = kmem_back_domain(domain, kernel_object, addr, size, flags);
 	if (rv != KERN_SUCCESS) {
 		vmem_free(arena, addr, size);
 		return (0);
 	}
 	return (addr);
 }
 
 vm_offset_t
 kmem_malloc(vm_size_t size, int flags)
 {
 
 	return (kmem_malloc_domainset(DOMAINSET_RR(), size, flags));
 }
 
 vm_offset_t
 kmem_malloc_domainset(struct domainset *ds, vm_size_t size, int flags)
 {
 	struct vm_domainset_iter di;
 	vm_offset_t addr;
 	int domain;
 
 	vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
 	do {
 		addr = kmem_malloc_domain(domain, size, flags);
 		if (addr != 0)
 			break;
 	} while (vm_domainset_iter_policy(&di, &domain) == 0);
 
 	return (addr);
 }
 
 /*
  *	kmem_back_domain:
  *
  *	Allocate physical pages from the specified domain for the specified
  *	virtual address range.
  */
 int
 kmem_back_domain(int domain, vm_object_t object, vm_offset_t addr,
     vm_size_t size, int flags)
 {
 	vm_offset_t offset, i;
 	vm_page_t m, mpred;
 	vm_prot_t prot;
 	int pflags;
 
 	KASSERT(object == kernel_object,
 	    ("kmem_back_domain: only supports kernel object."));
 
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
 	pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
 	if (flags & M_WAITOK)
 		pflags |= VM_ALLOC_WAITFAIL;
 	prot = (flags & M_EXEC) != 0 ? VM_PROT_ALL : VM_PROT_RW;
 
 	i = 0;
 	VM_OBJECT_WLOCK(object);
 retry:
 	mpred = vm_radix_lookup_le(&object->rtree, atop(offset + i));
 	for (; i < size; i += PAGE_SIZE, mpred = m) {
 		m = vm_page_alloc_domain_after(object, atop(offset + i),
 		    domain, pflags, mpred);
 
 		/*
 		 * Ran out of space, free everything up and return. Don't need
 		 * to lock page queues here as we know that the pages we got
 		 * aren't on any queues.
 		 */
 		if (m == NULL) {
 			if ((flags & M_NOWAIT) == 0)
 				goto retry;
 			VM_OBJECT_WUNLOCK(object);
 			kmem_unback(object, addr, i);
 			return (KERN_NO_SPACE);
 		}
 		KASSERT(vm_phys_domain(m) == domain,
 		    ("kmem_back_domain: Domain mismatch %d != %d",
 		    vm_phys_domain(m), domain));
 		if (flags & M_ZERO && (m->flags & PG_ZERO) == 0)
 			pmap_zero_page(m);
 		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
 		    ("kmem_malloc: page %p is managed", m));
 		m->valid = VM_PAGE_BITS_ALL;
 		pmap_enter(kernel_pmap, addr + i, m, prot,
 		    prot | PMAP_ENTER_WIRED, 0);
 #if VM_NRESERVLEVEL > 0
 		if (__predict_false((prot & VM_PROT_EXECUTE) != 0))
 			m->oflags |= VPO_KMEM_EXEC;
 #endif
 	}
 	VM_OBJECT_WUNLOCK(object);
 
 	return (KERN_SUCCESS);
 }
 
 /*
  *	kmem_back:
  *
  *	Allocate physical pages for the specified virtual address range.
  */
 int
 kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags)
 {
 	vm_offset_t end, next, start;
 	int domain, rv;
 
 	KASSERT(object == kernel_object,
 	    ("kmem_back: only supports kernel object."));
 
 	for (start = addr, end = addr + size; addr < end; addr = next) {
 		/*
 		 * We must ensure that pages backing a given large virtual page
 		 * all come from the same physical domain.
 		 */
 		if (vm_ndomains > 1) {
 			domain = (addr >> KVA_QUANTUM_SHIFT) % vm_ndomains;
 			while (VM_DOMAIN_EMPTY(domain))
 				domain++;
 			next = roundup2(addr + 1, KVA_QUANTUM);
 			if (next > end || next < start)
 				next = end;
 		} else {
 			domain = 0;
 			next = end;
 		}
 		rv = kmem_back_domain(domain, object, addr, next - addr, flags);
 		if (rv != KERN_SUCCESS) {
 			kmem_unback(object, start, addr - start);
 			break;
 		}
 	}
 	return (rv);
 }
 
 /*
  *	kmem_unback:
  *
  *	Unmap and free the physical pages underlying the specified virtual
  *	address range.
  *
  *	A physical page must exist within the specified object at each index
  *	that is being unmapped.
  */
 static struct vmem *
 _kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size)
 {
 	struct vmem *arena;
 	vm_page_t m, next;
 	vm_offset_t end, offset;
 	int domain;
 
 	KASSERT(object == kernel_object,
 	    ("kmem_unback: only supports kernel object."));
 
 	if (size == 0)
 		return (NULL);
 	pmap_remove(kernel_pmap, addr, addr + size);
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	end = offset + size;
 	VM_OBJECT_WLOCK(object);
 	m = vm_page_lookup(object, atop(offset)); 
 	domain = vm_phys_domain(m);
 #if VM_NRESERVLEVEL > 0
 	if (__predict_true((m->oflags & VPO_KMEM_EXEC) == 0))
 		arena = vm_dom[domain].vmd_kernel_arena;
 	else
 		arena = vm_dom[domain].vmd_kernel_rwx_arena;
 #else
 	arena = vm_dom[domain].vmd_kernel_arena;
 #endif
 	for (; offset < end; offset += PAGE_SIZE, m = next) {
 		next = vm_page_next(m);
 		vm_page_unwire(m, PQ_NONE);
 		vm_page_free(m);
 	}
 	VM_OBJECT_WUNLOCK(object);
 
 	return (arena);
 }
 
 void
 kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size)
 {
 
 	(void)_kmem_unback(object, addr, size);
 }
 
 /*
  *	kmem_free:
  *
  *	Free memory allocated with kmem_malloc.  The size must match the
  *	original allocation.
  */
 void
 kmem_free(vm_offset_t addr, vm_size_t size)
 {
 	struct vmem *arena;
 
 	size = round_page(size);
 	arena = _kmem_unback(kernel_object, addr, size);
 	if (arena != NULL)
 		vmem_free(arena, addr, size);
 }
 
 /*
  *	kmap_alloc_wait:
  *
  *	Allocates pageable memory from a sub-map of the kernel.  If the submap
  *	has no room, the caller sleeps waiting for more memory in the submap.
  *
  *	This routine may block.
  */
 vm_offset_t
 kmap_alloc_wait(vm_map_t map, vm_size_t size)
 {
 	vm_offset_t addr;
 
 	size = round_page(size);
 	if (!swap_reserve(size))
 		return (0);
 
 	for (;;) {
 		/*
 		 * To make this work for more than one map, use the map's lock
 		 * to lock out sleepers/wakers.
 		 */
 		vm_map_lock(map);
-		if (vm_map_findspace(map, vm_map_min(map), size, &addr) == 0)
+		addr = vm_map_findspace(map, vm_map_min(map), size);
+		if (addr + size <= vm_map_max(map))
 			break;
 		/* no space now; see if we can ever get space */
 		if (vm_map_max(map) - vm_map_min(map) < size) {
 			vm_map_unlock(map);
 			swap_release(size);
 			return (0);
 		}
 		map->needs_wakeup = TRUE;
 		vm_map_unlock_and_wait(map, 0);
 	}
 	vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_RW, VM_PROT_RW,
 	    MAP_ACC_CHARGED);
 	vm_map_unlock(map);
 	return (addr);
 }
 
 /*
  *	kmap_free_wakeup:
  *
  *	Returns memory to a submap of the kernel, and wakes up any processes
  *	waiting for memory in that map.
  */
 void
 kmap_free_wakeup(vm_map_t map, vm_offset_t addr, vm_size_t size)
 {
 
 	vm_map_lock(map);
 	(void) vm_map_delete(map, trunc_page(addr), round_page(addr + size));
 	if (map->needs_wakeup) {
 		map->needs_wakeup = FALSE;
 		vm_map_wakeup(map);
 	}
 	vm_map_unlock(map);
 }
 
 void
 kmem_init_zero_region(void)
 {
 	vm_offset_t addr, i;
 	vm_page_t m;
 
 	/*
 	 * Map a single physical page of zeros to a larger virtual range.
 	 * This requires less looping in places that want large amounts of
 	 * zeros, while not using much more physical resources.
 	 */
 	addr = kva_alloc(ZERO_REGION_SIZE);
 	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 	if ((m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 	for (i = 0; i < ZERO_REGION_SIZE; i += PAGE_SIZE)
 		pmap_qenter(addr + i, &m, 1);
 	pmap_protect(kernel_pmap, addr, addr + ZERO_REGION_SIZE, VM_PROT_READ);
 
 	zero_region = (const void *)addr;
 }
 
 /*
  * Import KVA from the kernel map into the kernel arena.
  */
 static int
 kva_import(void *unused, vmem_size_t size, int flags, vmem_addr_t *addrp)
 {
 	vm_offset_t addr;
 	int result;
 
 	KASSERT((size % KVA_QUANTUM) == 0,
 	    ("kva_import: Size %jd is not a multiple of %d",
 	    (intmax_t)size, (int)KVA_QUANTUM));
 	addr = vm_map_min(kernel_map);
 	result = vm_map_find(kernel_map, NULL, 0, &addr, size, 0,
 	    VMFS_SUPER_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
 	if (result != KERN_SUCCESS)
                 return (ENOMEM);
 
 	*addrp = addr;
 
 	return (0);
 }
 
 /*
  * Import KVA from a parent arena into a per-domain arena.  Imports must be
  * KVA_QUANTUM-aligned and a multiple of KVA_QUANTUM in size.
  */
 static int
 kva_import_domain(void *arena, vmem_size_t size, int flags, vmem_addr_t *addrp)
 {
 
 	KASSERT((size % KVA_QUANTUM) == 0,
 	    ("kva_import_domain: Size %jd is not a multiple of %d",
 	    (intmax_t)size, (int)KVA_QUANTUM));
 	return (vmem_xalloc(arena, size, KVA_QUANTUM, 0, 0, VMEM_ADDR_MIN,
 	    VMEM_ADDR_MAX, flags, addrp));
 }
 
 /*
  * 	kmem_init:
  *
  *	Create the kernel map; insert a mapping covering kernel text, 
  *	data, bss, and all space allocated thus far (`boostrap' data).  The 
  *	new map will thus map the range between VM_MIN_KERNEL_ADDRESS and 
  *	`start' as allocated, and the range between `start' and `end' as free.
  *	Create the kernel vmem arena and its per-domain children.
  */
 void
 kmem_init(vm_offset_t start, vm_offset_t end)
 {
 	vm_map_t m;
 	int domain;
 
 	m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end);
 	m->system_map = 1;
 	vm_map_lock(m);
 	/* N.B.: cannot use kgdb to debug, starting with this assignment ... */
 	kernel_map = m;
 	(void) vm_map_insert(m, NULL, (vm_ooffset_t) 0,
 #ifdef __amd64__
 	    KERNBASE,
 #else		     
 	    VM_MIN_KERNEL_ADDRESS,
 #endif
 	    start, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
 	/* ... and ending with the completion of the above `insert' */
 	vm_map_unlock(m);
 
 	/*
 	 * Initialize the kernel_arena.  This can grow on demand.
 	 */
 	vmem_init(kernel_arena, "kernel arena", 0, 0, PAGE_SIZE, 0, 0);
 	vmem_set_import(kernel_arena, kva_import, NULL, NULL, KVA_QUANTUM);
 
 	for (domain = 0; domain < vm_ndomains; domain++) {
 		/*
 		 * Initialize the per-domain arenas.  These are used to color
 		 * the KVA space in a way that ensures that virtual large pages
 		 * are backed by memory from the same physical domain,
 		 * maximizing the potential for superpage promotion.
 		 */
 		vm_dom[domain].vmd_kernel_arena = vmem_create(
 		    "kernel arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK);
 		vmem_set_import(vm_dom[domain].vmd_kernel_arena,
 		    kva_import_domain, NULL, kernel_arena, KVA_QUANTUM);
 
 		/*
 		 * In architectures with superpages, maintain separate arenas
 		 * for allocations with permissions that differ from the
 		 * "standard" read/write permissions used for kernel memory,
 		 * so as not to inhibit superpage promotion.
 		 */
 #if VM_NRESERVLEVEL > 0
 		vm_dom[domain].vmd_kernel_rwx_arena = vmem_create(
 		    "kernel rwx arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK);
 		vmem_set_import(vm_dom[domain].vmd_kernel_rwx_arena,
 		    kva_import_domain, (vmem_release_t *)vmem_xfree,
 		    kernel_arena, KVA_QUANTUM);
 #endif
 	}
 }
 
 /*
  *	kmem_bootstrap_free:
  *
  *	Free pages backing preloaded data (e.g., kernel modules) to the
  *	system.  Currently only supported on platforms that create a
  *	vm_phys segment for preloaded data.
  */
 void
 kmem_bootstrap_free(vm_offset_t start, vm_size_t size)
 {
 #if defined(__i386__) || defined(__amd64__)
 	struct vm_domain *vmd;
 	vm_offset_t end, va;
 	vm_paddr_t pa;
 	vm_page_t m;
 
 	end = trunc_page(start + size);
 	start = round_page(start);
 
 	for (va = start; va < end; va += PAGE_SIZE) {
 		pa = pmap_kextract(va);
 		m = PHYS_TO_VM_PAGE(pa);
 
 		vmd = vm_pagequeue_domain(m);
 		vm_domain_free_lock(vmd);
 		vm_phys_free_pages(m, 0);
 		vm_domain_free_unlock(vmd);
 
 		vm_domain_freecnt_inc(vmd, 1);
 		vm_cnt.v_page_count++;
 	}
 	pmap_remove(kernel_pmap, start, end);
 	(void)vmem_add(kernel_arena, start, end - start, M_WAITOK);
 #endif
 }
 
 #ifdef DIAGNOSTIC
 /*
  * Allow userspace to directly trigger the VM drain routine for testing
  * purposes.
  */
 static int
 debug_vm_lowmem(SYSCTL_HANDLER_ARGS)
 {
 	int error, i;
 
 	i = 0;
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error)
 		return (error);
 	if ((i & ~(VM_LOW_KMEM | VM_LOW_PAGES)) != 0)
 		return (EINVAL);
 	if (i != 0)
 		EVENTHANDLER_INVOKE(vm_lowmem, i);
 	return (0);
 }
 
 SYSCTL_PROC(_debug, OID_AUTO, vm_lowmem, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
     debug_vm_lowmem, "I", "set to trigger vm_lowmem event with given flags");
 #endif
Index: projects/capsicum-test/sys/vm/vm_map.c
===================================================================
--- projects/capsicum-test/sys/vm/vm_map.c	(revision 345709)
+++ projects/capsicum-test/sys/vm/vm_map.c	(revision 345710)
@@ -1,4582 +1,4738 @@
 /*-
  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Virtual memory mapping module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/vmmeter.h>
 #include <sys/mman.h>
 #include <sys/vnode.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/file.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/shm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vnode_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/uma.h>
 
 /*
  *	Virtual memory maps provide for the mapping, protection,
  *	and sharing of virtual memory objects.  In addition,
  *	this module provides for an efficient virtual copy of
  *	memory from one map to another.
  *
  *	Synchronization is required prior to most operations.
  *
  *	Maps consist of an ordered doubly-linked list of simple
  *	entries; a self-adjusting binary search tree of these
  *	entries is used to speed up lookups.
  *
  *	Since portions of maps are specified by start/end addresses,
  *	which may not align with existing map entries, all
  *	routines merely "clip" entries to these start/end values.
  *	[That is, an entry is split into two, bordering at a
  *	start or end value.]  Note that these clippings may not
  *	always be necessary (as the two resulting entries are then
  *	not changed); however, the clipping is done for convenience.
  *
  *	As mentioned above, virtual copy operations are performed
  *	by copying VM object references from one map to
  *	another, and then marking both regions as copy-on-write.
  */
 
 static struct mtx map_sleep_mtx;
 static uma_zone_t mapentzone;
 static uma_zone_t kmapentzone;
 static uma_zone_t mapzone;
 static uma_zone_t vmspace_zone;
 static int vmspace_zinit(void *mem, int size, int flags);
 static int vm_map_zinit(void *mem, int ize, int flags);
 static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
     vm_offset_t max);
-static int vm_map_alignspace(vm_map_t map, vm_object_t object,
-    vm_ooffset_t offset, vm_offset_t *addr, vm_size_t length,
-    vm_offset_t max_addr, vm_offset_t alignment);
 static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
 static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
 static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
 static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
     vm_map_entry_t gap_entry);
 static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
 #ifdef INVARIANTS
 static void vm_map_zdtor(void *mem, int size, void *arg);
 static void vmspace_zdtor(void *mem, int size, void *arg);
 #endif
 static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
     vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
     int cow);
 static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
     vm_offset_t failed_addr);
 
 #define	ENTRY_CHARGED(e) ((e)->cred != NULL || \
     ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
      !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
 
 /* 
  * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
  * stable.
  */
 #define PROC_VMSPACE_LOCK(p) do { } while (0)
 #define PROC_VMSPACE_UNLOCK(p) do { } while (0)
 
 /*
  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
  *
  *	Asserts that the starting and ending region
  *	addresses fall within the valid range of the map.
  */
 #define	VM_MAP_RANGE_CHECK(map, start, end)		\
 		{					\
 		if (start < vm_map_min(map))		\
 			start = vm_map_min(map);	\
 		if (end > vm_map_max(map))		\
 			end = vm_map_max(map);		\
 		if (start > end)			\
 			start = end;			\
 		}
 
 /*
  *	vm_map_startup:
  *
  *	Initialize the vm_map module.  Must be called before
  *	any other vm_map routines.
  *
  *	Map and entry structures are allocated from the general
  *	purpose memory pool with some exceptions:
  *
  *	- The kernel map and kmem submap are allocated statically.
  *	- Kernel map entries are allocated out of a static pool.
  *
  *	These restrictions are necessary since malloc() uses the
  *	maps and requires map entries.
  */
 
 void
 vm_map_startup(void)
 {
 	mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
 	mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
 #ifdef INVARIANTS
 	    vm_map_zdtor,
 #else
 	    NULL,
 #endif
 	    vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_prealloc(mapzone, MAX_KMAP);
 	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 	    UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
 	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
 #ifdef INVARIANTS
 	    vmspace_zdtor,
 #else
 	    NULL,
 #endif
 	    vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 }
 
 static int
 vmspace_zinit(void *mem, int size, int flags)
 {
 	struct vmspace *vm;
 
 	vm = (struct vmspace *)mem;
 
 	vm->vm_map.pmap = NULL;
 	(void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
 	PMAP_LOCK_INIT(vmspace_pmap(vm));
 	return (0);
 }
 
 static int
 vm_map_zinit(void *mem, int size, int flags)
 {
 	vm_map_t map;
 
 	map = (vm_map_t)mem;
 	memset(map, 0, sizeof(*map));
 	mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK);
 	sx_init(&map->lock, "vm map (user)");
 	return (0);
 }
 
 #ifdef INVARIANTS
 static void
 vmspace_zdtor(void *mem, int size, void *arg)
 {
 	struct vmspace *vm;
 
 	vm = (struct vmspace *)mem;
 
 	vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
 }
 static void
 vm_map_zdtor(void *mem, int size, void *arg)
 {
 	vm_map_t map;
 
 	map = (vm_map_t)mem;
 	KASSERT(map->nentries == 0,
 	    ("map %p nentries == %d on free.",
 	    map, map->nentries));
 	KASSERT(map->size == 0,
 	    ("map %p size == %lu on free.",
 	    map, (unsigned long)map->size));
 }
 #endif	/* INVARIANTS */
 
 /*
  * Allocate a vmspace structure, including a vm_map and pmap,
  * and initialize those structures.  The refcnt is set to 1.
  *
  * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit().
  */
 struct vmspace *
 vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
 {
 	struct vmspace *vm;
 
 	vm = uma_zalloc(vmspace_zone, M_WAITOK);
 	KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
 	if (!pinit(vmspace_pmap(vm))) {
 		uma_zfree(vmspace_zone, vm);
 		return (NULL);
 	}
 	CTR1(KTR_VM, "vmspace_alloc: %p", vm);
 	_vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
 	vm->vm_refcnt = 1;
 	vm->vm_shm = NULL;
 	vm->vm_swrss = 0;
 	vm->vm_tsize = 0;
 	vm->vm_dsize = 0;
 	vm->vm_ssize = 0;
 	vm->vm_taddr = 0;
 	vm->vm_daddr = 0;
 	vm->vm_maxsaddr = 0;
 	return (vm);
 }
 
 #ifdef RACCT
 static void
 vmspace_container_reset(struct proc *p)
 {
 
 	PROC_LOCK(p);
 	racct_set(p, RACCT_DATA, 0);
 	racct_set(p, RACCT_STACK, 0);
 	racct_set(p, RACCT_RSS, 0);
 	racct_set(p, RACCT_MEMLOCK, 0);
 	racct_set(p, RACCT_VMEM, 0);
 	PROC_UNLOCK(p);
 }
 #endif
 
 static inline void
 vmspace_dofree(struct vmspace *vm)
 {
 
 	CTR1(KTR_VM, "vmspace_free: %p", vm);
 
 	/*
 	 * Make sure any SysV shm is freed, it might not have been in
 	 * exit1().
 	 */
 	shmexit(vm);
 
 	/*
 	 * Lock the map, to wait out all other references to it.
 	 * Delete all of the mappings and pages they hold, then call
 	 * the pmap module to reclaim anything left.
 	 */
 	(void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
 	    vm_map_max(&vm->vm_map));
 
 	pmap_release(vmspace_pmap(vm));
 	vm->vm_map.pmap = NULL;
 	uma_zfree(vmspace_zone, vm);
 }
 
 void
 vmspace_free(struct vmspace *vm)
 {
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 	    "vmspace_free() called");
 
 	if (vm->vm_refcnt == 0)
 		panic("vmspace_free: attempt to free already freed vmspace");
 
 	if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1)
 		vmspace_dofree(vm);
 }
 
 void
 vmspace_exitfree(struct proc *p)
 {
 	struct vmspace *vm;
 
 	PROC_VMSPACE_LOCK(p);
 	vm = p->p_vmspace;
 	p->p_vmspace = NULL;
 	PROC_VMSPACE_UNLOCK(p);
 	KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
 	vmspace_free(vm);
 }
 
 void
 vmspace_exit(struct thread *td)
 {
 	int refcnt;
 	struct vmspace *vm;
 	struct proc *p;
 
 	/*
 	 * Release user portion of address space.
 	 * This releases references to vnodes,
 	 * which could cause I/O if the file has been unlinked.
 	 * Need to do this early enough that we can still sleep.
 	 *
 	 * The last exiting process to reach this point releases as
 	 * much of the environment as it can. vmspace_dofree() is the
 	 * slower fallback in case another process had a temporary
 	 * reference to the vmspace.
 	 */
 
 	p = td->td_proc;
 	vm = p->p_vmspace;
 	atomic_add_int(&vmspace0.vm_refcnt, 1);
 	refcnt = vm->vm_refcnt;
 	do {
 		if (refcnt > 1 && p->p_vmspace != &vmspace0) {
 			/* Switch now since other proc might free vmspace */
 			PROC_VMSPACE_LOCK(p);
 			p->p_vmspace = &vmspace0;
 			PROC_VMSPACE_UNLOCK(p);
 			pmap_activate(td);
 		}
 	} while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt - 1));
 	if (refcnt == 1) {
 		if (p->p_vmspace != vm) {
 			/* vmspace not yet freed, switch back */
 			PROC_VMSPACE_LOCK(p);
 			p->p_vmspace = vm;
 			PROC_VMSPACE_UNLOCK(p);
 			pmap_activate(td);
 		}
 		pmap_remove_pages(vmspace_pmap(vm));
 		/* Switch now since this proc will free vmspace */
 		PROC_VMSPACE_LOCK(p);
 		p->p_vmspace = &vmspace0;
 		PROC_VMSPACE_UNLOCK(p);
 		pmap_activate(td);
 		vmspace_dofree(vm);
 	}
 #ifdef RACCT
 	if (racct_enable)
 		vmspace_container_reset(p);
 #endif
 }
 
 /* Acquire reference to vmspace owned by another process. */
 
 struct vmspace *
 vmspace_acquire_ref(struct proc *p)
 {
 	struct vmspace *vm;
 	int refcnt;
 
 	PROC_VMSPACE_LOCK(p);
 	vm = p->p_vmspace;
 	if (vm == NULL) {
 		PROC_VMSPACE_UNLOCK(p);
 		return (NULL);
 	}
 	refcnt = vm->vm_refcnt;
 	do {
 		if (refcnt <= 0) { 	/* Avoid 0->1 transition */
 			PROC_VMSPACE_UNLOCK(p);
 			return (NULL);
 		}
 	} while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt + 1));
 	if (vm != p->p_vmspace) {
 		PROC_VMSPACE_UNLOCK(p);
 		vmspace_free(vm);
 		return (NULL);
 	}
 	PROC_VMSPACE_UNLOCK(p);
 	return (vm);
 }
 
 /*
  * Switch between vmspaces in an AIO kernel process.
  *
  * The AIO kernel processes switch to and from a user process's
  * vmspace while performing an I/O operation on behalf of a user
  * process.  The new vmspace is either the vmspace of a user process
  * obtained from an active AIO request or the initial vmspace of the
  * AIO kernel process (when it is idling).  Because user processes
  * will block to drain any active AIO requests before proceeding in
  * exit() or execve(), the vmspace reference count for these vmspaces
  * can never be 0.  This allows for a much simpler implementation than
  * the loop in vmspace_acquire_ref() above.  Similarly, AIO kernel
  * processes hold an extra reference on their initial vmspace for the
  * life of the process so that this guarantee is true for any vmspace
  * passed as 'newvm'.
  */
 void
 vmspace_switch_aio(struct vmspace *newvm)
 {
 	struct vmspace *oldvm;
 
 	/* XXX: Need some way to assert that this is an aio daemon. */
 
 	KASSERT(newvm->vm_refcnt > 0,
 	    ("vmspace_switch_aio: newvm unreferenced"));
 
 	oldvm = curproc->p_vmspace;
 	if (oldvm == newvm)
 		return;
 
 	/*
 	 * Point to the new address space and refer to it.
 	 */
 	curproc->p_vmspace = newvm;
 	atomic_add_int(&newvm->vm_refcnt, 1);
 
 	/* Activate the new mapping. */
 	pmap_activate(curthread);
 
 	/* Remove the daemon's reference to the old address space. */
 	KASSERT(oldvm->vm_refcnt > 1,
 	    ("vmspace_switch_aio: oldvm dropping last reference"));
 	vmspace_free(oldvm);
 }
 
 void
 _vm_map_lock(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map)
 		mtx_lock_flags_(&map->system_mtx, 0, file, line);
 	else
 		sx_xlock_(&map->lock, file, line);
 	map->timestamp++;
 }
 
 static void
 vm_map_process_deferred(void)
 {
 	struct thread *td;
 	vm_map_entry_t entry, next;
 	vm_object_t object;
 
 	td = curthread;
 	entry = td->td_map_def_user;
 	td->td_map_def_user = NULL;
 	while (entry != NULL) {
 		next = entry->next;
 		if ((entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) {
 			/*
 			 * Decrement the object's writemappings and
 			 * possibly the vnode's v_writecount.
 			 */
 			KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
 			    ("Submap with writecount"));
 			object = entry->object.vm_object;
 			KASSERT(object != NULL, ("No object for writecount"));
 			vnode_pager_release_writecount(object, entry->start,
 			    entry->end);
 		}
 		vm_map_entry_deallocate(entry, FALSE);
 		entry = next;
 	}
 }
 
 void
 _vm_map_unlock(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map)
 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
 	else {
 		sx_xunlock_(&map->lock, file, line);
 		vm_map_process_deferred();
 	}
 }
 
 void
 _vm_map_lock_read(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map)
 		mtx_lock_flags_(&map->system_mtx, 0, file, line);
 	else
 		sx_slock_(&map->lock, file, line);
 }
 
 void
 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map)
 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
 	else {
 		sx_sunlock_(&map->lock, file, line);
 		vm_map_process_deferred();
 	}
 }
 
 int
 _vm_map_trylock(vm_map_t map, const char *file, int line)
 {
 	int error;
 
 	error = map->system_map ?
 	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
 	    !sx_try_xlock_(&map->lock, file, line);
 	if (error == 0)
 		map->timestamp++;
 	return (error == 0);
 }
 
 int
 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
 {
 	int error;
 
 	error = map->system_map ?
 	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
 	    !sx_try_slock_(&map->lock, file, line);
 	return (error == 0);
 }
 
 /*
  *	_vm_map_lock_upgrade:	[ internal use only ]
  *
  *	Tries to upgrade a read (shared) lock on the specified map to a write
  *	(exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
  *	non-zero value if the upgrade fails.  If the upgrade fails, the map is
  *	returned without a read or write lock held.
  *
  *	Requires that the map be read locked.
  */
 int
 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
 {
 	unsigned int last_timestamp;
 
 	if (map->system_map) {
 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
 	} else {
 		if (!sx_try_upgrade_(&map->lock, file, line)) {
 			last_timestamp = map->timestamp;
 			sx_sunlock_(&map->lock, file, line);
 			vm_map_process_deferred();
 			/*
 			 * If the map's timestamp does not change while the
 			 * map is unlocked, then the upgrade succeeds.
 			 */
 			sx_xlock_(&map->lock, file, line);
 			if (last_timestamp != map->timestamp) {
 				sx_xunlock_(&map->lock, file, line);
 				return (1);
 			}
 		}
 	}
 	map->timestamp++;
 	return (0);
 }
 
 void
 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map) {
 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
 	} else
 		sx_downgrade_(&map->lock, file, line);
 }
 
 /*
  *	vm_map_locked:
  *
  *	Returns a non-zero value if the caller holds a write (exclusive) lock
  *	on the specified map and the value "0" otherwise.
  */
 int
 vm_map_locked(vm_map_t map)
 {
 
 	if (map->system_map)
 		return (mtx_owned(&map->system_mtx));
 	else
 		return (sx_xlocked(&map->lock));
 }
 
 #ifdef INVARIANTS
 static void
 _vm_map_assert_locked(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map)
 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
 	else
 		sx_assert_(&map->lock, SA_XLOCKED, file, line);
 }
 
 #define	VM_MAP_ASSERT_LOCKED(map) \
     _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
+
+static void
+_vm_map_assert_consistent(vm_map_t map)
+{
+	vm_map_entry_t entry;
+	vm_map_entry_t child;
+	vm_size_t max_left, max_right;
+
+	for (entry = map->header.next; entry != &map->header;
+	    entry = entry->next) {
+		KASSERT(entry->prev->end <= entry->start,
+		    ("map %p prev->end = %jx, start = %jx", map,
+		    (uintmax_t)entry->prev->end, (uintmax_t)entry->start));
+		KASSERT(entry->start < entry->end,
+		    ("map %p start = %jx, end = %jx", map,
+		    (uintmax_t)entry->start, (uintmax_t)entry->end));
+		KASSERT(entry->end <= entry->next->start,
+		    ("map %p end = %jx, next->start = %jx", map,
+		    (uintmax_t)entry->end, (uintmax_t)entry->next->start));
+		KASSERT(entry->left == NULL ||
+		    entry->left->start < entry->start,
+		    ("map %p left->start = %jx, start = %jx", map,
+		    (uintmax_t)entry->left->start, (uintmax_t)entry->start));
+		KASSERT(entry->right == NULL ||
+		    entry->start < entry->right->start,
+		    ("map %p start = %jx, right->start = %jx", map,
+		    (uintmax_t)entry->start, (uintmax_t)entry->right->start));
+		child = entry->left;
+		max_left = (child != NULL) ? child->max_free :
+			entry->start - entry->prev->end;
+		child = entry->right;
+		max_right = (child != NULL) ? child->max_free :
+			entry->next->start - entry->end;
+		KASSERT(entry->max_free == MAX(max_left, max_right),
+		    ("map %p max = %jx, max_left = %jx, max_right = %jx", map,
+		     (uintmax_t)entry->max_free,
+		     (uintmax_t)max_left, (uintmax_t)max_right));
+	}	
+}
+
+#define VM_MAP_ASSERT_CONSISTENT(map) \
+    _vm_map_assert_consistent(map)
 #else
 #define	VM_MAP_ASSERT_LOCKED(map)
+#define VM_MAP_ASSERT_CONSISTENT(map)
 #endif
 
 /*
  *	_vm_map_unlock_and_wait:
  *
  *	Atomically releases the lock on the specified map and puts the calling
  *	thread to sleep.  The calling thread will remain asleep until either
  *	vm_map_wakeup() is performed on the map or the specified timeout is
  *	exceeded.
  *
  *	WARNING!  This function does not perform deferred deallocations of
  *	objects and map	entries.  Therefore, the calling thread is expected to
  *	reacquire the map lock after reawakening and later perform an ordinary
  *	unlock operation, such as vm_map_unlock(), before completing its
  *	operation on the map.
  */
 int
 _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
 {
 
 	mtx_lock(&map_sleep_mtx);
 	if (map->system_map)
 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
 	else
 		sx_xunlock_(&map->lock, file, line);
 	return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
 	    timo));
 }
 
 /*
  *	vm_map_wakeup:
  *
  *	Awaken any threads that have slept on the map using
  *	vm_map_unlock_and_wait().
  */
 void
 vm_map_wakeup(vm_map_t map)
 {
 
 	/*
 	 * Acquire and release map_sleep_mtx to prevent a wakeup()
 	 * from being performed (and lost) between the map unlock
 	 * and the msleep() in _vm_map_unlock_and_wait().
 	 */
 	mtx_lock(&map_sleep_mtx);
 	mtx_unlock(&map_sleep_mtx);
 	wakeup(&map->root);
 }
 
 void
 vm_map_busy(vm_map_t map)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	map->busy++;
 }
 
 void
 vm_map_unbusy(vm_map_t map)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(map->busy, ("vm_map_unbusy: not busy"));
 	if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
 		vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
 		wakeup(&map->busy);
 	}
 }
 
 void 
 vm_map_wait_busy(vm_map_t map)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	while (map->busy) {
 		vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
 		if (map->system_map)
 			msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
 		else
 			sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
 	}
 	map->timestamp++;
 }
 
 long
 vmspace_resident_count(struct vmspace *vmspace)
 {
 	return pmap_resident_count(vmspace_pmap(vmspace));
 }
 
 /*
  *	vm_map_create:
  *
  *	Creates and returns a new empty VM map with
  *	the given physical map structure, and having
  *	the given lower and upper address bounds.
  */
 vm_map_t
 vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
 {
 	vm_map_t result;
 
 	result = uma_zalloc(mapzone, M_WAITOK);
 	CTR1(KTR_VM, "vm_map_create: %p", result);
 	_vm_map_init(result, pmap, min, max);
 	return (result);
 }
 
 /*
  * Initialize an existing vm_map structure
  * such as that in the vmspace structure.
  */
 static void
 _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
 {
 
 	map->header.next = map->header.prev = &map->header;
 	map->header.eflags = MAP_ENTRY_HEADER;
 	map->needs_wakeup = FALSE;
 	map->system_map = 0;
 	map->pmap = pmap;
 	map->header.end = min;
 	map->header.start = max;
 	map->flags = 0;
 	map->root = NULL;
 	map->timestamp = 0;
 	map->busy = 0;
 	map->anon_loc = 0;
 }
 
 void
 vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
 {
 
 	_vm_map_init(map, pmap, min, max);
 	mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
 	sx_init(&map->lock, "user map");
 }
 
 /*
  *	vm_map_entry_dispose:	[ internal use only ]
  *
  *	Inverse of vm_map_entry_create.
  */
 static void
 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
 {
 	uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
 }
 
 /*
  *	vm_map_entry_create:	[ internal use only ]
  *
  *	Allocates a VM map entry for insertion.
  *	No entry fields are filled in.
  */
 static vm_map_entry_t
 vm_map_entry_create(vm_map_t map)
 {
 	vm_map_entry_t new_entry;
 
 	if (map->system_map)
 		new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
 	else
 		new_entry = uma_zalloc(mapentzone, M_WAITOK);
 	if (new_entry == NULL)
 		panic("vm_map_entry_create: kernel resources exhausted");
 	return (new_entry);
 }
 
 /*
  *	vm_map_entry_set_behavior:
  *
  *	Set the expected access behavior, either normal, random, or
  *	sequential.
  */
 static inline void
 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
 {
 	entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
 	    (behavior & MAP_ENTRY_BEHAV_MASK);
 }
 
 /*
  *	vm_map_entry_set_max_free:
  *
  *	Set the max_free field in a vm_map_entry.
  */
 static inline void
 vm_map_entry_set_max_free(vm_map_entry_t entry)
 {
+	vm_map_entry_t child;
+	vm_size_t max_left, max_right;
 
-	entry->max_free = entry->adj_free;
-	if (entry->left != NULL && entry->left->max_free > entry->max_free)
-		entry->max_free = entry->left->max_free;
-	if (entry->right != NULL && entry->right->max_free > entry->max_free)
-		entry->max_free = entry->right->max_free;
+	child = entry->left;
+	max_left = (child != NULL) ? child->max_free :
+	    entry->start - entry->prev->end;
+	child = entry->right;
+	max_right = (child != NULL) ? child->max_free :
+	    entry->next->start - entry->end;
+	entry->max_free = MAX(max_left, max_right);
 }
 
+#define SPLAY_LEFT_STEP(root, y, rlist, test) do {	\
+	y = root->left;					\
+	if (y != NULL && (test)) {			\
+		/* Rotate right and make y root. */	\
+		root->left = y->right;			\
+		y->right = root;			\
+		vm_map_entry_set_max_free(root);	\
+		root = y;				\
+		y = root->left;				\
+	}						\
+	/* Put root on rlist. */			\
+	root->left = rlist;				\
+	rlist = root;					\
+	root = y;					\
+} while (0)
+
+#define SPLAY_RIGHT_STEP(root, y, llist, test) do {	\
+	y = root->right;				\
+	if (y != NULL && (test)) {			\
+		/* Rotate left and make y root. */	\
+		root->right = y->left;			\
+		y->left = root;				\
+		vm_map_entry_set_max_free(root);	\
+		root = y;				\
+		y = root->right;			\
+	}						\
+	/* Put root on llist. */			\
+	root->right = llist;				\
+	llist = root;					\
+	root = y;					\
+} while (0)
+
 /*
- *	vm_map_entry_splay:
- *
- *	The Sleator and Tarjan top-down splay algorithm with the
- *	following variation.  Max_free must be computed bottom-up, so
- *	on the downward pass, maintain the left and right spines in
- *	reverse order.  Then, make a second pass up each side to fix
- *	the pointers and compute max_free.  The time bound is O(log n)
- *	amortized.
- *
- *	The new root is the vm_map_entry containing "addr", or else an
- *	adjacent entry (lower or higher) if addr is not in the tree.
- *
- *	The map must be locked, and leaves it so.
- *
- *	Returns: the new root.
+ * Walk down the tree until we find addr or a NULL pointer where addr would go,
+ * breaking off left and right subtrees of nodes less than, or greater than
+ * addr.  Treat pointers to nodes with max_free < length as NULL pointers.
+ * llist and rlist are the two sides in reverse order (bottom-up), with llist
+ * linked by the right pointer and rlist linked by the left pointer in the
+ * vm_map_entry.
  */
 static vm_map_entry_t
-vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root)
+vm_map_splay_split(vm_offset_t addr, vm_size_t length,
+    vm_map_entry_t root, vm_map_entry_t *out_llist, vm_map_entry_t *out_rlist)
 {
 	vm_map_entry_t llist, rlist;
-	vm_map_entry_t ltree, rtree;
 	vm_map_entry_t y;
 
-	/* Special case of empty tree. */
-	if (root == NULL)
-		return (root);
-
-	/*
-	 * Pass One: Splay down the tree until we find addr or a NULL
-	 * pointer where addr would go.  llist and rlist are the two
-	 * sides in reverse order (bottom-up), with llist linked by
-	 * the right pointer and rlist linked by the left pointer in
-	 * the vm_map_entry.  Wait until Pass Two to set max_free on
-	 * the two spines.
-	 */
 	llist = NULL;
 	rlist = NULL;
-	for (;;) {
-		/* root is never NULL in here. */
+	while (root != NULL && root->max_free >= length) {
 		if (addr < root->start) {
-			y = root->left;
-			if (y == NULL)
-				break;
-			if (addr < y->start && y->left != NULL) {
-				/* Rotate right and put y on rlist. */
-				root->left = y->right;
-				y->right = root;
-				vm_map_entry_set_max_free(root);
-				root = y->left;
-				y->left = rlist;
-				rlist = y;
-			} else {
-				/* Put root on rlist. */
-				root->left = rlist;
-				rlist = root;
-				root = y;
-			}
+			SPLAY_LEFT_STEP(root, y, rlist,
+			    y->max_free >= length && addr < y->start);
 		} else if (addr >= root->end) {
-			y = root->right;
-			if (y == NULL)
-				break;
-			if (addr >= y->end && y->right != NULL) {
-				/* Rotate left and put y on llist. */
-				root->right = y->left;
-				y->left = root;
-				vm_map_entry_set_max_free(root);
-				root = y->right;
-				y->right = llist;
-				llist = y;
-			} else {
-				/* Put root on llist. */
-				root->right = llist;
-				llist = root;
-				root = y;
-			}
+			SPLAY_RIGHT_STEP(root, y, llist,
+			    y->max_free >= length && addr >= y->end);
 		} else
 			break;
 	}
+	*out_llist = llist;
+	*out_rlist = rlist;
+	return (root);
+}
 
-	/*
-	 * Pass Two: Walk back up the two spines, flip the pointers
-	 * and set max_free.  The subtrees of the root go at the
-	 * bottom of llist and rlist.
-	 */
-	ltree = root->left;
+static void
+vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *iolist)
+{
+	vm_map_entry_t rlist, y;
+
+	root = root->right;
+	rlist = *iolist;
+	while (root != NULL)
+		SPLAY_LEFT_STEP(root, y, rlist, true);
+	*iolist = rlist;
+}
+
+static void
+vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *iolist)
+{
+	vm_map_entry_t llist, y;
+
+	root = root->left;
+	llist = *iolist;
+	while (root != NULL)
+		SPLAY_RIGHT_STEP(root, y, llist, true);
+	*iolist = llist;
+}
+
+/*
+ * Walk back up the two spines, flip the pointers and set max_free.  The
+ * subtrees of the root go at the bottom of llist and rlist.
+ */
+static vm_map_entry_t
+vm_map_splay_merge(vm_map_entry_t root,
+    vm_map_entry_t llist, vm_map_entry_t rlist,
+    vm_map_entry_t ltree, vm_map_entry_t rtree)
+{
+	vm_map_entry_t y;
+
 	while (llist != NULL) {
 		y = llist->right;
 		llist->right = ltree;
 		vm_map_entry_set_max_free(llist);
 		ltree = llist;
 		llist = y;
 	}
-	rtree = root->right;
 	while (rlist != NULL) {
 		y = rlist->left;
 		rlist->left = rtree;
 		vm_map_entry_set_max_free(rlist);
 		rtree = rlist;
 		rlist = y;
 	}
 
 	/*
 	 * Final assembly: add ltree and rtree as subtrees of root.
 	 */
 	root->left = ltree;
 	root->right = rtree;
 	vm_map_entry_set_max_free(root);
 
 	return (root);
 }
 
 /*
+ *	vm_map_entry_splay:
+ *
+ *	The Sleator and Tarjan top-down splay algorithm with the
+ *	following variation.  Max_free must be computed bottom-up, so
+ *	on the downward pass, maintain the left and right spines in
+ *	reverse order.  Then, make a second pass up each side to fix
+ *	the pointers and compute max_free.  The time bound is O(log n)
+ *	amortized.
+ *
+ *	The new root is the vm_map_entry containing "addr", or else an
+ *	adjacent entry (lower if possible) if addr is not in the tree.
+ *
+ *	The map must be locked, and leaves it so.
+ *
+ *	Returns: the new root.
+ */
+static vm_map_entry_t
+vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root)
+{
+	vm_map_entry_t llist, rlist;
+
+	root = vm_map_splay_split(addr, 0, root, &llist, &rlist);
+	if (root != NULL) {
+		/* do nothing */
+	} else if (llist != NULL) {
+		/*
+		 * Recover the greatest node in the left
+		 * subtree and make it the root.
+		 */
+		root = llist;
+		llist = root->right;
+		root->right = NULL;
+	} else if (rlist != NULL) {
+		/*
+		 * Recover the least node in the right
+		 * subtree and make it the root.
+		 */
+		root = rlist;
+		rlist = root->left;
+		root->left = NULL;
+	} else {
+		/* There is no root. */
+		return (NULL);
+	}
+	return (vm_map_splay_merge(root, llist, rlist,
+	    root->left, root->right));
+}
+
+/*
  *	vm_map_entry_{un,}link:
  *
  *	Insert/remove entries from maps.
  */
 static void
 vm_map_entry_link(vm_map_t map,
-		  vm_map_entry_t after_where,
 		  vm_map_entry_t entry)
 {
+	vm_map_entry_t llist, rlist, root;
 
-	CTR4(KTR_VM,
-	    "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
-	    map->nentries, entry, after_where);
+	CTR3(KTR_VM,
+	    "vm_map_entry_link: map %p, nentries %d, entry %p", map,
+	    map->nentries, entry);
 	VM_MAP_ASSERT_LOCKED(map);
-	KASSERT(after_where->end <= entry->start,
-	    ("vm_map_entry_link: prev end %jx new start %jx overlap",
-	    (uintmax_t)after_where->end, (uintmax_t)entry->start));
-	KASSERT(entry->end <= after_where->next->start,
-	    ("vm_map_entry_link: new end %jx next start %jx overlap",
-	    (uintmax_t)entry->end, (uintmax_t)after_where->next->start));
-
 	map->nentries++;
-	entry->prev = after_where;
-	entry->next = after_where->next;
-	entry->next->prev = entry;
-	after_where->next = entry;
-
-	if (after_where != &map->header) {
-		if (after_where != map->root)
-			vm_map_entry_splay(after_where->start, map->root);
-		entry->right = after_where->right;
-		entry->left = after_where;
-		after_where->right = NULL;
-		after_where->adj_free = entry->start - after_where->end;
-		vm_map_entry_set_max_free(after_where);
-	} else {
-		entry->right = map->root;
-		entry->left = NULL;
-	}
-	entry->adj_free = entry->next->start - entry->end;
-	vm_map_entry_set_max_free(entry);
+	root = map->root;
+	root = vm_map_splay_split(entry->start, 0, root, &llist, &rlist);
+	KASSERT(root == NULL,
+	    ("vm_map_entry_link: link object already mapped"));
+	entry->prev = (llist == NULL) ? &map->header : llist;
+	entry->next = (rlist == NULL) ? &map->header : rlist;
+	entry->prev->next = entry->next->prev = entry;
+	root = vm_map_splay_merge(entry, llist, rlist, NULL, NULL);
 	map->root = entry;
+	VM_MAP_ASSERT_CONSISTENT(map);
 }
 
+enum unlink_merge_type {
+	UNLINK_MERGE_PREV,
+	UNLINK_MERGE_NONE,
+	UNLINK_MERGE_NEXT
+};
+
 static void
 vm_map_entry_unlink(vm_map_t map,
-		    vm_map_entry_t entry)
+		    vm_map_entry_t entry,
+		    enum unlink_merge_type op)
 {
-	vm_map_entry_t next, prev, root;
+	vm_map_entry_t llist, rlist, root, y;
 
 	VM_MAP_ASSERT_LOCKED(map);
-	if (entry != map->root)
-		vm_map_entry_splay(entry->start, map->root);
-	if (entry->left == NULL)
-		root = entry->right;
-	else {
-		root = vm_map_entry_splay(entry->start, entry->left);
-		root->right = entry->right;
-		root->adj_free = entry->next->start - root->end;
-		vm_map_entry_set_max_free(root);
+	llist = entry->prev;
+	rlist = entry->next;
+	llist->next = rlist;
+	rlist->prev = llist;
+	root = map->root;
+	root = vm_map_splay_split(entry->start, 0, root, &llist, &rlist);
+	KASSERT(root != NULL,
+	    ("vm_map_entry_unlink: unlink object not mapped"));
+
+	switch (op) {
+	case UNLINK_MERGE_PREV:
+		vm_map_splay_findprev(root, &llist);
+		llist->end = root->end;
+		y = root->right;
+		root = llist;
+		llist = root->right;
+		root->right = y;
+		break;
+	case UNLINK_MERGE_NEXT:
+		vm_map_splay_findnext(root, &rlist);
+		rlist->start = root->start;
+		rlist->offset = root->offset;
+		y = root->left;
+		root = rlist;
+		rlist = root->left;
+		root->left = y;
+		break;
+	case UNLINK_MERGE_NONE:
+		vm_map_splay_findprev(root, &llist);
+		vm_map_splay_findnext(root, &rlist);
+		if (llist != NULL) {
+			root = llist;
+			llist = root->right;
+			root->right = NULL;
+		} else if (rlist != NULL) {
+			root = rlist;
+			rlist = root->left;
+			root->left = NULL;
+		} else
+			root = NULL;
+		break;
 	}
+	if (root != NULL)
+		root = vm_map_splay_merge(root, llist, rlist,
+		    root->left, root->right);
 	map->root = root;
-
-	prev = entry->prev;
-	next = entry->next;
-	next->prev = prev;
-	prev->next = next;
+	VM_MAP_ASSERT_CONSISTENT(map);
 	map->nentries--;
 	CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
 	    map->nentries, entry);
 }
 
 /*
  *	vm_map_entry_resize_free:
  *
- *	Recompute the amount of free space following a vm_map_entry
- *	and propagate that value up the tree.  Call this function after
- *	resizing a map entry in-place, that is, without a call to
- *	vm_map_entry_link() or _unlink().
+ *	Recompute the amount of free space following a modified vm_map_entry
+ *	and propagate those values up the tree.  Call this function after
+ *	resizing a map entry in-place by changing the end value, without a
+ *	call to vm_map_entry_link() or _unlink().
  *
  *	The map must be locked, and leaves it so.
  */
 static void
 vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry)
 {
+	vm_map_entry_t llist, rlist, root;
 
-	/*
-	 * Using splay trees without parent pointers, propagating
-	 * max_free up the tree is done by moving the entry to the
-	 * root and making the change there.
-	 */
-	if (entry != map->root)
-		map->root = vm_map_entry_splay(entry->start, map->root);
-
-	entry->adj_free = entry->next->start - entry->end;
-	vm_map_entry_set_max_free(entry);
+	VM_MAP_ASSERT_LOCKED(map);
+	root = map->root;
+	root = vm_map_splay_split(entry->start, 0, root, &llist, &rlist);
+	KASSERT(root != NULL,
+	    ("vm_map_entry_resize_free: resize_free object not mapped"));
+	vm_map_splay_findnext(root, &rlist);
+	root->right = NULL;
+	map->root = vm_map_splay_merge(root, llist, rlist,
+	    root->left, root->right);
+	VM_MAP_ASSERT_CONSISTENT(map);
+	CTR3(KTR_VM, "vm_map_entry_resize_free: map %p, nentries %d, entry %p", map,
+	    map->nentries, entry);
 }
 
 /*
  *	vm_map_lookup_entry:	[ internal use only ]
  *
  *	Finds the map entry containing (or
  *	immediately preceding) the specified address
  *	in the given map; the entry is returned
  *	in the "entry" parameter.  The boolean
  *	result indicates whether the address is
  *	actually contained in the map.
  */
 boolean_t
 vm_map_lookup_entry(
 	vm_map_t map,
 	vm_offset_t address,
 	vm_map_entry_t *entry)	/* OUT */
 {
-	vm_map_entry_t cur;
+	vm_map_entry_t cur, lbound;
 	boolean_t locked;
 
 	/*
 	 * If the map is empty, then the map entry immediately preceding
 	 * "address" is the map's header.
 	 */
 	cur = map->root;
-	if (cur == NULL)
+	if (cur == NULL) {
 		*entry = &map->header;
-	else if (address >= cur->start && cur->end > address) {
+		return (FALSE);
+	}
+	if (address >= cur->start && cur->end > address) {
 		*entry = cur;
 		return (TRUE);
-	} else if ((locked = vm_map_locked(map)) ||
+	}
+	if ((locked = vm_map_locked(map)) ||
 	    sx_try_upgrade(&map->lock)) {
 		/*
 		 * Splay requires a write lock on the map.  However, it only
 		 * restructures the binary search tree; it does not otherwise
 		 * change the map.  Thus, the map's timestamp need not change
 		 * on a temporary upgrade.
 		 */
 		map->root = cur = vm_map_entry_splay(address, cur);
+		VM_MAP_ASSERT_CONSISTENT(map);
 		if (!locked)
 			sx_downgrade(&map->lock);
 
 		/*
 		 * If "address" is contained within a map entry, the new root
 		 * is that map entry.  Otherwise, the new root is a map entry
 		 * immediately before or after "address".
 		 */
-		if (address >= cur->start) {
+		if (address < cur->start) {
+			*entry = &map->header;
+			return (FALSE);
+		}
+		*entry = cur;
+		return (address < cur->end);
+	}
+	/*
+	 * Since the map is only locked for read access, perform a
+	 * standard binary search tree lookup for "address".
+	 */
+	lbound = &map->header;
+	do {
+		if (address < cur->start) {
+			cur = cur->left;
+		} else if (cur->end <= address) {
+			lbound = cur;
+			cur = cur->right;
+		} else {
 			*entry = cur;
-			if (cur->end > address)
-				return (TRUE);
-		} else
-			*entry = cur->prev;
-	} else
-		/*
-		 * Since the map is only locked for read access, perform a
-		 * standard binary search tree lookup for "address".
-		 */
-		for (;;) {
-			if (address < cur->start) {
-				if (cur->left == NULL) {
-					*entry = cur->prev;
-					break;
-				}
-				cur = cur->left;
-			} else if (cur->end > address) {
-				*entry = cur;
-				return (TRUE);
-			} else {
-				if (cur->right == NULL) {
-					*entry = cur;
-					break;
-				}
-				cur = cur->right;
-			}
+			return (TRUE);
 		}
+	} while (cur != NULL);
+	*entry = lbound;
 	return (FALSE);
 }
 
 /*
  *	vm_map_insert:
  *
  *	Inserts the given whole VM object into the target
  *	map at the specified address range.  The object's
  *	size should match that of the address range.
  *
  *	Requires that the map be locked, and leaves it so.
  *
  *	If object is non-NULL, ref count must be bumped by caller
  *	prior to making call to account for the new entry.
  */
 int
 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
 {
 	vm_map_entry_t new_entry, prev_entry, temp_entry;
 	struct ucred *cred;
 	vm_eflags_t protoeflags;
 	vm_inherit_t inheritance;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(object != kernel_object ||
 	    (cow & MAP_COPY_ON_WRITE) == 0,
 	    ("vm_map_insert: kernel object and COW"));
 	KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
 	    ("vm_map_insert: paradoxical MAP_NOFAULT request"));
 	KASSERT((prot & ~max) == 0,
 	    ("prot %#x is not subset of max_prot %#x", prot, max));
 
 	/*
 	 * Check that the start and end points are not bogus.
 	 */
 	if (start < vm_map_min(map) || end > vm_map_max(map) ||
 	    start >= end)
 		return (KERN_INVALID_ADDRESS);
 
 	/*
 	 * Find the entry prior to the proposed starting address; if it's part
 	 * of an existing entry, this range is bogus.
 	 */
 	if (vm_map_lookup_entry(map, start, &temp_entry))
 		return (KERN_NO_SPACE);
 
 	prev_entry = temp_entry;
 
 	/*
 	 * Assert that the next entry doesn't overlap the end point.
 	 */
 	if (prev_entry->next->start < end)
 		return (KERN_NO_SPACE);
 
 	if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
 	    max != VM_PROT_NONE))
 		return (KERN_INVALID_ARGUMENT);
 
 	protoeflags = 0;
 	if (cow & MAP_COPY_ON_WRITE)
 		protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
 	if (cow & MAP_NOFAULT)
 		protoeflags |= MAP_ENTRY_NOFAULT;
 	if (cow & MAP_DISABLE_SYNCER)
 		protoeflags |= MAP_ENTRY_NOSYNC;
 	if (cow & MAP_DISABLE_COREDUMP)
 		protoeflags |= MAP_ENTRY_NOCOREDUMP;
 	if (cow & MAP_STACK_GROWS_DOWN)
 		protoeflags |= MAP_ENTRY_GROWS_DOWN;
 	if (cow & MAP_STACK_GROWS_UP)
 		protoeflags |= MAP_ENTRY_GROWS_UP;
 	if (cow & MAP_VN_WRITECOUNT)
 		protoeflags |= MAP_ENTRY_VN_WRITECNT;
 	if ((cow & MAP_CREATE_GUARD) != 0)
 		protoeflags |= MAP_ENTRY_GUARD;
 	if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
 		protoeflags |= MAP_ENTRY_STACK_GAP_DN;
 	if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
 		protoeflags |= MAP_ENTRY_STACK_GAP_UP;
 	if (cow & MAP_INHERIT_SHARE)
 		inheritance = VM_INHERIT_SHARE;
 	else
 		inheritance = VM_INHERIT_DEFAULT;
 
 	cred = NULL;
 	if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
 		goto charged;
 	if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
 	    ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
 		if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
 			return (KERN_RESOURCE_SHORTAGE);
 		KASSERT(object == NULL ||
 		    (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
 		    object->cred == NULL,
 		    ("overcommit: vm_map_insert o %p", object));
 		cred = curthread->td_ucred;
 	}
 
 charged:
 	/* Expand the kernel pmap, if necessary. */
 	if (map == kernel_map && end > kernel_vm_end)
 		pmap_growkernel(end);
 	if (object != NULL) {
 		/*
 		 * OBJ_ONEMAPPING must be cleared unless this mapping
 		 * is trivially proven to be the only mapping for any
 		 * of the object's pages.  (Object granularity
 		 * reference counting is insufficient to recognize
 		 * aliases with precision.)
 		 */
 		VM_OBJECT_WLOCK(object);
 		if (object->ref_count > 1 || object->shadow_count != 0)
 			vm_object_clear_flag(object, OBJ_ONEMAPPING);
 		VM_OBJECT_WUNLOCK(object);
 	} else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) ==
 	    protoeflags &&
 	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 &&
 	    prev_entry->end == start && (prev_entry->cred == cred ||
 	    (prev_entry->object.vm_object != NULL &&
 	    prev_entry->object.vm_object->cred == cred)) &&
 	    vm_object_coalesce(prev_entry->object.vm_object,
 	    prev_entry->offset,
 	    (vm_size_t)(prev_entry->end - prev_entry->start),
 	    (vm_size_t)(end - prev_entry->end), cred != NULL &&
 	    (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
 		/*
 		 * We were able to extend the object.  Determine if we
 		 * can extend the previous map entry to include the
 		 * new range as well.
 		 */
 		if (prev_entry->inheritance == inheritance &&
 		    prev_entry->protection == prot &&
 		    prev_entry->max_protection == max &&
 		    prev_entry->wired_count == 0) {
 			KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) ==
 			    0, ("prev_entry %p has incoherent wiring",
 			    prev_entry));
 			if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
 				map->size += end - prev_entry->end;
 			prev_entry->end = end;
 			vm_map_entry_resize_free(map, prev_entry);
 			vm_map_simplify_entry(map, prev_entry);
 			return (KERN_SUCCESS);
 		}
 
 		/*
 		 * If we can extend the object but cannot extend the
 		 * map entry, we have to create a new map entry.  We
 		 * must bump the ref count on the extended object to
 		 * account for it.  object may be NULL.
 		 */
 		object = prev_entry->object.vm_object;
 		offset = prev_entry->offset +
 		    (prev_entry->end - prev_entry->start);
 		vm_object_reference(object);
 		if (cred != NULL && object != NULL && object->cred != NULL &&
 		    !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
 			/* Object already accounts for this uid. */
 			cred = NULL;
 		}
 	}
 	if (cred != NULL)
 		crhold(cred);
 
 	/*
 	 * Create a new entry
 	 */
 	new_entry = vm_map_entry_create(map);
 	new_entry->start = start;
 	new_entry->end = end;
 	new_entry->cred = NULL;
 
 	new_entry->eflags = protoeflags;
 	new_entry->object.vm_object = object;
 	new_entry->offset = offset;
 
 	new_entry->inheritance = inheritance;
 	new_entry->protection = prot;
 	new_entry->max_protection = max;
 	new_entry->wired_count = 0;
 	new_entry->wiring_thread = NULL;
 	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
 	new_entry->next_read = start;
 
 	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
 	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
 	new_entry->cred = cred;
 
 	/*
 	 * Insert the new entry into the list
 	 */
-	vm_map_entry_link(map, prev_entry, new_entry);
+	vm_map_entry_link(map, new_entry);
 	if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
 		map->size += new_entry->end - new_entry->start;
 
 	/*
 	 * Try to coalesce the new entry with both the previous and next
 	 * entries in the list.  Previously, we only attempted to coalesce
 	 * with the previous entry when object is NULL.  Here, we handle the
 	 * other cases, which are less common.
 	 */
 	vm_map_simplify_entry(map, new_entry);
 
 	if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
 		vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
 		    end - start, cow & MAP_PREFAULT_PARTIAL);
 	}
 
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_findspace:
  *
  *	Find the first fit (lowest VM address) for "length" free bytes
  *	beginning at address >= start in the given map.
  *
- *	In a vm_map_entry, "adj_free" is the amount of free space
- *	adjacent (higher address) to this entry, and "max_free" is the
- *	maximum amount of contiguous free space in its subtree.  This
- *	allows finding a free region in one path down the tree, so
- *	O(log n) amortized with splay trees.
+ *	In a vm_map_entry, "max_free" is the maximum amount of
+ *	contiguous free space between an entry in its subtree and a
+ *	neighbor of that entry.  This allows finding a free region in
+ *	one path down the tree, so O(log n) amortized with splay
+ *	trees.
  *
  *	The map must be locked, and leaves it so.
  *
- *	Returns: 0 on success, and starting address in *addr,
- *		 1 if insufficient space.
+ *	Returns: starting address if sufficient space,
+ *		 vm_map_max(map)-length+1 if insufficient space.
  */
-int
-vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
-    vm_offset_t *addr)	/* OUT */
+vm_offset_t
+vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
 {
-	vm_map_entry_t entry;
-	vm_offset_t st;
+	vm_map_entry_t llist, rlist, root, y;
+	vm_size_t left_length;
 
 	/*
 	 * Request must fit within min/max VM address and must avoid
 	 * address wrap.
 	 */
 	start = MAX(start, vm_map_min(map));
 	if (start + length > vm_map_max(map) || start + length < start)
-		return (1);
+		return (vm_map_max(map) - length + 1);
 
 	/* Empty tree means wide open address space. */
-	if (map->root == NULL) {
-		*addr = start;
-		return (0);
-	}
+	if (map->root == NULL)
+		return (start);
 
 	/*
 	 * After splay, if start comes before root node, then there
 	 * must be a gap from start to the root.
 	 */
-	map->root = vm_map_entry_splay(start, map->root);
-	if (start + length <= map->root->start) {
-		*addr = start;
-		return (0);
+	root = vm_map_splay_split(start, length, map->root,
+	    &llist, &rlist);
+	if (root != NULL)
+		start = root->end;
+	else if (rlist != NULL) {
+		root = rlist;
+		rlist = root->left;
+		root->left = NULL;
+	} else {
+		root = llist;
+		llist = root->right;
+		root->right = NULL;
 	}
+	map->root = vm_map_splay_merge(root, llist, rlist,
+	    root->left, root->right);
+	VM_MAP_ASSERT_CONSISTENT(map);
+	if (start + length <= root->start)
+		return (start);
 
 	/*
 	 * Root is the last node that might begin its gap before
 	 * start, and this is the last comparison where address
 	 * wrap might be a problem.
 	 */
-	st = (start > map->root->end) ? start : map->root->end;
-	if (length <= map->root->end + map->root->adj_free - st) {
-		*addr = st;
-		return (0);
-	}
+	if (root->right == NULL &&
+	    start + length <= vm_map_max(map))
+		return (start);
 
 	/* With max_free, can immediately tell if no solution. */
-	entry = map->root->right;
-	if (entry == NULL || length > entry->max_free)
-		return (1);
+	if (root->right == NULL || length > root->right->max_free)
+		return (vm_map_max(map) - length + 1);
 
 	/*
-	 * Search the right subtree in the order: left subtree, root,
-	 * right subtree (first fit).  The previous splay implies that
-	 * all regions in the right subtree have addresses > start.
+	 * Splay for the least large-enough gap in the right subtree.
 	 */
-	while (entry != NULL) {
-		if (entry->left != NULL && entry->left->max_free >= length)
-			entry = entry->left;
-		else if (entry->adj_free >= length) {
-			*addr = entry->end;
-			return (0);
-		} else
-			entry = entry->right;
+	llist = NULL;
+        rlist = NULL;
+	for (left_length = 0; ;
+	     left_length = root->left != NULL ?
+	     root->left->max_free : root->start - llist->end) {
+		if (length <= left_length)
+			SPLAY_LEFT_STEP(root, y, rlist,
+			    length <= (y->left != NULL ?
+			    y->left->max_free : y->start - llist->end));
+		else
+			SPLAY_RIGHT_STEP(root, y, llist,
+			    length > (y->left != NULL ?
+			    y->left->max_free : y->start - root->end));
+		if (root == NULL)
+			break;
 	}
-
-	/* Can't get here, so panic if we do. */
-	panic("vm_map_findspace: max_free corrupt");
+	root = llist;
+	llist = root->right;
+	if ((y = rlist) == NULL)
+		root->right = NULL;
+	else {
+		rlist = y->left;
+		y->left = NULL;
+		root->right = y->right;
+	}
+	root = vm_map_splay_merge(root, llist, rlist,
+	    root->left, root->right);
+	if (y != NULL) {
+		y->right = root->right;
+		vm_map_entry_set_max_free(y);
+		root->right = y;
+		vm_map_entry_set_max_free(root);
+	}
+	map->root = root;
+	VM_MAP_ASSERT_CONSISTENT(map);
+	return (root->end);
 }
 
 int
 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t start, vm_size_t length, vm_prot_t prot,
     vm_prot_t max, int cow)
 {
 	vm_offset_t end;
 	int result;
 
 	end = start + length;
 	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
 	    object == NULL,
 	    ("vm_map_fixed: non-NULL backing object for stack"));
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if ((cow & MAP_CHECK_EXCL) == 0)
 		vm_map_delete(map, start, end);
 	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
 		result = vm_map_stack_locked(map, start, length, sgrowsiz,
 		    prot, max, cow);
 	} else {
 		result = vm_map_insert(map, object, offset, start, end,
 		    prot, max, cow);
 	}
 	vm_map_unlock(map);
 	return (result);
 }
 
 static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
 static const int aslr_pages_rnd_32[2] = {0x100, 0x4};
 
 static int cluster_anon = 1;
 SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
     &cluster_anon, 0,
     "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always");
 
 static bool
 clustering_anon_allowed(vm_offset_t addr)
 {
 
 	switch (cluster_anon) {
 	case 0:
 		return (false);
 	case 1:
 		return (addr == 0);
 	case 2:
 	default:
 		return (true);
 	}
 }
 
 static long aslr_restarts;
 SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD,
     &aslr_restarts, 0,
     "Number of aslr failures");
 
 #define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
 
 /*
  * Searches for the specified amount of free space in the given map with the
  * specified alignment.  Performs an address-ordered, first-fit search from
  * the given address "*addr", with an optional upper bound "max_addr".  If the
  * parameter "alignment" is zero, then the alignment is computed from the
  * given (object, offset) pair so as to enable the greatest possible use of
  * superpage mappings.  Returns KERN_SUCCESS and the address of the free space
  * in "*addr" if successful.  Otherwise, returns KERN_NO_SPACE.
  *
  * The map must be locked.  Initially, there must be at least "length" bytes
  * of free space at the given address.
  */
 static int
 vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr,
     vm_offset_t alignment)
 {
 	vm_offset_t aligned_addr, free_addr;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	free_addr = *addr;
-	KASSERT(!vm_map_findspace(map, free_addr, length, addr) &&
-	    free_addr == *addr, ("caller provided insufficient free space"));
+	KASSERT(free_addr == vm_map_findspace(map, free_addr, length),
+	    ("caller failed to provide space %d at address %p",
+	     (int)length, (void*)free_addr));
 	for (;;) {
 		/*
 		 * At the start of every iteration, the free space at address
 		 * "*addr" is at least "length" bytes.
 		 */
 		if (alignment == 0)
 			pmap_align_superpage(object, offset, addr, length);
 		else if ((*addr & (alignment - 1)) != 0) {
 			*addr &= ~(alignment - 1);
 			*addr += alignment;
 		}
 		aligned_addr = *addr;
 		if (aligned_addr == free_addr) {
 			/*
 			 * Alignment did not change "*addr", so "*addr" must
 			 * still provide sufficient free space.
 			 */
 			return (KERN_SUCCESS);
 		}
 
 		/*
 		 * Test for address wrap on "*addr".  A wrapped "*addr" could
 		 * be a valid address, in which case vm_map_findspace() cannot
 		 * be relied upon to fail.
 		 */
-		if (aligned_addr < free_addr ||
-		    vm_map_findspace(map, aligned_addr, length, addr) ||
+		if (aligned_addr < free_addr)
+			return (KERN_NO_SPACE);
+		*addr = vm_map_findspace(map, aligned_addr, length);
+		if (*addr + length > vm_map_max(map) ||
 		    (max_addr != 0 && *addr + length > max_addr))
 			return (KERN_NO_SPACE);
 		free_addr = *addr;
 		if (free_addr == aligned_addr) {
 			/*
 			 * If a successful call to vm_map_findspace() did not
 			 * change "*addr", then "*addr" must still be aligned
 			 * and provide sufficient free space.
 			 */
 			return (KERN_SUCCESS);
 		}
 	}
 }
 
 /*
  *	vm_map_find finds an unallocated region in the target address
  *	map with the given length.  The search is defined to be
  *	first-fit from the specified address; the region found is
  *	returned in the same parameter.
  *
  *	If object is non-NULL, ref count must be bumped by caller
  *	prior to making call to account for the new entry.
  */
 int
 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
 	    vm_offset_t *addr,	/* IN/OUT */
 	    vm_size_t length, vm_offset_t max_addr, int find_space,
 	    vm_prot_t prot, vm_prot_t max, int cow)
 {
 	vm_offset_t alignment, curr_min_addr, min_addr;
 	int gap, pidx, rv, try;
 	bool cluster, en_aslr, update_anon;
 
 	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
 	    object == NULL,
 	    ("vm_map_find: non-NULL backing object for stack"));
 	MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
 	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
 	if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
 	    (object->flags & OBJ_COLORED) == 0))
 		find_space = VMFS_ANY_SPACE;
 	if (find_space >> 8 != 0) {
 		KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
 		alignment = (vm_offset_t)1 << (find_space >> 8);
 	} else
 		alignment = 0;
 	en_aslr = (map->flags & MAP_ASLR) != 0;
 	update_anon = cluster = clustering_anon_allowed(*addr) &&
 	    (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 &&
 	    find_space != VMFS_NO_SPACE && object == NULL &&
 	    (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP |
 	    MAP_STACK_GROWS_DOWN)) == 0 && prot != PROT_NONE;
 	curr_min_addr = min_addr = *addr;
 	if (en_aslr && min_addr == 0 && !cluster &&
 	    find_space != VMFS_NO_SPACE &&
 	    (map->flags & MAP_ASLR_IGNSTART) != 0)
 		curr_min_addr = min_addr = vm_map_min(map);
 	try = 0;
 	vm_map_lock(map);
 	if (cluster) {
 		curr_min_addr = map->anon_loc;
 		if (curr_min_addr == 0)
 			cluster = false;
 	}
 	if (find_space != VMFS_NO_SPACE) {
 		KASSERT(find_space == VMFS_ANY_SPACE ||
 		    find_space == VMFS_OPTIMAL_SPACE ||
 		    find_space == VMFS_SUPER_SPACE ||
 		    alignment != 0, ("unexpected VMFS flag"));
 again:
 		/*
 		 * When creating an anonymous mapping, try clustering
 		 * with an existing anonymous mapping first.
 		 *
 		 * We make up to two attempts to find address space
 		 * for a given find_space value. The first attempt may
 		 * apply randomization or may cluster with an existing
 		 * anonymous mapping. If this first attempt fails,
 		 * perform a first-fit search of the available address
 		 * space.
 		 *
 		 * If all tries failed, and find_space is
 		 * VMFS_OPTIMAL_SPACE, fallback to VMFS_ANY_SPACE.
 		 * Again enable clustering and randomization.
 		 */
 		try++;
 		MPASS(try <= 2);
 
 		if (try == 2) {
 			/*
 			 * Second try: we failed either to find a
 			 * suitable region for randomizing the
 			 * allocation, or to cluster with an existing
 			 * mapping.  Retry with free run.
 			 */
 			curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ?
 			    vm_map_min(map) : min_addr;
 			atomic_add_long(&aslr_restarts, 1);
 		}
 
 		if (try == 1 && en_aslr && !cluster) {
 			/*
 			 * Find space for allocation, including
 			 * gap needed for later randomization.
 			 */
 			pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 &&
 			    (find_space == VMFS_SUPER_SPACE || find_space ==
 			    VMFS_OPTIMAL_SPACE) ? 1 : 0;
 			gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
 			    (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
 			    aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
-			if (vm_map_findspace(map, curr_min_addr, length +
-			    gap * pagesizes[pidx], addr))
+			*addr = vm_map_findspace(map, curr_min_addr,
+			    length + gap * pagesizes[pidx]);
+			if (*addr + length + gap * pagesizes[pidx] >
++			    vm_map_max(map))
 				goto again;
 			/* And randomize the start address. */
 			*addr += (arc4random() % gap) * pagesizes[pidx];
 			if (max_addr != 0 && *addr + length > max_addr)
 				goto again;
-		} else if (vm_map_findspace(map, curr_min_addr, length, addr) ||
-		    (max_addr != 0 && *addr + length > max_addr)) {
-			if (cluster) {
-				cluster = false;
-				MPASS(try == 1);
-				goto again;
+		} else {
+			*addr = vm_map_findspace(map, curr_min_addr, length);
+			if (*addr + length > vm_map_max(map) ||
+			    (max_addr != 0 && *addr + length > max_addr)) {
+				if (cluster) {
+					cluster = false;
+					MPASS(try == 1);
+					goto again;
+				}
+				rv = KERN_NO_SPACE;
+				goto done;
 			}
-			rv = KERN_NO_SPACE;
-			goto done;
 		}
 
 		if (find_space != VMFS_ANY_SPACE &&
 		    (rv = vm_map_alignspace(map, object, offset, addr, length,
 		    max_addr, alignment)) != KERN_SUCCESS) {
 			if (find_space == VMFS_OPTIMAL_SPACE) {
 				find_space = VMFS_ANY_SPACE;
 				curr_min_addr = min_addr;
 				cluster = update_anon;
 				try = 0;
 				goto again;
 			}
 			goto done;
 		}
 	} else if ((cow & MAP_REMAP) != 0) {
 		if (*addr < vm_map_min(map) ||
 		    *addr + length > vm_map_max(map) ||
 		    *addr + length <= length) {
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
 		}
 		vm_map_delete(map, *addr, *addr + length);
 	}
 	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
 		rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot,
 		    max, cow);
 	} else {
 		rv = vm_map_insert(map, object, offset, *addr, *addr + length,
 		    prot, max, cow);
 	}
 	if (rv == KERN_SUCCESS && update_anon)
 		map->anon_loc = *addr + length;
 done:
 	vm_map_unlock(map);
 	return (rv);
 }
 
 /*
  *	vm_map_find_min() is a variant of vm_map_find() that takes an
  *	additional parameter (min_addr) and treats the given address
  *	(*addr) differently.  Specifically, it treats *addr as a hint
  *	and not as the minimum address where the mapping is created.
  *
  *	This function works in two phases.  First, it tries to
  *	allocate above the hint.  If that fails and the hint is
  *	greater than min_addr, it performs a second pass, replacing
  *	the hint with min_addr as the minimum address for the
  *	allocation.
  */
 int
 vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
     vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
     int cow)
 {
 	vm_offset_t hint;
 	int rv;
 
 	hint = *addr;
 	for (;;) {
 		rv = vm_map_find(map, object, offset, addr, length, max_addr,
 		    find_space, prot, max, cow);
 		if (rv == KERN_SUCCESS || min_addr >= hint)
 			return (rv);
 		*addr = hint = min_addr;
 	}
 }
 
 /*
  * A map entry with any of the following flags set must not be merged with
  * another entry.
  */
 #define	MAP_ENTRY_NOMERGE_MASK	(MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \
 	    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)
 
 static bool
 vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry)
 {
 
 	KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 ||
 	    (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0,
 	    ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable",
 	    prev, entry));
 	return (prev->end == entry->start &&
 	    prev->object.vm_object == entry->object.vm_object &&
 	    (prev->object.vm_object == NULL ||
 	    prev->offset + (prev->end - prev->start) == entry->offset) &&
 	    prev->eflags == entry->eflags &&
 	    prev->protection == entry->protection &&
 	    prev->max_protection == entry->max_protection &&
 	    prev->inheritance == entry->inheritance &&
 	    prev->wired_count == entry->wired_count &&
 	    prev->cred == entry->cred);
 }
 
 static void
 vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry)
 {
 
 	/*
 	 * If the backing object is a vnode object, vm_object_deallocate()
 	 * calls vrele().  However, vrele() does not lock the vnode because
 	 * the vnode has additional references.  Thus, the map lock can be
 	 * kept without causing a lock-order reversal with the vnode lock.
 	 *
 	 * Since we count the number of virtual page mappings in
 	 * object->un_pager.vnp.writemappings, the writemappings value
 	 * should not be adjusted when the entry is disposed of.
 	 */
 	if (entry->object.vm_object != NULL)
 		vm_object_deallocate(entry->object.vm_object);
 	if (entry->cred != NULL)
 		crfree(entry->cred);
 	vm_map_entry_dispose(map, entry);
 }
 
 /*
  *	vm_map_simplify_entry:
  *
  *	Simplify the given map entry by merging with either neighbor.  This
  *	routine also has the ability to merge with both neighbors.
  *
  *	The map must be locked.
  *
  *	This routine guarantees that the passed entry remains valid (though
  *	possibly extended).  When merging, this routine may delete one or
  *	both neighbors.
  */
 void
 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_map_entry_t next, prev;
 
 	if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) != 0)
 		return;
 	prev = entry->prev;
 	if (vm_map_mergeable_neighbors(prev, entry)) {
-		vm_map_entry_unlink(map, prev);
-		entry->start = prev->start;
-		entry->offset = prev->offset;
-		if (entry->prev != &map->header)
-			vm_map_entry_resize_free(map, entry->prev);
+		vm_map_entry_unlink(map, prev, UNLINK_MERGE_NEXT);
 		vm_map_merged_neighbor_dispose(map, prev);
 	}
 	next = entry->next;
 	if (vm_map_mergeable_neighbors(entry, next)) {
-		vm_map_entry_unlink(map, next);
-		entry->end = next->end;
-		vm_map_entry_resize_free(map, entry);
+		vm_map_entry_unlink(map, next, UNLINK_MERGE_PREV);
 		vm_map_merged_neighbor_dispose(map, next);
 	}
 }
 
 /*
  *	vm_map_clip_start:	[ internal use only ]
  *
  *	Asserts that the given entry begins at or after
  *	the specified address; if necessary,
  *	it splits the entry into two.
  */
 #define vm_map_clip_start(map, entry, startaddr) \
 { \
 	if (startaddr > entry->start) \
 		_vm_map_clip_start(map, entry, startaddr); \
 }
 
 /*
  *	This routine is called only when it is known that
  *	the entry must be split.
  */
 static void
 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
 {
 	vm_map_entry_t new_entry;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(entry->end > start && entry->start < start,
 	    ("_vm_map_clip_start: invalid clip of entry %p", entry));
 
 	/*
 	 * Split off the front portion -- note that we must insert the new
 	 * entry BEFORE this one, so that this entry has the specified
 	 * starting address.
 	 */
 	vm_map_simplify_entry(map, entry);
 
 	/*
 	 * If there is no object backing this entry, we might as well create
 	 * one now.  If we defer it, an object can get created after the map
 	 * is clipped, and individual objects will be created for the split-up
 	 * map.  This is a bit of a hack, but is also about the best place to
 	 * put this improvement.
 	 */
 	if (entry->object.vm_object == NULL && !map->system_map &&
 	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
 		vm_object_t object;
 		object = vm_object_allocate(OBJT_DEFAULT,
 				atop(entry->end - entry->start));
 		entry->object.vm_object = object;
 		entry->offset = 0;
 		if (entry->cred != NULL) {
 			object->cred = entry->cred;
 			object->charge = entry->end - entry->start;
 			entry->cred = NULL;
 		}
 	} else if (entry->object.vm_object != NULL &&
 		   ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
 		   entry->cred != NULL) {
 		VM_OBJECT_WLOCK(entry->object.vm_object);
 		KASSERT(entry->object.vm_object->cred == NULL,
 		    ("OVERCOMMIT: vm_entry_clip_start: both cred e %p", entry));
 		entry->object.vm_object->cred = entry->cred;
 		entry->object.vm_object->charge = entry->end - entry->start;
 		VM_OBJECT_WUNLOCK(entry->object.vm_object);
 		entry->cred = NULL;
 	}
 
 	new_entry = vm_map_entry_create(map);
 	*new_entry = *entry;
 
 	new_entry->end = start;
 	entry->offset += (start - entry->start);
 	entry->start = start;
 	if (new_entry->cred != NULL)
 		crhold(entry->cred);
 
-	vm_map_entry_link(map, entry->prev, new_entry);
+	vm_map_entry_link(map, new_entry);
 
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 		vm_object_reference(new_entry->object.vm_object);
 		/*
 		 * The object->un_pager.vnp.writemappings for the
 		 * object of MAP_ENTRY_VN_WRITECNT type entry shall be
 		 * kept as is here.  The virtual pages are
 		 * re-distributed among the clipped entries, so the sum is
 		 * left the same.
 		 */
 	}
 }
 
 /*
  *	vm_map_clip_end:	[ internal use only ]
  *
  *	Asserts that the given entry ends at or before
  *	the specified address; if necessary,
  *	it splits the entry into two.
  */
 #define vm_map_clip_end(map, entry, endaddr) \
 { \
 	if ((endaddr) < (entry->end)) \
 		_vm_map_clip_end((map), (entry), (endaddr)); \
 }
 
 /*
  *	This routine is called only when it is known that
  *	the entry must be split.
  */
 static void
 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
 {
 	vm_map_entry_t new_entry;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(entry->start < end && entry->end > end,
 	    ("_vm_map_clip_end: invalid clip of entry %p", entry));
 
 	/*
 	 * If there is no object backing this entry, we might as well create
 	 * one now.  If we defer it, an object can get created after the map
 	 * is clipped, and individual objects will be created for the split-up
 	 * map.  This is a bit of a hack, but is also about the best place to
 	 * put this improvement.
 	 */
 	if (entry->object.vm_object == NULL && !map->system_map &&
 	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
 		vm_object_t object;
 		object = vm_object_allocate(OBJT_DEFAULT,
 				atop(entry->end - entry->start));
 		entry->object.vm_object = object;
 		entry->offset = 0;
 		if (entry->cred != NULL) {
 			object->cred = entry->cred;
 			object->charge = entry->end - entry->start;
 			entry->cred = NULL;
 		}
 	} else if (entry->object.vm_object != NULL &&
 		   ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
 		   entry->cred != NULL) {
 		VM_OBJECT_WLOCK(entry->object.vm_object);
 		KASSERT(entry->object.vm_object->cred == NULL,
 		    ("OVERCOMMIT: vm_entry_clip_end: both cred e %p", entry));
 		entry->object.vm_object->cred = entry->cred;
 		entry->object.vm_object->charge = entry->end - entry->start;
 		VM_OBJECT_WUNLOCK(entry->object.vm_object);
 		entry->cred = NULL;
 	}
 
 	/*
 	 * Create a new entry and insert it AFTER the specified entry
 	 */
 	new_entry = vm_map_entry_create(map);
 	*new_entry = *entry;
 
 	new_entry->start = entry->end = end;
 	new_entry->offset += (end - entry->start);
 	if (new_entry->cred != NULL)
 		crhold(entry->cred);
 
-	vm_map_entry_link(map, entry, new_entry);
+	vm_map_entry_link(map, new_entry);
 
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 		vm_object_reference(new_entry->object.vm_object);
 	}
 }
 
 /*
  *	vm_map_submap:		[ kernel use only ]
  *
  *	Mark the given range as handled by a subordinate map.
  *
  *	This range must have been created with vm_map_find,
  *	and no other operations may have been performed on this
  *	range prior to calling vm_map_submap.
  *
  *	Only a limited number of operations can be performed
  *	within this rage after calling vm_map_submap:
  *		vm_fault
  *	[Don't try vm_map_copy!]
  *
  *	To remove a submapping, one must first remove the
  *	range from the superior map, and then destroy the
  *	submap (if desired).  [Better yet, don't try it.]
  */
 int
 vm_map_submap(
 	vm_map_t map,
 	vm_offset_t start,
 	vm_offset_t end,
 	vm_map_t submap)
 {
 	vm_map_entry_t entry;
 	int result;
 
 	result = KERN_INVALID_ARGUMENT;
 
 	vm_map_lock(submap);
 	submap->flags |= MAP_IS_SUB_MAP;
 	vm_map_unlock(submap);
 
 	vm_map_lock(map);
 
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	if (vm_map_lookup_entry(map, start, &entry)) {
 		vm_map_clip_start(map, entry, start);
 	} else
 		entry = entry->next;
 
 	vm_map_clip_end(map, entry, end);
 
 	if ((entry->start == start) && (entry->end == end) &&
 	    ((entry->eflags & MAP_ENTRY_COW) == 0) &&
 	    (entry->object.vm_object == NULL)) {
 		entry->object.sub_map = submap;
 		entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
 		result = KERN_SUCCESS;
 	}
 	vm_map_unlock(map);
 
 	if (result != KERN_SUCCESS) {
 		vm_map_lock(submap);
 		submap->flags &= ~MAP_IS_SUB_MAP;
 		vm_map_unlock(submap);
 	}
 	return (result);
 }
 
 /*
  * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
  */
 #define	MAX_INIT_PT	96
 
 /*
  *	vm_map_pmap_enter:
  *
  *	Preload the specified map's pmap with mappings to the specified
  *	object's memory-resident pages.  No further physical pages are
  *	allocated, and no further virtual pages are retrieved from secondary
  *	storage.  If the specified flags include MAP_PREFAULT_PARTIAL, then a
  *	limited number of page mappings are created at the low-end of the
  *	specified address range.  (For this purpose, a superpage mapping
  *	counts as one page mapping.)  Otherwise, all resident pages within
  *	the specified address range are mapped.
  */
 static void
 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
 {
 	vm_offset_t start;
 	vm_page_t p, p_start;
 	vm_pindex_t mask, psize, threshold, tmpidx;
 
 	if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
 		return;
 	VM_OBJECT_RLOCK(object);
 	if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
 		VM_OBJECT_RUNLOCK(object);
 		VM_OBJECT_WLOCK(object);
 		if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
 			pmap_object_init_pt(map->pmap, addr, object, pindex,
 			    size);
 			VM_OBJECT_WUNLOCK(object);
 			return;
 		}
 		VM_OBJECT_LOCK_DOWNGRADE(object);
 	}
 
 	psize = atop(size);
 	if (psize + pindex > object->size) {
 		if (object->size < pindex) {
 			VM_OBJECT_RUNLOCK(object);
 			return;
 		}
 		psize = object->size - pindex;
 	}
 
 	start = 0;
 	p_start = NULL;
 	threshold = MAX_INIT_PT;
 
 	p = vm_page_find_least(object, pindex);
 	/*
 	 * Assert: the variable p is either (1) the page with the
 	 * least pindex greater than or equal to the parameter pindex
 	 * or (2) NULL.
 	 */
 	for (;
 	     p != NULL && (tmpidx = p->pindex - pindex) < psize;
 	     p = TAILQ_NEXT(p, listq)) {
 		/*
 		 * don't allow an madvise to blow away our really
 		 * free pages allocating pv entries.
 		 */
 		if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
 		    vm_page_count_severe()) ||
 		    ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
 		    tmpidx >= threshold)) {
 			psize = tmpidx;
 			break;
 		}
 		if (p->valid == VM_PAGE_BITS_ALL) {
 			if (p_start == NULL) {
 				start = addr + ptoa(tmpidx);
 				p_start = p;
 			}
 			/* Jump ahead if a superpage mapping is possible. */
 			if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
 			    (pagesizes[p->psind] - 1)) == 0) {
 				mask = atop(pagesizes[p->psind]) - 1;
 				if (tmpidx + mask < psize &&
 				    vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
 					p += mask;
 					threshold += mask;
 				}
 			}
 		} else if (p_start != NULL) {
 			pmap_enter_object(map->pmap, start, addr +
 			    ptoa(tmpidx), p_start, prot);
 			p_start = NULL;
 		}
 	}
 	if (p_start != NULL)
 		pmap_enter_object(map->pmap, start, addr + ptoa(psize),
 		    p_start, prot);
 	VM_OBJECT_RUNLOCK(object);
 }
 
 /*
  *	vm_map_protect:
  *
  *	Sets the protection of the specified address
  *	region in the target map.  If "set_max" is
  *	specified, the maximum protection is to be set;
  *	otherwise, only the current protection is affected.
  */
 int
 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
 	       vm_prot_t new_prot, boolean_t set_max)
 {
 	vm_map_entry_t current, entry;
 	vm_object_t obj;
 	struct ucred *cred;
 	vm_prot_t old_prot;
 
 	if (start == end)
 		return (KERN_SUCCESS);
 
 	vm_map_lock(map);
 
 	/*
 	 * Ensure that we are not concurrently wiring pages.  vm_map_wire() may
 	 * need to fault pages into the map and will drop the map lock while
 	 * doing so, and the VM object may end up in an inconsistent state if we
 	 * update the protection on the map entry in between faults.
 	 */
 	vm_map_wait_busy(map);
 
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	if (vm_map_lookup_entry(map, start, &entry)) {
 		vm_map_clip_start(map, entry, start);
 	} else {
 		entry = entry->next;
 	}
 
 	/*
 	 * Make a first pass to check for protection violations.
 	 */
 	for (current = entry; current->start < end; current = current->next) {
 		if ((current->eflags & MAP_ENTRY_GUARD) != 0)
 			continue;
 		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
 			vm_map_unlock(map);
 			return (KERN_INVALID_ARGUMENT);
 		}
 		if ((new_prot & current->max_protection) != new_prot) {
 			vm_map_unlock(map);
 			return (KERN_PROTECTION_FAILURE);
 		}
 	}
 
 	/*
 	 * Do an accounting pass for private read-only mappings that
 	 * now will do cow due to allowed write (e.g. debugger sets
 	 * breakpoint on text segment)
 	 */
 	for (current = entry; current->start < end; current = current->next) {
 
 		vm_map_clip_end(map, current, end);
 
 		if (set_max ||
 		    ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 ||
 		    ENTRY_CHARGED(current) ||
 		    (current->eflags & MAP_ENTRY_GUARD) != 0) {
 			continue;
 		}
 
 		cred = curthread->td_ucred;
 		obj = current->object.vm_object;
 
 		if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) {
 			if (!swap_reserve(current->end - current->start)) {
 				vm_map_unlock(map);
 				return (KERN_RESOURCE_SHORTAGE);
 			}
 			crhold(cred);
 			current->cred = cred;
 			continue;
 		}
 
 		VM_OBJECT_WLOCK(obj);
 		if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
 			VM_OBJECT_WUNLOCK(obj);
 			continue;
 		}
 
 		/*
 		 * Charge for the whole object allocation now, since
 		 * we cannot distinguish between non-charged and
 		 * charged clipped mapping of the same object later.
 		 */
 		KASSERT(obj->charge == 0,
 		    ("vm_map_protect: object %p overcharged (entry %p)",
 		    obj, current));
 		if (!swap_reserve(ptoa(obj->size))) {
 			VM_OBJECT_WUNLOCK(obj);
 			vm_map_unlock(map);
 			return (KERN_RESOURCE_SHORTAGE);
 		}
 
 		crhold(cred);
 		obj->cred = cred;
 		obj->charge = ptoa(obj->size);
 		VM_OBJECT_WUNLOCK(obj);
 	}
 
 	/*
 	 * Go back and fix up protections. [Note that clipping is not
 	 * necessary the second time.]
 	 */
 	for (current = entry; current->start < end; current = current->next) {
 		if ((current->eflags & MAP_ENTRY_GUARD) != 0)
 			continue;
 
 		old_prot = current->protection;
 
 		if (set_max)
 			current->protection =
 			    (current->max_protection = new_prot) &
 			    old_prot;
 		else
 			current->protection = new_prot;
 
 		/*
 		 * For user wired map entries, the normal lazy evaluation of
 		 * write access upgrades through soft page faults is
 		 * undesirable.  Instead, immediately copy any pages that are
 		 * copy-on-write and enable write access in the physical map.
 		 */
 		if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
 		    (current->protection & VM_PROT_WRITE) != 0 &&
 		    (old_prot & VM_PROT_WRITE) == 0)
 			vm_fault_copy_entry(map, map, current, current, NULL);
 
 		/*
 		 * When restricting access, update the physical map.  Worry
 		 * about copy-on-write here.
 		 */
 		if ((old_prot & ~current->protection) != 0) {
 #define MASK(entry)	(((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
 							VM_PROT_ALL)
 			pmap_protect(map->pmap, current->start,
 			    current->end,
 			    current->protection & MASK(current));
 #undef	MASK
 		}
 		vm_map_simplify_entry(map, current);
 	}
 	vm_map_unlock(map);
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_madvise:
  *
  *	This routine traverses a processes map handling the madvise
  *	system call.  Advisories are classified as either those effecting
  *	the vm_map_entry structure, or those effecting the underlying
  *	objects.
  */
 int
 vm_map_madvise(
 	vm_map_t map,
 	vm_offset_t start,
 	vm_offset_t end,
 	int behav)
 {
 	vm_map_entry_t current, entry;
 	bool modify_map;
 
 	/*
 	 * Some madvise calls directly modify the vm_map_entry, in which case
 	 * we need to use an exclusive lock on the map and we need to perform
 	 * various clipping operations.  Otherwise we only need a read-lock
 	 * on the map.
 	 */
 	switch(behav) {
 	case MADV_NORMAL:
 	case MADV_SEQUENTIAL:
 	case MADV_RANDOM:
 	case MADV_NOSYNC:
 	case MADV_AUTOSYNC:
 	case MADV_NOCORE:
 	case MADV_CORE:
 		if (start == end)
 			return (0);
 		modify_map = true;
 		vm_map_lock(map);
 		break;
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_FREE:
 		if (start == end)
 			return (0);
 		modify_map = false;
 		vm_map_lock_read(map);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	/*
 	 * Locate starting entry and clip if necessary.
 	 */
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	if (vm_map_lookup_entry(map, start, &entry)) {
 		if (modify_map)
 			vm_map_clip_start(map, entry, start);
 	} else {
 		entry = entry->next;
 	}
 
 	if (modify_map) {
 		/*
 		 * madvise behaviors that are implemented in the vm_map_entry.
 		 *
 		 * We clip the vm_map_entry so that behavioral changes are
 		 * limited to the specified address range.
 		 */
 		for (current = entry; current->start < end;
 		    current = current->next) {
 			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
 				continue;
 
 			vm_map_clip_end(map, current, end);
 
 			switch (behav) {
 			case MADV_NORMAL:
 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
 				break;
 			case MADV_SEQUENTIAL:
 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
 				break;
 			case MADV_RANDOM:
 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
 				break;
 			case MADV_NOSYNC:
 				current->eflags |= MAP_ENTRY_NOSYNC;
 				break;
 			case MADV_AUTOSYNC:
 				current->eflags &= ~MAP_ENTRY_NOSYNC;
 				break;
 			case MADV_NOCORE:
 				current->eflags |= MAP_ENTRY_NOCOREDUMP;
 				break;
 			case MADV_CORE:
 				current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
 				break;
 			default:
 				break;
 			}
 			vm_map_simplify_entry(map, current);
 		}
 		vm_map_unlock(map);
 	} else {
 		vm_pindex_t pstart, pend;
 
 		/*
 		 * madvise behaviors that are implemented in the underlying
 		 * vm_object.
 		 *
 		 * Since we don't clip the vm_map_entry, we have to clip
 		 * the vm_object pindex and count.
 		 */
 		for (current = entry; current->start < end;
 		    current = current->next) {
 			vm_offset_t useEnd, useStart;
 
 			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
 				continue;
 
 			pstart = OFF_TO_IDX(current->offset);
 			pend = pstart + atop(current->end - current->start);
 			useStart = current->start;
 			useEnd = current->end;
 
 			if (current->start < start) {
 				pstart += atop(start - current->start);
 				useStart = start;
 			}
 			if (current->end > end) {
 				pend -= atop(current->end - end);
 				useEnd = end;
 			}
 
 			if (pstart >= pend)
 				continue;
 
 			/*
 			 * Perform the pmap_advise() before clearing
 			 * PGA_REFERENCED in vm_page_advise().  Otherwise, a
 			 * concurrent pmap operation, such as pmap_remove(),
 			 * could clear a reference in the pmap and set
 			 * PGA_REFERENCED on the page before the pmap_advise()
 			 * had completed.  Consequently, the page would appear
 			 * referenced based upon an old reference that
 			 * occurred before this pmap_advise() ran.
 			 */
 			if (behav == MADV_DONTNEED || behav == MADV_FREE)
 				pmap_advise(map->pmap, useStart, useEnd,
 				    behav);
 
 			vm_object_madvise(current->object.vm_object, pstart,
 			    pend, behav);
 
 			/*
 			 * Pre-populate paging structures in the
 			 * WILLNEED case.  For wired entries, the
 			 * paging structures are already populated.
 			 */
 			if (behav == MADV_WILLNEED &&
 			    current->wired_count == 0) {
 				vm_map_pmap_enter(map,
 				    useStart,
 				    current->protection,
 				    current->object.vm_object,
 				    pstart,
 				    ptoa(pend - pstart),
 				    MAP_PREFAULT_MADVISE
 				);
 			}
 		}
 		vm_map_unlock_read(map);
 	}
 	return (0);
 }
 
 
 /*
  *	vm_map_inherit:
  *
  *	Sets the inheritance of the specified address
  *	range in the target map.  Inheritance
  *	affects how the map will be shared with
  *	child maps at the time of vmspace_fork.
  */
 int
 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
 	       vm_inherit_t new_inheritance)
 {
 	vm_map_entry_t entry;
 	vm_map_entry_t temp_entry;
 
 	switch (new_inheritance) {
 	case VM_INHERIT_NONE:
 	case VM_INHERIT_COPY:
 	case VM_INHERIT_SHARE:
 	case VM_INHERIT_ZERO:
 		break;
 	default:
 		return (KERN_INVALID_ARGUMENT);
 	}
 	if (start == end)
 		return (KERN_SUCCESS);
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
 		entry = temp_entry;
 		vm_map_clip_start(map, entry, start);
 	} else
 		entry = temp_entry->next;
 	while (entry->start < end) {
 		vm_map_clip_end(map, entry, end);
 		if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
 		    new_inheritance != VM_INHERIT_ZERO)
 			entry->inheritance = new_inheritance;
 		vm_map_simplify_entry(map, entry);
 		entry = entry->next;
 	}
 	vm_map_unlock(map);
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_unwire:
  *
  *	Implements both kernel and user unwiring.
  */
 int
 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags)
 {
 	vm_map_entry_t entry, first_entry, tmp_entry;
 	vm_offset_t saved_start;
 	unsigned int last_timestamp;
 	int rv;
 	boolean_t need_wakeup, result, user_unwire;
 
 	if (start == end)
 		return (KERN_SUCCESS);
 	user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
 		if (flags & VM_MAP_WIRE_HOLESOK)
 			first_entry = first_entry->next;
 		else {
 			vm_map_unlock(map);
 			return (KERN_INVALID_ADDRESS);
 		}
 	}
 	last_timestamp = map->timestamp;
 	entry = first_entry;
 	while (entry->start < end) {
 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 			/*
 			 * We have not yet clipped the entry.
 			 */
 			saved_start = (start >= entry->start) ? start :
 			    entry->start;
 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 			if (vm_map_unlock_and_wait(map, 0)) {
 				/*
 				 * Allow interruption of user unwiring?
 				 */
 			}
 			vm_map_lock(map);
 			if (last_timestamp+1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.
 				 * Specifically, the entry may have been
 				 * clipped, merged, or deleted.
 				 */
 				if (!vm_map_lookup_entry(map, saved_start,
 				    &tmp_entry)) {
 					if (flags & VM_MAP_WIRE_HOLESOK)
 						tmp_entry = tmp_entry->next;
 					else {
 						if (saved_start == start) {
 							/*
 							 * First_entry has been deleted.
 							 */
 							vm_map_unlock(map);
 							return (KERN_INVALID_ADDRESS);
 						}
 						end = saved_start;
 						rv = KERN_INVALID_ADDRESS;
 						goto done;
 					}
 				}
 				if (entry == first_entry)
 					first_entry = tmp_entry;
 				else
 					first_entry = NULL;
 				entry = tmp_entry;
 			}
 			last_timestamp = map->timestamp;
 			continue;
 		}
 		vm_map_clip_start(map, entry, start);
 		vm_map_clip_end(map, entry, end);
 		/*
 		 * Mark the entry in case the map lock is released.  (See
 		 * above.)
 		 */
 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
 		    entry->wiring_thread == NULL,
 		    ("owned map entry %p", entry));
 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
 		entry->wiring_thread = curthread;
 		/*
 		 * Check the map for holes in the specified region.
 		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
 		 */
 		if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
 		    (entry->end < end && entry->next->start > entry->end)) {
 			end = entry->end;
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
 		}
 		/*
 		 * If system unwiring, require that the entry is system wired.
 		 */
 		if (!user_unwire &&
 		    vm_map_entry_system_wired_count(entry) == 0) {
 			end = entry->end;
 			rv = KERN_INVALID_ARGUMENT;
 			goto done;
 		}
 		entry = entry->next;
 	}
 	rv = KERN_SUCCESS;
 done:
 	need_wakeup = FALSE;
 	if (first_entry == NULL) {
 		result = vm_map_lookup_entry(map, start, &first_entry);
 		if (!result && (flags & VM_MAP_WIRE_HOLESOK))
 			first_entry = first_entry->next;
 		else
 			KASSERT(result, ("vm_map_unwire: lookup failed"));
 	}
 	for (entry = first_entry; entry->start < end; entry = entry->next) {
 		/*
 		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
 		 * space in the unwired region could have been mapped
 		 * while the map lock was dropped for draining
 		 * MAP_ENTRY_IN_TRANSITION.  Moreover, another thread
 		 * could be simultaneously wiring this new mapping
 		 * entry.  Detect these cases and skip any entries
 		 * marked as in transition by us.
 		 */
 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
 		    entry->wiring_thread != curthread) {
 			KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
 			    ("vm_map_unwire: !HOLESOK and new/changed entry"));
 			continue;
 		}
 
 		if (rv == KERN_SUCCESS && (!user_unwire ||
 		    (entry->eflags & MAP_ENTRY_USER_WIRED))) {
 			if (user_unwire)
 				entry->eflags &= ~MAP_ENTRY_USER_WIRED;
 			if (entry->wired_count == 1)
 				vm_map_entry_unwire(map, entry);
 			else
 				entry->wired_count--;
 		}
 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
 		    ("vm_map_unwire: in-transition flag missing %p", entry));
 		KASSERT(entry->wiring_thread == curthread,
 		    ("vm_map_unwire: alien wire %p", entry));
 		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
 		entry->wiring_thread = NULL;
 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
 			need_wakeup = TRUE;
 		}
 		vm_map_simplify_entry(map, entry);
 	}
 	vm_map_unlock(map);
 	if (need_wakeup)
 		vm_map_wakeup(map);
 	return (rv);
 }
 
 /*
  *	vm_map_wire_entry_failure:
  *
  *	Handle a wiring failure on the given entry.
  *
  *	The map should be locked.
  */
 static void
 vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
     vm_offset_t failed_addr)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
 	    entry->wired_count == 1,
 	    ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
 	KASSERT(failed_addr < entry->end,
 	    ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
 
 	/*
 	 * If any pages at the start of this entry were successfully wired,
 	 * then unwire them.
 	 */
 	if (failed_addr > entry->start) {
 		pmap_unwire(map->pmap, entry->start, failed_addr);
 		vm_object_unwire(entry->object.vm_object, entry->offset,
 		    failed_addr - entry->start, PQ_ACTIVE);
 	}
 
 	/*
 	 * Assign an out-of-range value to represent the failure to wire this
 	 * entry.
 	 */
 	entry->wired_count = -1;
 }
 
 /*
  *	vm_map_wire:
  *
  *	Implements both kernel and user wiring.
  */
 int
 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags)
 {
 	vm_map_entry_t entry, first_entry, tmp_entry;
 	vm_offset_t faddr, saved_end, saved_start;
 	unsigned int last_timestamp;
 	int rv;
 	boolean_t need_wakeup, result, user_wire;
 	vm_prot_t prot;
 
 	if (start == end)
 		return (KERN_SUCCESS);
 	prot = 0;
 	if (flags & VM_MAP_WIRE_WRITE)
 		prot |= VM_PROT_WRITE;
 	user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
 		if (flags & VM_MAP_WIRE_HOLESOK)
 			first_entry = first_entry->next;
 		else {
 			vm_map_unlock(map);
 			return (KERN_INVALID_ADDRESS);
 		}
 	}
 	last_timestamp = map->timestamp;
 	entry = first_entry;
 	while (entry->start < end) {
 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 			/*
 			 * We have not yet clipped the entry.
 			 */
 			saved_start = (start >= entry->start) ? start :
 			    entry->start;
 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 			if (vm_map_unlock_and_wait(map, 0)) {
 				/*
 				 * Allow interruption of user wiring?
 				 */
 			}
 			vm_map_lock(map);
 			if (last_timestamp + 1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.
 				 * Specifically, the entry may have been
 				 * clipped, merged, or deleted.
 				 */
 				if (!vm_map_lookup_entry(map, saved_start,
 				    &tmp_entry)) {
 					if (flags & VM_MAP_WIRE_HOLESOK)
 						tmp_entry = tmp_entry->next;
 					else {
 						if (saved_start == start) {
 							/*
 							 * first_entry has been deleted.
 							 */
 							vm_map_unlock(map);
 							return (KERN_INVALID_ADDRESS);
 						}
 						end = saved_start;
 						rv = KERN_INVALID_ADDRESS;
 						goto done;
 					}
 				}
 				if (entry == first_entry)
 					first_entry = tmp_entry;
 				else
 					first_entry = NULL;
 				entry = tmp_entry;
 			}
 			last_timestamp = map->timestamp;
 			continue;
 		}
 		vm_map_clip_start(map, entry, start);
 		vm_map_clip_end(map, entry, end);
 		/*
 		 * Mark the entry in case the map lock is released.  (See
 		 * above.)
 		 */
 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
 		    entry->wiring_thread == NULL,
 		    ("owned map entry %p", entry));
 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
 		entry->wiring_thread = curthread;
 		if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
 		    || (entry->protection & prot) != prot) {
 			entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
 			if ((flags & VM_MAP_WIRE_HOLESOK) == 0) {
 				end = entry->end;
 				rv = KERN_INVALID_ADDRESS;
 				goto done;
 			}
 			goto next_entry;
 		}
 		if (entry->wired_count == 0) {
 			entry->wired_count++;
 			saved_start = entry->start;
 			saved_end = entry->end;
 
 			/*
 			 * Release the map lock, relying on the in-transition
 			 * mark.  Mark the map busy for fork.
 			 */
 			vm_map_busy(map);
 			vm_map_unlock(map);
 
 			faddr = saved_start;
 			do {
 				/*
 				 * Simulate a fault to get the page and enter
 				 * it into the physical map.
 				 */
 				if ((rv = vm_fault(map, faddr, VM_PROT_NONE,
 				    VM_FAULT_WIRE)) != KERN_SUCCESS)
 					break;
 			} while ((faddr += PAGE_SIZE) < saved_end);
 			vm_map_lock(map);
 			vm_map_unbusy(map);
 			if (last_timestamp + 1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.  The entry
 				 * may have been clipped, but NOT merged or
 				 * deleted.
 				 */
 				result = vm_map_lookup_entry(map, saved_start,
 				    &tmp_entry);
 				KASSERT(result, ("vm_map_wire: lookup failed"));
 				if (entry == first_entry)
 					first_entry = tmp_entry;
 				else
 					first_entry = NULL;
 				entry = tmp_entry;
 				while (entry->end < saved_end) {
 					/*
 					 * In case of failure, handle entries
 					 * that were not fully wired here;
 					 * fully wired entries are handled
 					 * later.
 					 */
 					if (rv != KERN_SUCCESS &&
 					    faddr < entry->end)
 						vm_map_wire_entry_failure(map,
 						    entry, faddr);
 					entry = entry->next;
 				}
 			}
 			last_timestamp = map->timestamp;
 			if (rv != KERN_SUCCESS) {
 				vm_map_wire_entry_failure(map, entry, faddr);
 				end = entry->end;
 				goto done;
 			}
 		} else if (!user_wire ||
 			   (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
 			entry->wired_count++;
 		}
 		/*
 		 * Check the map for holes in the specified region.
 		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
 		 */
 	next_entry:
 		if ((flags & VM_MAP_WIRE_HOLESOK) == 0 &&
 		    entry->end < end && entry->next->start > entry->end) {
 			end = entry->end;
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
 		}
 		entry = entry->next;
 	}
 	rv = KERN_SUCCESS;
 done:
 	need_wakeup = FALSE;
 	if (first_entry == NULL) {
 		result = vm_map_lookup_entry(map, start, &first_entry);
 		if (!result && (flags & VM_MAP_WIRE_HOLESOK))
 			first_entry = first_entry->next;
 		else
 			KASSERT(result, ("vm_map_wire: lookup failed"));
 	}
 	for (entry = first_entry; entry->start < end; entry = entry->next) {
 		/*
 		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
 		 * space in the unwired region could have been mapped
 		 * while the map lock was dropped for faulting in the
 		 * pages or draining MAP_ENTRY_IN_TRANSITION.
 		 * Moreover, another thread could be simultaneously
 		 * wiring this new mapping entry.  Detect these cases
 		 * and skip any entries marked as in transition not by us.
 		 */
 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
 		    entry->wiring_thread != curthread) {
 			KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
 			    ("vm_map_wire: !HOLESOK and new/changed entry"));
 			continue;
 		}
 
 		if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0)
 			goto next_entry_done;
 
 		if (rv == KERN_SUCCESS) {
 			if (user_wire)
 				entry->eflags |= MAP_ENTRY_USER_WIRED;
 		} else if (entry->wired_count == -1) {
 			/*
 			 * Wiring failed on this entry.  Thus, unwiring is
 			 * unnecessary.
 			 */
 			entry->wired_count = 0;
 		} else if (!user_wire ||
 		    (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
 			/*
 			 * Undo the wiring.  Wiring succeeded on this entry
 			 * but failed on a later entry.  
 			 */
 			if (entry->wired_count == 1)
 				vm_map_entry_unwire(map, entry);
 			else
 				entry->wired_count--;
 		}
 	next_entry_done:
 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
 		    ("vm_map_wire: in-transition flag missing %p", entry));
 		KASSERT(entry->wiring_thread == curthread,
 		    ("vm_map_wire: alien wire %p", entry));
 		entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
 		    MAP_ENTRY_WIRE_SKIPPED);
 		entry->wiring_thread = NULL;
 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
 			need_wakeup = TRUE;
 		}
 		vm_map_simplify_entry(map, entry);
 	}
 	vm_map_unlock(map);
 	if (need_wakeup)
 		vm_map_wakeup(map);
 	return (rv);
 }
 
 /*
  * vm_map_sync
  *
  * Push any dirty cached pages in the address range to their pager.
  * If syncio is TRUE, dirty pages are written synchronously.
  * If invalidate is TRUE, any cached pages are freed as well.
  *
  * If the size of the region from start to end is zero, we are
  * supposed to flush all modified pages within the region containing
  * start.  Unfortunately, a region can be split or coalesced with
  * neighboring regions, making it difficult to determine what the
  * original region was.  Therefore, we approximate this requirement by
  * flushing the current region containing start.
  *
  * Returns an error if any part of the specified range is not mapped.
  */
 int
 vm_map_sync(
 	vm_map_t map,
 	vm_offset_t start,
 	vm_offset_t end,
 	boolean_t syncio,
 	boolean_t invalidate)
 {
 	vm_map_entry_t current;
 	vm_map_entry_t entry;
 	vm_size_t size;
 	vm_object_t object;
 	vm_ooffset_t offset;
 	unsigned int last_timestamp;
 	boolean_t failed;
 
 	vm_map_lock_read(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (!vm_map_lookup_entry(map, start, &entry)) {
 		vm_map_unlock_read(map);
 		return (KERN_INVALID_ADDRESS);
 	} else if (start == end) {
 		start = entry->start;
 		end = entry->end;
 	}
 	/*
 	 * Make a first pass to check for user-wired memory and holes.
 	 */
 	for (current = entry; current->start < end; current = current->next) {
 		if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
 			vm_map_unlock_read(map);
 			return (KERN_INVALID_ARGUMENT);
 		}
 		if (end > current->end &&
 		    current->end != current->next->start) {
 			vm_map_unlock_read(map);
 			return (KERN_INVALID_ADDRESS);
 		}
 	}
 
 	if (invalidate)
 		pmap_remove(map->pmap, start, end);
 	failed = FALSE;
 
 	/*
 	 * Make a second pass, cleaning/uncaching pages from the indicated
 	 * objects as we go.
 	 */
 	for (current = entry; current->start < end;) {
 		offset = current->offset + (start - current->start);
 		size = (end <= current->end ? end : current->end) - start;
 		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
 			vm_map_t smap;
 			vm_map_entry_t tentry;
 			vm_size_t tsize;
 
 			smap = current->object.sub_map;
 			vm_map_lock_read(smap);
 			(void) vm_map_lookup_entry(smap, offset, &tentry);
 			tsize = tentry->end - offset;
 			if (tsize < size)
 				size = tsize;
 			object = tentry->object.vm_object;
 			offset = tentry->offset + (offset - tentry->start);
 			vm_map_unlock_read(smap);
 		} else {
 			object = current->object.vm_object;
 		}
 		vm_object_reference(object);
 		last_timestamp = map->timestamp;
 		vm_map_unlock_read(map);
 		if (!vm_object_sync(object, offset, size, syncio, invalidate))
 			failed = TRUE;
 		start += size;
 		vm_object_deallocate(object);
 		vm_map_lock_read(map);
 		if (last_timestamp == map->timestamp ||
 		    !vm_map_lookup_entry(map, start, &current))
 			current = current->next;
 	}
 
 	vm_map_unlock_read(map);
 	return (failed ? KERN_FAILURE : KERN_SUCCESS);
 }
 
 /*
  *	vm_map_entry_unwire:	[ internal use only ]
  *
  *	Make the region specified by this entry pageable.
  *
  *	The map in question should be locked.
  *	[This is the reason for this routine's existence.]
  */
 static void
 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(entry->wired_count > 0,
 	    ("vm_map_entry_unwire: entry %p isn't wired", entry));
 	pmap_unwire(map->pmap, entry->start, entry->end);
 	vm_object_unwire(entry->object.vm_object, entry->offset, entry->end -
 	    entry->start, PQ_ACTIVE);
 	entry->wired_count = 0;
 }
 
 static void
 vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
 {
 
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
 		vm_object_deallocate(entry->object.vm_object);
 	uma_zfree(system_map ? kmapentzone : mapentzone, entry);
 }
 
 /*
  *	vm_map_entry_delete:	[ internal use only ]
  *
  *	Deallocate the given entry from the target map.
  */
 static void
 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_object_t object;
 	vm_pindex_t offidxstart, offidxend, count, size1;
 	vm_size_t size;
 
-	vm_map_entry_unlink(map, entry);
+	vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE);
 	object = entry->object.vm_object;
 
 	if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
 		MPASS(entry->cred == NULL);
 		MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
 		MPASS(object == NULL);
 		vm_map_entry_deallocate(entry, map->system_map);
 		return;
 	}
 
 	size = entry->end - entry->start;
 	map->size -= size;
 
 	if (entry->cred != NULL) {
 		swap_release_by_cred(size, entry->cred);
 		crfree(entry->cred);
 	}
 
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
 	    (object != NULL)) {
 		KASSERT(entry->cred == NULL || object->cred == NULL ||
 		    (entry->eflags & MAP_ENTRY_NEEDS_COPY),
 		    ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
 		count = atop(size);
 		offidxstart = OFF_TO_IDX(entry->offset);
 		offidxend = offidxstart + count;
 		VM_OBJECT_WLOCK(object);
 		if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT |
 		    OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
 		    object == kernel_object)) {
 			vm_object_collapse(object);
 
 			/*
 			 * The option OBJPR_NOTMAPPED can be passed here
 			 * because vm_map_delete() already performed
 			 * pmap_remove() on the only mapping to this range
 			 * of pages. 
 			 */
 			vm_object_page_remove(object, offidxstart, offidxend,
 			    OBJPR_NOTMAPPED);
 			if (object->type == OBJT_SWAP)
 				swap_pager_freespace(object, offidxstart,
 				    count);
 			if (offidxend >= object->size &&
 			    offidxstart < object->size) {
 				size1 = object->size;
 				object->size = offidxstart;
 				if (object->cred != NULL) {
 					size1 -= object->size;
 					KASSERT(object->charge >= ptoa(size1),
 					    ("object %p charge < 0", object));
 					swap_release_by_cred(ptoa(size1),
 					    object->cred);
 					object->charge -= ptoa(size1);
 				}
 			}
 		}
 		VM_OBJECT_WUNLOCK(object);
 	} else
 		entry->object.vm_object = NULL;
 	if (map->system_map)
 		vm_map_entry_deallocate(entry, TRUE);
 	else {
 		entry->next = curthread->td_map_def_user;
 		curthread->td_map_def_user = entry;
 	}
 }
 
 /*
  *	vm_map_delete:	[ internal use only ]
  *
  *	Deallocates the given address range from the target
  *	map.
  */
 int
 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
 {
 	vm_map_entry_t entry;
 	vm_map_entry_t first_entry;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	if (start == end)
 		return (KERN_SUCCESS);
 
 	/*
 	 * Find the start of the region, and clip it
 	 */
 	if (!vm_map_lookup_entry(map, start, &first_entry))
 		entry = first_entry->next;
 	else {
 		entry = first_entry;
 		vm_map_clip_start(map, entry, start);
 	}
 
 	/*
 	 * Step through all entries in this region
 	 */
 	while (entry->start < end) {
 		vm_map_entry_t next;
 
 		/*
 		 * Wait for wiring or unwiring of an entry to complete.
 		 * Also wait for any system wirings to disappear on
 		 * user maps.
 		 */
 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
 		    (vm_map_pmap(map) != kernel_pmap &&
 		    vm_map_entry_system_wired_count(entry) != 0)) {
 			unsigned int last_timestamp;
 			vm_offset_t saved_start;
 			vm_map_entry_t tmp_entry;
 
 			saved_start = entry->start;
 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 			last_timestamp = map->timestamp;
 			(void) vm_map_unlock_and_wait(map, 0);
 			vm_map_lock(map);
 			if (last_timestamp + 1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.
 				 * Specifically, the entry may have been
 				 * clipped, merged, or deleted.
 				 */
 				if (!vm_map_lookup_entry(map, saved_start,
 							 &tmp_entry))
 					entry = tmp_entry->next;
 				else {
 					entry = tmp_entry;
 					vm_map_clip_start(map, entry,
 							  saved_start);
 				}
 			}
 			continue;
 		}
 		vm_map_clip_end(map, entry, end);
 
 		next = entry->next;
 
 		/*
 		 * Unwire before removing addresses from the pmap; otherwise,
 		 * unwiring will put the entries back in the pmap.
 		 */
 		if (entry->wired_count != 0)
 			vm_map_entry_unwire(map, entry);
 
 		/*
 		 * Remove mappings for the pages, but only if the
 		 * mappings could exist.  For instance, it does not
 		 * make sense to call pmap_remove() for guard entries.
 		 */
 		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
 		    entry->object.vm_object != NULL)
 			pmap_remove(map->pmap, entry->start, entry->end);
 
 		if (entry->end == map->anon_loc)
 			map->anon_loc = entry->start;
 
 		/*
 		 * Delete the entry only after removing all pmap
 		 * entries pointing to its pages.  (Otherwise, its
 		 * page frames may be reallocated, and any modify bits
 		 * will be set in the wrong object!)
 		 */
 		vm_map_entry_delete(map, entry);
 		entry = next;
 	}
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_remove:
  *
  *	Remove the given address range from the target map.
  *	This is the exported form of vm_map_delete.
  */
 int
 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
 {
 	int result;
 
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	result = vm_map_delete(map, start, end);
 	vm_map_unlock(map);
 	return (result);
 }
 
 /*
  *	vm_map_check_protection:
  *
  *	Assert that the target map allows the specified privilege on the
  *	entire address region given.  The entire region must be allocated.
  *
  *	WARNING!  This code does not and should not check whether the
  *	contents of the region is accessible.  For example a smaller file
  *	might be mapped into a larger address space.
  *
  *	NOTE!  This code is also called by munmap().
  *
  *	The map must be locked.  A read lock is sufficient.
  */
 boolean_t
 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
 			vm_prot_t protection)
 {
 	vm_map_entry_t entry;
 	vm_map_entry_t tmp_entry;
 
 	if (!vm_map_lookup_entry(map, start, &tmp_entry))
 		return (FALSE);
 	entry = tmp_entry;
 
 	while (start < end) {
 		/*
 		 * No holes allowed!
 		 */
 		if (start < entry->start)
 			return (FALSE);
 		/*
 		 * Check protection associated with entry.
 		 */
 		if ((entry->protection & protection) != protection)
 			return (FALSE);
 		/* go to next entry */
 		start = entry->end;
 		entry = entry->next;
 	}
 	return (TRUE);
 }
 
 /*
  *	vm_map_copy_entry:
  *
  *	Copies the contents of the source entry to the destination
  *	entry.  The entries *must* be aligned properly.
  */
 static void
 vm_map_copy_entry(
 	vm_map_t src_map,
 	vm_map_t dst_map,
 	vm_map_entry_t src_entry,
 	vm_map_entry_t dst_entry,
 	vm_ooffset_t *fork_charge)
 {
 	vm_object_t src_object;
 	vm_map_entry_t fake_entry;
 	vm_offset_t size;
 	struct ucred *cred;
 	int charged;
 
 	VM_MAP_ASSERT_LOCKED(dst_map);
 
 	if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
 		return;
 
 	if (src_entry->wired_count == 0 ||
 	    (src_entry->protection & VM_PROT_WRITE) == 0) {
 		/*
 		 * If the source entry is marked needs_copy, it is already
 		 * write-protected.
 		 */
 		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
 		    (src_entry->protection & VM_PROT_WRITE) != 0) {
 			pmap_protect(src_map->pmap,
 			    src_entry->start,
 			    src_entry->end,
 			    src_entry->protection & ~VM_PROT_WRITE);
 		}
 
 		/*
 		 * Make a copy of the object.
 		 */
 		size = src_entry->end - src_entry->start;
 		if ((src_object = src_entry->object.vm_object) != NULL) {
 			VM_OBJECT_WLOCK(src_object);
 			charged = ENTRY_CHARGED(src_entry);
 			if (src_object->handle == NULL &&
 			    (src_object->type == OBJT_DEFAULT ||
 			    src_object->type == OBJT_SWAP)) {
 				vm_object_collapse(src_object);
 				if ((src_object->flags & (OBJ_NOSPLIT |
 				    OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
 					vm_object_split(src_entry);
 					src_object =
 					    src_entry->object.vm_object;
 				}
 			}
 			vm_object_reference_locked(src_object);
 			vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
 			if (src_entry->cred != NULL &&
 			    !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
 				KASSERT(src_object->cred == NULL,
 				    ("OVERCOMMIT: vm_map_copy_entry: cred %p",
 				     src_object));
 				src_object->cred = src_entry->cred;
 				src_object->charge = size;
 			}
 			VM_OBJECT_WUNLOCK(src_object);
 			dst_entry->object.vm_object = src_object;
 			if (charged) {
 				cred = curthread->td_ucred;
 				crhold(cred);
 				dst_entry->cred = cred;
 				*fork_charge += size;
 				if (!(src_entry->eflags &
 				      MAP_ENTRY_NEEDS_COPY)) {
 					crhold(cred);
 					src_entry->cred = cred;
 					*fork_charge += size;
 				}
 			}
 			src_entry->eflags |= MAP_ENTRY_COW |
 			    MAP_ENTRY_NEEDS_COPY;
 			dst_entry->eflags |= MAP_ENTRY_COW |
 			    MAP_ENTRY_NEEDS_COPY;
 			dst_entry->offset = src_entry->offset;
 			if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
 				/*
 				 * MAP_ENTRY_VN_WRITECNT cannot
 				 * indicate write reference from
 				 * src_entry, since the entry is
 				 * marked as needs copy.  Allocate a
 				 * fake entry that is used to
 				 * decrement object->un_pager.vnp.writecount
 				 * at the appropriate time.  Attach
 				 * fake_entry to the deferred list.
 				 */
 				fake_entry = vm_map_entry_create(dst_map);
 				fake_entry->eflags = MAP_ENTRY_VN_WRITECNT;
 				src_entry->eflags &= ~MAP_ENTRY_VN_WRITECNT;
 				vm_object_reference(src_object);
 				fake_entry->object.vm_object = src_object;
 				fake_entry->start = src_entry->start;
 				fake_entry->end = src_entry->end;
 				fake_entry->next = curthread->td_map_def_user;
 				curthread->td_map_def_user = fake_entry;
 			}
 
 			pmap_copy(dst_map->pmap, src_map->pmap,
 			    dst_entry->start, dst_entry->end - dst_entry->start,
 			    src_entry->start);
 		} else {
 			dst_entry->object.vm_object = NULL;
 			dst_entry->offset = 0;
 			if (src_entry->cred != NULL) {
 				dst_entry->cred = curthread->td_ucred;
 				crhold(dst_entry->cred);
 				*fork_charge += size;
 			}
 		}
 	} else {
 		/*
 		 * We don't want to make writeable wired pages copy-on-write.
 		 * Immediately copy these pages into the new map by simulating
 		 * page faults.  The new pages are pageable.
 		 */
 		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
 		    fork_charge);
 	}
 }
 
 /*
  * vmspace_map_entry_forked:
  * Update the newly-forked vmspace each time a map entry is inherited
  * or copied.  The values for vm_dsize and vm_tsize are approximate
  * (and mostly-obsolete ideas in the face of mmap(2) et al.)
  */
 static void
 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
     vm_map_entry_t entry)
 {
 	vm_size_t entrysize;
 	vm_offset_t newend;
 
 	if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
 		return;
 	entrysize = entry->end - entry->start;
 	vm2->vm_map.size += entrysize;
 	if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
 		vm2->vm_ssize += btoc(entrysize);
 	} else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
 	    entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
 		newend = MIN(entry->end,
 		    (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
 		vm2->vm_dsize += btoc(newend - entry->start);
 	} else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
 	    entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
 		newend = MIN(entry->end,
 		    (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
 		vm2->vm_tsize += btoc(newend - entry->start);
 	}
 }
 
 /*
  * vmspace_fork:
  * Create a new process vmspace structure and vm_map
  * based on those of an existing process.  The new map
  * is based on the old map, according to the inheritance
  * values on the regions in that map.
  *
  * XXX It might be worth coalescing the entries added to the new vmspace.
  *
  * The source map must not be locked.
  */
 struct vmspace *
 vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
 {
 	struct vmspace *vm2;
 	vm_map_t new_map, old_map;
 	vm_map_entry_t new_entry, old_entry;
 	vm_object_t object;
 	int error, locked;
 	vm_inherit_t inh;
 
 	old_map = &vm1->vm_map;
 	/* Copy immutable fields of vm1 to vm2. */
 	vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
 	    pmap_pinit);
 	if (vm2 == NULL)
 		return (NULL);
 
 	vm2->vm_taddr = vm1->vm_taddr;
 	vm2->vm_daddr = vm1->vm_daddr;
 	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
 	vm_map_lock(old_map);
 	if (old_map->busy)
 		vm_map_wait_busy(old_map);
 	new_map = &vm2->vm_map;
 	locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
 	KASSERT(locked, ("vmspace_fork: lock failed"));
 
 	error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
 	if (error != 0) {
 		sx_xunlock(&old_map->lock);
 		sx_xunlock(&new_map->lock);
 		vm_map_process_deferred();
 		vmspace_free(vm2);
 		return (NULL);
 	}
 
 	new_map->anon_loc = old_map->anon_loc;
 
 	old_entry = old_map->header.next;
 
 	while (old_entry != &old_map->header) {
 		if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
 			panic("vm_map_fork: encountered a submap");
 
 		inh = old_entry->inheritance;
 		if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
 		    inh != VM_INHERIT_NONE)
 			inh = VM_INHERIT_COPY;
 
 		switch (inh) {
 		case VM_INHERIT_NONE:
 			break;
 
 		case VM_INHERIT_SHARE:
 			/*
 			 * Clone the entry, creating the shared object if necessary.
 			 */
 			object = old_entry->object.vm_object;
 			if (object == NULL) {
 				object = vm_object_allocate(OBJT_DEFAULT,
 					atop(old_entry->end - old_entry->start));
 				old_entry->object.vm_object = object;
 				old_entry->offset = 0;
 				if (old_entry->cred != NULL) {
 					object->cred = old_entry->cred;
 					object->charge = old_entry->end -
 					    old_entry->start;
 					old_entry->cred = NULL;
 				}
 			}
 
 			/*
 			 * Add the reference before calling vm_object_shadow
 			 * to insure that a shadow object is created.
 			 */
 			vm_object_reference(object);
 			if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
 				vm_object_shadow(&old_entry->object.vm_object,
 				    &old_entry->offset,
 				    old_entry->end - old_entry->start);
 				old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 				/* Transfer the second reference too. */
 				vm_object_reference(
 				    old_entry->object.vm_object);
 
 				/*
 				 * As in vm_map_simplify_entry(), the
 				 * vnode lock will not be acquired in
 				 * this call to vm_object_deallocate().
 				 */
 				vm_object_deallocate(object);
 				object = old_entry->object.vm_object;
 			}
 			VM_OBJECT_WLOCK(object);
 			vm_object_clear_flag(object, OBJ_ONEMAPPING);
 			if (old_entry->cred != NULL) {
 				KASSERT(object->cred == NULL, ("vmspace_fork both cred"));
 				object->cred = old_entry->cred;
 				object->charge = old_entry->end - old_entry->start;
 				old_entry->cred = NULL;
 			}
 
 			/*
 			 * Assert the correct state of the vnode
 			 * v_writecount while the object is locked, to
 			 * not relock it later for the assertion
 			 * correctness.
 			 */
 			if (old_entry->eflags & MAP_ENTRY_VN_WRITECNT &&
 			    object->type == OBJT_VNODE) {
 				KASSERT(((struct vnode *)object->handle)->
 				    v_writecount > 0,
 				    ("vmspace_fork: v_writecount %p", object));
 				KASSERT(object->un_pager.vnp.writemappings > 0,
 				    ("vmspace_fork: vnp.writecount %p",
 				    object));
 			}
 			VM_OBJECT_WUNLOCK(object);
 
 			/*
 			 * Clone the entry, referencing the shared object.
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			*new_entry = *old_entry;
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
 			    MAP_ENTRY_IN_TRANSITION);
 			new_entry->wiring_thread = NULL;
 			new_entry->wired_count = 0;
 			if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
 				vnode_pager_update_writecount(object,
 				    new_entry->start, new_entry->end);
 			}
 
 			/*
 			 * Insert the entry into the new map -- we know we're
 			 * inserting at the end of the new map.
 			 */
-			vm_map_entry_link(new_map, new_map->header.prev,
-			    new_entry);
+			vm_map_entry_link(new_map, new_entry);
 			vmspace_map_entry_forked(vm1, vm2, new_entry);
 
 			/*
 			 * Update the physical map
 			 */
 			pmap_copy(new_map->pmap, old_map->pmap,
 			    new_entry->start,
 			    (old_entry->end - old_entry->start),
 			    old_entry->start);
 			break;
 
 		case VM_INHERIT_COPY:
 			/*
 			 * Clone the entry and link into the map.
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			*new_entry = *old_entry;
 			/*
 			 * Copied entry is COW over the old object.
 			 */
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
 			    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT);
 			new_entry->wiring_thread = NULL;
 			new_entry->wired_count = 0;
 			new_entry->object.vm_object = NULL;
 			new_entry->cred = NULL;
-			vm_map_entry_link(new_map, new_map->header.prev,
-			    new_entry);
+			vm_map_entry_link(new_map, new_entry);
 			vmspace_map_entry_forked(vm1, vm2, new_entry);
 			vm_map_copy_entry(old_map, new_map, old_entry,
 			    new_entry, fork_charge);
 			break;
 
 		case VM_INHERIT_ZERO:
 			/*
 			 * Create a new anonymous mapping entry modelled from
 			 * the old one.
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			memset(new_entry, 0, sizeof(*new_entry));
 
 			new_entry->start = old_entry->start;
 			new_entry->end = old_entry->end;
 			new_entry->eflags = old_entry->eflags &
 			    ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
 			    MAP_ENTRY_VN_WRITECNT);
 			new_entry->protection = old_entry->protection;
 			new_entry->max_protection = old_entry->max_protection;
 			new_entry->inheritance = VM_INHERIT_ZERO;
 
-			vm_map_entry_link(new_map, new_map->header.prev,
-			    new_entry);
+			vm_map_entry_link(new_map, new_entry);
 			vmspace_map_entry_forked(vm1, vm2, new_entry);
 
 			new_entry->cred = curthread->td_ucred;
 			crhold(new_entry->cred);
 			*fork_charge += (new_entry->end - new_entry->start);
 
 			break;
 		}
 		old_entry = old_entry->next;
 	}
 	/*
 	 * Use inlined vm_map_unlock() to postpone handling the deferred
 	 * map entries, which cannot be done until both old_map and
 	 * new_map locks are released.
 	 */
 	sx_xunlock(&old_map->lock);
 	sx_xunlock(&new_map->lock);
 	vm_map_process_deferred();
 
 	return (vm2);
 }
 
 /*
  * Create a process's stack for exec_new_vmspace().  This function is never
  * asked to wire the newly created stack.
  */
 int
 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
     vm_prot_t prot, vm_prot_t max, int cow)
 {
 	vm_size_t growsize, init_ssize;
 	rlim_t vmemlim;
 	int rv;
 
 	MPASS((map->flags & MAP_WIREFUTURE) == 0);
 	growsize = sgrowsiz;
 	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
 	vm_map_lock(map);
 	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
 	/* If we would blow our VMEM resource limit, no go */
 	if (map->size + init_ssize > vmemlim) {
 		rv = KERN_NO_SPACE;
 		goto out;
 	}
 	rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
 	    max, cow);
 out:
 	vm_map_unlock(map);
 	return (rv);
 }
 
 static int stack_guard_page = 1;
 SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
     &stack_guard_page, 0,
     "Specifies the number of guard pages for a stack that grows");
 
 static int
 vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
     vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
 {
 	vm_map_entry_t new_entry, prev_entry;
 	vm_offset_t bot, gap_bot, gap_top, top;
 	vm_size_t init_ssize, sgp;
 	int orient, rv;
 
 	/*
 	 * The stack orientation is piggybacked with the cow argument.
 	 * Extract it into orient and mask the cow argument so that we
 	 * don't pass it around further.
 	 */
 	orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
 	KASSERT(orient != 0, ("No stack grow direction"));
 	KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
 	    ("bi-dir stack"));
 
 	if (addrbos < vm_map_min(map) ||
 	    addrbos + max_ssize > vm_map_max(map) ||
 	    addrbos + max_ssize <= addrbos)
 		return (KERN_INVALID_ADDRESS);
 	sgp = (vm_size_t)stack_guard_page * PAGE_SIZE;
 	if (sgp >= max_ssize)
 		return (KERN_INVALID_ARGUMENT);
 
 	init_ssize = growsize;
 	if (max_ssize < init_ssize + sgp)
 		init_ssize = max_ssize - sgp;
 
 	/* If addr is already mapped, no go */
 	if (vm_map_lookup_entry(map, addrbos, &prev_entry))
 		return (KERN_NO_SPACE);
 
 	/*
 	 * If we can't accommodate max_ssize in the current mapping, no go.
 	 */
 	if (prev_entry->next->start < addrbos + max_ssize)
 		return (KERN_NO_SPACE);
 
 	/*
 	 * We initially map a stack of only init_ssize.  We will grow as
 	 * needed later.  Depending on the orientation of the stack (i.e.
 	 * the grow direction) we either map at the top of the range, the
 	 * bottom of the range or in the middle.
 	 *
 	 * Note: we would normally expect prot and max to be VM_PROT_ALL,
 	 * and cow to be 0.  Possibly we should eliminate these as input
 	 * parameters, and just pass these values here in the insert call.
 	 */
 	if (orient == MAP_STACK_GROWS_DOWN) {
 		bot = addrbos + max_ssize - init_ssize;
 		top = bot + init_ssize;
 		gap_bot = addrbos;
 		gap_top = bot;
 	} else /* if (orient == MAP_STACK_GROWS_UP) */ {
 		bot = addrbos;
 		top = bot + init_ssize;
 		gap_bot = top;
 		gap_top = addrbos + max_ssize;
 	}
 	rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
 	if (rv != KERN_SUCCESS)
 		return (rv);
 	new_entry = prev_entry->next;
 	KASSERT(new_entry->end == top || new_entry->start == bot,
 	    ("Bad entry start/end for new stack entry"));
 	KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
 	    (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
 	    ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
 	KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
 	    (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
 	    ("new entry lacks MAP_ENTRY_GROWS_UP"));
 	rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
 	    VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
 	    MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
 	if (rv != KERN_SUCCESS)
 		(void)vm_map_delete(map, bot, top);
 	return (rv);
 }
 
 /*
  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if we
  * successfully grow the stack.
  */
 static int
 vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
 {
 	vm_map_entry_t stack_entry;
 	struct proc *p;
 	struct vmspace *vm;
 	struct ucred *cred;
 	vm_offset_t gap_end, gap_start, grow_start;
 	size_t grow_amount, guard, max_grow;
 	rlim_t lmemlim, stacklim, vmemlim;
 	int rv, rv1;
 	bool gap_deleted, grow_down, is_procstack;
 #ifdef notyet
 	uint64_t limit;
 #endif
 #ifdef RACCT
 	int error;
 #endif
 
 	p = curproc;
 	vm = p->p_vmspace;
 
 	/*
 	 * Disallow stack growth when the access is performed by a
 	 * debugger or AIO daemon.  The reason is that the wrong
 	 * resource limits are applied.
 	 */
 	if (map != &p->p_vmspace->vm_map || p->p_textvp == NULL)
 		return (KERN_FAILURE);
 
 	MPASS(!map->system_map);
 
 	guard = stack_guard_page * PAGE_SIZE;
 	lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
 	stacklim = lim_cur(curthread, RLIMIT_STACK);
 	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
 retry:
 	/* If addr is not in a hole for a stack grow area, no need to grow. */
 	if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
 		return (KERN_FAILURE);
 	if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
 		return (KERN_SUCCESS);
 	if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
 		stack_entry = gap_entry->next;
 		if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
 		    stack_entry->start != gap_entry->end)
 			return (KERN_FAILURE);
 		grow_amount = round_page(stack_entry->start - addr);
 		grow_down = true;
 	} else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
 		stack_entry = gap_entry->prev;
 		if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
 		    stack_entry->end != gap_entry->start)
 			return (KERN_FAILURE);
 		grow_amount = round_page(addr + 1 - stack_entry->end);
 		grow_down = false;
 	} else {
 		return (KERN_FAILURE);
 	}
 	max_grow = gap_entry->end - gap_entry->start;
 	if (guard > max_grow)
 		return (KERN_NO_SPACE);
 	max_grow -= guard;
 	if (grow_amount > max_grow)
 		return (KERN_NO_SPACE);
 
 	/*
 	 * If this is the main process stack, see if we're over the stack
 	 * limit.
 	 */
 	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
 	    addr < (vm_offset_t)p->p_sysent->sv_usrstack;
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
 		return (KERN_NO_SPACE);
 
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(p);
 		if (is_procstack && racct_set(p, RACCT_STACK,
 		    ctob(vm->vm_ssize) + grow_amount)) {
 			PROC_UNLOCK(p);
 			return (KERN_NO_SPACE);
 		}
 		PROC_UNLOCK(p);
 	}
 #endif
 
 	grow_amount = roundup(grow_amount, sgrowsiz);
 	if (grow_amount > max_grow)
 		grow_amount = max_grow;
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
 		grow_amount = trunc_page((vm_size_t)stacklim) -
 		    ctob(vm->vm_ssize);
 	}
 
 #ifdef notyet
 	PROC_LOCK(p);
 	limit = racct_get_available(p, RACCT_STACK);
 	PROC_UNLOCK(p);
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
 		grow_amount = limit - ctob(vm->vm_ssize);
 #endif
 
 	if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
 		if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
 			rv = KERN_NO_SPACE;
 			goto out;
 		}
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(p);
 			if (racct_set(p, RACCT_MEMLOCK,
 			    ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
 				PROC_UNLOCK(p);
 				rv = KERN_NO_SPACE;
 				goto out;
 			}
 			PROC_UNLOCK(p);
 		}
 #endif
 	}
 
 	/* If we would blow our VMEM resource limit, no go */
 	if (map->size + grow_amount > vmemlim) {
 		rv = KERN_NO_SPACE;
 		goto out;
 	}
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(p);
 		if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
 			PROC_UNLOCK(p);
 			rv = KERN_NO_SPACE;
 			goto out;
 		}
 		PROC_UNLOCK(p);
 	}
 #endif
 
 	if (vm_map_lock_upgrade(map)) {
 		gap_entry = NULL;
 		vm_map_lock_read(map);
 		goto retry;
 	}
 
 	if (grow_down) {
 		grow_start = gap_entry->end - grow_amount;
 		if (gap_entry->start + grow_amount == gap_entry->end) {
 			gap_start = gap_entry->start;
 			gap_end = gap_entry->end;
 			vm_map_entry_delete(map, gap_entry);
 			gap_deleted = true;
 		} else {
 			MPASS(gap_entry->start < gap_entry->end - grow_amount);
 			gap_entry->end -= grow_amount;
 			vm_map_entry_resize_free(map, gap_entry);
 			gap_deleted = false;
 		}
 		rv = vm_map_insert(map, NULL, 0, grow_start,
 		    grow_start + grow_amount,
 		    stack_entry->protection, stack_entry->max_protection,
 		    MAP_STACK_GROWS_DOWN);
 		if (rv != KERN_SUCCESS) {
 			if (gap_deleted) {
 				rv1 = vm_map_insert(map, NULL, 0, gap_start,
 				    gap_end, VM_PROT_NONE, VM_PROT_NONE,
 				    MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN);
 				MPASS(rv1 == KERN_SUCCESS);
 			} else {
 				gap_entry->end += grow_amount;
 				vm_map_entry_resize_free(map, gap_entry);
 			}
 		}
 	} else {
 		grow_start = stack_entry->end;
 		cred = stack_entry->cred;
 		if (cred == NULL && stack_entry->object.vm_object != NULL)
 			cred = stack_entry->object.vm_object->cred;
 		if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
 			rv = KERN_NO_SPACE;
 		/* Grow the underlying object if applicable. */
 		else if (stack_entry->object.vm_object == NULL ||
 		    vm_object_coalesce(stack_entry->object.vm_object,
 		    stack_entry->offset,
 		    (vm_size_t)(stack_entry->end - stack_entry->start),
 		    (vm_size_t)grow_amount, cred != NULL)) {
 			if (gap_entry->start + grow_amount == gap_entry->end)
 				vm_map_entry_delete(map, gap_entry);
 			else
 				gap_entry->start += grow_amount;
 			stack_entry->end += grow_amount;
 			map->size += grow_amount;
 			vm_map_entry_resize_free(map, stack_entry);
 			rv = KERN_SUCCESS;
 		} else
 			rv = KERN_FAILURE;
 	}
 	if (rv == KERN_SUCCESS && is_procstack)
 		vm->vm_ssize += btoc(grow_amount);
 
 	/*
 	 * Heed the MAP_WIREFUTURE flag if it was set for this process.
 	 */
 	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
 		vm_map_unlock(map);
 		vm_map_wire(map, grow_start, grow_start + grow_amount,
 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 		vm_map_lock_read(map);
 	} else
 		vm_map_lock_downgrade(map);
 
 out:
 #ifdef RACCT
 	if (racct_enable && rv != KERN_SUCCESS) {
 		PROC_LOCK(p);
 		error = racct_set(p, RACCT_VMEM, map->size);
 		KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
 		if (!old_mlock) {
 			error = racct_set(p, RACCT_MEMLOCK,
 			    ptoa(pmap_wired_count(map->pmap)));
 			KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
 		}
 	    	error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
 		KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
 		PROC_UNLOCK(p);
 	}
 #endif
 
 	return (rv);
 }
 
 /*
  * Unshare the specified VM space for exec.  If other processes are
  * mapped to it, then create a new one.  The new vmspace is null.
  */
 int
 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
 {
 	struct vmspace *oldvmspace = p->p_vmspace;
 	struct vmspace *newvmspace;
 
 	KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
 	    ("vmspace_exec recursed"));
 	newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
 	if (newvmspace == NULL)
 		return (ENOMEM);
 	newvmspace->vm_swrss = oldvmspace->vm_swrss;
 	/*
 	 * This code is written like this for prototype purposes.  The
 	 * goal is to avoid running down the vmspace here, but let the
 	 * other process's that are still using the vmspace to finally
 	 * run it down.  Even though there is little or no chance of blocking
 	 * here, it is a good idea to keep this form for future mods.
 	 */
 	PROC_VMSPACE_LOCK(p);
 	p->p_vmspace = newvmspace;
 	PROC_VMSPACE_UNLOCK(p);
 	if (p == curthread->td_proc)
 		pmap_activate(curthread);
 	curthread->td_pflags |= TDP_EXECVMSPC;
 	return (0);
 }
 
 /*
  * Unshare the specified VM space for forcing COW.  This
  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
  */
 int
 vmspace_unshare(struct proc *p)
 {
 	struct vmspace *oldvmspace = p->p_vmspace;
 	struct vmspace *newvmspace;
 	vm_ooffset_t fork_charge;
 
 	if (oldvmspace->vm_refcnt == 1)
 		return (0);
 	fork_charge = 0;
 	newvmspace = vmspace_fork(oldvmspace, &fork_charge);
 	if (newvmspace == NULL)
 		return (ENOMEM);
 	if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
 		vmspace_free(newvmspace);
 		return (ENOMEM);
 	}
 	PROC_VMSPACE_LOCK(p);
 	p->p_vmspace = newvmspace;
 	PROC_VMSPACE_UNLOCK(p);
 	if (p == curthread->td_proc)
 		pmap_activate(curthread);
 	vmspace_free(oldvmspace);
 	return (0);
 }
 
 /*
  *	vm_map_lookup:
  *
  *	Finds the VM object, offset, and
  *	protection for a given virtual address in the
  *	specified map, assuming a page fault of the
  *	type specified.
  *
  *	Leaves the map in question locked for read; return
  *	values are guaranteed until a vm_map_lookup_done
  *	call is performed.  Note that the map argument
  *	is in/out; the returned map must be used in
  *	the call to vm_map_lookup_done.
  *
  *	A handle (out_entry) is returned for use in
  *	vm_map_lookup_done, to make that fast.
  *
  *	If a lookup is requested with "write protection"
  *	specified, the map may be changed to perform virtual
  *	copying operations, although the data referenced will
  *	remain the same.
  */
 int
 vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
 	      vm_offset_t vaddr,
 	      vm_prot_t fault_typea,
 	      vm_map_entry_t *out_entry,	/* OUT */
 	      vm_object_t *object,		/* OUT */
 	      vm_pindex_t *pindex,		/* OUT */
 	      vm_prot_t *out_prot,		/* OUT */
 	      boolean_t *wired)			/* OUT */
 {
 	vm_map_entry_t entry;
 	vm_map_t map = *var_map;
 	vm_prot_t prot;
 	vm_prot_t fault_type = fault_typea;
 	vm_object_t eobject;
 	vm_size_t size;
 	struct ucred *cred;
 
 RetryLookup:
 
 	vm_map_lock_read(map);
 
 RetryLookupLocked:
 	/*
 	 * Lookup the faulting address.
 	 */
 	if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
 		vm_map_unlock_read(map);
 		return (KERN_INVALID_ADDRESS);
 	}
 
 	entry = *out_entry;
 
 	/*
 	 * Handle submaps.
 	 */
 	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
 		vm_map_t old_map = map;
 
 		*var_map = map = entry->object.sub_map;
 		vm_map_unlock_read(old_map);
 		goto RetryLookup;
 	}
 
 	/*
 	 * Check whether this task is allowed to have this page.
 	 */
 	prot = entry->protection;
 	if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
 		fault_typea &= ~VM_PROT_FAULT_LOOKUP;
 		if (prot == VM_PROT_NONE && map != kernel_map &&
 		    (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
 		    (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
 		    MAP_ENTRY_STACK_GAP_UP)) != 0 &&
 		    vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
 			goto RetryLookupLocked;
 	}
 	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
 	if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
 		vm_map_unlock_read(map);
 		return (KERN_PROTECTION_FAILURE);
 	}
 	KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
 	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
 	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
 	    ("entry %p flags %x", entry, entry->eflags));
 	if ((fault_typea & VM_PROT_COPY) != 0 &&
 	    (entry->max_protection & VM_PROT_WRITE) == 0 &&
 	    (entry->eflags & MAP_ENTRY_COW) == 0) {
 		vm_map_unlock_read(map);
 		return (KERN_PROTECTION_FAILURE);
 	}
 
 	/*
 	 * If this page is not pageable, we have to get it for all possible
 	 * accesses.
 	 */
 	*wired = (entry->wired_count != 0);
 	if (*wired)
 		fault_type = entry->protection;
 	size = entry->end - entry->start;
 	/*
 	 * If the entry was copy-on-write, we either ...
 	 */
 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
 		/*
 		 * If we want to write the page, we may as well handle that
 		 * now since we've got the map locked.
 		 *
 		 * If we don't need to write the page, we just demote the
 		 * permissions allowed.
 		 */
 		if ((fault_type & VM_PROT_WRITE) != 0 ||
 		    (fault_typea & VM_PROT_COPY) != 0) {
 			/*
 			 * Make a new object, and place it in the object
 			 * chain.  Note that no new references have appeared
 			 * -- one just moved from the map to the new
 			 * object.
 			 */
 			if (vm_map_lock_upgrade(map))
 				goto RetryLookup;
 
 			if (entry->cred == NULL) {
 				/*
 				 * The debugger owner is charged for
 				 * the memory.
 				 */
 				cred = curthread->td_ucred;
 				crhold(cred);
 				if (!swap_reserve_by_cred(size, cred)) {
 					crfree(cred);
 					vm_map_unlock(map);
 					return (KERN_RESOURCE_SHORTAGE);
 				}
 				entry->cred = cred;
 			}
 			vm_object_shadow(&entry->object.vm_object,
 			    &entry->offset, size);
 			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 			eobject = entry->object.vm_object;
 			if (eobject->cred != NULL) {
 				/*
 				 * The object was not shadowed.
 				 */
 				swap_release_by_cred(size, entry->cred);
 				crfree(entry->cred);
 				entry->cred = NULL;
 			} else if (entry->cred != NULL) {
 				VM_OBJECT_WLOCK(eobject);
 				eobject->cred = entry->cred;
 				eobject->charge = size;
 				VM_OBJECT_WUNLOCK(eobject);
 				entry->cred = NULL;
 			}
 
 			vm_map_lock_downgrade(map);
 		} else {
 			/*
 			 * We're attempting to read a copy-on-write page --
 			 * don't allow writes.
 			 */
 			prot &= ~VM_PROT_WRITE;
 		}
 	}
 
 	/*
 	 * Create an object if necessary.
 	 */
 	if (entry->object.vm_object == NULL &&
 	    !map->system_map) {
 		if (vm_map_lock_upgrade(map))
 			goto RetryLookup;
 		entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
 		    atop(size));
 		entry->offset = 0;
 		if (entry->cred != NULL) {
 			VM_OBJECT_WLOCK(entry->object.vm_object);
 			entry->object.vm_object->cred = entry->cred;
 			entry->object.vm_object->charge = size;
 			VM_OBJECT_WUNLOCK(entry->object.vm_object);
 			entry->cred = NULL;
 		}
 		vm_map_lock_downgrade(map);
 	}
 
 	/*
 	 * Return the object/offset from this entry.  If the entry was
 	 * copy-on-write or empty, it has been fixed up.
 	 */
 	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
 	*object = entry->object.vm_object;
 
 	*out_prot = prot;
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_lookup_locked:
  *
  *	Lookup the faulting address.  A version of vm_map_lookup that returns 
  *      KERN_FAILURE instead of blocking on map lock or memory allocation.
  */
 int
 vm_map_lookup_locked(vm_map_t *var_map,		/* IN/OUT */
 		     vm_offset_t vaddr,
 		     vm_prot_t fault_typea,
 		     vm_map_entry_t *out_entry,	/* OUT */
 		     vm_object_t *object,	/* OUT */
 		     vm_pindex_t *pindex,	/* OUT */
 		     vm_prot_t *out_prot,	/* OUT */
 		     boolean_t *wired)		/* OUT */
 {
 	vm_map_entry_t entry;
 	vm_map_t map = *var_map;
 	vm_prot_t prot;
 	vm_prot_t fault_type = fault_typea;
 
 	/*
 	 * Lookup the faulting address.
 	 */
 	if (!vm_map_lookup_entry(map, vaddr, out_entry))
 		return (KERN_INVALID_ADDRESS);
 
 	entry = *out_entry;
 
 	/*
 	 * Fail if the entry refers to a submap.
 	 */
 	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
 		return (KERN_FAILURE);
 
 	/*
 	 * Check whether this task is allowed to have this page.
 	 */
 	prot = entry->protection;
 	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
 	if ((fault_type & prot) != fault_type)
 		return (KERN_PROTECTION_FAILURE);
 
 	/*
 	 * If this page is not pageable, we have to get it for all possible
 	 * accesses.
 	 */
 	*wired = (entry->wired_count != 0);
 	if (*wired)
 		fault_type = entry->protection;
 
 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
 		/*
 		 * Fail if the entry was copy-on-write for a write fault.
 		 */
 		if (fault_type & VM_PROT_WRITE)
 			return (KERN_FAILURE);
 		/*
 		 * We're attempting to read a copy-on-write page --
 		 * don't allow writes.
 		 */
 		prot &= ~VM_PROT_WRITE;
 	}
 
 	/*
 	 * Fail if an object should be created.
 	 */
 	if (entry->object.vm_object == NULL && !map->system_map)
 		return (KERN_FAILURE);
 
 	/*
 	 * Return the object/offset from this entry.  If the entry was
 	 * copy-on-write or empty, it has been fixed up.
 	 */
 	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
 	*object = entry->object.vm_object;
 
 	*out_prot = prot;
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_lookup_done:
  *
  *	Releases locks acquired by a vm_map_lookup
  *	(according to the handle returned by that lookup).
  */
 void
 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
 {
 	/*
 	 * Unlock the main-level map
 	 */
 	vm_map_unlock_read(map);
 }
 
 vm_offset_t
 vm_map_max_KBI(const struct vm_map *map)
 {
 
 	return (vm_map_max(map));
 }
 
 vm_offset_t
 vm_map_min_KBI(const struct vm_map *map)
 {
 
 	return (vm_map_min(map));
 }
 
 pmap_t
 vm_map_pmap_KBI(vm_map_t map)
 {
 
 	return (map->pmap);
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <ddb/ddb.h>
 
 static void
 vm_map_print(vm_map_t map)
 {
 	vm_map_entry_t entry;
 
 	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
 	    (void *)map,
 	    (void *)map->pmap, map->nentries, map->timestamp);
 
 	db_indent += 2;
 	for (entry = map->header.next; entry != &map->header;
 	    entry = entry->next) {
 		db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
 		    (void *)entry, (void *)entry->start, (void *)entry->end,
 		    entry->eflags);
 		{
 			static char *inheritance_name[4] =
 			{"share", "copy", "none", "donate_copy"};
 
 			db_iprintf(" prot=%x/%x/%s",
 			    entry->protection,
 			    entry->max_protection,
 			    inheritance_name[(int)(unsigned char)entry->inheritance]);
 			if (entry->wired_count != 0)
 				db_printf(", wired");
 		}
 		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
 			db_printf(", share=%p, offset=0x%jx\n",
 			    (void *)entry->object.sub_map,
 			    (uintmax_t)entry->offset);
 			if ((entry->prev == &map->header) ||
 			    (entry->prev->object.sub_map !=
 				entry->object.sub_map)) {
 				db_indent += 2;
 				vm_map_print((vm_map_t)entry->object.sub_map);
 				db_indent -= 2;
 			}
 		} else {
 			if (entry->cred != NULL)
 				db_printf(", ruid %d", entry->cred->cr_ruid);
 			db_printf(", object=%p, offset=0x%jx",
 			    (void *)entry->object.vm_object,
 			    (uintmax_t)entry->offset);
 			if (entry->object.vm_object && entry->object.vm_object->cred)
 				db_printf(", obj ruid %d charge %jx",
 				    entry->object.vm_object->cred->cr_ruid,
 				    (uintmax_t)entry->object.vm_object->charge);
 			if (entry->eflags & MAP_ENTRY_COW)
 				db_printf(", copy (%s)",
 				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
 			db_printf("\n");
 
 			if ((entry->prev == &map->header) ||
 			    (entry->prev->object.vm_object !=
 				entry->object.vm_object)) {
 				db_indent += 2;
 				vm_object_print((db_expr_t)(intptr_t)
 						entry->object.vm_object,
 						0, 0, (char *)0);
 				db_indent -= 2;
 			}
 		}
 	}
 	db_indent -= 2;
 }
 
 DB_SHOW_COMMAND(map, map)
 {
 
 	if (!have_addr) {
 		db_printf("usage: show map <addr>\n");
 		return;
 	}
 	vm_map_print((vm_map_t)addr);
 }
 
 DB_SHOW_COMMAND(procvm, procvm)
 {
 	struct proc *p;
 
 	if (have_addr) {
 		p = db_lookup_proc(addr);
 	} else {
 		p = curproc;
 	}
 
 	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
 	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
 	    (void *)vmspace_pmap(p->p_vmspace));
 
 	vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
 }
 
 #endif /* DDB */
Index: projects/capsicum-test/sys/vm/vm_map.h
===================================================================
--- projects/capsicum-test/sys/vm/vm_map.h	(revision 345709)
+++ projects/capsicum-test/sys/vm/vm_map.h	(revision 345710)
@@ -1,429 +1,428 @@
 /*-
  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vm_map.h	8.9 (Berkeley) 5/17/95
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
  * $FreeBSD$
  */
 
 /*
  *	Virtual memory map module definitions.
  */
 #ifndef	_VM_MAP_
 #define	_VM_MAP_
 
 #include <sys/lock.h>
 #include <sys/sx.h>
 #include <sys/_mutex.h>
 
 /*
  *	Types defined:
  *
  *	vm_map_t		the high-level address map data structure.
  *	vm_map_entry_t		an entry in an address map.
  */
 
 typedef u_char vm_flags_t;
 typedef u_int vm_eflags_t;
 
 /*
  *	Objects which live in maps may be either VM objects, or
  *	another map (called a "sharing map") which denotes read-write
  *	sharing with other maps.
  */
 union vm_map_object {
 	struct vm_object *vm_object;	/* object object */
 	struct vm_map *sub_map;		/* belongs to another map */
 };
 
 /*
  *	Address map entries consist of start and end addresses,
  *	a VM object (or sharing map) and offset into that object,
  *	and user-exported inheritance and protection information.
  *	Also included is control information for virtual copy operations.
  */
 struct vm_map_entry {
 	struct vm_map_entry *prev;	/* previous entry */
 	struct vm_map_entry *next;	/* next entry */
 	struct vm_map_entry *left;	/* left child in binary search tree */
 	struct vm_map_entry *right;	/* right child in binary search tree */
 	vm_offset_t start;		/* start address */
 	vm_offset_t end;		/* end address */
 	vm_offset_t next_read;		/* vaddr of the next sequential read */
-	vm_size_t adj_free;		/* amount of adjacent free space */
 	vm_size_t max_free;		/* max free space in subtree */
 	union vm_map_object object;	/* object I point to */
 	vm_ooffset_t offset;		/* offset into object */
 	vm_eflags_t eflags;		/* map entry flags */
 	vm_prot_t protection;		/* protection code */
 	vm_prot_t max_protection;	/* maximum protection */
 	vm_inherit_t inheritance;	/* inheritance */
 	uint8_t read_ahead;		/* pages in the read-ahead window */
 	int wired_count;		/* can be paged if = 0 */
 	struct ucred *cred;		/* tmp storage for creator ref */
 	struct thread *wiring_thread;
 };
 
 #define MAP_ENTRY_NOSYNC		0x0001
 #define MAP_ENTRY_IS_SUB_MAP		0x0002
 #define MAP_ENTRY_COW			0x0004
 #define MAP_ENTRY_NEEDS_COPY		0x0008
 #define MAP_ENTRY_NOFAULT		0x0010
 #define MAP_ENTRY_USER_WIRED		0x0020
 
 #define MAP_ENTRY_BEHAV_NORMAL		0x0000	/* default behavior */
 #define MAP_ENTRY_BEHAV_SEQUENTIAL	0x0040	/* expect sequential access */
 #define MAP_ENTRY_BEHAV_RANDOM		0x0080	/* expect random access */
 #define MAP_ENTRY_BEHAV_RESERVED	0x00C0	/* future use */
 
 #define MAP_ENTRY_BEHAV_MASK		0x00C0
 
 #define MAP_ENTRY_IN_TRANSITION		0x0100	/* entry being changed */
 #define MAP_ENTRY_NEEDS_WAKEUP		0x0200	/* waiters in transition */
 #define MAP_ENTRY_NOCOREDUMP		0x0400	/* don't include in a core */
 
 #define	MAP_ENTRY_GROWS_DOWN		0x1000	/* Top-down stacks */
 #define	MAP_ENTRY_GROWS_UP		0x2000	/* Bottom-up stacks */
 
 #define	MAP_ENTRY_WIRE_SKIPPED		0x4000
 #define	MAP_ENTRY_VN_WRITECNT		0x8000	/* writeable vnode mapping */
 #define	MAP_ENTRY_GUARD			0x10000
 #define	MAP_ENTRY_STACK_GAP_DN		0x20000
 #define	MAP_ENTRY_STACK_GAP_UP		0x40000
 #define	MAP_ENTRY_HEADER		0x80000
 
 #ifdef	_KERNEL
 static __inline u_char
 vm_map_entry_behavior(vm_map_entry_t entry)
 {
 	return (entry->eflags & MAP_ENTRY_BEHAV_MASK);
 }
 
 static __inline int
 vm_map_entry_user_wired_count(vm_map_entry_t entry)
 {
 	if (entry->eflags & MAP_ENTRY_USER_WIRED)
 		return (1);
 	return (0);
 }
 
 static __inline int
 vm_map_entry_system_wired_count(vm_map_entry_t entry)
 {
 	return (entry->wired_count - vm_map_entry_user_wired_count(entry));
 }
 #endif	/* _KERNEL */
 
 /*
  *	A map is a set of map entries.  These map entries are
  *	organized both as a binary search tree and as a doubly-linked
  *	list.  Both structures are ordered based upon the start and
  *	end addresses contained within each map entry.
  *
  *	Sleator and Tarjan's top-down splay algorithm is employed to
  *	control height imbalance in the binary search tree.
  *
  *	The map's min offset value is stored in map->header.end, and
  *	its max offset value is stored in map->header.start.  These
  *	values act as sentinels for any forward or backward address
  *	scan of the list.  The map header has a special value for the
  *	eflags field, MAP_ENTRY_HEADER, that is set initially, is
  *	never changed, and prevents an eflags match of the header
  *	with any other map entry.
  *
  *	List of locks
  *	(c)	const until freed
  */
 struct vm_map {
 	struct vm_map_entry header;	/* List of entries */
 	struct sx lock;			/* Lock for map data */
 	struct mtx system_mtx;
 	int nentries;			/* Number of entries */
 	vm_size_t size;			/* virtual size */
 	u_int timestamp;		/* Version number */
 	u_char needs_wakeup;
 	u_char system_map;		/* (c) Am I a system map? */
 	vm_flags_t flags;		/* flags for this vm_map */
 	vm_map_entry_t root;		/* Root of a binary search tree */
 	pmap_t pmap;			/* (c) Physical map */
 	vm_offset_t anon_loc;
 	int busy;
 };
 
 /*
  * vm_flags_t values
  */
 #define MAP_WIREFUTURE		0x01	/* wire all future pages */
 #define	MAP_BUSY_WAKEUP		0x02
 #define	MAP_IS_SUB_MAP		0x04	/* has parent */
 #define	MAP_ASLR		0x08	/* enabled ASLR */
 #define	MAP_ASLR_IGNSTART	0x10
 
 #ifdef	_KERNEL
 #if defined(KLD_MODULE) && !defined(KLD_TIED)
 #define	vm_map_max(map)		vm_map_max_KBI((map))
 #define	vm_map_min(map)		vm_map_min_KBI((map))
 #define	vm_map_pmap(map)	vm_map_pmap_KBI((map))
 #else
 static __inline vm_offset_t
 vm_map_max(const struct vm_map *map)
 {
 
 	return (map->header.start);
 }
 
 static __inline vm_offset_t
 vm_map_min(const struct vm_map *map)
 {
 
 	return (map->header.end);
 }
 
 static __inline pmap_t
 vm_map_pmap(vm_map_t map)
 {
 	return (map->pmap);
 }
 
 static __inline void
 vm_map_modflags(vm_map_t map, vm_flags_t set, vm_flags_t clear)
 {
 	map->flags = (map->flags | set) & ~clear;
 }
 #endif	/* KLD_MODULE */
 #endif	/* _KERNEL */
 
 /*
  * Shareable process virtual address space.
  *
  * List of locks
  *	(c)	const until freed
  */
 struct vmspace {
 	struct vm_map vm_map;	/* VM address map */
 	struct shmmap_state *vm_shm;	/* SYS5 shared memory private data XXX */
 	segsz_t vm_swrss;	/* resident set size before last swap */
 	segsz_t vm_tsize;	/* text size (pages) XXX */
 	segsz_t vm_dsize;	/* data size (pages) XXX */
 	segsz_t vm_ssize;	/* stack size (pages) */
 	caddr_t vm_taddr;	/* (c) user virtual address of text */
 	caddr_t vm_daddr;	/* (c) user virtual address of data */
 	caddr_t vm_maxsaddr;	/* user VA at max stack growth */
 	volatile int vm_refcnt;	/* number of references */
 	/*
 	 * Keep the PMAP last, so that CPU-specific variations of that
 	 * structure on a single architecture don't result in offset
 	 * variations of the machine-independent fields in the vmspace.
 	 */
 	struct pmap vm_pmap;	/* private physical map */
 };
 
 #ifdef	_KERNEL
 static __inline pmap_t
 vmspace_pmap(struct vmspace *vmspace)
 {
 	return &vmspace->vm_pmap;
 }
 #endif	/* _KERNEL */
 
 #ifdef	_KERNEL
 /*
  *	Macros:		vm_map_lock, etc.
  *	Function:
  *		Perform locking on the data portion of a map.  Note that
  *		these macros mimic procedure calls returning void.  The
  *		semicolon is supplied by the user of these macros, not
  *		by the macros themselves.  The macros can safely be used
  *		as unbraced elements in a higher level statement.
  */
 
 void _vm_map_lock(vm_map_t map, const char *file, int line);
 void _vm_map_unlock(vm_map_t map, const char *file, int line);
 int _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line);
 void _vm_map_lock_read(vm_map_t map, const char *file, int line);
 void _vm_map_unlock_read(vm_map_t map, const char *file, int line);
 int _vm_map_trylock(vm_map_t map, const char *file, int line);
 int _vm_map_trylock_read(vm_map_t map, const char *file, int line);
 int _vm_map_lock_upgrade(vm_map_t map, const char *file, int line);
 void _vm_map_lock_downgrade(vm_map_t map, const char *file, int line);
 int vm_map_locked(vm_map_t map);
 void vm_map_wakeup(vm_map_t map);
 void vm_map_busy(vm_map_t map);
 void vm_map_unbusy(vm_map_t map);
 void vm_map_wait_busy(vm_map_t map);
 vm_offset_t vm_map_max_KBI(const struct vm_map *map);
 vm_offset_t vm_map_min_KBI(const struct vm_map *map);
 pmap_t vm_map_pmap_KBI(vm_map_t map);
 
 #define	vm_map_lock(map)	_vm_map_lock(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_unlock(map)	_vm_map_unlock(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_unlock_and_wait(map, timo)	\
 			_vm_map_unlock_and_wait(map, timo, LOCK_FILE, LOCK_LINE)
 #define	vm_map_lock_read(map)	_vm_map_lock_read(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_unlock_read(map)	_vm_map_unlock_read(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_trylock(map)	_vm_map_trylock(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_trylock_read(map)	\
 			_vm_map_trylock_read(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_lock_upgrade(map)	\
 			_vm_map_lock_upgrade(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_lock_downgrade(map)	\
 			_vm_map_lock_downgrade(map, LOCK_FILE, LOCK_LINE)
 
 long vmspace_resident_count(struct vmspace *vmspace);
 #endif	/* _KERNEL */
 
 
 /* XXX: number of kernel maps to statically allocate */
 #define MAX_KMAP	10
 
 /*
  * Copy-on-write flags for vm_map operations
  */
 #define MAP_INHERIT_SHARE	0x0001
 #define MAP_COPY_ON_WRITE	0x0002
 #define MAP_NOFAULT		0x0004
 #define MAP_PREFAULT		0x0008
 #define MAP_PREFAULT_PARTIAL	0x0010
 #define MAP_DISABLE_SYNCER	0x0020
 #define	MAP_CHECK_EXCL		0x0040
 #define	MAP_CREATE_GUARD	0x0080
 #define MAP_DISABLE_COREDUMP	0x0100
 #define MAP_PREFAULT_MADVISE	0x0200	/* from (user) madvise request */
 #define	MAP_VN_WRITECOUNT	0x0400
 #define	MAP_REMAP		0x0800
 #define	MAP_STACK_GROWS_DOWN	0x1000
 #define	MAP_STACK_GROWS_UP	0x2000
 #define	MAP_ACC_CHARGED		0x4000
 #define	MAP_ACC_NO_CHARGE	0x8000
 #define	MAP_CREATE_STACK_GAP_UP	0x10000
 #define	MAP_CREATE_STACK_GAP_DN	0x20000
 
 /*
  * vm_fault option flags
  */
 #define	VM_FAULT_NORMAL	0		/* Nothing special */
 #define	VM_FAULT_WIRE	1		/* Wire the mapped page */
 #define	VM_FAULT_DIRTY	2		/* Dirty the page; use w/VM_PROT_COPY */
 
 /*
  * Initially, mappings are slightly sequential.  The maximum window size must
  * account for the map entry's "read_ahead" field being defined as an uint8_t.
  */
 #define	VM_FAULT_READ_AHEAD_MIN		7
 #define	VM_FAULT_READ_AHEAD_INIT	15
 #define	VM_FAULT_READ_AHEAD_MAX		min(atop(MAXPHYS) - 1, UINT8_MAX)
 
 /*
  * The following "find_space" options are supported by vm_map_find().
  *
  * For VMFS_ALIGNED_SPACE, the desired alignment is specified to
  * the macro argument as log base 2 of the desired alignment.
  */
 #define	VMFS_NO_SPACE		0	/* don't find; use the given range */
 #define	VMFS_ANY_SPACE		1	/* find a range with any alignment */
 #define	VMFS_OPTIMAL_SPACE	2	/* find a range with optimal alignment*/
 #define	VMFS_SUPER_SPACE	3	/* find a superpage-aligned range */
 #define	VMFS_ALIGNED_SPACE(x)	((x) << 8) /* find a range with fixed alignment */
 
 /*
  * vm_map_wire and vm_map_unwire option flags
  */
 #define VM_MAP_WIRE_SYSTEM	0	/* wiring in a kernel map */
 #define VM_MAP_WIRE_USER	1	/* wiring in a user map */
 
 #define VM_MAP_WIRE_NOHOLES	0	/* region must not have holes */
 #define VM_MAP_WIRE_HOLESOK	2	/* region may have holes */
 
 #define VM_MAP_WIRE_WRITE	4	/* Validate writable. */
 
 #ifdef _KERNEL
 boolean_t vm_map_check_protection (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t);
 vm_map_t vm_map_create(pmap_t, vm_offset_t, vm_offset_t);
 int vm_map_delete(vm_map_t, vm_offset_t, vm_offset_t);
 int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t,
     vm_offset_t, int, vm_prot_t, vm_prot_t, int);
 int vm_map_find_min(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *,
     vm_size_t, vm_offset_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int);
 int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t,
     vm_prot_t, vm_prot_t, int);
-int vm_map_findspace (vm_map_t, vm_offset_t, vm_size_t, vm_offset_t *);
+vm_offset_t vm_map_findspace(vm_map_t, vm_offset_t, vm_size_t);
 int vm_map_inherit (vm_map_t, vm_offset_t, vm_offset_t, vm_inherit_t);
 void vm_map_init(vm_map_t, pmap_t, vm_offset_t, vm_offset_t);
 int vm_map_insert (vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_offset_t, vm_prot_t, vm_prot_t, int);
 int vm_map_lookup (vm_map_t *, vm_offset_t, vm_prot_t, vm_map_entry_t *, vm_object_t *,
     vm_pindex_t *, vm_prot_t *, boolean_t *);
 int vm_map_lookup_locked(vm_map_t *, vm_offset_t, vm_prot_t, vm_map_entry_t *, vm_object_t *,
     vm_pindex_t *, vm_prot_t *, boolean_t *);
 void vm_map_lookup_done (vm_map_t, vm_map_entry_t);
 boolean_t vm_map_lookup_entry (vm_map_t, vm_offset_t, vm_map_entry_t *);
 int vm_map_protect (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t);
 int vm_map_remove (vm_map_t, vm_offset_t, vm_offset_t);
 void vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry);
 void vm_map_startup (void);
 int vm_map_submap (vm_map_t, vm_offset_t, vm_offset_t, vm_map_t);
 int vm_map_sync(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t);
 int vm_map_madvise (vm_map_t, vm_offset_t, vm_offset_t, int);
 int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int);
 int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags);
 int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags);
 long vmspace_swap_count(struct vmspace *vmspace);
 #endif				/* _KERNEL */
 #endif				/* _VM_MAP_ */
Index: projects/capsicum-test/tests/sys/capsicum/Makefile
===================================================================
--- projects/capsicum-test/tests/sys/capsicum/Makefile	(revision 345709)
+++ projects/capsicum-test/tests/sys/capsicum/Makefile	(revision 345710)
@@ -1,12 +1,12 @@
 # $FreeBSD$
 
 TESTSDIR=	${TESTSBASE}/sys/capsicum
 
 ATF_TESTS_C+=	bindat_connectat
 ATF_TESTS_C+=	ioctls_test
 
-CFLAGS.bindat_connectat.c+=	-I${SRCTOP}/tests
+CFLAGS+=	-I${SRCTOP}/tests
 
 WARNS?=	6
 
 .include <bsd.test.mk>
Index: projects/capsicum-test/tests/sys/capsicum/ioctls_test.c
===================================================================
--- projects/capsicum-test/tests/sys/capsicum/ioctls_test.c	(revision 345709)
+++ projects/capsicum-test/tests/sys/capsicum/ioctls_test.c	(revision 345710)
@@ -1,126 +1,130 @@
 /*-
  * Copyright (c) 2018 John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/capsicum.h>
 #include <sys/filio.h>
 #include <sys/socket.h>
 #include <sys/wait.h>
 #include <netinet/in.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 
 #include <atf-c.h>
 
+#include "freebsd_test_suite/macros.h"
+
 /*
  * A variant of ATF_REQUIRE that is suitable for use in child
  * processes.  This only works if the parent process is tripped up by
  * the early exit and fails some requirement itself.
  */
 #define	CHILD_REQUIRE(exp) do {						\
 		if (!(exp))						\
 			child_fail_require(__FILE__, __LINE__,		\
 			    #exp " not met");				\
 	} while (0)
 
 static __dead2 void
 child_fail_require(const char *file, int line, const char *str)
 {
 	char buf[128];
 
 	snprintf(buf, sizeof(buf), "%s:%d: %s\n", file, line, str);
 	write(2, buf, strlen(buf));
 	_exit(32);
 }
 
 /*
  * Exercise the edge case of a custom ioctl list being copied from a
  * listen socket to an accepted socket.
  */
 ATF_TC_WITHOUT_HEAD(cap_ioctls__listen_copy);
 ATF_TC_BODY(cap_ioctls__listen_copy, tc)
 {
 	struct sockaddr_in sin;
 	cap_rights_t rights;
 	u_long cmds[] = { FIONREAD };
 	socklen_t len;
 	pid_t pid;
 	char dummy;
 	int s[2], status;
+
+	ATF_REQUIRE_FEATURE("security_capabilities");
 
 	s[0] = socket(AF_INET, SOCK_STREAM, 0);
 	ATF_REQUIRE(s[0] > 0);
 
 	/* Bind to an arbitrary unused port. */
 	memset(&sin, 0, sizeof(sin));
 	sin.sin_len = sizeof(sin);
 	sin.sin_family = AF_INET;
 	sin.sin_port = 0;
 	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
 	ATF_REQUIRE(bind(s[0], (struct sockaddr *)&sin, sizeof(sin)) == 0);
 
 	CHILD_REQUIRE(listen(s[0], 1) == 0);
 
 	len = sizeof(sin);
 	ATF_REQUIRE(getsockname(s[0], (struct sockaddr *)&sin, &len) == 0);
 	ATF_REQUIRE(len == sizeof(sin));
 
 	cap_rights_init(&rights, CAP_ACCEPT, CAP_IOCTL);
 	ATF_REQUIRE(cap_rights_limit(s[0], &rights) == 0);
 	ATF_REQUIRE(cap_ioctls_limit(s[0], cmds, nitems(cmds)) == 0);
 
 	pid = fork();
 	if (pid == 0) {
 		s[1] = accept(s[0], NULL, NULL);
 		CHILD_REQUIRE(s[1] > 0);
 
 		/* Close both sockets during exit(). */
 		exit(0);
 	}
 
 	ATF_REQUIRE(pid > 0);
 
 	ATF_REQUIRE(close(s[0]) == 0);
 	s[1] = socket(AF_INET, SOCK_STREAM, 0);
 	ATF_REQUIRE(s[1] > 0);
 	ATF_REQUIRE(connect(s[1], (struct sockaddr *)&sin, sizeof(sin)) == 0);
 	ATF_REQUIRE(read(s[1], &dummy, sizeof(dummy)) == 0);
 	ATF_REQUIRE(close(s[1]) == 0);
 
 	ATF_REQUIRE(wait(&status) == pid);
 	ATF_REQUIRE(WIFEXITED(status));
 	ATF_REQUIRE(WEXITSTATUS(status) == 0);
 }
 
 ATF_TP_ADD_TCS(tp)
 {
 
 	ATF_TP_ADD_TC(tp, cap_ioctls__listen_copy);
 
 	return (atf_no_error());
 }
Index: projects/capsicum-test/usr.bin/dtc/Makefile
===================================================================
--- projects/capsicum-test/usr.bin/dtc/Makefile	(revision 345709)
+++ projects/capsicum-test/usr.bin/dtc/Makefile	(revision 345710)
@@ -1,13 +1,13 @@
 # $FreeBSD$
 
 PROG_CXX=dtc
 SRCS=	dtc.cc input_buffer.cc string.cc dtb.cc fdt.cc checking.cc
 MAN=	dtc.1
 
 WARNS?=	3
 
-CXXFLAGS+=	-std=c++11 -fno-rtti -fno-exceptions
+CXXFLAGS+=	-fno-rtti -fno-exceptions
 
 NO_SHARED?=NO
 
 .include <bsd.prog.mk>
Index: projects/capsicum-test/usr.sbin/iostat/iostat.c
===================================================================
--- projects/capsicum-test/usr.sbin/iostat/iostat.c	(revision 345709)
+++ projects/capsicum-test/usr.sbin/iostat/iostat.c	(revision 345710)
@@ -1,1026 +1,1026 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1997, 1998, 2000, 2001  Kenneth D. Merry
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 /*
  * Parts of this program are derived from the original FreeBSD iostat
  * program:
  */
 /*-
  * Copyright (c) 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 /*
  * Ideas for the new iostat statistics output modes taken from the NetBSD
  * version of iostat:
  */
 /*
  * Copyright (c) 1996 John M. Vinopal
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed for the NetBSD Project
  *      by John M. Vinopal.
  * 4. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 
 #include <sys/param.h>
 #include <sys/errno.h>
 #include <sys/resource.h>
 #include <sys/sysctl.h>
 
 #include <ctype.h>
 #include <devstat.h>
 #include <err.h>
 #include <fcntl.h>
 #include <inttypes.h>
 #include <kvm.h>
 #include <limits.h>
 #include <math.h>
 #include <nlist.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <termios.h>
 #include <unistd.h>
 
 static struct nlist namelist[] = {
 #define X_TTY_NIN	0
 	{ .n_name = "_tty_nin",
 	  .n_type = 0, .n_other = 0, .n_desc = 0, .n_value = 0 },
 #define X_TTY_NOUT	1
 	{ .n_name = "_tty_nout",
 	  .n_type = 0, .n_other = 0, .n_desc = 0, .n_value = 0 },
 #define X_BOOTTIME	2
 	{ .n_name = "_boottime",
 	  .n_type = 0, .n_other = 0, .n_desc = 0, .n_value = 0 },
 #define X_END		2
 	{ .n_name = NULL,
 	  .n_type = 0, .n_other = 0, .n_desc = 0, .n_value = 0 },
 };
 
 #define	IOSTAT_DEFAULT_ROWS	20	/* Traditional default `wrows' */
 
 static struct statinfo cur, last;
 static int num_devices;
 static struct device_selection *dev_select;
 static int maxshowdevs;
 static volatile sig_atomic_t headercount;
 static volatile sig_atomic_t wresized;	/* Tty resized, when non-zero. */
 static volatile sig_atomic_t alarm_rang;
 static volatile sig_atomic_t return_requested;
 static unsigned short wrows;		/* Current number of tty rows. */
 static int dflag = 0, Iflag = 0, Cflag = 0, Tflag = 0, oflag = 0, Kflag = 0;
 static int xflag = 0, zflag = 0;
 
 /* local function declarations */
 static void usage(void);
 static void needhdr(int signo);
 static void needresize(int signo);
 static void needreturn(int signo);
 static void alarm_clock(int signo);
 static void doresize(void);
 static void phdr(void);
 static void devstats(int perf_select, long double etime, int havelast);
 static void cpustats(void);
 static int readvar(kvm_t *kd, const char *name, int nlid, void *ptr,
 		   size_t len);
 
 static void
 usage(void)
 {
 	/*
 	 * We also support the following 'traditional' syntax:
 	 * iostat [drives] [wait [count]]
 	 * This isn't mentioned in the man page, or the usage statement,
 	 * but it is supported.
 	 */
 	fprintf(stderr, "usage: iostat [-CdhIKoTxz?] [-c count] [-M core]"
 		" [-n devs] [-N system]\n"
 		"\t      [-t type,if,pass] [-w wait] [drives]\n");
 }
 
 int
 main(int argc, char **argv)
 {
 	int c, i;
 	int tflag = 0, hflag = 0, cflag = 0, wflag = 0, nflag = 0;
 	int count = 0, waittime = 0;
 	char *memf = NULL, *nlistf = NULL;
 	struct devstat_match *matches;
 	struct itimerval alarmspec;
 	int num_matches = 0;
 	char errbuf[_POSIX2_LINE_MAX];
 	kvm_t *kd = NULL;
 	long generation;
 	int num_devices_specified;
 	int num_selected, num_selections;
 	long select_generation;
 	char **specified_devices;
 	devstat_select_mode select_mode;
 	float f;
 	int havelast = 0;
 
 	matches = NULL;
 	maxshowdevs = 3;
 
 	while ((c = getopt(argc, argv, "c:CdhIKM:n:N:ot:Tw:xz?")) != -1) {
 		switch(c) {
 			case 'c':
 				cflag++;
 				count = atoi(optarg);
 				if (count < 1)
 					errx(1, "count %d is < 1", count);
 				break;
 			case 'C':
 				Cflag++;
 				break;
 			case 'd':
 				dflag++;
 				break;
 			case 'h':
 				hflag++;
 				break;
 			case 'I':
 				Iflag++;
 				break;
 			case 'K':
 				Kflag++;
 				break;
 			case 'M':
 				memf = optarg;
 				break;
 			case 'n':
 				nflag++;
 				maxshowdevs = atoi(optarg);
 				if (maxshowdevs < 0)
 					errx(1, "number of devices %d is < 0",
 					     maxshowdevs);
 				break;
 			case 'N':
 				nlistf = optarg;
 				break;
 			case 'o':
 				oflag++;
 				break;
 			case 't':
 				tflag++;
 				if (devstat_buildmatch(optarg, &matches,
 						       &num_matches) != 0)
 					errx(1, "%s", devstat_errbuf);
 				break;
 			case 'T':
 				Tflag++;
 				break;
 			case 'w':
 				wflag++;
 				f = atof(optarg);
 				waittime = f * 1000;
 				if (waittime < 1)
 					errx(1, "wait time is < 1ms");
 				break;
 			case 'x':
 				xflag++;
 				break;
 			case 'z':
 				zflag++;
 				break;
 			default:
 				usage();
 				exit(1);
 				break;
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	if (nlistf != NULL || memf != NULL) {
 		kd = kvm_openfiles(nlistf, memf, NULL, O_RDONLY, errbuf);
 
 		if (kd == NULL)
 			errx(1, "kvm_openfiles: %s", errbuf);
 
 		if (kvm_nlist(kd, namelist) == -1)
 			errx(1, "kvm_nlist: %s", kvm_geterr(kd));
 	}
 
 	/*
 	 * Make sure that the userland devstat version matches the kernel
 	 * devstat version.  If not, exit and print a message informing
 	 * the user of his mistake.
 	 */
 	if (devstat_checkversion(kd) < 0)
 		errx(1, "%s", devstat_errbuf);
 
 	/*
 	 * Make sure Tflag and/or Cflag are set if dflag == 0.  If dflag is
 	 * greater than 0, they may be 0 or non-zero.
 	 */
 	if (dflag == 0 && xflag == 0) {
 		Cflag = 1;
 		Tflag = 1;
 	}
 
 	/* find out how many devices we have */
 	if ((num_devices = devstat_getnumdevs(kd)) < 0)
 		err(1, "can't get number of devices");
 
 	/*
 	 * Figure out how many devices we should display.
 	 */
 	if (nflag == 0) {
 		if (xflag > 0)
 			maxshowdevs = num_devices;
 		else if (oflag > 0) {
 			if ((dflag > 0) && (Cflag == 0) && (Tflag == 0))
 				maxshowdevs = 5;
 			else if ((dflag > 0) && (Tflag > 0) && (Cflag == 0))
 				maxshowdevs = 5;
 			else
 				maxshowdevs = 4;
 		} else {
 			if ((dflag > 0) && (Cflag == 0))
 				maxshowdevs = 4;
 			else
 				maxshowdevs = 3;
 		}
 	}
 
 	cur.dinfo = (struct devinfo *)calloc(1, sizeof(struct devinfo));
 	if (cur.dinfo == NULL)
 		err(1, "calloc failed");
 
 	last.dinfo = (struct devinfo *)calloc(1, sizeof(struct devinfo));
 	if (last.dinfo == NULL)
 		err(1, "calloc failed");
 
 	/*
 	 * Grab all the devices.  We don't look to see if the list has
 	 * changed here, since it almost certainly has.  We only look for
 	 * errors.
 	 */
 	if (devstat_getdevs(kd, &cur) == -1)
 		errx(1, "%s", devstat_errbuf);
 
 	num_devices = cur.dinfo->numdevs;
 	generation = cur.dinfo->generation;
 
 	/*
 	 * If the user specified any devices on the command line, see if
 	 * they are in the list of devices we have now.
 	 */
 	specified_devices = (char **)malloc(sizeof(char *));
 	if (specified_devices == NULL)
 		err(1, "malloc failed");
 
 	for (num_devices_specified = 0; *argv; ++argv) {
 		if (isdigit(**argv))
 			break;
 		num_devices_specified++;
 		specified_devices = (char **)realloc(specified_devices,
 						     sizeof(char *) *
 						     num_devices_specified);
 		if (specified_devices == NULL)
 			err(1, "realloc failed");
 
 		specified_devices[num_devices_specified - 1] = *argv;
 
 	}
 	if (nflag == 0 && maxshowdevs < num_devices_specified)
 		maxshowdevs = num_devices_specified;
 
 	dev_select = NULL;
 
 	if ((num_devices_specified == 0) && (num_matches == 0))
 		select_mode = DS_SELECT_ADD;
 	else
 		select_mode = DS_SELECT_ONLY;
 
 	/*
 	 * At this point, selectdevs will almost surely indicate that the
 	 * device list has changed, so we don't look for return values of 0
 	 * or 1.  If we get back -1, though, there is an error.
 	 */
 	if (devstat_selectdevs(&dev_select, &num_selected,
 			       &num_selections, &select_generation, generation,
 			       cur.dinfo->devices, num_devices, matches,
 			       num_matches, specified_devices,
 			       num_devices_specified, select_mode, maxshowdevs,
 			       hflag) == -1)
 		errx(1, "%s", devstat_errbuf);
 
 	/*
 	 * Look for the traditional wait time and count arguments.
 	 */
 	if (*argv) {
 		f = atof(*argv);
 		waittime = f * 1000;
 
 		/* Let the user know he goofed, but keep going anyway */
 		if (wflag != 0)
 			warnx("discarding previous wait interval, using"
 			      " %g instead", waittime / 1000.0);
 		wflag++;
 
 		if (*++argv) {
 			count = atoi(*argv);
 			if (cflag != 0)
 				warnx("discarding previous count, using %d"
 				      " instead", count);
 			cflag++;
 		} else
 			count = -1;
 	}
 
 	/*
 	 * If the user specified a count, but not an interval, we default
 	 * to an interval of 1 second.
 	 */
 	if ((wflag == 0) && (cflag > 0))
 		waittime = 1 * 1000;
 
 	/*
 	 * If the user specified a wait time, but not a count, we want to
 	 * go on ad infinitum.  This can be redundant if the user uses the
 	 * traditional method of specifying the wait, since in that case we
 	 * already set count = -1 above.  Oh well.
 	 */
 	if ((wflag > 0) && (cflag == 0))
 		count = -1;
 
 	bzero(cur.cp_time, sizeof(cur.cp_time));
 	cur.tk_nout = 0;
 	cur.tk_nin = 0;
 
 	/*
 	 * Set the snap time to the system boot time (ie: zero), so the
 	 * stats are calculated since system boot.
 	 */
 	cur.snap_time = 0;
 
 	/*
 	 * If the user stops the program (control-Z) and then resumes it,
 	 * print out the header again.
 	 */
 	(void)signal(SIGCONT, needhdr);
 
 	/*
 	 * If our standard output is a tty, then install a SIGWINCH handler
 	 * and set wresized so that our first iteration through the main
 	 * iostat loop will peek at the terminal's current rows to find out
 	 * how many lines can fit in a screenful of output.
 	 */
 	if (isatty(fileno(stdout)) != 0) {
 		wresized = 1;
 		(void)signal(SIGWINCH, needresize);
 	} else {
 		wresized = 0;
 		wrows = IOSTAT_DEFAULT_ROWS;
 	}
 
 	/*
 	 * Register a SIGINT handler so that we can print out final statistics
 	 * when we get that signal
 	 */
 	(void)signal(SIGINT, needreturn);
 
 	/*
 	 * Register a SIGALRM handler to implement sleeps if the user uses the
 	 * -c or -w options
 	 */
 	(void)signal(SIGALRM, alarm_clock);
 	alarmspec.it_interval.tv_sec = waittime / 1000;
 	alarmspec.it_interval.tv_usec = 1000 * (waittime % 1000);
 	alarmspec.it_value.tv_sec = waittime / 1000;
 	alarmspec.it_value.tv_usec = 1000 * (waittime % 1000);
 	setitimer(ITIMER_REAL, &alarmspec, NULL);
 
 	for (headercount = 1;;) {
 		struct devinfo *tmp_dinfo;
 		long tmp;
 		long double etime;
 		sigset_t sigmask, oldsigmask;
 
 		if (Tflag > 0) {
 			if ((readvar(kd, "kern.tty_nin", X_TTY_NIN, &cur.tk_nin,
 			     sizeof(cur.tk_nin)) != 0)
 			 || (readvar(kd, "kern.tty_nout", X_TTY_NOUT,
 			     &cur.tk_nout, sizeof(cur.tk_nout))!= 0)) {
 				Tflag = 0;
 				warnx("disabling TTY statistics");
 			}
 		 }
 
 		if (Cflag > 0) {
 			if (kd == NULL) {
 				if (readvar(kd, "kern.cp_time", 0,
 				    &cur.cp_time, sizeof(cur.cp_time)) != 0)
 					Cflag = 0;
 			} else {
 				if (kvm_getcptime(kd, cur.cp_time) < 0) {
 					warnx("kvm_getcptime: %s",
 					    kvm_geterr(kd));
 					Cflag = 0;
 				}
 			}
 			if (Cflag == 0)
 				warnx("disabling CPU time statistics");
 		}
 
 		if (!--headercount) {
 			phdr();
 			if (wresized != 0)
 				doresize();
 			headercount = wrows;
 		}
 
 		tmp_dinfo = last.dinfo;
 		last.dinfo = cur.dinfo;
 		cur.dinfo = tmp_dinfo;
 
 		last.snap_time = cur.snap_time;
 
 		/*
 		 * Here what we want to do is refresh our device stats.
 		 * devstat_getdevs() returns 1 when the device list has changed.
 		 * If the device list has changed, we want to go through
 		 * the selection process again, in case a device that we
 		 * were previously displaying has gone away.
 		 */
 		switch (devstat_getdevs(kd, &cur)) {
 		case -1:
 			errx(1, "%s", devstat_errbuf);
 			break;
 		case 1: {
 			int retval;
 
 			num_devices = cur.dinfo->numdevs;
 			generation = cur.dinfo->generation;
 			retval = devstat_selectdevs(&dev_select, &num_selected,
 						    &num_selections,
 						    &select_generation,
 						    generation,
 						    cur.dinfo->devices,
 						    num_devices, matches,
 						    num_matches,
 						    specified_devices,
 						    num_devices_specified,
 						    select_mode, maxshowdevs,
 						    hflag);
 			switch(retval) {
 			case -1:
 				errx(1, "%s", devstat_errbuf);
 				break;
 			case 1:
 				phdr();
 				if (wresized != 0)
 					doresize();
 				headercount = wrows;
 				break;
 			default:
 				break;
 			}
 			break;
 		}
 		default:
 			break;
 		}
 
 		/*
 		 * We only want to re-select devices if we're in 'top'
 		 * mode.  This is the only mode where the devices selected
 		 * could actually change.
 		 */
 		if (hflag > 0) {
 			int retval;
 			retval = devstat_selectdevs(&dev_select, &num_selected,
 						    &num_selections,
 						    &select_generation,
 						    generation,
 						    cur.dinfo->devices,
 						    num_devices, matches,
 						    num_matches,
 						    specified_devices,
 						    num_devices_specified,
 						    select_mode, maxshowdevs,
 						    hflag);
 			switch(retval) {
 			case -1:
 				errx(1,"%s", devstat_errbuf);
 				break;
 			case 1:
 				phdr();
 				if (wresized != 0)
 					doresize();
 				headercount = wrows;
 				break;
 			default:
 				break;
 			}
 		}
 
 		if (Tflag > 0) {
 			tmp = cur.tk_nin;
 			cur.tk_nin -= last.tk_nin;
 			last.tk_nin = tmp;
 			tmp = cur.tk_nout;
 			cur.tk_nout -= last.tk_nout;
 			last.tk_nout = tmp;
 		}
 
 		etime = cur.snap_time - last.snap_time;
 
 		if (etime == 0.0)
 			etime = 1.0;
 
 		for (i = 0; i < CPUSTATES; i++) {
 			tmp = cur.cp_time[i];
 			cur.cp_time[i] -= last.cp_time[i];
 			last.cp_time[i] = tmp;
 		}
 
 		if (xflag == 0 && Tflag > 0)
 			printf("%4.0Lf %5.0Lf", cur.tk_nin / etime,
 			    cur.tk_nout / etime);
 
 		devstats(hflag, etime, havelast);
 
 		if (xflag == 0) {
 			if (Cflag > 0)
 				cpustats();
 
 			printf("\n");
 		}
 		fflush(stdout);
 
 		if ((count >= 0 && --count <= 0) || return_requested)
 			break;
 
 		/*
 		 * Use sigsuspend to safely sleep until either signal is
 		 * received
 		 */
 		alarm_rang = 0;
 		sigemptyset(&sigmask);
 		sigaddset(&sigmask, SIGINT);
 		sigaddset(&sigmask, SIGALRM);
 		sigprocmask(SIG_BLOCK, &sigmask, &oldsigmask);
 		while (! (alarm_rang || return_requested) ) {
 			sigsuspend(&oldsigmask);
 		}
 		sigprocmask(SIG_UNBLOCK, &sigmask, NULL);
 
 		havelast = 1;
 	}
 
 	exit(0);
 }
 
 /*
  * Force a header to be prepended to the next output.
  */
 void
 needhdr(int signo __unused)
 {
 
 	headercount = 1;
 }
 
 /*
  * When the terminal is resized, force an update of the maximum number of rows
  * printed between each header repetition.  Then force a new header to be
  * prepended to the next output.
  */
 void
 needresize(int signo __unused)
 {
 
 	wresized = 1;
 	headercount = 1;
 }
 
 /*
  * Record the alarm so the main loop can break its sleep
  */
 void
 alarm_clock(int signo __unused)
 {
 	alarm_rang = 1;
 }
 
 /*
  * Request that the main loop exit soon
  */
 void
 needreturn(int signo __unused)
 {
 	return_requested = 1;
 }
 
 /*
  * Update the global `wrows' count of terminal rows.
  */
 void
 doresize(void)
 {
 	int status;
 	struct winsize w;
 
 	for (;;) {
 		status = ioctl(fileno(stdout), TIOCGWINSZ, &w);
 		if (status == -1 && errno == EINTR)
 			continue;
 		else if (status == -1)
 			err(1, "ioctl");
 		if (w.ws_row > 3)
 			wrows = w.ws_row - 3;
 		else
 			wrows = IOSTAT_DEFAULT_ROWS;
 		break;
 	}
 
 	/*
 	 * Inhibit doresize() calls until we are rescheduled by SIGWINCH.
 	 */
 	wresized = 0;
 }
 
 static void
 phdr(void)
 {
 	int i, printed;
 	char devbuf[256];
 
 	/*
 	 * If xflag is set, we need a per-loop header, not a page header, so
 	 * just return.  We'll print the header in devstats().
 	 */
 	if (xflag > 0)
 		return;
 
 	if (Tflag > 0)
 		(void)printf("       tty");
 	for (i = 0, printed=0;(i < num_devices) && (printed < maxshowdevs);i++){
 		int di;
 		if ((dev_select[i].selected != 0)
 		 && (dev_select[i].selected <= maxshowdevs)) {
 			di = dev_select[i].position;
 			snprintf(devbuf, sizeof(devbuf), "%s%d", 
 					    cur.dinfo->devices[di].device_name,
 					    cur.dinfo->devices[di].unit_number);
 			if (oflag > 0)
 				(void)printf("%13.6s ", devbuf);
 			else
 				printf("%16.6s ", devbuf);
 			printed++;
 		}
 	}
 	if (Cflag > 0)
 		(void)printf("            cpu\n");
 	else
 		(void)printf("\n");
 
 	if (Tflag > 0)
 		(void)printf(" tin  tout");
 
 	for (i=0, printed = 0;(i < num_devices) && (printed < maxshowdevs);i++){
 		if ((dev_select[i].selected != 0)
 		 && (dev_select[i].selected <= maxshowdevs)) {
 			if (oflag > 0) {
 				if (Iflag == 0)
 					(void)printf(" sps tps msps ");
 				else
 					(void)printf(" blk xfr msps ");
 			} else {
 				if (Iflag == 0)
-					printf("  KB/t tps  MB/s ");
+					printf(" KB/t  tps  MB/s ");
 				else
-					printf("  KB/t xfrs   MB ");
+					printf(" KB/t xfrs    MB ");
 			}
 			printed++;
 		}
 	}
 	if (Cflag > 0)
 		(void)printf(" us ni sy in id\n");
 	else
 		printf("\n");
 
 }
 
 static void
 devstats(int perf_select, long double etime, int havelast)
 {
 	int dn;
 	long double transfers_per_second, transfers_per_second_read;
 	long double transfers_per_second_write;
 	long double kb_per_transfer, mb_per_second, mb_per_second_read;
 	long double mb_per_second_write;
 	u_int64_t total_bytes, total_transfers, total_blocks;
 	u_int64_t total_bytes_read, total_transfers_read;
 	u_int64_t total_bytes_write, total_transfers_write;
 	long double busy_pct, busy_time;
 	u_int64_t queue_len;
 	long double total_mb, blocks_per_second, total_duration;
 	long double ms_per_other, ms_per_read, ms_per_write, ms_per_transaction;
 	int firstline = 1;
 	char *devicename;
 
 	if (xflag > 0) {
 		if (Cflag > 0) {
 			printf("      cpu\n");
 			printf(" us ni sy in id\n");
 			cpustats();
 			printf("\n");
 		}
 		printf("                        extended device statistics  ");
 		if (Tflag > 0)
 			printf("      tty ");
 		printf("\n");
 		if (Iflag == 0) {
 			printf("device       r/s     w/s     kr/s     kw/s "
 			    " ms/r  ms/w  ms/o  ms/t qlen  %%b  ");
 		} else {
 			printf("device           r/i         w/i         kr/i"
 			    "         kw/i qlen   tsvc_t/i      sb/i  ");
 		}
 		if (Tflag > 0)
 			printf("tin  tout ");
 		printf("\n");
 	}
 
 	for (dn = 0; dn < num_devices; dn++) {
 		int di;
 
 		if (((perf_select == 0) && (dev_select[dn].selected == 0))
 		 || (dev_select[dn].selected > maxshowdevs))
 			continue;
 
 		di = dev_select[dn].position;
 
 		if (devstat_compute_statistics(&cur.dinfo->devices[di],
 		    havelast ? &last.dinfo->devices[di] : NULL, etime,
 		    DSM_TOTAL_BYTES, &total_bytes,
 		    DSM_TOTAL_BYTES_READ, &total_bytes_read,
 		    DSM_TOTAL_BYTES_WRITE, &total_bytes_write,
 		    DSM_TOTAL_TRANSFERS, &total_transfers,
 		    DSM_TOTAL_TRANSFERS_READ, &total_transfers_read,
 		    DSM_TOTAL_TRANSFERS_WRITE, &total_transfers_write,
 		    DSM_TOTAL_BLOCKS, &total_blocks,
 		    DSM_KB_PER_TRANSFER, &kb_per_transfer,
 		    DSM_TRANSFERS_PER_SECOND, &transfers_per_second,
 		    DSM_TRANSFERS_PER_SECOND_READ, &transfers_per_second_read,
 		    DSM_TRANSFERS_PER_SECOND_WRITE, &transfers_per_second_write,
 		    DSM_MB_PER_SECOND, &mb_per_second,
 		    DSM_MB_PER_SECOND_READ, &mb_per_second_read,
 		    DSM_MB_PER_SECOND_WRITE, &mb_per_second_write,
 		    DSM_BLOCKS_PER_SECOND, &blocks_per_second,
 		    DSM_MS_PER_TRANSACTION, &ms_per_transaction,
 		    DSM_MS_PER_TRANSACTION_READ, &ms_per_read,
 		    DSM_MS_PER_TRANSACTION_WRITE, &ms_per_write,
 		    DSM_MS_PER_TRANSACTION_OTHER, &ms_per_other,
 		    DSM_BUSY_PCT, &busy_pct,
 		    DSM_QUEUE_LENGTH, &queue_len,
 		    DSM_TOTAL_DURATION, &total_duration,
 		    DSM_TOTAL_BUSY_TIME, &busy_time,
 		    DSM_NONE) != 0)
 			errx(1, "%s", devstat_errbuf);
 
 		if (perf_select != 0) {
 			dev_select[dn].bytes = total_bytes;
 			if ((dev_select[dn].selected == 0)
 			 || (dev_select[dn].selected > maxshowdevs))
 				continue;
 		}
 
 		if (Kflag > 0 || xflag > 0) {
 			int block_size = cur.dinfo->devices[di].block_size;
 			total_blocks = total_blocks * (block_size ?
 						       block_size : 512) / 1024;
 		}
 
 		if (xflag > 0) {
 			if (asprintf(&devicename, "%s%d",
 			    cur.dinfo->devices[di].device_name,
 			    cur.dinfo->devices[di].unit_number) == -1)
 				err(1, "asprintf");
 			/*
 			 * If zflag is set, skip any devices with zero I/O.
 			 */
 			if (zflag == 0 || transfers_per_second_read > 0.05 ||
 			    transfers_per_second_write > 0.05 ||
 			    mb_per_second_read > ((long double).0005)/1024 ||
 			    mb_per_second_write > ((long double).0005)/1024 ||
 			    busy_pct > 0.5) {
 				if (Iflag == 0)
 					printf("%-8.8s %7d %7d %8.1Lf "
 					    "%8.1Lf %5d %5d %5d %5d "
 					    "%4" PRIu64 " %3.0Lf ",
 					    devicename,
 					    (int)transfers_per_second_read,
 					    (int)transfers_per_second_write,
 					    mb_per_second_read * 1024,
 					    mb_per_second_write * 1024,
 					    (int)ms_per_read, (int)ms_per_write,
 					    (int)ms_per_other,
 					    (int)ms_per_transaction,
 					    queue_len, busy_pct);
 				else
 					printf("%-8.8s %11.1Lf %11.1Lf "
 					    "%12.1Lf %12.1Lf %4" PRIu64
 					    " %10.1Lf %9.1Lf ",
 					    devicename,
 					    (long double)total_transfers_read,
 					    (long double)total_transfers_write,
 					    (long double)
 					        total_bytes_read / 1024,
 					    (long double)
 					        total_bytes_write / 1024,
 					    queue_len,
 					    total_duration, busy_time);
 				if (firstline) {
 					/*
 					 * If this is the first device
 					 * we're printing, also print
 					 * CPU or TTY stats if requested.
 					 */
 					firstline = 0;
 					if (Tflag > 0)
 						printf("%4.0Lf%5.0Lf",
 						    cur.tk_nin / etime,
 						    cur.tk_nout / etime);
 				}
 				printf("\n");
 			}
 			free(devicename);
 		} else if (oflag > 0) {
 			int msdig = (ms_per_transaction < 100.0) ? 1 : 0;
 
 			if (Iflag == 0)
 				printf("%4.0Lf%4.0Lf%5.*Lf ",
 				       blocks_per_second,
 				       transfers_per_second,
 				       msdig,
 				       ms_per_transaction);
 			else
 				printf("%4.1" PRIu64 "%4.1" PRIu64 "%5.*Lf ",
 				       total_blocks,
 				       total_transfers,
 				       msdig,
 				       ms_per_transaction);
 		} else {
 			if (Iflag == 0)
-				printf(" %5.2Lf %3.0Lf %5.2Lf ",
+				printf(" %4.1Lf %4.0Lf %5.1Lf ",
 				       kb_per_transfer,
 				       transfers_per_second,
 				       mb_per_second);
 			else {
 				total_mb = total_bytes;
 				total_mb /= 1024 * 1024;
 
-				printf(" %5.2Lf %3.1" PRIu64 " %5.2Lf ",
+				printf(" %4.1Lf %4.1" PRIu64 " %5.2Lf ",
 				       kb_per_transfer,
 				       total_transfers,
 				       total_mb);
 			}
 		}
 	}
 	if (xflag > 0 && zflag > 0 && firstline == 1 &&
 	    (Tflag > 0 || Cflag > 0)) {
 		/*
 		 * If zflag is set and we did not print any device
 		 * lines I/O because they were all zero,
 		 * print TTY/CPU stats.
 		 */
 		printf("%52s","");
 		if (Tflag > 0)
 			printf("%4.0Lf %5.0Lf", cur.tk_nin / etime,
 			    cur.tk_nout / etime);
 		if (Cflag > 0)
 			cpustats();
 		printf("\n");
 	}
 }
 
 static void
 cpustats(void)
 {
 	int state;
 	double cptime;
 
 	cptime = 0.0;
 
 	for (state = 0; state < CPUSTATES; ++state)
 		cptime += cur.cp_time[state];
 	for (state = 0; state < CPUSTATES; ++state)
 		printf(" %2.0f",
 		       rint(100. * cur.cp_time[state] / (cptime ? cptime : 1)));
 }
 
 static int
 readvar(kvm_t *kd, const char *name, int nlid, void *ptr, size_t len)
 {
 	if (kd != NULL) {
 		ssize_t nbytes;
 
 		nbytes = kvm_read(kd, namelist[nlid].n_value, ptr, len);
 
 		if (nbytes < 0) {
 			warnx("kvm_read(%s): %s", namelist[nlid].n_name,
 			    kvm_geterr(kd));
 			return (1);
 		} else if ((size_t)nbytes != len) {
 			warnx("kvm_read(%s): expected %zu bytes, got %zd bytes",
 			      namelist[nlid].n_name, len, nbytes);
 			return (1);
 		}
 	} else {
 		size_t nlen = len;
 
 		if (sysctlbyname(name, ptr, &nlen, NULL, 0) == -1) {
 			warn("sysctl(%s...) failed", name);
 			return (1);
 		}
 		if (nlen != len) {
 			warnx("sysctl(%s...): expected %lu, got %lu", name,
 			      (unsigned long)len, (unsigned long)nlen);
 			return (1);
 		}
 	}
 	return (0);
 }
Index: projects/capsicum-test/usr.sbin/pmc/Makefile
===================================================================
--- projects/capsicum-test/usr.sbin/pmc/Makefile	(revision 345709)
+++ projects/capsicum-test/usr.sbin/pmc/Makefile	(revision 345710)
@@ -1,18 +1,19 @@
 #
 # $FreeBSD$
 #
 
 .include <src.opts.mk>
 PROG_CXX=	pmc
 MAN=	
 WARNS?=	3
-CXXFLAGS+= -O0 -std=c++14
+CXXFLAGS+= -O0
+CXXSTD= c++14
 CWARNFLAGS.gcc+= -Wno-redundant-decls
 
 LIBADD=	kvm pmc m ncursesw pmcstat elf
 
 SRCS=	pmc.c pmc_util.c cmd_pmc_stat.c \
 	cmd_pmc_list.c cmd_pmc_filter.cc \
 	cmd_pmc_summary.cc
 
 .include <bsd.prog.mk>
Index: projects/capsicum-test
===================================================================
--- projects/capsicum-test	(revision 345709)
+++ projects/capsicum-test	(revision 345710)

Property changes on: projects/capsicum-test
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,2 ##
   Merged /projects/fuse2:r344664,344703,344914,345304,345421
   Merged /head:r345653-345709