Index: projects/netbsd-tests-upstream-01-2017/lib/msun/tests/Makefile
===================================================================
--- projects/netbsd-tests-upstream-01-2017/lib/msun/tests/Makefile	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/lib/msun/tests/Makefile	(revision 312218)
@@ -1,105 +1,102 @@
 # $FreeBSD$
 
 .include <bsd.own.mk>
 
 TESTSRC=	${SRCTOP}/contrib/netbsd-tests/lib/libm
 
 # All architectures on FreeBSD have fenv.h
 CFLAGS+=	-DHAVE_FENV_H
 
 # For isqemu.h
 CFLAGS+=	-I${TESTSRC:H}/libc/gen
 
 # Not sure why this isn't defined for all architectures, since most
 # have long double.
 .if ${MACHINE_CPUARCH} == "aarch64" || \
     ${MACHINE_CPUARCH} == "amd64" || \
     ${MACHINE_CPUARCH} == "i386"
 CFLAGS+=	-D__HAVE_LONG_DOUBLE
 .endif
 
 NETBSD_ATF_TESTS_C=	acos_test
 NETBSD_ATF_TESTS_C+=	asin_test
 NETBSD_ATF_TESTS_C+=	atan_test
 NETBSD_ATF_TESTS_C+=	cbrt_test
 NETBSD_ATF_TESTS_C+=	ceil_test
 NETBSD_ATF_TESTS_C+=	casinh_test
 NETBSD_ATF_TESTS_C+=	cos_test
 NETBSD_ATF_TESTS_C+=	cosh_test
 NETBSD_ATF_TESTS_C+=	erf_test
 NETBSD_ATF_TESTS_C+=	exp_test
 NETBSD_ATF_TESTS_C+=	fmod_test
 NETBSD_ATF_TESTS_C+=	fe_round_test
 NETBSD_ATF_TESTS_C+=	infinity_test
 NETBSD_ATF_TESTS_C+=	ilogb_test
 NETBSD_ATF_TESTS_C+=	ldexp_test
 NETBSD_ATF_TESTS_C+=	log_test
 NETBSD_ATF_TESTS_C+=	pow_test
 NETBSD_ATF_TESTS_C+=	precision_test
 NETBSD_ATF_TESTS_C+=	round_test
 NETBSD_ATF_TESTS_C+=	scalbn_test
 NETBSD_ATF_TESTS_C+=	sin_test
 NETBSD_ATF_TESTS_C+=	sinh_test
 NETBSD_ATF_TESTS_C+=	sqrt_test
 NETBSD_ATF_TESTS_C+=	tan_test
 NETBSD_ATF_TESTS_C+=	tanh_test
 
 TAP_TESTS_C+=	cexp_test
 TAP_TESTS_C+=	conj_test
 .if ${MACHINE_CPUARCH} != "aarch64"
 # Hits an assert in llvm when building for arm64:
 # https://llvm.org/bugs/show_bug.cgi?id=26081
 TAP_TESTS_C+=	csqrt_test
 .endif
 TAP_TESTS_C+=	ctrig_test
 TAP_TESTS_C+=	exponential_test
 TAP_TESTS_C+=	fenv_test
 TAP_TESTS_C+=	fma_test
-# clang 3.8.0 fails always fails this test. See: bug 208703
-.if ! (${COMPILER_TYPE} == "clang" && ${COMPILER_VERSION} >= 30800)
 TAP_TESTS_C+=	fmaxmin_test
-.endif
 TAP_TESTS_C+=	ilogb2_test
 TAP_TESTS_C+=	invtrig_test
 TAP_TESTS_C+=	invctrig_test
 TAP_TESTS_C+=	logarithm_test
 TAP_TESTS_C+=	lrint_test
 # XXX: the testcase crashes on all platforms, but only on head
 # (bug 205451)
 #TAP_TESTS_C+=	lround_test
 TAP_TESTS_C+=	nan_test
 TAP_TESTS_C+=	nearbyint_test
 TAP_TESTS_C+=	next_test
 TAP_TESTS_C+=	rem_test
 TAP_TESTS_C+=	trig_test
 
 .if !empty(PROG) && !empty(TAP_TESTS_C:M${PROG})
 CFLAGS+=	-O0
 .endif
 
 CSTD=		c99
 
 #COPTS+=	-Wfloat-equal
 
 IGNORE_PRAGMA=
 
 SRCS.ilogb2_test=	ilogb_test.c
 
 LIBADD+=	m
 
 # Copied from lib/msun/Makefile
 .if ${MACHINE_CPUARCH} == "i386"
 ARCH_SUBDIR= i387
 .else
 ARCH_SUBDIR= ${MACHINE_CPUARCH}
 .endif
 
 .include "../${ARCH_SUBDIR}/Makefile.inc"
 
 # XXX: for some odd reason float.h doesn't tell the full story about what the
 # precision is.
 CFLAGS+=	-DLDBL_PREC=${LDBL_PREC}
 
 .include <netbsd-tests.test.mk>
 
 .include <bsd.test.mk>
Index: projects/netbsd-tests-upstream-01-2017/lib/msun/tests/fmaxmin_test.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/lib/msun/tests/fmaxmin_test.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/lib/msun/tests/fmaxmin_test.c	(revision 312218)
@@ -1,136 +1,153 @@
 /*-
  * Copyright (c) 2008 David Schultz <das@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Tests for fmax{,f,l}() and fmin{,f,l}.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <fenv.h>
 #include <float.h>
 #include <math.h>
 #include <stdio.h>
 
 #include "test-utils.h"
 
 #pragma STDC FENV_ACCESS ON
 
 /*
  * Test whether func(x, y) has the expected result, and make sure no
  * exceptions are raised.
  */
 #define	TEST(func, type, x, y, expected) do {				      \
 	type __x = (x);	/* convert before we clear exceptions */	      \
 	type __y = (y);							      \
 	feclearexcept(ALL_STD_EXCEPT);					      \
 	long double __result = func((__x), (__y));			      \
 	if (fetestexcept(ALL_STD_EXCEPT)) {				      \
 		fprintf(stderr, #func "(%.20Lg, %.20Lg) raised 0x%x\n",	      \
 			(x), (y), fetestexcept(FE_ALL_EXCEPT));		      \
 		ok = 0;							      \
 	}								      \
 	if (!fpequal(__result, (expected)))	{			      \
 		fprintf(stderr, #func "(%.20Lg, %.20Lg) = %.20Lg, "	      \
 			"expected %.20Lg\n", (x), (y), __result, (expected)); \
 		ok = 0;							      \
 	}								      \
 } while (0)
 
 int
 testall_r(long double big, long double small)
 {
 	int ok;
 
 	long double expected_max = isnan(big) ? small : big;
 	long double expected_min = isnan(small) ? big : small;
 	ok = 1;
 
 	TEST(fmaxf, float, big, small, expected_max);
 	TEST(fmaxf, float, small, big, expected_max);
 	TEST(fmax, double, big, small, expected_max);
 	TEST(fmax, double, small, big, expected_max);
 	TEST(fmaxl, long double, big, small, expected_max);
 	TEST(fmaxl, long double, small, big, expected_max);
 	TEST(fminf, float, big, small, expected_min);
 	TEST(fminf, float, small, big, expected_min);
 	TEST(fmin, double, big, small, expected_min);
 	TEST(fmin, double, small, big, expected_min);
 	TEST(fminl, long double, big, small, expected_min);
 	TEST(fminl, long double, small, big, expected_min);
 
 	return (ok);
 }
 
+const char *comment = NULL;
+
 /*
  * Test all the functions: fmaxf, fmax, fmaxl, fminf, fmin, and fminl,
  * in all rounding modes and with the arguments in different orders.
  * The input 'big' must be >= 'small'.
  */
 void
 testall(int testnum, long double big, long double small)
 {
 	static const int rmodes[] = {
 		FE_TONEAREST, FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO
 	};
 	int i;
 
 	for (i = 0; i < 4; i++) {
 		fesetround(rmodes[i]);
 		if (!testall_r(big, small)) {
 			fprintf(stderr, "FAILURE in rounding mode %d\n",
 				rmodes[i]);
 			break;
 		}
 	}
-	printf("%sok %d - big = %.20Lg, small = %.20Lg\n",
-	       (i == 4) ? "" : "not ", testnum, big, small);
+	printf("%sok %d - big = %.20Lg, small = %.20Lg%s\n",
+	       (i == 4) ? "" : "not ", testnum, big, small,
+	       comment == NULL ? "" : comment);
 }
 
+/* Clang 3.8.0+ fails the invariants for testcase 6, 7, 10, and 11. */
+#if defined(__clang__) && \
+    (__clang_major__ >= 3 && __clang_minor__ >= 8 && __clang_patchlevel__ >= 0)
+#define	affected_by_bug_208703
+#endif
+
 int
 main(int argc, char *argv[])
 {
 
 	printf("1..12\n");
 
 	testall(1, 1.0, 0.0);
 	testall(2, 42.0, nextafterf(42.0, -INFINITY));
 	testall(3, nextafterf(42.0, INFINITY), 42.0);
 	testall(4, -5.0, -5.0);
 	testall(5, -3.0, -4.0);
+#ifdef affected_by_bug_208703
+	comment = "# TODO: testcase 6-7 fails invariant with clang 3.8+ (bug 208703)";
+#endif
 	testall(6, 1.0, NAN);
 	testall(7, INFINITY, NAN);
+	comment = NULL;
 	testall(8, INFINITY, 1.0);
 	testall(9, -3.0, -INFINITY);
 	testall(10, 3.0, -INFINITY);
+#ifdef affected_by_bug_208703
+	comment = "# TODO: testcase 11-12 fails invariant with clang 3.8+ (bug 208703)";
+#endif
 	testall(11, NAN, NAN);
 
 	/* This test isn't strictly required to work by C99. */
 	testall(12, 0.0, -0.0);
+	comment = NULL;
 
 	return (0);
 }
Index: projects/netbsd-tests-upstream-01-2017/release/tools/ec2.conf
===================================================================
--- projects/netbsd-tests-upstream-01-2017/release/tools/ec2.conf	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/release/tools/ec2.conf	(revision 312218)
@@ -1,85 +1,90 @@
 #!/bin/sh
 #
 # $FreeBSD$
 #
 
 # Packages to install into the image we're creating.  This is a deliberately
 # minimalist set, providing only the packages necessary to bootstrap further
 # package installation as specified via EC2 user-data.
-export VM_EXTRA_PACKAGES="ec2-scripts firstboot-freebsd-update firstboot-pkgs"
+export VM_EXTRA_PACKAGES="ec2-scripts firstboot-freebsd-update firstboot-pkgs dual-dhclient"
 
 # Set to a list of third-party software to enable in rc.conf(5).
 export VM_RC_LIST="ec2_configinit ec2_fetchkey ec2_ephemeralswap ec2_loghostkey firstboot_freebsd_update firstboot_pkgs"
 
 # Build with a 1.5 GB UFS partition; the growfs rc.d script will expand
 # the partition to fill the root disk after the EC2 instance is launched.
 # Note that if this is set to <N>G, we will end up with an <N+1> GB disk
 # image since VMSIZE is the size of the UFS partition, not the disk which
 # it resides within.
 export VMSIZE=1536M
 
 # No swap space; the ec2_ephemeralswap rc.d script will allocate swap
 # space on EC2 ephemeral disks.  (If they exist -- the T2 low-cost instances
 # and the C4 compute-optimized instances don't have ephemeral disks.  But
 # it would be silly to bloat the image and increase costs for every instance
 # just for those two families, especially since instances ranging in size
 # from 1 GB of RAM to 60 GB of RAM would need different sizes of swap space
 # anyway.)
 export NOSWAP=YES
 
 vm_extra_pre_umount() {
 	# The firstboot_pkgs rc.d script will download the repository
 	# catalogue and install or update pkg when the instance first
 	# launches, so these files would just be replaced anyway; removing
 	# them from the image allows it to boot faster.
 	env ASSUME_ALWAYS_YES=yes pkg -c ${DESTDIR} delete -f -y pkg
 	rm ${DESTDIR}/var/db/pkg/repo-*.sqlite
 
 	# The size of the EC2 root disk can be configured at instance launch
 	# time; expand our filesystem to fill the disk.
 	echo 'growfs_enable="YES"' >> ${DESTDIR}/etc/rc.conf
 
-	# EC2 instances use DHCP to get their network configuration.
-	echo 'ifconfig_DEFAULT="SYNCDHCP"' >> ${DESTDIR}/etc/rc.conf
+	# EC2 instances use DHCP to get their network configuration.  IPv6
+	# requires accept_rtadv.
+	echo 'ifconfig_DEFAULT="SYNCDHCP accept_rtadv"' >> ${DESTDIR}/etc/rc.conf
 
 	# Unless the system has been configured via EC2 user-data, the user
 	# will need to SSH in to do anything.
 	echo 'sshd_enable="YES"' >> ${DESTDIR}/etc/rc.conf
 
 	# The AWS CLI tools are generally useful, and small enough that they
 	# will download quickly; but users will often override this setting
 	# via EC2 user-data.
 	echo 'firstboot_pkgs_list="awscli"' >> ${DESTDIR}/etc/rc.conf
+
+	# Enable IPv6 on all interfaces, and use DHCP on both IPv4 and IPv6.
+	echo 'ipv6_activate_all_interfaces="YES"' >> ${DESTDIR}/etc/rc.conf
+	echo 'dhclient_program="/usr/local/sbin/dual-dhclient"' >> ${DESTDIR}/etc/rc.conf
 
 	# The EC2 console is output-only, so while printing a backtrace can
 	# be useful, there's no point dropping into a debugger or waiting
 	# for a keypress.
 	echo 'debug.trace_on_panic=1' >> ${DESTDIR}/etc/sysctl.conf
 	echo 'debug.debugger_on_panic=0' >> ${DESTDIR}/etc/sysctl.conf
 	echo 'kern.panic_reboot_wait_time=0' >> ${DESTDIR}/etc/sysctl.conf
 
 	# The console is not interactive, so we might as well boot quickly.
 	echo 'autoboot_delay="-1"' >> ${DESTDIR}/boot/loader.conf
 	echo 'beastie_disable="YES"' >> ${DESTDIR}/boot/loader.conf
 
 	# EC2 has two consoles: An emulated serial port ("system log"),
 	# which has been present since 2006; and a VGA console ("instance
 	# screenshot") which was introduced in 2016.
 	echo 'boot_multicons="YES"' >> ${DESTDIR}/boot/loader.conf
 
 	# Some older EC2 hardware used a version of Xen with a bug in its
 	# emulated serial port.  It is not clear if EC2 still has any such
 	# nodes, but apply the workaround just in case.
 	echo 'hw.broken_txfifo="1"' >> ${DESTDIR}/boot/loader.conf
 
 	# The first time the AMI boots, the installed "first boot" scripts
 	# should be allowed to run:
 	# * ec2_configinit (download and process EC2 user-data)
 	# * ec2_fetchkey (arrange for SSH using the EC2-provided public key)
 	# * growfs (expand the filesystem to fill the provided disk)
 	# * firstboot_freebsd_update (install critical updates)
 	# * firstboot_pkgs (install packages)
 	touch ${DESTDIR}/firstboot
 
 	return 0
 }
Index: projects/netbsd-tests-upstream-01-2017/sys/ddb/db_input.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/sys/ddb/db_input.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/sys/ddb/db_input.c	(revision 312218)
@@ -1,367 +1,374 @@
 /*-
  * Mach Operating System
  * Copyright (c) 1991,1990 Carnegie Mellon University
  * All Rights Reserved.
  *
  * Permission to use, copy, modify and distribute this software and its
  * documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 /*
  *	Author: David B. Golub, Carnegie Mellon University
  *	Date:	7/90
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/cons.h>
 
 #include <ddb/ddb.h>
 #include <ddb/db_output.h>
 
 /*
  * Character input and editing.
  */
 
 /*
  * We don't track output position while editing input,
  * since input always ends with a new-line.  We just
  * reset the line position at the end.
  */
 static char *	db_lbuf_start;	/* start of input line buffer */
 static char *	db_lbuf_end;	/* end of input line buffer */
 static char *	db_lc;		/* current character */
 static char *	db_le;		/* one past last character */
 
 /*
  * Simple input line history support.
  */
 static char	db_lhistory[2048];
 static int	db_lhistlsize, db_lhistidx, db_lhistcur;
 static int	db_lhist_nlines;
 
 #define	CTRL(c)		((c) & 0x1f)
 #define	BLANK		' '
 #define	BACKUP		'\b'
 
+static int	cnmaygetc(void);
 static void	db_delete(int n, int bwd);
 static int	db_inputchar(int c);
 static void	db_putnchars(int c, int count);
 static void	db_putstring(char *s, int count);
 
 static void
 db_putstring(s, count)
 	char	*s;
 	int	count;
 {
 	while (--count >= 0)
 	    cnputc(*s++);
 }
 
 static void
 db_putnchars(c, count)
 	int	c;
 	int	count;
 {
 	while (--count >= 0)
 	    cnputc(c);
 }
 
 /*
  * Delete N characters, forward or backward
  */
 #define	DEL_FWD		0
 #define	DEL_BWD		1
 static void
 db_delete(n, bwd)
 	int	n;
 	int	bwd;
 {
 	char *p;
 
 	if (bwd) {
 	    db_lc -= n;
 	    db_putnchars(BACKUP, n);
 	}
 	for (p = db_lc; p < db_le-n; p++) {
 	    *p = *(p+n);
 	    cnputc(*p);
 	}
 	db_putnchars(BLANK, n);
 	db_putnchars(BACKUP, db_le - db_lc);
 	db_le -= n;
 }
 
 /* returns true at end-of-line */
 static int
 db_inputchar(c)
 	int	c;
 {
 	static int escstate;
 
 	if (escstate == 1) {
 		/* ESC seen, look for [ or O */
 		if (c == '[' || c == 'O')
 			escstate++;
 		else
 			escstate = 0; /* re-init state machine */
 		return (0);
 	} else if (escstate == 2) {
 		escstate = 0;
 		/*
 		 * If a valid cursor key has been found, translate
 		 * into an emacs-style control key, and fall through.
 		 * Otherwise, drop off.
 		 */
 		switch (c) {
 		case 'A':	/* up */
 			c = CTRL('p');
 			break;
 		case 'B':	/* down */
 			c = CTRL('n');
 			break;
 		case 'C':	/* right */
 			c = CTRL('f');
 			break;
 		case 'D':	/* left */
 			c = CTRL('b');
 			break;
 		default:
 			return (0);
 		}
 	}
 
 	switch (c) {
 	    case CTRL('['):
 		escstate = 1;
 		break;
 	    case CTRL('b'):
 		/* back up one character */
 		if (db_lc > db_lbuf_start) {
 		    cnputc(BACKUP);
 		    db_lc--;
 		}
 		break;
 	    case CTRL('f'):
 		/* forward one character */
 		if (db_lc < db_le) {
 		    cnputc(*db_lc);
 		    db_lc++;
 		}
 		break;
 	    case CTRL('a'):
 		/* beginning of line */
 		while (db_lc > db_lbuf_start) {
 		    cnputc(BACKUP);
 		    db_lc--;
 		}
 		break;
 	    case CTRL('e'):
 		/* end of line */
 		while (db_lc < db_le) {
 		    cnputc(*db_lc);
 		    db_lc++;
 		}
 		break;
 	    case CTRL('h'):
 	    case 0177:
 		/* erase previous character */
 		if (db_lc > db_lbuf_start)
 		    db_delete(1, DEL_BWD);
 		break;
 	    case CTRL('d'):
 		/* erase next character */
 		if (db_lc < db_le)
 		    db_delete(1, DEL_FWD);
 		break;
 	    case CTRL('u'):
 		/* kill entire line: */
 		/* at first, delete to beginning of line */
 		if (db_lc > db_lbuf_start)
 		    db_delete(db_lc - db_lbuf_start, DEL_BWD);
 		/* FALLTHROUGH */
 	    case CTRL('k'):
 		/* delete to end of line */
 		if (db_lc < db_le)
 		    db_delete(db_le - db_lc, DEL_FWD);
 		break;
 	    case CTRL('t'):
 		/* twiddle last 2 characters */
 		if (db_lc >= db_lbuf_start + 2) {
 		    c = db_lc[-2];
 		    db_lc[-2] = db_lc[-1];
 		    db_lc[-1] = c;
 		    cnputc(BACKUP);
 		    cnputc(BACKUP);
 		    cnputc(db_lc[-2]);
 		    cnputc(db_lc[-1]);
 		}
 		break;
 	    case CTRL('r'):
 		db_putstring("^R\n", 3);
 	    redraw:
 		if (db_le > db_lbuf_start) {
 		    db_putstring(db_lbuf_start, db_le - db_lbuf_start);
 		    db_putnchars(BACKUP, db_le - db_lc);
 		}
 		break;
 	    case CTRL('p'):
 		/* Make previous history line the active one. */
 		if (db_lhistcur >= 0) {
 		    bcopy(db_lhistory + db_lhistcur * db_lhistlsize,
 			  db_lbuf_start, db_lhistlsize);
 		    db_lhistcur--;
 		    goto hist_redraw;
 		}
 		break;
 	    case CTRL('n'):
 		/* Make next history line the active one. */
 		if (db_lhistcur < db_lhistidx - 1) {
 		    db_lhistcur += 2;
 		    bcopy(db_lhistory + db_lhistcur * db_lhistlsize,
 			  db_lbuf_start, db_lhistlsize);
 		} else {
 		    /*
 		     * ^N through tail of history, reset the
 		     * buffer to zero length.
 		     */
 		    *db_lbuf_start = '\0';
 		    db_lhistcur = db_lhistidx;
 		}
 
 	    hist_redraw:
 		db_putnchars(BACKUP, db_lc - db_lbuf_start);
 		db_putnchars(BLANK, db_le - db_lbuf_start);
 		db_putnchars(BACKUP, db_le - db_lbuf_start);
 		db_le = strchr(db_lbuf_start, '\0');
 		if (db_le[-1] == '\r' || db_le[-1] == '\n')
 		    *--db_le = '\0';
 		db_lc = db_le;
 		goto redraw;
 
 	    case -1:
 		/*
 		 * eek! the console returned eof.
 		 * probably that means we HAVE no console.. we should try bail
 		 * XXX
 		 */
 		c = '\r';
 	    case '\n':
 		/* FALLTHROUGH */
 	    case '\r':
 		*db_le++ = c;
 		return (1);
 	    default:
 		if (db_le == db_lbuf_end) {
 		    cnputc('\007');
 		}
 		else if (c >= ' ' && c <= '~') {
 		    char *p;
 
 		    for (p = db_le; p > db_lc; p--)
 			*p = *(p-1);
 		    *db_lc++ = c;
 		    db_le++;
 		    cnputc(c);
 		    db_putstring(db_lc, db_le - db_lc);
 		    db_putnchars(BACKUP, db_le - db_lc);
 		}
 		break;
 	}
 	return (0);
 }
 
+static int
+cnmaygetc()
+{
+	return (-1);
+}
+
 int
 db_readline(lstart, lsize)
 	char *	lstart;
 	int	lsize;
 {
 
 	if (lsize < 2)
 		return (0);
 	if (lsize != db_lhistlsize) {
 		/*
 		 * (Re)initialize input line history.  Throw away any
 		 * existing history.
 		 */
 		db_lhist_nlines = sizeof(db_lhistory) / lsize;
 		db_lhistlsize = lsize;
 		db_lhistidx = -1;
 	}
 	db_lhistcur = db_lhistidx;
 
 	db_force_whitespace();	/* synch output position */
 
 	db_lbuf_start = lstart;
 	db_lbuf_end   = lstart + lsize - 2;	/* Will append NL and NUL. */
 	db_lc = lstart;
 	db_le = lstart;
 
 	while (!db_inputchar(cngetc()))
 	    continue;
 
 	db_capture_write(lstart, db_le - db_lbuf_start);
 	db_printf("\n");	/* synch output position */
 	*db_le = 0;
 
 	if (db_le - db_lbuf_start > 1) {
 	    /* Maintain input line history for non-empty lines. */
 	    if (++db_lhistidx == db_lhist_nlines) {
 		/* Rotate history. */
 		bcopy(db_lhistory + db_lhistlsize, db_lhistory,
 		      db_lhistlsize * (db_lhist_nlines - 1));
 		db_lhistidx--;
 	    }
 	    bcopy(lstart, db_lhistory + db_lhistidx * db_lhistlsize,
 		  db_lhistlsize);
 	}
 
 	return (db_le - db_lbuf_start);
 }
 
 void
 db_check_interrupt(void)
 {
 	int	c;
 
-	c = cncheckc();
+	c = cnmaygetc();
 	switch (c) {
 	    case -1:		/* no character */
 		return;
 
 	    case CTRL('c'):
 		db_error((char *)0);
 		/*NOTREACHED*/
 
 	    case CTRL('s'):
 		do {
-		    c = cncheckc();
+		    c = cnmaygetc();
 		    if (c == CTRL('c'))
 			db_error((char *)0);
 		} while (c != CTRL('q'));
 		break;
 
 	    default:
 		/* drop on floor */
 		break;
 	}
 }
Index: projects/netbsd-tests-upstream-01-2017/sys/dev/etherswitch/micrel/ksz8995ma.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/sys/dev/etherswitch/micrel/ksz8995ma.c	(nonexistent)
+++ projects/netbsd-tests-upstream-01-2017/sys/dev/etherswitch/micrel/ksz8995ma.c	(revision 312218)
@@ -0,0 +1,960 @@
+/*-
+ * Copyright (c) 2016 Hiroki Mori
+ * Copyright (c) 2013 Luiz Otavio O Souza.
+ * Copyright (c) 2011-2012 Stefan Bethke.
+ * Copyright (c) 2012 Adrian Chadd.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This is Micrel KSZ8995MA driver code. KSZ8995MA use SPI bus on control.
+ * This code development on @SRCHACK's ksz8995ma board and FON2100 with
+ * gpiospi.
+ * etherswitchcfg command port option support addtag, ingress, striptag, 
+ * dropuntagged.
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/errno.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/ethernet.h>
+#include <net/if_media.h>
+#include <net/if_types.h>
+
+#include <machine/bus.h>
+#include <dev/mii/mii.h>
+#include <dev/mii/miivar.h>
+
+#include <dev/etherswitch/etherswitch.h>
+
+#include <dev/spibus/spi.h>
+
+#include "spibus_if.h"
+#include "miibus_if.h"
+#include "etherswitch_if.h"
+
+#define	KSZ8995MA_SPI_READ		0x03
+#define	KSZ8995MA_SPI_WRITE		0x02
+
+#define	KSZ8995MA_CID0			0x00
+#define	KSZ8995MA_CID1			0x01
+
+#define	KSZ8995MA_GC0			0x02
+#define	KSZ8995MA_GC1			0x03
+#define	KSZ8995MA_GC2			0x04
+#define	KSZ8995MA_GC3			0x05
+
+#define	KSZ8995MA_PORT_SIZE		0x10
+
+#define	KSZ8995MA_PC0_BASE		0x10
+#define	KSZ8995MA_PC1_BASE		0x11
+#define	KSZ8995MA_PC2_BASE		0x12
+#define	KSZ8995MA_PC3_BASE		0x13
+#define	KSZ8995MA_PC4_BASE		0x14
+#define	KSZ8995MA_PC5_BASE		0x15
+#define	KSZ8995MA_PC6_BASE		0x16
+#define	KSZ8995MA_PC7_BASE		0x17
+#define	KSZ8995MA_PC8_BASE		0x18
+#define	KSZ8995MA_PC9_BASE		0x19
+#define	KSZ8995MA_PC10_BASE		0x1a
+#define	KSZ8995MA_PC11_BASE		0x1b
+#define	KSZ8995MA_PC12_BASE		0x1c
+#define	KSZ8995MA_PC13_BASE		0x1d
+
+#define	KSZ8995MA_PS0_BASE		0x1e
+
+#define	KSZ8995MA_PC14_BASE		0x1f
+
+#define	KSZ8995MA_IAC0			0x6e
+#define	KSZ8995MA_IAC1			0x6f
+#define	KSZ8995MA_IDR8			0x70
+#define	KSZ8995MA_IDR7			0x71
+#define	KSZ8995MA_IDR6			0x72
+#define	KSZ8995MA_IDR5			0x73
+#define	KSZ8995MA_IDR4			0x74
+#define	KSZ8995MA_IDR3			0x75
+#define	KSZ8995MA_IDR2			0x76
+#define	KSZ8995MA_IDR1			0x77
+#define	KSZ8995MA_IDR0			0x78
+
+#define	KSZ8995MA_FAMILI_ID		0x95
+#define	KSZ8995MA_CHIP_ID		0x00
+#define	KSZ8995MA_CHIP_ID_MASK		0xf0
+#define	KSZ8995MA_START			0x01
+#define	KSZ8995MA_VLAN_ENABLE		0x80
+#define	KSZ8995MA_TAG_INS		0x04
+#define	KSZ8995MA_TAG_RM		0x02
+#define	KSZ8995MA_INGR_FILT		0x40
+#define	KSZ8995MA_DROP_NONPVID		0x20
+
+#define	KSZ8995MA_PDOWN			0x08
+#define	KSZ8995MA_STARTNEG		0x20
+
+#define	KSZ8995MA_MII_STAT		0x7808
+#define	KSZ8995MA_MII_PHYID_H		0x0022
+#define	KSZ8995MA_MII_PHYID_L		0x1450
+#define	KSZ8995MA_MII_AA		0x0401
+
+#define	KSZ8995MA_VLAN_TABLE_VALID	0x20
+#define	KSZ8995MA_VLAN_TABLE_READ	0x14
+#define	KSZ8995MA_VLAN_TABLE_WRITE	0x04
+
+#define	KSZ8995MA_MAX_PORT		5
+
+MALLOC_DECLARE(M_KSZ8995MA);
+MALLOC_DEFINE(M_KSZ8995MA, "ksz8995ma", "ksz8995ma data structures");
+
+struct ksz8995ma_softc {
+	struct mtx	sc_mtx;		/* serialize access to softc */
+	device_t	sc_dev;
+	int		vlan_mode;
+	int		media;		/* cpu port media */
+	int		cpuport;	/* which PHY is connected to the CPU */
+	int		phymask;	/* PHYs we manage */
+	int		numports;	/* number of ports */
+	int		ifpport[KSZ8995MA_MAX_PORT];
+	int		*portphy;
+	char		**ifname;
+	device_t	**miibus;
+	struct ifnet	**ifp;
+	struct callout	callout_tick;
+	etherswitch_info_t	info;
+};
+
+#define	KSZ8995MA_LOCK(_sc)			\
+	    mtx_lock(&(_sc)->sc_mtx)
+#define	KSZ8995MA_UNLOCK(_sc)			\
+	    mtx_unlock(&(_sc)->sc_mtx)
+#define	KSZ8995MA_LOCK_ASSERT(_sc, _what)	\
+	    mtx_assert(&(_sc)->sc_mtx, (_what))
+#define	KSZ8995MA_TRYLOCK(_sc)			\
+	    mtx_trylock(&(_sc)->sc_mtx)
+
+#if defined(DEBUG)
+#define	DPRINTF(dev, args...) device_printf(dev, args)
+#else
+#define	DPRINTF(dev, args...)
+#endif
+
+static inline int ksz8995ma_portforphy(struct ksz8995ma_softc *, int);
+static void ksz8995ma_tick(void *);
+static int ksz8995ma_ifmedia_upd(struct ifnet *);
+static void ksz8995ma_ifmedia_sts(struct ifnet *, struct ifmediareq *);
+static int ksz8995ma_readreg(device_t dev, int addr);
+static int ksz8995ma_writereg(device_t dev, int addr, int value);
+static void ksz8995ma_portvlanreset(device_t dev);
+
+static int
+ksz8995ma_probe(device_t dev)
+{
+	int id0, id1;
+	struct ksz8995ma_softc *sc;
+
+	sc = device_get_softc(dev);
+	bzero(sc, sizeof(*sc));
+
+	id0 = ksz8995ma_readreg(dev, KSZ8995MA_CID0);
+	id1 = ksz8995ma_readreg(dev, KSZ8995MA_CID1);
+	if (bootverbose)
+		device_printf(dev,"Chip Identifier Register %x %x\n", id0, id1);
+
+	/* check Product Code */
+	if (id0 != KSZ8995MA_FAMILI_ID || (id1 & KSZ8995MA_CHIP_ID_MASK) !=
+	    KSZ8995MA_CHIP_ID) {
+		return (ENXIO);
+	}
+
+	device_set_desc_copy(dev, "Micrel KSZ8995MA SPI switch driver");
+	return (BUS_PROBE_DEFAULT);
+}
+
+static int
+ksz8995ma_attach_phys(struct ksz8995ma_softc *sc)
+{
+	int phy, port, err;
+	char name[IFNAMSIZ];
+
+	port = 0;
+	err = 0;
+	/* PHYs need an interface, so we generate a dummy one */
+	snprintf(name, IFNAMSIZ, "%sport", device_get_nameunit(sc->sc_dev));
+	for (phy = 0; phy < sc->numports; phy++) {
+		if (phy == sc->cpuport)
+			continue;
+		if (((1 << phy) & sc->phymask) == 0)
+			continue;
+		sc->ifpport[phy] = port;
+		sc->portphy[port] = phy;
+		sc->ifp[port] = if_alloc(IFT_ETHER);
+		sc->ifp[port]->if_softc = sc;
+		sc->ifp[port]->if_flags |= IFF_UP | IFF_BROADCAST |
+		    IFF_DRV_RUNNING | IFF_SIMPLEX;
+		if_initname(sc->ifp[port], name, port);
+		sc->miibus[port] = malloc(sizeof(device_t), M_KSZ8995MA,
+		    M_WAITOK | M_ZERO);
+		if (sc->miibus[port] == NULL) {
+			err = ENOMEM;
+			goto failed;
+		}
+		err = mii_attach(sc->sc_dev, sc->miibus[port], sc->ifp[port],
+		    ksz8995ma_ifmedia_upd, ksz8995ma_ifmedia_sts, \
+		    BMSR_DEFCAPMASK, phy, MII_OFFSET_ANY, 0);
+		DPRINTF(sc->sc_dev, "%s attached to pseudo interface %s\n",
+		    device_get_nameunit(*sc->miibus[port]),
+		    sc->ifp[port]->if_xname);
+		if (err != 0) {
+			device_printf(sc->sc_dev,
+			    "attaching PHY %d failed\n",
+			    phy);
+			goto failed;
+		}
+		++port;
+	}
+	sc->info.es_nports = port;
+	if (sc->cpuport != -1) {
+		/* cpu port is MAC5 on ksz8995ma */ 
+		sc->ifpport[sc->cpuport] = port;
+		sc->portphy[port] = sc->cpuport;
+		++sc->info.es_nports;
+	}
+
+	return (0);
+
+failed:
+	for (phy = 0; phy < sc->numports; phy++) {
+		if (((1 << phy) & sc->phymask) == 0)
+			continue;
+		port = ksz8995ma_portforphy(sc, phy);
+		if (sc->miibus[port] != NULL)
+			device_delete_child(sc->sc_dev, (*sc->miibus[port]));
+		if (sc->ifp[port] != NULL)
+			if_free(sc->ifp[port]);
+		if (sc->ifname[port] != NULL)
+			free(sc->ifname[port], M_KSZ8995MA);
+		if (sc->miibus[port] != NULL)
+			free(sc->miibus[port], M_KSZ8995MA);
+	}
+	return (err);
+}
+
+static int
+ksz8995ma_attach(device_t dev)
+{
+	struct ksz8995ma_softc	*sc;
+	int			 err, reg;
+
+	err = 0;
+	sc = device_get_softc(dev);
+
+	sc->sc_dev = dev;
+	mtx_init(&sc->sc_mtx, "ksz8995ma", NULL, MTX_DEF);
+	strlcpy(sc->info.es_name, device_get_desc(dev),
+	    sizeof(sc->info.es_name));
+
+	/* KSZ8995MA Defaults */
+	sc->numports = KSZ8995MA_MAX_PORT;
+	sc->phymask = (1 << (KSZ8995MA_MAX_PORT + 1)) - 1;
+	sc->cpuport = -1;
+	sc->media = 100;
+
+	(void) resource_int_value(device_get_name(dev), device_get_unit(dev),
+	    "cpuport", &sc->cpuport);
+
+	sc->info.es_nvlangroups = 16;
+	sc->info.es_vlan_caps = ETHERSWITCH_VLAN_PORT | ETHERSWITCH_VLAN_DOT1Q;
+
+	sc->ifp = malloc(sizeof(struct ifnet *) * sc->numports, M_KSZ8995MA,
+	    M_WAITOK | M_ZERO);
+	sc->ifname = malloc(sizeof(char *) * sc->numports, M_KSZ8995MA,
+	    M_WAITOK | M_ZERO);
+	sc->miibus = malloc(sizeof(device_t *) * sc->numports, M_KSZ8995MA,
+	    M_WAITOK | M_ZERO);
+	sc->portphy = malloc(sizeof(int) * sc->numports, M_KSZ8995MA,
+	    M_WAITOK | M_ZERO);
+
+	if (sc->ifp == NULL || sc->ifname == NULL || sc->miibus == NULL ||
+	    sc->portphy == NULL) {
+		err = ENOMEM;
+		goto failed;
+	}
+
+	/*
+	 * Attach the PHYs and complete the bus enumeration.
+	 */
+	err = ksz8995ma_attach_phys(sc);
+	if (err != 0)
+		goto failed;
+
+	bus_generic_probe(dev);
+	bus_enumerate_hinted_children(dev);
+	err = bus_generic_attach(dev);
+	if (err != 0)
+		goto failed;
+	
+	callout_init(&sc->callout_tick, 0);
+
+	ksz8995ma_tick(sc);
+	
+	/* start switch */
+	sc->vlan_mode = 0;
+	reg = ksz8995ma_readreg(dev, KSZ8995MA_GC3);
+	ksz8995ma_writereg(dev, KSZ8995MA_GC3, 
+	    reg & ~KSZ8995MA_VLAN_ENABLE);
+	ksz8995ma_portvlanreset(dev);
+	ksz8995ma_writereg(dev, KSZ8995MA_CID1, KSZ8995MA_START);
+
+	return (0);
+
+failed:
+	if (sc->portphy != NULL)
+		free(sc->portphy, M_KSZ8995MA);
+	if (sc->miibus != NULL)
+		free(sc->miibus, M_KSZ8995MA);
+	if (sc->ifname != NULL)
+		free(sc->ifname, M_KSZ8995MA);
+	if (sc->ifp != NULL)
+		free(sc->ifp, M_KSZ8995MA);
+
+	return (err);
+}
+
+static int
+ksz8995ma_detach(device_t dev)
+{
+	struct ksz8995ma_softc	*sc;
+	int			 i, port;
+
+	sc = device_get_softc(dev);
+
+	callout_drain(&sc->callout_tick);
+
+	for (i = 0; i < KSZ8995MA_MAX_PORT; i++) {
+		if (((1 << i) & sc->phymask) == 0)
+			continue;
+		port = ksz8995ma_portforphy(sc, i);
+		if (sc->miibus[port] != NULL)
+			device_delete_child(dev, (*sc->miibus[port]));
+		if (sc->ifp[port] != NULL)
+			if_free(sc->ifp[port]);
+		free(sc->ifname[port], M_KSZ8995MA);
+		free(sc->miibus[port], M_KSZ8995MA);
+	}
+
+	free(sc->portphy, M_KSZ8995MA);
+	free(sc->miibus, M_KSZ8995MA);
+	free(sc->ifname, M_KSZ8995MA);
+	free(sc->ifp, M_KSZ8995MA);
+
+	bus_generic_detach(dev);
+	mtx_destroy(&sc->sc_mtx);
+
+	return (0);
+}
+
+/*
+ * Convert PHY number to port number.
+ */
+static inline int
+ksz8995ma_portforphy(struct ksz8995ma_softc *sc, int phy)
+{
+
+	return (sc->ifpport[phy]);
+}
+
+static inline struct mii_data *
+ksz8995ma_miiforport(struct ksz8995ma_softc *sc, int port)
+{
+
+	if (port < 0 || port > sc->numports)
+		return (NULL);
+	if (port == sc->cpuport)
+		return (NULL);
+	return (device_get_softc(*sc->miibus[port]));
+}
+
+static inline struct ifnet *
+ksz8995ma_ifpforport(struct ksz8995ma_softc *sc, int port)
+{
+
+	if (port < 0 || port > sc->numports)
+		return (NULL);
+	return (sc->ifp[port]);
+}
+
+/*
+ * Poll the status for all PHYs.
+ */
+static void
+ksz8995ma_miipollstat(struct ksz8995ma_softc *sc)
+{
+	int i, port;
+	struct mii_data *mii;
+	struct mii_softc *miisc;
+
+	KSZ8995MA_LOCK_ASSERT(sc, MA_NOTOWNED);
+
+	for (i = 0; i < KSZ8995MA_MAX_PORT; i++) {
+		if (i == sc->cpuport)
+			continue;
+		if (((1 << i) & sc->phymask) == 0)
+			continue;
+		port = ksz8995ma_portforphy(sc, i);
+		if ((*sc->miibus[port]) == NULL)
+			continue;
+		mii = device_get_softc(*sc->miibus[port]);
+		LIST_FOREACH(miisc, &mii->mii_phys, mii_list) {
+			if (IFM_INST(mii->mii_media.ifm_cur->ifm_media) !=
+			    miisc->mii_inst)
+				continue;
+			ukphy_status(miisc);
+			mii_phy_update(miisc, MII_POLLSTAT);
+		}
+	}
+}
+
+static void
+ksz8995ma_tick(void *arg)
+{
+	struct ksz8995ma_softc *sc;
+
+	sc = arg;
+
+	ksz8995ma_miipollstat(sc);
+	callout_reset(&sc->callout_tick, hz, ksz8995ma_tick, sc);
+}
+
+static void
+ksz8995ma_lock(device_t dev)
+{
+	struct ksz8995ma_softc *sc;
+
+	sc = device_get_softc(dev);
+
+	KSZ8995MA_LOCK_ASSERT(sc, MA_NOTOWNED);
+	KSZ8995MA_LOCK(sc);
+}
+
+static void
+ksz8995ma_unlock(device_t dev)
+{
+	struct ksz8995ma_softc *sc;
+
+	sc = device_get_softc(dev);
+
+	KSZ8995MA_LOCK_ASSERT(sc, MA_OWNED);
+	KSZ8995MA_UNLOCK(sc);
+}
+
+static etherswitch_info_t *
+ksz8995ma_getinfo(device_t dev)
+{
+	struct ksz8995ma_softc *sc;
+
+	sc = device_get_softc(dev);
+	
+	return (&sc->info);
+}
+
+static int
+ksz8995ma_getport(device_t dev, etherswitch_port_t *p)
+{
+	struct ksz8995ma_softc *sc;
+	struct mii_data *mii;
+	struct ifmediareq *ifmr;
+	int phy, err;
+	int tag1, tag2, portreg;
+
+	sc = device_get_softc(dev);
+	ifmr = &p->es_ifmr;
+
+	if (p->es_port < 0 || p->es_port >= sc->numports)
+		return (ENXIO);
+
+	if (sc->vlan_mode == ETHERSWITCH_VLAN_DOT1Q) {
+		tag1 = ksz8995ma_readreg(dev, KSZ8995MA_PC3_BASE + 
+		    KSZ8995MA_PORT_SIZE * p->es_port);
+		tag2 = ksz8995ma_readreg(dev, KSZ8995MA_PC4_BASE + 
+		    KSZ8995MA_PORT_SIZE * p->es_port);
+		p->es_pvid = (tag1 & 0x0f) << 8 | tag2;
+
+		portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC0_BASE + 
+		    KSZ8995MA_PORT_SIZE * p->es_port);
+		if (portreg & KSZ8995MA_TAG_INS)
+			p->es_flags |= ETHERSWITCH_PORT_ADDTAG;
+		if (portreg & KSZ8995MA_TAG_RM)
+			p->es_flags |= ETHERSWITCH_PORT_STRIPTAG;
+
+		portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC2_BASE + 
+		    KSZ8995MA_PORT_SIZE * p->es_port);
+		if (portreg & KSZ8995MA_DROP_NONPVID)
+			p->es_flags |= ETHERSWITCH_PORT_DROPUNTAGGED;
+		if (portreg & KSZ8995MA_INGR_FILT)
+			p->es_flags |= ETHERSWITCH_PORT_INGRESS;
+	}
+
+	phy = sc->portphy[p->es_port];
+	mii = ksz8995ma_miiforport(sc, p->es_port);
+	if (sc->cpuport != -1 && phy == sc->cpuport) {
+		/* fill in fixed values for CPU port */
+		p->es_flags |= ETHERSWITCH_PORT_CPU;
+		ifmr->ifm_count = 0;
+		if (sc->media == 100)
+			ifmr->ifm_current = ifmr->ifm_active =
+			    IFM_ETHER | IFM_100_TX | IFM_FDX;
+		else
+			ifmr->ifm_current = ifmr->ifm_active =
+			    IFM_ETHER | IFM_1000_T | IFM_FDX;
+		ifmr->ifm_mask = 0;
+		ifmr->ifm_status = IFM_ACTIVE | IFM_AVALID;
+	} else if (mii != NULL) {
+		err = ifmedia_ioctl(mii->mii_ifp, &p->es_ifr,
+		    &mii->mii_media, SIOCGIFMEDIA);
+		if (err)
+			return (err);
+	} else {
+		return (ENXIO);
+	}
+
+	return (0);
+}
+
+static int
+ksz8995ma_setport(device_t dev, etherswitch_port_t *p)
+{
+	struct ksz8995ma_softc *sc;
+	struct mii_data *mii;
+        struct ifmedia *ifm;
+        struct ifnet *ifp;
+	int phy, err;
+	int portreg;
+
+	sc = device_get_softc(dev);
+
+	if (p->es_port < 0 || p->es_port >= sc->numports)
+		return (ENXIO);
+
+	if (sc->vlan_mode == ETHERSWITCH_VLAN_DOT1Q) {
+		ksz8995ma_writereg(dev, KSZ8995MA_PC4_BASE + 
+		    KSZ8995MA_PORT_SIZE * p->es_port, p->es_pvid & 0xff);
+		portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC3_BASE + 
+		    KSZ8995MA_PORT_SIZE * p->es_port);
+		ksz8995ma_writereg(dev, KSZ8995MA_PC3_BASE + 
+		    KSZ8995MA_PORT_SIZE * p->es_port,
+		    (portreg & 0xf0) | ((p->es_pvid >> 8) & 0x0f));
+
+		portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC0_BASE + 
+		    KSZ8995MA_PORT_SIZE * p->es_port);
+		if (p->es_flags & ETHERSWITCH_PORT_ADDTAG)
+			portreg |= KSZ8995MA_TAG_INS;
+		else
+			portreg &= ~KSZ8995MA_TAG_INS;
+		if (p->es_flags & ETHERSWITCH_PORT_STRIPTAG) 
+			portreg |= KSZ8995MA_TAG_RM;
+		else
+			portreg &= ~KSZ8995MA_TAG_RM;
+		ksz8995ma_writereg(dev, KSZ8995MA_PC0_BASE + 
+		    KSZ8995MA_PORT_SIZE * p->es_port, portreg);
+
+		portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC2_BASE + 
+		    KSZ8995MA_PORT_SIZE * p->es_port);
+		if (p->es_flags & ETHERSWITCH_PORT_DROPUNTAGGED)
+			portreg |= KSZ8995MA_DROP_NONPVID;
+		else
+			portreg &= ~KSZ8995MA_DROP_NONPVID;
+		if (p->es_flags & ETHERSWITCH_PORT_INGRESS)
+			portreg |= KSZ8995MA_INGR_FILT;
+		else
+			portreg &= ~KSZ8995MA_INGR_FILT;
+		ksz8995ma_writereg(dev, KSZ8995MA_PC2_BASE + 
+		    KSZ8995MA_PORT_SIZE * p->es_port, portreg);
+	}
+
+	phy = sc->portphy[p->es_port];
+	mii = ksz8995ma_miiforport(sc, p->es_port);
+	if (phy != sc->cpuport) {
+		if (mii == NULL)
+			return (ENXIO);
+		ifp = ksz8995ma_ifpforport(sc, p->es_port);
+		ifm = &mii->mii_media;
+		err = ifmedia_ioctl(ifp, &p->es_ifr, ifm, SIOCSIFMEDIA);
+	}
+	return (0);
+}
+
+static int
+ksz8995ma_getvgroup(device_t dev, etherswitch_vlangroup_t *vg)
+{
+	int data0, data1, data2;
+	int vlantab;
+	struct ksz8995ma_softc *sc;
+
+	sc = device_get_softc(dev);
+
+	if (sc->vlan_mode == ETHERSWITCH_VLAN_PORT) {
+		if (vg->es_vlangroup < sc->numports) {
+			vg->es_vid = ETHERSWITCH_VID_VALID;
+			vg->es_vid |= vg->es_vlangroup;
+			data0 = ksz8995ma_readreg(dev, KSZ8995MA_PC1_BASE +
+			    KSZ8995MA_PORT_SIZE * vg->es_vlangroup);
+			vg->es_member_ports = data0 & 0x1f;
+			vg->es_untagged_ports = vg->es_member_ports;
+			vg->es_fid = 0;
+		} else {
+			vg->es_vid = 0;
+		}
+	} else if (sc->vlan_mode == ETHERSWITCH_VLAN_DOT1Q) {
+		ksz8995ma_writereg(dev, KSZ8995MA_IAC0,
+		    KSZ8995MA_VLAN_TABLE_READ);
+		ksz8995ma_writereg(dev, KSZ8995MA_IAC1, vg->es_vlangroup);
+		data2 = ksz8995ma_readreg(dev, KSZ8995MA_IDR2);
+		data1 = ksz8995ma_readreg(dev, KSZ8995MA_IDR1);
+		data0 = ksz8995ma_readreg(dev, KSZ8995MA_IDR0);
+		vlantab = data2 << 16 | data1 << 8 | data0;
+		if (data2 & KSZ8995MA_VLAN_TABLE_VALID) {
+			vg->es_vid = ETHERSWITCH_VID_VALID;
+			vg->es_vid |= vlantab & 0xfff;
+			vg->es_member_ports = (vlantab >> 16) & 0x1f;
+			vg->es_untagged_ports = vg->es_member_ports;
+			vg->es_fid = (vlantab >> 12) & 0x0f;
+		} else {
+			vg->es_fid = 0;
+		}
+	}
+	
+	return (0);
+}
+
+static int
+ksz8995ma_setvgroup(device_t dev, etherswitch_vlangroup_t *vg)
+{
+	struct ksz8995ma_softc *sc;
+	int data0;
+
+	sc = device_get_softc(dev);
+
+	if (sc->vlan_mode == ETHERSWITCH_VLAN_PORT) {
+		data0 = ksz8995ma_readreg(dev, KSZ8995MA_PC1_BASE +
+		    KSZ8995MA_PORT_SIZE * vg->es_vlangroup);
+		ksz8995ma_writereg(dev, KSZ8995MA_PC1_BASE +
+		    KSZ8995MA_PORT_SIZE * vg->es_vlangroup,
+		    (data0 & 0xe0) | (vg->es_member_ports & 0x1f));
+	} else if (sc->vlan_mode == ETHERSWITCH_VLAN_DOT1Q) {
+		if (vg->es_member_ports != 0) {
+			ksz8995ma_writereg(dev, KSZ8995MA_IDR2,
+			    KSZ8995MA_VLAN_TABLE_VALID |
+			    (vg->es_member_ports & 0x1f));
+			ksz8995ma_writereg(dev, KSZ8995MA_IDR1,
+			    vg->es_fid << 4 | vg->es_vid >> 8);
+			ksz8995ma_writereg(dev, KSZ8995MA_IDR0,
+			    vg->es_vid & 0xff);
+		} else {
+			ksz8995ma_writereg(dev, KSZ8995MA_IDR2, 0);
+			ksz8995ma_writereg(dev, KSZ8995MA_IDR1, 0);
+			ksz8995ma_writereg(dev, KSZ8995MA_IDR0, 0);
+		}
+		ksz8995ma_writereg(dev, KSZ8995MA_IAC0,
+		    KSZ8995MA_VLAN_TABLE_WRITE);
+		ksz8995ma_writereg(dev, KSZ8995MA_IAC1, vg->es_vlangroup);
+	}
+
+	return (0);
+}
+
+static int
+ksz8995ma_getconf(device_t dev, etherswitch_conf_t *conf)
+{
+	struct ksz8995ma_softc *sc;
+
+	sc = device_get_softc(dev);
+
+	/* Return the VLAN mode. */
+	conf->cmd = ETHERSWITCH_CONF_VLAN_MODE;
+	conf->vlan_mode = sc->vlan_mode;
+
+	return (0);
+}
+
+static void 
+ksz8995ma_portvlanreset(device_t dev)
+{
+	int i, data;
+	struct ksz8995ma_softc *sc;
+
+	sc = device_get_softc(dev);
+
+	for (i = 0; i < sc->numports; ++i) {
+		data = ksz8995ma_readreg(dev, KSZ8995MA_PC1_BASE +
+		    KSZ8995MA_PORT_SIZE * i);
+		ksz8995ma_writereg(dev, KSZ8995MA_PC1_BASE +
+		    KSZ8995MA_PORT_SIZE * i, (data & 0xe0) | 0x1f);
+	}
+}
+
+static int
+ksz8995ma_setconf(device_t dev, etherswitch_conf_t *conf)
+{
+	int reg;
+	struct ksz8995ma_softc *sc;
+
+	sc = device_get_softc(dev);
+
+	if ((conf->cmd & ETHERSWITCH_CONF_VLAN_MODE) == 0)
+		return (0);
+
+	if (conf->vlan_mode == ETHERSWITCH_VLAN_PORT) {
+		sc->vlan_mode = ETHERSWITCH_VLAN_PORT;
+		reg = ksz8995ma_readreg(dev, KSZ8995MA_GC3);
+		ksz8995ma_writereg(dev, KSZ8995MA_GC3, 
+		    reg & ~KSZ8995MA_VLAN_ENABLE);
+		ksz8995ma_portvlanreset(dev);
+	} else if (conf->vlan_mode == ETHERSWITCH_VLAN_DOT1Q) {
+		sc->vlan_mode = ETHERSWITCH_VLAN_DOT1Q;
+		reg = ksz8995ma_readreg(dev, KSZ8995MA_GC3);
+		ksz8995ma_writereg(dev, KSZ8995MA_GC3, 
+		    reg | KSZ8995MA_VLAN_ENABLE);
+	} else {
+		sc->vlan_mode = 0;
+		reg = ksz8995ma_readreg(dev, KSZ8995MA_GC3);
+		ksz8995ma_writereg(dev, KSZ8995MA_GC3, 
+		    reg & ~KSZ8995MA_VLAN_ENABLE);
+		ksz8995ma_portvlanreset(dev);
+	}
+	return (0);
+}
+
+static void
+ksz8995ma_statchg(device_t dev)
+{
+
+	DPRINTF(dev, "%s\n", __func__);
+}
+
+static int
+ksz8995ma_ifmedia_upd(struct ifnet *ifp)
+{
+	struct ksz8995ma_softc *sc;
+	struct mii_data *mii;
+
+	sc = ifp->if_softc;
+	mii = ksz8995ma_miiforport(sc, ifp->if_dunit);
+
+	DPRINTF(sc->sc_dev, "%s\n", __func__);
+	if (mii == NULL)
+		return (ENXIO);
+	mii_mediachg(mii);
+	return (0);
+}
+
+static void
+ksz8995ma_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
+{
+	struct ksz8995ma_softc *sc;
+	struct mii_data *mii;
+
+	sc = ifp->if_softc;
+	mii = ksz8995ma_miiforport(sc, ifp->if_dunit);
+
+	DPRINTF(sc->sc_dev, "%s\n", __func__);
+
+	if (mii == NULL)
+		return;
+	mii_pollstat(mii);
+	ifmr->ifm_active = mii->mii_media_active;
+	ifmr->ifm_status = mii->mii_media_status;
+}
+
+static int
+ksz8995ma_readphy(device_t dev, int phy, int reg)
+{
+int portreg;
+
+	/* 
+	 * This is no mdio/mdc connection code.
+         * simulate MIIM Registers via the SPI interface
+	 */
+	if (reg == MII_BMSR) {
+		portreg = ksz8995ma_readreg(dev, KSZ8995MA_PS0_BASE + 
+			KSZ8995MA_PORT_SIZE * phy);
+		return (KSZ8995MA_MII_STAT | 
+		    (portreg & 0x20 ? BMSR_LINK : 0x00) |
+		    (portreg & 0x40 ? BMSR_ACOMP : 0x00));
+	} else if (reg == MII_PHYIDR1) {
+		return (KSZ8995MA_MII_PHYID_H);
+	} else if (reg == MII_PHYIDR2) {
+		return (KSZ8995MA_MII_PHYID_L);
+	} else if (reg == MII_ANAR) {
+		portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC12_BASE + 
+			KSZ8995MA_PORT_SIZE * phy);
+		return (KSZ8995MA_MII_AA | (portreg & 0x0f) << 5);
+	} else if (reg == MII_ANLPAR) {
+		portreg = ksz8995ma_readreg(dev, KSZ8995MA_PS0_BASE + 
+			KSZ8995MA_PORT_SIZE * phy);
+		return (((portreg & 0x0f) << 5) | 0x01);
+	}
+
+	return (0);
+}
+
+static int
+ksz8995ma_writephy(device_t dev, int phy, int reg, int data)
+{
+int portreg;
+
+	/* 
+	 * This is no mdio/mdc connection code.
+         * simulate MIIM Registers via the SPI interface
+	 */
+	if (reg == MII_BMCR) {
+		portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC13_BASE + 
+			KSZ8995MA_PORT_SIZE * phy);
+		if (data & BMCR_PDOWN)
+			portreg |= KSZ8995MA_PDOWN;
+		else
+			portreg &= ~KSZ8995MA_PDOWN;
+		if (data & BMCR_STARTNEG)
+			portreg |= KSZ8995MA_STARTNEG;
+		else
+			portreg &= ~KSZ8995MA_STARTNEG;
+		ksz8995ma_writereg(dev, KSZ8995MA_PC13_BASE + 
+			KSZ8995MA_PORT_SIZE * phy, portreg);
+	} else if (reg == MII_ANAR) {
+		portreg = ksz8995ma_readreg(dev, KSZ8995MA_PC12_BASE + 
+			KSZ8995MA_PORT_SIZE * phy);
+		portreg &= 0xf;
+		portreg |= ((data >> 5) & 0x0f);
+		ksz8995ma_writereg(dev, KSZ8995MA_PC12_BASE + 
+			KSZ8995MA_PORT_SIZE * phy, portreg);
+	}
+	return (0);
+}
+
+static int
+ksz8995ma_readreg(device_t dev, int addr)
+{
+	uint8_t txBuf[8], rxBuf[8];
+	struct spi_command cmd;
+	int err;
+
+	memset(&cmd, 0, sizeof(cmd));
+	memset(txBuf, 0, sizeof(txBuf));
+	memset(rxBuf, 0, sizeof(rxBuf));
+
+	/* read spi */
+	txBuf[0] = KSZ8995MA_SPI_READ;
+	txBuf[1] = addr;
+	cmd.tx_cmd = &txBuf;
+	cmd.rx_cmd = &rxBuf;
+	cmd.tx_cmd_sz = 3;
+	cmd.rx_cmd_sz = 3;
+        err = SPIBUS_TRANSFER(device_get_parent(dev), dev, &cmd);
+	if (err)
+		return(0);
+
+	return (rxBuf[2]);
+}
+
+static int
+ksz8995ma_writereg(device_t dev, int addr, int value)
+{
+	uint8_t txBuf[8], rxBuf[8];
+	struct spi_command cmd;
+	int err;
+
+	memset(&cmd, 0, sizeof(cmd));
+	memset(txBuf, 0, sizeof(txBuf));
+	memset(rxBuf, 0, sizeof(rxBuf));
+
+	/* write spi */
+	txBuf[0] = KSZ8995MA_SPI_WRITE;
+	txBuf[1] = addr;
+	txBuf[2] = value;
+	cmd.tx_cmd = &txBuf;
+	cmd.rx_cmd = &rxBuf;
+	cmd.tx_cmd_sz = 3;
+	cmd.rx_cmd_sz = 3;
+        err = SPIBUS_TRANSFER(device_get_parent(dev), dev, &cmd);
+	if (err)
+		return(0);
+
+	return (0);
+}
+
+static device_method_t ksz8995ma_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe,			ksz8995ma_probe),
+	DEVMETHOD(device_attach,		ksz8995ma_attach),
+	DEVMETHOD(device_detach,		ksz8995ma_detach),
+	
+	/* bus interface */
+	DEVMETHOD(bus_add_child,		device_add_child_ordered),
+	
+	/* MII interface */
+	DEVMETHOD(miibus_readreg,		ksz8995ma_readphy),
+	DEVMETHOD(miibus_writereg,		ksz8995ma_writephy),
+	DEVMETHOD(miibus_statchg,		ksz8995ma_statchg),
+
+	/* etherswitch interface */
+	DEVMETHOD(etherswitch_lock,		ksz8995ma_lock),
+	DEVMETHOD(etherswitch_unlock,		ksz8995ma_unlock),
+	DEVMETHOD(etherswitch_getinfo,		ksz8995ma_getinfo),
+	DEVMETHOD(etherswitch_readreg,		ksz8995ma_readreg),
+	DEVMETHOD(etherswitch_writereg,		ksz8995ma_writereg),
+	DEVMETHOD(etherswitch_readphyreg,	ksz8995ma_readphy),
+	DEVMETHOD(etherswitch_writephyreg,	ksz8995ma_writephy),
+	DEVMETHOD(etherswitch_getport,		ksz8995ma_getport),
+	DEVMETHOD(etherswitch_setport,		ksz8995ma_setport),
+	DEVMETHOD(etherswitch_getvgroup,	ksz8995ma_getvgroup),
+	DEVMETHOD(etherswitch_setvgroup,	ksz8995ma_setvgroup),
+	DEVMETHOD(etherswitch_setconf,		ksz8995ma_setconf),
+	DEVMETHOD(etherswitch_getconf,		ksz8995ma_getconf),
+
+	DEVMETHOD_END
+};
+
+DEFINE_CLASS_0(ksz8995ma, ksz8995ma_driver, ksz8995ma_methods,
+    sizeof(struct ksz8995ma_softc));
+static devclass_t ksz8995ma_devclass;
+
+DRIVER_MODULE(ksz8995ma, spibus, ksz8995ma_driver, ksz8995ma_devclass, 0, 0);
+DRIVER_MODULE(miibus, ksz8995ma, miibus_driver, miibus_devclass, 0, 0);
+DRIVER_MODULE(etherswitch, ksz8995ma, etherswitch_driver, etherswitch_devclass,
+    0, 0);
+MODULE_VERSION(ksz8995ma, 1);
+MODULE_DEPEND(ksz8995ma, spibus, 1, 1, 1); /* XXX which versions? */
+MODULE_DEPEND(ksz8995ma, miibus, 1, 1, 1); /* XXX which versions? */
+MODULE_DEPEND(ksz8995ma, etherswitch, 1, 1, 1); /* XXX which versions? */

Property changes on: projects/netbsd-tests-upstream-01-2017/sys/dev/etherswitch/micrel/ksz8995ma.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/netbsd-tests-upstream-01-2017/sys/i386/i386/pmap.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/sys/i386/i386/pmap.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/sys/i386/i386/pmap.c	(revision 312218)
@@ -1,5639 +1,5645 @@
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
  */
 /*-
  * Copyright (c) 2003 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jake Burkholder,
  * Safeport Network Services, and Network Associates Laboratories, the
  * Security Research Division of Network Associates, Inc. under
  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  * CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  *	Manages physical address maps.
  *
  *	Since the information managed by this module is
  *	also stored by the logical address mapping module,
  *	this module may throw away valid virtual-to-physical
  *	mappings at almost any time.  However, invalidations
  *	of virtual-to-physical mappings must be done as
  *	requested.
  *
  *	In order to cope with hardware architectures which
  *	make virtual-to-physical map invalidates expensive,
  *	this module may delay invalidate or reduced protection
  *	operations until such time as they are actually
  *	necessary.  This module is given full information as
  *	to which processors are currently using which maps,
  *	and to when physical maps must be made correct.
  */
 
 #include "opt_apic.h"
 #include "opt_cpu.h"
 #include "opt_pmap.h"
 #include "opt_smp.h"
 #include "opt_xbox.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sf_buf.h>
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
 
 #ifdef DEV_APIC
 #include <sys/bus.h>
 #include <machine/intr_machdep.h>
 #include <x86/apicvar.h>
 #endif
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/specialreg.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 
 #ifdef XBOX
 #include <machine/xbox.h>
 #endif
 
 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
 #define CPU_ENABLE_SSE
 #endif
 
 #ifndef PMAP_SHPGPERPROC
 #define PMAP_SHPGPERPROC 200
 #endif
 
 #if !defined(DIAGNOSTIC)
 #ifdef __GNUC_GNU_INLINE__
 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
 #else
 #define PMAP_INLINE	extern inline
 #endif
 #else
 #define PMAP_INLINE
 #endif
 
 #ifdef PV_STATS
 #define PV_STAT(x)	do { x ; } while (0)
 #else
 #define PV_STAT(x)	do { } while (0)
 #endif
 
 #define	pa_index(pa)	((pa) >> PDRSHIFT)
 #define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
 
 /*
  * Get PDEs and PTEs for user/kernel address space
  */
 #define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
 
 #define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
 #define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
 #define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
 #define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
 #define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
 
 #define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
     atomic_clear_int((u_int *)(pte), PG_W))
 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
 
 struct pmap kernel_pmap_store;
 LIST_HEAD(pmaplist, pmap);
 static struct pmaplist allpmaps;
 static struct mtx allpmaps_lock;
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
 int pgeflag = 0;		/* PG_G or-in */
 int pseflag = 0;		/* PG_PS or-in */
 
 static int nkpt = NKPT;
 vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
 extern u_int32_t KERNend;
 extern u_int32_t KPTphys;
 
 #if defined(PAE) || defined(PAE_TABLES)
 pt_entry_t pg_nx;
 static uma_zone_t pdptzone;
 #endif
 
 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
 
 static int pat_works = 1;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
     "Is page attribute table fully functional?");
 
 static int pg_ps_enabled = 1;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &pg_ps_enabled, 0, "Are large page mappings enabled?");
 
 #define	PAT_INDEX_SIZE	8
 static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
 
 /*
  * pmap_mapdev support pre initialization (i.e. console)
  */
 #define	PMAP_PREINIT_MAPPING_COUNT	8
 static struct pmap_preinit_mapping {
 	vm_paddr_t	pa;
 	vm_offset_t	va;
 	vm_size_t	sz;
 	int		mode;
 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
 static int pmap_initialized;
 
 static struct rwlock_padalign pvh_global_lock;
 
 /*
  * Data for the pv entry allocation mechanism
  */
 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
 static struct md_page *pv_table;
 static int shpgperproc = PMAP_SHPGPERPROC;
 
 struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
 int pv_maxchunks;			/* How many chunks we have KVA for */
 vm_offset_t pv_vafree;			/* freelist stored in the PTE */
 
 /*
  * All those kernel PT submaps that BSD is so fond of
  */
 pt_entry_t *CMAP3;
 static pd_entry_t *KPTD;
 caddr_t ptvmmap = 0;
 caddr_t CADDR3;
 struct msgbuf *msgbufp = NULL;
 
 /*
  * Crashdump maps.
  */
 static caddr_t crashdumpmap;
 
 static pt_entry_t *PMAP1 = NULL, *PMAP2;
 static pt_entry_t *PADDR1 = NULL, *PADDR2;
 #ifdef SMP
 static int PMAP1cpu;
 static int PMAP1changedcpu;
 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 
 	   &PMAP1changedcpu, 0,
 	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
 #endif
 static int PMAP1changed;
 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 
 	   &PMAP1changed, 0,
 	   "Number of times pmap_pte_quick changed PMAP1");
 static int PMAP1unchanged;
 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 
 	   &PMAP1unchanged, 0,
 	   "Number of times pmap_pte_quick didn't change PMAP1");
 static struct mtx PMAP2mutex;
 
 static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
 static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
 static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 		    vm_offset_t va);
 static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
 
 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot);
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
     vm_page_t m, vm_prot_t prot, vm_page_t mpte);
 static void pmap_flush_page(vm_page_t m);
 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
 static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
 static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
     vm_prot_t prot);
 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
 static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
     struct spglist *free);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
     struct spglist *free);
 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
     struct spglist *free);
 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
 					vm_offset_t va);
 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
     vm_page_t m);
 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     pd_entry_t newpde);
 static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
 
 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);
 
 static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
 static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free);
 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
 static void pmap_pte_release(pt_entry_t *pte);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
 #if defined(PAE) || defined(PAE_TABLES)
 static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
     int wait);
 #endif
 static void pmap_set_pg(void);
 
 static __inline void pagezero(void *page);
 
 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
 
 /*
  * If you get an error here, then you set KVA_PAGES wrong! See the
  * description of KVA_PAGES in sys/i386/include/pmap.h. It must be
  * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE.
  */
 CTASSERT(KERNBASE % (1 << 24) == 0);
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
  *
  *	On the i386 this is called after mapping has already been enabled
  *	and just syncs the pmap module with what has already been done.
  *	[We can't call it easily with mapping off since the kernel is not
  *	mapped with PA == VA, hence we would have to relocate every address
  *	from the linked base (virtual) address "KERNBASE" to the actual
  *	(physical) address starting relative to 0]
  */
 void
 pmap_bootstrap(vm_paddr_t firstaddr)
 {
 	vm_offset_t va;
 	pt_entry_t *pte, *unused;
 	struct pcpu *pc;
 	int i;
 
 	/*
 	 * Add a physical memory segment (vm_phys_seg) corresponding to the
 	 * preallocated kernel page table pages so that vm_page structures
 	 * representing these pages will be created.  The vm_page structures
 	 * are required for promotion of the corresponding kernel virtual
 	 * addresses to superpage mappings.
 	 */
 	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
 
 	/*
 	 * Initialize the first available kernel virtual address.  However,
 	 * using "firstaddr" may waste a few pages of the kernel virtual
 	 * address space, because locore may not have mapped every physical
 	 * page that it allocated.  Preferably, locore would provide a first
 	 * unused virtual address in addition to "firstaddr".
 	 */
 	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
 
 	virtual_end = VM_MAX_KERNEL_ADDRESS;
 
 	/*
 	 * Initialize the kernel pmap (which is statically allocated).
 	 */
 	PMAP_LOCK_INIT(kernel_pmap);
 	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
 #if defined(PAE) || defined(PAE_TABLES)
 	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
 #endif
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 
  	/*
 	 * Initialize the global pv list lock.
 	 */
 	rw_init(&pvh_global_lock, "pmap pv global");
 
 	LIST_INIT(&allpmaps);
 
 	/*
 	 * Request a spin mutex so that changes to allpmaps cannot be
 	 * preempted by smp_rendezvous_cpus().  Otherwise,
 	 * pmap_update_pde_kernel() could access allpmaps while it is
 	 * being changed.
 	 */
 	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
 	mtx_lock_spin(&allpmaps_lock);
 	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
 	mtx_unlock_spin(&allpmaps_lock);
 
 	/*
 	 * Reserve some special page table entries/VA space for temporary
 	 * mapping of pages.
 	 */
 #define	SYSMAP(c, p, v, n)	\
 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
 
 	va = virtual_avail;
 	pte = vtopte(va);
 
 
 	/*
 	 * Initialize temporary map objects on the current CPU for use
 	 * during early boot.
 	 * CMAP1/CMAP2 are used for zeroing and copying pages.
 	 * CMAP3 is used for the boot-time memory test.
 	 */
 	pc = pcpu_find(curcpu);
 	mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
 	SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1)
 	SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1)
 	SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1)
 
 	SYSMAP(caddr_t, CMAP3, CADDR3, 1);
 
 	/*
 	 * Crashdump maps.
 	 */
 	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
 
 	/*
 	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
 	 */
 	SYSMAP(caddr_t, unused, ptvmmap, 1)
 
 	/*
 	 * msgbufp is used to map the system message buffer.
 	 */
 	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
 
 	/*
 	 * KPTmap is used by pmap_kextract().
 	 *
 	 * KPTmap is first initialized by locore.  However, that initial
 	 * KPTmap can only support NKPT page table pages.  Here, a larger
 	 * KPTmap is created that can support KVA_PAGES page table pages.
 	 */
 	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
 
 	for (i = 0; i < NKPT; i++)
 		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
 
 	/*
 	 * Adjust the start of the KPTD and KPTmap so that the implementation
 	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
 	 */
 	KPTD -= KPTDI;
 	KPTmap -= i386_btop(KPTDI << PDRSHIFT);
 
 	/*
 	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
 	 * respectively.
 	 */
 	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
 	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
 
 	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
 
 	virtual_avail = va;
 
 	/*
 	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
 	 * physical memory region that is used by the ACPI wakeup code.  This
 	 * mapping must not have PG_G set. 
 	 */
 #ifdef XBOX
 	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
 	 * an early stadium, we cannot yet neatly map video memory ... :-(
 	 * Better fixes are very welcome! */
 	if (!arch_i386_is_xbox)
 #endif
 	for (i = 1; i < NKPT; i++)
 		PTD[i] = 0;
 
 	/* Initialize the PAT MSR if present. */
 	pmap_init_pat();
 
 	/* Turn on PG_G on kernel page(s) */
 	pmap_set_pg();
 }
 
 static void
 pmap_init_reserved_pages(void)
 {
 	struct pcpu *pc;
 	vm_offset_t pages;
 	int i;
 
 	CPU_FOREACH(i) {
 		pc = pcpu_find(i);
 		/*
 		 * Skip if the mapping has already been initialized,
 		 * i.e. this is the BSP.
 		 */
 		if (pc->pc_cmap_addr1 != 0)
 			continue;
 		mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
 		pages = kva_alloc(PAGE_SIZE * 3);
 		if (pages == 0)
 			panic("%s: unable to allocate KVA", __func__);
 		pc->pc_cmap_pte1 = vtopte(pages);
 		pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE);
 		pc->pc_cmap_addr1 = (caddr_t)pages;
 		pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE);
 		pc->pc_qmap_addr = pages + (PAGE_SIZE * 2);
 	}
 }
  
 SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL);
 
 /*
  * Setup the PAT MSR.
  */
 void
 pmap_init_pat(void)
 {
 	int pat_table[PAT_INDEX_SIZE];
 	uint64_t pat_msr;
 	u_long cr0, cr4;
 	int i;
 
 	/* Set default PAT index table. */
 	for (i = 0; i < PAT_INDEX_SIZE; i++)
 		pat_table[i] = -1;
 	pat_table[PAT_WRITE_BACK] = 0;
 	pat_table[PAT_WRITE_THROUGH] = 1;
 	pat_table[PAT_UNCACHEABLE] = 3;
 	pat_table[PAT_WRITE_COMBINING] = 3;
 	pat_table[PAT_WRITE_PROTECTED] = 3;
 	pat_table[PAT_UNCACHED] = 3;
 
 	/* Bail if this CPU doesn't implement PAT. */
 	if ((cpu_feature & CPUID_PAT) == 0) {
 		for (i = 0; i < PAT_INDEX_SIZE; i++)
 			pat_index[i] = pat_table[i];
 		pat_works = 0;
 		return;
 	}
 
 	/*
 	 * Due to some Intel errata, we can only safely use the lower 4
 	 * PAT entries.
 	 *
 	 *   Intel Pentium III Processor Specification Update
 	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
 	 * or Mode C Paging)
 	 *
 	 *   Intel Pentium IV  Processor Specification Update
 	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
 	 */
 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
 	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
 		pat_works = 0;
 
 	/* Initialize default PAT entries. */
 	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
 	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
 	    PAT_VALUE(2, PAT_UNCACHED) |
 	    PAT_VALUE(3, PAT_UNCACHEABLE) |
 	    PAT_VALUE(4, PAT_WRITE_BACK) |
 	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
 	    PAT_VALUE(6, PAT_UNCACHED) |
 	    PAT_VALUE(7, PAT_UNCACHEABLE);
 
 	if (pat_works) {
 		/*
 		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
 		 * Program 5 and 6 as WP and WC.
 		 * Leave 4 and 7 as WB and UC.
 		 */
 		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
 		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
 		    PAT_VALUE(6, PAT_WRITE_COMBINING);
 		pat_table[PAT_UNCACHED] = 2;
 		pat_table[PAT_WRITE_PROTECTED] = 5;
 		pat_table[PAT_WRITE_COMBINING] = 6;
 	} else {
 		/*
 		 * Just replace PAT Index 2 with WC instead of UC-.
 		 */
 		pat_msr &= ~PAT_MASK(2);
 		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
 		pat_table[PAT_WRITE_COMBINING] = 2;
 	}
 
 	/* Disable PGE. */
 	cr4 = rcr4();
 	load_cr4(cr4 & ~CR4_PGE);
 
 	/* Disable caches (CD = 1, NW = 0). */
 	cr0 = rcr0();
 	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
 
 	/* Flushes caches and TLBs. */
 	wbinvd();
 	invltlb();
 
 	/* Update PAT and index table. */
 	wrmsr(MSR_PAT, pat_msr);
 	for (i = 0; i < PAT_INDEX_SIZE; i++)
 		pat_index[i] = pat_table[i];
 
 	/* Flush caches and TLBs again. */
 	wbinvd();
 	invltlb();
 
 	/* Restore caches and PGE. */
 	load_cr0(cr0);
 	load_cr4(cr4);
 }
 
 /*
  * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
  */
 static void
 pmap_set_pg(void)
 {
 	pt_entry_t *pte;
 	vm_offset_t va, endva;
 
 	if (pgeflag == 0)
 		return;
 
 	endva = KERNBASE + KERNend;
 
 	if (pseflag) {
 		va = KERNBASE + KERNLOAD;
 		while (va  < endva) {
 			pdir_pde(PTD, va) |= pgeflag;
 			invltlb();	/* Flush non-PG_G entries. */
 			va += NBPDR;
 		}
 	} else {
 		va = (vm_offset_t)btext;
 		while (va < endva) {
 			pte = vtopte(va);
 			if (*pte)
 				*pte |= pgeflag;
 			invltlb();	/* Flush non-PG_G entries. */
 			va += PAGE_SIZE;
 		}
 	}
 }
 
 /*
  * Initialize a vm_page's machine-dependent fields.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 
 	TAILQ_INIT(&m->md.pv_list);
 	m->md.pat_mode = PAT_WRITE_BACK;
 }
 
 #if defined(PAE) || defined(PAE_TABLES)
 static void *
 pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
 {
 
 	/* Inform UMA that this allocator uses kernel_map/object. */
 	*flags = UMA_SLAB_KERNEL;
 	return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL,
 	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
 }
 #endif
 
 /*
  * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
  * Requirements:
  *  - Must deal with pages in order to ensure that none of the PG_* bits
  *    are ever set, PG_V in particular.
  *  - Assumes we can write to ptes without pte_store() atomic ops, even
  *    on PAE systems.  This should be ok.
  *  - Assumes nothing will ever test these addresses for 0 to indicate
  *    no mapping instead of correctly checking PG_V.
  *  - Assumes a vm_offset_t will fit in a pte (true for i386).
  * Because PG_V is never set, there can be no mappings to invalidate.
  */
 static vm_offset_t
 pmap_ptelist_alloc(vm_offset_t *head)
 {
 	pt_entry_t *pte;
 	vm_offset_t va;
 
 	va = *head;
 	if (va == 0)
 		panic("pmap_ptelist_alloc: exhausted ptelist KVA");
 	pte = vtopte(va);
 	*head = *pte;
 	if (*head & PG_V)
 		panic("pmap_ptelist_alloc: va with PG_V set!");
 	*pte = 0;
 	return (va);
 }
 
 static void
 pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
 {
 	pt_entry_t *pte;
 
 	if (va & PG_V)
 		panic("pmap_ptelist_free: freeing va with PG_V set!");
 	pte = vtopte(va);
 	*pte = *head;		/* virtual! PG_V is 0 though */
 	*head = va;
 }
 
 static void
 pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
 {
 	int i;
 	vm_offset_t va;
 
 	*head = 0;
 	for (i = npages - 1; i >= 0; i--) {
 		va = (vm_offset_t)base + i * PAGE_SIZE;
 		pmap_ptelist_free(head, va);
 	}
 }
 
 
 /*
  *	Initialize the pmap module.
  *	Called by vm_init, to initialize any structures that the pmap
  *	system needs to map virtual memory.
  */
 void
 pmap_init(void)
 {
 	struct pmap_preinit_mapping *ppim;
 	vm_page_t mpte;
 	vm_size_t s;
 	int i, pv_npg;
 
 	/*
 	 * Initialize the vm page array entries for the kernel pmap's
 	 * page table pages.
 	 */ 
 	for (i = 0; i < NKPT; i++) {
 		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
 		KASSERT(mpte >= vm_page_array &&
 		    mpte < &vm_page_array[vm_page_array_size],
 		    ("pmap_init: page table page is out of range"));
 		mpte->pindex = i + KPTDI;
 		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
 	}
 
 	/*
 	 * Initialize the address space (zone) for the pv entries.  Set a
 	 * high water mark so that the system can recover from excessive
 	 * numbers of pv entries.
 	 */
 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 	pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count;
 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
 	pv_entry_max = roundup(pv_entry_max, _NPCPV);
 	pv_entry_high_water = 9 * (pv_entry_max / 10);
 
 	/*
 	 * If the kernel is running on a virtual machine, then it must assume
 	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
 	 * be prepared for the hypervisor changing the vendor and family that
 	 * are reported by CPUID.  Consequently, the workaround for AMD Family
 	 * 10h Erratum 383 is enabled if the processor's feature set does not
 	 * include at least one feature that is only supported by older Intel
 	 * or newer AMD processors.
 	 */
 	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
 	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
 	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
 	    AMDID2_FMA4)) == 0)
 		workaround_erratum383 = 1;
 
 	/*
 	 * Are large page mappings supported and enabled?
 	 */
 	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
 	if (pseflag == 0)
 		pg_ps_enabled = 0;
 	else if (pg_ps_enabled) {
 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
 		    ("pmap_init: can't assign to pagesizes[1]"));
 		pagesizes[1] = NBPDR;
 	}
 
 	/*
 	 * Calculate the size of the pv head table for superpages.
 	 * Handle the possibility that "vm_phys_segs[...].end" is zero.
 	 */
 	pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
 	    PAGE_SIZE) / NBPDR + 1;
 
 	/*
 	 * Allocate memory for the pv head table for superpages.
 	 */
 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
 	s = round_page(s);
 	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
 	    M_WAITOK | M_ZERO);
 	for (i = 0; i < pv_npg; i++)
 		TAILQ_INIT(&pv_table[i].pv_list);
 
 	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
 	pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
 	if (pv_chunkbase == NULL)
 		panic("pmap_init: not enough kvm for pv chunks");
 	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
 #if defined(PAE) || defined(PAE_TABLES)
 	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
 	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
 	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
 	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
 #endif
 
 	pmap_initialized = 1;
 	if (!bootverbose)
 		return;
 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 		ppim = pmap_preinit_mapping + i;
 		if (ppim->va == 0)
 			continue;
 		printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i,
 		    (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode);
 	}
 }
 
 
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
 	"Max number of PV entries");
 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
 	"Page share factor per proc");
 
 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
     "2/4MB page mapping counters");
 
 static u_long pmap_pde_demotions;
 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
     &pmap_pde_demotions, 0, "2/4MB page demotions");
 
 static u_long pmap_pde_mappings;
 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
     &pmap_pde_mappings, 0, "2/4MB page mappings");
 
 static u_long pmap_pde_p_failures;
 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
     &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
 
 static u_long pmap_pde_promotions;
 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
     &pmap_pde_promotions, 0, "2/4MB page promotions");
 
 /***************************************************
  * Low level helper routines.....
  ***************************************************/
 
 /*
  * Determine the appropriate bits to set in a PTE or PDE for a specified
  * caching mode.
  */
 int
 pmap_cache_bits(int mode, boolean_t is_pde)
 {
 	int cache_bits, pat_flag, pat_idx;
 
 	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
 		panic("Unknown caching mode %d\n", mode);
 
 	/* The PAT bit is different for PTE's and PDE's. */
 	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
 
 	/* Map the caching mode to a PAT index. */
 	pat_idx = pat_index[mode];
 
 	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
 	cache_bits = 0;
 	if (pat_idx & 0x4)
 		cache_bits |= pat_flag;
 	if (pat_idx & 0x2)
 		cache_bits |= PG_NC_PCD;
 	if (pat_idx & 0x1)
 		cache_bits |= PG_NC_PWT;
 	return (cache_bits);
 }
 
 /*
  * The caller is responsible for maintaining TLB consistency.
  */
 static void
 pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
 {
 	pd_entry_t *pde;
 	pmap_t pmap;
 	boolean_t PTD_updated;
 
 	PTD_updated = FALSE;
 	mtx_lock_spin(&allpmaps_lock);
 	LIST_FOREACH(pmap, &allpmaps, pm_list) {
 		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
 		    PG_FRAME))
 			PTD_updated = TRUE;
 		pde = pmap_pde(pmap, va);
 		pde_store(pde, newpde);
 	}
 	mtx_unlock_spin(&allpmaps_lock);
 	KASSERT(PTD_updated,
 	    ("pmap_kenter_pde: current page table is not in allpmaps"));
 }
 
 /*
  * After changing the page size for the specified virtual address in the page
  * table, flush the corresponding entries from the processor's TLB.  Only the
  * calling processor's TLB is affected.
  *
  * The calling thread must be pinned to a processor.
  */
 static void
 pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
 {
 	u_long cr4;
 
 	if ((newpde & PG_PS) == 0)
 		/* Demotion: flush a specific 2MB page mapping. */
 		invlpg(va);
 	else if ((newpde & PG_G) == 0)
 		/*
 		 * Promotion: flush every 4KB page mapping from the TLB
 		 * because there are too many to flush individually.
 		 */
 		invltlb();
 	else {
 		/*
 		 * Promotion: flush every 4KB page mapping from the TLB,
 		 * including any global (PG_G) mappings.
 		 */
 		cr4 = rcr4();
 		load_cr4(cr4 & ~CR4_PGE);
 		/*
 		 * Although preemption at this point could be detrimental to
 		 * performance, it would not lead to an error.  PG_G is simply
 		 * ignored if CR4.PGE is clear.  Moreover, in case this block
 		 * is re-entered, the load_cr4() either above or below will
 		 * modify CR4.PGE flushing the TLB.
 		 */
 		load_cr4(cr4 | CR4_PGE);
 	}
 }
 
 void
 invltlb_glob(void)
 {
 	uint64_t cr4;
 
 	if (pgeflag == 0) {
 		invltlb();
 	} else {
 		cr4 = rcr4();
 		load_cr4(cr4 & ~CR4_PGE);
 		load_cr4(cr4 | CR4_PGE);
 	}
 }
 
 
 #ifdef SMP
 /*
  * For SMP, these functions have to use the IPI mechanism for coherence.
  *
  * N.B.: Before calling any of the following TLB invalidation functions,
  * the calling processor must ensure that all stores updating a non-
  * kernel page table are globally performed.  Otherwise, another
  * processor could cache an old, pre-update entry without being
  * invalidated.  This can happen one of two ways: (1) The pmap becomes
  * active on another processor after its pm_active field is checked by
  * one of the following functions but before a store updating the page
  * table is globally performed. (2) The pmap becomes active on another
  * processor before its pm_active field is checked but due to
  * speculative loads one of the following functions stills reads the
  * pmap as inactive on the other processor.
  * 
  * The kernel page table is exempt because its pm_active field is
  * immutable.  The kernel page table is always active on every
  * processor.
  */
 void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 	cpuset_t *mask, other_cpus;
 	u_int cpuid;
 
 	sched_pin();
 	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
 		invlpg(va);
 		mask = &all_cpus;
 	} else {
 		cpuid = PCPU_GET(cpuid);
 		other_cpus = all_cpus;
 		CPU_CLR(cpuid, &other_cpus);
 		if (CPU_ISSET(cpuid, &pmap->pm_active))
 			invlpg(va);
 		CPU_AND(&other_cpus, &pmap->pm_active);
 		mask = &other_cpus;
 	}
 	smp_masked_invlpg(*mask, va);
 	sched_unpin();
 }
 
 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
 #define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)
 
 void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	cpuset_t *mask, other_cpus;
 	vm_offset_t addr;
 	u_int cpuid;
 
 	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
 		pmap_invalidate_all(pmap);
 		return;
 	}
 
 	sched_pin();
 	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
 			invlpg(addr);
 		mask = &all_cpus;
 	} else {
 		cpuid = PCPU_GET(cpuid);
 		other_cpus = all_cpus;
 		CPU_CLR(cpuid, &other_cpus);
 		if (CPU_ISSET(cpuid, &pmap->pm_active))
 			for (addr = sva; addr < eva; addr += PAGE_SIZE)
 				invlpg(addr);
 		CPU_AND(&other_cpus, &pmap->pm_active);
 		mask = &other_cpus;
 	}
 	smp_masked_invlpg_range(*mask, sva, eva);
 	sched_unpin();
 }
 
 void
 pmap_invalidate_all(pmap_t pmap)
 {
 	cpuset_t *mask, other_cpus;
 	u_int cpuid;
 
 	sched_pin();
 	if (pmap == kernel_pmap) {
 		invltlb_glob();
 		mask = &all_cpus;
 	} else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
 		invltlb();
 		mask = &all_cpus;
 	} else {
 		cpuid = PCPU_GET(cpuid);
 		other_cpus = all_cpus;
 		CPU_CLR(cpuid, &other_cpus);
 		if (CPU_ISSET(cpuid, &pmap->pm_active))
 			invltlb();
 		CPU_AND(&other_cpus, &pmap->pm_active);
 		mask = &other_cpus;
 	}
 	smp_masked_invltlb(*mask, pmap);
 	sched_unpin();
 }
 
 void
 pmap_invalidate_cache(void)
 {
 
 	sched_pin();
 	wbinvd();
 	smp_cache_flush();
 	sched_unpin();
 }
 
 struct pde_action {
 	cpuset_t invalidate;	/* processors that invalidate their TLB */
 	vm_offset_t va;
 	pd_entry_t *pde;
 	pd_entry_t newpde;
 	u_int store;		/* processor that updates the PDE */
 };
 
 static void
 pmap_update_pde_kernel(void *arg)
 {
 	struct pde_action *act = arg;
 	pd_entry_t *pde;
 	pmap_t pmap;
 
 	if (act->store == PCPU_GET(cpuid)) {
 
 		/*
 		 * Elsewhere, this operation requires allpmaps_lock for
 		 * synchronization.  Here, it does not because it is being
 		 * performed in the context of an all_cpus rendezvous.
 		 */
 		LIST_FOREACH(pmap, &allpmaps, pm_list) {
 			pde = pmap_pde(pmap, act->va);
 			pde_store(pde, act->newpde);
 		}
 	}
 }
 
 static void
 pmap_update_pde_user(void *arg)
 {
 	struct pde_action *act = arg;
 
 	if (act->store == PCPU_GET(cpuid))
 		pde_store(act->pde, act->newpde);
 }
 
 static void
 pmap_update_pde_teardown(void *arg)
 {
 	struct pde_action *act = arg;
 
 	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
 		pmap_update_pde_invalidate(act->va, act->newpde);
 }
 
 /*
  * Change the page size for the specified virtual address in a way that
  * prevents any possibility of the TLB ever having two entries that map the
  * same virtual address using different page sizes.  This is the recommended
  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
  * machine check exception for a TLB state that is improperly diagnosed as a
  * hardware error.
  */
 static void
 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 {
 	struct pde_action act;
 	cpuset_t active, other_cpus;
 	u_int cpuid;
 
 	sched_pin();
 	cpuid = PCPU_GET(cpuid);
 	other_cpus = all_cpus;
 	CPU_CLR(cpuid, &other_cpus);
 	if (pmap == kernel_pmap)
 		active = all_cpus;
 	else
 		active = pmap->pm_active;
 	if (CPU_OVERLAP(&active, &other_cpus)) {
 		act.store = cpuid;
 		act.invalidate = active;
 		act.va = va;
 		act.pde = pde;
 		act.newpde = newpde;
 		CPU_SET(cpuid, &active);
 		smp_rendezvous_cpus(active,
 		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
 		    pmap_update_pde_kernel : pmap_update_pde_user,
 		    pmap_update_pde_teardown, &act);
 	} else {
 		if (pmap == kernel_pmap)
 			pmap_kenter_pde(va, newpde);
 		else
 			pde_store(pde, newpde);
 		if (CPU_ISSET(cpuid, &active))
 			pmap_update_pde_invalidate(va, newpde);
 	}
 	sched_unpin();
 }
 #else /* !SMP */
 /*
  * Normal, non-SMP, 486+ invalidation functions.
  * We inline these within pmap.c for speed.
  */
 PMAP_INLINE void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 
 	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
 		invlpg(va);
 }
 
 PMAP_INLINE void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t addr;
 
 	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
 			invlpg(addr);
 }
 
 PMAP_INLINE void
 pmap_invalidate_all(pmap_t pmap)
 {
 
 	if (pmap == kernel_pmap)
 		invltlb_glob();
 	else if (!CPU_EMPTY(&pmap->pm_active))
 		invltlb();
 }
 
 PMAP_INLINE void
 pmap_invalidate_cache(void)
 {
 
 	wbinvd();
 }
 
 static void
 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 {
 
 	if (pmap == kernel_pmap)
 		pmap_kenter_pde(va, newpde);
 	else
 		pde_store(pde, newpde);
 	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
 		pmap_update_pde_invalidate(va, newpde);
 }
 #endif /* !SMP */
 
 #define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)
 
 void
 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
 {
 
 	if (force) {
 		sva &= ~(vm_offset_t)cpu_clflush_line_size;
 	} else {
 		KASSERT((sva & PAGE_MASK) == 0,
 		    ("pmap_invalidate_cache_range: sva not page-aligned"));
 		KASSERT((eva & PAGE_MASK) == 0,
 		    ("pmap_invalidate_cache_range: eva not page-aligned"));
 	}
 
 	if ((cpu_feature & CPUID_SS) != 0 && !force)
 		; /* If "Self Snoop" is supported and allowed, do nothing. */
 	else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 &&
 	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
 #ifdef DEV_APIC
 		/*
 		 * XXX: Some CPUs fault, hang, or trash the local APIC
 		 * registers if we use CLFLUSH on the local APIC
 		 * range.  The local APIC is always uncached, so we
 		 * don't need to flush for that range anyway.
 		 */
 		if (pmap_kextract(sva) == lapic_paddr)
 			return;
 #endif
 		/*
 		 * Otherwise, do per-cache line flush.  Use the mfence
 		 * instruction to insure that previous stores are
 		 * included in the write-back.  The processor
 		 * propagates flush to other processors in the cache
 		 * coherence domain.
 		 */
 		mfence();
 		for (; sva < eva; sva += cpu_clflush_line_size)
 			clflushopt(sva);
 		mfence();
 	} else if ((cpu_feature & CPUID_CLFSH) != 0 &&
 	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
 #ifdef DEV_APIC
 		if (pmap_kextract(sva) == lapic_paddr)
 			return;
 #endif
 		/*
 		 * Writes are ordered by CLFLUSH on Intel CPUs.
 		 */
 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
 			mfence();
 		for (; sva < eva; sva += cpu_clflush_line_size)
 			clflush(sva);
 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
 			mfence();
 	} else {
 
 		/*
 		 * No targeted cache flush methods are supported by CPU,
 		 * or the supplied range is bigger than 2MB.
 		 * Globally invalidate cache.
 		 */
 		pmap_invalidate_cache();
 	}
 }
 
 void
 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
 {
 	int i;
 
 	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
 	    (cpu_feature & CPUID_CLFSH) == 0) {
 		pmap_invalidate_cache();
 	} else {
 		for (i = 0; i < count; i++)
 			pmap_flush_page(pages[i]);
 	}
 }
 
 /*
  * Are we current address space or kernel?
  */
 static __inline int
 pmap_is_current(pmap_t pmap)
 {
 
 	return (pmap == kernel_pmap || pmap ==
 	    vmspace_pmap(curthread->td_proc->p_vmspace));
 }
 
 /*
  * If the given pmap is not the current or kernel pmap, the returned pte must
  * be released by passing it to pmap_pte_release().
  */
 pt_entry_t *
 pmap_pte(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t newpf;
 	pd_entry_t *pde;
 
 	pde = pmap_pde(pmap, va);
 	if (*pde & PG_PS)
 		return (pde);
 	if (*pde != 0) {
 		/* are we current address space or kernel? */
 		if (pmap_is_current(pmap))
 			return (vtopte(va));
 		mtx_lock(&PMAP2mutex);
 		newpf = *pde & PG_FRAME;
 		if ((*PMAP2 & PG_FRAME) != newpf) {
 			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
 			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
 		}
 		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
 	}
 	return (NULL);
 }
 
 /*
  * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
  * being NULL.
  */
 static __inline void
 pmap_pte_release(pt_entry_t *pte)
 {
 
 	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
 		mtx_unlock(&PMAP2mutex);
 }
 
 /*
  * NB:  The sequence of updating a page table followed by accesses to the
  * corresponding pages is subject to the situation described in the "AMD64
  * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
  * "7.3.1 Special Coherency Considerations".  Therefore, issuing the INVLPG
  * right after modifying the PTE bits is crucial.
  */
 static __inline void
 invlcaddr(void *caddr)
 {
 
 	invlpg((u_int)caddr);
 }
 
 /*
  * Super fast pmap_pte routine best used when scanning
  * the pv lists.  This eliminates many coarse-grained
  * invltlb calls.  Note that many of the pv list
  * scans are across different pmaps.  It is very wasteful
  * to do an entire invltlb for checking a single mapping.
  *
  * If the given pmap is not the current pmap, pvh_global_lock
  * must be held and curthread pinned to a CPU.
  */
 static pt_entry_t *
 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t newpf;
 	pd_entry_t *pde;
 
 	pde = pmap_pde(pmap, va);
 	if (*pde & PG_PS)
 		return (pde);
 	if (*pde != 0) {
 		/* are we current address space or kernel? */
 		if (pmap_is_current(pmap))
 			return (vtopte(va));
 		rw_assert(&pvh_global_lock, RA_WLOCKED);
 		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 		newpf = *pde & PG_FRAME;
 		if ((*PMAP1 & PG_FRAME) != newpf) {
 			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
 #ifdef SMP
 			PMAP1cpu = PCPU_GET(cpuid);
 #endif
 			invlcaddr(PADDR1);
 			PMAP1changed++;
 		} else
 #ifdef SMP
 		if (PMAP1cpu != PCPU_GET(cpuid)) {
 			PMAP1cpu = PCPU_GET(cpuid);
 			invlcaddr(PADDR1);
 			PMAP1changedcpu++;
 		} else
 #endif
 			PMAP1unchanged++;
 		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
 	}
 	return (0);
 }
 
 /*
  *	Routine:	pmap_extract
  *	Function:
  *		Extract the physical page address associated
  *		with the given map/virtual_address pair.
  */
 vm_paddr_t 
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
 	vm_paddr_t rtval;
 	pt_entry_t *pte;
 	pd_entry_t pde;
 
 	rtval = 0;
 	PMAP_LOCK(pmap);
 	pde = pmap->pm_pdir[va >> PDRSHIFT];
 	if (pde != 0) {
 		if ((pde & PG_PS) != 0)
 			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
 		else {
 			pte = pmap_pte(pmap, va);
 			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
 			pmap_pte_release(pte);
 		}
 	}
 	PMAP_UNLOCK(pmap);
 	return (rtval);
 }
 
 /*
  *	Routine:	pmap_extract_and_hold
  *	Function:
  *		Atomically extract and hold the physical page
  *		with the given pmap and virtual address pair
  *		if that mapping permits the given protection.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	pd_entry_t pde;
 	pt_entry_t pte, *ptep;
 	vm_page_t m;
 	vm_paddr_t pa;
 
 	pa = 0;
 	m = NULL;
 	PMAP_LOCK(pmap);
 retry:
 	pde = *pmap_pde(pmap, va);
 	if (pde != 0) {
 		if (pde & PG_PS) {
 			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
 				if (vm_page_pa_tryrelock(pmap, (pde &
 				    PG_PS_FRAME) | (va & PDRMASK), &pa))
 					goto retry;
 				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
 				    (va & PDRMASK));
 				vm_page_hold(m);
 			}
 		} else {
 			ptep = pmap_pte(pmap, va);
 			pte = *ptep;
 			pmap_pte_release(ptep);
 			if (pte != 0 &&
 			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
 				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
 				    &pa))
 					goto retry;
 				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 				vm_page_hold(m);
 			}
 		}
 	}
 	PA_UNLOCK_COND(pa);
 	PMAP_UNLOCK(pmap);
 	return (m);
 }
 
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
 
 /*
  * Add a wired page to the kva.
  * Note: not SMP coherent.
  *
  * This function may be used before pmap_bootstrap() is called.
  */
 PMAP_INLINE void 
 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 {
 	pt_entry_t *pte;
 
 	pte = vtopte(va);
 	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
 }
 
 static __inline void
 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
 {
 	pt_entry_t *pte;
 
 	pte = vtopte(va);
 	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
 }
 
 /*
  * Remove a page from the kernel pagetables.
  * Note: not SMP coherent.
  *
  * This function may be used before pmap_bootstrap() is called.
  */
 PMAP_INLINE void
 pmap_kremove(vm_offset_t va)
 {
 	pt_entry_t *pte;
 
 	pte = vtopte(va);
 	pte_clear(pte);
 }
 
 /*
  *	Used to map a range of physical addresses into kernel
  *	virtual address space.
  *
  *	The value passed in '*virt' is a suggested virtual address for
  *	the mapping. Architectures which can support a direct-mapped
  *	physical to virtual region can return the appropriate address
  *	within that region, leaving '*virt' unchanged. Other
  *	architectures should map the pages starting at '*virt' and
  *	update '*virt' with the first usable address after the mapped
  *	region.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 {
 	vm_offset_t va, sva;
 	vm_paddr_t superpage_offset;
 	pd_entry_t newpde;
 
 	va = *virt;
 	/*
 	 * Does the physical address range's size and alignment permit at
 	 * least one superpage mapping to be created?
 	 */ 
 	superpage_offset = start & PDRMASK;
 	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
 		/*
 		 * Increase the starting virtual address so that its alignment
 		 * does not preclude the use of superpage mappings.
 		 */
 		if ((va & PDRMASK) < superpage_offset)
 			va = (va & ~PDRMASK) + superpage_offset;
 		else if ((va & PDRMASK) > superpage_offset)
 			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
 	}
 	sva = va;
 	while (start < end) {
 		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
 		    pseflag) {
 			KASSERT((va & PDRMASK) == 0,
 			    ("pmap_map: misaligned va %#x", va));
 			newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
 			pmap_kenter_pde(va, newpde);
 			va += NBPDR;
 			start += NBPDR;
 		} else {
 			pmap_kenter(va, start);
 			va += PAGE_SIZE;
 			start += PAGE_SIZE;
 		}
 	}
 	pmap_invalidate_range(kernel_pmap, sva, va);
 	*virt = va;
 	return (sva);
 }
 
 
 /*
  * Add a list of wired pages to the kva
  * this routine is only used for temporary
  * kernel mappings that do not need to have
  * page modification or references recorded.
  * Note that old mappings are simply written
  * over.  The page *must* be wired.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 void
 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 {
 	pt_entry_t *endpte, oldpte, pa, *pte;
 	vm_page_t m;
 
 	oldpte = 0;
 	pte = vtopte(sva);
 	endpte = pte + count;
 	while (pte < endpte) {
 		m = *ma++;
 		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
 		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
 			oldpte |= *pte;
 			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
 		}
 		pte++;
 	}
 	if (__predict_false((oldpte & PG_V) != 0))
 		pmap_invalidate_range(kernel_pmap, sva, sva + count *
 		    PAGE_SIZE);
 }
 
 /*
  * This routine tears out page mappings from the
  * kernel -- it is meant only for temporary mappings.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 void
 pmap_qremove(vm_offset_t sva, int count)
 {
 	vm_offset_t va;
 
 	va = sva;
 	while (count-- > 0) {
 		pmap_kremove(va);
 		va += PAGE_SIZE;
 	}
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
 static __inline void
 pmap_free_zero_pages(struct spglist *free)
 {
 	vm_page_t m;
 
 	while ((m = SLIST_FIRST(free)) != NULL) {
 		SLIST_REMOVE_HEAD(free, plinks.s.ss);
 		/* Preserve the page's PG_ZERO setting. */
 		vm_page_free_toq(m);
 	}
 }
 
 /*
  * Schedule the specified unused page table page to be freed.  Specifically,
  * add the page to the specified list of pages that will be released to the
  * physical memory manager after the TLB has been updated.
  */
 static __inline void
 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
     boolean_t set_PG_ZERO)
 {
 
 	if (set_PG_ZERO)
 		m->flags |= PG_ZERO;
 	else
 		m->flags &= ~PG_ZERO;
 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 }
 
 /*
  * Inserts the specified page table page into the specified pmap's collection
  * of idle page table pages.  Each of a pmap's page table pages is responsible
  * for mapping a distinct range of virtual addresses.  The pmap's collection is
  * ordered by this virtual address range.
  */
 static __inline int
 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	return (vm_radix_insert(&pmap->pm_root, mpte));
 }
 
 /*
  * Removes the page table page mapping the specified virtual address from the
  * specified pmap's collection of idle page table pages, and returns it.
  * Otherwise, returns NULL if there is no page table page corresponding to the
  * specified virtual address.
  */
 static __inline vm_page_t
 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT));
 }
 
 /*
  * Decrements a page table page's wire count, which is used to record the
  * number of valid page table entries within the page.  If the wire count
  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
  * page table page was unmapped and FALSE otherwise.
  */
 static inline boolean_t
 pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
 {
 
 	--m->wire_count;
 	if (m->wire_count == 0) {
 		_pmap_unwire_ptp(pmap, m, free);
 		return (TRUE);
 	} else
 		return (FALSE);
 }
 
 static void
 _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
 {
 	vm_offset_t pteva;
 
 	/*
 	 * unmap the page table page
 	 */
 	pmap->pm_pdir[m->pindex] = 0;
 	--pmap->pm_stats.resident_count;
 
 	/*
 	 * This is a release store so that the ordinary store unmapping
 	 * the page table page is globally performed before TLB shoot-
 	 * down is begun.
 	 */
 	atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1);
 
 	/*
 	 * Do an invltlb to make the invalidated mapping
 	 * take effect immediately.
 	 */
 	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
 	pmap_invalidate_page(pmap, pteva);
 
 	/* 
 	 * Put page on a list so that it is released after
 	 * *ALL* TLB shootdown is done
 	 */
 	pmap_add_delayed_free_list(m, free, TRUE);
 }
 
 /*
  * After removing a page table entry, this routine is used to
  * conditionally free the page, and manage the hold/wire counts.
  */
 static int
 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free)
 {
 	pd_entry_t ptepde;
 	vm_page_t mpte;
 
 	if (va >= VM_MAXUSER_ADDRESS)
 		return (0);
 	ptepde = *pmap_pde(pmap, va);
 	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
 	return (pmap_unwire_ptp(pmap, mpte, free));
 }
 
 /*
  * Initialize the pmap for the swapper process.
  */
 void
 pmap_pinit0(pmap_t pmap)
 {
 
 	PMAP_LOCK_INIT(pmap);
 	/*
 	 * Since the page table directory is shared with the kernel pmap,
 	 * which is already included in the list "allpmaps", this pmap does
 	 * not need to be inserted into that list.
 	 */
 	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
 #if defined(PAE) || defined(PAE_TABLES)
 	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
 #endif
 	pmap->pm_root.rt_root = 0;
 	CPU_ZERO(&pmap->pm_active);
 	PCPU_SET(curpmap, pmap);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 }
 
 /*
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
  */
 int
 pmap_pinit(pmap_t pmap)
 {
 	vm_page_t m, ptdpg[NPGPTD];
 	vm_paddr_t pa;
 	int i;
 
 	/*
 	 * No need to allocate page table space yet but we do need a valid
 	 * page directory table.
 	 */
 	if (pmap->pm_pdir == NULL) {
 		pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD);
 		if (pmap->pm_pdir == NULL)
 			return (0);
 #if defined(PAE) || defined(PAE_TABLES)
 		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
 		KASSERT(((vm_offset_t)pmap->pm_pdpt &
 		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
 		    ("pmap_pinit: pdpt misaligned"));
 		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
 		    ("pmap_pinit: pdpt above 4g"));
 #endif
 		pmap->pm_root.rt_root = 0;
 	}
 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
 	    ("pmap_pinit: pmap has reserved page table page(s)"));
 
 	/*
 	 * allocate the page directory page(s)
 	 */
 	for (i = 0; i < NPGPTD;) {
 		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 		if (m == NULL)
 			VM_WAIT;
 		else {
 			ptdpg[i++] = m;
 		}
 	}
 
 	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
 
 	for (i = 0; i < NPGPTD; i++)
 		if ((ptdpg[i]->flags & PG_ZERO) == 0)
 			pagezero(pmap->pm_pdir + (i * NPDEPG));
 
 	mtx_lock_spin(&allpmaps_lock);
 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
 	/* Copy the kernel page table directory entries. */
 	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
 	mtx_unlock_spin(&allpmaps_lock);
 
 	/* install self-referential address mapping entry(s) */
 	for (i = 0; i < NPGPTD; i++) {
 		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
 		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
 #if defined(PAE) || defined(PAE_TABLES)
 		pmap->pm_pdpt[i] = pa | PG_V;
 #endif
 	}
 
 	CPU_ZERO(&pmap->pm_active);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 
 	return (1);
 }
 
 /*
  * this routine is called if the page table page is not
  * mapped correctly.
  */
 static vm_page_t
 _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags)
 {
 	vm_paddr_t ptepa;
 	vm_page_t m;
 
 	/*
 	 * Allocate a page table page.
 	 */
 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 		if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
 			PMAP_UNLOCK(pmap);
 			rw_wunlock(&pvh_global_lock);
 			VM_WAIT;
 			rw_wlock(&pvh_global_lock);
 			PMAP_LOCK(pmap);
 		}
 
 		/*
 		 * Indicate the need to retry.  While waiting, the page table
 		 * page may have been allocated.
 		 */
 		return (NULL);
 	}
 	if ((m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 
 	/*
 	 * Map the pagetable page into the process address space, if
 	 * it isn't already there.
 	 */
 
 	pmap->pm_stats.resident_count++;
 
 	ptepa = VM_PAGE_TO_PHYS(m);
 	pmap->pm_pdir[ptepindex] =
 		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
 
 	return (m);
 }
 
 static vm_page_t
 pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags)
 {
 	u_int ptepindex;
 	pd_entry_t ptepa;
 	vm_page_t m;
 
 	/*
 	 * Calculate pagetable page index
 	 */
 	ptepindex = va >> PDRSHIFT;
 retry:
 	/*
 	 * Get the page directory entry
 	 */
 	ptepa = pmap->pm_pdir[ptepindex];
 
 	/*
 	 * This supports switching from a 4MB page to a
 	 * normal 4K page.
 	 */
 	if (ptepa & PG_PS) {
 		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
 		ptepa = pmap->pm_pdir[ptepindex];
 	}
 
 	/*
 	 * If the page table page is mapped, we just increment the
 	 * hold count, and activate it.
 	 */
 	if (ptepa) {
 		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
 		m->wire_count++;
 	} else {
 		/*
 		 * Here if the pte page isn't mapped, or if it has
 		 * been deallocated. 
 		 */
 		m = _pmap_allocpte(pmap, ptepindex, flags);
 		if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
 			goto retry;
 	}
 	return (m);
 }
 
 
 /***************************************************
 * Pmap allocation/deallocation routines.
  ***************************************************/
 
 /*
  * Release any resources held by the given physical map.
  * Called when a pmap initialized by pmap_pinit is being released.
  * Should only be called if the map contains no valid mappings.
  */
 void
 pmap_release(pmap_t pmap)
 {
 	vm_page_t m, ptdpg[NPGPTD];
 	int i;
 
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
 	    pmap->pm_stats.resident_count));
 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
 	    ("pmap_release: pmap has reserved page table page(s)"));
 	KASSERT(CPU_EMPTY(&pmap->pm_active),
 	    ("releasing active pmap %p", pmap));
 
 	mtx_lock_spin(&allpmaps_lock);
 	LIST_REMOVE(pmap, pm_list);
 	mtx_unlock_spin(&allpmaps_lock);
 
 	for (i = 0; i < NPGPTD; i++)
 		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
 		    PG_FRAME);
 
 	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
 	    sizeof(*pmap->pm_pdir));
 
 	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
 
 	for (i = 0; i < NPGPTD; i++) {
 		m = ptdpg[i];
 #if defined(PAE) || defined(PAE_TABLES)
 		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
 		    ("pmap_release: got wrong ptd page"));
 #endif
 		m->wire_count--;
 		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 		vm_page_free_zero(m);
 	}
 }
 
 static int
 kvm_size(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
 
 	return (sysctl_handle_long(oidp, &ksize, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 
     0, 0, kvm_size, "IU", "Size of KVM");
 
 static int
 kvm_free(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 
 	return (sysctl_handle_long(oidp, &kfree, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 
     0, 0, kvm_free, "IU", "Amount of KVM free");
 
 /*
  * grow the number of kernel page table entries, if needed
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 	vm_paddr_t ptppaddr;
 	vm_page_t nkpg;
 	pd_entry_t newpdir;
 
 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 	addr = roundup2(addr, NBPDR);
 	if (addr - 1 >= kernel_map->max_offset)
 		addr = kernel_map->max_offset;
 	while (kernel_vm_end < addr) {
 		if (pdir_pde(PTD, kernel_vm_end)) {
 			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 				kernel_vm_end = kernel_map->max_offset;
 				break;
 			}
 			continue;
 		}
 
 		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
 		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 		    VM_ALLOC_ZERO);
 		if (nkpg == NULL)
 			panic("pmap_growkernel: no memory to grow kernel");
 
 		nkpt++;
 
 		if ((nkpg->flags & PG_ZERO) == 0)
 			pmap_zero_page(nkpg);
 		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
 		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
 		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
 
 		pmap_kenter_pde(kernel_vm_end, newpdir);
 		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 			kernel_vm_end = kernel_map->max_offset;
 			break;
 		}
 	}
 }
 
 
 /***************************************************
  * page management routines.
  ***************************************************/
 
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 CTASSERT(_NPCM == 11);
 CTASSERT(_NPCPV == 336);
 
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
 {
 
 	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
 }
 
 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 
 #define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
 #define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
 
 static const uint32_t pc_freemask[_NPCM] = {
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE10
 };
 
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
 	"Current number of pv entries");
 
 #ifdef PV_STATS
 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
 	"Current number of pv entry chunks");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
 	"Current number of pv entry chunks allocated");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
 	"Current number of pv entry chunks frees");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 	"Number of times tried to get a chunk page but failed.");
 
 static long pv_entry_frees, pv_entry_allocs;
 static int pv_entry_spare;
 
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 	"Current number of pv entry frees");
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 	"Current number of pv entry allocs");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 	"Current number of spare pv entries");
 #endif
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
  * another pv entry chunk.
  */
 static vm_page_t
 pmap_pv_reclaim(pmap_t locked_pmap)
 {
 	struct pch newtail;
 	struct pv_chunk *pc;
 	struct md_page *pvh;
 	pd_entry_t *pde;
 	pmap_t pmap;
 	pt_entry_t *pte, tpte;
 	pv_entry_t pv;
 	vm_offset_t va;
 	vm_page_t m, m_pc;
 	struct spglist free;
 	uint32_t inuse;
 	int bit, field, freed;
 
 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 	pmap = NULL;
 	m_pc = NULL;
 	SLIST_INIT(&free);
 	TAILQ_INIT(&newtail);
 	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
 	    SLIST_EMPTY(&free))) {
 		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 		if (pmap != pc->pc_pmap) {
 			if (pmap != NULL) {
 				pmap_invalidate_all(pmap);
 				if (pmap != locked_pmap)
 					PMAP_UNLOCK(pmap);
 			}
 			pmap = pc->pc_pmap;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap)
 				PMAP_LOCK(pmap);
 			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
 				pmap = NULL;
 				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
 				continue;
 			}
 		}
 
 		/*
 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
 		 */
 		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
 			    inuse != 0; inuse &= ~(1UL << bit)) {
 				bit = bsfl(inuse);
 				pv = &pc->pc_pventry[field * 32 + bit];
 				va = pv->pv_va;
 				pde = pmap_pde(pmap, va);
 				if ((*pde & PG_PS) != 0)
 					continue;
 				pte = pmap_pte(pmap, va);
 				tpte = *pte;
 				if ((tpte & PG_W) == 0)
 					tpte = pte_load_clear(pte);
 				pmap_pte_release(pte);
 				if ((tpte & PG_W) != 0)
 					continue;
 				KASSERT(tpte != 0,
 				    ("pmap_pv_reclaim: pmap %p va %x zero pte",
 				    pmap, va));
 				if ((tpte & PG_G) != 0)
 					pmap_invalidate_page(pmap, va);
 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 					vm_page_dirty(m);
 				if ((tpte & PG_A) != 0)
 					vm_page_aflag_set(m, PGA_REFERENCED);
 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 				if (TAILQ_EMPTY(&m->md.pv_list) &&
 				    (m->flags & PG_FICTITIOUS) == 0) {
 					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						vm_page_aflag_clear(m,
 						    PGA_WRITEABLE);
 					}
 				}
 				pc->pc_map[field] |= 1UL << bit;
 				pmap_unuse_pt(pmap, va, &free);
 				freed++;
 			}
 		}
 		if (freed == 0) {
 			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
 			continue;
 		}
 		/* Every freed mapping is for a 4 KB page. */
 		pmap->pm_stats.resident_count -= freed;
 		PV_STAT(pv_entry_frees += freed);
 		PV_STAT(pv_entry_spare += freed);
 		pv_entry_count -= freed;
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		for (field = 0; field < _NPCM; field++)
 			if (pc->pc_map[field] != pc_freemask[field]) {
 				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
 				    pc_list);
 				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
 
 				/*
 				 * One freed pv entry in locked_pmap is
 				 * sufficient.
 				 */
 				if (pmap == locked_pmap)
 					goto out;
 				break;
 			}
 		if (field == _NPCM) {
 			PV_STAT(pv_entry_spare -= _NPCPV);
 			PV_STAT(pc_chunk_count--);
 			PV_STAT(pc_chunk_frees++);
 			/* Entire chunk is free; return it. */
 			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
 			pmap_qremove((vm_offset_t)pc, 1);
 			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
 			break;
 		}
 	}
 out:
 	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
 	if (pmap != NULL) {
 		pmap_invalidate_all(pmap);
 		if (pmap != locked_pmap)
 			PMAP_UNLOCK(pmap);
 	}
 	if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) {
 		m_pc = SLIST_FIRST(&free);
 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 		/* Recycle a freed page table page. */
 		m_pc->wire_count = 1;
 		atomic_add_int(&vm_cnt.v_wire_count, 1);
 	}
 	pmap_free_zero_pages(&free);
 	return (m_pc);
 }
 
 /*
  * free the pv_entry back to the free list
  */
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
 	struct pv_chunk *pc;
 	int idx, field, bit;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(pv_entry_frees++);
 	PV_STAT(pv_entry_spare++);
 	pv_entry_count--;
 	pc = pv_to_chunk(pv);
 	idx = pv - &pc->pc_pventry[0];
 	field = idx / 32;
 	bit = idx % 32;
 	pc->pc_map[field] |= 1ul << bit;
 	for (idx = 0; idx < _NPCM; idx++)
 		if (pc->pc_map[idx] != pc_freemask[idx]) {
 			/*
 			 * 98% of the time, pc is already at the head of the
 			 * list.  If it isn't already, move it to the head.
 			 */
 			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
 			    pc)) {
 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
 				    pc_list);
 			}
 			return;
 		}
 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 	free_pv_chunk(pc);
 }
 
 static void
 free_pv_chunk(struct pv_chunk *pc)
 {
 	vm_page_t m;
 
  	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 	PV_STAT(pv_entry_spare -= _NPCPV);
 	PV_STAT(pc_chunk_count--);
 	PV_STAT(pc_chunk_frees++);
 	/* entire chunk is free, return it */
 	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
 	pmap_qremove((vm_offset_t)pc, 1);
 	vm_page_unwire(m, PQ_NONE);
 	vm_page_free(m);
 	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
 }
 
 /*
  * get a new pv_entry, allocating a block from the system
  * when needed.
  */
 static pv_entry_t
 get_pv_entry(pmap_t pmap, boolean_t try)
 {
 	static const struct timeval printinterval = { 60, 0 };
 	static struct timeval lastprint;
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
 	vm_page_t m;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(pv_entry_allocs++);
 	pv_entry_count++;
 	if (pv_entry_count > pv_entry_high_water)
 		if (ratecheck(&lastprint, &printinterval))
 			printf("Approaching the limit on PV entries, consider "
 			    "increasing either the vm.pmap.shpgperproc or the "
 			    "vm.pmap.pv_entry_max tunable.\n");
 retry:
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
 		for (field = 0; field < _NPCM; field++) {
 			if (pc->pc_map[field]) {
 				bit = bsfl(pc->pc_map[field]);
 				break;
 			}
 		}
 		if (field < _NPCM) {
 			pv = &pc->pc_pventry[field * 32 + bit];
 			pc->pc_map[field] &= ~(1ul << bit);
 			/* If this was the last item, move it to tail */
 			for (field = 0; field < _NPCM; field++)
 				if (pc->pc_map[field] != 0) {
 					PV_STAT(pv_entry_spare--);
 					return (pv);	/* not full, return */
 				}
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 			PV_STAT(pv_entry_spare--);
 			return (pv);
 		}
 	}
 	/*
 	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
 	 * global lock.  If "pv_vafree" is currently non-empty, it will
 	 * remain non-empty until pmap_ptelist_alloc() completes.
 	 */
 	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 		if (try) {
 			pv_entry_count--;
 			PV_STAT(pc_chunk_tryfail++);
 			return (NULL);
 		}
 		m = pmap_pv_reclaim(pmap);
 		if (m == NULL)
 			goto retry;
 	}
 	PV_STAT(pc_chunk_count++);
 	PV_STAT(pc_chunk_allocs++);
 	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
 	pmap_qenter((vm_offset_t)pc, &m, 1);
 	pc->pc_pmap = pmap;
 	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
 	for (field = 1; field < _NPCM; field++)
 		pc->pc_map[field] = pc_freemask[field];
 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(pv_entry_spare += _NPCPV - 1);
 	return (pv);
 }
 
 static __inline pv_entry_t
 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t pv;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 			break;
 		}
 	}
 	return (pv);
 }
 
 static void
 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
 
 	/*
 	 * Transfer the 4mpage's pv entry for this mapping to the first
 	 * page's pv list.
 	 */
 	pvh = pa_to_pvh(pa);
 	va = trunc_4mpage(va);
 	pv = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
 	m = PHYS_TO_VM_PAGE(pa);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 	/* Instantiate the remaining NPTEPG - 1 pv entries. */
 	va_last = va + NBPDR - PAGE_SIZE;
 	do {
 		m++;
 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 		    ("pmap_pv_demote_pde: page %p is not managed", m));
 		va += PAGE_SIZE;
 		pmap_insert_entry(pmap, va, m);
 	} while (va < va_last);
 }
 
 static void
 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
 
 	/*
 	 * Transfer the first page's pv entry for this mapping to the
 	 * 4mpage's pv list.  Aside from avoiding the cost of a call
 	 * to get_pv_entry(), a transfer avoids the possibility that
 	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
 	 * removes one of the mappings that is being promoted.
 	 */
 	m = PHYS_TO_VM_PAGE(pa);
 	va = trunc_4mpage(va);
 	pv = pmap_pvh_remove(&m->md, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
 	pvh = pa_to_pvh(pa);
 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 	/* Free the remaining NPTEPG - 1 pv entries. */
 	va_last = va + NBPDR - PAGE_SIZE;
 	do {
 		m++;
 		va += PAGE_SIZE;
 		pmap_pvh_free(&m->md, pmap, va);
 	} while (va < va_last);
 }
 
 static void
 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t pv;
 
 	pv = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
 	free_pv_entry(pmap, pv);
 }
 
 static void
 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
 {
 	struct md_page *pvh;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	pmap_pvh_free(&m->md, pmap, va);
 	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		if (TAILQ_EMPTY(&pvh->pv_list))
 			vm_page_aflag_clear(m, PGA_WRITEABLE);
 	}
 }
 
 /*
  * Create a pv entry for page at pa for
  * (pmap, va).
  */
 static void
 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
 	pv_entry_t pv;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	pv = get_pv_entry(pmap, FALSE);
 	pv->pv_va = va;
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 }
 
 /*
  * Conditionally create a pv entry.
  */
 static boolean_t
 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
 	pv_entry_t pv;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if (pv_entry_count < pv_entry_high_water && 
 	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
 		pv->pv_va = va;
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 		return (TRUE);
 	} else
 		return (FALSE);
 }
 
 /*
  * Create the pv entries for each of the pages within a superpage.
  */
 static boolean_t
 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	if (pv_entry_count < pv_entry_high_water && 
 	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
 		pv->pv_va = va;
 		pvh = pa_to_pvh(pa);
 		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 		return (TRUE);
 	} else
 		return (FALSE);
 }
 
 /*
  * Fills a page table page with mappings to consecutive physical pages.
  */
 static void
 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
 {
 	pt_entry_t *pte;
 
 	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
 		*pte = newpte;	
 		newpte += PAGE_SIZE;
 	}
 }
 
 /*
  * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
  * 2- or 4MB page mapping is invalidated.
  */
 static boolean_t
 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 {
 	pd_entry_t newpde, oldpde;
 	pt_entry_t *firstpte, newpte;
 	vm_paddr_t mptepa;
 	vm_page_t mpte;
 	struct spglist free;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldpde = *pde;
 	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
 	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
 	if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
 	    NULL) {
 		KASSERT((oldpde & PG_W) == 0,
 		    ("pmap_demote_pde: page table page for a wired mapping"
 		    " is missing"));
 
 		/*
 		 * Invalidate the 2- or 4MB page mapping and return
 		 * "failure" if the mapping was never accessed or the
 		 * allocation of the new page table page fails.
 		 */
 		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
 		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
 		    VM_ALLOC_WIRED)) == NULL) {
 			SLIST_INIT(&free);
 			pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
 			pmap_invalidate_page(pmap, trunc_4mpage(va));
 			pmap_free_zero_pages(&free);
 			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
 			    " in pmap %p", va, pmap);
 			return (FALSE);
 		}
 		if (va < VM_MAXUSER_ADDRESS)
 			pmap->pm_stats.resident_count++;
 	}
 	mptepa = VM_PAGE_TO_PHYS(mpte);
 
 	/*
 	 * If the page mapping is in the kernel's address space, then the
 	 * KPTmap can provide access to the page table page.  Otherwise,
 	 * temporarily map the page table page (mpte) into the kernel's
 	 * address space at either PADDR1 or PADDR2. 
 	 */
 	if (va >= KERNBASE)
 		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
 	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
 		if ((*PMAP1 & PG_FRAME) != mptepa) {
 			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
 #ifdef SMP
 			PMAP1cpu = PCPU_GET(cpuid);
 #endif
 			invlcaddr(PADDR1);
 			PMAP1changed++;
 		} else
 #ifdef SMP
 		if (PMAP1cpu != PCPU_GET(cpuid)) {
 			PMAP1cpu = PCPU_GET(cpuid);
 			invlcaddr(PADDR1);
 			PMAP1changedcpu++;
 		} else
 #endif
 			PMAP1unchanged++;
 		firstpte = PADDR1;
 	} else {
 		mtx_lock(&PMAP2mutex);
 		if ((*PMAP2 & PG_FRAME) != mptepa) {
 			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
 			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
 		}
 		firstpte = PADDR2;
 	}
 	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
 	KASSERT((oldpde & PG_A) != 0,
 	    ("pmap_demote_pde: oldpde is missing PG_A"));
 	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
 	    ("pmap_demote_pde: oldpde is missing PG_M"));
 	newpte = oldpde & ~PG_PS;
 	if ((newpte & PG_PDE_PAT) != 0)
 		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
 
 	/*
 	 * If the page table page is new, initialize it.
 	 */
 	if (mpte->wire_count == 1) {
 		mpte->wire_count = NPTEPG;
 		pmap_fill_ptp(firstpte, newpte);
 	}
 	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
 	    ("pmap_demote_pde: firstpte and newpte map different physical"
 	    " addresses"));
 
 	/*
 	 * If the mapping has changed attributes, update the page table
 	 * entries.
 	 */ 
 	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
 		pmap_fill_ptp(firstpte, newpte);
 	
 	/*
 	 * Demote the mapping.  This pmap is locked.  The old PDE has
 	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
 	 * set.  Thus, there is no danger of a race with another
 	 * processor changing the setting of PG_A and/or PG_M between
 	 * the read above and the store below. 
 	 */
 	if (workaround_erratum383)
 		pmap_update_pde(pmap, va, pde, newpde);
 	else if (pmap == kernel_pmap)
 		pmap_kenter_pde(va, newpde);
 	else
 		pde_store(pde, newpde);	
 	if (firstpte == PADDR2)
 		mtx_unlock(&PMAP2mutex);
 
 	/*
 	 * Invalidate the recursive mapping of the page table page.
 	 */
 	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 
 	/*
 	 * Demote the pv entry.  This depends on the earlier demotion
 	 * of the mapping.  Specifically, the (re)creation of a per-
 	 * page pv entry might trigger the execution of pmap_collect(),
 	 * which might reclaim a newly (re)created per-page pv entry
 	 * and destroy the associated mapping.  In order to destroy
 	 * the mapping, the PDE must have already changed from mapping
 	 * the 2mpage to referencing the page table page.
 	 */
 	if ((oldpde & PG_MANAGED) != 0)
 		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
 
 	pmap_pde_demotions++;
 	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
 	    " in pmap %p", va, pmap);
 	return (TRUE);
 }
 
 /*
  * Removes a 2- or 4MB page mapping from the kernel pmap.
  */
 static void
 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 {
 	pd_entry_t newpde;
 	vm_paddr_t mptepa;
 	vm_page_t mpte;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mpte = pmap_remove_pt_page(pmap, va);
 	if (mpte == NULL)
 		panic("pmap_remove_kernel_pde: Missing pt page.");
 
 	mptepa = VM_PAGE_TO_PHYS(mpte);
 	newpde = mptepa | PG_M | PG_A | PG_RW | PG_V;
 
 	/*
 	 * Initialize the page table page.
 	 */
 	pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]);
 
 	/*
 	 * Remove the mapping.
 	 */
 	if (workaround_erratum383)
 		pmap_update_pde(pmap, va, pde, newpde);
 	else 
 		pmap_kenter_pde(va, newpde);
 
 	/*
 	 * Invalidate the recursive mapping of the page table page.
 	 */
 	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 }
 
 /*
  * pmap_remove_pde: do the things to unmap a superpage in a process
  */
 static void
 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
     struct spglist *free)
 {
 	struct md_page *pvh;
 	pd_entry_t oldpde;
 	vm_offset_t eva, va;
 	vm_page_t m, mpte;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & PDRMASK) == 0,
 	    ("pmap_remove_pde: sva is not 4mpage aligned"));
 	oldpde = pte_load_clear(pdq);
 	if (oldpde & PG_W)
 		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
 
 	/*
 	 * Machines that don't support invlpg, also don't support
 	 * PG_G.
 	 */
 	if (oldpde & PG_G)
 		pmap_invalidate_page(kernel_pmap, sva);
 	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 	if (oldpde & PG_MANAGED) {
 		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
 		pmap_pvh_free(pvh, pmap, sva);
 		eva = sva + NBPDR;
 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 		    va < eva; va += PAGE_SIZE, m++) {
 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 				vm_page_dirty(m);
 			if (oldpde & PG_A)
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			if (TAILQ_EMPTY(&m->md.pv_list) &&
 			    TAILQ_EMPTY(&pvh->pv_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
 		}
 	}
 	if (pmap == kernel_pmap) {
 		pmap_remove_kernel_pde(pmap, pdq, sva);
 	} else {
 		mpte = pmap_remove_pt_page(pmap, sva);
 		if (mpte != NULL) {
 			pmap->pm_stats.resident_count--;
 			KASSERT(mpte->wire_count == NPTEPG,
 			    ("pmap_remove_pde: pte page wire count error"));
 			mpte->wire_count = 0;
 			pmap_add_delayed_free_list(mpte, free, FALSE);
 			atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 		}
 	}
 }
 
 /*
  * pmap_remove_pte: do the things to unmap a page in a process
  */
 static int
 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
     struct spglist *free)
 {
 	pt_entry_t oldpte;
 	vm_page_t m;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldpte = pte_load_clear(ptq);
 	KASSERT(oldpte != 0,
 	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
 	if (oldpte & PG_W)
 		pmap->pm_stats.wired_count -= 1;
 	/*
 	 * Machines that don't support invlpg, also don't support
 	 * PG_G.
 	 */
 	if (oldpte & PG_G)
 		pmap_invalidate_page(kernel_pmap, va);
 	pmap->pm_stats.resident_count -= 1;
 	if (oldpte & PG_MANAGED) {
 		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
 		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(m);
 		if (oldpte & PG_A)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 		pmap_remove_entry(pmap, m, va);
 	}
 	return (pmap_unuse_pt(pmap, va, free));
 }
 
 /*
  * Remove a single page from a process address space
  */
 static void
 pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free)
 {
 	pt_entry_t *pte;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
 		return;
 	pmap_remove_pte(pmap, pte, va, free);
 	pmap_invalidate_page(pmap, va);
 }
 
 /*
  *	Remove the given range of addresses from the specified map.
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  */
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t pdnxt;
 	pd_entry_t ptpaddr;
 	pt_entry_t *pte;
 	struct spglist free;
 	int anyvalid;
 
 	/*
 	 * Perform an unsynchronized read.  This is, however, safe.
 	 */
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
 	anyvalid = 0;
 	SLIST_INIT(&free);
 
 	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	PMAP_LOCK(pmap);
 
 	/*
 	 * special handling of removing one page.  a very
 	 * common operation and easy to short circuit some
 	 * code.
 	 */
 	if ((sva + PAGE_SIZE == eva) && 
 	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
 		pmap_remove_page(pmap, sva, &free);
 		goto out;
 	}
 
 	for (; sva < eva; sva = pdnxt) {
 		u_int pdirindex;
 
 		/*
 		 * Calculate index for next page table.
 		 */
 		pdnxt = (sva + NBPDR) & ~PDRMASK;
 		if (pdnxt < sva)
 			pdnxt = eva;
 		if (pmap->pm_stats.resident_count == 0)
 			break;
 
 		pdirindex = sva >> PDRSHIFT;
 		ptpaddr = pmap->pm_pdir[pdirindex];
 
 		/*
 		 * Weed out invalid mappings. Note: we assume that the page
 		 * directory table is always allocated, and in kernel virtual.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
 			/*
 			 * Are we removing the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
 				/*
 				 * The TLB entry for a PG_G mapping is
 				 * invalidated by pmap_remove_pde().
 				 */
 				if ((ptpaddr & PG_G) == 0)
 					anyvalid = 1;
 				pmap_remove_pde(pmap,
 				    &pmap->pm_pdir[pdirindex], sva, &free);
 				continue;
 			} else if (!pmap_demote_pde(pmap,
 			    &pmap->pm_pdir[pdirindex], sva)) {
 				/* The large page mapping was destroyed. */
 				continue;
 			}
 		}
 
 		/*
 		 * Limit our scan to either the end of the va represented
 		 * by the current page table page, or to the end of the
 		 * range being removed.
 		 */
 		if (pdnxt > eva)
 			pdnxt = eva;
 
 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
 		    sva += PAGE_SIZE) {
 			if (*pte == 0)
 				continue;
 
 			/*
 			 * The TLB entry for a PG_G mapping is invalidated
 			 * by pmap_remove_pte().
 			 */
 			if ((*pte & PG_G) == 0)
 				anyvalid = 1;
 			if (pmap_remove_pte(pmap, pte, sva, &free))
 				break;
 		}
 	}
 out:
 	sched_unpin();
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	pmap_free_zero_pages(&free);
 }
 
 /*
  *	Routine:	pmap_remove_all
  *	Function:
  *		Removes this physical page from
  *		all physical maps in which it resides.
  *		Reflects back modify bits to the pager.
  *
  *	Notes:
  *		Original versions of this routine were very
  *		inefficient because they iteratively called
  *		pmap_remove (slow...)
  */
 
 void
 pmap_remove_all(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	pmap_t pmap;
 	pt_entry_t *pte, tpte;
 	pd_entry_t *pde;
 	vm_offset_t va;
 	struct spglist free;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	SLIST_INIT(&free);
 	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 		va = pv->pv_va;
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, va);
 		(void)pmap_demote_pde(pmap, pde, va);
 		PMAP_UNLOCK(pmap);
 	}
 small_mappings:
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pmap->pm_stats.resident_count--;
 		pde = pmap_pde(pmap, pv->pv_va);
 		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
 		    " a 4mpage in page %p's pv list", m));
 		pte = pmap_pte_quick(pmap, pv->pv_va);
 		tpte = pte_load_clear(pte);
 		KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
 		    pmap, pv->pv_va));
 		if (tpte & PG_W)
 			pmap->pm_stats.wired_count--;
 		if (tpte & PG_A)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
 		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(m);
 		pmap_unuse_pt(pmap, pv->pv_va, &free);
 		pmap_invalidate_page(pmap, pv->pv_va);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 		free_pv_entry(pmap, pv);
 		PMAP_UNLOCK(pmap);
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	sched_unpin();
 	rw_wunlock(&pvh_global_lock);
 	pmap_free_zero_pages(&free);
 }
 
 /*
  * pmap_protect_pde: do the things to protect a 4mpage in a process
  */
 static boolean_t
 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
 {
 	pd_entry_t newpde, oldpde;
 	vm_offset_t eva, va;
 	vm_page_t m;
 	boolean_t anychanged;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & PDRMASK) == 0,
 	    ("pmap_protect_pde: sva is not 4mpage aligned"));
 	anychanged = FALSE;
 retry:
 	oldpde = newpde = *pde;
 	if (oldpde & PG_MANAGED) {
 		eva = sva + NBPDR;
 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 		    va < eva; va += PAGE_SIZE, m++)
 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 				vm_page_dirty(m);
 	}
 	if ((prot & VM_PROT_WRITE) == 0)
 		newpde &= ~(PG_RW | PG_M);
 #if defined(PAE) || defined(PAE_TABLES)
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpde |= pg_nx;
 #endif
 	if (newpde != oldpde) {
 		if (!pde_cmpset(pde, oldpde, newpde))
 			goto retry;
 		if (oldpde & PG_G)
 			pmap_invalidate_page(pmap, sva);
 		else
 			anychanged = TRUE;
 	}
 	return (anychanged);
 }
 
 /*
  *	Set the physical protection on the
  *	specified range of this map as requested.
  */
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	vm_offset_t pdnxt;
 	pd_entry_t ptpaddr;
 	pt_entry_t *pte;
 	boolean_t anychanged, pv_lists_locked;
 
 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
 	if (prot == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
 		return;
 	}
 
 #if defined(PAE) || defined(PAE_TABLES)
 	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
 	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
 		return;
 #else
 	if (prot & VM_PROT_WRITE)
 		return;
 #endif
 
 	if (pmap_is_current(pmap))
 		pv_lists_locked = FALSE;
 	else {
 		pv_lists_locked = TRUE;
 resume:
 		rw_wlock(&pvh_global_lock);
 		sched_pin();
 	}
 	anychanged = FALSE;
 
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = pdnxt) {
 		pt_entry_t obits, pbits;
 		u_int pdirindex;
 
 		pdnxt = (sva + NBPDR) & ~PDRMASK;
 		if (pdnxt < sva)
 			pdnxt = eva;
 
 		pdirindex = sva >> PDRSHIFT;
 		ptpaddr = pmap->pm_pdir[pdirindex];
 
 		/*
 		 * Weed out invalid mappings. Note: we assume that the page
 		 * directory table is always allocated, and in kernel virtual.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
 			/*
 			 * Are we protecting the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
 				/*
 				 * The TLB entry for a PG_G mapping is
 				 * invalidated by pmap_protect_pde().
 				 */
 				if (pmap_protect_pde(pmap,
 				    &pmap->pm_pdir[pdirindex], sva, prot))
 					anychanged = TRUE;
 				continue;
 			} else {
 				if (!pv_lists_locked) {
 					pv_lists_locked = TRUE;
 					if (!rw_try_wlock(&pvh_global_lock)) {
 						if (anychanged)
 							pmap_invalidate_all(
 							    pmap);
 						PMAP_UNLOCK(pmap);
 						goto resume;
 					}
 					sched_pin();
 				}
 				if (!pmap_demote_pde(pmap,
 				    &pmap->pm_pdir[pdirindex], sva)) {
 					/*
 					 * The large page mapping was
 					 * destroyed.
 					 */
 					continue;
 				}
 			}
 		}
 
 		if (pdnxt > eva)
 			pdnxt = eva;
 
 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
 		    sva += PAGE_SIZE) {
 			vm_page_t m;
 
 retry:
 			/*
 			 * Regardless of whether a pte is 32 or 64 bits in
 			 * size, PG_RW, PG_A, and PG_M are among the least
 			 * significant 32 bits.
 			 */
 			obits = pbits = *pte;
 			if ((pbits & PG_V) == 0)
 				continue;
 
 			if ((prot & VM_PROT_WRITE) == 0) {
 				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
 				    (PG_MANAGED | PG_M | PG_RW)) {
 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 					vm_page_dirty(m);
 				}
 				pbits &= ~(PG_RW | PG_M);
 			}
 #if defined(PAE) || defined(PAE_TABLES)
 			if ((prot & VM_PROT_EXECUTE) == 0)
 				pbits |= pg_nx;
 #endif
 
 			if (pbits != obits) {
 #if defined(PAE) || defined(PAE_TABLES)
 				if (!atomic_cmpset_64(pte, obits, pbits))
 					goto retry;
 #else
 				if (!atomic_cmpset_int((u_int *)pte, obits,
 				    pbits))
 					goto retry;
 #endif
 				if (obits & PG_G)
 					pmap_invalidate_page(pmap, sva);
 				else
 					anychanged = TRUE;
 			}
 		}
 	}
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	if (pv_lists_locked) {
 		sched_unpin();
 		rw_wunlock(&pvh_global_lock);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are
  * within a single page table page (PTP) to a single 2- or 4MB page mapping.
  * For promotion to occur, two conditions must be met: (1) the 4KB page
  * mappings must map aligned, contiguous physical memory and (2) the 4KB page
  * mappings must have identical characteristics.
  *
  * Managed (PG_MANAGED) mappings within the kernel address space are not
  * promoted.  The reason is that kernel PDEs are replicated in each pmap but
  * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
  * pmap.
  */
 static void
 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 {
 	pd_entry_t newpde;
 	pt_entry_t *firstpte, oldpte, pa, *pte;
 	vm_offset_t oldpteva;
 	vm_page_t mpte;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
 	 * either invalid, unused, or does not map the first 4KB physical page
 	 * within a 2- or 4MB page.
 	 */
 	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
 setpde:
 	newpde = *firstpte;
 	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
 		pmap_pde_p_failures++;
 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
 		    " in pmap %p", va, pmap);
 		return;
 	}
 	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
 		pmap_pde_p_failures++;
 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
 		    " in pmap %p", va, pmap);
 		return;
 	}
 	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
 		/*
 		 * When PG_M is already clear, PG_RW can be cleared without
 		 * a TLB invalidation.
 		 */
 		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
 		    ~PG_RW))  
 			goto setpde;
 		newpde &= ~PG_RW;
 	}
 
 	/* 
 	 * Examine each of the other PTEs in the specified PTP.  Abort if this
 	 * PTE maps an unexpected 4KB physical page or does not have identical
 	 * characteristics to the first PTE.
 	 */
 	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
 	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
 setpte:
 		oldpte = *pte;
 		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
 			pmap_pde_p_failures++;
 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
 			    " in pmap %p", va, pmap);
 			return;
 		}
 		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
 			/*
 			 * When PG_M is already clear, PG_RW can be cleared
 			 * without a TLB invalidation.
 			 */
 			if (!atomic_cmpset_int((u_int *)pte, oldpte,
 			    oldpte & ~PG_RW))
 				goto setpte;
 			oldpte &= ~PG_RW;
 			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
 			    (va & ~PDRMASK);
 			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
 			    " in pmap %p", oldpteva, pmap);
 		}
 		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
 			pmap_pde_p_failures++;
 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
 			    " in pmap %p", va, pmap);
 			return;
 		}
 		pa -= PAGE_SIZE;
 	}
 
 	/*
 	 * Save the page table page in its current state until the PDE
 	 * mapping the superpage is demoted by pmap_demote_pde() or
 	 * destroyed by pmap_remove_pde(). 
 	 */
 	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 	KASSERT(mpte >= vm_page_array &&
 	    mpte < &vm_page_array[vm_page_array_size],
 	    ("pmap_promote_pde: page table page is out of range"));
 	KASSERT(mpte->pindex == va >> PDRSHIFT,
 	    ("pmap_promote_pde: page table page's pindex is wrong"));
 	if (pmap_insert_pt_page(pmap, mpte)) {
 		pmap_pde_p_failures++;
 		CTR2(KTR_PMAP,
 		    "pmap_promote_pde: failure for va %#x in pmap %p", va,
 		    pmap);
 		return;
 	}
 
 	/*
 	 * Promote the pv entries.
 	 */
 	if ((newpde & PG_MANAGED) != 0)
 		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
 
 	/*
 	 * Propagate the PAT index to its proper position.
 	 */
 	if ((newpde & PG_PTE_PAT) != 0)
 		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
 
 	/*
 	 * Map the superpage.
 	 */
 	if (workaround_erratum383)
 		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
 	else if (pmap == kernel_pmap)
 		pmap_kenter_pde(va, PG_PS | newpde);
 	else
 		pde_store(pde, PG_PS | newpde);
 
 	pmap_pde_promotions++;
 	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
 	    " in pmap %p", va, pmap);
 }
 
 /*
  *	Insert the given physical page (p) at
  *	the specified virtual address (v) in the
  *	target physical map with the protection requested.
  *
  *	If specified, the page will be wired down, meaning
  *	that the related pte can not be reclaimed.
  *
  *	NB:  This is the only routine which MAY NOT lazy-evaluate
  *	or lose information.  That is, this routine must actually
  *	insert this page into the given map NOW.
  */
 int
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     u_int flags, int8_t psind)
 {
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	pt_entry_t newpte, origpte;
 	pv_entry_t pv;
 	vm_paddr_t opa, pa;
 	vm_page_t mpte, om;
 	boolean_t invlva, wired;
 
 	va = trunc_page(va);
 	mpte = NULL;
 	wired = (flags & PMAP_ENTER_WIRED) != 0;
 
 	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
 	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
 	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
 	    va));
 	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
 		VM_OBJECT_ASSERT_LOCKED(m->object);
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	sched_pin();
 
 	pde = pmap_pde(pmap, va);
 	if (va < VM_MAXUSER_ADDRESS) {
 		/*
 		 * va is for UVA.
 		 * In the case that a page table page is not resident,
 		 * we are creating it here.  pmap_allocpte() handles
 		 * demotion.
 		 */
 		mpte = pmap_allocpte(pmap, va, flags);
 		if (mpte == NULL) {
 			KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0,
 			    ("pmap_allocpte failed with sleep allowed"));
 			sched_unpin();
 			rw_wunlock(&pvh_global_lock);
 			PMAP_UNLOCK(pmap);
 			return (KERN_RESOURCE_SHORTAGE);
 		}
 	} else {
 		/*
 		 * va is for KVA, so pmap_demote_pde() will never fail
 		 * to install a page table page.  PG_V is also
 		 * asserted by pmap_demote_pde().
 		 */
 		KASSERT(pde != NULL && (*pde & PG_V) != 0,
 		    ("KVA %#x invalid pde pdir %#jx", va,
 		    (uintmax_t)pmap->pm_pdir[PTDPTDI]));
 		if ((*pde & PG_PS) != 0)
 			pmap_demote_pde(pmap, pde, va);
 	}
 	pte = pmap_pte_quick(pmap, va);
 
 	/*
 	 * Page Directory table entry is not valid, which should not
 	 * happen.  We should have either allocated the page table
 	 * page or demoted the existing mapping above.
 	 */
 	if (pte == NULL) {
 		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
 		    (uintmax_t)pmap->pm_pdir[PTDPTDI], va);
 	}
 
 	pa = VM_PAGE_TO_PHYS(m);
 	om = NULL;
 	origpte = *pte;
 	opa = origpte & PG_FRAME;
 
 	/*
 	 * Mapping has not changed, must be protection or wiring change.
 	 */
 	if (origpte && (opa == pa)) {
 		/*
 		 * Wiring change, just update stats. We don't worry about
 		 * wiring PT pages as they remain resident as long as there
 		 * are valid mappings in them. Hence, if a user page is wired,
 		 * the PT page will be also.
 		 */
 		if (wired && ((origpte & PG_W) == 0))
 			pmap->pm_stats.wired_count++;
 		else if (!wired && (origpte & PG_W))
 			pmap->pm_stats.wired_count--;
 
 		/*
 		 * Remove extra pte reference
 		 */
 		if (mpte)
 			mpte->wire_count--;
 
 		if (origpte & PG_MANAGED) {
 			om = m;
 			pa |= PG_MANAGED;
 		}
 		goto validate;
 	} 
 
 	pv = NULL;
 
 	/*
 	 * Mapping has changed, invalidate old range and fall through to
 	 * handle validating new mapping.
 	 */
 	if (opa) {
 		if (origpte & PG_W)
 			pmap->pm_stats.wired_count--;
 		if (origpte & PG_MANAGED) {
 			om = PHYS_TO_VM_PAGE(opa);
 			pv = pmap_pvh_remove(&om->md, pmap, va);
 		}
 		if (mpte != NULL) {
 			mpte->wire_count--;
 			KASSERT(mpte->wire_count > 0,
 			    ("pmap_enter: missing reference to page table page,"
 			     " va: 0x%x", va));
 		}
 	} else
 		pmap->pm_stats.resident_count++;
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
 		    ("pmap_enter: managed mapping within the clean submap"));
 		if (pv == NULL)
 			pv = get_pv_entry(pmap, FALSE);
 		pv->pv_va = va;
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 		pa |= PG_MANAGED;
 	} else if (pv != NULL)
 		free_pv_entry(pmap, pv);
 
 	/*
 	 * Increment counters
 	 */
 	if (wired)
 		pmap->pm_stats.wired_count++;
 
 validate:
 	/*
 	 * Now validate mapping with desired protection/wiring.
 	 */
 	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
 	if ((prot & VM_PROT_WRITE) != 0) {
 		newpte |= PG_RW;
 		if ((newpte & PG_MANAGED) != 0)
 			vm_page_aflag_set(m, PGA_WRITEABLE);
 	}
 #if defined(PAE) || defined(PAE_TABLES)
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpte |= pg_nx;
 #endif
 	if (wired)
 		newpte |= PG_W;
 	if (va < VM_MAXUSER_ADDRESS)
 		newpte |= PG_U;
 	if (pmap == kernel_pmap)
 		newpte |= pgeflag;
 
 	/*
 	 * if the mapping or permission bits are different, we need
 	 * to update the pte.
 	 */
 	if ((origpte & ~(PG_M|PG_A)) != newpte) {
 		newpte |= PG_A;
 		if ((flags & VM_PROT_WRITE) != 0)
 			newpte |= PG_M;
 		if (origpte & PG_V) {
 			invlva = FALSE;
 			origpte = pte_load_store(pte, newpte);
 			if (origpte & PG_A) {
 				if (origpte & PG_MANAGED)
 					vm_page_aflag_set(om, PGA_REFERENCED);
 				if (opa != VM_PAGE_TO_PHYS(m))
 					invlva = TRUE;
 #if defined(PAE) || defined(PAE_TABLES)
 				if ((origpte & PG_NX) == 0 &&
 				    (newpte & PG_NX) != 0)
 					invlva = TRUE;
 #endif
 			}
 			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 				if ((origpte & PG_MANAGED) != 0)
 					vm_page_dirty(om);
 				if ((prot & VM_PROT_WRITE) == 0)
 					invlva = TRUE;
 			}
 			if ((origpte & PG_MANAGED) != 0 &&
 			    TAILQ_EMPTY(&om->md.pv_list) &&
 			    ((om->flags & PG_FICTITIOUS) != 0 ||
 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 				vm_page_aflag_clear(om, PGA_WRITEABLE);
 			if (invlva)
 				pmap_invalidate_page(pmap, va);
 		} else
 			pte_store(pte, newpte);
 	}
 
 	/*
 	 * If both the page table page and the reservation are fully
 	 * populated, then attempt promotion.
 	 */
 	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
 	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
 	    vm_reserv_level_iffullpop(m) == 0)
 		pmap_promote_pde(pmap, pde, va);
 
 	sched_unpin();
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	return (KERN_SUCCESS);
 }
 
 /*
  * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
  * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
  * blocking, (2) a mapping already exists at the specified virtual address, or
  * (3) a pv entry cannot be allocated without reclaiming another pv entry. 
  */
 static boolean_t
 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 	pd_entry_t *pde, newpde;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	pde = pmap_pde(pmap, va);
 	if (*pde != 0) {
 		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return (FALSE);
 	}
 	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
 	    PG_PS | PG_V;
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		newpde |= PG_MANAGED;
 
 		/*
 		 * Abort this mapping if its PV entry could not be created.
 		 */
 		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return (FALSE);
 		}
 	}
 #if defined(PAE) || defined(PAE_TABLES)
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpde |= pg_nx;
 #endif
 	if (va < VM_MAXUSER_ADDRESS)
 		newpde |= PG_U;
 
 	/*
 	 * Increment counters.
 	 */
 	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
 
 	/*
 	 * Map the superpage.
 	 */
 	pde_store(pde, newpde);
 
 	pmap_pde_mappings++;
 	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
 	    " in pmap %p", va, pmap);
 	return (TRUE);
 }
 
 /*
  * Maps a sequence of resident pages belonging to the same object.
  * The sequence begins with the given page m_start.  This page is
  * mapped at the given virtual address start.  Each subsequent page is
  * mapped at a virtual address that is offset from start by the same
  * amount as the page is offset from m_start within the object.  The
  * last page in the sequence is the page with the largest offset from
  * m_start that can be mapped at a virtual address less than the given
  * virtual address end.  Not every virtual page between start and end
  * is mapped; only those for which a resident page exists with the
  * corresponding offset from m_start are mapped.
  */
 void
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	vm_offset_t va;
 	vm_page_t m, mpte;
 	vm_pindex_t diff, psize;
 
 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
 
 	psize = atop(end - start);
 	mpte = NULL;
 	m = m_start;
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 		va = start + ptoa(diff);
 		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
 		    m->psind == 1 && pg_ps_enabled &&
 		    pmap_enter_pde(pmap, va, m, prot))
 			m = &m[NBPDR / PAGE_SIZE - 1];
 		else
 			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
 			    mpte);
 		m = TAILQ_NEXT(m, listq);
 	}
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * this code makes some *MAJOR* assumptions:
  * 1. Current pmap & pmap exists.
  * 2. Not wired.
  * 3. Read access.
  * 4. No page table pages.
  * but is *MUCH* faster than pmap_enter...
  */
 
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 static vm_page_t
 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, vm_page_t mpte)
 {
 	pt_entry_t *pte;
 	vm_paddr_t pa;
 	struct spglist free;
 
 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 	    (m->oflags & VPO_UNMANAGED) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 	if (va < VM_MAXUSER_ADDRESS) {
 		u_int ptepindex;
 		pd_entry_t ptepa;
 
 		/*
 		 * Calculate pagetable page index
 		 */
 		ptepindex = va >> PDRSHIFT;
 		if (mpte && (mpte->pindex == ptepindex)) {
 			mpte->wire_count++;
 		} else {
 			/*
 			 * Get the page directory entry
 			 */
 			ptepa = pmap->pm_pdir[ptepindex];
 
 			/*
 			 * If the page table page is mapped, we just increment
 			 * the hold count, and activate it.
 			 */
 			if (ptepa) {
 				if (ptepa & PG_PS)
 					return (NULL);
 				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
 				mpte->wire_count++;
 			} else {
 				mpte = _pmap_allocpte(pmap, ptepindex,
 				    PMAP_ENTER_NOSLEEP);
 				if (mpte == NULL)
 					return (mpte);
 			}
 		}
 	} else {
 		mpte = NULL;
 	}
 
 	/*
 	 * This call to vtopte makes the assumption that we are
 	 * entering the page into the current pmap.  In order to support
 	 * quick entry into any pmap, one would likely use pmap_pte_quick.
 	 * But that isn't as quick as vtopte.
 	 */
 	pte = vtopte(va);
 	if (*pte) {
 		if (mpte != NULL) {
 			mpte->wire_count--;
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
 	    !pmap_try_insert_pv_entry(pmap, va, m)) {
 		if (mpte != NULL) {
 			SLIST_INIT(&free);
 			if (pmap_unwire_ptp(pmap, mpte, &free)) {
 				pmap_invalidate_page(pmap, va);
 				pmap_free_zero_pages(&free);
 			}
 			
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Increment counters
 	 */
 	pmap->pm_stats.resident_count++;
 
 	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
 #if defined(PAE) || defined(PAE_TABLES)
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		pa |= pg_nx;
 #endif
 
 	/*
 	 * Now validate mapping with RO protection
 	 */
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		pte_store(pte, pa | PG_V | PG_U);
 	else
 		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
 	return (mpte);
 }
 
 /*
  * Make a temporary mapping for a physical address.  This is only intended
  * to be used for panic dumps.
  */
 void *
 pmap_kenter_temporary(vm_paddr_t pa, int i)
 {
 	vm_offset_t va;
 
 	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
 	pmap_kenter(va, pa);
 	invlpg(va);
 	return ((void *)crashdumpmap);
 }
 
 /*
  * This code maps large physical mmap regions into the
  * processor address space.  Note that some shortcuts
  * are taken, but the code works.
  */
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
     vm_pindex_t pindex, vm_size_t size)
 {
 	pd_entry_t *pde;
 	vm_paddr_t pa, ptepa;
 	vm_page_t p;
 	int pat_mode;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 	    ("pmap_object_init_pt: non-device object"));
 	if (pseflag && 
 	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
 		if (!vm_object_populate(object, pindex, pindex + atop(size)))
 			return;
 		p = vm_page_lookup(object, pindex);
 		KASSERT(p->valid == VM_PAGE_BITS_ALL,
 		    ("pmap_object_init_pt: invalid page %p", p));
 		pat_mode = p->md.pat_mode;
 
 		/*
 		 * Abort the mapping if the first page is not physically
 		 * aligned to a 2/4MB page boundary.
 		 */
 		ptepa = VM_PAGE_TO_PHYS(p);
 		if (ptepa & (NBPDR - 1))
 			return;
 
 		/*
 		 * Skip the first page.  Abort the mapping if the rest of
 		 * the pages are not physically contiguous or have differing
 		 * memory attributes.
 		 */
 		p = TAILQ_NEXT(p, listq);
 		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
 		    pa += PAGE_SIZE) {
 			KASSERT(p->valid == VM_PAGE_BITS_ALL,
 			    ("pmap_object_init_pt: invalid page %p", p));
 			if (pa != VM_PAGE_TO_PHYS(p) ||
 			    pat_mode != p->md.pat_mode)
 				return;
 			p = TAILQ_NEXT(p, listq);
 		}
 
 		/*
 		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
 		 * "size" is a multiple of 2/4M, adding the PAT setting to
 		 * "pa" will not affect the termination of this loop.
 		 */
 		PMAP_LOCK(pmap);
 		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
 		    size; pa += NBPDR) {
 			pde = pmap_pde(pmap, addr);
 			if (*pde == 0) {
 				pde_store(pde, pa | PG_PS | PG_M | PG_A |
 				    PG_U | PG_RW | PG_V);
 				pmap->pm_stats.resident_count += NBPDR /
 				    PAGE_SIZE;
 				pmap_pde_mappings++;
 			}
 			/* Else continue on if the PDE is already valid. */
 			addr += NBPDR;
 		}
 		PMAP_UNLOCK(pmap);
 	}
 }
 
 /*
  *	Clear the wired attribute from the mappings for the specified range of
  *	addresses in the given pmap.  Every valid mapping within that range
  *	must have the wired attribute set.  In contrast, invalid mappings
  *	cannot have the wired attribute set, so they are ignored.
  *
  *	The wired attribute of the page table entry is not a hardware feature,
  *	so there is no need to invalidate any TLB entries.
  */
 void
 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t pdnxt;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	boolean_t pv_lists_locked;
 
 	if (pmap_is_current(pmap))
 		pv_lists_locked = FALSE;
 	else {
 		pv_lists_locked = TRUE;
 resume:
 		rw_wlock(&pvh_global_lock);
 		sched_pin();
 	}
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = pdnxt) {
 		pdnxt = (sva + NBPDR) & ~PDRMASK;
 		if (pdnxt < sva)
 			pdnxt = eva;
 		pde = pmap_pde(pmap, sva);
 		if ((*pde & PG_V) == 0)
 			continue;
 		if ((*pde & PG_PS) != 0) {
 			if ((*pde & PG_W) == 0)
 				panic("pmap_unwire: pde %#jx is missing PG_W",
 				    (uintmax_t)*pde);
 
 			/*
 			 * Are we unwiring the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
 				/*
 				 * Regardless of whether a pde (or pte) is 32
 				 * or 64 bits in size, PG_W is among the least
 				 * significant 32 bits.
 				 */
 				atomic_clear_int((u_int *)pde, PG_W);
 				pmap->pm_stats.wired_count -= NBPDR /
 				    PAGE_SIZE;
 				continue;
 			} else {
 				if (!pv_lists_locked) {
 					pv_lists_locked = TRUE;
 					if (!rw_try_wlock(&pvh_global_lock)) {
 						PMAP_UNLOCK(pmap);
 						/* Repeat sva. */
 						goto resume;
 					}
 					sched_pin();
 				}
 				if (!pmap_demote_pde(pmap, pde, sva))
 					panic("pmap_unwire: demotion failed");
 			}
 		}
 		if (pdnxt > eva)
 			pdnxt = eva;
 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
 		    sva += PAGE_SIZE) {
 			if ((*pte & PG_V) == 0)
 				continue;
 			if ((*pte & PG_W) == 0)
 				panic("pmap_unwire: pte %#jx is missing PG_W",
 				    (uintmax_t)*pte);
 
 			/*
 			 * PG_W must be cleared atomically.  Although the pmap
 			 * lock synchronizes access to PG_W, another processor
 			 * could be setting PG_M and/or PG_A concurrently.
 			 *
 			 * PG_W is among the least significant 32 bits.
 			 */
 			atomic_clear_int((u_int *)pte, PG_W);
 			pmap->pm_stats.wired_count--;
 		}
 	}
 	if (pv_lists_locked) {
 		sched_unpin();
 		rw_wunlock(&pvh_global_lock);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 
 /*
  *	Copy the range specified by src_addr/len
  *	from the source map to the range dst_addr/len
  *	in the destination map.
  *
  *	This routine is only advisory and need not do anything.
  */
 
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
     vm_offset_t src_addr)
 {
 	struct spglist free;
 	vm_offset_t addr;
 	vm_offset_t end_addr = src_addr + len;
 	vm_offset_t pdnxt;
 
 	if (dst_addr != src_addr)
 		return;
 
 	if (!pmap_is_current(src_pmap))
 		return;
 
 	rw_wlock(&pvh_global_lock);
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
 	} else {
 		PMAP_LOCK(src_pmap);
 		PMAP_LOCK(dst_pmap);
 	}
 	sched_pin();
 	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
 		pt_entry_t *src_pte, *dst_pte;
 		vm_page_t dstmpte, srcmpte;
 		pd_entry_t srcptepaddr;
 		u_int ptepindex;
 
 		KASSERT(addr < UPT_MIN_ADDRESS,
 		    ("pmap_copy: invalid to pmap_copy page tables"));
 
 		pdnxt = (addr + NBPDR) & ~PDRMASK;
 		if (pdnxt < addr)
 			pdnxt = end_addr;
 		ptepindex = addr >> PDRSHIFT;
 
 		srcptepaddr = src_pmap->pm_pdir[ptepindex];
 		if (srcptepaddr == 0)
 			continue;
 			
 		if (srcptepaddr & PG_PS) {
 			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
 				continue;
 			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
 			    ((srcptepaddr & PG_MANAGED) == 0 ||
 			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
 			    PG_PS_FRAME))) {
 				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
 				    ~PG_W;
 				dst_pmap->pm_stats.resident_count +=
 				    NBPDR / PAGE_SIZE;
 				pmap_pde_mappings++;
 			}
 			continue;
 		}
 
 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
 		KASSERT(srcmpte->wire_count > 0,
 		    ("pmap_copy: source page table page is unused"));
 
 		if (pdnxt > end_addr)
 			pdnxt = end_addr;
 
 		src_pte = vtopte(addr);
 		while (addr < pdnxt) {
 			pt_entry_t ptetemp;
 			ptetemp = *src_pte;
 			/*
 			 * we only virtual copy managed pages
 			 */
 			if ((ptetemp & PG_MANAGED) != 0) {
 				dstmpte = pmap_allocpte(dst_pmap, addr,
 				    PMAP_ENTER_NOSLEEP);
 				if (dstmpte == NULL)
 					goto out;
 				dst_pte = pmap_pte_quick(dst_pmap, addr);
 				if (*dst_pte == 0 &&
 				    pmap_try_insert_pv_entry(dst_pmap, addr,
 				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
 					/*
 					 * Clear the wired, modified, and
 					 * accessed (referenced) bits
 					 * during the copy.
 					 */
 					*dst_pte = ptetemp & ~(PG_W | PG_M |
 					    PG_A);
 					dst_pmap->pm_stats.resident_count++;
 	 			} else {
 					SLIST_INIT(&free);
 					if (pmap_unwire_ptp(dst_pmap, dstmpte,
 					    &free)) {
 						pmap_invalidate_page(dst_pmap,
 						    addr);
 						pmap_free_zero_pages(&free);
 					}
 					goto out;
 				}
 				if (dstmpte->wire_count >= srcmpte->wire_count)
 					break;
 			}
 			addr += PAGE_SIZE;
 			src_pte++;
 		}
 	}
 out:
 	sched_unpin();
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }	
 
 /*
  * Zero 1 page of virtual memory mapped from a hardware page by the caller.
  */
 static __inline void
 pagezero(void *page)
 {
 #if defined(I686_CPU)
 	if (cpu_class == CPUCLASS_686) {
 #if defined(CPU_ENABLE_SSE)
 		if (cpu_feature & CPUID_SSE2)
 			sse2_pagezero(page);
 		else
 #endif
 			i686_pagezero(page);
 	} else
 #endif
 		bzero(page, PAGE_SIZE);
 }
 
 /*
  * Zero the specified hardware page.
  */
 void
 pmap_zero_page(vm_page_t m)
 {
 	pt_entry_t *cmap_pte2;
 	struct pcpu *pc;
 
 	sched_pin();
 	pc = pcpu_find(curcpu);
 	cmap_pte2 = pc->pc_cmap_pte2;
 	mtx_lock(&pc->pc_cmap_lock);
 	if (*cmap_pte2)
 		panic("pmap_zero_page: CMAP2 busy");
 	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
 	    pmap_cache_bits(m->md.pat_mode, 0);
 	invlcaddr(pc->pc_cmap_addr2);
 	pagezero(pc->pc_cmap_addr2);
 	*cmap_pte2 = 0;
-	mtx_unlock(&pc->pc_cmap_lock);
+
+	/*
+	 * Unpin the thread before releasing the lock.  Otherwise the thread
+	 * could be rescheduled while still bound to the current CPU, only
+	 * to unpin itself immediately upon resuming execution.
+	 */
 	sched_unpin();
+	mtx_unlock(&pc->pc_cmap_lock);
 }
 
 /*
  * Zero an an area within a single hardware page.  off and size must not
  * cover an area beyond a single hardware page.
  */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 	pt_entry_t *cmap_pte2;
 	struct pcpu *pc;
 
 	sched_pin();
 	pc = pcpu_find(curcpu);
 	cmap_pte2 = pc->pc_cmap_pte2;
 	mtx_lock(&pc->pc_cmap_lock);
 	if (*cmap_pte2)
 		panic("pmap_zero_page_area: CMAP2 busy");
 	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
 	    pmap_cache_bits(m->md.pat_mode, 0);
 	invlcaddr(pc->pc_cmap_addr2);
 	if (off == 0 && size == PAGE_SIZE) 
 		pagezero(pc->pc_cmap_addr2);
 	else
 		bzero(pc->pc_cmap_addr2 + off, size);
 	*cmap_pte2 = 0;
-	mtx_unlock(&pc->pc_cmap_lock);
 	sched_unpin();
+	mtx_unlock(&pc->pc_cmap_lock);
 }
 
 /*
  * Copy 1 specified hardware page to another.
  */
 void
 pmap_copy_page(vm_page_t src, vm_page_t dst)
 {
 	pt_entry_t *cmap_pte1, *cmap_pte2;
 	struct pcpu *pc;
 
 	sched_pin();
 	pc = pcpu_find(curcpu);
 	cmap_pte1 = pc->pc_cmap_pte1; 
 	cmap_pte2 = pc->pc_cmap_pte2;
 	mtx_lock(&pc->pc_cmap_lock);
 	if (*cmap_pte1)
 		panic("pmap_copy_page: CMAP1 busy");
 	if (*cmap_pte2)
 		panic("pmap_copy_page: CMAP2 busy");
 	*cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
 	    pmap_cache_bits(src->md.pat_mode, 0);
 	invlcaddr(pc->pc_cmap_addr1);
 	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
 	    pmap_cache_bits(dst->md.pat_mode, 0);
 	invlcaddr(pc->pc_cmap_addr2);
 	bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE);
 	*cmap_pte1 = 0;
 	*cmap_pte2 = 0;
-	mtx_unlock(&pc->pc_cmap_lock);
 	sched_unpin();
+	mtx_unlock(&pc->pc_cmap_lock);
 }
 
 int unmapped_buf_allowed = 1;
 
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
 {
 	vm_page_t a_pg, b_pg;
 	char *a_cp, *b_cp;
 	vm_offset_t a_pg_offset, b_pg_offset;
 	pt_entry_t *cmap_pte1, *cmap_pte2;
 	struct pcpu *pc;
 	int cnt;
 
 	sched_pin();
 	pc = pcpu_find(curcpu);
 	cmap_pte1 = pc->pc_cmap_pte1; 
 	cmap_pte2 = pc->pc_cmap_pte2;
 	mtx_lock(&pc->pc_cmap_lock);
 	if (*cmap_pte1 != 0)
 		panic("pmap_copy_pages: CMAP1 busy");
 	if (*cmap_pte2 != 0)
 		panic("pmap_copy_pages: CMAP2 busy");
 	while (xfersize > 0) {
 		a_pg = ma[a_offset >> PAGE_SHIFT];
 		a_pg_offset = a_offset & PAGE_MASK;
 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
 		b_pg = mb[b_offset >> PAGE_SHIFT];
 		b_pg_offset = b_offset & PAGE_MASK;
 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
 		*cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
 		    pmap_cache_bits(a_pg->md.pat_mode, 0);
 		invlcaddr(pc->pc_cmap_addr1);
 		*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
 		    PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0);
 		invlcaddr(pc->pc_cmap_addr2);
 		a_cp = pc->pc_cmap_addr1 + a_pg_offset;
 		b_cp = pc->pc_cmap_addr2 + b_pg_offset;
 		bcopy(a_cp, b_cp, cnt);
 		a_offset += cnt;
 		b_offset += cnt;
 		xfersize -= cnt;
 	}
 	*cmap_pte1 = 0;
 	*cmap_pte2 = 0;
-	mtx_unlock(&pc->pc_cmap_lock);
 	sched_unpin();
+	mtx_unlock(&pc->pc_cmap_lock);
 }
 
 /*
  * Returns true if the pmap's pv is one of the first
  * 16 pvs linked to from this page.  This count may
  * be changed upwards or downwards in the future; it
  * is only necessary that true be returned for a small
  * subset of pmaps for proper page aging.
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	int loops = 0;
 	boolean_t rv;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 	rv = FALSE;
 	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		if (PV_PMAP(pv) == pmap) {
 			rv = TRUE;
 			break;
 		}
 		loops++;
 		if (loops >= 16)
 			break;
 	}
 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 			if (PV_PMAP(pv) == pmap) {
 				rv = TRUE;
 				break;
 			}
 			loops++;
 			if (loops >= 16)
 				break;
 		}
 	}
 	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
 /*
  *	pmap_page_wired_mappings:
  *
  *	Return the number of managed mappings to the given physical page
  *	that are wired.
  */
 int
 pmap_page_wired_mappings(vm_page_t m)
 {
 	int count;
 
 	count = 0;
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (count);
 	rw_wlock(&pvh_global_lock);
 	count = pmap_pvh_wired_mappings(&m->md, count);
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 	    count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
 	        count);
 	}
 	rw_wunlock(&pvh_global_lock);
 	return (count);
 }
 
 /*
  *	pmap_pvh_wired_mappings:
  *
  *	Return the updated number "count" of managed mappings that are wired.
  */
 static int
 pmap_pvh_wired_mappings(struct md_page *pvh, int count)
 {
 	pmap_t pmap;
 	pt_entry_t *pte;
 	pv_entry_t pv;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	sched_pin();
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pte = pmap_pte_quick(pmap, pv->pv_va);
 		if ((*pte & PG_W) != 0)
 			count++;
 		PMAP_UNLOCK(pmap);
 	}
 	sched_unpin();
 	return (count);
 }
 
 /*
  * Returns TRUE if the given page is mapped individually or as part of
  * a 4mpage.  Otherwise, returns FALSE.
  */
 boolean_t
 pmap_page_is_mapped(vm_page_t m)
 {
 	boolean_t rv;
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (FALSE);
 	rw_wlock(&pvh_global_lock);
 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
 	    ((m->flags & PG_FICTITIOUS) == 0 &&
 	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
 	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
 /*
  * Remove all pages from specified address space
  * this aids process exit speeds.  Also, this code
  * is special cased for current process only, but
  * can have the more generic (and slightly slower)
  * mode enabled.  This is much faster than pmap_remove
  * in the case of running down an entire address space.
  */
 void
 pmap_remove_pages(pmap_t pmap)
 {
 	pt_entry_t *pte, tpte;
 	vm_page_t m, mpte, mt;
 	pv_entry_t pv;
 	struct md_page *pvh;
 	struct pv_chunk *pc, *npc;
 	struct spglist free;
 	int field, idx;
 	int32_t bit;
 	uint32_t inuse, bitmask;
 	int allfree;
 
 	if (pmap != PCPU_GET(curpmap)) {
 		printf("warning: pmap_remove_pages called with non-current pmap\n");
 		return;
 	}
 	SLIST_INIT(&free);
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	sched_pin();
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
 		    pc->pc_pmap));
 		allfree = 1;
 		for (field = 0; field < _NPCM; field++) {
 			inuse = ~pc->pc_map[field] & pc_freemask[field];
 			while (inuse != 0) {
 				bit = bsfl(inuse);
 				bitmask = 1UL << bit;
 				idx = field * 32 + bit;
 				pv = &pc->pc_pventry[idx];
 				inuse &= ~bitmask;
 
 				pte = pmap_pde(pmap, pv->pv_va);
 				tpte = *pte;
 				if ((tpte & PG_PS) == 0) {
 					pte = vtopte(pv->pv_va);
 					tpte = *pte & ~PG_PTE_PAT;
 				}
 
 				if (tpte == 0) {
 					printf(
 					    "TPTE at %p  IS ZERO @ VA %08x\n",
 					    pte, pv->pv_va);
 					panic("bad pte");
 				}
 
 /*
  * We cannot remove wired pages from a process' mapping at this time
  */
 				if (tpte & PG_W) {
 					allfree = 0;
 					continue;
 				}
 
 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 				KASSERT(m->phys_addr == (tpte & PG_FRAME),
 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 				    m, (uintmax_t)m->phys_addr,
 				    (uintmax_t)tpte));
 
 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 				    m < &vm_page_array[vm_page_array_size],
 				    ("pmap_remove_pages: bad tpte %#jx",
 				    (uintmax_t)tpte));
 
 				pte_clear(pte);
 
 				/*
 				 * Update the vm_page_t clean/reference bits.
 				 */
 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 					if ((tpte & PG_PS) != 0) {
 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 							vm_page_dirty(mt);
 					} else
 						vm_page_dirty(m);
 				}
 
 				/* Mark free */
 				PV_STAT(pv_entry_frees++);
 				PV_STAT(pv_entry_spare++);
 				pv_entry_count--;
 				pc->pc_map[field] |= bitmask;
 				if ((tpte & PG_PS) != 0) {
 					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
 					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 							if (TAILQ_EMPTY(&mt->md.pv_list))
 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
 					}
 					mpte = pmap_remove_pt_page(pmap, pv->pv_va);
 					if (mpte != NULL) {
 						pmap->pm_stats.resident_count--;
 						KASSERT(mpte->wire_count == NPTEPG,
 						    ("pmap_remove_pages: pte page wire count error"));
 						mpte->wire_count = 0;
 						pmap_add_delayed_free_list(mpte, &free, FALSE);
 						atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 					}
 				} else {
 					pmap->pm_stats.resident_count--;
 					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 					if (TAILQ_EMPTY(&m->md.pv_list) &&
 					    (m->flags & PG_FICTITIOUS) == 0) {
 						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 						if (TAILQ_EMPTY(&pvh->pv_list))
 							vm_page_aflag_clear(m, PGA_WRITEABLE);
 					}
 					pmap_unuse_pt(pmap, pv->pv_va, &free);
 				}
 			}
 		}
 		if (allfree) {
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 			free_pv_chunk(pc);
 		}
 	}
 	sched_unpin();
 	pmap_invalidate_all(pmap);
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	pmap_free_zero_pages(&free);
 }
 
 /*
  *	pmap_is_modified:
  *
  *	Return whether or not the specified physical page was modified
  *	in any physical maps.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 	boolean_t rv;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_modified: page %p is not managed", m));
 
 	/*
 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
 	 * is clear, no PTEs can have PG_M set.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 		return (FALSE);
 	rw_wlock(&pvh_global_lock);
 	rv = pmap_is_modified_pvh(&m->md) ||
 	    ((m->flags & PG_FICTITIOUS) == 0 &&
 	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
 	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
 /*
  * Returns TRUE if any of the given mappings were used to modify
  * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
  * mappings are supported.
  */
 static boolean_t
 pmap_is_modified_pvh(struct md_page *pvh)
 {
 	pv_entry_t pv;
 	pt_entry_t *pte;
 	pmap_t pmap;
 	boolean_t rv;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	rv = FALSE;
 	sched_pin();
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pte = pmap_pte_quick(pmap, pv->pv_va);
 		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
 		PMAP_UNLOCK(pmap);
 		if (rv)
 			break;
 	}
 	sched_unpin();
 	return (rv);
 }
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Return whether or not the specified virtual address is elgible
  *	for prefault.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	boolean_t rv;
 
 	rv = FALSE;
 	PMAP_LOCK(pmap);
 	pde = pmap_pde(pmap, addr);
 	if (*pde != 0 && (*pde & PG_PS) == 0) {
 		pte = vtopte(addr);
 		rv = *pte == 0;
 	}
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  *	pmap_is_referenced:
  *
  *	Return whether or not the specified physical page was referenced
  *	in any physical maps.
  */
 boolean_t
 pmap_is_referenced(vm_page_t m)
 {
 	boolean_t rv;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
 	rw_wlock(&pvh_global_lock);
 	rv = pmap_is_referenced_pvh(&m->md) ||
 	    ((m->flags & PG_FICTITIOUS) == 0 &&
 	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
 	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
 /*
  * Returns TRUE if any of the given mappings were referenced and FALSE
  * otherwise.  Both page and 4mpage mappings are supported.
  */
 static boolean_t
 pmap_is_referenced_pvh(struct md_page *pvh)
 {
 	pv_entry_t pv;
 	pt_entry_t *pte;
 	pmap_t pmap;
 	boolean_t rv;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	rv = FALSE;
 	sched_pin();
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pte = pmap_pte_quick(pmap, pv->pv_va);
 		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
 		PMAP_UNLOCK(pmap);
 		if (rv)
 			break;
 	}
 	sched_unpin();
 	return (rv);
 }
 
 /*
  * Clear the write and modified bits in each of the given page's mappings.
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t next_pv, pv;
 	pmap_t pmap;
 	pd_entry_t *pde;
 	pt_entry_t oldpte, *pte;
 	vm_offset_t va;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_write: page %p is not managed", m));
 
 	/*
 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 	 * set by another thread while the object is locked.  Thus,
 	 * if PGA_WRITEABLE is clear, no page table entries need updating.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		va = pv->pv_va;
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, va);
 		if ((*pde & PG_RW) != 0)
 			(void)pmap_demote_pde(pmap, pde, va);
 		PMAP_UNLOCK(pmap);
 	}
 small_mappings:
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found"
 		    " a 4mpage in page %p's pv list", m));
 		pte = pmap_pte_quick(pmap, pv->pv_va);
 retry:
 		oldpte = *pte;
 		if ((oldpte & PG_RW) != 0) {
 			/*
 			 * Regardless of whether a pte is 32 or 64 bits
 			 * in size, PG_RW and PG_M are among the least
 			 * significant 32 bits.
 			 */
 			if (!atomic_cmpset_int((u_int *)pte, oldpte,
 			    oldpte & ~(PG_RW | PG_M)))
 				goto retry;
 			if ((oldpte & PG_M) != 0)
 				vm_page_dirty(m);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	sched_unpin();
 	rw_wunlock(&pvh_global_lock);
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
  *	It is not necessary for every reference bit to be cleared, but it
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
  *	As an optimization, update the page's dirty field if a modified bit is
  *	found while counting reference bits.  This opportunistic update can be
  *	performed at low cost and can eliminate the need for some future calls
  *	to pmap_is_modified().  However, since this function stops after
  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
  *	dirty pages.  Those dirty pages will only be detected by a future call
  *	to pmap_is_modified().
  */
 int
 pmap_ts_referenced(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv, pvf;
 	pmap_t pmap;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	vm_paddr_t pa;
 	int rtval = 0;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
 	pa = VM_PAGE_TO_PHYS(m);
 	pvh = pa_to_pvh(pa);
 	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	if ((m->flags & PG_FICTITIOUS) != 0 ||
 	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
 		goto small_mappings;
 	pv = pvf;
 	do {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
 		if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 			/*
 			 * Although "*pde" is mapping a 2/4MB page, because
 			 * this function is called at a 4KB page granularity,
 			 * we only update the 4KB page under test.
 			 */
 			vm_page_dirty(m);
 		}
 		if ((*pde & PG_A) != 0) {
 			/*
 			 * Since this reference bit is shared by either 1024
 			 * or 512 4KB pages, it should not be cleared every
 			 * time it is tested.  Apply a simple "hash" function
 			 * on the physical page number, the virtual superpage
 			 * number, and the pmap address to select one 4KB page
 			 * out of the 1024 or 512 on which testing the
 			 * reference bit will result in clearing that bit.
 			 * This function is designed to avoid the selection of
 			 * the same 4KB page for every 2- or 4MB page mapping.
 			 *
 			 * On demotion, a mapping that hasn't been referenced
 			 * is simply destroyed.  To avoid the possibility of a
 			 * subsequent page fault on a demoted wired mapping,
 			 * always leave its reference bit set.  Moreover,
 			 * since the superpage is wired, the current state of
 			 * its reference bit won't affect page replacement.
 			 */
 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
 			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
 			    (*pde & PG_W) == 0) {
 				atomic_clear_int((u_int *)pde, PG_A);
 				pmap_invalidate_page(pmap, pv->pv_va);
 			}
 			rtval++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 		}
 		if (rtval >= PMAP_TS_REFERENCED_MAX)
 			goto out;
 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
 small_mappings:
 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
 		goto out;
 	pv = pvf;
 	do {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
 		KASSERT((*pde & PG_PS) == 0,
 		    ("pmap_ts_referenced: found a 4mpage in page %p's pv list",
 		    m));
 		pte = pmap_pte_quick(pmap, pv->pv_va);
 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(m);
 		if ((*pte & PG_A) != 0) {
 			atomic_clear_int((u_int *)pte, PG_A);
 			pmap_invalidate_page(pmap, pv->pv_va);
 			rtval++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 		}
 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval <
 	    PMAP_TS_REFERENCED_MAX);
 out:
 	sched_unpin();
 	rw_wunlock(&pvh_global_lock);
 	return (rtval);
 }
 
 /*
  *	Apply the given advice to the specified range of addresses within the
  *	given pmap.  Depending on the advice, clear the referenced and/or
  *	modified flags in each mapping and set the mapped page's dirty field.
  */
 void
 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
 {
 	pd_entry_t oldpde, *pde;
 	pt_entry_t *pte;
 	vm_offset_t va, pdnxt;
 	vm_page_t m;
 	boolean_t anychanged, pv_lists_locked;
 
 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
 		return;
 	if (pmap_is_current(pmap))
 		pv_lists_locked = FALSE;
 	else {
 		pv_lists_locked = TRUE;
 resume:
 		rw_wlock(&pvh_global_lock);
 		sched_pin();
 	}
 	anychanged = FALSE;
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = pdnxt) {
 		pdnxt = (sva + NBPDR) & ~PDRMASK;
 		if (pdnxt < sva)
 			pdnxt = eva;
 		pde = pmap_pde(pmap, sva);
 		oldpde = *pde;
 		if ((oldpde & PG_V) == 0)
 			continue;
 		else if ((oldpde & PG_PS) != 0) {
 			if ((oldpde & PG_MANAGED) == 0)
 				continue;
 			if (!pv_lists_locked) {
 				pv_lists_locked = TRUE;
 				if (!rw_try_wlock(&pvh_global_lock)) {
 					if (anychanged)
 						pmap_invalidate_all(pmap);
 					PMAP_UNLOCK(pmap);
 					goto resume;
 				}
 				sched_pin();
 			}
 			if (!pmap_demote_pde(pmap, pde, sva)) {
 				/*
 				 * The large page mapping was destroyed.
 				 */
 				continue;
 			}
 
 			/*
 			 * Unless the page mappings are wired, remove the
 			 * mapping to a single page so that a subsequent
 			 * access may repromote.  Since the underlying page
 			 * table page is fully populated, this removal never
 			 * frees a page table page.
 			 */
 			if ((oldpde & PG_W) == 0) {
 				pte = pmap_pte_quick(pmap, sva);
 				KASSERT((*pte & PG_V) != 0,
 				    ("pmap_advise: invalid PTE"));
 				pmap_remove_pte(pmap, pte, sva, NULL);
 				anychanged = TRUE;
 			}
 		}
 		if (pdnxt > eva)
 			pdnxt = eva;
 		va = pdnxt;
 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
 		    sva += PAGE_SIZE) {
 			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
 				goto maybe_invlrng;
 			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 				if (advice == MADV_DONTNEED) {
 					/*
 					 * Future calls to pmap_is_modified()
 					 * can be avoided by making the page
 					 * dirty now.
 					 */
 					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
 					vm_page_dirty(m);
 				}
 				atomic_clear_int((u_int *)pte, PG_M | PG_A);
 			} else if ((*pte & PG_A) != 0)
 				atomic_clear_int((u_int *)pte, PG_A);
 			else
 				goto maybe_invlrng;
 			if ((*pte & PG_G) != 0) {
 				if (va == pdnxt)
 					va = sva;
 			} else
 				anychanged = TRUE;
 			continue;
 maybe_invlrng:
 			if (va != pdnxt) {
 				pmap_invalidate_range(pmap, va, sva);
 				va = pdnxt;
 			}
 		}
 		if (va != pdnxt)
 			pmap_invalidate_range(pmap, va, sva);
 	}
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	if (pv_lists_locked) {
 		sched_unpin();
 		rw_wunlock(&pvh_global_lock);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Clear the modify bits on the specified physical page.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t next_pv, pv;
 	pmap_t pmap;
 	pd_entry_t oldpde, *pde;
 	pt_entry_t oldpte, *pte;
 	vm_offset_t va;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_modify: page %p is not managed", m));
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	KASSERT(!vm_page_xbusied(m),
 	    ("pmap_clear_modify: page %p is exclusive busied", m));
 
 	/*
 	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
 	 * If the object containing the page is locked and the page is not
 	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
 	 */
 	if ((m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		va = pv->pv_va;
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, va);
 		oldpde = *pde;
 		if ((oldpde & PG_RW) != 0) {
 			if (pmap_demote_pde(pmap, pde, va)) {
 				if ((oldpde & PG_W) == 0) {
 					/*
 					 * Write protect the mapping to a
 					 * single page so that a subsequent
 					 * write access may repromote.
 					 */
 					va += VM_PAGE_TO_PHYS(m) - (oldpde &
 					    PG_PS_FRAME);
 					pte = pmap_pte_quick(pmap, va);
 					oldpte = *pte;
 					if ((oldpte & PG_V) != 0) {
 						/*
 						 * Regardless of whether a pte is 32 or 64 bits
 						 * in size, PG_RW and PG_M are among the least
 						 * significant 32 bits.
 						 */
 						while (!atomic_cmpset_int((u_int *)pte,
 						    oldpte,
 						    oldpte & ~(PG_M | PG_RW)))
 							oldpte = *pte;
 						vm_page_dirty(m);
 						pmap_invalidate_page(pmap, va);
 					}
 				}
 			}
 		}
 		PMAP_UNLOCK(pmap);
 	}
 small_mappings:
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
 		    " a 4mpage in page %p's pv list", m));
 		pte = pmap_pte_quick(pmap, pv->pv_va);
 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 			/*
 			 * Regardless of whether a pte is 32 or 64 bits
 			 * in size, PG_M is among the least significant
 			 * 32 bits. 
 			 */
 			atomic_clear_int((u_int *)pte, PG_M);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	sched_unpin();
 	rw_wunlock(&pvh_global_lock);
 }
 
 /*
  * Miscellaneous support routines follow
  */
 
 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
 static __inline void
 pmap_pte_attr(pt_entry_t *pte, int cache_bits)
 {
 	u_int opte, npte;
 
 	/*
 	 * The cache mode bits are all in the low 32-bits of the
 	 * PTE, so we can just spin on updating the low 32-bits.
 	 */
 	do {
 		opte = *(u_int *)pte;
 		npte = opte & ~PG_PTE_CACHE;
 		npte |= cache_bits;
 	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
 }
 
 /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
 static __inline void
 pmap_pde_attr(pd_entry_t *pde, int cache_bits)
 {
 	u_int opde, npde;
 
 	/*
 	 * The cache mode bits are all in the low 32-bits of the
 	 * PDE, so we can just spin on updating the low 32-bits.
 	 */
 	do {
 		opde = *(u_int *)pde;
 		npde = opde & ~PG_PDE_CACHE;
 		npde |= cache_bits;
 	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
 }
 
 /*
  * Map a set of physical memory pages into the kernel virtual
  * address space. Return a pointer to where it is mapped. This
  * routine is intended to be used for mapping device memory,
  * NOT real memory.
  */
 void *
 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
 {
 	struct pmap_preinit_mapping *ppim;
 	vm_offset_t va, offset;
 	vm_size_t tmpsize;
 	int i;
 
 	offset = pa & PAGE_MASK;
 	size = round_page(offset + size);
 	pa = pa & PG_FRAME;
 
 	if (pa < KERNLOAD && pa + size <= KERNLOAD)
 		va = KERNBASE + pa;
 	else if (!pmap_initialized) {
 		va = 0;
 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 			ppim = pmap_preinit_mapping + i;
 			if (ppim->va == 0) {
 				ppim->pa = pa;
 				ppim->sz = size;
 				ppim->mode = mode;
 				ppim->va = virtual_avail;
 				virtual_avail += size;
 				va = ppim->va;
 				break;
 			}
 		}
 		if (va == 0)
 			panic("%s: too many preinit mappings", __func__);
 	} else {
 		/*
 		 * If we have a preinit mapping, re-use it.
 		 */
 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 			ppim = pmap_preinit_mapping + i;
 			if (ppim->pa == pa && ppim->sz == size &&
 			    ppim->mode == mode)
 				return ((void *)(ppim->va + offset));
 		}
 		va = kva_alloc(size);
 		if (va == 0)
 			panic("%s: Couldn't allocate KVA", __func__);
 	}
 	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
 		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
 	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
 	pmap_invalidate_cache_range(va, va + size, FALSE);
 	return ((void *)(va + offset));
 }
 
 void *
 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
 {
 
 	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
 }
 
 void *
 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
 {
 
 	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
 }
 
 void
 pmap_unmapdev(vm_offset_t va, vm_size_t size)
 {
 	struct pmap_preinit_mapping *ppim;
 	vm_offset_t offset;
 	int i;
 
 	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
 		return;
 	offset = va & PAGE_MASK;
 	size = round_page(offset + size);
 	va = trunc_page(va);
 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 		ppim = pmap_preinit_mapping + i;
 		if (ppim->va == va && ppim->sz == size) {
 			if (pmap_initialized)
 				return;
 			ppim->pa = 0;
 			ppim->va = 0;
 			ppim->sz = 0;
 			ppim->mode = 0;
 			if (va + size == virtual_avail)
 				virtual_avail = va;
 			return;
 		}
 	}
 	if (pmap_initialized)
 		kva_free(va, size);
 }
 
 /*
  * Sets the memory attribute for the specified page.
  */
 void
 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 {
 
 	m->md.pat_mode = ma;
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return;
 
 	/*
 	 * If "m" is a normal page, flush it from the cache.
 	 * See pmap_invalidate_cache_range().
 	 *
 	 * First, try to find an existing mapping of the page by sf
 	 * buffer. sf_buf_invalidate_cache() modifies mapping and
 	 * flushes the cache.
 	 */    
 	if (sf_buf_invalidate_cache(m))
 		return;
 
 	/*
 	 * If page is not mapped by sf buffer, but CPU does not
 	 * support self snoop, map the page transient and do
 	 * invalidation. In the worst case, whole cache is flushed by
 	 * pmap_invalidate_cache_range().
 	 */
 	if ((cpu_feature & CPUID_SS) == 0)
 		pmap_flush_page(m);
 }
 
 static void
 pmap_flush_page(vm_page_t m)
 {
 	pt_entry_t *cmap_pte2;
 	struct pcpu *pc;
 	vm_offset_t sva, eva;
 	bool useclflushopt;
 
 	useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
 	if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) {
 		sched_pin();
 		pc = pcpu_find(curcpu);
 		cmap_pte2 = pc->pc_cmap_pte2; 
 		mtx_lock(&pc->pc_cmap_lock);
 		if (*cmap_pte2)
 			panic("pmap_flush_page: CMAP2 busy");
 		*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
 		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
 		invlcaddr(pc->pc_cmap_addr2);
 		sva = (vm_offset_t)pc->pc_cmap_addr2;
 		eva = sva + PAGE_SIZE;
 
 		/*
 		 * Use mfence despite the ordering implied by
 		 * mtx_{un,}lock() because clflush on non-Intel CPUs
 		 * and clflushopt are not guaranteed to be ordered by
 		 * any other instruction.
 		 */
 		if (useclflushopt || cpu_vendor_id != CPU_VENDOR_INTEL)
 			mfence();
 		for (; sva < eva; sva += cpu_clflush_line_size) {
 			if (useclflushopt)
 				clflushopt(sva);
 			else
 				clflush(sva);
 		}
 		if (useclflushopt || cpu_vendor_id != CPU_VENDOR_INTEL)
 			mfence();
 		*cmap_pte2 = 0;
-		mtx_unlock(&pc->pc_cmap_lock);
 		sched_unpin();
+		mtx_unlock(&pc->pc_cmap_lock);
 	} else
 		pmap_invalidate_cache();
 }
 
 /*
  * Changes the specified virtual address range's memory type to that given by
  * the parameter "mode".  The specified virtual address range must be
  * completely contained within either the kernel map.
  *
  * Returns zero if the change completed successfully, and either EINVAL or
  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
  * of the virtual address range was not mapped, and ENOMEM is returned if
  * there was insufficient memory available to complete the change.
  */
 int
 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
 {
 	vm_offset_t base, offset, tmpva;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	int cache_bits_pte, cache_bits_pde;
 	boolean_t changed;
 
 	base = trunc_page(va);
 	offset = va & PAGE_MASK;
 	size = round_page(offset + size);
 
 	/*
 	 * Only supported on kernel virtual addresses above the recursive map.
 	 */
 	if (base < VM_MIN_KERNEL_ADDRESS)
 		return (EINVAL);
 
 	cache_bits_pde = pmap_cache_bits(mode, 1);
 	cache_bits_pte = pmap_cache_bits(mode, 0);
 	changed = FALSE;
 
 	/*
 	 * Pages that aren't mapped aren't supported.  Also break down
 	 * 2/4MB pages into 4KB pages if required.
 	 */
 	PMAP_LOCK(kernel_pmap);
 	for (tmpva = base; tmpva < base + size; ) {
 		pde = pmap_pde(kernel_pmap, tmpva);
 		if (*pde == 0) {
 			PMAP_UNLOCK(kernel_pmap);
 			return (EINVAL);
 		}
 		if (*pde & PG_PS) {
 			/*
 			 * If the current 2/4MB page already has
 			 * the required memory type, then we need not
 			 * demote this page.  Just increment tmpva to
 			 * the next 2/4MB page frame.
 			 */
 			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
 				tmpva = trunc_4mpage(tmpva) + NBPDR;
 				continue;
 			}
 
 			/*
 			 * If the current offset aligns with a 2/4MB
 			 * page frame and there is at least 2/4MB left
 			 * within the range, then we need not break
 			 * down this page into 4KB pages.
 			 */
 			if ((tmpva & PDRMASK) == 0 &&
 			    tmpva + PDRMASK < base + size) {
 				tmpva += NBPDR;
 				continue;
 			}
 			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
 				PMAP_UNLOCK(kernel_pmap);
 				return (ENOMEM);
 			}
 		}
 		pte = vtopte(tmpva);
 		if (*pte == 0) {
 			PMAP_UNLOCK(kernel_pmap);
 			return (EINVAL);
 		}
 		tmpva += PAGE_SIZE;
 	}
 	PMAP_UNLOCK(kernel_pmap);
 
 	/*
 	 * Ok, all the pages exist, so run through them updating their
 	 * cache mode if required.
 	 */
 	for (tmpva = base; tmpva < base + size; ) {
 		pde = pmap_pde(kernel_pmap, tmpva);
 		if (*pde & PG_PS) {
 			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
 				pmap_pde_attr(pde, cache_bits_pde);
 				changed = TRUE;
 			}
 			tmpva = trunc_4mpage(tmpva) + NBPDR;
 		} else {
 			pte = vtopte(tmpva);
 			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
 				pmap_pte_attr(pte, cache_bits_pte);
 				changed = TRUE;
 			}
 			tmpva += PAGE_SIZE;
 		}
 	}
 
 	/*
 	 * Flush CPU caches to make sure any data isn't cached that
 	 * shouldn't be, etc.
 	 */
 	if (changed) {
 		pmap_invalidate_range(kernel_pmap, base, tmpva);
 		pmap_invalidate_cache_range(base, tmpva, FALSE);
 	}
 	return (0);
 }
 
 /*
  * perform the pmap work for mincore
  */
 int
 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
 {
 	pd_entry_t *pdep;
 	pt_entry_t *ptep, pte;
 	vm_paddr_t pa;
 	int val;
 
 	PMAP_LOCK(pmap);
 retry:
 	pdep = pmap_pde(pmap, addr);
 	if (*pdep != 0) {
 		if (*pdep & PG_PS) {
 			pte = *pdep;
 			/* Compute the physical address of the 4KB page. */
 			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
 			    PG_FRAME;
 			val = MINCORE_SUPER;
 		} else {
 			ptep = pmap_pte(pmap, addr);
 			pte = *ptep;
 			pmap_pte_release(ptep);
 			pa = pte & PG_FRAME;
 			val = 0;
 		}
 	} else {
 		pte = 0;
 		pa = 0;
 		val = 0;
 	}
 	if ((pte & PG_V) != 0) {
 		val |= MINCORE_INCORE;
 		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
 		if ((pte & PG_A) != 0)
 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 	}
 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
 	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
 		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
 		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
 			goto retry;
 	} else
 		PA_UNLOCK_COND(*locked_pa);
 	PMAP_UNLOCK(pmap);
 	return (val);
 }
 
 void
 pmap_activate(struct thread *td)
 {
 	pmap_t	pmap, oldpmap;
 	u_int	cpuid;
 	u_int32_t  cr3;
 
 	critical_enter();
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 	oldpmap = PCPU_GET(curpmap);
 	cpuid = PCPU_GET(cpuid);
 #if defined(SMP)
 	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
 #else
 	CPU_CLR(cpuid, &oldpmap->pm_active);
 	CPU_SET(cpuid, &pmap->pm_active);
 #endif
 #if defined(PAE) || defined(PAE_TABLES)
 	cr3 = vtophys(pmap->pm_pdpt);
 #else
 	cr3 = vtophys(pmap->pm_pdir);
 #endif
 	/*
 	 * pmap_activate is for the current thread on the current cpu
 	 */
 	td->td_pcb->pcb_cr3 = cr3;
 	load_cr3(cr3);
 	PCPU_SET(curpmap, pmap);
 	critical_exit();
 }
 
 void
 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
 {
 }
 
 /*
  *	Increase the starting virtual address of the given mapping if a
  *	different alignment might result in more superpage mappings.
  */
 void
 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t size)
 {
 	vm_offset_t superpage_offset;
 
 	if (size < NBPDR)
 		return;
 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 		offset += ptoa(object->pg_color);
 	superpage_offset = offset & PDRMASK;
 	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
 	    (*addr & PDRMASK) == superpage_offset)
 		return;
 	if ((*addr & PDRMASK) < superpage_offset)
 		*addr = (*addr & ~PDRMASK) + superpage_offset;
 	else
 		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
 }
 
 vm_offset_t
 pmap_quick_enter_page(vm_page_t m)
 {
 	vm_offset_t qaddr;
 	pt_entry_t *pte;
 
 	critical_enter();
 	qaddr = PCPU_GET(qmap_addr);
 	pte = vtopte(qaddr);
 
 	KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy"));
 	*pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
 	    pmap_cache_bits(pmap_page_get_memattr(m), 0);
 	invlpg(qaddr);
 
 	return (qaddr);
 }
 
 void
 pmap_quick_remove_page(vm_offset_t addr)
 {
 	vm_offset_t qaddr;
 	pt_entry_t *pte;
 
 	qaddr = PCPU_GET(qmap_addr);
 	pte = vtopte(qaddr);
 
 	KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use"));
 	KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address"));
 
 	*pte = 0;
 	critical_exit();
 }
 
 #if defined(PMAP_DEBUG)
 pmap_pid_dump(int pid)
 {
 	pmap_t pmap;
 	struct proc *p;
 	int npte = 0;
 	int index;
 
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_pid != pid)
 			continue;
 
 		if (p->p_vmspace) {
 			int i,j;
 			index = 0;
 			pmap = vmspace_pmap(p->p_vmspace);
 			for (i = 0; i < NPDEPTD; i++) {
 				pd_entry_t *pde;
 				pt_entry_t *pte;
 				vm_offset_t base = i << PDRSHIFT;
 				
 				pde = &pmap->pm_pdir[i];
 				if (pde && pmap_pde_v(pde)) {
 					for (j = 0; j < NPTEPG; j++) {
 						vm_offset_t va = base + (j << PAGE_SHIFT);
 						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
 							if (index) {
 								index = 0;
 								printf("\n");
 							}
 							sx_sunlock(&allproc_lock);
 							return (npte);
 						}
 						pte = pmap_pte(pmap, va);
 						if (pte && pmap_pte_v(pte)) {
 							pt_entry_t pa;
 							vm_page_t m;
 							pa = *pte;
 							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
 							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
 								va, pa, m->hold_count, m->wire_count, m->flags);
 							npte++;
 							index++;
 							if (index >= 2) {
 								index = 0;
 								printf("\n");
 							} else {
 								printf(" ");
 							}
 						}
 					}
 				}
 			}
 		}
 	}
 	sx_sunlock(&allproc_lock);
 	return (npte);
 }
 #endif
Index: projects/netbsd-tests-upstream-01-2017/sys/kern/kern_acct.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/sys/kern/kern_acct.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/sys/kern/kern_acct.c	(revision 312218)
@@ -1,652 +1,652 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * Copyright (c) 2005 Robert N. M. Watson
  * All rights reserved.
  *
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * Copyright (c) 1994 Christopher G. Demetriou
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_acct.c	8.1 (Berkeley) 6/14/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/acct.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/tty.h>
 #include <sys/vnode.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * The routines implemented in this file are described in:
  *      Leffler, et al.: The Design and Implementation of the 4.3BSD
  *	    UNIX Operating System (Addison Welley, 1989)
  * on pages 62-63.
  * On May 2007 the historic 3 bits base 8 exponent, 13 bit fraction
  * compt_t representation described in the above reference was replaced
  * with that of IEEE-754 floats.
  *
  * Arguably, to simplify accounting operations, this mechanism should
  * be replaced by one in which an accounting log file (similar to /dev/klog)
  * is read by a user process, etc.  However, that has its own problems.
  */
 
 /* Floating point definitions from <float.h>. */
 #define FLT_MANT_DIG    24              /* p */
 #define FLT_MAX_EXP     128             /* emax */
 
 /*
  * Internal accounting functions.
  * The former's operation is described in Leffler, et al., and the latter
  * was provided by UCB with the 4.4BSD-Lite release
  */
 static uint32_t	encode_timeval(struct timeval);
 static uint32_t	encode_long(long);
 static void	acctwatch(void);
 static void	acct_thread(void *);
 static int	acct_disable(struct thread *, int);
 
 /*
  * Accounting vnode pointer, saved vnode pointer, and flags for each.
  * acct_sx protects against changes to the active vnode and credentials
  * while accounting records are being committed to disk.
  */
 static int		 acct_configured;
 static int		 acct_suspended;
 static struct vnode	*acct_vp;
 static struct ucred	*acct_cred;
 static struct plimit	*acct_limit;
 static int		 acct_flags;
 static struct sx	 acct_sx;
 
 SX_SYSINIT(acct, &acct_sx, "acct_sx");
 
 /*
  * State of the accounting kthread.
  */
 static int		 acct_state;
 
 #define	ACCT_RUNNING	1	/* Accounting kthread is running. */
 #define	ACCT_EXITREQ	2	/* Accounting kthread should exit. */
 
 /*
  * Values associated with enabling and disabling accounting
  */
 static int acctsuspend = 2;	/* stop accounting when < 2% free space left */
 SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW,
 	&acctsuspend, 0, "percentage of free disk space below which accounting stops");
 
 static int acctresume = 4;	/* resume when free space risen to > 4% */
 SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW,
 	&acctresume, 0, "percentage of free disk space above which accounting resumes");
 
 static int acctchkfreq = 15;	/* frequency (in seconds) to check space */
 
 static int
 sysctl_acct_chkfreq(SYSCTL_HANDLER_ARGS)
 {
 	int error, value;
 
 	/* Write out the old value. */
 	error = SYSCTL_OUT(req, &acctchkfreq, sizeof(int));
 	if (error || req->newptr == NULL)
 		return (error);
 
 	/* Read in and verify the new value. */
 	error = SYSCTL_IN(req, &value, sizeof(int));
 	if (error)
 		return (error);
 	if (value <= 0)
 		return (EINVAL);
 	acctchkfreq = value;
 	return (0);
 }
 SYSCTL_PROC(_kern, OID_AUTO, acct_chkfreq, CTLTYPE_INT|CTLFLAG_RW,
     &acctchkfreq, 0, sysctl_acct_chkfreq, "I",
     "frequency for checking the free space");
 
 SYSCTL_INT(_kern, OID_AUTO, acct_configured, CTLFLAG_RD, &acct_configured, 0,
 	"Accounting configured or not");
 
 SYSCTL_INT(_kern, OID_AUTO, acct_suspended, CTLFLAG_RD, &acct_suspended, 0,
 	"Accounting suspended or not");
 
 /*
  * Accounting system call.  Written based on the specification and previous
  * implementation done by Mark Tinguely.
  */
 int
 sys_acct(struct thread *td, struct acct_args *uap)
 {
 	struct nameidata nd;
 	int error, flags, i, replacing;
 
 	error = priv_check(td, PRIV_ACCT);
 	if (error)
 		return (error);
 
 	/*
 	 * If accounting is to be started to a file, open that file for
 	 * appending and make sure it's a 'normal'.
 	 */
 	if (uap->path != NULL) {
 		NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1,
 		    UIO_USERSPACE, uap->path, td);
 		flags = FWRITE | O_APPEND;
 		error = vn_open(&nd, &flags, 0, NULL);
 		if (error)
 			return (error);
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 #ifdef MAC
 		error = mac_system_check_acct(td->td_ucred, nd.ni_vp);
 		if (error) {
 			VOP_UNLOCK(nd.ni_vp, 0);
 			vn_close(nd.ni_vp, flags, td->td_ucred, td);
 			return (error);
 		}
 #endif
 		VOP_UNLOCK(nd.ni_vp, 0);
 		if (nd.ni_vp->v_type != VREG) {
 			vn_close(nd.ni_vp, flags, td->td_ucred, td);
 			return (EACCES);
 		}
 #ifdef MAC
 	} else {
 		error = mac_system_check_acct(td->td_ucred, NULL);
 		if (error)
 			return (error);
 #endif
 	}
 
 	/*
 	 * Disallow concurrent access to the accounting vnode while we swap
 	 * it out, in order to prevent access after close.
 	 */
 	sx_xlock(&acct_sx);
 
 	/*
 	 * Don't log spurious disable/enable messages if we are
 	 * switching from one accounting file to another due to log
 	 * rotation.
 	 */
 	replacing = (acct_vp != NULL && uap->path != NULL);
 
 	/*
 	 * If accounting was previously enabled, kill the old space-watcher,
 	 * close the file, and (if no new file was specified, leave).  Reset
 	 * the suspended state regardless of whether accounting remains
 	 * enabled.
 	 */
 	acct_suspended = 0;
 	if (acct_vp != NULL)
 		error = acct_disable(td, !replacing);
 	if (uap->path == NULL) {
 		if (acct_state & ACCT_RUNNING) {
 			acct_state |= ACCT_EXITREQ;
 			wakeup(&acct_state);
 		}
 		sx_xunlock(&acct_sx);
 		return (error);
 	}
 
 	/*
 	 * Create our own plimit object without limits. It will be assigned
 	 * to exiting processes.
 	 */
 	acct_limit = lim_alloc();
 	for (i = 0; i < RLIM_NLIMITS; i++)
 		acct_limit->pl_rlimit[i].rlim_cur =
 		    acct_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY;
 
 	/*
 	 * Save the new accounting file vnode, and schedule the new
 	 * free space watcher.
 	 */
 	acct_vp = nd.ni_vp;
 	acct_cred = crhold(td->td_ucred);
 	acct_flags = flags;
 	if (acct_state & ACCT_RUNNING)
 		acct_state &= ~ACCT_EXITREQ;
 	else {
 		/*
 		 * Try to start up an accounting kthread.  We may start more
 		 * than one, but if so the extras will commit suicide as
 		 * soon as they start up.
 		 */
 		error = kproc_create(acct_thread, NULL, NULL, 0, 0,
 		    "accounting");
 		if (error) {
 			(void) acct_disable(td, 0);
 			sx_xunlock(&acct_sx);
 			log(LOG_NOTICE, "Unable to start accounting thread\n");
 			return (error);
 		}
 	}
 	acct_configured = 1;
 	sx_xunlock(&acct_sx);
 	if (!replacing)
 		log(LOG_NOTICE, "Accounting enabled\n");
 	return (error);
 }
 
 /*
  * Disable currently in-progress accounting by closing the vnode, dropping
  * our reference to the credential, and clearing the vnode's flags.
  */
 static int
 acct_disable(struct thread *td, int logging)
 {
 	int error;
 
 	sx_assert(&acct_sx, SX_XLOCKED);
 	error = vn_close(acct_vp, acct_flags, acct_cred, td);
 	crfree(acct_cred);
 	lim_free(acct_limit);
 	acct_configured = 0;
 	acct_vp = NULL;
 	acct_cred = NULL;
 	acct_flags = 0;
 	if (logging)
 		log(LOG_NOTICE, "Accounting disabled\n");
 	return (error);
 }
 
 /*
  * Write out process accounting information, on process exit.
  * Data to be written out is specified in Leffler, et al.
  * and are enumerated below.  (They're also noted in the system
  * "acct.h" header file.)
  */
 int
 acct_process(struct thread *td)
 {
 	struct acctv2 acct;
 	struct timeval ut, st, tmp;
 	struct plimit *oldlim;
 	struct proc *p;
 	struct rusage ru;
 	int t, ret;
 
 	/*
 	 * Lockless check of accounting condition before doing the hard
 	 * work.
 	 */
 	if (acct_vp == NULL || acct_suspended)
 		return (0);
 
 	sx_slock(&acct_sx);
 
 	/*
 	 * If accounting isn't enabled, don't bother.  Have to check again
 	 * once we own the lock in case we raced with disabling of accounting
 	 * by another thread.
 	 */
 	if (acct_vp == NULL || acct_suspended) {
 		sx_sunlock(&acct_sx);
 		return (0);
 	}
 
 	p = td->td_proc;
 
 	/*
 	 * Get process accounting information.
 	 */
 
 	sx_slock(&proctree_lock);
 	PROC_LOCK(p);
 
 	/* (1) The terminal from which the process was started */
 	if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp)
 		acct.ac_tty = tty_udev(p->p_pgrp->pg_session->s_ttyp);
 	else
 		acct.ac_tty = NODEV;
 	sx_sunlock(&proctree_lock);
 
 	/* (2) The name of the command that ran */
 	bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm);
 
 	/* (3) The amount of user and system time that was used */
 	rufetchcalc(p, &ru, &ut, &st);
 	acct.ac_utime = encode_timeval(ut);
 	acct.ac_stime = encode_timeval(st);
 
 	/* (4) The elapsed time the command ran (and its starting time) */
 	getboottime(&tmp);
 	timevaladd(&tmp, &p->p_stats->p_start);
 	acct.ac_btime = tmp.tv_sec;
 	microuptime(&tmp);
 	timevalsub(&tmp, &p->p_stats->p_start);
 	acct.ac_etime = encode_timeval(tmp);
 
 	/* (5) The average amount of memory used */
 	tmp = ut;
 	timevaladd(&tmp, &st);
 	/* Convert tmp (i.e. u + s) into hz units to match ru_i*. */
 	t = tmp.tv_sec * hz + tmp.tv_usec / tick;
 	if (t)
 		acct.ac_mem = encode_long((ru.ru_ixrss + ru.ru_idrss +
 		    + ru.ru_isrss) / t);
 	else
 		acct.ac_mem = 0;
 
 	/* (6) The number of disk I/O operations done */
 	acct.ac_io = encode_long(ru.ru_inblock + ru.ru_oublock);
 
 	/* (7) The UID and GID of the process */
 	acct.ac_uid = p->p_ucred->cr_ruid;
 	acct.ac_gid = p->p_ucred->cr_rgid;
 
 	/* (8) The boolean flags that tell how the process terminated, etc. */
 	acct.ac_flagx = p->p_acflag;
 
 	/* Setup ancillary structure fields. */
 	acct.ac_flagx |= ANVER;
 	acct.ac_zero = 0;
 	acct.ac_version = 2;
 	acct.ac_len = acct.ac_len2 = sizeof(acct);
 
 	/*
 	 * Eliminate rlimits (file size limit in particular).
 	 */
 	oldlim = p->p_limit;
 	p->p_limit = lim_hold(acct_limit);
 	PROC_UNLOCK(p);
 	lim_free(oldlim);
 
 	/*
 	 * Write the accounting information to the file.
 	 */
 	ret = vn_rdwr(UIO_WRITE, acct_vp, (caddr_t)&acct, sizeof (acct),
 	    (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, acct_cred, NOCRED,
 	    NULL, td);
 	sx_sunlock(&acct_sx);
 	return (ret);
 }
 
 /* FLOAT_CONVERSION_START (Regression testing; don't remove this line.) */
 
 /* Convert timevals and longs into IEEE-754 bit patterns. */
 
 /* Mantissa mask (MSB is implied, so subtract 1). */
 #define MANT_MASK ((1 << (FLT_MANT_DIG - 1)) - 1)
 
 /*
  * We calculate integer values to a precision of approximately
  * 28 bits.
  * This is high-enough precision to fill the 24 float bits
  * and low-enough to avoid overflowing the 32 int bits.
  */
 #define CALC_BITS 28
 
 /* log_2(1000000). */
 #define LOG2_1M 20
 
 /*
  * Convert the elements of a timeval into a 32-bit word holding
  * the bits of a IEEE-754 float.
  * The float value represents the timeval's value in microsecond units.
  */
 static uint32_t
 encode_timeval(struct timeval tv)
 {
 	int log2_s;
-	int val, exponent;	/* Unnormalized value and exponent */
-	int norm_exponent;	/* Normalized exponent */
+	int val, exp;	/* Unnormalized value and exponent */
+	int norm_exp;	/* Normalized exponent */
 	int shift;
 
 	/*
 	 * First calculate value and exponent to about CALC_BITS precision.
 	 * Note that the following conditionals have been ordered so that
 	 * the most common cases appear first.
 	 */
 	if (tv.tv_sec == 0) {
 		if (tv.tv_usec == 0)
 			return (0);
-		exponent = 0;
+		exp = 0;
 		val = tv.tv_usec;
 	} else {
 		/*
 		 * Calculate the value to a precision of approximately
 		 * CALC_BITS.
 		 */
 		log2_s = fls(tv.tv_sec) - 1;
 		if (log2_s + LOG2_1M < CALC_BITS) {
-			exponent = 0;
+			exp = 0;
 			val = 1000000 * tv.tv_sec + tv.tv_usec;
 		} else {
-			exponent = log2_s + LOG2_1M - CALC_BITS;
+			exp = log2_s + LOG2_1M - CALC_BITS;
 			val = (unsigned int)(((uint64_t)1000000 * tv.tv_sec +
-			    tv.tv_usec) >> exponent);
+			    tv.tv_usec) >> exp);
 		}
 	}
 	/* Now normalize and pack the value into an IEEE-754 float. */
-	norm_exponent = fls(val) - 1;
-	shift = FLT_MANT_DIG - norm_exponent - 1;
+	norm_exp = fls(val) - 1;
+	shift = FLT_MANT_DIG - norm_exp - 1;
 #ifdef ACCT_DEBUG
 	printf("val=%d exp=%d shift=%d log2(val)=%d\n",
-	    val, exponent, shift, norm_exponent);
-	printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exponent + norm_exponent,
+	    val, exp, shift, norm_exp);
+	printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp,
 	    ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK));
 #endif
-	return (((FLT_MAX_EXP - 1 + exponent + norm_exponent) << (FLT_MANT_DIG - 1)) |
+	return (((FLT_MAX_EXP - 1 + exp + norm_exp) << (FLT_MANT_DIG - 1)) |
 	    ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK));
 }
 
 /*
  * Convert a non-negative long value into the bit pattern of
  * an IEEE-754 float value.
  */
 static uint32_t
 encode_long(long val)
 {
-	int norm_exponent;	/* Normalized exponent */
+	int norm_exp;	/* Normalized exponent */
 	int shift;
 
 	if (val == 0)
 		return (0);
 	if (val < 0) {
 		log(LOG_NOTICE,
 		    "encode_long: negative value %ld in accounting record\n",
 		    val);
 		val = LONG_MAX;
 	}
-	norm_exponent = fls(val) - 1;
-	shift = FLT_MANT_DIG - norm_exponent - 1;
+	norm_exp = fls(val) - 1;
+	shift = FLT_MANT_DIG - norm_exp - 1;
 #ifdef ACCT_DEBUG
 	printf("val=%d shift=%d log2(val)=%d\n",
-	    val, shift, norm_exponent);
-	printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exponent,
+	    val, shift, norm_exp);
+	printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp,
 	    ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK));
 #endif
-	return (((FLT_MAX_EXP - 1 + norm_exponent) << (FLT_MANT_DIG - 1)) |
+	return (((FLT_MAX_EXP - 1 + norm_exp) << (FLT_MANT_DIG - 1)) |
 	    ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK));
 }
 
 /* FLOAT_CONVERSION_END (Regression testing; don't remove this line.) */
 
 /*
  * Periodically check the filesystem to see if accounting
  * should be turned on or off.  Beware the case where the vnode
  * has been vgone()'d out from underneath us, e.g. when the file
  * system containing the accounting file has been forcibly unmounted.
  */
 /* ARGSUSED */
 static void
 acctwatch(void)
 {
 	struct statfs *sp;
 
 	sx_assert(&acct_sx, SX_XLOCKED);
 
 	/*
 	 * If accounting was disabled before our kthread was scheduled,
 	 * then acct_vp might be NULL.  If so, just ask our kthread to
 	 * exit and return.
 	 */
 	if (acct_vp == NULL) {
 		acct_state |= ACCT_EXITREQ;
 		return;
 	}
 
 	/*
 	 * If our vnode is no longer valid, tear it down and signal the
 	 * accounting thread to die.
 	 */
 	if (acct_vp->v_type == VBAD) {
 		(void) acct_disable(NULL, 1);
 		acct_state |= ACCT_EXITREQ;
 		return;
 	}
 
 	/*
 	 * Stopping here is better than continuing, maybe it will be VBAD
 	 * next time around.
 	 */
 	sp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	if (VFS_STATFS(acct_vp->v_mount, sp) < 0) {
 		free(sp, M_STATFS);
 		return;
 	}
 	if (acct_suspended) {
 		if (sp->f_bavail > (int64_t)(acctresume * sp->f_blocks /
 		    100)) {
 			acct_suspended = 0;
 			log(LOG_NOTICE, "Accounting resumed\n");
 		}
 	} else {
 		if (sp->f_bavail <= (int64_t)(acctsuspend * sp->f_blocks /
 		    100)) {
 			acct_suspended = 1;
 			log(LOG_NOTICE, "Accounting suspended\n");
 		}
 	}
 	free(sp, M_STATFS);
 }
 
 /*
  * The main loop for the dedicated kernel thread that periodically calls
  * acctwatch().
  */
 static void
 acct_thread(void *dummy)
 {
 	u_char pri;
 
 	/* This is a low-priority kernel thread. */
 	pri = PRI_MAX_KERN;
 	thread_lock(curthread);
 	sched_prio(curthread, pri);
 	thread_unlock(curthread);
 
 	/* If another accounting kthread is already running, just die. */
 	sx_xlock(&acct_sx);
 	if (acct_state & ACCT_RUNNING) {
 		sx_xunlock(&acct_sx);
 		kproc_exit(0);
 	}
 	acct_state |= ACCT_RUNNING;
 
 	/* Loop until we are asked to exit. */
 	while (!(acct_state & ACCT_EXITREQ)) {
 
 		/* Perform our periodic checks. */
 		acctwatch();
 
 		/*
 		 * We check this flag again before sleeping since the
 		 * acctwatch() might have shut down accounting and asked us
 		 * to exit.
 		 */
 		if (!(acct_state & ACCT_EXITREQ)) {
 			sx_sleep(&acct_state, &acct_sx, 0, "-",
 			    acctchkfreq * hz);
 		}
 	}
 
 	/*
 	 * Acknowledge the exit request and shutdown.  We clear both the
 	 * exit request and running flags.
 	 */
 	acct_state = 0;
 	sx_xunlock(&acct_sx);
 	kproc_exit(0);
 }
Index: projects/netbsd-tests-upstream-01-2017/sys/kern/kern_shutdown.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/sys/kern/kern_shutdown.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/sys/kern/kern_shutdown.c	(revision 312218)
@@ -1,1258 +1,1258 @@
 /*-
  * Copyright (c) 1986, 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_shutdown.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_ekcd.h"
 #include "opt_kdb.h"
 #include "opt_panic.h"
 #include "opt_sched.h"
 #include "opt_watchdog.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/cons.h>
 #include <sys/eventhandler.h>
 #include <sys/filedesc.h>
 #include <sys/jail.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/kerneldump.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/reboot.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
 #include <sys/watchdog.h>
 
 #include <crypto/rijndael/rijndael-api-fst.h>
 #include <crypto/sha2/sha256.h>
 
 #include <ddb/ddb.h>
 
 #include <machine/cpu.h>
 #include <machine/dump.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
 #include <sys/signalvar.h>
 
 static MALLOC_DEFINE(M_DUMPER, "dumper", "dumper block buffer");
 
 #ifndef PANIC_REBOOT_WAIT_TIME
 #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */
 #endif
 static int panic_reboot_wait_time = PANIC_REBOOT_WAIT_TIME;
 SYSCTL_INT(_kern, OID_AUTO, panic_reboot_wait_time, CTLFLAG_RWTUN,
     &panic_reboot_wait_time, 0,
     "Seconds to wait before rebooting after a panic");
 
 /*
  * Note that stdarg.h and the ANSI style va_start macro is used for both
  * ANSI and traditional C compilers.
  */
 #include <machine/stdarg.h>
 
 #ifdef KDB
 #ifdef KDB_UNATTENDED
 int debugger_on_panic = 0;
 #else
 int debugger_on_panic = 1;
 #endif
 SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic,
     CTLFLAG_RWTUN | CTLFLAG_SECURE,
     &debugger_on_panic, 0, "Run debugger on kernel panic");
 
 #ifdef KDB_TRACE
 static int trace_on_panic = 1;
 #else
 static int trace_on_panic = 0;
 #endif
 SYSCTL_INT(_debug, OID_AUTO, trace_on_panic,
     CTLFLAG_RWTUN | CTLFLAG_SECURE,
     &trace_on_panic, 0, "Print stack trace on kernel panic");
 #endif /* KDB */
 
 static int sync_on_panic = 0;
 SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RWTUN,
 	&sync_on_panic, 0, "Do a sync before rebooting from a panic");
 
 static SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0,
     "Shutdown environment");
 
 #ifndef DIAGNOSTIC
 static int show_busybufs;
 #else
 static int show_busybufs = 1;
 #endif
 SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW,
 	&show_busybufs, 0, "");
 
 int suspend_blocked = 0;
 SYSCTL_INT(_kern, OID_AUTO, suspend_blocked, CTLFLAG_RW,
 	&suspend_blocked, 0, "Block suspend due to a pending shutdown");
 
 #ifdef EKCD
 FEATURE(ekcd, "Encrypted kernel crash dumps support");
 
 MALLOC_DEFINE(M_EKCD, "ekcd", "Encrypted kernel crash dumps data");
 
 struct kerneldumpcrypto {
 	uint8_t			kdc_encryption;
 	uint8_t			kdc_iv[KERNELDUMP_IV_MAX_SIZE];
 	keyInstance		kdc_ki;
 	cipherInstance		kdc_ci;
 	off_t			kdc_nextoffset;
 	uint32_t		kdc_dumpkeysize;
 	struct kerneldumpkey	kdc_dumpkey[];
 };
 #endif
 
 /*
  * Variable panicstr contains argument to first call to panic; used as flag
  * to indicate that the kernel has already called panic.
  */
 const char *panicstr;
 
 int dumping;				/* system is dumping */
 int rebooting;				/* system is rebooting */
 static struct dumperinfo dumper;	/* our selected dumper */
 
 /* Context information for dump-debuggers. */
 static struct pcb dumppcb;		/* Registers. */
 lwpid_t dumptid;			/* Thread ID. */
 
 static struct cdevsw reroot_cdevsw = {
      .d_version = D_VERSION,
      .d_name    = "reroot",
 };
 
 static void poweroff_wait(void *, int);
 static void shutdown_halt(void *junk, int howto);
 static void shutdown_panic(void *junk, int howto);
 static void shutdown_reset(void *junk, int howto);
 static int kern_reroot(void);
 
 /* register various local shutdown events */
 static void
 shutdown_conf(void *unused)
 {
 
 	EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL,
 	    SHUTDOWN_PRI_FIRST);
 	EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL,
 	    SHUTDOWN_PRI_LAST + 100);
 	EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL,
 	    SHUTDOWN_PRI_LAST + 100);
 	EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL,
 	    SHUTDOWN_PRI_LAST + 200);
 }
 
 SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL);
 
 /*
  * The only reason this exists is to create the /dev/reroot/ directory,
  * used by reroot code in init(8) as a mountpoint for tmpfs.
  */
 static void
 reroot_conf(void *unused)
 {
 	int error;
 	struct cdev *cdev;
 
 	error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &cdev,
 	    &reroot_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "reroot/reroot");
 	if (error != 0) {
 		printf("%s: failed to create device node, error %d",
 		    __func__, error);
 	}
 }
 
 SYSINIT(reroot_conf, SI_SUB_DEVFS, SI_ORDER_ANY, reroot_conf, NULL);
 
 /*
  * The system call that results in a reboot.
  */
 /* ARGSUSED */
 int
 sys_reboot(struct thread *td, struct reboot_args *uap)
 {
 	int error;
 
 	error = 0;
 #ifdef MAC
 	error = mac_system_check_reboot(td->td_ucred, uap->opt);
 #endif
 	if (error == 0)
 		error = priv_check(td, PRIV_REBOOT);
 	if (error == 0) {
 		if (uap->opt & RB_REROOT) {
 			error = kern_reroot();
 		} else {
 			mtx_lock(&Giant);
 			kern_reboot(uap->opt);
 			mtx_unlock(&Giant);
 		}
 	}
 	return (error);
 }
 
 /*
  * Called by events that want to shut down.. e.g  <CTL><ALT><DEL> on a PC
  */
 void
 shutdown_nice(int howto)
 {
 
 	if (initproc != NULL) {
 		/* Send a signal to init(8) and have it shutdown the world. */
 		PROC_LOCK(initproc);
 		if (howto & RB_POWEROFF)
 			kern_psignal(initproc, SIGUSR2);
 		else if (howto & RB_HALT)
 			kern_psignal(initproc, SIGUSR1);
 		else
 			kern_psignal(initproc, SIGINT);
 		PROC_UNLOCK(initproc);
 	} else {
 		/* No init(8) running, so simply reboot. */
 		kern_reboot(howto | RB_NOSYNC);
 	}
 }
 
 static void
 print_uptime(void)
 {
 	int f;
 	struct timespec ts;
 
 	getnanouptime(&ts);
 	printf("Uptime: ");
 	f = 0;
 	if (ts.tv_sec >= 86400) {
 		printf("%ldd", (long)ts.tv_sec / 86400);
 		ts.tv_sec %= 86400;
 		f = 1;
 	}
 	if (f || ts.tv_sec >= 3600) {
 		printf("%ldh", (long)ts.tv_sec / 3600);
 		ts.tv_sec %= 3600;
 		f = 1;
 	}
 	if (f || ts.tv_sec >= 60) {
 		printf("%ldm", (long)ts.tv_sec / 60);
 		ts.tv_sec %= 60;
 		f = 1;
 	}
 	printf("%lds\n", (long)ts.tv_sec);
 }
 
 int
 doadump(boolean_t textdump)
 {
 	boolean_t coredump;
 	int error;
 
 	error = 0;
 	if (dumping)
 		return (EBUSY);
 	if (dumper.dumper == NULL)
 		return (ENXIO);
 
 	savectx(&dumppcb);
 	dumptid = curthread->td_tid;
 	dumping++;
 
 	coredump = TRUE;
 #ifdef DDB
 	if (textdump && textdump_pending) {
 		coredump = FALSE;
 		textdump_dumpsys(&dumper);
 	}
 #endif
 	if (coredump)
 		error = dumpsys(&dumper);
 
 	dumping--;
 	return (error);
 }
 
 /*
  * Shutdown the system cleanly to prepare for reboot, halt, or power off.
  */
 void
 kern_reboot(int howto)
 {
 	static int once = 0;
 
 #if defined(SMP)
 	/*
 	 * Bind us to CPU 0 so that all shutdown code runs there.  Some
 	 * systems don't shutdown properly (i.e., ACPI power off) if we
 	 * run on another processor.
 	 */
 	if (!SCHEDULER_STOPPED()) {
 		thread_lock(curthread);
 		sched_bind(curthread, 0);
 		thread_unlock(curthread);
 		KASSERT(PCPU_GET(cpuid) == 0, ("boot: not running on cpu 0"));
 	}
 #endif
 	/* We're in the process of rebooting. */
 	rebooting = 1;
 
 	/* We are out of the debugger now. */
 	kdb_active = 0;
 
 	/*
 	 * Do any callouts that should be done BEFORE syncing the filesystems.
 	 */
 	EVENTHANDLER_INVOKE(shutdown_pre_sync, howto);
 
 	/* 
 	 * Now sync filesystems
 	 */
 	if (!cold && (howto & RB_NOSYNC) == 0 && once == 0) {
 		once = 1;
 		bufshutdown(show_busybufs);
 	}
 
 	print_uptime();
 
 	cngrab();
 
 	/*
 	 * Ok, now do things that assume all filesystem activity has
 	 * been completed.
 	 */
 	EVENTHANDLER_INVOKE(shutdown_post_sync, howto);
 
 	if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping) 
 		doadump(TRUE);
 
 	/* Now that we're going to really halt the system... */
 	EVENTHANDLER_INVOKE(shutdown_final, howto);
 
 	for(;;) ;	/* safety against shutdown_reset not working */
 	/* NOTREACHED */
 }
 
 /*
  * The system call that results in changing the rootfs.
  */
 static int
 kern_reroot(void)
 {
 	struct vnode *oldrootvnode, *vp;
 	struct mount *mp, *devmp;
 	int error;
 
 	if (curproc != initproc)
 		return (EPERM);
 
 	/*
 	 * Mark the filesystem containing currently-running executable
 	 * (the temporary copy of init(8)) busy.
 	 */
 	vp = curproc->p_textvp;
 	error = vn_lock(vp, LK_SHARED);
 	if (error != 0)
 		return (error);
 	mp = vp->v_mount;
 	error = vfs_busy(mp, MBF_NOWAIT);
 	if (error != 0) {
 		vfs_ref(mp);
 		VOP_UNLOCK(vp, 0);
 		error = vfs_busy(mp, 0);
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		vfs_rel(mp);
 		if (error != 0) {
 			VOP_UNLOCK(vp, 0);
 			return (ENOENT);
 		}
 		if (vp->v_iflag & VI_DOOMED) {
 			VOP_UNLOCK(vp, 0);
 			vfs_unbusy(mp);
 			return (ENOENT);
 		}
 	}
 	VOP_UNLOCK(vp, 0);
 
 	/*
 	 * Remove the filesystem containing currently-running executable
 	 * from the mount list, to prevent it from being unmounted
 	 * by vfs_unmountall(), and to avoid confusing vfs_mountroot().
 	 *
 	 * Also preserve /dev - forcibly unmounting it could cause driver
 	 * reinitialization.
 	 */
 
 	vfs_ref(rootdevmp);
 	devmp = rootdevmp;
 	rootdevmp = NULL;
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_REMOVE(&mountlist, mp, mnt_list);
 	TAILQ_REMOVE(&mountlist, devmp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 
 	oldrootvnode = rootvnode;
 
 	/*
 	 * Unmount everything except for the two filesystems preserved above.
 	 */
 	vfs_unmountall();
 
 	/*
 	 * Add /dev back; vfs_mountroot() will move it into its new place.
 	 */
 	mtx_lock(&mountlist_mtx);
 	TAILQ_INSERT_HEAD(&mountlist, devmp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	rootdevmp = devmp;
 	vfs_rel(rootdevmp);
 
 	/*
 	 * Mount the new rootfs.
 	 */
 	vfs_mountroot();
 
 	/*
 	 * Update all references to the old rootvnode.
 	 */
 	mountcheckdirs(oldrootvnode, rootvnode);
 
 	/*
 	 * Add the temporary filesystem back and unbusy it.
 	 */
 	mtx_lock(&mountlist_mtx);
 	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	vfs_unbusy(mp);
 
 	return (0);
 }
 
 /*
  * If the shutdown was a clean halt, behave accordingly.
  */
 static void
 shutdown_halt(void *junk, int howto)
 {
 
 	if (howto & RB_HALT) {
 		printf("\n");
 		printf("The operating system has halted.\n");
 		printf("Please press any key to reboot.\n\n");
 		switch (cngetc()) {
 		case -1:		/* No console, just die */
 			cpu_halt();
 			/* NOTREACHED */
 		default:
 			howto &= ~RB_HALT;
 			break;
 		}
 	}
 }
 
 /*
  * Check to see if the system paniced, pause and then reboot
  * according to the specified delay.
  */
 static void
 shutdown_panic(void *junk, int howto)
 {
 	int loop;
 
 	if (howto & RB_DUMP) {
 		if (panic_reboot_wait_time != 0) {
 			if (panic_reboot_wait_time != -1) {
 				printf("Automatic reboot in %d seconds - "
 				       "press a key on the console to abort\n",
 					panic_reboot_wait_time);
 				for (loop = panic_reboot_wait_time * 10;
 				     loop > 0; --loop) {
 					DELAY(1000 * 100); /* 1/10th second */
 					/* Did user type a key? */
 					if (cncheckc() != -1)
 						break;
 				}
 				if (!loop)
 					return;
 			}
 		} else { /* zero time specified - reboot NOW */
 			return;
 		}
 		printf("--> Press a key on the console to reboot,\n");
 		printf("--> or switch off the system now.\n");
 		cngetc();
 	}
 }
 
 /*
  * Everything done, now reset
  */
 static void
 shutdown_reset(void *junk, int howto)
 {
 
 	printf("Rebooting...\n");
 	DELAY(1000000);	/* wait 1 sec for printf's to complete and be read */
 
 	/*
 	 * Acquiring smp_ipi_mtx here has a double effect:
 	 * - it disables interrupts avoiding CPU0 preemption
 	 *   by fast handlers (thus deadlocking  against other CPUs)
 	 * - it avoids deadlocks against smp_rendezvous() or, more 
 	 *   generally, threads busy-waiting, with this spinlock held,
 	 *   and waiting for responses by threads on other CPUs
 	 *   (ie. smp_tlb_shootdown()).
 	 *
 	 * For the !SMP case it just needs to handle the former problem.
 	 */
 #ifdef SMP
 	mtx_lock_spin(&smp_ipi_mtx);
 #else
 	spinlock_enter();
 #endif
 
 	/* cpu_boot(howto); */ /* doesn't do anything at the moment */
 	cpu_reset();
 	/* NOTREACHED */ /* assuming reset worked */
 }
 
 #if defined(WITNESS) || defined(INVARIANT_SUPPORT)
 static int kassert_warn_only = 0;
 #ifdef KDB
 static int kassert_do_kdb = 0;
 #endif
 #ifdef KTR
 static int kassert_do_ktr = 0;
 #endif
 static int kassert_do_log = 1;
 static int kassert_log_pps_limit = 4;
 static int kassert_log_mute_at = 0;
 static int kassert_log_panic_at = 0;
 static int kassert_warnings = 0;
 
 SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW, NULL, "kassert options");
 
 SYSCTL_INT(_debug_kassert, OID_AUTO, warn_only, CTLFLAG_RWTUN,
     &kassert_warn_only, 0,
     "KASSERT triggers a panic (1) or just a warning (0)");
 
 #ifdef KDB
 SYSCTL_INT(_debug_kassert, OID_AUTO, do_kdb, CTLFLAG_RWTUN,
     &kassert_do_kdb, 0, "KASSERT will enter the debugger");
 #endif
 
 #ifdef KTR
 SYSCTL_UINT(_debug_kassert, OID_AUTO, do_ktr, CTLFLAG_RWTUN,
     &kassert_do_ktr, 0,
     "KASSERT does a KTR, set this to the KTRMASK you want");
 #endif
 
 SYSCTL_INT(_debug_kassert, OID_AUTO, do_log, CTLFLAG_RWTUN,
     &kassert_do_log, 0, "KASSERT triggers a panic (1) or just a warning (0)");
 
 SYSCTL_INT(_debug_kassert, OID_AUTO, warnings, CTLFLAG_RWTUN,
     &kassert_warnings, 0, "number of KASSERTs that have been triggered");
 
 SYSCTL_INT(_debug_kassert, OID_AUTO, log_panic_at, CTLFLAG_RWTUN,
     &kassert_log_panic_at, 0, "max number of KASSERTS before we will panic");
 
 SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, CTLFLAG_RWTUN,
     &kassert_log_pps_limit, 0, "limit number of log messages per second");
 
 SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, CTLFLAG_RWTUN,
     &kassert_log_mute_at, 0, "max number of KASSERTS to log");
 
 static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
     kassert_sysctl_kassert, "I", "set to trigger a test kassert");
 
 static int
 kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS)
 {
 	int error, i;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error == 0) {
 		i = 0;
 		error = sysctl_handle_int(oidp, &i, 0, req);
 	}
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	KASSERT(0, ("kassert_sysctl_kassert triggered kassert %d", i));
 	return (0);
 }
 
 /*
  * Called by KASSERT, this decides if we will panic
  * or if we will log via printf and/or ktr.
  */
 void
 kassert_panic(const char *fmt, ...)
 {
 	static char buf[256];
 	va_list ap;
 
 	va_start(ap, fmt);
 	(void)vsnprintf(buf, sizeof(buf), fmt, ap);
 	va_end(ap);
 
 	/*
 	 * panic if we're not just warning, or if we've exceeded
 	 * kassert_log_panic_at warnings.
 	 */
 	if (!kassert_warn_only ||
 	    (kassert_log_panic_at > 0 &&
 	     kassert_warnings >= kassert_log_panic_at)) {
 		va_start(ap, fmt);
 		vpanic(fmt, ap);
 		/* NORETURN */
 	}
 #ifdef KTR
 	if (kassert_do_ktr)
 		CTR0(ktr_mask, buf);
 #endif /* KTR */
 	/*
 	 * log if we've not yet met the mute limit.
 	 */
 	if (kassert_do_log &&
 	    (kassert_log_mute_at == 0 ||
 	     kassert_warnings < kassert_log_mute_at)) {
 		static  struct timeval lasterr;
 		static  int curerr;
 
 		if (ppsratecheck(&lasterr, &curerr, kassert_log_pps_limit)) {
 			printf("KASSERT failed: %s\n", buf);
 			kdb_backtrace();
 		}
 	}
 #ifdef KDB
 	if (kassert_do_kdb) {
 		kdb_enter(KDB_WHY_KASSERT, buf);
 	}
 #endif
 	atomic_add_int(&kassert_warnings, 1);
 }
 #endif
 
 /*
  * Panic is called on unresolvable fatal errors.  It prints "panic: mesg",
  * and then reboots.  If we are called twice, then we avoid trying to sync
  * the disks as this often leads to recursive panics.
  */
 void
 panic(const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 	vpanic(fmt, ap);
 }
 
 void
 vpanic(const char *fmt, va_list ap)
 {
 #ifdef SMP
 	cpuset_t other_cpus;
 #endif
 	struct thread *td = curthread;
 	int bootopt, newpanic;
 	static char buf[256];
 
 	spinlock_enter();
 
 #ifdef SMP
 	/*
 	 * stop_cpus_hard(other_cpus) should prevent multiple CPUs from
 	 * concurrently entering panic.  Only the winner will proceed
 	 * further.
 	 */
 	if (panicstr == NULL && !kdb_active) {
 		other_cpus = all_cpus;
 		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 		stop_cpus_hard(other_cpus);
 	}
+#endif
 
 	/*
 	 * Ensure that the scheduler is stopped while panicking, even if panic
 	 * has been entered from kdb.
 	 */
 	td->td_stopsched = 1;
-#endif
 
 	bootopt = RB_AUTOBOOT;
 	newpanic = 0;
 	if (panicstr)
 		bootopt |= RB_NOSYNC;
 	else {
 		bootopt |= RB_DUMP;
 		panicstr = fmt;
 		newpanic = 1;
 	}
 
 	if (newpanic) {
 		(void)vsnprintf(buf, sizeof(buf), fmt, ap);
 		panicstr = buf;
 		cngrab();
 		printf("panic: %s\n", buf);
 	} else {
 		printf("panic: ");
 		vprintf(fmt, ap);
 		printf("\n");
 	}
 #ifdef SMP
 	printf("cpuid = %d\n", PCPU_GET(cpuid));
 #endif
 
 #ifdef KDB
 	if (newpanic && trace_on_panic)
 		kdb_backtrace();
 	if (debugger_on_panic)
 		kdb_enter(KDB_WHY_PANIC, "panic");
 #endif
 	/*thread_lock(td); */
 	td->td_flags |= TDF_INPANIC;
 	/* thread_unlock(td); */
 	if (!sync_on_panic)
 		bootopt |= RB_NOSYNC;
 	kern_reboot(bootopt);
 }
 
 /*
  * Support for poweroff delay.
  *
  * Please note that setting this delay too short might power off your machine
  * before the write cache on your hard disk has been flushed, leading to
  * soft-updates inconsistencies.
  */
 #ifndef POWEROFF_DELAY
 # define POWEROFF_DELAY 5000
 #endif
 static int poweroff_delay = POWEROFF_DELAY;
 
 SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW,
     &poweroff_delay, 0, "Delay before poweroff to write disk caches (msec)");
 
 static void
 poweroff_wait(void *junk, int howto)
 {
 
 	if (!(howto & RB_POWEROFF) || poweroff_delay <= 0)
 		return;
 	DELAY(poweroff_delay * 1000);
 }
 
 /*
  * Some system processes (e.g. syncer) need to be stopped at appropriate
  * points in their main loops prior to a system shutdown, so that they
  * won't interfere with the shutdown process (e.g. by holding a disk buf
  * to cause sync to fail).  For each of these system processes, register
  * shutdown_kproc() as a handler for one of shutdown events.
  */
 static int kproc_shutdown_wait = 60;
 SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW,
     &kproc_shutdown_wait, 0, "Max wait time (sec) to stop for each process");
 
 void
 kproc_shutdown(void *arg, int howto)
 {
 	struct proc *p;
 	int error;
 
 	if (panicstr)
 		return;
 
 	p = (struct proc *)arg;
 	printf("Waiting (max %d seconds) for system process `%s' to stop... ",
 	    kproc_shutdown_wait, p->p_comm);
 	error = kproc_suspend(p, kproc_shutdown_wait * hz);
 
 	if (error == EWOULDBLOCK)
 		printf("timed out\n");
 	else
 		printf("done\n");
 }
 
 void
 kthread_shutdown(void *arg, int howto)
 {
 	struct thread *td;
 	int error;
 
 	if (panicstr)
 		return;
 
 	td = (struct thread *)arg;
 	printf("Waiting (max %d seconds) for system thread `%s' to stop... ",
 	    kproc_shutdown_wait, td->td_name);
 	error = kthread_suspend(td, kproc_shutdown_wait * hz);
 
 	if (error == EWOULDBLOCK)
 		printf("timed out\n");
 	else
 		printf("done\n");
 }
 
 static char dumpdevname[sizeof(((struct cdev*)NULL)->si_name)];
 SYSCTL_STRING(_kern_shutdown, OID_AUTO, dumpdevname, CTLFLAG_RD,
     dumpdevname, 0, "Device for kernel dumps");
 
 #ifdef EKCD
 static struct kerneldumpcrypto *
 kerneldumpcrypto_create(size_t blocksize, uint8_t encryption,
     const uint8_t *key, uint32_t encryptedkeysize, const uint8_t *encryptedkey)
 {
 	struct kerneldumpcrypto *kdc;
 	struct kerneldumpkey *kdk;
 	uint32_t dumpkeysize;
 
 	dumpkeysize = roundup2(sizeof(*kdk) + encryptedkeysize, blocksize);
 	kdc = malloc(sizeof(*kdc) + dumpkeysize, M_EKCD, M_WAITOK | M_ZERO);
 
 	arc4rand(kdc->kdc_iv, sizeof(kdc->kdc_iv), 0);
 
 	kdc->kdc_encryption = encryption;
 	switch (kdc->kdc_encryption) {
 	case KERNELDUMP_ENC_AES_256_CBC:
 		if (rijndael_makeKey(&kdc->kdc_ki, DIR_ENCRYPT, 256, key) <= 0)
 			goto failed;
 		break;
 	default:
 		goto failed;
 	}
 
 	kdc->kdc_dumpkeysize = dumpkeysize;
 	kdk = kdc->kdc_dumpkey;
 	kdk->kdk_encryption = kdc->kdc_encryption;
 	memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv));
 	kdk->kdk_encryptedkeysize = htod32(encryptedkeysize);
 	memcpy(kdk->kdk_encryptedkey, encryptedkey, encryptedkeysize);
 
 	return (kdc);
 failed:
 	explicit_bzero(kdc, sizeof(*kdc) + dumpkeysize);
 	free(kdc, M_EKCD);
 	return (NULL);
 }
 #endif /* EKCD */
 
 int
 kerneldumpcrypto_init(struct kerneldumpcrypto *kdc)
 {
 #ifndef EKCD
 	return (0);
 #else
 	uint8_t hash[SHA256_DIGEST_LENGTH];
 	SHA256_CTX ctx;
 	struct kerneldumpkey *kdk;
 	int error;
 
 	error = 0;
 
 	if (kdc == NULL)
 		return (0);
 
 	/*
 	 * When a user enters ddb it can write a crash dump multiple times.
 	 * Each time it should be encrypted using a different IV.
 	 */
 	SHA256_Init(&ctx);
 	SHA256_Update(&ctx, kdc->kdc_iv, sizeof(kdc->kdc_iv));
 	SHA256_Final(hash, &ctx);
 	bcopy(hash, kdc->kdc_iv, sizeof(kdc->kdc_iv));
 
 	switch (kdc->kdc_encryption) {
 	case KERNELDUMP_ENC_AES_256_CBC:
 		if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC,
 		    kdc->kdc_iv) <= 0) {
 			error = EINVAL;
 			goto out;
 		}
 		break;
 	default:
 		error = EINVAL;
 		goto out;
 	}
 
 	kdc->kdc_nextoffset = 0;
 
 	kdk = kdc->kdc_dumpkey;
 	memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv));
 out:
 	explicit_bzero(hash, sizeof(hash));
 	return (error);
 #endif
 }
 
 uint32_t
 kerneldumpcrypto_dumpkeysize(const struct kerneldumpcrypto *kdc)
 {
 
 #ifdef EKCD
 	if (kdc == NULL)
 		return (0);
 	return (kdc->kdc_dumpkeysize);
 #else
 	return (0);
 #endif
 }
 
 /* Registration of dumpers */
 int
 set_dumper(struct dumperinfo *di, const char *devname, struct thread *td,
     uint8_t encryption, const uint8_t *key, uint32_t encryptedkeysize,
     const uint8_t *encryptedkey)
 {
 	size_t wantcopy;
 	int error;
 
 	error = priv_check(td, PRIV_SETDUMPER);
 	if (error != 0)
 		return (error);
 
 	if (di == NULL) {
 		error = 0;
 		goto cleanup;
 	}
 	if (dumper.dumper != NULL)
 		return (EBUSY);
 	dumper = *di;
 	dumper.blockbuf = NULL;
 	dumper.kdc = NULL;
 
 	if (encryption != KERNELDUMP_ENC_NONE) {
 #ifdef EKCD
 		dumper.kdc = kerneldumpcrypto_create(di->blocksize, encryption,
 		    key, encryptedkeysize, encryptedkey);
 		if (dumper.kdc == NULL) {
 			error = EINVAL;
 			goto cleanup;
 		}
 #else
 		error = EOPNOTSUPP;
 		goto cleanup;
 #endif
 	}
 
 	wantcopy = strlcpy(dumpdevname, devname, sizeof(dumpdevname));
 	if (wantcopy >= sizeof(dumpdevname)) {
 		printf("set_dumper: device name truncated from '%s' -> '%s'\n",
 			devname, dumpdevname);
 	}
 
 	dumper.blockbuf = malloc(di->blocksize, M_DUMPER, M_WAITOK | M_ZERO);
 	return (0);
 cleanup:
 #ifdef EKCD
 	if (dumper.kdc != NULL) {
 		explicit_bzero(dumper.kdc, sizeof(*dumper.kdc) +
 		    dumper.kdc->kdc_dumpkeysize);
 		free(dumper.kdc, M_EKCD);
 	}
 #endif
 	if (dumper.blockbuf != NULL) {
 		explicit_bzero(dumper.blockbuf, dumper.blocksize);
 		free(dumper.blockbuf, M_DUMPER);
 	}
 	explicit_bzero(&dumper, sizeof(dumper));
 	dumpdevname[0] = '\0';
 	return (error);
 }
 
 static int
 dump_check_bounds(struct dumperinfo *di, off_t offset, size_t length)
 {
 
 	if (length != 0 && (offset < di->mediaoffset ||
 	    offset - di->mediaoffset + length > di->mediasize)) {
 		printf("Attempt to write outside dump device boundaries.\n"
 	    "offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n",
 		    (intmax_t)offset, (intmax_t)di->mediaoffset,
 		    (uintmax_t)length, (intmax_t)di->mediasize);
 		return (ENOSPC);
 	}
 
 	return (0);
 }
 
 #ifdef EKCD
 static int
 dump_encrypt(struct kerneldumpcrypto *kdc, uint8_t *buf, size_t size)
 {
 
 	switch (kdc->kdc_encryption) {
 	case KERNELDUMP_ENC_AES_256_CBC:
 		if (rijndael_blockEncrypt(&kdc->kdc_ci, &kdc->kdc_ki, buf,
 		    8 * size, buf) <= 0) {
 			return (EIO);
 		}
 		if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC,
 		    buf + size - 16 /* IV size for AES-256-CBC */) <= 0) {
 			return (EIO);
 		}
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 /* Encrypt data and call dumper. */
 static int
 dump_encrypted_write(struct dumperinfo *di, void *virtual, vm_offset_t physical,
     off_t offset, size_t length)
 {
 	static uint8_t buf[KERNELDUMP_BUFFER_SIZE];
 	struct kerneldumpcrypto *kdc;
 	int error;
 	size_t nbytes;
 	off_t nextoffset;
 
 	kdc = di->kdc;
 
 	error = dump_check_bounds(di, offset, length);
 	if (error != 0)
 		return (error);
 
 	/* Signal completion. */
 	if (virtual == NULL && physical == 0 && offset == 0 && length == 0) {
 		return (di->dumper(di->priv, virtual, physical, offset,
 		    length));
 	}
 
 	/* Data have to be aligned to block size. */
 	if ((length % di->blocksize) != 0)
 		return (EINVAL);
 
 	/*
 	 * Data have to be written continuously becase we're encrypting using
 	 * CBC mode which has this assumption.
 	 */
 	if (kdc->kdc_nextoffset != 0 && kdc->kdc_nextoffset != offset)
 		return (EINVAL);
 
 	nextoffset = offset + (off_t)length;
 
 	while (length > 0) {
 		nbytes = MIN(length, sizeof(buf));
 		bcopy(virtual, buf, nbytes);
 
 		if (dump_encrypt(kdc, buf, nbytes) != 0)
 			return (EIO);
 
 		error = di->dumper(di->priv, buf, physical, offset, nbytes);
 		if (error != 0)
 			return (error);
 
 		offset += nbytes;
 		virtual = (void *)((uint8_t *)virtual + nbytes);
 		length -= nbytes;
 	}
 
 	kdc->kdc_nextoffset = nextoffset;
 
 	return (0);
 }
 #endif /* EKCD */
 
 /* Call dumper with bounds checking. */
 static int
 dump_raw_write(struct dumperinfo *di, void *virtual, vm_offset_t physical,
     off_t offset, size_t length)
 {
 	int error;
 
 	error = dump_check_bounds(di, offset, length);
 	if (error != 0)
 		return (error);
 
 	return (di->dumper(di->priv, virtual, physical, offset, length));
 }
 
 int
 dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical,
     off_t offset, size_t length)
 {
 
 #ifdef EKCD
 	if (di->kdc != NULL) {
 		return (dump_encrypted_write(di, virtual, physical, offset,
 		    length));
 	}
 #endif
 
 	return (dump_raw_write(di, virtual, physical, offset, length));
 }
 
 static int
 dump_pad(struct dumperinfo *di, void *virtual, size_t length, void **buf,
     size_t *size)
 {
 
 	if (length > di->blocksize)
 		return (ENOMEM);
 
 	*size = di->blocksize;
 	if (length == di->blocksize) {
 		*buf = virtual;
 	} else {
 		*buf = di->blockbuf;
 		memcpy(*buf, virtual, length);
 		memset((uint8_t *)*buf + length, 0, di->blocksize - length);
 	}
 
 	return (0);
 }
 
 static int
 dump_raw_write_pad(struct dumperinfo *di, void *virtual, vm_offset_t physical,
     off_t offset, size_t length, size_t *size)
 {
 	void *buf;
 	int error;
 
 	error = dump_pad(di, virtual, length, &buf, size);
 	if (error != 0)
 		return (error);
 
 	return (dump_raw_write(di, buf, physical, offset, *size));
 }
 
 int
 dump_write_pad(struct dumperinfo *di, void *virtual, vm_offset_t physical,
     off_t offset, size_t length, size_t *size)
 {
 	void *buf;
 	int error;
 
 	error = dump_pad(di, virtual, length, &buf, size);
 	if (error != 0)
 		return (error);
 
 	return (dump_write(di, buf, physical, offset, *size));
 }
 
 int
 dump_write_header(struct dumperinfo *di, struct kerneldumpheader *kdh,
     vm_offset_t physical, off_t offset)
 {
 	size_t size;
 	int ret;
 
 	ret = dump_raw_write_pad(di, kdh, physical, offset, sizeof(*kdh),
 	    &size);
 	if (ret == 0 && size != di->blocksize)
 		ret = EINVAL;
 	return (ret);
 }
 
 int
 dump_write_key(struct dumperinfo *di, vm_offset_t physical, off_t offset)
 {
 #ifndef EKCD
 	return (0);
 #else /* EKCD */
 	struct kerneldumpcrypto *kdc;
 
 	kdc = di->kdc;
 	if (kdc == NULL)
 		return (0);
 
 	return (dump_raw_write(di, kdc->kdc_dumpkey, physical, offset,
 	    kdc->kdc_dumpkeysize));
 #endif /* !EKCD */
 }
 
 void
 mkdumpheader(struct kerneldumpheader *kdh, char *magic, uint32_t archver,
     uint64_t dumplen, uint32_t dumpkeysize, uint32_t blksz)
 {
 
 	bzero(kdh, sizeof(*kdh));
 	strlcpy(kdh->magic, magic, sizeof(kdh->magic));
 	strlcpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture));
 	kdh->version = htod32(KERNELDUMPVERSION);
 	kdh->architectureversion = htod32(archver);
 	kdh->dumplength = htod64(dumplen);
 	kdh->dumptime = htod64(time_second);
 	kdh->dumpkeysize = htod32(dumpkeysize);
 	kdh->blocksize = htod32(blksz);
 	strlcpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname));
 	strlcpy(kdh->versionstring, version, sizeof(kdh->versionstring));
 	if (panicstr != NULL)
 		strlcpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring));
 	kdh->parity = kerneldump_parity(kdh);
 }
 
 #ifdef DDB
 DB_SHOW_COMMAND(panic, db_show_panic)
 {
 
 	if (panicstr == NULL)
 		db_printf("panicstr not set\n");
 	else
 		db_printf("panic: %s\n", panicstr);
 }
 #endif
Index: projects/netbsd-tests-upstream-01-2017/sys/kern/subr_gtaskqueue.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/sys/kern/subr_gtaskqueue.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/sys/kern/subr_gtaskqueue.c	(revision 312218)
@@ -1,937 +1,937 @@
 /*-
  * Copyright (c) 2000 Doug Rabson
  * Copyright (c) 2014 Jeff Roberson
  * Copyright (c) 2016 Matthew Macy
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cpuset.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/libkern.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/gtaskqueue.h>
 #include <sys/unistd.h>
 #include <machine/stdarg.h>
 
 static MALLOC_DEFINE(M_GTASKQUEUE, "taskqueue", "Task Queues");
 static void	gtaskqueue_thread_enqueue(void *);
 static void	gtaskqueue_thread_loop(void *arg);
 
 struct gtaskqueue_busy {
 	struct gtask	*tb_running;
 	TAILQ_ENTRY(gtaskqueue_busy) tb_link;
 };
 
 static struct gtask * const TB_DRAIN_WAITER = (struct gtask *)0x1;
 
 struct gtaskqueue {
 	STAILQ_HEAD(, gtask)	tq_queue;
 	gtaskqueue_enqueue_fn	tq_enqueue;
 	void			*tq_context;
 	char			*tq_name;
 	TAILQ_HEAD(, gtaskqueue_busy) tq_active;
 	struct mtx		tq_mutex;
 	struct thread		**tq_threads;
 	int			tq_tcount;
 	int			tq_spin;
 	int			tq_flags;
 	int			tq_callouts;
 	taskqueue_callback_fn	tq_callbacks[TASKQUEUE_NUM_CALLBACKS];
 	void			*tq_cb_contexts[TASKQUEUE_NUM_CALLBACKS];
 };
 
 #define	TQ_FLAGS_ACTIVE		(1 << 0)
 #define	TQ_FLAGS_BLOCKED	(1 << 1)
 #define	TQ_FLAGS_UNLOCKED_ENQUEUE	(1 << 2)
 
 #define	DT_CALLOUT_ARMED	(1 << 0)
 
 #define	TQ_LOCK(tq)							\
 	do {								\
 		if ((tq)->tq_spin)					\
 			mtx_lock_spin(&(tq)->tq_mutex);			\
 		else							\
 			mtx_lock(&(tq)->tq_mutex);			\
 	} while (0)
 #define	TQ_ASSERT_LOCKED(tq)	mtx_assert(&(tq)->tq_mutex, MA_OWNED)
 
 #define	TQ_UNLOCK(tq)							\
 	do {								\
 		if ((tq)->tq_spin)					\
 			mtx_unlock_spin(&(tq)->tq_mutex);		\
 		else							\
 			mtx_unlock(&(tq)->tq_mutex);			\
 	} while (0)
 #define	TQ_ASSERT_UNLOCKED(tq)	mtx_assert(&(tq)->tq_mutex, MA_NOTOWNED)
 
 #ifdef INVARIANTS
 static void
 gtask_dump(struct gtask *gtask)
 {
 	printf("gtask: %p ta_flags=%x ta_priority=%d ta_func=%p ta_context=%p\n",
 	       gtask, gtask->ta_flags, gtask->ta_priority, gtask->ta_func, gtask->ta_context);
 }
 #endif
 
 static __inline int
 TQ_SLEEP(struct gtaskqueue *tq, void *p, struct mtx *m, int pri, const char *wm,
     int t)
 {
 	if (tq->tq_spin)
 		return (msleep_spin(p, m, wm, t));
 	return (msleep(p, m, pri, wm, t));
 }
 
 static struct gtaskqueue *
 _gtaskqueue_create(const char *name, int mflags,
 		 taskqueue_enqueue_fn enqueue, void *context,
 		 int mtxflags, const char *mtxname __unused)
 {
 	struct gtaskqueue *queue;
 	char *tq_name;
 
 	tq_name = malloc(TASKQUEUE_NAMELEN, M_GTASKQUEUE, mflags | M_ZERO);
 	if (!tq_name)
 		return (NULL);
 
 	snprintf(tq_name, TASKQUEUE_NAMELEN, "%s", (name) ? name : "taskqueue");
 
 	queue = malloc(sizeof(struct gtaskqueue), M_GTASKQUEUE, mflags | M_ZERO);
 	if (!queue)
 		return (NULL);
 
 	STAILQ_INIT(&queue->tq_queue);
 	TAILQ_INIT(&queue->tq_active);
 	queue->tq_enqueue = enqueue;
 	queue->tq_context = context;
 	queue->tq_name = tq_name;
 	queue->tq_spin = (mtxflags & MTX_SPIN) != 0;
 	queue->tq_flags |= TQ_FLAGS_ACTIVE;
 	if (enqueue == gtaskqueue_thread_enqueue)
 		queue->tq_flags |= TQ_FLAGS_UNLOCKED_ENQUEUE;
 	mtx_init(&queue->tq_mutex, tq_name, NULL, mtxflags);
 
 	return (queue);
 }
 
 
 /*
  * Signal a taskqueue thread to terminate.
  */
 static void
 gtaskqueue_terminate(struct thread **pp, struct gtaskqueue *tq)
 {
 
 	while (tq->tq_tcount > 0 || tq->tq_callouts > 0) {
 		wakeup(tq);
 		TQ_SLEEP(tq, pp, &tq->tq_mutex, PWAIT, "taskqueue_destroy", 0);
 	}
 }
 
 static void
 gtaskqueue_free(struct gtaskqueue *queue)
 {
 
 	TQ_LOCK(queue);
 	queue->tq_flags &= ~TQ_FLAGS_ACTIVE;
 	gtaskqueue_terminate(queue->tq_threads, queue);
 	KASSERT(TAILQ_EMPTY(&queue->tq_active), ("Tasks still running?"));
 	KASSERT(queue->tq_callouts == 0, ("Armed timeout tasks"));
 	mtx_destroy(&queue->tq_mutex);
 	free(queue->tq_threads, M_GTASKQUEUE);
 	free(queue->tq_name, M_GTASKQUEUE);
 	free(queue, M_GTASKQUEUE);
 }
 
 int
 grouptaskqueue_enqueue(struct gtaskqueue *queue, struct gtask *gtask)
 {
 #ifdef INVARIANTS
 	if (queue == NULL) {
 		gtask_dump(gtask);
 		panic("queue == NULL");
 	}
 #endif
 	TQ_LOCK(queue);
 	if (gtask->ta_flags & TASK_ENQUEUED) {
 		TQ_UNLOCK(queue);
 		return (0);
 	}
 	STAILQ_INSERT_TAIL(&queue->tq_queue, gtask, ta_link);
 	gtask->ta_flags |= TASK_ENQUEUED;
 	TQ_UNLOCK(queue);
 	if ((queue->tq_flags & TQ_FLAGS_BLOCKED) == 0)
 		queue->tq_enqueue(queue->tq_context);
 	return (0);
 }
 
 static void
 gtaskqueue_task_nop_fn(void *context)
 {
 }
 
 /*
  * Block until all currently queued tasks in this taskqueue
  * have begun execution.  Tasks queued during execution of
  * this function are ignored.
  */
 static void
 gtaskqueue_drain_tq_queue(struct gtaskqueue *queue)
 {
 	struct gtask t_barrier;
 
 	if (STAILQ_EMPTY(&queue->tq_queue))
 		return;
 
 	/*
 	 * Enqueue our barrier after all current tasks, but with
 	 * the highest priority so that newly queued tasks cannot
 	 * pass it.  Because of the high priority, we can not use
 	 * taskqueue_enqueue_locked directly (which drops the lock
 	 * anyway) so just insert it at tail while we have the
 	 * queue lock.
 	 */
 	GTASK_INIT(&t_barrier, 0, USHRT_MAX, gtaskqueue_task_nop_fn, &t_barrier);
 	STAILQ_INSERT_TAIL(&queue->tq_queue, &t_barrier, ta_link);
 	t_barrier.ta_flags |= TASK_ENQUEUED;
 
 	/*
 	 * Once the barrier has executed, all previously queued tasks
 	 * have completed or are currently executing.
 	 */
 	while (t_barrier.ta_flags & TASK_ENQUEUED)
 		TQ_SLEEP(queue, &t_barrier, &queue->tq_mutex, PWAIT, "-", 0);
 }
 
 /*
  * Block until all currently executing tasks for this taskqueue
  * complete.  Tasks that begin execution during the execution
  * of this function are ignored.
  */
 static void
 gtaskqueue_drain_tq_active(struct gtaskqueue *queue)
 {
 	struct gtaskqueue_busy tb_marker, *tb_first;
 
 	if (TAILQ_EMPTY(&queue->tq_active))
 		return;
 
 	/* Block taskq_terminate().*/
 	queue->tq_callouts++;
 
 	/*
 	 * Wait for all currently executing taskqueue threads
 	 * to go idle.
 	 */
 	tb_marker.tb_running = TB_DRAIN_WAITER;
 	TAILQ_INSERT_TAIL(&queue->tq_active, &tb_marker, tb_link);
 	while (TAILQ_FIRST(&queue->tq_active) != &tb_marker)
 		TQ_SLEEP(queue, &tb_marker, &queue->tq_mutex, PWAIT, "-", 0);
 	TAILQ_REMOVE(&queue->tq_active, &tb_marker, tb_link);
 
 	/*
 	 * Wakeup any other drain waiter that happened to queue up
 	 * without any intervening active thread.
 	 */
 	tb_first = TAILQ_FIRST(&queue->tq_active);
 	if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER)
 		wakeup(tb_first);
 
 	/* Release taskqueue_terminate(). */
 	queue->tq_callouts--;
 	if ((queue->tq_flags & TQ_FLAGS_ACTIVE) == 0)
 		wakeup_one(queue->tq_threads);
 }
 
 void
 gtaskqueue_block(struct gtaskqueue *queue)
 {
 
 	TQ_LOCK(queue);
 	queue->tq_flags |= TQ_FLAGS_BLOCKED;
 	TQ_UNLOCK(queue);
 }
 
 void
 gtaskqueue_unblock(struct gtaskqueue *queue)
 {
 
 	TQ_LOCK(queue);
 	queue->tq_flags &= ~TQ_FLAGS_BLOCKED;
 	if (!STAILQ_EMPTY(&queue->tq_queue))
 		queue->tq_enqueue(queue->tq_context);
 	TQ_UNLOCK(queue);
 }
 
 static void
 gtaskqueue_run_locked(struct gtaskqueue *queue)
 {
 	struct gtaskqueue_busy tb;
 	struct gtaskqueue_busy *tb_first;
 	struct gtask *gtask;
 
 	KASSERT(queue != NULL, ("tq is NULL"));
 	TQ_ASSERT_LOCKED(queue);
 	tb.tb_running = NULL;
 
 	while (STAILQ_FIRST(&queue->tq_queue)) {
 		TAILQ_INSERT_TAIL(&queue->tq_active, &tb, tb_link);
 
 		/*
 		 * Carefully remove the first task from the queue and
 		 * clear its TASK_ENQUEUED flag
 		 */
 		gtask = STAILQ_FIRST(&queue->tq_queue);
 		KASSERT(gtask != NULL, ("task is NULL"));
 		STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link);
 		gtask->ta_flags &= ~TASK_ENQUEUED;
 		tb.tb_running = gtask;
 		TQ_UNLOCK(queue);
 
 		KASSERT(gtask->ta_func != NULL, ("task->ta_func is NULL"));
 		gtask->ta_func(gtask->ta_context);
 
 		TQ_LOCK(queue);
 		tb.tb_running = NULL;
 		wakeup(gtask);
 
 		TAILQ_REMOVE(&queue->tq_active, &tb, tb_link);
 		tb_first = TAILQ_FIRST(&queue->tq_active);
 		if (tb_first != NULL &&
 		    tb_first->tb_running == TB_DRAIN_WAITER)
 			wakeup(tb_first);
 	}
 }
 
 static int
 task_is_running(struct gtaskqueue *queue, struct gtask *gtask)
 {
 	struct gtaskqueue_busy *tb;
 
 	TQ_ASSERT_LOCKED(queue);
 	TAILQ_FOREACH(tb, &queue->tq_active, tb_link) {
 		if (tb->tb_running == gtask)
 			return (1);
 	}
 	return (0);
 }
 
 static int
 gtaskqueue_cancel_locked(struct gtaskqueue *queue, struct gtask *gtask)
 {
 
 	if (gtask->ta_flags & TASK_ENQUEUED)
 		STAILQ_REMOVE(&queue->tq_queue, gtask, gtask, ta_link);
 	gtask->ta_flags &= ~TASK_ENQUEUED;
 	return (task_is_running(queue, gtask) ? EBUSY : 0);
 }
 
 int
 gtaskqueue_cancel(struct gtaskqueue *queue, struct gtask *gtask)
 {
 	int error;
 
 	TQ_LOCK(queue);
 	error = gtaskqueue_cancel_locked(queue, gtask);
 	TQ_UNLOCK(queue);
 
 	return (error);
 }
 
 void
 gtaskqueue_drain(struct gtaskqueue *queue, struct gtask *gtask)
 {
 
 	if (!queue->tq_spin)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
 
 	TQ_LOCK(queue);
 	while ((gtask->ta_flags & TASK_ENQUEUED) || task_is_running(queue, gtask))
 		TQ_SLEEP(queue, gtask, &queue->tq_mutex, PWAIT, "-", 0);
 	TQ_UNLOCK(queue);
 }
 
 void
 gtaskqueue_drain_all(struct gtaskqueue *queue)
 {
 
 	if (!queue->tq_spin)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
 
 	TQ_LOCK(queue);
 	gtaskqueue_drain_tq_queue(queue);
 	gtaskqueue_drain_tq_active(queue);
 	TQ_UNLOCK(queue);
 }
 
 static int
 _gtaskqueue_start_threads(struct gtaskqueue **tqp, int count, int pri,
     cpuset_t *mask, const char *name, va_list ap)
 {
 	char ktname[MAXCOMLEN + 1];
 	struct thread *td;
 	struct gtaskqueue *tq;
 	int i, error;
 
 	if (count <= 0)
 		return (EINVAL);
 
 	vsnprintf(ktname, sizeof(ktname), name, ap);
 	tq = *tqp;
 
 	tq->tq_threads = malloc(sizeof(struct thread *) * count, M_GTASKQUEUE,
 	    M_NOWAIT | M_ZERO);
 	if (tq->tq_threads == NULL) {
 		printf("%s: no memory for %s threads\n", __func__, ktname);
 		return (ENOMEM);
 	}
 
 	for (i = 0; i < count; i++) {
 		if (count == 1)
 			error = kthread_add(gtaskqueue_thread_loop, tqp, NULL,
 			    &tq->tq_threads[i], RFSTOPPED, 0, "%s", ktname);
 		else
 			error = kthread_add(gtaskqueue_thread_loop, tqp, NULL,
 			    &tq->tq_threads[i], RFSTOPPED, 0,
 			    "%s_%d", ktname, i);
 		if (error) {
 			/* should be ok to continue, taskqueue_free will dtrt */
 			printf("%s: kthread_add(%s): error %d", __func__,
 			    ktname, error);
 			tq->tq_threads[i] = NULL;		/* paranoid */
 		} else
 			tq->tq_tcount++;
 	}
 	for (i = 0; i < count; i++) {
 		if (tq->tq_threads[i] == NULL)
 			continue;
 		td = tq->tq_threads[i];
 		if (mask) {
 			error = cpuset_setthread(td->td_tid, mask);
 			/*
 			 * Failing to pin is rarely an actual fatal error;
 			 * it'll just affect performance.
 			 */
 			if (error)
 				printf("%s: curthread=%llu: can't pin; "
 				    "error=%d\n",
 				    __func__,
 				    (unsigned long long) td->td_tid,
 				    error);
 		}
 		thread_lock(td);
 		sched_prio(td, pri);
 		sched_add(td, SRQ_BORING);
 		thread_unlock(td);
 	}
 
 	return (0);
 }
 
 static int
 gtaskqueue_start_threads(struct gtaskqueue **tqp, int count, int pri,
     const char *name, ...)
 {
 	va_list ap;
 	int error;
 
 	va_start(ap, name);
 	error = _gtaskqueue_start_threads(tqp, count, pri, NULL, name, ap);
 	va_end(ap);
 	return (error);
 }
 
 static inline void
 gtaskqueue_run_callback(struct gtaskqueue *tq,
     enum taskqueue_callback_type cb_type)
 {
 	taskqueue_callback_fn tq_callback;
 
 	TQ_ASSERT_UNLOCKED(tq);
 	tq_callback = tq->tq_callbacks[cb_type];
 	if (tq_callback != NULL)
 		tq_callback(tq->tq_cb_contexts[cb_type]);
 }
 
 static void
 gtaskqueue_thread_loop(void *arg)
 {
 	struct gtaskqueue **tqp, *tq;
 
 	tqp = arg;
 	tq = *tqp;
 	gtaskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_INIT);
 	TQ_LOCK(tq);
 	while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0) {
 		/* XXX ? */
 		gtaskqueue_run_locked(tq);
 		/*
 		 * Because taskqueue_run() can drop tq_mutex, we need to
 		 * check if the TQ_FLAGS_ACTIVE flag wasn't removed in the
 		 * meantime, which means we missed a wakeup.
 		 */
 		if ((tq->tq_flags & TQ_FLAGS_ACTIVE) == 0)
 			break;
 		TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0);
 	}
 	gtaskqueue_run_locked(tq);
 	/*
 	 * This thread is on its way out, so just drop the lock temporarily
 	 * in order to call the shutdown callback.  This allows the callback
 	 * to look at the taskqueue, even just before it dies.
 	 */
 	TQ_UNLOCK(tq);
 	gtaskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN);
 	TQ_LOCK(tq);
 
 	/* rendezvous with thread that asked us to terminate */
 	tq->tq_tcount--;
 	wakeup_one(tq->tq_threads);
 	TQ_UNLOCK(tq);
 	kthread_exit();
 }
 
 static void
 gtaskqueue_thread_enqueue(void *context)
 {
 	struct gtaskqueue **tqp, *tq;
 
 	tqp = context;
 	tq = *tqp;
 	wakeup_one(tq);
 }
 
 
 static struct gtaskqueue *
 gtaskqueue_create_fast(const char *name, int mflags,
 		 taskqueue_enqueue_fn enqueue, void *context)
 {
 	return _gtaskqueue_create(name, mflags, enqueue, context,
 			MTX_SPIN, "fast_taskqueue");
 }
 
 
 struct taskqgroup_cpu {
 	LIST_HEAD(, grouptask)	tgc_tasks;
 	struct gtaskqueue	*tgc_taskq;
 	int	tgc_cnt;
 	int	tgc_cpu;
 };
 
 struct taskqgroup {
 	struct taskqgroup_cpu tqg_queue[MAXCPU];
 	struct mtx	tqg_lock;
 	char *		tqg_name;
 	int		tqg_adjusting;
 	int		tqg_stride;
 	int		tqg_cnt;
 };
 
 struct taskq_bind_task {
 	struct gtask bt_task;
 	int	bt_cpuid;
 };
 
 static void
 taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu)
 {
 	struct taskqgroup_cpu *qcpu;
 
 	qcpu = &qgroup->tqg_queue[idx];
 	LIST_INIT(&qcpu->tgc_tasks);
 	qcpu->tgc_taskq = gtaskqueue_create_fast(NULL, M_WAITOK,
 	    taskqueue_thread_enqueue, &qcpu->tgc_taskq);
 	gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, PI_SOFT,
 	    "%s_%d", qgroup->tqg_name, idx);
 	qcpu->tgc_cpu = cpu;
 }
 
 static void
 taskqgroup_cpu_remove(struct taskqgroup *qgroup, int idx)
 {
 
 	gtaskqueue_free(qgroup->tqg_queue[idx].tgc_taskq);
 }
 
 /*
  * Find the taskq with least # of tasks that doesn't currently have any
  * other queues from the uniq identifier.
  */
 static int
 taskqgroup_find(struct taskqgroup *qgroup, void *uniq)
 {
 	struct grouptask *n;
 	int i, idx, mincnt;
 	int strict;
 
 	mtx_assert(&qgroup->tqg_lock, MA_OWNED);
 	if (qgroup->tqg_cnt == 0)
 		return (0);
 	idx = -1;
 	mincnt = INT_MAX;
 	/*
 	 * Two passes;  First scan for a queue with the least tasks that
 	 * does not already service this uniq id.  If that fails simply find
 	 * the queue with the least total tasks;
 	 */
 	for (strict = 1; mincnt == INT_MAX; strict = 0) {
 		for (i = 0; i < qgroup->tqg_cnt; i++) {
 			if (qgroup->tqg_queue[i].tgc_cnt > mincnt)
 				continue;
 			if (strict) {
 				LIST_FOREACH(n,
 				    &qgroup->tqg_queue[i].tgc_tasks, gt_list)
 					if (n->gt_uniq == uniq)
 						break;
 				if (n != NULL)
 					continue;
 			}
 			mincnt = qgroup->tqg_queue[i].tgc_cnt;
 			idx = i;
 		}
 	}
 	if (idx == -1)
 		panic("taskqgroup_find: Failed to pick a qid.");
 
 	return (idx);
 }
 
 void
 taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *gtask,
     void *uniq, int irq, char *name)
 {
 	cpuset_t mask;
 	int qid;
 
 	gtask->gt_uniq = uniq;
 	gtask->gt_name = name;
 	gtask->gt_irq = irq;
 	gtask->gt_cpu = -1;
 	mtx_lock(&qgroup->tqg_lock);
 	qid = taskqgroup_find(qgroup, uniq);
 	qgroup->tqg_queue[qid].tgc_cnt++;
 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
-	if (irq != -1 && smp_started) {
+	if (irq != -1 && (smp_started || mp_ncpus == 1)) {
 		gtask->gt_cpu = qgroup->tqg_queue[qid].tgc_cpu;
 		CPU_ZERO(&mask);
 		CPU_SET(qgroup->tqg_queue[qid].tgc_cpu, &mask);
 		mtx_unlock(&qgroup->tqg_lock);
 		intr_setaffinity(irq, &mask);
 	} else
 		mtx_unlock(&qgroup->tqg_lock);
 }
 
 static void
 taskqgroup_attach_deferred(struct taskqgroup *qgroup, struct grouptask *gtask)
 {
 	cpuset_t mask;
 	int qid, cpu;
 
 	mtx_lock(&qgroup->tqg_lock);
 	qid = taskqgroup_find(qgroup, gtask->gt_uniq);
 	cpu = qgroup->tqg_queue[qid].tgc_cpu;
 	if (gtask->gt_irq != -1) {
 		mtx_unlock(&qgroup->tqg_lock);
 
 		CPU_ZERO(&mask);
 		CPU_SET(cpu, &mask);
 		intr_setaffinity(gtask->gt_irq, &mask);
 
 		mtx_lock(&qgroup->tqg_lock);
 	}
 	qgroup->tqg_queue[qid].tgc_cnt++;
 
 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask,
 			 gt_list);
 	MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL);
 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
 	mtx_unlock(&qgroup->tqg_lock);
 }
 
 int
 taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *gtask,
 	void *uniq, int cpu, int irq, char *name)
 {
 	cpuset_t mask;
 	int i, qid;
 
 	qid = -1;
 	gtask->gt_uniq = uniq;
 	gtask->gt_name = name;
 	gtask->gt_irq = irq;
 	gtask->gt_cpu = cpu;
 	mtx_lock(&qgroup->tqg_lock);
-	if (smp_started) {
+	if (smp_started || mp_ncpus == 1) {
 		for (i = 0; i < qgroup->tqg_cnt; i++)
 			if (qgroup->tqg_queue[i].tgc_cpu == cpu) {
 				qid = i;
 				break;
 			}
 		if (qid == -1) {
 			mtx_unlock(&qgroup->tqg_lock);
 			return (EINVAL);
 		}
 	} else
 		qid = 0;
 	qgroup->tqg_queue[qid].tgc_cnt++;
 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
 	cpu = qgroup->tqg_queue[qid].tgc_cpu;
 	mtx_unlock(&qgroup->tqg_lock);
 
 	CPU_ZERO(&mask);
 	CPU_SET(cpu, &mask);
-	if (irq != -1 && smp_started)
+	if (irq != -1 && (smp_started || mp_ncpus == 1))
 		intr_setaffinity(irq, &mask);
 	return (0);
 }
 
 static int
 taskqgroup_attach_cpu_deferred(struct taskqgroup *qgroup, struct grouptask *gtask)
 {
 	cpuset_t mask;
 	int i, qid, irq, cpu;
 
 	qid = -1;
 	irq = gtask->gt_irq;
 	cpu = gtask->gt_cpu;
-	MPASS(smp_started);
+	MPASS(smp_started || mp_ncpus == 1);
 	mtx_lock(&qgroup->tqg_lock);
 	for (i = 0; i < qgroup->tqg_cnt; i++)
 		if (qgroup->tqg_queue[i].tgc_cpu == cpu) {
 			qid = i;
 			break;
 		}
 	if (qid == -1) {
 		mtx_unlock(&qgroup->tqg_lock);
 		return (EINVAL);
 	}
 	qgroup->tqg_queue[qid].tgc_cnt++;
 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
 	MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL);
 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
 	mtx_unlock(&qgroup->tqg_lock);
 
 	CPU_ZERO(&mask);
 	CPU_SET(cpu, &mask);
 
 	if (irq != -1)
 		intr_setaffinity(irq, &mask);
 	return (0);
 }
 
 void
 taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask)
 {
 	int i;
 
 	mtx_lock(&qgroup->tqg_lock);
 	for (i = 0; i < qgroup->tqg_cnt; i++)
 		if (qgroup->tqg_queue[i].tgc_taskq == gtask->gt_taskqueue)
 			break;
 	if (i == qgroup->tqg_cnt)
 		panic("taskqgroup_detach: task not in group\n");
 	qgroup->tqg_queue[i].tgc_cnt--;
 	LIST_REMOVE(gtask, gt_list);
 	mtx_unlock(&qgroup->tqg_lock);
 	gtask->gt_taskqueue = NULL;
 }
 
 static void
 taskqgroup_binder(void *ctx)
 {
 	struct taskq_bind_task *gtask = (struct taskq_bind_task *)ctx;
 	cpuset_t mask;
 	int error;
 
 	CPU_ZERO(&mask);
 	CPU_SET(gtask->bt_cpuid, &mask);
 	error = cpuset_setthread(curthread->td_tid, &mask);
 	thread_lock(curthread);
 	sched_bind(curthread, gtask->bt_cpuid);
 	thread_unlock(curthread);
 
 	if (error)
 		printf("taskqgroup_binder: setaffinity failed: %d\n",
 		    error);
 	free(gtask, M_DEVBUF);
 }
 
 static void
 taskqgroup_bind(struct taskqgroup *qgroup)
 {
 	struct taskq_bind_task *gtask;
 	int i;
 
 	/*
 	 * Bind taskqueue threads to specific CPUs, if they have been assigned
 	 * one.
 	 */
 	if (qgroup->tqg_cnt == 1)
 		return;
 
 	for (i = 0; i < qgroup->tqg_cnt; i++) {
 		gtask = malloc(sizeof (*gtask), M_DEVBUF, M_WAITOK);
 		GTASK_INIT(&gtask->bt_task, 0, 0, taskqgroup_binder, gtask);
 		gtask->bt_cpuid = qgroup->tqg_queue[i].tgc_cpu;
 		grouptaskqueue_enqueue(qgroup->tqg_queue[i].tgc_taskq,
 		    &gtask->bt_task);
 	}
 }
 
 static int
 _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
 {
 	LIST_HEAD(, grouptask) gtask_head = LIST_HEAD_INITIALIZER(NULL);
 	struct grouptask *gtask;
 	int i, k, old_cnt, old_cpu, cpu;
 
 	mtx_assert(&qgroup->tqg_lock, MA_OWNED);
 
-	if (cnt < 1 || cnt * stride > mp_ncpus || !smp_started) {
+	if (cnt < 1 || cnt * stride > mp_ncpus || (!smp_started && (mp_ncpus != 1))) {
 		printf("taskqgroup_adjust failed cnt: %d stride: %d mp_ncpus: %d smp_started: %d\n",
 			   cnt, stride, mp_ncpus, smp_started);
 		return (EINVAL);
 	}
 	if (qgroup->tqg_adjusting) {
 		printf("taskqgroup_adjust failed: adjusting\n");
 		return (EBUSY);
 	}
 	qgroup->tqg_adjusting = 1;
 	old_cnt = qgroup->tqg_cnt;
 	old_cpu = 0;
 	if (old_cnt < cnt)
 		old_cpu = qgroup->tqg_queue[old_cnt].tgc_cpu;
 	mtx_unlock(&qgroup->tqg_lock);
 	/*
 	 * Set up queue for tasks added before boot.
 	 */
 	if (old_cnt == 0) {
 		LIST_SWAP(&gtask_head, &qgroup->tqg_queue[0].tgc_tasks,
 		    grouptask, gt_list);
 		qgroup->tqg_queue[0].tgc_cnt = 0;
 	}
 
 	/*
 	 * If new taskq threads have been added.
 	 */
 	cpu = old_cpu;
 	for (i = old_cnt; i < cnt; i++) {
 		taskqgroup_cpu_create(qgroup, i, cpu);
 
 		for (k = 0; k < stride; k++)
 			cpu = CPU_NEXT(cpu);
 	}
 	mtx_lock(&qgroup->tqg_lock);
 	qgroup->tqg_cnt = cnt;
 	qgroup->tqg_stride = stride;
 
 	/*
 	 * Adjust drivers to use new taskqs.
 	 */
 	for (i = 0; i < old_cnt; i++) {
 		while ((gtask = LIST_FIRST(&qgroup->tqg_queue[i].tgc_tasks))) {
 			LIST_REMOVE(gtask, gt_list);
 			qgroup->tqg_queue[i].tgc_cnt--;
 			LIST_INSERT_HEAD(&gtask_head, gtask, gt_list);
 		}
 	}
 	mtx_unlock(&qgroup->tqg_lock);
 
 	while ((gtask = LIST_FIRST(&gtask_head))) {
 		LIST_REMOVE(gtask, gt_list);
 		if (gtask->gt_cpu == -1)
 			taskqgroup_attach_deferred(qgroup, gtask);
 		else if (taskqgroup_attach_cpu_deferred(qgroup, gtask))
 			taskqgroup_attach_deferred(qgroup, gtask);
 	}
 
 #ifdef INVARIANTS
 	mtx_lock(&qgroup->tqg_lock);
 	for (i = 0; i < qgroup->tqg_cnt; i++) {
 		MPASS(qgroup->tqg_queue[i].tgc_taskq != NULL);
 		LIST_FOREACH(gtask, &qgroup->tqg_queue[i].tgc_tasks, gt_list)
 			MPASS(gtask->gt_taskqueue != NULL);
 	}
 	mtx_unlock(&qgroup->tqg_lock);
 #endif
 	/*
 	 * If taskq thread count has been reduced.
 	 */
 	for (i = cnt; i < old_cnt; i++)
 		taskqgroup_cpu_remove(qgroup, i);
 
 	taskqgroup_bind(qgroup);
 
 	mtx_lock(&qgroup->tqg_lock);
 	qgroup->tqg_adjusting = 0;
 
 	return (0);
 }
 
 int
 taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
 {
 	int error;
 
 	mtx_lock(&qgroup->tqg_lock);
 	error = _taskqgroup_adjust(qgroup, cnt, stride);
 	mtx_unlock(&qgroup->tqg_lock);
 
 	return (error);
 }
 
 struct taskqgroup *
 taskqgroup_create(char *name)
 {
 	struct taskqgroup *qgroup;
 
 	qgroup = malloc(sizeof(*qgroup), M_GTASKQUEUE, M_WAITOK | M_ZERO);
 	mtx_init(&qgroup->tqg_lock, "taskqgroup", NULL, MTX_DEF);
 	qgroup->tqg_name = name;
 	LIST_INIT(&qgroup->tqg_queue[0].tgc_tasks);
 
 	return (qgroup);
 }
 
 void
 taskqgroup_destroy(struct taskqgroup *qgroup)
 {
 
 }
Index: projects/netbsd-tests-upstream-01-2017/sys/kern/uipc_mbuf.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/sys/kern/uipc_mbuf.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/sys/kern/uipc_mbuf.c	(revision 312218)
@@ -1,1869 +1,1869 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_param.h"
 #include "opt_mbuf_stress_test.h"
 #include "opt_mbuf_profiling.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/sysctl.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/uio.h>
 #include <sys/sdt.h>
 
 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
     "struct mbuf *", "mbufinfo_t *",
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "uint32_t", "uint32_t",
     "uint32_t", "uint32_t");
 
 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr,
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get,
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl,
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "uint32_t", "uint32_t",
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget,
     "struct mbuf *", "mbufinfo_t *",
     "uint32_t", "uint32_t",
     "uint32_t", "uint32_t");
 
 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget,
     "struct mbuf *", "mbufinfo_t *",
     "uint32_t", "uint32_t",
     "uint32_t", "uint32_t",
     "void*", "void*");
 
 SDT_PROBE_DEFINE(sdt, , , m__cljset);
 
 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free,
         "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem,
     "struct mbuf *", "mbufinfo_t *");
 
 #include <security/mac/mac_framework.h>
 
 int	max_linkhdr;
 int	max_protohdr;
 int	max_hdr;
 int	max_datalen;
 #ifdef MBUF_STRESS_TEST
 int	m_defragpackets;
 int	m_defragbytes;
 int	m_defraguseless;
 int	m_defragfailure;
 int	m_defragrandomfailures;
 #endif
 
 /*
  * sysctl(8) exported objects
  */
 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
 	   &max_linkhdr, 0, "Size of largest link layer header");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
 	   &max_protohdr, 0, "Size of largest protocol layer header");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD,
 	   &max_hdr, 0, "Size of largest link plus protocol header");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD,
 	   &max_datalen, 0, "Minimum space left in mbuf after max_hdr");
 #ifdef MBUF_STRESS_TEST
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
 	   &m_defragpackets, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
 	   &m_defragbytes, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
 	   &m_defraguseless, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
 	   &m_defragfailure, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
 	   &m_defragrandomfailures, 0, "");
 #endif
 
 /*
  * Ensure the correct size of various mbuf parameters.  It could be off due
  * to compiler-induced padding and alignment artifacts.
  */
 CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN);
 CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN);
 
 /*
  * mbuf data storage should be 64-bit aligned regardless of architectural
  * pointer size; check this is the case with and without a packet header.
  */
 CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0);
 CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0);
 
 /*
  * While the specific values here don't matter too much (i.e., +/- a few
  * words), we do want to ensure that changes to these values are carefully
  * reasoned about and properly documented.  This is especially the case as
  * network-protocol and device-driver modules encode these layouts, and must
  * be recompiled if the structures change.  Check these values at compile time
  * against the ones documented in comments in mbuf.h.
  *
  * NB: Possibly they should be documented there via #define's and not just
  * comments.
  */
 #if defined(__LP64__)
 CTASSERT(offsetof(struct mbuf, m_dat) == 32);
 CTASSERT(sizeof(struct pkthdr) == 56);
 CTASSERT(sizeof(struct m_ext) == 48);
 #else
 CTASSERT(offsetof(struct mbuf, m_dat) == 24);
 CTASSERT(sizeof(struct pkthdr) == 48);
 CTASSERT(sizeof(struct m_ext) == 28);
 #endif
 
 /*
  * Assert that the queue(3) macros produce code of the same size as an old
  * plain pointer does.
  */
 #ifdef INVARIANTS
-static struct mbuf m_assertbuf;
+static struct mbuf __used m_assertbuf;
 CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next));
 CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next));
 CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt));
 CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt));
 #endif
 
 /*
  * Attach the cluster from *m to *n, set up m_ext in *n
  * and bump the refcount of the cluster.
  */
 void
 mb_dupcl(struct mbuf *n, struct mbuf *m)
 {
 	volatile u_int *refcnt;
 
 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
 	KASSERT(!(n->m_flags & M_EXT), ("%s: M_EXT set on %p", __func__, n));
 
 	n->m_ext = m->m_ext;
 	n->m_flags |= M_EXT;
 	n->m_flags |= m->m_flags & M_RDONLY;
 
 	/* See if this is the mbuf that holds the embedded refcount. */
 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
 		refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count;
 		n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF;
 	} else {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		refcnt = m->m_ext.ext_cnt;
 	}
 
 	if (*refcnt == 1)
 		*refcnt += 1;
 	else
 		atomic_add_int(refcnt, 1);
 }
 
 void
 m_demote_pkthdr(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	m_tag_delete_chain(m, NULL);
 	m->m_flags &= ~M_PKTHDR;
 	bzero(&m->m_pkthdr, sizeof(struct pkthdr));
 }
 
 /*
  * Clean up mbuf (chain) from any tags and packet headers.
  * If "all" is set then the first mbuf in the chain will be
  * cleaned too.
  */
 void
 m_demote(struct mbuf *m0, int all, int flags)
 {
 	struct mbuf *m;
 
 	for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) {
 		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p",
 		    __func__, m, m0));
 		if (m->m_flags & M_PKTHDR)
 			m_demote_pkthdr(m);
 		m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags);
 	}
 }
 
 /*
  * Sanity checks on mbuf (chain) for use in KASSERT() and general
  * debugging.
  * Returns 0 or panics when bad and 1 on all tests passed.
  * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they
  * blow up later.
  */
 int
 m_sanity(struct mbuf *m0, int sanitize)
 {
 	struct mbuf *m;
 	caddr_t a, b;
 	int pktlen = 0;
 
 #ifdef INVARIANTS
 #define	M_SANITY_ACTION(s)	panic("mbuf %p: " s, m)
 #else
 #define	M_SANITY_ACTION(s)	printf("mbuf %p: " s, m)
 #endif
 
 	for (m = m0; m != NULL; m = m->m_next) {
 		/*
 		 * Basic pointer checks.  If any of these fails then some
 		 * unrelated kernel memory before or after us is trashed.
 		 * No way to recover from that.
 		 */
 		a = M_START(m);
 		b = a + M_SIZE(m);
 		if ((caddr_t)m->m_data < a)
 			M_SANITY_ACTION("m_data outside mbuf data range left");
 		if ((caddr_t)m->m_data > b)
 			M_SANITY_ACTION("m_data outside mbuf data range right");
 		if ((caddr_t)m->m_data + m->m_len > b)
 			M_SANITY_ACTION("m_data + m_len exeeds mbuf space");
 
 		/* m->m_nextpkt may only be set on first mbuf in chain. */
 		if (m != m0 && m->m_nextpkt != NULL) {
 			if (sanitize) {
 				m_freem(m->m_nextpkt);
 				m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
 			} else
 				M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
 		}
 
 		/* packet length (not mbuf length!) calculation */
 		if (m0->m_flags & M_PKTHDR)
 			pktlen += m->m_len;
 
 		/* m_tags may only be attached to first mbuf in chain. */
 		if (m != m0 && m->m_flags & M_PKTHDR &&
 		    !SLIST_EMPTY(&m->m_pkthdr.tags)) {
 			if (sanitize) {
 				m_tag_delete_chain(m, NULL);
 				/* put in 0xDEADC0DE perhaps? */
 			} else
 				M_SANITY_ACTION("m_tags on in-chain mbuf");
 		}
 
 		/* M_PKTHDR may only be set on first mbuf in chain */
 		if (m != m0 && m->m_flags & M_PKTHDR) {
 			if (sanitize) {
 				bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
 				m->m_flags &= ~M_PKTHDR;
 				/* put in 0xDEADCODE and leave hdr flag in */
 			} else
 				M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
 		}
 	}
 	m = m0;
 	if (pktlen && pktlen != m->m_pkthdr.len) {
 		if (sanitize)
 			m->m_pkthdr.len = 0;
 		else
 			M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
 	}
 	return 1;
 
 #undef	M_SANITY_ACTION
 }
 
 /*
  * Non-inlined part of m_init().
  */
 int
 m_pkthdr_init(struct mbuf *m, int how)
 {
 #ifdef MAC
 	int error;
 #endif
 	m->m_data = m->m_pktdat;
 	bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
 #ifdef MAC
 	/* If the label init fails, fail the alloc */
 	error = mac_mbuf_init(m, how);
 	if (error)
 		return (error);
 #endif
 
 	return (0);
 }
 
 /*
  * "Move" mbuf pkthdr from "from" to "to".
  * "from" must have M_PKTHDR set, and "to" must be empty.
  */
 void
 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
 {
 
 #if 0
 	/* see below for why these are not enabled */
 	M_ASSERTPKTHDR(to);
 	/* Note: with MAC, this may not be a good assertion. */
 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
 	    ("m_move_pkthdr: to has tags"));
 #endif
 #ifdef MAC
 	/*
 	 * XXXMAC: It could be this should also occur for non-MAC?
 	 */
 	if (to->m_flags & M_PKTHDR)
 		m_tag_delete_chain(to, NULL);
 #endif
 	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
 	if ((to->m_flags & M_EXT) == 0)
 		to->m_data = to->m_pktdat;
 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
 	from->m_flags &= ~M_PKTHDR;
 }
 
 /*
  * Duplicate "from"'s mbuf pkthdr in "to".
  * "from" must have M_PKTHDR set, and "to" must be empty.
  * In particular, this does a deep copy of the packet tags.
  */
 int
 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
 {
 
 #if 0
 	/*
 	 * The mbuf allocator only initializes the pkthdr
 	 * when the mbuf is allocated with m_gethdr(). Many users
 	 * (e.g. m_copy*, m_prepend) use m_get() and then
 	 * smash the pkthdr as needed causing these
 	 * assertions to trip.  For now just disable them.
 	 */
 	M_ASSERTPKTHDR(to);
 	/* Note: with MAC, this may not be a good assertion. */
 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags"));
 #endif
 	MBUF_CHECKSLEEP(how);
 #ifdef MAC
 	if (to->m_flags & M_PKTHDR)
 		m_tag_delete_chain(to, NULL);
 #endif
 	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
 	if ((to->m_flags & M_EXT) == 0)
 		to->m_data = to->m_pktdat;
 	to->m_pkthdr = from->m_pkthdr;
 	SLIST_INIT(&to->m_pkthdr.tags);
 	return (m_tag_copy_chain(to, from, how));
 }
 
 /*
  * Lesser-used path for M_PREPEND:
  * allocate new mbuf to prepend to chain,
  * copy junk along.
  */
 struct mbuf *
 m_prepend(struct mbuf *m, int len, int how)
 {
 	struct mbuf *mn;
 
 	if (m->m_flags & M_PKTHDR)
 		mn = m_gethdr(how, m->m_type);
 	else
 		mn = m_get(how, m->m_type);
 	if (mn == NULL) {
 		m_freem(m);
 		return (NULL);
 	}
 	if (m->m_flags & M_PKTHDR)
 		m_move_pkthdr(mn, m);
 	mn->m_next = m;
 	m = mn;
 	if (len < M_SIZE(m))
 		M_ALIGN(m, len);
 	m->m_len = len;
 	return (m);
 }
 
 /*
  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
  * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
  * Note that the copy is read-only, because clusters are not copied,
  * only their reference counts are incremented.
  */
 struct mbuf *
 m_copym(struct mbuf *m, int off0, int len, int wait)
 {
 	struct mbuf *n, **np;
 	int off = off0;
 	struct mbuf *top;
 	int copyhdr = 0;
 
 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
 	MBUF_CHECKSLEEP(wait);
 	if (off == 0 && m->m_flags & M_PKTHDR)
 		copyhdr = 1;
 	while (off > 0) {
 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
 		if (off < m->m_len)
 			break;
 		off -= m->m_len;
 		m = m->m_next;
 	}
 	np = &top;
 	top = NULL;
 	while (len > 0) {
 		if (m == NULL) {
 			KASSERT(len == M_COPYALL,
 			    ("m_copym, length > size of mbuf chain"));
 			break;
 		}
 		if (copyhdr)
 			n = m_gethdr(wait, m->m_type);
 		else
 			n = m_get(wait, m->m_type);
 		*np = n;
 		if (n == NULL)
 			goto nospace;
 		if (copyhdr) {
 			if (!m_dup_pkthdr(n, m, wait))
 				goto nospace;
 			if (len == M_COPYALL)
 				n->m_pkthdr.len -= off0;
 			else
 				n->m_pkthdr.len = len;
 			copyhdr = 0;
 		}
 		n->m_len = min(len, m->m_len - off);
 		if (m->m_flags & M_EXT) {
 			n->m_data = m->m_data + off;
 			mb_dupcl(n, m);
 		} else
 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
 			    (u_int)n->m_len);
 		if (len != M_COPYALL)
 			len -= n->m_len;
 		off = 0;
 		m = m->m_next;
 		np = &n->m_next;
 	}
 
 	return (top);
 nospace:
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Copy an entire packet, including header (which must be present).
  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
  * Note that the copy is read-only, because clusters are not copied,
  * only their reference counts are incremented.
  * Preserve alignment of the first mbuf so if the creator has left
  * some room at the beginning (e.g. for inserting protocol headers)
  * the copies still have the room available.
  */
 struct mbuf *
 m_copypacket(struct mbuf *m, int how)
 {
 	struct mbuf *top, *n, *o;
 
 	MBUF_CHECKSLEEP(how);
 	n = m_get(how, m->m_type);
 	top = n;
 	if (n == NULL)
 		goto nospace;
 
 	if (!m_dup_pkthdr(n, m, how))
 		goto nospace;
 	n->m_len = m->m_len;
 	if (m->m_flags & M_EXT) {
 		n->m_data = m->m_data;
 		mb_dupcl(n, m);
 	} else {
 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
 	}
 
 	m = m->m_next;
 	while (m) {
 		o = m_get(how, m->m_type);
 		if (o == NULL)
 			goto nospace;
 
 		n->m_next = o;
 		n = n->m_next;
 
 		n->m_len = m->m_len;
 		if (m->m_flags & M_EXT) {
 			n->m_data = m->m_data;
 			mb_dupcl(n, m);
 		} else {
 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
 		}
 
 		m = m->m_next;
 	}
 	return top;
 nospace:
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Copy data from an mbuf chain starting "off" bytes from the beginning,
  * continuing for "len" bytes, into the indicated buffer.
  */
 void
 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
 {
 	u_int count;
 
 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
 	while (off > 0) {
 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
 		if (off < m->m_len)
 			break;
 		off -= m->m_len;
 		m = m->m_next;
 	}
 	while (len > 0) {
 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
 		count = min(m->m_len - off, len);
 		bcopy(mtod(m, caddr_t) + off, cp, count);
 		len -= count;
 		cp += count;
 		off = 0;
 		m = m->m_next;
 	}
 }
 
 /*
  * Copy a packet header mbuf chain into a completely new chain, including
  * copying any mbuf clusters.  Use this instead of m_copypacket() when
  * you need a writable copy of an mbuf chain.
  */
 struct mbuf *
 m_dup(const struct mbuf *m, int how)
 {
 	struct mbuf **p, *top = NULL;
 	int remain, moff, nsize;
 
 	MBUF_CHECKSLEEP(how);
 	/* Sanity check */
 	if (m == NULL)
 		return (NULL);
 	M_ASSERTPKTHDR(m);
 
 	/* While there's more data, get a new mbuf, tack it on, and fill it */
 	remain = m->m_pkthdr.len;
 	moff = 0;
 	p = &top;
 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
 		struct mbuf *n;
 
 		/* Get the next new mbuf */
 		if (remain >= MINCLSIZE) {
 			n = m_getcl(how, m->m_type, 0);
 			nsize = MCLBYTES;
 		} else {
 			n = m_get(how, m->m_type);
 			nsize = MLEN;
 		}
 		if (n == NULL)
 			goto nospace;
 
 		if (top == NULL) {		/* First one, must be PKTHDR */
 			if (!m_dup_pkthdr(n, m, how)) {
 				m_free(n);
 				goto nospace;
 			}
 			if ((n->m_flags & M_EXT) == 0)
 				nsize = MHLEN;
 			n->m_flags &= ~M_RDONLY;
 		}
 		n->m_len = 0;
 
 		/* Link it into the new chain */
 		*p = n;
 		p = &n->m_next;
 
 		/* Copy data from original mbuf(s) into new mbuf */
 		while (n->m_len < nsize && m != NULL) {
 			int chunk = min(nsize - n->m_len, m->m_len - moff);
 
 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
 			moff += chunk;
 			n->m_len += chunk;
 			remain -= chunk;
 			if (moff == m->m_len) {
 				m = m->m_next;
 				moff = 0;
 			}
 		}
 
 		/* Check correct total mbuf length */
 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
 		    	("%s: bogus m_pkthdr.len", __func__));
 	}
 	return (top);
 
 nospace:
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Concatenate mbuf chain n to m.
  * Both chains must be of the same type (e.g. MT_DATA).
  * Any m_pkthdr is not updated.
  */
 void
 m_cat(struct mbuf *m, struct mbuf *n)
 {
 	while (m->m_next)
 		m = m->m_next;
 	while (n) {
 		if (!M_WRITABLE(m) ||
 		    M_TRAILINGSPACE(m) < n->m_len) {
 			/* just join the two chains */
 			m->m_next = n;
 			return;
 		}
 		/* splat the data from one into the other */
 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
 		    (u_int)n->m_len);
 		m->m_len += n->m_len;
 		n = m_free(n);
 	}
 }
 
 /*
  * Concatenate two pkthdr mbuf chains.
  */
 void
 m_catpkt(struct mbuf *m, struct mbuf *n)
 {
 
 	M_ASSERTPKTHDR(m);
 	M_ASSERTPKTHDR(n);
 
 	m->m_pkthdr.len += n->m_pkthdr.len;
 	m_demote(n, 1, 0);
 
 	m_cat(m, n);
 }
 
 void
 m_adj(struct mbuf *mp, int req_len)
 {
 	int len = req_len;
 	struct mbuf *m;
 	int count;
 
 	if ((m = mp) == NULL)
 		return;
 	if (len >= 0) {
 		/*
 		 * Trim from head.
 		 */
 		while (m != NULL && len > 0) {
 			if (m->m_len <= len) {
 				len -= m->m_len;
 				m->m_len = 0;
 				m = m->m_next;
 			} else {
 				m->m_len -= len;
 				m->m_data += len;
 				len = 0;
 			}
 		}
 		if (mp->m_flags & M_PKTHDR)
 			mp->m_pkthdr.len -= (req_len - len);
 	} else {
 		/*
 		 * Trim from tail.  Scan the mbuf chain,
 		 * calculating its length and finding the last mbuf.
 		 * If the adjustment only affects this mbuf, then just
 		 * adjust and return.  Otherwise, rescan and truncate
 		 * after the remaining size.
 		 */
 		len = -len;
 		count = 0;
 		for (;;) {
 			count += m->m_len;
 			if (m->m_next == (struct mbuf *)0)
 				break;
 			m = m->m_next;
 		}
 		if (m->m_len >= len) {
 			m->m_len -= len;
 			if (mp->m_flags & M_PKTHDR)
 				mp->m_pkthdr.len -= len;
 			return;
 		}
 		count -= len;
 		if (count < 0)
 			count = 0;
 		/*
 		 * Correct length for chain is "count".
 		 * Find the mbuf with last data, adjust its length,
 		 * and toss data from remaining mbufs on chain.
 		 */
 		m = mp;
 		if (m->m_flags & M_PKTHDR)
 			m->m_pkthdr.len = count;
 		for (; m; m = m->m_next) {
 			if (m->m_len >= count) {
 				m->m_len = count;
 				if (m->m_next != NULL) {
 					m_freem(m->m_next);
 					m->m_next = NULL;
 				}
 				break;
 			}
 			count -= m->m_len;
 		}
 	}
 }
 
 /*
  * Rearange an mbuf chain so that len bytes are contiguous
  * and in the data area of an mbuf (so that mtod will work
  * for a structure of size len).  Returns the resulting
  * mbuf chain on success, frees it and returns null on failure.
  * If there is room, it will add up to max_protohdr-len extra bytes to the
  * contiguous region in an attempt to avoid being called next time.
  */
 struct mbuf *
 m_pullup(struct mbuf *n, int len)
 {
 	struct mbuf *m;
 	int count;
 	int space;
 
 	/*
 	 * If first mbuf has no cluster, and has room for len bytes
 	 * without shifting current data, pullup into it,
 	 * otherwise allocate a new mbuf to prepend to the chain.
 	 */
 	if ((n->m_flags & M_EXT) == 0 &&
 	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
 		if (n->m_len >= len)
 			return (n);
 		m = n;
 		n = n->m_next;
 		len -= m->m_len;
 	} else {
 		if (len > MHLEN)
 			goto bad;
 		m = m_get(M_NOWAIT, n->m_type);
 		if (m == NULL)
 			goto bad;
 		if (n->m_flags & M_PKTHDR)
 			m_move_pkthdr(m, n);
 	}
 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
 	do {
 		count = min(min(max(len, max_protohdr), space), n->m_len);
 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
 		  (u_int)count);
 		len -= count;
 		m->m_len += count;
 		n->m_len -= count;
 		space -= count;
 		if (n->m_len)
 			n->m_data += count;
 		else
 			n = m_free(n);
 	} while (len > 0 && n);
 	if (len > 0) {
 		(void) m_free(m);
 		goto bad;
 	}
 	m->m_next = n;
 	return (m);
 bad:
 	m_freem(n);
 	return (NULL);
 }
 
 /*
  * Like m_pullup(), except a new mbuf is always allocated, and we allow
  * the amount of empty space before the data in the new mbuf to be specified
  * (in the event that the caller expects to prepend later).
  */
 struct mbuf *
 m_copyup(struct mbuf *n, int len, int dstoff)
 {
 	struct mbuf *m;
 	int count, space;
 
 	if (len > (MHLEN - dstoff))
 		goto bad;
 	m = m_get(M_NOWAIT, n->m_type);
 	if (m == NULL)
 		goto bad;
 	if (n->m_flags & M_PKTHDR)
 		m_move_pkthdr(m, n);
 	m->m_data += dstoff;
 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
 	do {
 		count = min(min(max(len, max_protohdr), space), n->m_len);
 		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
 		    (unsigned)count);
 		len -= count;
 		m->m_len += count;
 		n->m_len -= count;
 		space -= count;
 		if (n->m_len)
 			n->m_data += count;
 		else
 			n = m_free(n);
 	} while (len > 0 && n);
 	if (len > 0) {
 		(void) m_free(m);
 		goto bad;
 	}
 	m->m_next = n;
 	return (m);
  bad:
 	m_freem(n);
 	return (NULL);
 }
 
 /*
  * Partition an mbuf chain in two pieces, returning the tail --
  * all but the first len0 bytes.  In case of failure, it returns NULL and
  * attempts to restore the chain to its original state.
  *
  * Note that the resulting mbufs might be read-only, because the new
  * mbuf can end up sharing an mbuf cluster with the original mbuf if
  * the "breaking point" happens to lie within a cluster mbuf. Use the
  * M_WRITABLE() macro to check for this case.
  */
 struct mbuf *
 m_split(struct mbuf *m0, int len0, int wait)
 {
 	struct mbuf *m, *n;
 	u_int len = len0, remain;
 
 	MBUF_CHECKSLEEP(wait);
 	for (m = m0; m && len > m->m_len; m = m->m_next)
 		len -= m->m_len;
 	if (m == NULL)
 		return (NULL);
 	remain = m->m_len - len;
 	if (m0->m_flags & M_PKTHDR && remain == 0) {
 		n = m_gethdr(wait, m0->m_type);
 		if (n == NULL)
 			return (NULL);
 		n->m_next = m->m_next;
 		m->m_next = NULL;
 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
 		m0->m_pkthdr.len = len0;
 		return (n);
 	} else if (m0->m_flags & M_PKTHDR) {
 		n = m_gethdr(wait, m0->m_type);
 		if (n == NULL)
 			return (NULL);
 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
 		m0->m_pkthdr.len = len0;
 		if (m->m_flags & M_EXT)
 			goto extpacket;
 		if (remain > MHLEN) {
 			/* m can't be the lead packet */
 			M_ALIGN(n, 0);
 			n->m_next = m_split(m, len, wait);
 			if (n->m_next == NULL) {
 				(void) m_free(n);
 				return (NULL);
 			} else {
 				n->m_len = 0;
 				return (n);
 			}
 		} else
 			M_ALIGN(n, remain);
 	} else if (remain == 0) {
 		n = m->m_next;
 		m->m_next = NULL;
 		return (n);
 	} else {
 		n = m_get(wait, m->m_type);
 		if (n == NULL)
 			return (NULL);
 		M_ALIGN(n, remain);
 	}
 extpacket:
 	if (m->m_flags & M_EXT) {
 		n->m_data = m->m_data + len;
 		mb_dupcl(n, m);
 	} else {
 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
 	}
 	n->m_len = remain;
 	m->m_len = len;
 	n->m_next = m->m_next;
 	m->m_next = NULL;
 	return (n);
 }
 /*
  * Routine to copy from device local memory into mbufs.
  * Note that `off' argument is offset into first mbuf of target chain from
  * which to begin copying the data to.
  */
 struct mbuf *
 m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
     void (*copy)(char *from, caddr_t to, u_int len))
 {
 	struct mbuf *m;
 	struct mbuf *top = NULL, **mp = &top;
 	int len;
 
 	if (off < 0 || off > MHLEN)
 		return (NULL);
 
 	while (totlen > 0) {
 		if (top == NULL) {	/* First one, must be PKTHDR */
 			if (totlen + off >= MINCLSIZE) {
 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 				len = MCLBYTES;
 			} else {
 				m = m_gethdr(M_NOWAIT, MT_DATA);
 				len = MHLEN;
 
 				/* Place initial small packet/header at end of mbuf */
 				if (m && totlen + off + max_linkhdr <= MHLEN) {
 					m->m_data += max_linkhdr;
 					len -= max_linkhdr;
 				}
 			}
 			if (m == NULL)
 				return NULL;
 			m->m_pkthdr.rcvif = ifp;
 			m->m_pkthdr.len = totlen;
 		} else {
 			if (totlen + off >= MINCLSIZE) {
 				m = m_getcl(M_NOWAIT, MT_DATA, 0);
 				len = MCLBYTES;
 			} else {
 				m = m_get(M_NOWAIT, MT_DATA);
 				len = MLEN;
 			}
 			if (m == NULL) {
 				m_freem(top);
 				return NULL;
 			}
 		}
 		if (off) {
 			m->m_data += off;
 			len -= off;
 			off = 0;
 		}
 		m->m_len = len = min(totlen, len);
 		if (copy)
 			copy(buf, mtod(m, caddr_t), (u_int)len);
 		else
 			bcopy(buf, mtod(m, caddr_t), (u_int)len);
 		buf += len;
 		*mp = m;
 		mp = &m->m_next;
 		totlen -= len;
 	}
 	return (top);
 }
 
 /*
  * Copy data from a buffer back into the indicated mbuf chain,
  * starting "off" bytes from the beginning, extending the mbuf
  * chain if necessary.
  */
 void
 m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
 {
 	int mlen;
 	struct mbuf *m = m0, *n;
 	int totlen = 0;
 
 	if (m0 == NULL)
 		return;
 	while (off > (mlen = m->m_len)) {
 		off -= mlen;
 		totlen += mlen;
 		if (m->m_next == NULL) {
 			n = m_get(M_NOWAIT, m->m_type);
 			if (n == NULL)
 				goto out;
 			bzero(mtod(n, caddr_t), MLEN);
 			n->m_len = min(MLEN, len + off);
 			m->m_next = n;
 		}
 		m = m->m_next;
 	}
 	while (len > 0) {
 		if (m->m_next == NULL && (len > m->m_len - off)) {
 			m->m_len += min(len - (m->m_len - off),
 			    M_TRAILINGSPACE(m));
 		}
 		mlen = min (m->m_len - off, len);
 		bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
 		cp += mlen;
 		len -= mlen;
 		mlen += off;
 		off = 0;
 		totlen += mlen;
 		if (len == 0)
 			break;
 		if (m->m_next == NULL) {
 			n = m_get(M_NOWAIT, m->m_type);
 			if (n == NULL)
 				break;
 			n->m_len = min(MLEN, len);
 			m->m_next = n;
 		}
 		m = m->m_next;
 	}
 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
 		m->m_pkthdr.len = totlen;
 }
 
 /*
  * Append the specified data to the indicated mbuf chain,
  * Extend the mbuf chain if the new data does not fit in
  * existing space.
  *
  * Return 1 if able to complete the job; otherwise 0.
  */
 int
 m_append(struct mbuf *m0, int len, c_caddr_t cp)
 {
 	struct mbuf *m, *n;
 	int remainder, space;
 
 	for (m = m0; m->m_next != NULL; m = m->m_next)
 		;
 	remainder = len;
 	space = M_TRAILINGSPACE(m);
 	if (space > 0) {
 		/*
 		 * Copy into available space.
 		 */
 		if (space > remainder)
 			space = remainder;
 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
 		m->m_len += space;
 		cp += space, remainder -= space;
 	}
 	while (remainder > 0) {
 		/*
 		 * Allocate a new mbuf; could check space
 		 * and allocate a cluster instead.
 		 */
 		n = m_get(M_NOWAIT, m->m_type);
 		if (n == NULL)
 			break;
 		n->m_len = min(MLEN, remainder);
 		bcopy(cp, mtod(n, caddr_t), n->m_len);
 		cp += n->m_len, remainder -= n->m_len;
 		m->m_next = n;
 		m = n;
 	}
 	if (m0->m_flags & M_PKTHDR)
 		m0->m_pkthdr.len += len - remainder;
 	return (remainder == 0);
 }
 
 /*
  * Apply function f to the data in an mbuf chain starting "off" bytes from
  * the beginning, continuing for "len" bytes.
  */
 int
 m_apply(struct mbuf *m, int off, int len,
     int (*f)(void *, void *, u_int), void *arg)
 {
 	u_int count;
 	int rval;
 
 	KASSERT(off >= 0, ("m_apply, negative off %d", off));
 	KASSERT(len >= 0, ("m_apply, negative len %d", len));
 	while (off > 0) {
 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
 		if (off < m->m_len)
 			break;
 		off -= m->m_len;
 		m = m->m_next;
 	}
 	while (len > 0) {
 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
 		count = min(m->m_len - off, len);
 		rval = (*f)(arg, mtod(m, caddr_t) + off, count);
 		if (rval)
 			return (rval);
 		len -= count;
 		off = 0;
 		m = m->m_next;
 	}
 	return (0);
 }
 
 /*
  * Return a pointer to mbuf/offset of location in mbuf chain.
  */
 struct mbuf *
 m_getptr(struct mbuf *m, int loc, int *off)
 {
 
 	while (loc >= 0) {
 		/* Normal end of search. */
 		if (m->m_len > loc) {
 			*off = loc;
 			return (m);
 		} else {
 			loc -= m->m_len;
 			if (m->m_next == NULL) {
 				if (loc == 0) {
 					/* Point at the end of valid data. */
 					*off = m->m_len;
 					return (m);
 				}
 				return (NULL);
 			}
 			m = m->m_next;
 		}
 	}
 	return (NULL);
 }
 
 void
 m_print(const struct mbuf *m, int maxlen)
 {
 	int len;
 	int pdata;
 	const struct mbuf *m2;
 
 	if (m == NULL) {
 		printf("mbuf: %p\n", m);
 		return;
 	}
 
 	if (m->m_flags & M_PKTHDR)
 		len = m->m_pkthdr.len;
 	else
 		len = -1;
 	m2 = m;
 	while (m2 != NULL && (len == -1 || len)) {
 		pdata = m2->m_len;
 		if (maxlen != -1 && pdata > maxlen)
 			pdata = maxlen;
 		printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len,
 		    m2->m_next, m2->m_flags, "\20\20freelist\17skipfw"
 		    "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
 		    "\3eor\2pkthdr\1ext", pdata ? "" : "\n");
 		if (pdata)
 			printf(", %*D\n", pdata, (u_char *)m2->m_data, "-");
 		if (len != -1)
 			len -= m2->m_len;
 		m2 = m2->m_next;
 	}
 	if (len > 0)
 		printf("%d bytes unaccounted for.\n", len);
 	return;
 }
 
 u_int
 m_fixhdr(struct mbuf *m0)
 {
 	u_int len;
 
 	len = m_length(m0, NULL);
 	m0->m_pkthdr.len = len;
 	return (len);
 }
 
 u_int
 m_length(struct mbuf *m0, struct mbuf **last)
 {
 	struct mbuf *m;
 	u_int len;
 
 	len = 0;
 	for (m = m0; m != NULL; m = m->m_next) {
 		len += m->m_len;
 		if (m->m_next == NULL)
 			break;
 	}
 	if (last != NULL)
 		*last = m;
 	return (len);
 }
 
 /*
  * Defragment a mbuf chain, returning the shortest possible
  * chain of mbufs and clusters.  If allocation fails and
  * this cannot be completed, NULL will be returned, but
  * the passed in chain will be unchanged.  Upon success,
  * the original chain will be freed, and the new chain
  * will be returned.
  *
  * If a non-packet header is passed in, the original
  * mbuf (chain?) will be returned unharmed.
  */
 struct mbuf *
 m_defrag(struct mbuf *m0, int how)
 {
 	struct mbuf *m_new = NULL, *m_final = NULL;
 	int progress = 0, length;
 
 	MBUF_CHECKSLEEP(how);
 	if (!(m0->m_flags & M_PKTHDR))
 		return (m0);
 
 	m_fixhdr(m0); /* Needed sanity check */
 
 #ifdef MBUF_STRESS_TEST
 	if (m_defragrandomfailures) {
 		int temp = arc4random() & 0xff;
 		if (temp == 0xba)
 			goto nospace;
 	}
 #endif
 
 	if (m0->m_pkthdr.len > MHLEN)
 		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
 	else
 		m_final = m_gethdr(how, MT_DATA);
 
 	if (m_final == NULL)
 		goto nospace;
 
 	if (m_dup_pkthdr(m_final, m0, how) == 0)
 		goto nospace;
 
 	m_new = m_final;
 
 	while (progress < m0->m_pkthdr.len) {
 		length = m0->m_pkthdr.len - progress;
 		if (length > MCLBYTES)
 			length = MCLBYTES;
 
 		if (m_new == NULL) {
 			if (length > MLEN)
 				m_new = m_getcl(how, MT_DATA, 0);
 			else
 				m_new = m_get(how, MT_DATA);
 			if (m_new == NULL)
 				goto nospace;
 		}
 
 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
 		progress += length;
 		m_new->m_len = length;
 		if (m_new != m_final)
 			m_cat(m_final, m_new);
 		m_new = NULL;
 	}
 #ifdef MBUF_STRESS_TEST
 	if (m0->m_next == NULL)
 		m_defraguseless++;
 #endif
 	m_freem(m0);
 	m0 = m_final;
 #ifdef MBUF_STRESS_TEST
 	m_defragpackets++;
 	m_defragbytes += m0->m_pkthdr.len;
 #endif
 	return (m0);
 nospace:
 #ifdef MBUF_STRESS_TEST
 	m_defragfailure++;
 #endif
 	if (m_final)
 		m_freem(m_final);
 	return (NULL);
 }
 
 /*
  * Defragment an mbuf chain, returning at most maxfrags separate
  * mbufs+clusters.  If this is not possible NULL is returned and
  * the original mbuf chain is left in its present (potentially
  * modified) state.  We use two techniques: collapsing consecutive
  * mbufs and replacing consecutive mbufs by a cluster.
  *
  * NB: this should really be named m_defrag but that name is taken
  */
 struct mbuf *
 m_collapse(struct mbuf *m0, int how, int maxfrags)
 {
 	struct mbuf *m, *n, *n2, **prev;
 	u_int curfrags;
 
 	/*
 	 * Calculate the current number of frags.
 	 */
 	curfrags = 0;
 	for (m = m0; m != NULL; m = m->m_next)
 		curfrags++;
 	/*
 	 * First, try to collapse mbufs.  Note that we always collapse
 	 * towards the front so we don't need to deal with moving the
 	 * pkthdr.  This may be suboptimal if the first mbuf has much
 	 * less data than the following.
 	 */
 	m = m0;
 again:
 	for (;;) {
 		n = m->m_next;
 		if (n == NULL)
 			break;
 		if (M_WRITABLE(m) &&
 		    n->m_len < M_TRAILINGSPACE(m)) {
 			bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
 				n->m_len);
 			m->m_len += n->m_len;
 			m->m_next = n->m_next;
 			m_free(n);
 			if (--curfrags <= maxfrags)
 				return m0;
 		} else
 			m = n;
 	}
 	KASSERT(maxfrags > 1,
 		("maxfrags %u, but normal collapse failed", maxfrags));
 	/*
 	 * Collapse consecutive mbufs to a cluster.
 	 */
 	prev = &m0->m_next;		/* NB: not the first mbuf */
 	while ((n = *prev) != NULL) {
 		if ((n2 = n->m_next) != NULL &&
 		    n->m_len + n2->m_len < MCLBYTES) {
 			m = m_getcl(how, MT_DATA, 0);
 			if (m == NULL)
 				goto bad;
 			bcopy(mtod(n, void *), mtod(m, void *), n->m_len);
 			bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len,
 				n2->m_len);
 			m->m_len = n->m_len + n2->m_len;
 			m->m_next = n2->m_next;
 			*prev = m;
 			m_free(n);
 			m_free(n2);
 			if (--curfrags <= maxfrags)	/* +1 cl -2 mbufs */
 				return m0;
 			/*
 			 * Still not there, try the normal collapse
 			 * again before we allocate another cluster.
 			 */
 			goto again;
 		}
 		prev = &n->m_next;
 	}
 	/*
 	 * No place where we can collapse to a cluster; punt.
 	 * This can occur if, for example, you request 2 frags
 	 * but the packet requires that both be clusters (we
 	 * never reallocate the first mbuf to avoid moving the
 	 * packet header).
 	 */
 bad:
 	return NULL;
 }
 
 #ifdef MBUF_STRESS_TEST
 
 /*
  * Fragment an mbuf chain.  There's no reason you'd ever want to do
  * this in normal usage, but it's great for stress testing various
  * mbuf consumers.
  *
  * If fragmentation is not possible, the original chain will be
  * returned.
  *
  * Possible length values:
  * 0	 no fragmentation will occur
  * > 0	each fragment will be of the specified length
  * -1	each fragment will be the same random value in length
  * -2	each fragment's length will be entirely random
  * (Random values range from 1 to 256)
  */
 struct mbuf *
 m_fragment(struct mbuf *m0, int how, int length)
 {
 	struct mbuf *m_new = NULL, *m_final = NULL;
 	int progress = 0;
 
 	if (!(m0->m_flags & M_PKTHDR))
 		return (m0);
 
 	if ((length == 0) || (length < -2))
 		return (m0);
 
 	m_fixhdr(m0); /* Needed sanity check */
 
 	m_final = m_getcl(how, MT_DATA, M_PKTHDR);
 
 	if (m_final == NULL)
 		goto nospace;
 
 	if (m_dup_pkthdr(m_final, m0, how) == 0)
 		goto nospace;
 
 	m_new = m_final;
 
 	if (length == -1)
 		length = 1 + (arc4random() & 255);
 
 	while (progress < m0->m_pkthdr.len) {
 		int fraglen;
 
 		if (length > 0)
 			fraglen = length;
 		else
 			fraglen = 1 + (arc4random() & 255);
 		if (fraglen > m0->m_pkthdr.len - progress)
 			fraglen = m0->m_pkthdr.len - progress;
 
 		if (fraglen > MCLBYTES)
 			fraglen = MCLBYTES;
 
 		if (m_new == NULL) {
 			m_new = m_getcl(how, MT_DATA, 0);
 			if (m_new == NULL)
 				goto nospace;
 		}
 
 		m_copydata(m0, progress, fraglen, mtod(m_new, caddr_t));
 		progress += fraglen;
 		m_new->m_len = fraglen;
 		if (m_new != m_final)
 			m_cat(m_final, m_new);
 		m_new = NULL;
 	}
 	m_freem(m0);
 	m0 = m_final;
 	return (m0);
 nospace:
 	if (m_final)
 		m_freem(m_final);
 	/* Return the original chain on failure */
 	return (m0);
 }
 
 #endif
 
 /*
  * Copy the contents of uio into a properly sized mbuf chain.
  */
 struct mbuf *
 m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
 {
 	struct mbuf *m, *mb;
 	int error, length;
 	ssize_t total;
 	int progress = 0;
 
 	/*
 	 * len can be zero or an arbitrary large value bound by
 	 * the total data supplied by the uio.
 	 */
 	if (len > 0)
 		total = min(uio->uio_resid, len);
 	else
 		total = uio->uio_resid;
 
 	/*
 	 * The smallest unit returned by m_getm2() is a single mbuf
 	 * with pkthdr.  We can't align past it.
 	 */
 	if (align >= MHLEN)
 		return (NULL);
 
 	/*
 	 * Give us the full allocation or nothing.
 	 * If len is zero return the smallest empty mbuf.
 	 */
 	m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags);
 	if (m == NULL)
 		return (NULL);
 	m->m_data += align;
 
 	/* Fill all mbufs with uio data and update header information. */
 	for (mb = m; mb != NULL; mb = mb->m_next) {
 		length = min(M_TRAILINGSPACE(mb), total - progress);
 
 		error = uiomove(mtod(mb, void *), length, uio);
 		if (error) {
 			m_freem(m);
 			return (NULL);
 		}
 
 		mb->m_len = length;
 		progress += length;
 		if (flags & M_PKTHDR)
 			m->m_pkthdr.len += length;
 	}
 	KASSERT(progress == total, ("%s: progress != total", __func__));
 
 	return (m);
 }
 
 /*
  * Copy an mbuf chain into a uio limited by len if set.
  */
 int
 m_mbuftouio(struct uio *uio, struct mbuf *m, int len)
 {
 	int error, length, total;
 	int progress = 0;
 
 	if (len > 0)
 		total = min(uio->uio_resid, len);
 	else
 		total = uio->uio_resid;
 
 	/* Fill the uio with data from the mbufs. */
 	for (; m != NULL; m = m->m_next) {
 		length = min(m->m_len, total - progress);
 
 		error = uiomove(mtod(m, void *), length, uio);
 		if (error)
 			return (error);
 
 		progress += length;
 	}
 
 	return (0);
 }
 
 /*
  * Create a writable copy of the mbuf chain.  While doing this
  * we compact the chain with a goal of producing a chain with
  * at most two mbufs.  The second mbuf in this chain is likely
  * to be a cluster.  The primary purpose of this work is to create
  * a writable packet for encryption, compression, etc.  The
  * secondary goal is to linearize the data so the data can be
  * passed to crypto hardware in the most efficient manner possible.
  */
 struct mbuf *
 m_unshare(struct mbuf *m0, int how)
 {
 	struct mbuf *m, *mprev;
 	struct mbuf *n, *mfirst, *mlast;
 	int len, off;
 
 	mprev = NULL;
 	for (m = m0; m != NULL; m = mprev->m_next) {
 		/*
 		 * Regular mbufs are ignored unless there's a cluster
 		 * in front of it that we can use to coalesce.  We do
 		 * the latter mainly so later clusters can be coalesced
 		 * also w/o having to handle them specially (i.e. convert
 		 * mbuf+cluster -> cluster).  This optimization is heavily
 		 * influenced by the assumption that we're running over
 		 * Ethernet where MCLBYTES is large enough that the max
 		 * packet size will permit lots of coalescing into a
 		 * single cluster.  This in turn permits efficient
 		 * crypto operations, especially when using hardware.
 		 */
 		if ((m->m_flags & M_EXT) == 0) {
 			if (mprev && (mprev->m_flags & M_EXT) &&
 			    m->m_len <= M_TRAILINGSPACE(mprev)) {
 				/* XXX: this ignores mbuf types */
 				memcpy(mtod(mprev, caddr_t) + mprev->m_len,
 				    mtod(m, caddr_t), m->m_len);
 				mprev->m_len += m->m_len;
 				mprev->m_next = m->m_next;	/* unlink from chain */
 				m_free(m);			/* reclaim mbuf */
 #if 0
 				newipsecstat.ips_mbcoalesced++;
 #endif
 			} else {
 				mprev = m;
 			}
 			continue;
 		}
 		/*
 		 * Writable mbufs are left alone (for now).
 		 */
 		if (M_WRITABLE(m)) {
 			mprev = m;
 			continue;
 		}
 
 		/*
 		 * Not writable, replace with a copy or coalesce with
 		 * the previous mbuf if possible (since we have to copy
 		 * it anyway, we try to reduce the number of mbufs and
 		 * clusters so that future work is easier).
 		 */
 		KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
 		/* NB: we only coalesce into a cluster or larger */
 		if (mprev != NULL && (mprev->m_flags & M_EXT) &&
 		    m->m_len <= M_TRAILINGSPACE(mprev)) {
 			/* XXX: this ignores mbuf types */
 			memcpy(mtod(mprev, caddr_t) + mprev->m_len,
 			    mtod(m, caddr_t), m->m_len);
 			mprev->m_len += m->m_len;
 			mprev->m_next = m->m_next;	/* unlink from chain */
 			m_free(m);			/* reclaim mbuf */
 #if 0
 			newipsecstat.ips_clcoalesced++;
 #endif
 			continue;
 		}
 
 		/*
 		 * Allocate new space to hold the copy and copy the data.
 		 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
 		 * splitting them into clusters.  We could just malloc a
 		 * buffer and make it external but too many device drivers
 		 * don't know how to break up the non-contiguous memory when
 		 * doing DMA.
 		 */
 		n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
 		if (n == NULL) {
 			m_freem(m0);
 			return (NULL);
 		}
 		if (m->m_flags & M_PKTHDR) {
 			KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR",
 			    __func__, m0, m));
 			m_move_pkthdr(n, m);
 		}
 		len = m->m_len;
 		off = 0;
 		mfirst = n;
 		mlast = NULL;
 		for (;;) {
 			int cc = min(len, MCLBYTES);
 			memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
 			n->m_len = cc;
 			if (mlast != NULL)
 				mlast->m_next = n;
 			mlast = n;
 #if 0
 			newipsecstat.ips_clcopied++;
 #endif
 
 			len -= cc;
 			if (len <= 0)
 				break;
 			off += cc;
 
 			n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
 			if (n == NULL) {
 				m_freem(mfirst);
 				m_freem(m0);
 				return (NULL);
 			}
 		}
 		n->m_next = m->m_next;
 		if (mprev == NULL)
 			m0 = mfirst;		/* new head of chain */
 		else
 			mprev->m_next = mfirst;	/* replace old mbuf */
 		m_free(m);			/* release old mbuf */
 		mprev = mfirst;
 	}
 	return (m0);
 }
 
 #ifdef MBUF_PROFILING
 
 #define MP_BUCKETS 32 /* don't just change this as things may overflow.*/
 struct mbufprofile {
 	uintmax_t wasted[MP_BUCKETS];
 	uintmax_t used[MP_BUCKETS];
 	uintmax_t segments[MP_BUCKETS];
 } mbprof;
 
 #define MP_MAXDIGITS 21	/* strlen("16,000,000,000,000,000,000") == 21 */
 #define MP_NUMLINES 6
 #define MP_NUMSPERLINE 16
 #define MP_EXTRABYTES 64	/* > strlen("used:\nwasted:\nsegments:\n") */
 /* work out max space needed and add a bit of spare space too */
 #define MP_MAXLINE ((MP_MAXDIGITS+1) * MP_NUMSPERLINE)
 #define MP_BUFSIZE ((MP_MAXLINE * MP_NUMLINES) + 1 + MP_EXTRABYTES)
 
 char mbprofbuf[MP_BUFSIZE];
 
 void
 m_profile(struct mbuf *m)
 {
 	int segments = 0;
 	int used = 0;
 	int wasted = 0;
 
 	while (m) {
 		segments++;
 		used += m->m_len;
 		if (m->m_flags & M_EXT) {
 			wasted += MHLEN - sizeof(m->m_ext) +
 			    m->m_ext.ext_size - m->m_len;
 		} else {
 			if (m->m_flags & M_PKTHDR)
 				wasted += MHLEN - m->m_len;
 			else
 				wasted += MLEN - m->m_len;
 		}
 		m = m->m_next;
 	}
 	/* be paranoid.. it helps */
 	if (segments > MP_BUCKETS - 1)
 		segments = MP_BUCKETS - 1;
 	if (used > 100000)
 		used = 100000;
 	if (wasted > 100000)
 		wasted = 100000;
 	/* store in the appropriate bucket */
 	/* don't bother locking. if it's slightly off, so what? */
 	mbprof.segments[segments]++;
 	mbprof.used[fls(used)]++;
 	mbprof.wasted[fls(wasted)]++;
 }
 
 static void
 mbprof_textify(void)
 {
 	int offset;
 	char *c;
 	uint64_t *p;
 
 	p = &mbprof.wasted[0];
 	c = mbprofbuf;
 	offset = snprintf(c, MP_MAXLINE + 10,
 	    "wasted:\n"
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #ifdef BIG_ARRAY
 	p = &mbprof.wasted[16];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE,
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #endif
 	p = &mbprof.used[0];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE + 10,
 	    "used:\n"
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #ifdef BIG_ARRAY
 	p = &mbprof.used[16];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE,
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #endif
 	p = &mbprof.segments[0];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE + 10,
 	    "segments:\n"
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #ifdef BIG_ARRAY
 	p = &mbprof.segments[16];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE,
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %jju",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #endif
 }
 
 static int
 mbprof_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 
 	mbprof_textify();
 	error = SYSCTL_OUT(req, mbprofbuf, strlen(mbprofbuf) + 1);
 	return (error);
 }
 
 static int
 mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
 {
 	int clear, error;
 
 	clear = 0;
 	error = sysctl_handle_int(oidp, &clear, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	if (clear) {
 		bzero(&mbprof, sizeof(mbprof));
 	}
 
 	return (error);
 }
 
 
 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, CTLTYPE_STRING|CTLFLAG_RD,
 	    NULL, 0, mbprof_handler, "A", "mbuf profiling statistics");
 
 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, CTLTYPE_INT|CTLFLAG_RW,
 	    NULL, 0, mbprof_clr_handler, "I", "clear mbuf profiling statistics");
 #endif
 
Index: projects/netbsd-tests-upstream-01-2017/sys/mips/atheros/ar71xxreg.h
===================================================================
--- projects/netbsd-tests-upstream-01-2017/sys/mips/atheros/ar71xxreg.h	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/sys/mips/atheros/ar71xxreg.h	(revision 312218)
@@ -1,569 +1,573 @@
 /*-
  * Copyright (c) 2009 Oleksandr Tymoshenko
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /* $FreeBSD$ */
 
 #ifndef _AR71XX_REG_H_
 #define _AR71XX_REG_H_
 
 /* PCI region */
 #define AR71XX_PCI_MEM_BASE		0x10000000
 /* 
  * PCI mem windows is 0x08000000 bytes long but we exclude control 
  * region from the resource manager
  */
 #define AR71XX_PCI_MEM_SIZE		0x07000000
 #define AR71XX_PCI_IRQ_START		0
 #define AR71XX_PCI_IRQ_END		2
 #define AR71XX_PCI_NIRQS		3
 /*
  * PCI devices slots are starting from this number
  */
 #define	AR71XX_PCI_BASE_SLOT		17
 
 /* PCI config registers */
 #define	AR71XX_PCI_LCONF_CMD		0x17010000
 #define			PCI_LCONF_CMD_READ	0x00000000
 #define			PCI_LCONF_CMD_WRITE	0x00010000
 #define	AR71XX_PCI_LCONF_WRITE_DATA	0x17010004
 #define	AR71XX_PCI_LCONF_READ_DATA	0x17010008
 #define	AR71XX_PCI_CONF_ADDR		0x1701000C
 #define	AR71XX_PCI_CONF_CMD		0x17010010
 #define			PCI_CONF_CMD_READ	0x0000000A
 #define			PCI_CONF_CMD_WRITE	0x0000000B
 #define	AR71XX_PCI_CONF_WRITE_DATA	0x17010014
 #define	AR71XX_PCI_CONF_READ_DATA	0x17010018
 #define	AR71XX_PCI_ERROR		0x1701001C
 #define	AR71XX_PCI_ERROR_ADDR		0x17010020
 #define	AR71XX_PCI_AHB_ERROR		0x17010024
 #define	AR71XX_PCI_AHB_ERROR_ADDR	0x17010028
 
 /* APB region */
 /*
  * Size is not really true actual APB window size is 
  * 0x01000000 but it should handle OHCI memory as well
  * because this controller's interrupt is routed through 
  * APB. 
  */
 #define AR71XX_APB_BASE         0x18000000
 #define AR71XX_APB_SIZE         0x06000000
 
 /* DDR registers */
 #define AR71XX_DDR_CONFIG		0x18000000
 #define AR71XX_DDR_CONFIG2		0x18000004
 #define AR71XX_DDR_MODE_REGISTER	0x18000008
 #define AR71XX_DDR_EXT_MODE_REGISTER	0x1800000C
 #define AR71XX_DDR_CONTROL		0x18000010
 #define AR71XX_DDR_REFRESH		0x18000014
 #define AR71XX_DDR_RD_DATA_THIS_CYCLE	0x18000018
 #define AR71XX_TAP_CONTROL0		0x1800001C
 #define AR71XX_TAP_CONTROL1		0x18000020
 #define AR71XX_TAP_CONTROL2		0x18000024
 #define AR71XX_TAP_CONTROL3		0x18000028
 #define AR71XX_PCI_WINDOW0		0x1800007C
 #define AR71XX_PCI_WINDOW1		0x18000080
 #define AR71XX_PCI_WINDOW2		0x18000084
 #define AR71XX_PCI_WINDOW3		0x18000088
 #define AR71XX_PCI_WINDOW4		0x1800008C
 #define AR71XX_PCI_WINDOW5		0x18000090
 #define AR71XX_PCI_WINDOW6		0x18000094
 #define AR71XX_PCI_WINDOW7		0x18000098
 #define AR71XX_WB_FLUSH_GE0		0x1800009C
 #define AR71XX_WB_FLUSH_GE1		0x180000A0
 #define AR71XX_WB_FLUSH_USB		0x180000A4
 #define AR71XX_WB_FLUSH_PCI		0x180000A8
 
 /*
  * Values for PCI_WINDOW_X registers 
  */
 #define PCI_WINDOW0_ADDR		0x10000000
 #define PCI_WINDOW1_ADDR		0x11000000
 #define PCI_WINDOW2_ADDR		0x12000000
 #define PCI_WINDOW3_ADDR		0x13000000
 #define PCI_WINDOW4_ADDR		0x14000000
 #define PCI_WINDOW5_ADDR		0x15000000
 #define PCI_WINDOW6_ADDR		0x16000000
 #define PCI_WINDOW7_ADDR		0x17000000
 /* This value enables acces to PCI config registers */
 #define PCI_WINDOW7_CONF_ADDR		0x07000000
 
 #define	AR71XX_UART_ADDR		0x18020000
+#define		AR71XX_UART_THR		0x0
+#define		AR71XX_UART_LSR		0x14
+#define		AR71XX_UART_LSR_THRE	(1 << 5)
+#define		AR71XX_UART_LSR_TEMT	(1 << 6)
 
 #define	AR71XX_USB_CTRL_FLADJ		0x18030000
 #define		USB_CTRL_FLADJ_HOST_SHIFT	12
 #define		USB_CTRL_FLADJ_A5_SHIFT		10
 #define		USB_CTRL_FLADJ_A4_SHIFT		8
 #define		USB_CTRL_FLADJ_A3_SHIFT		6
 #define		USB_CTRL_FLADJ_A2_SHIFT		4
 #define		USB_CTRL_FLADJ_A1_SHIFT		2
 #define		USB_CTRL_FLADJ_A0_SHIFT		0
 #define	AR71XX_USB_CTRL_CONFIG		0x18030004
 #define		USB_CTRL_CONFIG_OHCI_DES_SWAP	(1 << 19)
 #define		USB_CTRL_CONFIG_OHCI_BUF_SWAP	(1 << 18)
 #define		USB_CTRL_CONFIG_EHCI_DES_SWAP	(1 << 17)
 #define		USB_CTRL_CONFIG_EHCI_BUF_SWAP	(1 << 16)
 #define		USB_CTRL_CONFIG_DISABLE_XTL	(1 << 13)
 #define		USB_CTRL_CONFIG_OVERRIDE_XTL	(1 << 12)
 #define		USB_CTRL_CONFIG_CLK_SEL_SHIFT	4
 #define		USB_CTRL_CONFIG_CLK_SEL_MASK	3
 #define		USB_CTRL_CONFIG_CLK_SEL_12	0
 #define		USB_CTRL_CONFIG_CLK_SEL_24	1
 #define		USB_CTRL_CONFIG_CLK_SEL_48	2
 #define		USB_CTRL_CONFIG_OVER_CURRENT_AS_GPIO	(1 << 8)
 #define		USB_CTRL_CONFIG_SS_SIMULATION_MODE	(1 << 2)
 #define		USB_CTRL_CONFIG_RESUME_UTMI_PLS_DIS	(1 << 1)
 #define		USB_CTRL_CONFIG_UTMI_BACKWARD_ENB	(1 << 0)
 
 #define	AR71XX_GPIO_BASE		0x18040000
 #define		AR71XX_GPIO_OE			0x00
 #define		AR71XX_GPIO_IN			0x04
 #define		AR71XX_GPIO_OUT			0x08
 #define		AR71XX_GPIO_SET			0x0c
 #define		AR71XX_GPIO_CLEAR		0x10
 #define		AR71XX_GPIO_INT			0x14
 #define		AR71XX_GPIO_INT_TYPE		0x18
 #define		AR71XX_GPIO_INT_POLARITY	0x1c
 #define		AR71XX_GPIO_INT_PENDING		0x20
 #define		AR71XX_GPIO_INT_MASK		0x24
 #define		AR71XX_GPIO_FUNCTION		0x28
 #define			GPIO_FUNC_STEREO_EN     (1 << 17)
 #define			GPIO_FUNC_SLIC_EN       (1 << 16)
 #define			GPIO_FUNC_SPI_CS2_EN    (1 << 13)
 				/* CS2 is shared with GPIO_1 */
 #define			GPIO_FUNC_SPI_CS1_EN    (1 << 12)
 				/* CS1 is shared with GPIO_0 */
 #define			GPIO_FUNC_UART_EN       (1 << 8)
 #define			GPIO_FUNC_USB_OC_EN     (1 << 4)
 #define			GPIO_FUNC_USB_CLK_EN    (0)
 
 #define	AR71XX_BASE_FREQ		40000000
 #define	AR71XX_PLL_CPU_BASE		0x18050000
 #define	AR71XX_PLL_CPU_CONFIG		0x18050000
 #define		PLL_SW_UPDATE			(1U << 31)
 #define		PLL_LOCKED			(1 << 30)
 #define		PLL_AHB_DIV_SHIFT		20
 #define		PLL_AHB_DIV_MASK		7
 #define		PLL_DDR_DIV_SEL_SHIFT		18
 #define		PLL_DDR_DIV_SEL_MASK		3
 #define		PLL_CPU_DIV_SEL_SHIFT		16
 #define		PLL_CPU_DIV_SEL_MASK		3
 #define		PLL_LOOP_BW_SHIFT		12
 #define		PLL_LOOP_BW_MASK		0xf
 #define		PLL_DIV_IN_SHIFT		10
 #define		PLL_DIV_IN_MASK			3
 #define		PLL_DIV_OUT_SHIFT		8
 #define		PLL_DIV_OUT_MASK		3
 #define		PLL_FB_SHIFT			3
 #define		PLL_FB_MASK			0x1f
 #define		PLL_BYPASS			(1 << 1)
 #define		PLL_POWER_DOWN			(1 << 0)
 #define	AR71XX_PLL_SEC_CONFIG		0x18050004
 #define		AR71XX_PLL_ETH0_SHIFT		17
 #define		AR71XX_PLL_ETH1_SHIFT		19
 #define	AR71XX_PLL_CPU_CLK_CTRL		0x18050008
 #define	AR71XX_PLL_ETH_INT0_CLK		0x18050010
 #define	AR71XX_PLL_ETH_INT1_CLK		0x18050014
 #define		XPLL_ETH_INT_CLK_10		0x00991099
 #define		XPLL_ETH_INT_CLK_100		0x00441011
 #define		XPLL_ETH_INT_CLK_1000		0x13110000
 #define		XPLL_ETH_INT_CLK_1000_GMII	0x14110000
 #define		PLL_ETH_INT_CLK_10		0x00991099
 #define		PLL_ETH_INT_CLK_100		0x00001099
 #define		PLL_ETH_INT_CLK_1000		0x00110000
 #define	AR71XX_PLL_ETH_EXT_CLK		0x18050018
 #define	AR71XX_PLL_PCI_CLK		0x1805001C
 
 /* Reset block */
 #define	AR71XX_RST_BLOCK_BASE	0x18060000
 
 #define AR71XX_RST_WDOG_CONTROL	0x18060008
 #define		RST_WDOG_LAST			(1U << 31)
 #define		RST_WDOG_ACTION_MASK		3
 #define		RST_WDOG_ACTION_RESET		3
 #define		RST_WDOG_ACTION_NMI		2
 #define		RST_WDOG_ACTION_GP_INTR		1
 #define		RST_WDOG_ACTION_NOACTION	0
 
 #define AR71XX_RST_WDOG_TIMER	0x1806000C
 /* 
  * APB interrupt status and mask register and interrupt bit numbers for 
  */
 #define AR71XX_MISC_INTR_STATUS	0x18060010
 #define AR71XX_MISC_INTR_MASK	0x18060014
 #define		MISC_INTR_TIMER		0
 #define		MISC_INTR_ERROR		1
 #define		MISC_INTR_GPIO		2
 #define		MISC_INTR_UART		3
 #define		MISC_INTR_WATCHDOG	4
 #define		MISC_INTR_PERF		5
 #define		MISC_INTR_OHCI		6
 #define		MISC_INTR_DMA		7
 
 #define AR71XX_PCI_INTR_STATUS	0x18060018
 #define AR71XX_PCI_INTR_MASK	0x1806001C
 #define		PCI_INTR_CORE		(1 << 4)
 
 #define AR71XX_RST_RESET	0x18060024
 #define		RST_RESET_FULL_CHIP	(1 << 24) /* Same as pulling
 							     the reset pin */
 #define		RST_RESET_CPU_COLD	(1 << 20) /* Cold reset */
 #define		RST_RESET_GE1_MAC	(1 << 13)
 #define		RST_RESET_GE1_PHY	(1 << 12)
 #define		RST_RESET_GE0_MAC	(1 <<  9)
 #define		RST_RESET_GE0_PHY	(1 <<  8)
 #define		RST_RESET_USB_OHCI_DLL	(1 <<  6)
 #define		RST_RESET_USB_HOST	(1 <<  5)
 #define		RST_RESET_USB_PHY	(1 <<  4)
 #define		RST_RESET_PCI_BUS	(1 <<  1)
 #define		RST_RESET_PCI_CORE	(1 <<  0)
 
 /* Chipset revision details */
 #define	AR71XX_RST_RESET_REG_REV_ID	0x18060090
 #define		REV_ID_MAJOR_MASK	0xfff0
 #define		REV_ID_MAJOR_AR71XX	0x00a0
 #define		REV_ID_MAJOR_AR913X	0x00b0
 #define		REV_ID_MAJOR_AR7240	0x00c0
 #define		REV_ID_MAJOR_AR7241	0x0100
 #define		REV_ID_MAJOR_AR7242	0x1100
 
 /* AR71XX chipset revision details */
 #define		AR71XX_REV_ID_MINOR_MASK	0x3
 #define		AR71XX_REV_ID_MINOR_AR7130	0x0
 #define		AR71XX_REV_ID_MINOR_AR7141	0x1
 #define		AR71XX_REV_ID_MINOR_AR7161	0x2
 #define		AR71XX_REV_ID_REVISION_MASK	0x3
 #define		AR71XX_REV_ID_REVISION_SHIFT	2
 
 /* AR724X chipset revision details */
 #define		AR724X_REV_ID_REVISION_MASK	0x3
 
 /* AR91XX chipset revision details */
 #define		AR91XX_REV_ID_MINOR_MASK	0x3
 #define		AR91XX_REV_ID_MINOR_AR9130	0x0
 #define		AR91XX_REV_ID_MINOR_AR9132	0x1
 #define		AR91XX_REV_ID_REVISION_MASK	0x3
 #define		AR91XX_REV_ID_REVISION_SHIFT	2
 
 typedef enum {
 	AR71XX_MII_MODE_NONE = 0,
 	AR71XX_MII_MODE_GMII,
 	AR71XX_MII_MODE_MII,
 	AR71XX_MII_MODE_RGMII,
 	AR71XX_MII_MODE_RMII,
 	AR71XX_MII_MODE_SGMII	/* not hardware defined, though! */
 } ar71xx_mii_mode;
 
 /*
  * AR71xx MII control region
  */
 #define	AR71XX_MII0_CTRL	0x18070000
 #define			MII_CTRL_SPEED_SHIFT	4
 #define			MII_CTRL_SPEED_MASK	3
 #define				MII_CTRL_SPEED_10	0
 #define				MII_CTRL_SPEED_100	1
 #define				MII_CTRL_SPEED_1000	2
 #define			MII_CTRL_IF_MASK	3
 #define			MII_CTRL_IF_SHIFT	0
 #define				MII0_CTRL_IF_GMII	0
 #define				MII0_CTRL_IF_MII	1
 #define				MII0_CTRL_IF_RGMII	2
 #define				MII0_CTRL_IF_RMII	3
 
 #define	AR71XX_MII1_CTRL	0x18070004
 
 #define				MII1_CTRL_IF_RGMII	0
 #define				MII1_CTRL_IF_RMII	1
 
 /*
  * GigE adapters region
  */
 #define AR71XX_MAC0_BASE	0x19000000
 #define AR71XX_MAC1_BASE	0x1A000000
 
 #define		AR71XX_MAC_CFG1			0x00
 #define			MAC_CFG1_SOFT_RESET		(1U << 31)
 #define			MAC_CFG1_SIMUL_RESET		(1 << 30)
 #define			MAC_CFG1_MAC_RX_BLOCK_RESET	(1 << 19)
 #define			MAC_CFG1_MAC_TX_BLOCK_RESET	(1 << 18)
 #define			MAC_CFG1_RX_FUNC_RESET		(1 << 17)
 #define			MAC_CFG1_TX_FUNC_RESET		(1 << 16)
 #define			MAC_CFG1_LOOPBACK		(1 <<  8)
 #define			MAC_CFG1_RXFLOW_CTRL		(1 <<  5)
 #define			MAC_CFG1_TXFLOW_CTRL		(1 <<  4)
 #define			MAC_CFG1_SYNC_RX		(1 <<  3)
 #define			MAC_CFG1_RX_ENABLE		(1 <<  2)
 #define			MAC_CFG1_SYNC_TX		(1 <<  1)
 #define			MAC_CFG1_TX_ENABLE		(1 <<  0)
 #define		AR71XX_MAC_CFG2			0x04
 #define			MAC_CFG2_PREAMBLE_LEN_MASK	0xf
 #define			MAC_CFG2_PREAMBLE_LEN_SHIFT	12
 #define			MAC_CFG2_IFACE_MODE_1000	(2 << 8)
 #define			MAC_CFG2_IFACE_MODE_10_100	(1 << 8)
 #define			MAC_CFG2_IFACE_MODE_SHIFT	8
 #define			MAC_CFG2_IFACE_MODE_MASK	3
 #define			MAC_CFG2_HUGE_FRAME		(1 << 5)
 #define			MAC_CFG2_LENGTH_FIELD		(1 << 4)
 #define			MAC_CFG2_ENABLE_PADCRC		(1 << 2)
 #define			MAC_CFG2_ENABLE_CRC		(1 << 1)
 #define			MAC_CFG2_FULL_DUPLEX		(1 << 0)
 #define		AR71XX_MAC_IFG			0x08
 #define		AR71XX_MAC_HDUPLEX		0x0C
 #define		AR71XX_MAC_MAX_FRAME_LEN	0x10
 #define		AR71XX_MAC_MII_CFG		0x20
 #define			MAC_MII_CFG_RESET		(1U << 31)
 #define			MAC_MII_CFG_SCAN_AUTO_INC	(1 <<  5)
 #define			MAC_MII_CFG_PREAMBLE_SUP	(1 <<  4)
 #define			MAC_MII_CFG_CLOCK_SELECT_MASK	0x7
 #define			MAC_MII_CFG_CLOCK_SELECT_MASK_AR933X	0xf
 #define			MAC_MII_CFG_CLOCK_DIV_4		0
 #define			MAC_MII_CFG_CLOCK_DIV_6		2
 #define			MAC_MII_CFG_CLOCK_DIV_8		3
 #define			MAC_MII_CFG_CLOCK_DIV_10	4
 #define			MAC_MII_CFG_CLOCK_DIV_14	5
 #define			MAC_MII_CFG_CLOCK_DIV_20	6
 #define			MAC_MII_CFG_CLOCK_DIV_28	7
 
 /* .. and the AR933x/AR934x extensions */
 #define			MAC_MII_CFG_CLOCK_DIV_34	8
 #define			MAC_MII_CFG_CLOCK_DIV_42	9
 #define			MAC_MII_CFG_CLOCK_DIV_50	10
 #define			MAC_MII_CFG_CLOCK_DIV_58	11
 #define			MAC_MII_CFG_CLOCK_DIV_66	12
 #define			MAC_MII_CFG_CLOCK_DIV_74	13
 #define			MAC_MII_CFG_CLOCK_DIV_82	14
 #define			MAC_MII_CFG_CLOCK_DIV_98	15
 
 #define		AR71XX_MAC_MII_CMD		0x24
 #define			MAC_MII_CMD_SCAN_CYCLE		(1 << 1)
 #define			MAC_MII_CMD_READ		1
 #define			MAC_MII_CMD_WRITE		0
 #define		AR71XX_MAC_MII_ADDR		0x28
 #define			MAC_MII_PHY_ADDR_SHIFT		8
 #define			MAC_MII_PHY_ADDR_MASK		0xff
 #define			MAC_MII_REG_MASK		0x1f
 #define		AR71XX_MAC_MII_CONTROL		0x2C
 #define			MAC_MII_CONTROL_MASK		0xffff
 #define		AR71XX_MAC_MII_STATUS		0x30
 #define			MAC_MII_STATUS_MASK		0xffff
 #define		AR71XX_MAC_MII_INDICATOR	0x34
 #define			MAC_MII_INDICATOR_NOT_VALID	(1 << 2)
 #define			MAC_MII_INDICATOR_SCANNING	(1 << 1)
 #define			MAC_MII_INDICATOR_BUSY		(1 << 0)
 #define		AR71XX_MAC_IFCONTROL		0x38
 #define			MAC_IFCONTROL_SPEED	(1 << 16)
 #define		AR71XX_MAC_STA_ADDR1		0x40
 #define		AR71XX_MAC_STA_ADDR2		0x44
 #define		AR71XX_MAC_FIFO_CFG0		0x48
 #define			FIFO_CFG0_TX_FABRIC		(1 << 4)
 #define			FIFO_CFG0_TX_SYSTEM		(1 << 3)
 #define			FIFO_CFG0_RX_FABRIC		(1 << 2)
 #define			FIFO_CFG0_RX_SYSTEM		(1 << 1)
 #define			FIFO_CFG0_WATERMARK		(1 << 0)
 #define			FIFO_CFG0_ALL			((1 << 5) - 1)
 #define			FIFO_CFG0_ENABLE_SHIFT		8
 #define		AR71XX_MAC_FIFO_CFG1		0x4C
 #define		AR71XX_MAC_FIFO_CFG2		0x50
 #define		AR71XX_MAC_FIFO_TX_THRESHOLD	0x54
 #define		AR71XX_MAC_FIFO_RX_FILTMATCH	0x58
 /* 
  * These flags applicable both to AR71XX_MAC_FIFO_RX_FILTMASK and
  * to AR71XX_MAC_FIFO_RX_FILTMATCH
  */
 #define			FIFO_RX_MATCH_UNICAST		(1 << 17)
 #define			FIFO_RX_MATCH_TRUNC_FRAME	(1 << 16)
 #define			FIFO_RX_MATCH_VLAN_TAG		(1 << 15)
 #define			FIFO_RX_MATCH_UNSUP_OPCODE	(1 << 14)
 #define			FIFO_RX_MATCH_PAUSE_FRAME	(1 << 13)
 #define			FIFO_RX_MATCH_CTRL_FRAME	(1 << 12)
 #define			FIFO_RX_MATCH_LONG_EVENT	(1 << 11)
 #define			FIFO_RX_MATCH_DRIBBLE_NIBBLE	(1 << 10)
 #define			FIFO_RX_MATCH_BCAST		(1 <<  9)
 #define			FIFO_RX_MATCH_MCAST		(1 <<  8)
 #define			FIFO_RX_MATCH_OK		(1 <<  7)
 #define			FIFO_RX_MATCH_OORANGE		(1 <<  6)
 #define			FIFO_RX_MATCH_LEN_MSMTCH	(1 <<  5)
 #define			FIFO_RX_MATCH_CRC_ERROR		(1 <<  4)
 #define			FIFO_RX_MATCH_CODE_ERROR	(1 <<  3)
 #define			FIFO_RX_MATCH_FALSE_CARRIER	(1 <<  2)
 #define			FIFO_RX_MATCH_RX_DV_EVENT	(1 <<  1)
 #define			FIFO_RX_MATCH_DROP_EVENT	(1 <<  0)
 /*
  * Exclude unicast and truncated frames from matching
  */
 #define			FIFO_RX_FILTMATCH_DEFAULT		\
 				(FIFO_RX_MATCH_VLAN_TAG		| \
 				FIFO_RX_MATCH_UNSUP_OPCODE	| \
 				FIFO_RX_MATCH_PAUSE_FRAME	| \
 				FIFO_RX_MATCH_CTRL_FRAME	| \
 				FIFO_RX_MATCH_LONG_EVENT	| \
 				FIFO_RX_MATCH_DRIBBLE_NIBBLE	| \
 				FIFO_RX_MATCH_BCAST		| \
 				FIFO_RX_MATCH_MCAST		| \
 				FIFO_RX_MATCH_OK		| \
 				FIFO_RX_MATCH_OORANGE		| \
 				FIFO_RX_MATCH_LEN_MSMTCH	| \
 				FIFO_RX_MATCH_CRC_ERROR		| \
 				FIFO_RX_MATCH_CODE_ERROR	| \
 				FIFO_RX_MATCH_FALSE_CARRIER	| \
 				FIFO_RX_MATCH_RX_DV_EVENT	| \
 				FIFO_RX_MATCH_DROP_EVENT)
 #define		AR71XX_MAC_FIFO_RX_FILTMASK	0x5C
 #define			FIFO_RX_MASK_BYTE_MODE		(1 << 19)
 #define			FIFO_RX_MASK_NO_SHORT_FRAME	(1 << 18)
 #define			FIFO_RX_MASK_BIT17		(1 << 17)
 #define			FIFO_RX_MASK_BIT16		(1 << 16)
 #define			FIFO_RX_MASK_TRUNC_FRAME	(1 << 15)
 #define			FIFO_RX_MASK_LONG_EVENT		(1 << 14)
 #define			FIFO_RX_MASK_VLAN_TAG		(1 << 13)
 #define			FIFO_RX_MASK_UNSUP_OPCODE	(1 << 12)
 #define			FIFO_RX_MASK_PAUSE_FRAME	(1 << 11)
 #define			FIFO_RX_MASK_CTRL_FRAME		(1 << 10)
 #define			FIFO_RX_MASK_DRIBBLE_NIBBLE	(1 <<  9)
 #define			FIFO_RX_MASK_BCAST		(1 <<  8)
 #define			FIFO_RX_MASK_MCAST		(1 <<  7)
 #define			FIFO_RX_MASK_OK			(1 <<  6)
 #define			FIFO_RX_MASK_OORANGE		(1 <<  5)
 #define			FIFO_RX_MASK_LEN_MSMTCH		(1 <<  4)
 #define			FIFO_RX_MASK_CODE_ERROR		(1 <<  3)
 #define			FIFO_RX_MASK_FALSE_CARRIER	(1 <<  2)
 #define			FIFO_RX_MASK_RX_DV_EVENT	(1 <<  1)
 #define			FIFO_RX_MASK_DROP_EVENT		(1 <<  0)
 
 /*
  *  Len. mismatch, unsup. opcode and short frmae bits excluded
  */
 #define			FIFO_RX_FILTMASK_DEFAULT \
 				(FIFO_RX_MASK_NO_SHORT_FRAME	| \
 				FIFO_RX_MASK_BIT17		| \
 				FIFO_RX_MASK_BIT16		| \
 				FIFO_RX_MASK_TRUNC_FRAME	| \
 				FIFO_RX_MASK_LONG_EVENT		| \
 				FIFO_RX_MASK_VLAN_TAG		| \
 				FIFO_RX_MASK_PAUSE_FRAME	| \
 				FIFO_RX_MASK_CTRL_FRAME		| \
 				FIFO_RX_MASK_DRIBBLE_NIBBLE	| \
 				FIFO_RX_MASK_BCAST		| \
 				FIFO_RX_MASK_MCAST		| \
 				FIFO_RX_MASK_OK			| \
 				FIFO_RX_MASK_OORANGE		| \
 				FIFO_RX_MASK_CODE_ERROR		| \
 				FIFO_RX_MASK_FALSE_CARRIER	| \
 				FIFO_RX_MASK_RX_DV_EVENT	| \
 				FIFO_RX_MASK_DROP_EVENT)
 
 #define		AR71XX_MAC_FIFO_RAM0		0x60
 #define		AR71XX_MAC_FIFO_RAM1		0x64
 #define		AR71XX_MAC_FIFO_RAM2		0x68
 #define		AR71XX_MAC_FIFO_RAM3		0x6C
 #define		AR71XX_MAC_FIFO_RAM4		0x70
 #define		AR71XX_MAC_FIFO_RAM5		0x74
 #define		AR71XX_MAC_FIFO_RAM6		0x78
 #define		AR71XX_DMA_TX_CONTROL		0x180
 #define			DMA_TX_CONTROL_EN		(1 << 0)
 #define		AR71XX_DMA_TX_DESC		0x184
 #define		AR71XX_DMA_TX_STATUS		0x188
 #define			DMA_TX_STATUS_PCOUNT_MASK	0xff
 #define			DMA_TX_STATUS_PCOUNT_SHIFT	16
 #define			DMA_TX_STATUS_BUS_ERROR		(1 << 3) 
 #define			DMA_TX_STATUS_UNDERRUN		(1 << 1) 
 #define			DMA_TX_STATUS_PKT_SENT		(1 << 0) 
 #define		AR71XX_DMA_RX_CONTROL		0x18C
 #define			DMA_RX_CONTROL_EN		(1 << 0)
 #define		AR71XX_DMA_RX_DESC		0x190
 #define		AR71XX_DMA_RX_STATUS		0x194
 #define			DMA_RX_STATUS_PCOUNT_MASK	0xff
 #define			DMA_RX_STATUS_PCOUNT_SHIFT	16
 #define			DMA_RX_STATUS_BUS_ERROR		(1 << 3)
 #define			DMA_RX_STATUS_OVERFLOW		(1 << 2)
 #define			DMA_RX_STATUS_PKT_RECVD		(1 << 0)
 #define		AR71XX_DMA_INTR				0x198
 #define		AR71XX_DMA_INTR_STATUS			0x19C
 #define			DMA_INTR_ALL			((1 << 8) - 1)
 #define			DMA_INTR_RX_BUS_ERROR		(1 << 7)
 #define			DMA_INTR_RX_OVERFLOW		(1 << 6)
 #define			DMA_INTR_RX_PKT_RCVD		(1 << 4)
 #define			DMA_INTR_TX_BUS_ERROR		(1 << 3)
 #define			DMA_INTR_TX_UNDERRUN		(1 << 1)
 #define			DMA_INTR_TX_PKT_SENT		(1 << 0)
 
 #define	AR71XX_SPI_BASE	0x1f000000
 #define		AR71XX_SPI_FS		0x00
 #define		AR71XX_SPI_CTRL		0x04
 #define			SPI_CTRL_REMAP_DISABLE		(1 << 6)
 #define			SPI_CTRL_CLOCK_DIVIDER_MASK	((1 << 6) - 1)
 #define		AR71XX_SPI_IO_CTRL	0x08
 #define			SPI_IO_CTRL_CS2			(1 << 18)
 #define			SPI_IO_CTRL_CS1			(1 << 17)
 #define			SPI_IO_CTRL_CS0			(1 << 16)
 #define			SPI_IO_CTRL_CSMASK		(7 << 16)
 #define			SPI_IO_CTRL_CLK			(1 << 8)
 #define			SPI_IO_CTRL_DO			1
 #define		AR71XX_SPI_RDS		0x0C
 
 #define ATH_READ_REG(reg) \
     *((volatile uint32_t *)MIPS_PHYS_TO_KSEG1((reg)))
 
 #define ATH_WRITE_REG(reg, val) \
     do { \
       *((volatile uint32_t *)MIPS_PHYS_TO_KSEG1((reg))) = (val); \
       (void) ATH_READ_REG(reg); \
     } while (0)
 
 static inline void
 ar71xx_ddr_flush(uint32_t reg)
 { 
 	ATH_WRITE_REG(reg, 1);
 	while ((ATH_READ_REG(reg) & 0x1))
 		;
 	ATH_WRITE_REG(reg, 1);
 	while ((ATH_READ_REG(reg) & 0x1))
 		;
 } 
 
 static inline void
 ar71xx_write_pll(uint32_t cfg_reg, uint32_t pll_reg, uint32_t pll, uint32_t pll_reg_shift)
 {
 	uint32_t sec_cfg;
 
 	/* set PLL registers */
 	sec_cfg = ATH_READ_REG(cfg_reg);
 	sec_cfg &= ~(3 << pll_reg_shift);
 	sec_cfg |= (2 << pll_reg_shift);
 
 	ATH_WRITE_REG(cfg_reg, sec_cfg);
 	DELAY(100);
 
 	ATH_WRITE_REG(pll_reg, pll);
 	sec_cfg |= (3 << pll_reg_shift);
 	ATH_WRITE_REG(cfg_reg, sec_cfg);
 	DELAY(100);
 
 	sec_cfg &= ~(3 << pll_reg_shift);
 	ATH_WRITE_REG(cfg_reg, sec_cfg);
 	DELAY(100);
 }
 
 #endif /* _AR71XX_REG_H_ */
Index: projects/netbsd-tests-upstream-01-2017/sys/mips/atheros/uart_bus_ar71xx.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/sys/mips/atheros/uart_bus_ar71xx.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/sys/mips/atheros/uart_bus_ar71xx.c	(revision 312218)
@@ -1,89 +1,106 @@
 /*-
  * Copyright (c) 2009, Oleksandr Tymoshenko <gonzo@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  */
 #include "opt_uart.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 
 #include <machine/bus.h>
 
 #include <dev/uart/uart.h>
 #include <dev/uart/uart_cpu.h>
 #include <dev/uart/uart_bus.h>
 
 #include <mips/atheros/ar71xxreg.h>
 #include <mips/atheros/ar71xx_cpudef.h>
 
 #include "uart_if.h"
 
 static int uart_ar71xx_probe(device_t dev);
 extern struct uart_class uart_ar71xx_uart_class;
 
 static device_method_t uart_ar71xx_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		uart_ar71xx_probe),
 	DEVMETHOD(device_attach,	uart_bus_attach),
 	DEVMETHOD(device_detach,	uart_bus_detach),
 	{ 0, 0 }
 };
 
 static driver_t uart_ar71xx_driver = {
 	uart_driver_name,
 	uart_ar71xx_methods,
 	sizeof(struct uart_softc),
 };
 
 extern SLIST_HEAD(uart_devinfo_list, uart_devinfo) uart_sysdevs;
 
 static int
 uart_ar71xx_probe(device_t dev)
 {
 	struct uart_softc *sc;
 	uint64_t freq;
 
 	freq = ar71xx_uart_freq();
 
 	sc = device_get_softc(dev);
 	sc->sc_sysdev = SLIST_FIRST(&uart_sysdevs);
 	sc->sc_class = &uart_ns8250_class;
 	bcopy(&sc->sc_sysdev->bas, &sc->sc_bas, sizeof(sc->sc_bas));
 	sc->sc_sysdev->bas.regshft = 2;
 	sc->sc_sysdev->bas.bst = mips_bus_space_generic;
 	sc->sc_sysdev->bas.bsh = MIPS_PHYS_TO_KSEG1(AR71XX_UART_ADDR) + 3;
 	sc->sc_bas.regshft = 2;
 	sc->sc_bas.bst = mips_bus_space_generic;
 	sc->sc_bas.bsh = MIPS_PHYS_TO_KSEG1(AR71XX_UART_ADDR) + 3;
 
 	return (uart_bus_probe(dev, 2, freq, 0, 0));
 }
 
+#ifdef	EARLY_PRINTF
+static void
+ar71xx_early_putc(int c)
+{
+	int i;
+
+	for (i = 0; i < 1000; i++) {
+		if (ATH_READ_REG(AR71XX_UART_ADDR + AR71XX_UART_LSR)
+		    & AR71XX_UART_LSR_THRE)
+			break;
+	}
+
+	ATH_WRITE_REG(AR71XX_UART_ADDR + AR71XX_UART_THR, (c & 0xff));
+}
+early_putc_t *early_putc = ar71xx_early_putc;
+#endif
+
 DRIVER_MODULE(uart, apb, uart_ar71xx_driver, uart_devclass, 0, 0);
Index: projects/netbsd-tests-upstream-01-2017/sys/mips/conf/ONIONOMEGA.hints
===================================================================
--- projects/netbsd-tests-upstream-01-2017/sys/mips/conf/ONIONOMEGA.hints	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/sys/mips/conf/ONIONOMEGA.hints	(revision 312218)
@@ -1,94 +1,125 @@
 #
 # This file adds to the values in AR933X_BASE.hints.
 #
 # $FreeBSD$
 
 # mdiobus on arge1
 hint.argemdio.0.at="nexus0"
 hint.argemdio.0.maddr=0x1a000000
 hint.argemdio.0.msize=0x1000
 hint.argemdio.0.order=0
 
 # Embedded Atheros Switch
 hint.arswitch.0.at="mdio0"
 
 # XXX this should really say it's an AR933x switch, as there
 # are some vlan specific differences here!
 hint.arswitch.0.is_7240=1
 hint.arswitch.0.numphys=4
 hint.arswitch.0.phy4cpu=1	# phy 4 is a "CPU" separate PHY
 hint.arswitch.0.is_rgmii=0
 hint.arswitch.0.is_gmii=1	# arge1 <-> switch PHY is GMII
 
 # arge0 - MII, autoneg, phy(4)
 hint.arge.0.phymask=0x10	# PHY4
 hint.arge.0.mdio=mdioproxy1	# .. off of the switch mdiobus
 hint.arge.0.eeprommac=0x1fff0000
 
 # arge1 - GMII, 1000/full
 hint.arge.1.phymask=0x0		# No directly mapped PHYs
 hint.arge.1.media=1000
 hint.arge.1.fduplex=1
 hint.arge.1.eeprommac=0x1fff0006
 
+# ath0
+hint.ath.0.eepromaddr=0x1fff0000
+hint.ath.0.eepromsize=16384
+
 # 16MB flash layout:
 # [    0.510000] 5 tp-link partitions found on MTD device spi0.0
 # [    0.510000] Creating 5 MTD partitions on "spi0.0":
 # [    0.520000] 0x000000000000-0x000000020000 : "u-boot"
 # [    0.520000] 0x000000020000-0x000000136468 : "kernel"
 # [    0.530000] 0x000000136468-0x000000ff0000 : "rootfs"
 # [    0.530000] mtd: device 2 (rootfs) set to be root filesystem
 # [    0.540000] 1 squashfs-split partitions found on MTD device rootfs
 # [    0.540000] 0x000000730000-0x000000fe0000 : "rootfs_data"
 # [    0.540000] 0x000000fe0000-0x000000ff0000 : "nvram"
 # [    0.550000] 0x000000ff0000-0x000001000000 : "art"
 # [    0.560000] 0x000000020000-0x000000fe0000 : "firmware"
 
 # 64KiB uboot
 hint.map.0.at="flash/spi0"
 hint.map.0.start=0x00000000
 hint.map.0.end=0x00010000
 hint.map.0.name="u-boot"
 hint.map.0.readonly=1
 
 # 64KiB uboot
 hint.map.1.at="flash/spi0"
 hint.map.1.start=0x00010000
 hint.map.1.end=0x00020000
 hint.map.1.name="uboot-env"
 hint.map.1.readonly=1
 
 # kernel
 hint.map.2.at="flash/spi0"
 hint.map.2.start=0x00020000
 hint.map.2.end="search:0x00020000:0x10000:.!/bin/sh"
 hint.map.2.name="kernel"
 hint.map.2.readonly=1
 
 # rootfs ulzma
 hint.map.3.at="flash/spi0"
 hint.map.3.start="search:0x00020000:0x10000:.!/bin/sh"
 hint.map.3.end=0x00fe0000
 hint.map.3.name="rootfs"
 hint.map.3.readonly=1
 
 # 64KiB cfg
 hint.map.4.at="flash/spi0"
 hint.map.4.start=0x00fe0000
 hint.map.4.end=0x00ff0000
 hint.map.4.name="cfg"
 hint.map.4.readonly=0
 
 # all firmware 16000KiB
 hint.map.5.at="flash/spi0"
 hint.map.5.start=0x00020000
 hint.map.5.end=0x00ff0000
 hint.map.5.name="firmware"
 hint.map.5.readonly=0
 
 # 64KiB ART
 hint.map.6.at="flash/spi0"
 hint.map.6.start=0x00ff0000
 hint.map.6.end=0x01000000
 hint.map.6.name="ART"
 hint.map.6.readonly=1
+
+# GPIO
+hint.gpio.0.pinmask=0x0c8ff1c3
+
+hint.gpioled.0.at="gpiobus0"
+hint.gpioled.0.pins=0x08000000
+hint.gpioled.0.name="board"
+hint.gpioled.0.invert=0
+
+#Red
+hint.gpioled.1.at="gpiobus0"
+hint.gpioled.1.pins=0x00020000
+hint.gpioled.1.name="red"
+hint.gpioled.1.invert=0
+
+#Green
+hint.gpioled.2.at="gpiobus0"
+hint.gpioled.2.pins=0x00010000
+hint.gpioled.2.name="green"
+hint.gpioled.2.invert=0
+
+#Blue
+hint.gpioled.3.at="gpiobus0"
+hint.gpioled.3.pins=0x00008000
+hint.gpioled.3.name="blue"
+hint.gpioled.3.invert=0
+
Index: projects/netbsd-tests-upstream-01-2017/sys/net/iflib.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/sys/net/iflib.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/sys/net/iflib.c	(revision 312218)
@@ -1,5139 +1,5147 @@
 /*-
  * Copyright (c) 2014-2016, Matthew Macy <mmacy@nextbsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  *  1. Redistributions of source code must retain the above copyright notice,
  *     this list of conditions and the following disclaimer.
  *
  *  2. Neither the name of Matthew Macy nor the names of its
  *     contributors may be used to endorse or promote products derived from
  *     this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_acpi.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/bus.h>
 #include <sys/eventhandler.h>
 #include <sys/sockio.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/kobj.h>
 #include <sys/rman.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/taskqueue.h>
 #include <sys/limits.h>
 
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_media.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/mp_ring.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_lro.h>
 #include <netinet/in_systm.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 
 #include <machine/bus.h>
 #include <machine/in_cksum.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <dev/led/led.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pci_private.h>
 
 #include <net/iflib.h>
 
 #include "ifdi_if.h"
 
 #if defined(__i386__) || defined(__amd64__)
 #include <sys/memdesc.h>
 #include <machine/bus.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 #include <x86/include/busdma_impl.h>
 #include <x86/iommu/busdma_dmar.h>
 #endif
 
 /*
  * enable accounting of every mbuf as it comes in to and goes out of iflib's software descriptor references
  */
 #define MEMORY_LOGGING 0
 /*
  * Enable mbuf vectors for compressing long mbuf chains
  */
 
 /*
  * NB:
  * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead
  *   we prefetch needs to be determined by the time spent in m_free vis a vis
  *   the cost of a prefetch. This will of course vary based on the workload:
  *      - NFLX's m_free path is dominated by vm-based M_EXT manipulation which
  *        is quite expensive, thus suggesting very little prefetch.
  *      - small packet forwarding which is just returning a single mbuf to
  *        UMA will typically be very fast vis a vis the cost of a memory
  *        access.
  */
 
 
 /*
  * File organization:
  *  - private structures
  *  - iflib private utility functions
  *  - ifnet functions
  *  - vlan registry and other exported functions
  *  - iflib public core functions
  *
  *
  */
 static MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library");
 
 struct iflib_txq;
 typedef struct iflib_txq *iflib_txq_t;
 struct iflib_rxq;
 typedef struct iflib_rxq *iflib_rxq_t;
 struct iflib_fl;
 typedef struct iflib_fl *iflib_fl_t;
 
+struct iflib_ctx;
+
 typedef struct iflib_filter_info {
 	driver_filter_t *ifi_filter;
 	void *ifi_filter_arg;
 	struct grouptask *ifi_task;
+	struct iflib_ctx *ifi_ctx;
 } *iflib_filter_info_t;
 
 struct iflib_ctx {
 	KOBJ_FIELDS;
    /*
    * Pointer to hardware driver's softc
    */
 	void *ifc_softc;
 	device_t ifc_dev;
 	if_t ifc_ifp;
 
 	cpuset_t ifc_cpus;
 	if_shared_ctx_t ifc_sctx;
 	struct if_softc_ctx ifc_softc_ctx;
 
 	struct mtx ifc_mtx;
 
 	uint16_t ifc_nhwtxqs;
 	uint16_t ifc_nhwrxqs;
 
 	iflib_txq_t ifc_txqs;
 	iflib_rxq_t ifc_rxqs;
 	uint32_t ifc_if_flags;
 	uint32_t ifc_flags;
 	uint32_t ifc_max_fl_buf_size;
 	int ifc_in_detach;
 
 	int ifc_link_state;
 	int ifc_link_irq;
 	int ifc_pause_frames;
 	int ifc_watchdog_events;
 	struct cdev *ifc_led_dev;
 	struct resource *ifc_msix_mem;
 
 	struct if_irq ifc_legacy_irq;
 	struct grouptask ifc_admin_task;
 	struct grouptask ifc_vflr_task;
 	struct iflib_filter_info ifc_filter_info;
 	struct ifmedia	ifc_media;
 
 	struct sysctl_oid *ifc_sysctl_node;
 	uint16_t ifc_sysctl_ntxqs;
 	uint16_t ifc_sysctl_nrxqs;
 	uint16_t ifc_sysctl_qs_eq_override;
 
 	uint16_t ifc_sysctl_ntxds[8];
 	uint16_t ifc_sysctl_nrxds[8];
 	struct if_txrx ifc_txrx;
 #define isc_txd_encap  ifc_txrx.ift_txd_encap
 #define isc_txd_flush  ifc_txrx.ift_txd_flush
 #define isc_txd_credits_update  ifc_txrx.ift_txd_credits_update
 #define isc_rxd_available ifc_txrx.ift_rxd_available
 #define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get
 #define isc_rxd_refill ifc_txrx.ift_rxd_refill
 #define isc_rxd_flush ifc_txrx.ift_rxd_flush
 #define isc_rxd_refill ifc_txrx.ift_rxd_refill
 #define isc_rxd_refill ifc_txrx.ift_rxd_refill
 #define isc_legacy_intr ifc_txrx.ift_legacy_intr
 	eventhandler_tag ifc_vlan_attach_event;
 	eventhandler_tag ifc_vlan_detach_event;
 	uint8_t ifc_mac[ETHER_ADDR_LEN];
 	char ifc_mtx_name[16];
 };
 
 
 void *
 iflib_get_softc(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_softc);
 }
 
 device_t
 iflib_get_dev(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_dev);
 }
 
 if_t
 iflib_get_ifp(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_ifp);
 }
 
 struct ifmedia *
 iflib_get_media(if_ctx_t ctx)
 {
 
 	return (&ctx->ifc_media);
 }
 
 void
 iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN])
 {
 
 	bcopy(mac, ctx->ifc_mac, ETHER_ADDR_LEN);
 }
 
 if_softc_ctx_t
 iflib_get_softc_ctx(if_ctx_t ctx)
 {
 
 	return (&ctx->ifc_softc_ctx);
 }
 
 if_shared_ctx_t
 iflib_get_sctx(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_sctx);
 }
 
 #define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*))
 
 #define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP)
 #define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
 
 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
 #define RX_SW_DESC_INUSE        (1 << 3)
 #define TX_SW_DESC_MAPPED       (1 << 4)
 
 typedef struct iflib_sw_rx_desc {
 	bus_dmamap_t    ifsd_map;         /* bus_dma map for packet */
 	struct mbuf    *ifsd_m;           /* rx: uninitialized mbuf */
 	caddr_t         ifsd_cl;          /* direct cluster pointer for rx */
 	uint16_t	ifsd_flags;
 } *iflib_rxsd_t;
 
 typedef struct iflib_sw_tx_desc_val {
 	bus_dmamap_t    ifsd_map;         /* bus_dma map for packet */
 	struct mbuf    *ifsd_m;           /* pkthdr mbuf */
 	uint8_t		ifsd_flags;
 } *iflib_txsd_val_t;
 
 typedef struct iflib_sw_tx_desc_array {
 	bus_dmamap_t    *ifsd_map;         /* bus_dma maps for packet */
 	struct mbuf    **ifsd_m;           /* pkthdr mbufs */
 	uint8_t		*ifsd_flags;
 } iflib_txsd_array_t;
 
 
 /* magic number that should be high enough for any hardware */
 #define IFLIB_MAX_TX_SEGS		128
 #define IFLIB_MAX_RX_SEGS		32
 #define IFLIB_RX_COPY_THRESH		128
 #define IFLIB_MAX_RX_REFRESH		32
 #define IFLIB_QUEUE_IDLE		0
 #define IFLIB_QUEUE_HUNG		1
 #define IFLIB_QUEUE_WORKING		2
 
 /* this should really scale with ring size - 32 is a fairly arbitrary value for this */
 #define TX_BATCH_SIZE			16
 
 #define IFLIB_RESTART_BUDGET		8
 
 #define	IFC_LEGACY		0x01
 #define	IFC_QFLUSH		0x02
 #define	IFC_MULTISEG		0x04
 #define	IFC_DMAR		0x08
 #define	IFC_SC_ALLOCATED	0x10
+#define	IFC_INIT_DONE		0x20
 
+
 #define CSUM_OFFLOAD		(CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \
 				 CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \
 				 CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP)
 struct iflib_txq {
 	uint16_t	ift_in_use;
 	uint16_t	ift_cidx;
 	uint16_t	ift_cidx_processed;
 	uint16_t	ift_pidx;
 	uint8_t		ift_gen;
 	uint8_t		ift_db_pending;
 	uint8_t		ift_db_pending_queued;
 	uint8_t		ift_npending;
 	uint8_t		ift_br_offset;
 	/* implicit pad */
 	uint64_t	ift_processed;
 	uint64_t	ift_cleaned;
 #if MEMORY_LOGGING
 	uint64_t	ift_enqueued;
 	uint64_t	ift_dequeued;
 #endif
 	uint64_t	ift_no_tx_dma_setup;
 	uint64_t	ift_no_desc_avail;
 	uint64_t	ift_mbuf_defrag_failed;
 	uint64_t	ift_mbuf_defrag;
 	uint64_t	ift_map_failed;
 	uint64_t	ift_txd_encap_efbig;
 	uint64_t	ift_pullups;
 
 	struct mtx	ift_mtx;
 	struct mtx	ift_db_mtx;
 
 	/* constant values */
 	if_ctx_t	ift_ctx;
 	struct ifmp_ring        **ift_br;
 	struct grouptask	ift_task;
 	uint16_t	ift_size;
 	uint16_t	ift_id;
 	struct callout	ift_timer;
 	struct callout	ift_db_check;
 
 	iflib_txsd_array_t	ift_sds;
 	uint8_t			ift_nbr;
 	uint8_t			ift_qstatus;
 	uint8_t			ift_active;
 	uint8_t			ift_closed;
 	int			ift_watchdog_time;
 	struct iflib_filter_info ift_filter_info;
 	bus_dma_tag_t		ift_desc_tag;
 	bus_dma_tag_t		ift_tso_desc_tag;
 	iflib_dma_info_t	ift_ifdi;
 #define MTX_NAME_LEN 16
 	char                    ift_mtx_name[MTX_NAME_LEN];
 	char                    ift_db_mtx_name[MTX_NAME_LEN];
 	bus_dma_segment_t	ift_segs[IFLIB_MAX_TX_SEGS]  __aligned(CACHE_LINE_SIZE);
 #ifdef IFLIB_DIAGNOSTICS
 	uint64_t ift_cpu_exec_count[256];
 #endif
 } __aligned(CACHE_LINE_SIZE);
 
 struct iflib_fl {
 	uint16_t	ifl_cidx;
 	uint16_t	ifl_pidx;
 	uint16_t	ifl_credits;
 	uint8_t		ifl_gen;
 #if MEMORY_LOGGING
 	uint64_t	ifl_m_enqueued;
 	uint64_t	ifl_m_dequeued;
 	uint64_t	ifl_cl_enqueued;
 	uint64_t	ifl_cl_dequeued;
 #endif
 	/* implicit pad */
 
 	/* constant */
 	uint16_t	ifl_size;
 	uint16_t	ifl_buf_size;
 	uint16_t	ifl_cltype;
 	uma_zone_t	ifl_zone;
 	iflib_rxsd_t	ifl_sds;
 	iflib_rxq_t	ifl_rxq;
 	uint8_t		ifl_id;
 	bus_dma_tag_t           ifl_desc_tag;
 	iflib_dma_info_t	ifl_ifdi;
 	uint64_t	ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE);
 	caddr_t		ifl_vm_addrs[IFLIB_MAX_RX_REFRESH];
 }  __aligned(CACHE_LINE_SIZE);
 
 static inline int
 get_inuse(int size, int cidx, int pidx, int gen)
 {
 	int used;
 
 	if (pidx > cidx)
 		used = pidx - cidx;
 	else if (pidx < cidx)
 		used = size - cidx + pidx;
 	else if (gen == 0 && pidx == cidx)
 		used = 0;
 	else if (gen == 1 && pidx == cidx)
 		used = size;
 	else
 		panic("bad state");
 
 	return (used);
 }
 
 #define TXQ_AVAIL(txq) (txq->ift_size - get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen))
 
 #define IDXDIFF(head, tail, wrap) \
 	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
 
 struct iflib_rxq {
 	/* If there is a separate completion queue -
 	 * these are the cq cidx and pidx. Otherwise
 	 * these are unused.
 	 */
 	uint16_t	ifr_size;
 	uint16_t	ifr_cq_cidx;
 	uint16_t	ifr_cq_pidx;
 	uint8_t		ifr_cq_gen;
 	uint8_t		ifr_fl_offset;
 
 	if_ctx_t	ifr_ctx;
 	iflib_fl_t	ifr_fl;
 	uint64_t	ifr_rx_irq;
 	uint16_t	ifr_id;
 	uint8_t		ifr_lro_enabled;
 	uint8_t		ifr_nfl;
 	struct lro_ctrl			ifr_lc;
 	struct grouptask        ifr_task;
 	struct iflib_filter_info ifr_filter_info;
 	iflib_dma_info_t		ifr_ifdi;
 	/* dynamically allocate if any drivers need a value substantially larger than this */
 	struct if_rxd_frag	ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE);
 #ifdef IFLIB_DIAGNOSTICS
 	uint64_t ifr_cpu_exec_count[256];
 #endif
 }  __aligned(CACHE_LINE_SIZE);
 
 /*
  * Only allow a single packet to take up most 1/nth of the tx ring
  */
 #define MAX_SINGLE_PACKET_FRACTION 12
 #define IF_BAD_DMA (bus_addr_t)-1
 
 static int enable_msix = 1;
 
 #define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING))
 
 #define CTX_LOCK_INIT(_sc, _name)  mtx_init(&(_sc)->ifc_mtx, _name, "iflib ctx lock", MTX_DEF)
 
 #define CTX_LOCK(ctx) mtx_lock(&(ctx)->ifc_mtx)
 #define CTX_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_mtx)
 #define CTX_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_mtx)
 
 
 #define TXDB_LOCK_INIT(txq)  mtx_init(&(txq)->ift_db_mtx, (txq)->ift_db_mtx_name, NULL, MTX_DEF)
 #define TXDB_TRYLOCK(txq) mtx_trylock(&(txq)->ift_db_mtx)
 #define TXDB_LOCK(txq) mtx_lock(&(txq)->ift_db_mtx)
 #define TXDB_UNLOCK(txq) mtx_unlock(&(txq)->ift_db_mtx)
 #define TXDB_LOCK_DESTROY(txq) mtx_destroy(&(txq)->ift_db_mtx)
 
 #define CALLOUT_LOCK(txq)	mtx_lock(&txq->ift_mtx)
 #define CALLOUT_UNLOCK(txq) 	mtx_unlock(&txq->ift_mtx)
 
 
 /* Our boot-time initialization hook */
 static int	iflib_module_event_handler(module_t, int, void *);
 
 static moduledata_t iflib_moduledata = {
 	"iflib",
 	iflib_module_event_handler,
 	NULL
 };
 
 DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY);
 MODULE_VERSION(iflib, 1);
 
 MODULE_DEPEND(iflib, pci, 1, 1, 1);
 MODULE_DEPEND(iflib, ether, 1, 1, 1);
 
 TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
 TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
 
 #ifndef IFLIB_DEBUG_COUNTERS
 #ifdef INVARIANTS
 #define IFLIB_DEBUG_COUNTERS 1
 #else
 #define IFLIB_DEBUG_COUNTERS 0
 #endif /* !INVARIANTS */
 #endif
 
 static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD, 0,
                    "iflib driver parameters");
 
 /*
  * XXX need to ensure that this can't accidentally cause the head to be moved backwards 
  */
 static int iflib_min_tx_latency = 0;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW,
 		   &iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput");
 
 
 #if IFLIB_DEBUG_COUNTERS
 
 static int iflib_tx_seen;
 static int iflib_tx_sent;
 static int iflib_tx_encap;
 static int iflib_rx_allocs;
 static int iflib_fl_refills;
 static int iflib_fl_refills_large;
 static int iflib_tx_frees;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD,
 		   &iflib_tx_seen, 0, "# tx mbufs seen");
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD,
 		   &iflib_tx_sent, 0, "# tx mbufs sent");
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD,
 		   &iflib_tx_encap, 0, "# tx mbufs encapped");
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD,
 		   &iflib_tx_frees, 0, "# tx frees");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD,
 		   &iflib_rx_allocs, 0, "# rx allocations");
 SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD,
 		   &iflib_fl_refills, 0, "# refills");
 SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD,
 		   &iflib_fl_refills_large, 0, "# large refills");
 
 
 static int iflib_txq_drain_flushing;
 static int iflib_txq_drain_oactive;
 static int iflib_txq_drain_notready;
 static int iflib_txq_drain_encapfail;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD,
 		   &iflib_txq_drain_flushing, 0, "# drain flushes");
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD,
 		   &iflib_txq_drain_oactive, 0, "# drain oactives");
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD,
 		   &iflib_txq_drain_notready, 0, "# drain notready");
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_encapfail, CTLFLAG_RD,
 		   &iflib_txq_drain_encapfail, 0, "# drain encap fails");
 
 
 static int iflib_encap_load_mbuf_fail;
 static int iflib_encap_txq_avail_fail;
 static int iflib_encap_txd_encap_fail;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD,
 		   &iflib_encap_load_mbuf_fail, 0, "# busdma load failures");
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD,
 		   &iflib_encap_txq_avail_fail, 0, "# txq avail failures");
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD,
 		   &iflib_encap_txd_encap_fail, 0, "# driver encap failures");
 
 static int iflib_task_fn_rxs;
 static int iflib_rx_intr_enables;
 static int iflib_fast_intrs;
 static int iflib_intr_link;
 static int iflib_intr_msix; 
 static int iflib_rx_unavail;
 static int iflib_rx_ctx_inactive;
 static int iflib_rx_zero_len;
 static int iflib_rx_if_input;
 static int iflib_rx_mbuf_null;
 static int iflib_rxd_flush;
 
 static int iflib_verbose_debug;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, intr_link, CTLFLAG_RD,
 		   &iflib_intr_link, 0, "# intr link calls");
 SYSCTL_INT(_net_iflib, OID_AUTO, intr_msix, CTLFLAG_RD,
 		   &iflib_intr_msix, 0, "# intr msix calls");
 SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD,
 		   &iflib_task_fn_rxs, 0, "# task_fn_rx calls");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD,
 		   &iflib_rx_intr_enables, 0, "# rx intr enables");
 SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD,
 		   &iflib_fast_intrs, 0, "# fast_intr calls");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD,
 		   &iflib_rx_unavail, 0, "# times rxeof called with no available data");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD,
 		   &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_zero_len, CTLFLAG_RD,
 		   &iflib_rx_zero_len, 0, "# times rxeof saw zero len mbuf");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD,
 		   &iflib_rx_if_input, 0, "# times rxeof called if_input");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_mbuf_null, CTLFLAG_RD,
 		   &iflib_rx_mbuf_null, 0, "# times rxeof got null mbuf");
 SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD,
 	         &iflib_rxd_flush, 0, "# times rxd_flush called");
 SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
 		   &iflib_verbose_debug, 0, "enable verbose debugging");
 
 #define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1)
 static void
 iflib_debug_reset(void)
 {
 	iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs =
 		iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees =
 		iflib_txq_drain_flushing = iflib_txq_drain_oactive =
 		iflib_txq_drain_notready = iflib_txq_drain_encapfail =
 		iflib_encap_load_mbuf_fail = iflib_encap_txq_avail_fail =
 		iflib_encap_txd_encap_fail = iflib_task_fn_rxs = iflib_rx_intr_enables =
 		iflib_fast_intrs = iflib_intr_link = iflib_intr_msix = iflib_rx_unavail =
 		iflib_rx_ctx_inactive = iflib_rx_zero_len = iflib_rx_if_input =
 		iflib_rx_mbuf_null = iflib_rxd_flush = 0;
 }
 
 #else
 #define DBG_COUNTER_INC(name)
 static void iflib_debug_reset(void) {}
 #endif
 
 
 
 #define IFLIB_DEBUG 0
 
 static void iflib_tx_structures_free(if_ctx_t ctx);
 static void iflib_rx_structures_free(if_ctx_t ctx);
 static int iflib_queues_alloc(if_ctx_t ctx);
 static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq);
 static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, int cidx, int budget);
 static int iflib_qset_structures_setup(if_ctx_t ctx);
 static int iflib_msix_init(if_ctx_t ctx);
 static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, char *str);
 static void iflib_txq_check_drain(iflib_txq_t txq, int budget);
 static uint32_t iflib_txq_can_drain(struct ifmp_ring *);
 static int iflib_register(if_ctx_t);
 static void iflib_init_locked(if_ctx_t ctx);
 static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
 static void iflib_add_device_sysctl_post(if_ctx_t ctx);
 static void iflib_ifmp_purge(iflib_txq_t txq);
 static void _iflib_pre_assert(if_softc_ctx_t scctx);
 
 #ifdef DEV_NETMAP
 #include <sys/selinfo.h>
 #include <net/netmap.h>
 #include <dev/netmap/netmap_kern.h>
 
 MODULE_DEPEND(iflib, netmap, 1, 1, 1);
 
 /*
  * device-specific sysctl variables:
  *
  * iflib_crcstrip: 0: keep CRC in rx frames (default), 1: strip it.
  *	During regular operations the CRC is stripped, but on some
  *	hardware reception of frames not multiple of 64 is slower,
  *	so using crcstrip=0 helps in benchmarks.
  *
  * iflib_rx_miss, iflib_rx_miss_bufs:
  *	count packets that might be missed due to lost interrupts.
  */
 SYSCTL_DECL(_dev_netmap);
 /*
  * The xl driver by default strips CRCs and we do not override it.
  */
 
 int iflib_crcstrip = 1;
 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip,
     CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on rx frames");
 
 int iflib_rx_miss, iflib_rx_miss_bufs;
 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss,
     CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed rx intr");
 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs,
     CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed rx intr bufs");
 
 /*
  * Register/unregister. We are already under netmap lock.
  * Only called on the first register or the last unregister.
  */
 static int
 iflib_netmap_register(struct netmap_adapter *na, int onoff)
 {
 	struct ifnet *ifp = na->ifp;
 	if_ctx_t ctx = ifp->if_softc;
 
 	CTX_LOCK(ctx);
 	IFDI_INTR_DISABLE(ctx);
 
 	/* Tell the stack that the interface is no longer active */
 	ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
 
 	if (!CTX_IS_VF(ctx))
 		IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip);
 
 	/* enable or disable flags and callbacks in na and ifp */
 	if (onoff) {
 		nm_set_native_flags(na);
 	} else {
 		nm_clear_native_flags(na);
 	}
 	IFDI_INIT(ctx);
 	IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ?
 	CTX_UNLOCK(ctx);
 	return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1);
 }
 
 /*
  * Reconcile kernel and user view of the transmit ring.
  *
  * All information is in the kring.
  * Userspace wants to send packets up to the one before kring->rhead,
  * kernel knows kring->nr_hwcur is the first unsent packet.
  *
  * Here we push packets out (as many as possible), and possibly
  * reclaim buffers from previously completed transmission.
  *
  * The caller (netmap) guarantees that there is only one instance
  * running at any time. Any interference with other driver
  * methods should be handled by the individual drivers.
  */
 static int
 iflib_netmap_txsync(struct netmap_kring *kring, int flags)
 {
 	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap ring */
 	u_int nic_i;	/* index into the NIC ring */
 	u_int n;
 	u_int const lim = kring->nkr_num_slots - 1;
 	u_int const head = kring->rhead;
 	struct if_pkt_info pi;
 
 	/*
 	 * interrupts on every tx packet are expensive so request
 	 * them every half ring, or where NS_REPORT is set
 	 */
 	u_int report_frequency = kring->nkr_num_slots >> 1;
 	/* device-specific */
 	if_ctx_t ctx = ifp->if_softc;
 	iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id];
 
 	pi.ipi_segs = txq->ift_segs;
 	pi.ipi_qsidx = kring->ring_id;
 	pi.ipi_ndescs = 0;
 
 	bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map,
 					BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
 
 	/*
 	 * First part: process new packets to send.
 	 * nm_i is the current index in the netmap ring,
 	 * nic_i is the corresponding index in the NIC ring.
 	 *
 	 * If we have packets to send (nm_i != head)
 	 * iterate over the netmap ring, fetch length and update
 	 * the corresponding slot in the NIC ring. Some drivers also
 	 * need to update the buffer's physical address in the NIC slot
 	 * even NS_BUF_CHANGED is not set (PNMB computes the addresses).
 	 *
 	 * The netmap_reload_map() calls is especially expensive,
 	 * even when (as in this case) the tag is 0, so do only
 	 * when the buffer has actually changed.
 	 *
 	 * If possible do not set the report/intr bit on all slots,
 	 * but only a few times per ring or when NS_REPORT is set.
 	 *
 	 * Finally, on 10G and faster drivers, it might be useful
 	 * to prefetch the next slot and txr entry.
 	 */
 
 	nm_i = kring->nr_hwcur;
 	if (nm_i != head) {	/* we have new packets to send */
 		nic_i = netmap_idx_k2n(kring, nm_i);
 
 		__builtin_prefetch(&ring->slot[nm_i]);
 		__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]);
 		__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]);
 
 		for (n = 0; nm_i != head; n++) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			u_int len = slot->len;
 			uint64_t paddr;
 			void *addr = PNMB(na, slot, &paddr);
 			int flags = (slot->flags & NS_REPORT ||
 				nic_i == 0 || nic_i == report_frequency) ?
 				IPI_TX_INTR : 0;
 
 			/* device-specific */
 			pi.ipi_pidx = nic_i;
 			pi.ipi_flags = flags;
 
 			/* Fill the slot in the NIC ring. */
 			ctx->isc_txd_encap(ctx->ifc_softc, &pi);
 
 			/* prefetch for next round */
 			__builtin_prefetch(&ring->slot[nm_i + 1]);
 			__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]);
 			__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]);
 
 			NM_CHECK_ADDR_LEN(na, addr, len);
 
 			if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
 				netmap_reload_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[nic_i], addr);
 			}
 			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
 
 			/* make sure changes to the buffer are synced */
 			bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_sds.ifsd_map[nic_i],
 							BUS_DMASYNC_PREWRITE);
 
 			nm_i = nm_next(nm_i, lim);
 			nic_i = nm_next(nic_i, lim);
 		}
 		kring->nr_hwcur = head;
 
 		/* synchronize the NIC ring */
 		bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map,
 						BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
 		/* (re)start the tx unit up to slot nic_i (excluded) */
 		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i);
 	}
 
 	/*
 	 * Second part: reclaim buffers for completed transmissions.
 	 */
 	if (iflib_tx_credits_update(ctx, txq)) {
 		/* some tx completed, increment avail */
 		nic_i = txq->ift_cidx_processed;
 		kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
 	}
 	return (0);
 }
 
 /*
  * Reconcile kernel and user view of the receive ring.
  * Same as for the txsync, this routine must be efficient.
  * The caller guarantees a single invocations, but races against
  * the rest of the driver should be handled here.
  *
  * On call, kring->rhead is the first packet that userspace wants
  * to keep, and kring->rcur is the wakeup point.
  * The kernel has previously reported packets up to kring->rtail.
  *
  * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
  * of whether or not we received an interrupt.
  */
 static int
 iflib_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
 	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap ring */
 	u_int nic_i;	/* index into the NIC ring */
 	u_int i, n;
 	u_int const lim = kring->nkr_num_slots - 1;
 	u_int const head = kring->rhead;
 	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
 	struct if_rxd_info ri;
 	/* device-specific */
 	if_ctx_t ctx = ifp->if_softc;
 	iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id];
 	iflib_fl_t fl = rxq->ifr_fl;
 	if (head > lim)
 		return netmap_ring_reinit(kring);
 
 	bzero(&ri, sizeof(ri));
 	ri.iri_qsidx = kring->ring_id;
 	ri.iri_ifp = ctx->ifc_ifp;
 	/* XXX check sync modes */
 	for (i = 0, fl = rxq->ifr_fl; i < rxq->ifr_nfl; i++, fl++)
 		bus_dmamap_sync(rxq->ifr_fl[i].ifl_desc_tag, fl->ifl_ifdi->idi_map,
 				BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
 	/*
 	 * First part: import newly received packets.
 	 *
 	 * nm_i is the index of the next free slot in the netmap ring,
 	 * nic_i is the index of the next received packet in the NIC ring,
 	 * and they may differ in case if_init() has been called while
 	 * in netmap mode. For the receive ring we have
 	 *
 	 *	nic_i = rxr->next_check;
 	 *	nm_i = kring->nr_hwtail (previous)
 	 * and
 	 *	nm_i == (nic_i + kring->nkr_hwofs) % ring_size
 	 *
 	 * rxr->next_check is set to 0 on a ring reinit
 	 */
 	if (netmap_no_pendintr || force_update) {
 		int crclen = iflib_crcstrip ? 0 : 4;
 		int error, avail;
 		uint16_t slot_flags = kring->nkr_slot_flags;
 
 		for (fl = rxq->ifr_fl, i = 0; i < rxq->ifr_nfl; i++, fl++) {
 			nic_i = fl->ifl_cidx;
 			nm_i = netmap_idx_n2k(kring, nic_i);
 			avail = ctx->isc_rxd_available(ctx->ifc_softc, kring->ring_id, nic_i, INT_MAX);
 			for (n = 0; avail > 0; n++, avail--) {
 				error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
 				if (error)
 					ring->slot[nm_i].len = 0;
 				else
 					ring->slot[nm_i].len = ri.iri_len - crclen;
 				ring->slot[nm_i].flags = slot_flags;
 				bus_dmamap_sync(fl->ifl_ifdi->idi_tag,
 								fl->ifl_sds[nic_i].ifsd_map, BUS_DMASYNC_POSTREAD);
 				nm_i = nm_next(nm_i, lim);
 				nic_i = nm_next(nic_i, lim);
 			}
 			if (n) { /* update the state variables */
 				if (netmap_no_pendintr && !force_update) {
 					/* diagnostics */
 					iflib_rx_miss ++;
 					iflib_rx_miss_bufs += n;
 				}
 				fl->ifl_cidx = nic_i;
 				kring->nr_hwtail = nm_i;
 			}
 			kring->nr_kflags &= ~NKR_PENDINTR;
 		}
 	}
 	/*
 	 * Second part: skip past packets that userspace has released.
 	 * (kring->nr_hwcur to head excluded),
 	 * and make the buffers available for reception.
 	 * As usual nm_i is the index in the netmap ring,
 	 * nic_i is the index in the NIC ring, and
 	 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size
 	 */
 	/* XXX not sure how this will work with multiple free lists */
 	nm_i = kring->nr_hwcur;
 	if (nm_i != head) {
 		nic_i = netmap_idx_k2n(kring, nm_i);
 		for (n = 0; nm_i != head; n++) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			uint64_t paddr;
 			caddr_t vaddr;
 			void *addr = PNMB(na, slot, &paddr);
 
 			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
 				goto ring_reset;
 
 			vaddr = addr;
 			if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
 				netmap_reload_map(na, fl->ifl_ifdi->idi_tag, fl->ifl_sds[nic_i].ifsd_map, addr);
 				slot->flags &= ~NS_BUF_CHANGED;
 			}
 			/*
 			 * XXX we should be batching this operation - TODO
 			 */
 			ctx->isc_rxd_refill(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i, &paddr, &vaddr, 1, fl->ifl_buf_size);
 			bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_sds[nic_i].ifsd_map,
 			    BUS_DMASYNC_PREREAD);
 			nm_i = nm_next(nm_i, lim);
 			nic_i = nm_next(nic_i, lim);
 		}
 		kring->nr_hwcur = head;
 
 		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 		/*
 		 * IMPORTANT: we must leave one free slot in the ring,
 		 * so move nic_i back by one unit
 		 */
 		nic_i = nm_prev(nic_i, lim);
 		ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i);
 	}
 
 	return 0;
 
 ring_reset:
 	return netmap_ring_reinit(kring);
 }
 
 static int
 iflib_netmap_attach(if_ctx_t ctx)
 {
 	struct netmap_adapter na;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 
 	bzero(&na, sizeof(na));
 
 	na.ifp = ctx->ifc_ifp;
 	na.na_flags = NAF_BDG_MAYSLEEP;
 	MPASS(ctx->ifc_softc_ctx.isc_ntxqsets);
 	MPASS(ctx->ifc_softc_ctx.isc_nrxqsets);
 
 	na.num_tx_desc = scctx->isc_ntxd[0];
 	na.num_rx_desc = scctx->isc_nrxd[0];
 	na.nm_txsync = iflib_netmap_txsync;
 	na.nm_rxsync = iflib_netmap_rxsync;
 	na.nm_register = iflib_netmap_register;
 	na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
 	na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
 	return (netmap_attach(&na));
 }
 
 static void
 iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq)
 {
 	struct netmap_adapter *na = NA(ctx->ifc_ifp);
 	struct netmap_slot *slot;
 
 	slot = netmap_reset(na, NR_TX, txq->ift_id, 0);
 	if (slot == 0)
 		return;
 
 	for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) {
 
 		/*
 		 * In netmap mode, set the map for the packet buffer.
 		 * NOTE: Some drivers (not this one) also need to set
 		 * the physical buffer address in the NIC ring.
 		 * netmap_idx_n2k() maps a nic index, i, into the corresponding
 		 * netmap slot index, si
 		 */
 		int si = netmap_idx_n2k(&na->tx_rings[txq->ift_id], i);
 		netmap_load_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[i], NMB(na, slot + si));
 	}
 }
 static void
 iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq)
 {
 	struct netmap_adapter *na = NA(ctx->ifc_ifp);
 	struct netmap_slot *slot;
 	iflib_rxsd_t sd;
 	int nrxd;
 
 	slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0);
 	if (slot == 0)
 		return;
 	sd = rxq->ifr_fl[0].ifl_sds;
 	nrxd = ctx->ifc_softc_ctx.isc_nrxd[0];
 	for (int i = 0; i < nrxd; i++, sd++) {
 			int sj = netmap_idx_n2k(&na->rx_rings[rxq->ifr_id], i);
 			uint64_t paddr;
 			void *addr;
 			caddr_t vaddr;
 
 			vaddr = addr = PNMB(na, slot + sj, &paddr);
 			netmap_load_map(na, rxq->ifr_fl[0].ifl_ifdi->idi_tag, sd->ifsd_map, addr);
 			/* Update descriptor and the cached value */
 			ctx->isc_rxd_refill(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, i, &paddr, &vaddr, 1, rxq->ifr_fl[0].ifl_buf_size);
 	}
 	/* preserve queue */
 	if (ctx->ifc_ifp->if_capenable & IFCAP_NETMAP) {
 		struct netmap_kring *kring = &na->rx_rings[rxq->ifr_id];
 		int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring);
 		ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, t);
 	} else
 		ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, nrxd-1);
 }
 
 #define iflib_netmap_detach(ifp) netmap_detach(ifp)
 
 #else
 #define iflib_netmap_txq_init(ctx, txq)
 #define iflib_netmap_rxq_init(ctx, rxq)
 #define iflib_netmap_detach(ifp)
 
 #define iflib_netmap_attach(ctx) (0)
 #define netmap_rx_irq(ifp, qid, budget) (0)
 
 #endif
 
 #if defined(__i386__) || defined(__amd64__)
 static __inline void
 prefetch(void *x)
 {
 	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
 }
 #else
 #define prefetch(x)
 #endif
 
 static void
 _iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
 {
 	if (err)
 		return;
 	*(bus_addr_t *) arg = segs[0].ds_addr;
 }
 
 int
 iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags)
 {
 	int err;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	device_t dev = ctx->ifc_dev;
 
 	KASSERT(sctx->isc_q_align != 0, ("alignment value not initialized"));
 
 	err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
 				sctx->isc_q_align, 0,	/* alignment, bounds */
 				BUS_SPACE_MAXADDR,	/* lowaddr */
 				BUS_SPACE_MAXADDR,	/* highaddr */
 				NULL, NULL,		/* filter, filterarg */
 				size,			/* maxsize */
 				1,			/* nsegments */
 				size,			/* maxsegsize */
 				BUS_DMA_ALLOCNOW,	/* flags */
 				NULL,			/* lockfunc */
 				NULL,			/* lockarg */
 				&dma->idi_tag);
 	if (err) {
 		device_printf(dev,
 		    "%s: bus_dma_tag_create failed: %d\n",
 		    __func__, err);
 		goto fail_0;
 	}
 
 	err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr,
 	    BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map);
 	if (err) {
 		device_printf(dev,
 		    "%s: bus_dmamem_alloc(%ju) failed: %d\n",
 		    __func__, (uintmax_t)size, err);
 		goto fail_1;
 	}
 
 	dma->idi_paddr = IF_BAD_DMA;
 	err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr,
 	    size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT);
 	if (err || dma->idi_paddr == IF_BAD_DMA) {
 		device_printf(dev,
 		    "%s: bus_dmamap_load failed: %d\n",
 		    __func__, err);
 		goto fail_2;
 	}
 
 	dma->idi_size = size;
 	return (0);
 
 fail_2:
 	bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
 fail_1:
 	bus_dma_tag_destroy(dma->idi_tag);
 fail_0:
 	dma->idi_tag = NULL;
 
 	return (err);
 }
 
 int
 iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count)
 {
 	int i, err;
 	iflib_dma_info_t *dmaiter;
 
 	dmaiter = dmalist;
 	for (i = 0; i < count; i++, dmaiter++) {
 		if ((err = iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0)
 			break;
 	}
 	if (err)
 		iflib_dma_free_multi(dmalist, i);
 	return (err);
 }
 
 void
 iflib_dma_free(iflib_dma_info_t dma)
 {
 	if (dma->idi_tag == NULL)
 		return;
 	if (dma->idi_paddr != IF_BAD_DMA) {
 		bus_dmamap_sync(dma->idi_tag, dma->idi_map,
 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(dma->idi_tag, dma->idi_map);
 		dma->idi_paddr = IF_BAD_DMA;
 	}
 	if (dma->idi_vaddr != NULL) {
 		bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
 		dma->idi_vaddr = NULL;
 	}
 	bus_dma_tag_destroy(dma->idi_tag);
 	dma->idi_tag = NULL;
 }
 
 void
 iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count)
 {
 	int i;
 	iflib_dma_info_t *dmaiter = dmalist;
 
 	for (i = 0; i < count; i++, dmaiter++)
 		iflib_dma_free(*dmaiter);
 }
 
 static int
 iflib_fast_intr(void *arg)
 {
 	iflib_filter_info_t info = arg;
 	struct grouptask *gtask = info->ifi_task;
 
-	if (!smp_started)
+	if (!smp_started && mp_ncpus > 1)
 		return (FILTER_HANDLED);
 
 	DBG_COUNTER_INC(fast_intrs);
 	if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED)
 		return (FILTER_HANDLED);
 
 	GROUPTASK_ENQUEUE(gtask);
 	return (FILTER_HANDLED);
 }
 
 static int
 _iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
 	driver_filter_t filter, driver_intr_t handler, void *arg,
 				 char *name)
 {
 	int rc;
 	struct resource *res;
 	void *tag;
 	device_t dev = ctx->ifc_dev;
 
 	MPASS(rid < 512);
 	irq->ii_rid = rid;
 	res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &irq->ii_rid,
 				     RF_SHAREABLE | RF_ACTIVE);
 	if (res == NULL) {
 		device_printf(dev,
 		    "failed to allocate IRQ for rid %d, name %s.\n", rid, name);
 		return (ENOMEM);
 	}
 	irq->ii_res = res;
 	KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL"));
 	rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET,
 						filter, handler, arg, &tag);
 	if (rc != 0) {
 		device_printf(dev,
 		    "failed to setup interrupt for rid %d, name %s: %d\n",
 					  rid, name ? name : "unknown", rc);
 		return (rc);
 	} else if (name)
 		bus_describe_intr(dev, res, tag, "%s", name);
 
 	irq->ii_tag = tag;
 	return (0);
 }
 
 
 /*********************************************************************
  *
  *  Allocate memory for tx_buffer structures. The tx_buffer stores all
  *  the information needed to transmit a packet on the wire. This is
  *  called only once at attach, setup is done every reset.
  *
  **********************************************************************/
 
 static int
 iflib_txsd_alloc(iflib_txq_t txq)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	int err, nsegments, ntsosegments;
 
 	nsegments = scctx->isc_tx_nsegments;
 	ntsosegments = scctx->isc_tx_tso_segments_max;
 	MPASS(scctx->isc_ntxd[0] > 0);
 	MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0);
 	MPASS(nsegments > 0);
 	MPASS(ntsosegments > 0);
 	/*
 	 * Setup DMA descriptor areas.
 	 */
 	if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
 			       1, 0,			/* alignment, bounds */
 			       BUS_SPACE_MAXADDR,	/* lowaddr */
 			       BUS_SPACE_MAXADDR,	/* highaddr */
 			       NULL, NULL,		/* filter, filterarg */
 			       sctx->isc_tx_maxsize,		/* maxsize */
 			       nsegments,	/* nsegments */
 			       sctx->isc_tx_maxsegsize,	/* maxsegsize */
 			       0,			/* flags */
 			       NULL,			/* lockfunc */
 			       NULL,			/* lockfuncarg */
 			       &txq->ift_desc_tag))) {
 		device_printf(dev,"Unable to allocate TX DMA tag: %d\n", err);
 		device_printf(dev,"maxsize: %zd nsegments: %d maxsegsize: %zd\n",
 					  sctx->isc_tx_maxsize, nsegments, sctx->isc_tx_maxsegsize);
 		goto fail;
 	}
 	if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
 			       1, 0,			/* alignment, bounds */
 			       BUS_SPACE_MAXADDR,	/* lowaddr */
 			       BUS_SPACE_MAXADDR,	/* highaddr */
 			       NULL, NULL,		/* filter, filterarg */
 			       scctx->isc_tx_tso_size_max,		/* maxsize */
 			       ntsosegments,	/* nsegments */
 			       scctx->isc_tx_tso_segsize_max,	/* maxsegsize */
 			       0,			/* flags */
 			       NULL,			/* lockfunc */
 			       NULL,			/* lockfuncarg */
 			       &txq->ift_tso_desc_tag))) {
 		device_printf(dev,"Unable to allocate TX TSO DMA tag: %d\n", err);
 
 		goto fail;
 	}
 	if (!(txq->ift_sds.ifsd_flags =
 	    (uint8_t *) malloc(sizeof(uint8_t) *
 	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate tx_buffer memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 	if (!(txq->ift_sds.ifsd_m =
 	    (struct mbuf **) malloc(sizeof(struct mbuf *) *
 	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate tx_buffer memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 
         /* Create the descriptor buffer dma maps */
 #if defined(ACPI_DMAR) || (!(defined(__i386__) && !defined(__amd64__)))
 	if ((ctx->ifc_flags & IFC_DMAR) == 0)
 		return (0);
 
 	if (!(txq->ift_sds.ifsd_map =
 	    (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate tx_buffer map memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 
 	for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) {
 		err = bus_dmamap_create(txq->ift_desc_tag, 0, &txq->ift_sds.ifsd_map[i]);
 		if (err != 0) {
 			device_printf(dev, "Unable to create TX DMA map\n");
 			goto fail;
 		}
 	}
 #endif
 	return (0);
 fail:
 	/* We free all, it handles case where we are in the middle */
 	iflib_tx_structures_free(ctx);
 	return (err);
 }
 
 static void
 iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i)
 {
 	bus_dmamap_t map;
 
 	map = NULL;
 	if (txq->ift_sds.ifsd_map != NULL)
 		map = txq->ift_sds.ifsd_map[i];
 	if (map != NULL) {
 		bus_dmamap_unload(txq->ift_desc_tag, map);
 		bus_dmamap_destroy(txq->ift_desc_tag, map);
 		txq->ift_sds.ifsd_map[i] = NULL;
 	}
 }
 
 static void
 iflib_txq_destroy(iflib_txq_t txq)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 
 	for (int i = 0; i < txq->ift_size; i++)
 		iflib_txsd_destroy(ctx, txq, i);
 	if (txq->ift_sds.ifsd_map != NULL) {
 		free(txq->ift_sds.ifsd_map, M_IFLIB);
 		txq->ift_sds.ifsd_map = NULL;
 	}
 	if (txq->ift_sds.ifsd_m != NULL) {
 		free(txq->ift_sds.ifsd_m, M_IFLIB);
 		txq->ift_sds.ifsd_m = NULL;
 	}
 	if (txq->ift_sds.ifsd_flags != NULL) {
 		free(txq->ift_sds.ifsd_flags, M_IFLIB);
 		txq->ift_sds.ifsd_flags = NULL;
 	}
 	if (txq->ift_desc_tag != NULL) {
 		bus_dma_tag_destroy(txq->ift_desc_tag);
 		txq->ift_desc_tag = NULL;
 	}
 	if (txq->ift_tso_desc_tag != NULL) {
 		bus_dma_tag_destroy(txq->ift_tso_desc_tag);
 		txq->ift_tso_desc_tag = NULL;
 	}
 }
 
 static void
 iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i)
 {
 	struct mbuf **mp;
 
 	mp = &txq->ift_sds.ifsd_m[i];
 	if (*mp == NULL)
 		return;
 
 	if (txq->ift_sds.ifsd_map != NULL) {
 		bus_dmamap_sync(txq->ift_desc_tag,
 				txq->ift_sds.ifsd_map[i],
 				BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(txq->ift_desc_tag,
 				  txq->ift_sds.ifsd_map[i]);
 	}
 	m_free(*mp);
 	DBG_COUNTER_INC(tx_frees);
 	*mp = NULL;
 }
 
 static int
 iflib_txq_setup(iflib_txq_t txq)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	iflib_dma_info_t di;
 	int i;
 
 	/* Set number of descriptors available */
 	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 
 	/* Reset indices */
 	txq->ift_cidx_processed = txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0;
 	txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset];
 
 	for (i = 0, di = txq->ift_ifdi; i < ctx->ifc_nhwtxqs; i++, di++)
 		bzero((void *)di->idi_vaddr, di->idi_size);
 
 	IFDI_TXQ_SETUP(ctx, txq->ift_id);
 	for (i = 0, di = txq->ift_ifdi; i < ctx->ifc_nhwtxqs; i++, di++)
 		bus_dmamap_sync(di->idi_tag, di->idi_map,
 						BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	return (0);
 }
 
 /*********************************************************************
  *
  *  Allocate memory for rx_buffer structures. Since we use one
  *  rx_buffer per received packet, the maximum number of rx_buffer's
  *  that we'll need is equal to the number of receive descriptors
  *  that we've allocated.
  *
  **********************************************************************/
 static int
 iflib_rxsd_alloc(iflib_rxq_t rxq)
 {
 	if_ctx_t ctx = rxq->ifr_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	iflib_fl_t fl;
 	iflib_rxsd_t	rxsd;
 	int			err;
 
 	MPASS(scctx->isc_nrxd[0] > 0);
 	MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0);
 
 	fl = rxq->ifr_fl;
 	for (int i = 0; i <  rxq->ifr_nfl; i++, fl++) {
 		fl->ifl_sds = malloc(sizeof(struct iflib_sw_rx_desc) *
 		    scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB,
 		    M_WAITOK | M_ZERO);
 		if (fl->ifl_sds == NULL) {
 			device_printf(dev, "Unable to allocate rx sw desc memory\n");
 			return (ENOMEM);
 		}
 		fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */
 		err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
 					 1, 0,			/* alignment, bounds */
 					 BUS_SPACE_MAXADDR,	/* lowaddr */
 					 BUS_SPACE_MAXADDR,	/* highaddr */
 					 NULL, NULL,		/* filter, filterarg */
 					 sctx->isc_rx_maxsize,	/* maxsize */
 					 sctx->isc_rx_nsegments,	/* nsegments */
 					 sctx->isc_rx_maxsegsize,	/* maxsegsize */
 					 0,			/* flags */
 					 NULL,			/* lockfunc */
 					 NULL,			/* lockarg */
 					 &fl->ifl_desc_tag);
 		if (err) {
 			device_printf(dev, "%s: bus_dma_tag_create failed %d\n",
 				__func__, err);
 			goto fail;
 		}
 
 		rxsd = fl->ifl_sds;
 		for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++, rxsd++) {
 			err = bus_dmamap_create(fl->ifl_desc_tag, 0, &rxsd->ifsd_map);
 			if (err) {
 				device_printf(dev, "%s: bus_dmamap_create failed: %d\n",
 					__func__, err);
 				goto fail;
 			}
 		}
 	}
 	return (0);
 
 fail:
 	iflib_rx_structures_free(ctx);
 	return (err);
 }
 
 
 /*
  * Internal service routines
  */
 
 struct rxq_refill_cb_arg {
 	int               error;
 	bus_dma_segment_t seg;
 	int               nseg;
 };
 
 static void
 _rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 {
 	struct rxq_refill_cb_arg *cb_arg = arg;
 
 	cb_arg->error = error;
 	cb_arg->seg = segs[0];
 	cb_arg->nseg = nseg;
 }
 
 
 #ifdef ACPI_DMAR
 #define IS_DMAR(ctx) (ctx->ifc_flags & IFC_DMAR)
 #else
 #define IS_DMAR(ctx) (0)
 #endif
 
 /**
  *	rxq_refill - refill an rxq  free-buffer list
  *	@ctx: the iflib context
  *	@rxq: the free-list to refill
  *	@n: the number of new buffers to allocate
  *
  *	(Re)populate an rxq free-buffer list with up to @n new packet buffers.
  *	The caller must assure that @n does not exceed the queue's capacity.
  */
 static void
 _iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count)
 {
 	struct mbuf *m;
 	int pidx = fl->ifl_pidx;
 	iflib_rxsd_t rxsd = &fl->ifl_sds[pidx];
 	caddr_t cl;
 	int n, i = 0;
 	uint64_t bus_addr;
 	int err;
 
 	n  = count;
 	MPASS(n > 0);
 	MPASS(fl->ifl_credits + n <= fl->ifl_size);
 
 	if (pidx < fl->ifl_cidx)
 		MPASS(pidx + n <= fl->ifl_cidx);
 	if (pidx == fl->ifl_cidx && (fl->ifl_credits < fl->ifl_size))
 		MPASS(fl->ifl_gen == 0);
 	if (pidx > fl->ifl_cidx)
 		MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx);
 
 	DBG_COUNTER_INC(fl_refills);
 	if (n > 8)
 		DBG_COUNTER_INC(fl_refills_large);
 
 	while (n--) {
 		/*
 		 * We allocate an uninitialized mbuf + cluster, mbuf is
 		 * initialized after rx.
 		 *
 		 * If the cluster is still set then we know a minimum sized packet was received
 		 */
 		if ((cl = rxsd->ifsd_cl) == NULL) {
 			if ((cl = rxsd->ifsd_cl = m_cljget(NULL, M_NOWAIT, fl->ifl_buf_size)) == NULL)
 				break;
 #if MEMORY_LOGGING
 			fl->ifl_cl_enqueued++;
 #endif
 		}
 		if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
 			break;
 		}
 #if MEMORY_LOGGING
 		fl->ifl_m_enqueued++;
 #endif
 
 		DBG_COUNTER_INC(rx_allocs);
 #ifdef notyet
 		if ((rxsd->ifsd_flags & RX_SW_DESC_MAP_CREATED) == 0) {
 			int err;
 
 			if ((err = bus_dmamap_create(fl->ifl_ifdi->idi_tag, 0, &rxsd->ifsd_map))) {
 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
 				uma_zfree(fl->ifl_zone, cl);
 				n = 0;
 				goto done;
 			}
 			rxsd->ifsd_flags |= RX_SW_DESC_MAP_CREATED;
 		}
 #endif
 #if defined(__i386__) || defined(__amd64__)
 		if (!IS_DMAR(ctx)) {
 			bus_addr = pmap_kextract((vm_offset_t)cl);
 		} else
 #endif
 		{
 			struct rxq_refill_cb_arg cb_arg;
 			iflib_rxq_t q;
 
 			cb_arg.error = 0;
 			q = fl->ifl_rxq;
 			err = bus_dmamap_load(fl->ifl_desc_tag, rxsd->ifsd_map,
 		         cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg, 0);
 
 			if (err != 0 || cb_arg.error) {
 				/*
 				 * !zone_pack ?
 				 */
 				if (fl->ifl_zone == zone_pack)
 					uma_zfree(fl->ifl_zone, cl);
 				m_free(m);
 				n = 0;
 				goto done;
 			}
 			bus_addr = cb_arg.seg.ds_addr;
 		}
 		rxsd->ifsd_flags |= RX_SW_DESC_INUSE;
 
 		MPASS(rxsd->ifsd_m == NULL);
 		rxsd->ifsd_cl = cl;
 		rxsd->ifsd_m = m;
 		fl->ifl_bus_addrs[i] = bus_addr;
 		fl->ifl_vm_addrs[i] = cl;
 		rxsd++;
 		fl->ifl_credits++;
 		i++;
 		MPASS(fl->ifl_credits <= fl->ifl_size);
 		if (++fl->ifl_pidx == fl->ifl_size) {
 			fl->ifl_pidx = 0;
 			fl->ifl_gen = 1;
 			rxsd = fl->ifl_sds;
 		}
 		if (n == 0 || i == IFLIB_MAX_RX_REFRESH) {
 			ctx->isc_rxd_refill(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx,
 								 fl->ifl_bus_addrs, fl->ifl_vm_addrs, i, fl->ifl_buf_size);
 			i = 0;
 			pidx = fl->ifl_pidx;
 		}
 	}
 done:
 	DBG_COUNTER_INC(rxd_flush);
 	if (fl->ifl_pidx == 0)
 		pidx = fl->ifl_size - 1;
 	else
 		pidx = fl->ifl_pidx - 1;
 	ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx);
 }
 
 static __inline void
 __iflib_fl_refill_lt(if_ctx_t ctx, iflib_fl_t fl, int max)
 {
 	/* we avoid allowing pidx to catch up with cidx as it confuses ixl */
 	int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1;
 #ifdef INVARIANTS
 	int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1;
 #endif
 
 	MPASS(fl->ifl_credits <= fl->ifl_size);
 	MPASS(reclaimable == delta);
 
 	if (reclaimable > 0)
 		_iflib_fl_refill(ctx, fl, min(max, reclaimable));
 }
 
 static void
 iflib_fl_bufs_free(iflib_fl_t fl)
 {
 	iflib_dma_info_t idi = fl->ifl_ifdi;
 	uint32_t i;
 
 	for (i = 0; i < fl->ifl_size; i++) {
 		iflib_rxsd_t d = &fl->ifl_sds[i];
 
 		if (d->ifsd_flags & RX_SW_DESC_INUSE) {
 			bus_dmamap_unload(fl->ifl_desc_tag, d->ifsd_map);
 			bus_dmamap_destroy(fl->ifl_desc_tag, d->ifsd_map);
 			if (d->ifsd_m != NULL) {
 				m_init(d->ifsd_m, M_NOWAIT, MT_DATA, 0);
 				uma_zfree(zone_mbuf, d->ifsd_m);
 			}
 			if (d->ifsd_cl != NULL)
 				uma_zfree(fl->ifl_zone, d->ifsd_cl);
 			d->ifsd_flags = 0;
 		} else {
 			MPASS(d->ifsd_cl == NULL);
 			MPASS(d->ifsd_m == NULL);
 		}
 #if MEMORY_LOGGING
 		fl->ifl_m_dequeued++;
 		fl->ifl_cl_dequeued++;
 #endif
 		d->ifsd_cl = NULL;
 		d->ifsd_m = NULL;
 	}
 	/*
 	 * Reset free list values
 	 */
 	fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = 0;;
 	bzero(idi->idi_vaddr, idi->idi_size);
 }
 
 /*********************************************************************
  *
  *  Initialize a receive ring and its buffers.
  *
  **********************************************************************/
 static int
 iflib_fl_setup(iflib_fl_t fl)
 {
 	iflib_rxq_t rxq = fl->ifl_rxq;
 	if_ctx_t ctx = rxq->ifr_ctx;
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 
 	/*
 	** Free current RX buffer structs and their mbufs
 	*/
 	iflib_fl_bufs_free(fl);
 	/* Now replenish the mbufs */
 	MPASS(fl->ifl_credits == 0);
 	/*
 	 * XXX don't set the max_frame_size to larger
 	 * than the hardware can handle
 	 */
 	if (sctx->isc_max_frame_size <= 2048)
 		fl->ifl_buf_size = MCLBYTES;
 	else if (sctx->isc_max_frame_size <= 4096)
 		fl->ifl_buf_size = MJUMPAGESIZE;
 	else if (sctx->isc_max_frame_size <= 9216)
 		fl->ifl_buf_size = MJUM9BYTES;
 	else
 		fl->ifl_buf_size = MJUM16BYTES;
 	if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size)
 		ctx->ifc_max_fl_buf_size = fl->ifl_buf_size;
 	fl->ifl_cltype = m_gettype(fl->ifl_buf_size);
 	fl->ifl_zone = m_getzone(fl->ifl_buf_size);
 
 
 	/* avoid pre-allocating zillions of clusters to an idle card
 	 * potentially speeding up attach
 	 */
 	_iflib_fl_refill(ctx, fl, min(128, fl->ifl_size));
 	MPASS(min(128, fl->ifl_size) == fl->ifl_credits);
 	if (min(128, fl->ifl_size) != fl->ifl_credits)
 		return (ENOBUFS);
 	/*
 	 * handle failure
 	 */
 	MPASS(rxq != NULL);
 	MPASS(fl->ifl_ifdi != NULL);
 	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	return (0);
 }
 
 /*********************************************************************
  *
  *  Free receive ring data structures
  *
  **********************************************************************/
 static void
 iflib_rx_sds_free(iflib_rxq_t rxq)
 {
 	iflib_fl_t fl;
 	int i;
 
 	if (rxq->ifr_fl != NULL) {
 		for (i = 0; i < rxq->ifr_nfl; i++) {
 			fl = &rxq->ifr_fl[i];
 			if (fl->ifl_desc_tag != NULL) {
 				bus_dma_tag_destroy(fl->ifl_desc_tag);
 				fl->ifl_desc_tag = NULL;
 			}
 		}
 		if (rxq->ifr_fl->ifl_sds != NULL)
 			free(rxq->ifr_fl->ifl_sds, M_IFLIB);
 
 		free(rxq->ifr_fl, M_IFLIB);
 		rxq->ifr_fl = NULL;
 		rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0;
 	}
 }
 
 /*
  * MI independent logic
  *
  */
 static void
 iflib_timer(void *arg)
 {
 	iflib_txq_t txq = arg;
 	if_ctx_t ctx = txq->ift_ctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 
 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
 		return;
 	/*
 	** Check on the state of the TX queue(s), this
 	** can be done without the lock because its RO
 	** and the HUNG state will be static if set.
 	*/
 	IFDI_TIMER(ctx, txq->ift_id);
 	if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) &&
 		(ctx->ifc_pause_frames == 0))
 		goto hung;
 
 	if (TXQ_AVAIL(txq) <= 2*scctx->isc_tx_nsegments ||
 	    ifmp_ring_is_stalled(txq->ift_br[0]))
 		GROUPTASK_ENQUEUE(&txq->ift_task);
 
 	ctx->ifc_pause_frames = 0;
 	if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) 
 		callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu);
 	return;
 hung:
 	CTX_LOCK(ctx);
 	if_setdrvflagbits(ctx->ifc_ifp, 0, IFF_DRV_RUNNING);
 	device_printf(ctx->ifc_dev,  "TX(%d) desc avail = %d, pidx = %d\n",
 				  txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx);
 
 	IFDI_WATCHDOG_RESET(ctx);
 	ctx->ifc_watchdog_events++;
 	ctx->ifc_pause_frames = 0;
 
 	iflib_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 static void
 iflib_init_locked(if_ctx_t ctx)
 {
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	if_t ifp = ctx->ifc_ifp;
 	iflib_fl_t fl;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	int i, j, tx_ip_csum_flags, tx_ip6_csum_flags;
 
 
 	if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 	IFDI_INTR_DISABLE(ctx);
 
 	tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP);
 	tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP);
 	/* Set hardware offload abilities */
 	if_clearhwassist(ifp);
 	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
 		if_sethwassistbits(ifp, tx_ip_csum_flags, 0);
 	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
 		if_sethwassistbits(ifp,  tx_ip6_csum_flags, 0);
 	if (if_getcapenable(ifp) & IFCAP_TSO4)
 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
 	if (if_getcapenable(ifp) & IFCAP_TSO6)
 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
 
 	for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) {
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 		callout_stop(&txq->ift_db_check);
 		CALLOUT_UNLOCK(txq);
 		iflib_netmap_txq_init(ctx, txq);
 	}
 	for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) {
 		iflib_netmap_rxq_init(ctx, rxq);
 	}
 #ifdef INVARIANTS
 	i = if_getdrvflags(ifp);
 #endif
 	IFDI_INIT(ctx);
 	MPASS(if_getdrvflags(ifp) == i);
 	for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) {
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
 			if (iflib_fl_setup(fl)) {
 				device_printf(ctx->ifc_dev, "freelist setup failed - check cluster settings\n");
 				goto done;
 			}
 		}
 	}
 	done:
 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
 	IFDI_INTR_ENABLE(ctx);
 	txq = ctx->ifc_txqs;
 	for (i = 0; i < sctx->isc_ntxqsets; i++, txq++)
 		callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq,
 			txq->ift_timer.c_cpu);
 }
 
 static int
 iflib_media_change(if_t ifp)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	int err;
 
 	CTX_LOCK(ctx);
 	if ((err = IFDI_MEDIA_CHANGE(ctx)) == 0)
 		iflib_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 	return (err);
 }
 
 static void
 iflib_media_status(if_t ifp, struct ifmediareq *ifmr)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 
 	CTX_LOCK(ctx);
 	IFDI_UPDATE_ADMIN_STATUS(ctx);
 	IFDI_MEDIA_STATUS(ctx, ifmr);
 	CTX_UNLOCK(ctx);
 }
 
 static void
 iflib_stop(if_ctx_t ctx)
 {
 	iflib_txq_t txq = ctx->ifc_txqs;
 	iflib_rxq_t rxq = ctx->ifc_rxqs;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	iflib_dma_info_t di;
 	iflib_fl_t fl;
 	int i, j;
 
 	/* Tell the stack that the interface is no longer active */
 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 
 	IFDI_INTR_DISABLE(ctx);
 	DELAY(100000);
 	IFDI_STOP(ctx);
 	DELAY(100000);
 
 	iflib_debug_reset();
 	/* Wait for current tx queue users to exit to disarm watchdog timer. */
 	for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) {
 		/* make sure all transmitters have completed before proceeding XXX */
 
 		/* clean any enqueued buffers */
 		iflib_ifmp_purge(txq);
 		/* Free any existing tx buffers. */
 		for (j = 0; j < txq->ift_size; j++) {
 			iflib_txsd_free(ctx, txq, j);
 		}
 		txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0;
 		txq->ift_in_use = txq->ift_gen = txq->ift_cidx = txq->ift_pidx = txq->ift_no_desc_avail = 0;
 		txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0;
 		txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0;
 		txq->ift_pullups = 0;
 		ifmp_ring_reset_stats(txq->ift_br[0]);
 		for (j = 0, di = txq->ift_ifdi; j < ctx->ifc_nhwtxqs; j++, di++)
 			bzero((void *)di->idi_vaddr, di->idi_size);
 	}
 	for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) {
 		/* make sure all transmitters have completed before proceeding XXX */
 
 		for (j = 0, di = txq->ift_ifdi; j < ctx->ifc_nhwrxqs; j++, di++)
 			bzero((void *)di->idi_vaddr, di->idi_size);
 		/* also resets the free lists pidx/cidx */
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
 			iflib_fl_bufs_free(fl);
 	}
 }
 
 static iflib_rxsd_t
 rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int *cltype, int unload)
 {
 	int flid, cidx;
 	iflib_rxsd_t sd;
 	iflib_fl_t fl;
 	iflib_dma_info_t di;
 
 	flid = irf->irf_flid;
 	cidx = irf->irf_idx;
 	fl = &rxq->ifr_fl[flid];
 	fl->ifl_credits--;
 #if MEMORY_LOGGING
 	fl->ifl_m_dequeued++;
 	if (cltype)
 		fl->ifl_cl_dequeued++;
 #endif
 	sd = &fl->ifl_sds[cidx];
 	di = fl->ifl_ifdi;
 	bus_dmamap_sync(di->idi_tag, di->idi_map,
 			BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
 	/* not valid assert if bxe really does SGE from non-contiguous elements */
 	MPASS(fl->ifl_cidx == cidx);
 	if (unload)
 		bus_dmamap_unload(fl->ifl_desc_tag, sd->ifsd_map);
 
 	if (__predict_false(++fl->ifl_cidx == fl->ifl_size)) {
 		fl->ifl_cidx = 0;
 		fl->ifl_gen = 0;
 	}
 	/* YES ick */
 	if (cltype)
 		*cltype = fl->ifl_cltype;
 	return (sd);
 }
 
 static struct mbuf *
 assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri)
 {
 	int i, padlen , flags, cltype;
 	struct mbuf *m, *mh, *mt;
 	iflib_rxsd_t sd;
 	caddr_t cl;
 
 	i = 0;
 	mh = NULL;
 	do {
 		sd = rxd_frag_to_sd(rxq, &ri->iri_frags[i], &cltype, TRUE);
 
 		MPASS(sd->ifsd_cl != NULL);
 		MPASS(sd->ifsd_m != NULL);
 
 		/* Don't include zero-length frags */
 		if (ri->iri_frags[i].irf_len == 0) {
 			/* XXX we can save the cluster here, but not the mbuf */
 			m_init(sd->ifsd_m, M_NOWAIT, MT_DATA, 0);
 			m_free(sd->ifsd_m);
 			sd->ifsd_m = NULL;
 			continue;
 		}
 
 		m = sd->ifsd_m;
 		if (mh == NULL) {
 			flags = M_PKTHDR|M_EXT;
 			mh = mt = m;
 			padlen = ri->iri_pad;
 		} else {
 			flags = M_EXT;
 			mt->m_next = m;
 			mt = m;
 			/* assuming padding is only on the first fragment */
 			padlen = 0;
 		}
 		sd->ifsd_m = NULL;
 		cl = sd->ifsd_cl;
 		sd->ifsd_cl = NULL;
 
 		/* Can these two be made one ? */
 		m_init(m, M_NOWAIT, MT_DATA, flags);
 		m_cljset(m, cl, cltype);
 		/*
 		 * These must follow m_init and m_cljset
 		 */
 		m->m_data += padlen;
 		ri->iri_len -= padlen;
 		m->m_len = ri->iri_frags[i].irf_len;
 	} while (++i < ri->iri_nfrags);
 
 	return (mh);
 }
 
 /*
  * Process one software descriptor
  */
 static struct mbuf *
 iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri)
 {
 	struct mbuf *m;
 	iflib_rxsd_t sd;
 
 	/* should I merge this back in now that the two paths are basically duplicated? */
 	if (ri->iri_nfrags == 1 &&
 	    ri->iri_frags[0].irf_len <= IFLIB_RX_COPY_THRESH) {
 		sd = rxd_frag_to_sd(rxq, &ri->iri_frags[0], NULL, FALSE);
 		m = sd->ifsd_m;
 		sd->ifsd_m = NULL;
 		m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);
 		memcpy(m->m_data, sd->ifsd_cl, ri->iri_len);
 		m->m_len = ri->iri_frags[0].irf_len;
        } else {
 		m = assemble_segments(rxq, ri);
 	}
 	m->m_pkthdr.len = ri->iri_len;
 	m->m_pkthdr.rcvif = ri->iri_ifp;
 	m->m_flags |= ri->iri_flags;
 	m->m_pkthdr.ether_vtag = ri->iri_vtag;
 	m->m_pkthdr.flowid = ri->iri_flowid;
 	M_HASHTYPE_SET(m, ri->iri_rsstype);
 	m->m_pkthdr.csum_flags = ri->iri_csum_flags;
 	m->m_pkthdr.csum_data = ri->iri_csum_data;
 	return (m);
 }
 
 static bool
 iflib_rxeof(iflib_rxq_t rxq, int budget)
 {
 	if_ctx_t ctx = rxq->ifr_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	int avail, i;
 	uint16_t *cidxp;
 	struct if_rxd_info ri;
 	int err, budget_left, rx_bytes, rx_pkts;
 	iflib_fl_t fl;
 	struct ifnet *ifp;
 	int lro_enabled;
 	/*
 	 * XXX early demux data packets so that if_input processing only handles
 	 * acks in interrupt context
 	 */
 	struct mbuf *m, *mh, *mt;
 
 	if (netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &budget)) {
 		return (FALSE);
 	}
 
 	mh = mt = NULL;
 	MPASS(budget > 0);
 	rx_pkts	= rx_bytes = 0;
 	if (sctx->isc_flags & IFLIB_HAS_RXCQ)
 		cidxp = &rxq->ifr_cq_cidx;
 	else
 		cidxp = &rxq->ifr_fl[0].ifl_cidx;
 	if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) {
 		for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
 			__iflib_fl_refill_lt(ctx, fl, budget + 8);
 		DBG_COUNTER_INC(rx_unavail);
 		return (false);
 	}
 
 	for (budget_left = budget; (budget_left > 0) && (avail > 0); budget_left--, avail--) {
 		if (__predict_false(!CTX_ACTIVE(ctx))) {
 			DBG_COUNTER_INC(rx_ctx_inactive);
 			break;
 		}
 		/*
 		 * Reset client set fields to their default values
 		 */
 		bzero(&ri, sizeof(ri));
 		ri.iri_qsidx = rxq->ifr_id;
 		ri.iri_cidx = *cidxp;
 		ri.iri_ifp = ctx->ifc_ifp;
 		ri.iri_frags = rxq->ifr_frags;
 		err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
 
 		/* in lieu of handling correctly - make sure it isn't being unhandled */
 		MPASS(err == 0);
 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 			*cidxp = ri.iri_cidx;
 			/* Update our consumer index */
 			while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0]) {
 				rxq->ifr_cq_cidx -= scctx->isc_nrxd[0];
 				rxq->ifr_cq_gen = 0;
 			}
 			/* was this only a completion queue message? */
 			if (__predict_false(ri.iri_nfrags == 0))
 				continue;
 		}
 		MPASS(ri.iri_nfrags != 0);
 		MPASS(ri.iri_len != 0);
 
 		/* will advance the cidx on the corresponding free lists */
 		m = iflib_rxd_pkt_get(rxq, &ri);
 		if (avail == 0 && budget_left)
 			avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left);
 
 		if (__predict_false(m == NULL)) {
 			DBG_COUNTER_INC(rx_mbuf_null);
 			continue;
 		}
 		/* imm_pkt: -- cxgb */
 		if (mh == NULL)
 			mh = mt = m;
 		else {
 			mt->m_nextpkt = m;
 			mt = m;
 		}
 	}
 	/* make sure that we can refill faster than drain */
 	for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
 		__iflib_fl_refill_lt(ctx, fl, budget + 8);
 
 	ifp = ctx->ifc_ifp;
 	lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO);
 	while (mh != NULL) {
 		m = mh;
 		mh = mh->m_nextpkt;
 		m->m_nextpkt = NULL;
 		rx_bytes += m->m_pkthdr.len;
 		rx_pkts++;
 #if defined(INET6) || defined(INET)
 		if (lro_enabled && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0)
 			continue;
 #endif
 		DBG_COUNTER_INC(rx_if_input);
 		ifp->if_input(ifp, m);
 	}
 
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
 
 	/*
 	 * Flush any outstanding LRO work
 	 */
 #if defined(INET6) || defined(INET)
 	tcp_lro_flush_all(&rxq->ifr_lc);
 #endif
 	if (avail)
 		return true;
 	return (iflib_rxd_avail(ctx, rxq, *cidxp, 1));
 }
 
 #define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags)
 #define M_HAS_VLANTAG(m) (m->m_flags & M_VLANTAG)
 #define TXQ_MAX_DB_DEFERRED(size) (size >> 5)
 #define TXQ_MAX_DB_CONSUMED(size) (size >> 4)
 
 static __inline void
 iflib_txd_db_check(if_ctx_t ctx, iflib_txq_t txq, int ring)
 {
 	uint32_t dbval;
 
 	if (ring || txq->ift_db_pending >=
 	    TXQ_MAX_DB_DEFERRED(txq->ift_size)) {
 
 		/* the lock will only ever be contended in the !min_latency case */
 		if (!TXDB_TRYLOCK(txq))
 			return;
 		dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx;
 		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval);
 		txq->ift_db_pending = txq->ift_npending = 0;
 		TXDB_UNLOCK(txq);
 	}
 }
 
 static void
 iflib_txd_deferred_db_check(void * arg)
 {
 	iflib_txq_t txq = arg;
 
 	/* simple non-zero boolean so use bitwise OR */
 	if ((txq->ift_db_pending | txq->ift_npending) &&
 	    txq->ift_db_pending >= txq->ift_db_pending_queued)
 		iflib_txd_db_check(txq->ift_ctx, txq, TRUE);
 	txq->ift_db_pending_queued = 0;
 	if (ifmp_ring_is_stalled(txq->ift_br[0]))
 		iflib_txq_check_drain(txq, 4);
 }
 
 #ifdef PKT_DEBUG
 static void
 print_pkt(if_pkt_info_t pi)
 {
 	printf("pi len:  %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n",
 	       pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs, pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx);
 	printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n",
 	       pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz, pi->ipi_mflags, pi->ipi_vtag);
 	printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n",
 	       pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto);
 }
 #endif
 
 #define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO)
 #define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO)
 
 static int
 iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
 {
 	if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx;
 	struct ether_vlan_header *eh;
 	struct mbuf *m, *n;
 
 	n = m = *mp;
 	if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) &&
 	    M_WRITABLE(m) == 0) {
 		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
 			return (ENOMEM);
 		} else {
 			m_freem(*mp);
 			n = *mp = m;
 		}
 	}
 
 	/*
 	 * Determine where frame payload starts.
 	 * Jump over vlan headers if already present,
 	 * helpful for QinQ too.
 	 */
 	if (__predict_false(m->m_len < sizeof(*eh))) {
 		txq->ift_pullups++;
 		if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL))
 			return (ENOMEM);
 	}
 	eh = mtod(m, struct ether_vlan_header *);
 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 		pi->ipi_etype = ntohs(eh->evl_proto);
 		pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 	} else {
 		pi->ipi_etype = ntohs(eh->evl_encap_proto);
 		pi->ipi_ehdrlen = ETHER_HDR_LEN;
 	}
 
 	switch (pi->ipi_etype) {
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		struct ip *ip = NULL;
 		struct tcphdr *th = NULL;
 		int minthlen;
 
 		minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th));
 		if (__predict_false(m->m_len < minthlen)) {
 			/*
 			 * if this code bloat is causing too much of a hit
 			 * move it to a separate function and mark it noinline
 			 */
 			if (m->m_len == pi->ipi_ehdrlen) {
 				n = m->m_next;
 				MPASS(n);
 				if (n->m_len >= sizeof(*ip))  {
 					ip = (struct ip *)n->m_data;
 					if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th))
 						th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 				} else {
 					txq->ift_pullups++;
 					if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
 						return (ENOMEM);
 					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 				}
 			} else {
 				txq->ift_pullups++;
 				if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
 					return (ENOMEM);
 				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 				if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
 					th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 			}
 		} else {
 			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 			if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
 				th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 		}
 		pi->ipi_ip_hlen = ip->ip_hl << 2;
 		pi->ipi_ipproto = ip->ip_p;
 		pi->ipi_flags |= IPI_TX_IPV4;
 
 		if (pi->ipi_csum_flags & CSUM_IP)
                        ip->ip_sum = 0;
 
 		if (pi->ipi_ipproto == IPPROTO_TCP) {
 			if (__predict_false(th == NULL)) {
 				txq->ift_pullups++;
 				if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL))
 					return (ENOMEM);
 				th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen);
 			}
 			pi->ipi_tcp_hflags = th->th_flags;
 			pi->ipi_tcp_hlen = th->th_off << 2;
 			pi->ipi_tcp_seq = th->th_seq;
 		}
 		if (IS_TSO4(pi)) {
 			if (__predict_false(ip->ip_p != IPPROTO_TCP))
 				return (ENXIO);
 			th->th_sum = in_pseudo(ip->ip_src.s_addr,
 					       ip->ip_dst.s_addr, htons(IPPROTO_TCP));
 			pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
 			if (sctx->isc_flags & IFLIB_TSO_INIT_IP) {
 				ip->ip_sum = 0;
 				ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz);
 			}
 		}
 		break;
 	}
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 	{
 		struct ip6_hdr *ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
 		struct tcphdr *th;
 		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
 
 		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
 			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
 				return (ENOMEM);
 		}
 		th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
 
 		/* XXX-BZ this will go badly in case of ext hdrs. */
 		pi->ipi_ipproto = ip6->ip6_nxt;
 		pi->ipi_flags |= IPI_TX_IPV6;
 
 		if (pi->ipi_ipproto == IPPROTO_TCP) {
 			if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) {
 				if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL))
 					return (ENOMEM);
 			}
 			pi->ipi_tcp_hflags = th->th_flags;
 			pi->ipi_tcp_hlen = th->th_off << 2;
 		}
 		if (IS_TSO6(pi)) {
 
 			if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP))
 				return (ENXIO);
 			/*
 			 * The corresponding flag is set by the stack in the IPv4
 			 * TSO case, but not in IPv6 (at least in FreeBSD 10.2).
 			 * So, set it here because the rest of the flow requires it.
 			 */
 			pi->ipi_csum_flags |= CSUM_TCP_IPV6;
 			th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
 			pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
 		}
 		break;
 	}
 #endif
 	default:
 		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
 		pi->ipi_ip_hlen = 0;
 		break;
 	}
 	*mp = m;
 
 	return (0);
 }
 
 static  __noinline  struct mbuf *
 collapse_pkthdr(struct mbuf *m0)
 {
 	struct mbuf *m, *m_next, *tmp;
 
 	m = m0;
 	m_next = m->m_next;
 	while (m_next != NULL && m_next->m_len == 0) {
 		m = m_next;
 		m->m_next = NULL;
 		m_free(m);
 		m_next = m_next->m_next;
 	}
 	m = m0;
 	m->m_next = m_next;
 	if ((m_next->m_flags & M_EXT) == 0) {
 		m = m_defrag(m, M_NOWAIT);
 	} else {
 		tmp = m_next->m_next;
 		memcpy(m_next, m, MPKTHSIZE);
 		m = m_next;
 		m->m_next = tmp;
 	}
 	return (m);
 }
 
 /*
  * If dodgy hardware rejects the scatter gather chain we've handed it
  * we'll need to remove the mbuf chain from ifsg_m[] before we can add the
  * m_defrag'd mbufs
  */
 static __noinline struct mbuf *
 iflib_remove_mbuf(iflib_txq_t txq)
 {
 	int ntxd, i, pidx;
 	struct mbuf *m, *mh, **ifsd_m;
 
 	pidx = txq->ift_pidx;
 	ifsd_m = txq->ift_sds.ifsd_m;
 	ntxd = txq->ift_size;
 	mh = m = ifsd_m[pidx];
 	ifsd_m[pidx] = NULL;
 #if MEMORY_LOGGING
 	txq->ift_dequeued++;
 #endif
 	i = 1;
 
 	while (m) {
 		ifsd_m[(pidx + i) & (ntxd -1)] = NULL;
 #if MEMORY_LOGGING
 		txq->ift_dequeued++;
 #endif
 		m = m->m_next;
 		i++;
 	}
 	return (mh);
 }
 
 static int
 iflib_busdma_load_mbuf_sg(iflib_txq_t txq, bus_dma_tag_t tag, bus_dmamap_t map,
 			  struct mbuf **m0, bus_dma_segment_t *segs, int *nsegs,
 			  int max_segs, int flags)
 {
 	if_ctx_t ctx;
 	if_shared_ctx_t		sctx;
 	if_softc_ctx_t		scctx;
 	int i, next, pidx, mask, err, maxsegsz, ntxd, count;
 	struct mbuf *m, *tmp, **ifsd_m, **mp;
 
 	m = *m0;
 
 	/*
 	 * Please don't ever do this
 	 */
 	if (__predict_false(m->m_len == 0))
 		*m0 = m = collapse_pkthdr(m);
 
 	ctx = txq->ift_ctx;
 	sctx = ctx->ifc_sctx;
 	scctx = &ctx->ifc_softc_ctx;
 	ifsd_m = txq->ift_sds.ifsd_m;
 	ntxd = txq->ift_size;
 	pidx = txq->ift_pidx;
 	if (map != NULL) {
 		uint8_t *ifsd_flags = txq->ift_sds.ifsd_flags;
 
 		err = bus_dmamap_load_mbuf_sg(tag, map,
 					      *m0, segs, nsegs, BUS_DMA_NOWAIT);
 		if (err)
 			return (err);
 		ifsd_flags[pidx] |= TX_SW_DESC_MAPPED;
 		i = 0;
 		next = pidx;
 		mask = (txq->ift_size-1);
 		m = *m0;
 		do {
 			mp = &ifsd_m[next];
 			*mp = m;
 			m = m->m_next;
 			if (__predict_false((*mp)->m_len == 0)) {
 				m_free(*mp);
 				*mp = NULL;
 			} else
 				next = (pidx + i) & (ntxd-1);
 		} while (m != NULL);
 	} else {
 		int buflen, sgsize, max_sgsize;
 		vm_offset_t vaddr;
 		vm_paddr_t curaddr;
 
 		count = i = 0;
 		maxsegsz = sctx->isc_tx_maxsize;
 		m = *m0;
 		do {
 			if (__predict_false(m->m_len <= 0)) {
 				tmp = m;
 				m = m->m_next;
 				tmp->m_next = NULL;
 				m_free(tmp);
 				continue;
 			}
 			buflen = m->m_len;
 			vaddr = (vm_offset_t)m->m_data;
 			/*
 			 * see if we can't be smarter about physically
 			 * contiguous mappings
 			 */
 			next = (pidx + count) & (ntxd-1);
 			MPASS(ifsd_m[next] == NULL);
 #if MEMORY_LOGGING
 			txq->ift_enqueued++;
 #endif
 			ifsd_m[next] = m;
 			while (buflen > 0) {
 				max_sgsize = MIN(buflen, maxsegsz);
 				curaddr = pmap_kextract(vaddr);
 				sgsize = PAGE_SIZE - (curaddr & PAGE_MASK);
 				sgsize = MIN(sgsize, max_sgsize);
 				segs[i].ds_addr = curaddr;
 				segs[i].ds_len = sgsize;
 				vaddr += sgsize;
 				buflen -= sgsize;
 				i++;
 				if (i >= max_segs)
 					goto err;
 			}
 			count++;
 			tmp = m;
 			m = m->m_next;
 		} while (m != NULL);
 		*nsegs = i;
 	}
 	return (0);
 err:
 	*m0 = iflib_remove_mbuf(txq);
 	return (EFBIG);
 }
 
 static int
 iflib_encap(iflib_txq_t txq, struct mbuf **m_headp)
 {
 	if_ctx_t		ctx;
 	if_shared_ctx_t		sctx;
 	if_softc_ctx_t		scctx;
 	bus_dma_segment_t	*segs;
 	struct mbuf		*m_head;
 	bus_dmamap_t		map;
 	struct if_pkt_info	pi;
 	int remap = 0;
 	int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd;
 	bus_dma_tag_t desc_tag;
 
 	segs = txq->ift_segs;
 	ctx = txq->ift_ctx;
 	sctx = ctx->ifc_sctx;
 	scctx = &ctx->ifc_softc_ctx;
 	segs = txq->ift_segs;
 	ntxd = txq->ift_size;
 	m_head = *m_headp;
 	map = NULL;
 
 	/*
 	 * If we're doing TSO the next descriptor to clean may be quite far ahead
 	 */
 	cidx = txq->ift_cidx;
 	pidx = txq->ift_pidx;
 	next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1);
 
 	/* prefetch the next cache line of mbuf pointers and flags */
 	prefetch(&txq->ift_sds.ifsd_m[next]);
 	if (txq->ift_sds.ifsd_map != NULL) {
 		prefetch(&txq->ift_sds.ifsd_map[next]);
 		map = txq->ift_sds.ifsd_map[pidx];
 		next = (cidx + CACHE_LINE_SIZE) & (ntxd-1);
 		prefetch(&txq->ift_sds.ifsd_flags[next]);
 	}
 
 
 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
 		desc_tag = txq->ift_tso_desc_tag;
 		max_segs = scctx->isc_tx_tso_segments_max;
 	} else {
 		desc_tag = txq->ift_desc_tag;
 		max_segs = scctx->isc_tx_nsegments;
 	}
 	m_head = *m_headp;
 	bzero(&pi, sizeof(pi));
 	pi.ipi_len = m_head->m_pkthdr.len;
 	pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST));
 	pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags;
 	pi.ipi_vtag = (m_head->m_flags & M_VLANTAG) ? m_head->m_pkthdr.ether_vtag : 0;
 	pi.ipi_pidx = pidx;
 	pi.ipi_qsidx = txq->ift_id;
 
 	/* deliberate bitwise OR to make one condition */
 	if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) {
 		if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0))
 			return (err);
 		m_head = *m_headp;
 	}
 
 retry:
 	err = iflib_busdma_load_mbuf_sg(txq, desc_tag, map, m_headp, segs, &nsegs, max_segs, BUS_DMA_NOWAIT);
 defrag:
 	if (__predict_false(err)) {
 		switch (err) {
 		case EFBIG:
 			/* try collapse once and defrag once */
 			if (remap == 0)
 				m_head = m_collapse(*m_headp, M_NOWAIT, max_segs);
 			if (remap == 1)
 				m_head = m_defrag(*m_headp, M_NOWAIT);
 			remap++;
 			if (__predict_false(m_head == NULL))
 				goto defrag_failed;
 			txq->ift_mbuf_defrag++;
 			*m_headp = m_head;
 			goto retry;
 			break;
 		case ENOMEM:
 			txq->ift_no_tx_dma_setup++;
 			break;
 		default:
 			txq->ift_no_tx_dma_setup++;
 			m_freem(*m_headp);
 			DBG_COUNTER_INC(tx_frees);
 			*m_headp = NULL;
 			break;
 		}
 		txq->ift_map_failed++;
 		DBG_COUNTER_INC(encap_load_mbuf_fail);
 		return (err);
 	}
 
 	/*
 	 * XXX assumes a 1 to 1 relationship between segments and
 	 *        descriptors - this does not hold true on all drivers, e.g.
 	 *        cxgb
 	 */
 	if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
 		txq->ift_no_desc_avail++;
 		if (map != NULL)
 			bus_dmamap_unload(desc_tag, map);
 		DBG_COUNTER_INC(encap_txq_avail_fail);
 		if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
 			GROUPTASK_ENQUEUE(&txq->ift_task);
 		return (ENOBUFS);
 	}
 	pi.ipi_segs = segs;
 	pi.ipi_nsegs = nsegs;
 
 	MPASS(pidx >= 0 && pidx < txq->ift_size);
 #ifdef PKT_DEBUG
 	print_pkt(&pi);
 #endif
 	if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) {
 		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 						BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
 		DBG_COUNTER_INC(tx_encap);
 		MPASS(pi.ipi_new_pidx >= 0 &&
 		    pi.ipi_new_pidx < txq->ift_size);
 
 		ndesc = pi.ipi_new_pidx - pi.ipi_pidx;
 		if (pi.ipi_new_pidx < pi.ipi_pidx) {
 			ndesc += txq->ift_size;
 			txq->ift_gen = 1;
 		}
 		/*
 		 * drivers can need as many as 
 		 * two sentinels
 		 */
 		MPASS(ndesc <= pi.ipi_nsegs + 2);
 		MPASS(pi.ipi_new_pidx != pidx);
 		MPASS(ndesc > 0);
 		txq->ift_in_use += ndesc;
 		/*
 		 * We update the last software descriptor again here because there may
 		 * be a sentinel and/or there may be more mbufs than segments
 		 */
 		txq->ift_pidx = pi.ipi_new_pidx;
 		txq->ift_npending += pi.ipi_ndescs;
 	} else if (__predict_false(err == EFBIG && remap < 2)) {
 		*m_headp = m_head = iflib_remove_mbuf(txq);
 		remap = 1;
 		txq->ift_txd_encap_efbig++;
 		goto defrag;
 	} else
 		DBG_COUNTER_INC(encap_txd_encap_fail);
 	return (err);
 
 defrag_failed:
 	txq->ift_mbuf_defrag_failed++;
 	txq->ift_map_failed++;
 	m_freem(*m_headp);
 	DBG_COUNTER_INC(tx_frees);
 	*m_headp = NULL;
 	return (ENOMEM);
 }
 
 /* forward compatibility for cxgb */
 #define FIRST_QSET(ctx) 0
 
 #define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets)
 #define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets)
 #define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
 #define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))
 #define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
 #define MAX_TX_DESC(ctx) ((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max)
 
 
 
 /* if there are more than TXQ_MIN_OCCUPANCY packets pending we consider deferring
  * doorbell writes
  *
  * ORing with 2 assures that min occupancy is never less than 2 without any conditional logic
  */
 #define TXQ_MIN_OCCUPANCY(size) ((size >> 6)| 0x2)
 
 static inline int
 iflib_txq_min_occupancy(iflib_txq_t txq)
 {
 	if_ctx_t ctx;
 
 	ctx = txq->ift_ctx;
 	return (get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx,
 	    txq->ift_gen) < TXQ_MIN_OCCUPANCY(txq->ift_size) +
 	    MAX_TX_DESC(ctx));
 }
 
 static void
 iflib_tx_desc_free(iflib_txq_t txq, int n)
 {
 	int hasmap;
 	uint32_t qsize, cidx, mask, gen;
 	struct mbuf *m, **ifsd_m;
 	uint8_t *ifsd_flags;
 	bus_dmamap_t *ifsd_map;
 
 	cidx = txq->ift_cidx;
 	gen = txq->ift_gen;
 	qsize = txq->ift_size;
 	mask = qsize-1;
 	hasmap = txq->ift_sds.ifsd_map != NULL;
 	ifsd_flags = txq->ift_sds.ifsd_flags;
 	ifsd_m = txq->ift_sds.ifsd_m;
 	ifsd_map = txq->ift_sds.ifsd_map;
 
 	while (n--) {
 		prefetch(ifsd_m[(cidx + 3) & mask]);
 		prefetch(ifsd_m[(cidx + 4) & mask]);
 
 		if (ifsd_m[cidx] != NULL) {
 			prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]);
 			prefetch(&ifsd_flags[(cidx + CACHE_PTR_INCREMENT) & mask]);
 			if (hasmap && (ifsd_flags[cidx] & TX_SW_DESC_MAPPED)) {
 				/*
 				 * does it matter if it's not the TSO tag? If so we'll
 				 * have to add the type to flags
 				 */
 				bus_dmamap_unload(txq->ift_desc_tag, ifsd_map[cidx]);
 				ifsd_flags[cidx] &= ~TX_SW_DESC_MAPPED;
 			}
 			if ((m = ifsd_m[cidx]) != NULL) {
 				/* XXX we don't support any drivers that batch packets yet */
 				MPASS(m->m_nextpkt == NULL);
 
 				m_free(m);
 				ifsd_m[cidx] = NULL;
 #if MEMORY_LOGGING
 				txq->ift_dequeued++;
 #endif
 				DBG_COUNTER_INC(tx_frees);
 			}
 		}
 		if (__predict_false(++cidx == qsize)) {
 			cidx = 0;
 			gen = 0;
 		}
 	}
 	txq->ift_cidx = cidx;
 	txq->ift_gen = gen;
 }
 
 static __inline int
 iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh)
 {
 	int reclaim;
 	if_ctx_t ctx = txq->ift_ctx;
 
 	KASSERT(thresh >= 0, ("invalid threshold to reclaim"));
 	MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size);
 
 	/*
 	 * Need a rate-limiting check so that this isn't called every time
 	 */
 	iflib_tx_credits_update(ctx, txq);
 	reclaim = DESC_RECLAIMABLE(txq);
 
 	if (reclaim <= thresh /* + MAX_TX_DESC(txq->ift_ctx) */) {
 #ifdef INVARIANTS
 		if (iflib_verbose_debug) {
 			printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__,
 			       txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments,
 			       reclaim, thresh);
 
 		}
 #endif
 		return (0);
 	}
 	iflib_tx_desc_free(txq, reclaim);
 	txq->ift_cleaned += reclaim;
 	txq->ift_in_use -= reclaim;
 
 	if (txq->ift_active == FALSE)
 		txq->ift_active = TRUE;
 
 	return (reclaim);
 }
 
 static struct mbuf **
 _ring_peek_one(struct ifmp_ring *r, int cidx, int offset)
 {
 
 	return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (r->size-1)]));
 }
 
 static void
 iflib_txq_check_drain(iflib_txq_t txq, int budget)
 {
 
 	ifmp_ring_check_drainage(txq->ift_br[0], budget);
 }
 
 static uint32_t
 iflib_txq_can_drain(struct ifmp_ring *r)
 {
 	iflib_txq_t txq = r->cookie;
 	if_ctx_t ctx = txq->ift_ctx;
 
 	return ((TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2) ||
 		ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, txq->ift_cidx_processed, false));
 }
 
 static uint32_t
 iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
 {
 	iflib_txq_t txq = r->cookie;
 	if_ctx_t ctx = txq->ift_ctx;
 	if_t ifp = ctx->ifc_ifp;
 	struct mbuf **mp, *m;
 	int i, count, consumed, pkt_sent, bytes_sent, mcast_sent, avail, err, in_use_prev, desc_used;
 
 	if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) ||
 			    !LINK_ACTIVE(ctx))) {
 		DBG_COUNTER_INC(txq_drain_notready);
 		return (0);
 	}
 
 	avail = IDXDIFF(pidx, cidx, r->size);
 	if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) {
 		DBG_COUNTER_INC(txq_drain_flushing);
 		for (i = 0; i < avail; i++) {
 			m_free(r->items[(cidx + i) & (r->size-1)]);
 			r->items[(cidx + i) & (r->size-1)] = NULL;
 		}
 		return (avail);
 	}
 	iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
 	if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
 		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 		callout_stop(&txq->ift_db_check);
 		CALLOUT_UNLOCK(txq);
 		DBG_COUNTER_INC(txq_drain_oactive);
 		return (0);
 	}
 	consumed = mcast_sent = bytes_sent = pkt_sent = 0;
 	count = MIN(avail, TX_BATCH_SIZE);
 #ifdef INVARIANTS
 	if (iflib_verbose_debug)
 		printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__,
 		       avail, ctx->ifc_flags, TXQ_AVAIL(txq));
 #endif
 
 	for (desc_used = i = 0; i < count && TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2; i++) {
 		mp = _ring_peek_one(r, cidx, i);
 		MPASS(mp != NULL && *mp != NULL);
 		in_use_prev = txq->ift_in_use;
 		if ((err = iflib_encap(txq, mp)) == ENOBUFS) {
 			DBG_COUNTER_INC(txq_drain_encapfail);
 			/* no room - bail out */
 			break;
 		}
 		consumed++;
 		if (err) {
 			DBG_COUNTER_INC(txq_drain_encapfail);
 			/* we can't send this packet - skip it */
 			continue;
 		}
 		pkt_sent++;
 		m = *mp;
 		DBG_COUNTER_INC(tx_sent);
 		bytes_sent += m->m_pkthdr.len;
 		if (m->m_flags & M_MCAST)
 			mcast_sent++;
 
 		txq->ift_db_pending += (txq->ift_in_use - in_use_prev);
 		desc_used += (txq->ift_in_use - in_use_prev);
 		iflib_txd_db_check(ctx, txq, FALSE);
 		ETHER_BPF_MTAP(ifp, m);
 		if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
 			break;
 
 		if (desc_used >= TXQ_MAX_DB_CONSUMED(txq->ift_size))
 			break;
 	}
 
 	if ((iflib_min_tx_latency || iflib_txq_min_occupancy(txq)) && txq->ift_db_pending)
 		iflib_txd_db_check(ctx, txq, TRUE);
 	else if ((txq->ift_db_pending || TXQ_AVAIL(txq) <= MAX_TX_DESC(ctx) + 2) &&
 		 (callout_pending(&txq->ift_db_check) == 0)) {
 		txq->ift_db_pending_queued = txq->ift_db_pending;
 		callout_reset_on(&txq->ift_db_check, 1, iflib_txd_deferred_db_check,
 				 txq, txq->ift_db_check.c_cpu);
 	}
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
 	if (mcast_sent)
 		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent);
 #ifdef INVARIANTS
 	if (iflib_verbose_debug)
 		printf("consumed=%d\n", consumed);
 #endif
 	return (consumed);
 }
 
 static uint32_t
 iflib_txq_drain_always(struct ifmp_ring *r)
 {
 	return (1);
 }
 
 static uint32_t
 iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
 {
 	int i, avail;
 	struct mbuf **mp;
 	iflib_txq_t txq;
 
 	txq = r->cookie;
 
 	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 	CALLOUT_LOCK(txq);
 	callout_stop(&txq->ift_timer);
 	callout_stop(&txq->ift_db_check);
 	CALLOUT_UNLOCK(txq);
 
 	avail = IDXDIFF(pidx, cidx, r->size);
 	for (i = 0; i < avail; i++) {
 		mp = _ring_peek_one(r, cidx, i);
 		m_freem(*mp);
 	}
 	MPASS(ifmp_ring_is_stalled(r) == 0);
 	return (avail);
 }
 
 static void
 iflib_ifmp_purge(iflib_txq_t txq)
 {
 	struct ifmp_ring *r;
 
 	r = txq->ift_br[0];
 	r->drain = iflib_txq_drain_free;
 	r->can_drain = iflib_txq_drain_always;
 
 	ifmp_ring_check_drainage(r, r->size);
 
 	r->drain = iflib_txq_drain;
 	r->can_drain = iflib_txq_can_drain;
 }
 
 static void
 _task_fn_tx(void *context)
 {
 	iflib_txq_t txq = context;
 	if_ctx_t ctx = txq->ift_ctx;
 
 #ifdef IFLIB_DIAGNOSTICS
 	txq->ift_cpu_exec_count[curcpu]++;
 #endif
 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
 		return;
 	ifmp_ring_check_drainage(txq->ift_br[0], TX_BATCH_SIZE);
 }
 
 static void
 _task_fn_rx(void *context)
 {
 	iflib_rxq_t rxq = context;
 	if_ctx_t ctx = rxq->ifr_ctx;
 	bool more;
 	int rc;
 
 #ifdef IFLIB_DIAGNOSTICS
 	rxq->ifr_cpu_exec_count[curcpu]++;
 #endif
 	DBG_COUNTER_INC(task_fn_rxs);
 	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
 		return;
 
 	if ((more = iflib_rxeof(rxq, 16 /* XXX */)) == false) {
 		if (ctx->ifc_flags & IFC_LEGACY)
 			IFDI_INTR_ENABLE(ctx);
 		else {
 			DBG_COUNTER_INC(rx_intr_enables);
 			rc = IFDI_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
 			KASSERT(rc != ENOTSUP, ("MSI-X support requires queue_intr_enable, but not implemented in driver"));
 		}
 	}
 	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
 		return;
 	if (more)
 		GROUPTASK_ENQUEUE(&rxq->ifr_task);
 }
 
 static void
 _task_fn_admin(void *context)
 {
 	if_ctx_t ctx = context;
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 	iflib_txq_t txq;
 	int i;
 
 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
 		return;
 
 	CTX_LOCK(ctx);
 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 		CALLOUT_UNLOCK(txq);
 	}
 	IFDI_UPDATE_ADMIN_STATUS(ctx);
 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
 		callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu);
 	IFDI_LINK_INTR_ENABLE(ctx);
 	CTX_UNLOCK(ctx);
 
 	if (LINK_ACTIVE(ctx) == 0)
 		return;
 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
 		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
 }
 
 
 static void
 _task_fn_iov(void *context)
 {
 	if_ctx_t ctx = context;
 
 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
 		return;
 
 	CTX_LOCK(ctx);
 	IFDI_VFLR_HANDLE(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 static int
 iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
 {
 	int err;
 	if_int_delay_info_t info;
 	if_ctx_t ctx;
 
 	info = (if_int_delay_info_t)arg1;
 	ctx = info->iidi_ctx;
 	info->iidi_req = req;
 	info->iidi_oidp = oidp;
 	CTX_LOCK(ctx);
 	err = IFDI_SYSCTL_INT_DELAY(ctx, info);
 	CTX_UNLOCK(ctx);
 	return (err);
 }
 
 /*********************************************************************
  *
  *  IFNET FUNCTIONS
  *
  **********************************************************************/
 
 static void
 iflib_if_init_locked(if_ctx_t ctx)
 {
 	iflib_stop(ctx);
 	iflib_init_locked(ctx);
 }
 
 
 static void
 iflib_if_init(void *arg)
 {
 	if_ctx_t ctx = arg;
 
 	CTX_LOCK(ctx);
 	iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 static int
 iflib_if_transmit(if_t ifp, struct mbuf *m)
 {
 	if_ctx_t	ctx = if_getsoftc(ifp);
 
 	iflib_txq_t txq;
 	int err, qidx;
 
 	if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) {
 		DBG_COUNTER_INC(tx_frees);
 		m_freem(m);
 		return (ENOBUFS);
 	}
 
 	MPASS(m->m_nextpkt == NULL);
 	qidx = 0;
 	if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m))
 		qidx = QIDX(ctx, m);
 	/*
 	 * XXX calculate buf_ring based on flowid (divvy up bits?)
 	 */
 	txq = &ctx->ifc_txqs[qidx];
 
 #ifdef DRIVER_BACKPRESSURE
 	if (txq->ift_closed) {
 		while (m != NULL) {
 			next = m->m_nextpkt;
 			m->m_nextpkt = NULL;
 			m_freem(m);
 			m = next;
 		}
 		return (ENOBUFS);
 	}
 #endif
 #ifdef notyet
 	qidx = count = 0;
 	mp = marr;
 	next = m;
 	do {
 		count++;
 		next = next->m_nextpkt;
 	} while (next != NULL);
 
 	if (count > nitems(marr))
 		if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) {
 			/* XXX check nextpkt */
 			m_freem(m);
 			/* XXX simplify for now */
 			DBG_COUNTER_INC(tx_frees);
 			return (ENOBUFS);
 		}
 	for (next = m, i = 0; next != NULL; i++) {
 		mp[i] = next;
 		next = next->m_nextpkt;
 		mp[i]->m_nextpkt = NULL;
 	}
 #endif
 	DBG_COUNTER_INC(tx_seen);
 	err = ifmp_ring_enqueue(txq->ift_br[0], (void **)&m, 1, TX_BATCH_SIZE);
 
 	if (err) {
 		GROUPTASK_ENQUEUE(&txq->ift_task);
 		/* support forthcoming later */
 #ifdef DRIVER_BACKPRESSURE
 		txq->ift_closed = TRUE;
 #endif
 		ifmp_ring_check_drainage(txq->ift_br[0], TX_BATCH_SIZE);
 		m_freem(m);
 	} else if (TXQ_AVAIL(txq) < (txq->ift_size >> 1)) {
 		GROUPTASK_ENQUEUE(&txq->ift_task);
 	}
 
 	return (err);
 }
 
 static void
 iflib_if_qflush(if_t ifp)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	iflib_txq_t txq = ctx->ifc_txqs;
 	int i;
 
 	CTX_LOCK(ctx);
 	ctx->ifc_flags |= IFC_QFLUSH;
 	CTX_UNLOCK(ctx);
 	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
 		while (!(ifmp_ring_is_idle(txq->ift_br[0]) || ifmp_ring_is_stalled(txq->ift_br[0])))
 			iflib_txq_check_drain(txq, 0);
 	CTX_LOCK(ctx);
 	ctx->ifc_flags &= ~IFC_QFLUSH;
 	CTX_UNLOCK(ctx);
 
 	if_qflush(ifp);
 }
 
 
 #define IFCAP_FLAGS (IFCAP_TXCSUM_IPV6 | IFCAP_RXCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \
 		     IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_VLAN_HWTAGGING |	\
 		     IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | IFCAP_VLAN_HWTSO)
 
 static int
 iflib_if_ioctl(if_t ifp, u_long command, caddr_t data)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	struct ifreq	*ifr = (struct ifreq *)data;
 #if defined(INET) || defined(INET6)
 	struct ifaddr	*ifa = (struct ifaddr *)data;
 #endif
 	bool		avoid_reset = FALSE;
 	int		err = 0, reinit = 0, bits;
 
 	switch (command) {
 	case SIOCSIFADDR:
 #ifdef INET
 		if (ifa->ifa_addr->sa_family == AF_INET)
 			avoid_reset = TRUE;
 #endif
 #ifdef INET6
 		if (ifa->ifa_addr->sa_family == AF_INET6)
 			avoid_reset = TRUE;
 #endif
 		/*
 		** Calling init results in link renegotiation,
 		** so we avoid doing it when possible.
 		*/
 		if (avoid_reset) {
 			if_setflagbits(ifp, IFF_UP,0);
 			if (!(if_getdrvflags(ifp)& IFF_DRV_RUNNING))
 				reinit = 1;
 #ifdef INET
 			if (!(if_getflags(ifp) & IFF_NOARP))
 				arp_ifinit(ifp, ifa);
 #endif
 		} else
 			err = ether_ioctl(ifp, command, data);
 		break;
 	case SIOCSIFMTU:
 		CTX_LOCK(ctx);
 		if (ifr->ifr_mtu == if_getmtu(ifp)) {
 			CTX_UNLOCK(ctx);
 			break;
 		}
 		bits = if_getdrvflags(ifp);
 		/* stop the driver and free any clusters before proceeding */
 		iflib_stop(ctx);
 
 		if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) {
 			if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size)
 				ctx->ifc_flags |= IFC_MULTISEG;
 			else
 				ctx->ifc_flags &= ~IFC_MULTISEG;
 			err = if_setmtu(ifp, ifr->ifr_mtu);
 		}
 		iflib_init_locked(ctx);
 		if_setdrvflags(ifp, bits);
 		CTX_UNLOCK(ctx);
 		break;
 	case SIOCSIFFLAGS:
 		CTX_LOCK(ctx);
 		if (if_getflags(ifp) & IFF_UP) {
 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 				if ((if_getflags(ifp) ^ ctx->ifc_if_flags) &
 				    (IFF_PROMISC | IFF_ALLMULTI)) {
 					err = IFDI_PROMISC_SET(ctx, if_getflags(ifp));
 				}
 			} else
 				reinit = 1;
 		} else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 			iflib_stop(ctx);
 		}
 		ctx->ifc_if_flags = if_getflags(ifp);
 		CTX_UNLOCK(ctx);
 		break;
 
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 			CTX_LOCK(ctx);
 			IFDI_INTR_DISABLE(ctx);
 			IFDI_MULTI_SET(ctx);
 			IFDI_INTR_ENABLE(ctx);
 			CTX_UNLOCK(ctx);
 		}
 		break;
 	case SIOCSIFMEDIA:
 		CTX_LOCK(ctx);
 		IFDI_MEDIA_SET(ctx);
 		CTX_UNLOCK(ctx);
 		/* falls thru */
 	case SIOCGIFMEDIA:
 		err = ifmedia_ioctl(ifp, ifr, &ctx->ifc_media, command);
 		break;
 	case SIOCGI2C:
 	{
 		struct ifi2creq i2c;
 
 		err = copyin(ifr->ifr_data, &i2c, sizeof(i2c));
 		if (err != 0)
 			break;
 		if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
 			err = EINVAL;
 			break;
 		}
 		if (i2c.len > sizeof(i2c.data)) {
 			err = EINVAL;
 			break;
 		}
 
 		if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0)
 			err = copyout(&i2c, ifr->ifr_data, sizeof(i2c));
 		break;
 	}
 	case SIOCSIFCAP:
 	{
 		int mask, setmask;
 
 		mask = ifr->ifr_reqcap ^ if_getcapenable(ifp);
 		setmask = 0;
 #ifdef TCP_OFFLOAD
 		setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6);
 #endif
 		setmask |= (mask & IFCAP_FLAGS);
 
 		if (setmask  & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6))
 			setmask |= (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6);
 		if ((mask & IFCAP_WOL) &&
 		    (if_getcapabilities(ifp) & IFCAP_WOL) != 0)
 			setmask |= (mask & (IFCAP_WOL_MCAST|IFCAP_WOL_MAGIC));
 		if_vlancap(ifp);
 		/*
 		 * want to ensure that traffic has stopped before we change any of the flags
 		 */
 		if (setmask) {
 			CTX_LOCK(ctx);
 			bits = if_getdrvflags(ifp);
 			if (bits & IFF_DRV_RUNNING)
 				iflib_stop(ctx);
 			if_togglecapenable(ifp, setmask);
 			if (bits & IFF_DRV_RUNNING)
 				iflib_init_locked(ctx);
 			if_setdrvflags(ifp, bits);
 			CTX_UNLOCK(ctx);
 		}
 		break;
 	    }
 	case SIOCGPRIVATE_0:
 	case SIOCSDRVSPEC:
 	case SIOCGDRVSPEC:
 		CTX_LOCK(ctx);
 		err = IFDI_PRIV_IOCTL(ctx, command, data);
 		CTX_UNLOCK(ctx);
 		break;
 	default:
 		err = ether_ioctl(ifp, command, data);
 		break;
 	}
 	if (reinit)
 		iflib_if_init(ctx);
 	return (err);
 }
 
 static uint64_t
 iflib_if_get_counter(if_t ifp, ift_counter cnt)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 
 	return (IFDI_GET_COUNTER(ctx, cnt));
 }
 
 /*********************************************************************
  *
  *  OTHER FUNCTIONS EXPORTED TO THE STACK
  *
  **********************************************************************/
 
 static void
 iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 
 	if ((void *)ctx != arg)
 		return;
 
 	if ((vtag == 0) || (vtag > 4095))
 		return;
 
 	CTX_LOCK(ctx);
 	IFDI_VLAN_REGISTER(ctx, vtag);
 	/* Re-init to load the changes */
 	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER)
 		iflib_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 static void
 iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 
 	if ((void *)ctx != arg)
 		return;
 
 	if ((vtag == 0) || (vtag > 4095))
 		return;
 
 	CTX_LOCK(ctx);
 	IFDI_VLAN_UNREGISTER(ctx, vtag);
 	/* Re-init to load the changes */
 	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER)
 		iflib_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 static void
 iflib_led_func(void *arg, int onoff)
 {
 	if_ctx_t ctx = arg;
 
 	CTX_LOCK(ctx);
 	IFDI_LED_FUNC(ctx, onoff);
 	CTX_UNLOCK(ctx);
 }
 
 /*********************************************************************
  *
  *  BUS FUNCTION DEFINITIONS
  *
  **********************************************************************/
 
 int
 iflib_device_probe(device_t dev)
 {
 	pci_vendor_info_t *ent;
 
 	uint16_t	pci_vendor_id, pci_device_id;
 	uint16_t	pci_subvendor_id, pci_subdevice_id;
 	uint16_t	pci_rev_id;
 	if_shared_ctx_t sctx;
 
 	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
 		return (ENOTSUP);
 
 	pci_vendor_id = pci_get_vendor(dev);
 	pci_device_id = pci_get_device(dev);
 	pci_subvendor_id = pci_get_subvendor(dev);
 	pci_subdevice_id = pci_get_subdevice(dev);
 	pci_rev_id = pci_get_revid(dev);
 	if (sctx->isc_parse_devinfo != NULL)
 		sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id);
 
 	ent = sctx->isc_vendor_info;
 	while (ent->pvi_vendor_id != 0) {
 		if (pci_vendor_id != ent->pvi_vendor_id) {
 			ent++;
 			continue;
 		}
 		if ((pci_device_id == ent->pvi_device_id) &&
 		    ((pci_subvendor_id == ent->pvi_subvendor_id) ||
 		     (ent->pvi_subvendor_id == 0)) &&
 		    ((pci_subdevice_id == ent->pvi_subdevice_id) ||
 		     (ent->pvi_subdevice_id == 0)) &&
 		    ((pci_rev_id == ent->pvi_rev_id) ||
 		     (ent->pvi_rev_id == 0))) {
 
 			device_set_desc_copy(dev, ent->pvi_name);
 			/* this needs to be changed to zero if the bus probing code
 			 * ever stops re-probing on best match because the sctx
 			 * may have its values over written by register calls
 			 * in subsequent probes
 			 */
 			return (BUS_PROBE_DEFAULT);
 		}
 		ent++;
 	}
 	return (ENXIO);
 }
 
 int
 iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp)
 {
 	int err, rid, msix, msix_bar;
 	if_ctx_t ctx;
 	if_t ifp;
 	if_softc_ctx_t scctx;
 	int i;
 	uint16_t main_txq;
 	uint16_t main_rxq;
 
 
 	ctx = malloc(sizeof(* ctx), M_IFLIB, M_WAITOK|M_ZERO);
 
 	if (sc == NULL) {
 		sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
 		device_set_softc(dev, ctx);
 		ctx->ifc_flags |= IFC_SC_ALLOCATED;
 	}
 
 	ctx->ifc_sctx = sctx;
 	ctx->ifc_dev = dev;
 	ctx->ifc_softc = sc;
 
 	if ((err = iflib_register(ctx)) != 0) {
 		device_printf(dev, "iflib_register failed %d\n", err);
 		return (err);
 	}
 	iflib_add_device_sysctl_pre(ctx);
 
 	scctx = &ctx->ifc_softc_ctx;
 	ifp = ctx->ifc_ifp;
 
 	/*
 	 * XXX sanity check that ntxd & nrxd are a power of 2
 	 */
 	if (ctx->ifc_sysctl_ntxqs != 0)
 		scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs;
 	if (ctx->ifc_sysctl_nrxqs != 0)
 		scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs;
 
 	for (i = 0; i < sctx->isc_ntxqs; i++) {
 		if (ctx->ifc_sysctl_ntxds[i] != 0)
 			scctx->isc_ntxd[i] = ctx->ifc_sysctl_ntxds[i];
 		else
 			scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
 	}
 
 	for (i = 0; i < sctx->isc_nrxqs; i++) {
 		if (ctx->ifc_sysctl_nrxds[i] != 0)
 			scctx->isc_nrxd[i] = ctx->ifc_sysctl_nrxds[i];
 		else
 			scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
 	}
 
 	for (i = 0; i < sctx->isc_nrxqs; i++) {
 		if (scctx->isc_nrxd[i] < sctx->isc_nrxd_min[i]) {
 			device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n",
 				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_min[i]);
 			scctx->isc_nrxd[i] = sctx->isc_nrxd_min[i];
 		}
 		if (scctx->isc_nrxd[i] > sctx->isc_nrxd_max[i]) {
 			device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n",
 				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_max[i]);
 			scctx->isc_nrxd[i] = sctx->isc_nrxd_max[i];
 		}
 	}
 
 	for (i = 0; i < sctx->isc_ntxqs; i++) {
 		if (scctx->isc_ntxd[i] < sctx->isc_ntxd_min[i]) {
 			device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n",
 				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_min[i]);
 			scctx->isc_ntxd[i] = sctx->isc_ntxd_min[i];
 		}
 		if (scctx->isc_ntxd[i] > sctx->isc_ntxd_max[i]) {
 			device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n",
 				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_max[i]);
 			scctx->isc_ntxd[i] = sctx->isc_ntxd_max[i];
 		}
 	}
 
 	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
 		return (err);
 	}
 	_iflib_pre_assert(scctx);
 	ctx->ifc_txrx = *scctx->isc_txrx;
 
 #ifdef INVARIANTS
 	MPASS(scctx->isc_capenable);
 	if (scctx->isc_capenable & IFCAP_TXCSUM)
 		MPASS(scctx->isc_tx_csum_flags);
 #endif
 
 	if_setcapabilities(ifp, scctx->isc_capenable);
 	if_setcapenable(ifp, scctx->isc_capenable);
 
 	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
 		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
 	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
 		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
 
 #ifdef ACPI_DMAR
 	if (dmar_get_dma_tag(device_get_parent(dev), dev) != NULL)
 		ctx->ifc_flags |= IFC_DMAR;
 #endif
 
 	msix_bar = scctx->isc_msix_bar;
 
 	if(sctx->isc_flags & IFLIB_HAS_TXCQ)
 		main_txq = 1;
 	else
 		main_txq = 0;
 
 	if(sctx->isc_flags & IFLIB_HAS_RXCQ)
 		main_rxq = 1;
 	else
 		main_rxq = 0;
 
 	/* XXX change for per-queue sizes */
 	device_printf(dev, "using %d tx descriptors and %d rx descriptors\n",
 		      scctx->isc_ntxd[main_txq], scctx->isc_nrxd[main_rxq]);
 	for (i = 0; i < sctx->isc_nrxqs; i++) {
 		if (!powerof2(scctx->isc_nrxd[i])) {
 			/* round down instead? */
 			device_printf(dev, "# rx descriptors must be a power of 2\n");
 			err = EINVAL;
 			goto fail;
 		}
 	}
 	for (i = 0; i < sctx->isc_ntxqs; i++) {
 		if (!powerof2(scctx->isc_ntxd[i])) {
 			device_printf(dev,
 			    "# tx descriptors must be a power of 2");
 			err = EINVAL;
 			goto fail;
 		}
 	}
 
 	if (scctx->isc_tx_nsegments > scctx->isc_ntxd[main_txq] /
 	    MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_nsegments = max(1, scctx->isc_ntxd[main_txq] /
 		    MAX_SINGLE_PACKET_FRACTION);
 	if (scctx->isc_tx_tso_segments_max > scctx->isc_ntxd[main_txq] /
 	    MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_tso_segments_max = max(1,
 		    scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION);
 
 	/*
 	 * Protect the stack against modern hardware
 	 */
 	if (scctx->isc_tx_tso_size_max > FREEBSD_TSO_SIZE_MAX)
 		scctx->isc_tx_tso_size_max = FREEBSD_TSO_SIZE_MAX;
 
 	/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
 	ifp->if_hw_tsomaxsegcount = scctx->isc_tx_tso_segments_max;
 	ifp->if_hw_tsomax = scctx->isc_tx_tso_size_max;
 	ifp->if_hw_tsomaxsegsize = scctx->isc_tx_tso_segsize_max;
 	if (scctx->isc_rss_table_size == 0)
 		scctx->isc_rss_table_size = 64;
 	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
 
 	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
 	/* XXX format name */
 	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx, -1, "admin");
 	/*
 	** Now setup MSI or MSI/X, should
 	** return us the number of supported
 	** vectors. (Will be 1 for MSI)
 	*/
 	if (sctx->isc_flags & IFLIB_SKIP_MSIX) {
 		msix = scctx->isc_vectors;
 	} else if (scctx->isc_msix_bar != 0)
 		msix = iflib_msix_init(ctx);
 	else {
 		scctx->isc_vectors = 1;
 		scctx->isc_ntxqsets = 1;
 		scctx->isc_nrxqsets = 1;
 		scctx->isc_intr = IFLIB_INTR_LEGACY;
 		msix = 0;
 	}
 	/* Get memory for the station queues */
 	if ((err = iflib_queues_alloc(ctx))) {
 		device_printf(dev, "Unable to allocate queue memory\n");
 		goto fail;
 	}
 
 	if ((err = iflib_qset_structures_setup(ctx))) {
 		device_printf(dev, "qset structure setup failed %d\n", err);
 		goto fail_queues;
 	}
 
 	IFDI_INTR_DISABLE(ctx);
 	if (msix > 1 && (err = IFDI_MSIX_INTR_ASSIGN(ctx, msix)) != 0) {
 		device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n", err);
 		goto fail_intr_free;
 	}
 	if (msix <= 1) {
 		rid = 0;
 		if (scctx->isc_intr == IFLIB_INTR_MSI) {
 			MPASS(msix == 1);
 			rid = 1;
 		}
 		if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) {
 			device_printf(dev, "iflib_legacy_setup failed %d\n", err);
 			goto fail_intr_free;
 		}
 	}
 	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac);
 	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
 		goto fail_detach;
 	}
 	if ((err = iflib_netmap_attach(ctx))) {
 		device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err);
 		goto fail_detach;
 	}
 	*ctxp = ctx;
 
 	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
 	iflib_add_device_sysctl_post(ctx);
+	ctx->ifc_flags |= IFC_INIT_DONE;
 	return (0);
 fail_detach:
 	ether_ifdetach(ctx->ifc_ifp);
 fail_intr_free:
 	if (scctx->isc_intr == IFLIB_INTR_MSIX || scctx->isc_intr == IFLIB_INTR_MSI)
 		pci_release_msi(ctx->ifc_dev);
 fail_queues:
 	/* XXX free queues */
 fail:
 	IFDI_DETACH(ctx);
 	return (err);
 }
 
 int
 iflib_device_attach(device_t dev)
 {
 	if_ctx_t ctx;
 	if_shared_ctx_t sctx;
 
 	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
 		return (ENOTSUP);
 
 	pci_enable_busmaster(dev);
 
 	return (iflib_device_register(dev, NULL, sctx, &ctx));
 }
 
 int
 iflib_device_deregister(if_ctx_t ctx)
 {
 	if_t ifp = ctx->ifc_ifp;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	device_t dev = ctx->ifc_dev;
 	int i;
 	struct taskqgroup *tqg;
 
 	/* Make sure VLANS are not using driver */
 	if (if_vlantrunkinuse(ifp)) {
 		device_printf(dev,"Vlan in use, detach first\n");
 		return (EBUSY);
 	}
 
 	CTX_LOCK(ctx);
 	ctx->ifc_in_detach = 1;
 	iflib_stop(ctx);
 	CTX_UNLOCK(ctx);
 
 	/* Unregister VLAN events */
 	if (ctx->ifc_vlan_attach_event != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event);
 	if (ctx->ifc_vlan_detach_event != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event);
 
 	iflib_netmap_detach(ifp);
 	ether_ifdetach(ifp);
 	/* ether_ifdetach calls if_qflush - lock must be destroy afterwards*/
 	CTX_LOCK_DESTROY(ctx);
 	if (ctx->ifc_led_dev != NULL)
 		led_destroy(ctx->ifc_led_dev);
 	/* XXX drain any dependent tasks */
 	tqg = qgroup_if_io_tqg;
 	for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
 		callout_drain(&txq->ift_timer);
 		callout_drain(&txq->ift_db_check);
 		if (txq->ift_task.gt_uniq != NULL)
 			taskqgroup_detach(tqg, &txq->ift_task);
 	}
 	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
 		if (rxq->ifr_task.gt_uniq != NULL)
 			taskqgroup_detach(tqg, &rxq->ifr_task);
 	}
 	tqg = qgroup_if_config_tqg;
 	if (ctx->ifc_admin_task.gt_uniq != NULL)
 		taskqgroup_detach(tqg, &ctx->ifc_admin_task);
 	if (ctx->ifc_vflr_task.gt_uniq != NULL)
 		taskqgroup_detach(tqg, &ctx->ifc_vflr_task);
 
 	IFDI_DETACH(ctx);
 	device_set_softc(ctx->ifc_dev, NULL);
 	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_LEGACY) {
 		pci_release_msi(dev);
 	}
 	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_MSIX) {
 		iflib_irq_free(ctx, &ctx->ifc_legacy_irq);
 	}
 	if (ctx->ifc_msix_mem != NULL) {
 		bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY,
 			ctx->ifc_softc_ctx.isc_msix_bar, ctx->ifc_msix_mem);
 		ctx->ifc_msix_mem = NULL;
 	}
 
 	bus_generic_detach(dev);
 	if_free(ifp);
 
 	iflib_tx_structures_free(ctx);
 	iflib_rx_structures_free(ctx);
 	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
 		free(ctx->ifc_softc, M_IFLIB);
 	free(ctx, M_IFLIB);
 	return (0);
 }
 
 
 int
 iflib_device_detach(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 
 	return (iflib_device_deregister(ctx));
 }
 
 int
 iflib_device_suspend(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	IFDI_SUSPEND(ctx);
 	CTX_UNLOCK(ctx);
 
 	return bus_generic_suspend(dev);
 }
 int
 iflib_device_shutdown(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	IFDI_SHUTDOWN(ctx);
 	CTX_UNLOCK(ctx);
 
 	return bus_generic_suspend(dev);
 }
 
 
 int
 iflib_device_resume(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 	iflib_txq_t txq = ctx->ifc_txqs;
 
 	CTX_LOCK(ctx);
 	IFDI_RESUME(ctx);
 	iflib_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 	for (int i = 0; i < NTXQSETS(ctx); i++, txq++)
 		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
 
 	return (bus_generic_resume(dev));
 }
 
 int
 iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params)
 {
 	int error;
 	if_ctx_t ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	error = IFDI_IOV_INIT(ctx, num_vfs, params);
 	CTX_UNLOCK(ctx);
 
 	return (error);
 }
 
 void
 iflib_device_iov_uninit(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	IFDI_IOV_UNINIT(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 int
 iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params)
 {
 	int error;
 	if_ctx_t ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	error = IFDI_IOV_VF_ADD(ctx, vfnum, params);
 	CTX_UNLOCK(ctx);
 
 	return (error);
 }
 
 /*********************************************************************
  *
  *  MODULE FUNCTION DEFINITIONS
  *
  **********************************************************************/
 
 /*
  * - Start a fast taskqueue thread for each core
  * - Start a taskqueue for control operations
  */
 static int
 iflib_module_init(void)
 {
 	return (0);
 }
 
 static int
 iflib_module_event_handler(module_t mod, int what, void *arg)
 {
 	int err;
 
 	switch (what) {
 	case MOD_LOAD:
 		if ((err = iflib_module_init()) != 0)
 			return (err);
 		break;
 	case MOD_UNLOAD:
 		return (EBUSY);
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	return (0);
 }
 
 /*********************************************************************
  *
  *  PUBLIC FUNCTION DEFINITIONS
  *     ordered as in iflib.h
  *
  **********************************************************************/
 
 
 static void
 _iflib_assert(if_shared_ctx_t sctx)
 {
 	MPASS(sctx->isc_tx_maxsize);
 	MPASS(sctx->isc_tx_maxsegsize);
 
 	MPASS(sctx->isc_rx_maxsize);
 	MPASS(sctx->isc_rx_nsegments);
 	MPASS(sctx->isc_rx_maxsegsize);
 
 	MPASS(sctx->isc_nrxd_min[0]);
 	MPASS(sctx->isc_nrxd_max[0]);
 	MPASS(sctx->isc_nrxd_default[0]);
 	MPASS(sctx->isc_ntxd_min[0]);
 	MPASS(sctx->isc_ntxd_max[0]);
 	MPASS(sctx->isc_ntxd_default[0]);
 }
 
 static void
 _iflib_pre_assert(if_softc_ctx_t scctx)
 {
 
 	MPASS(scctx->isc_txrx->ift_txd_encap);
 	MPASS(scctx->isc_txrx->ift_txd_flush);
 	MPASS(scctx->isc_txrx->ift_txd_credits_update);
 	MPASS(scctx->isc_txrx->ift_rxd_available);
 	MPASS(scctx->isc_txrx->ift_rxd_pkt_get);
 	MPASS(scctx->isc_txrx->ift_rxd_refill);
 	MPASS(scctx->isc_txrx->ift_rxd_flush);
 }
 
 static int
 iflib_register(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	driver_t *driver = sctx->isc_driver;
 	device_t dev = ctx->ifc_dev;
 	if_t ifp;
 
 	_iflib_assert(sctx);
 
 	CTX_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev));
 
 	ifp = ctx->ifc_ifp = if_gethandle(IFT_ETHER);
 	if (ifp == NULL) {
 		device_printf(dev, "can not allocate ifnet structure\n");
 		return (ENOMEM);
 	}
 
 	/*
 	 * Initialize our context's device specific methods
 	 */
 	kobj_init((kobj_t) ctx, (kobj_class_t) driver);
 	kobj_class_compile((kobj_class_t) driver);
 	driver->refs++;
 
 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 	if_setsoftc(ifp, ctx);
 	if_setdev(ifp, dev);
 	if_setinitfn(ifp, iflib_if_init);
 	if_setioctlfn(ifp, iflib_if_ioctl);
 	if_settransmitfn(ifp, iflib_if_transmit);
 	if_setqflushfn(ifp, iflib_if_qflush);
 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
 
 	ctx->ifc_vlan_attach_event =
 		EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx,
 							  EVENTHANDLER_PRI_FIRST);
 	ctx->ifc_vlan_detach_event =
 		EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx,
 							  EVENTHANDLER_PRI_FIRST);
 
 	ifmedia_init(&ctx->ifc_media, IFM_IMASK,
 					 iflib_media_change, iflib_media_status);
 
 	return (0);
 }
 
 
 static int
 iflib_queues_alloc(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	int nrxqsets = scctx->isc_nrxqsets;
 	int ntxqsets = scctx->isc_ntxqsets;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	iflib_fl_t fl = NULL;
 	int i, j, cpu, err, txconf, rxconf;
 	iflib_dma_info_t ifdip;
 	uint32_t *rxqsizes = scctx->isc_rxqsizes;
 	uint32_t *txqsizes = scctx->isc_txqsizes;
 	uint8_t nrxqs = sctx->isc_nrxqs;
 	uint8_t ntxqs = sctx->isc_ntxqs;
 	int nfree_lists = sctx->isc_nfl ? sctx->isc_nfl : 1;
 	caddr_t *vaddrs;
 	uint64_t *paddrs;
 	struct ifmp_ring **brscp;
 	int nbuf_rings = 1; /* XXX determine dynamically */
 
 	KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1"));
 	KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1"));
 
 	brscp = NULL;
 	txq = NULL;
 	rxq = NULL;
 
 /* Allocate the TX ring struct memory */
 	if (!(txq =
 	    (iflib_txq_t) malloc(sizeof(struct iflib_txq) *
 	    ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate TX ring memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 
 	/* Now allocate the RX */
 	if (!(rxq =
 	    (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) *
 	    nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate RX ring memory\n");
 		err = ENOMEM;
 		goto rx_fail;
 	}
 	if (!(brscp = malloc(sizeof(void *) * nbuf_rings * nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to buf_ring_sc * memory\n");
 		err = ENOMEM;
 		goto rx_fail;
 	}
 
 	ctx->ifc_txqs = txq;
 	ctx->ifc_rxqs = rxq;
 
 	/*
 	 * XXX handle allocation failure
 	 */
 	for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) {
 		/* Set up some basics */
 
 		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs, M_IFLIB, M_WAITOK|M_ZERO)) == NULL) {
 			device_printf(dev, "failed to allocate iflib_dma_info\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 		txq->ift_ifdi = ifdip;
 		for (j = 0; j < ntxqs; j++, ifdip++) {
 			if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, BUS_DMA_NOWAIT)) {
 				device_printf(dev, "Unable to allocate Descriptor memory\n");
 				err = ENOMEM;
 				goto err_tx_desc;
 			}
 			bzero((void *)ifdip->idi_vaddr, txqsizes[j]);
 		}
 		txq->ift_ctx = ctx;
 		txq->ift_id = i;
 		if (sctx->isc_flags & IFLIB_HAS_TXCQ) {
 			txq->ift_br_offset = 1;
 		} else {
 			txq->ift_br_offset = 0;
 		}
 		/* XXX fix this */
 		txq->ift_timer.c_cpu = cpu;
 		txq->ift_db_check.c_cpu = cpu;
 		txq->ift_nbr = nbuf_rings;
 
 		if (iflib_txsd_alloc(txq)) {
 			device_printf(dev, "Critical Failure setting up TX buffers\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 
 		/* Initialize the TX lock */
 		snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:tx(%d):callout",
 		    device_get_nameunit(dev), txq->ift_id);
 		mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF);
 		callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0);
 		callout_init_mtx(&txq->ift_db_check, &txq->ift_mtx, 0);
 
 		snprintf(txq->ift_db_mtx_name, MTX_NAME_LEN, "%s:tx(%d):db",
 			 device_get_nameunit(dev), txq->ift_id);
 		TXDB_LOCK_INIT(txq);
 
 		txq->ift_br = brscp + i*nbuf_rings;
 		for (j = 0; j < nbuf_rings; j++) {
 			err = ifmp_ring_alloc(&txq->ift_br[j], 2048, txq, iflib_txq_drain,
 					      iflib_txq_can_drain, M_IFLIB, M_WAITOK);
 			if (err) {
 				/* XXX free any allocated rings */
 				device_printf(dev, "Unable to allocate buf_ring\n");
 				goto err_tx_desc;
 			}
 		}
 	}
 
 	for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) {
 		/* Set up some basics */
 
 		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs, M_IFLIB, M_WAITOK|M_ZERO)) == NULL) {
 			device_printf(dev, "failed to allocate iflib_dma_info\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 
 		rxq->ifr_ifdi = ifdip;
 		for (j = 0; j < nrxqs; j++, ifdip++) {
 			if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, BUS_DMA_NOWAIT)) {
 				device_printf(dev, "Unable to allocate Descriptor memory\n");
 				err = ENOMEM;
 				goto err_tx_desc;
 			}
 			bzero((void *)ifdip->idi_vaddr, rxqsizes[j]);
 		}
 		rxq->ifr_ctx = ctx;
 		rxq->ifr_id = i;
 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 			rxq->ifr_fl_offset = 1;
 		} else {
 			rxq->ifr_fl_offset = 0;
 		}
 		rxq->ifr_nfl = nfree_lists;
 		if (!(fl =
 			  (iflib_fl_t) malloc(sizeof(struct iflib_fl) * nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) {
 			device_printf(dev, "Unable to allocate free list memory\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 		rxq->ifr_fl = fl;
 		for (j = 0; j < nfree_lists; j++) {
 			rxq->ifr_fl[j].ifl_rxq = rxq;
 			rxq->ifr_fl[j].ifl_id = j;
 			rxq->ifr_fl[j].ifl_ifdi =
 			    &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
 		}
         /* Allocate receive buffers for the ring*/
 		if (iflib_rxsd_alloc(rxq)) {
 			device_printf(dev,
 			    "Critical Failure setting up receive buffers\n");
 			err = ENOMEM;
 			goto err_rx_desc;
 		}
 	}
 
 	/* TXQs */
 	vaddrs = malloc(sizeof(caddr_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
 	paddrs = malloc(sizeof(uint64_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
 	for (i = 0; i < ntxqsets; i++) {
 		iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi;
 
 		for (j = 0; j < ntxqs; j++, di++) {
 			vaddrs[i*ntxqs + j] = di->idi_vaddr;
 			paddrs[i*ntxqs + j] = di->idi_paddr;
 		}
 	}
 	if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) {
 		device_printf(ctx->ifc_dev, "device queue allocation failed\n");
 		iflib_tx_structures_free(ctx);
 		free(vaddrs, M_IFLIB);
 		free(paddrs, M_IFLIB);
 		goto err_rx_desc;
 	}
 	free(vaddrs, M_IFLIB);
 	free(paddrs, M_IFLIB);
 
 	/* RXQs */
 	vaddrs = malloc(sizeof(caddr_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
 	paddrs = malloc(sizeof(uint64_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
 	for (i = 0; i < nrxqsets; i++) {
 		iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi;
 
 		for (j = 0; j < nrxqs; j++, di++) {
 			vaddrs[i*nrxqs + j] = di->idi_vaddr;
 			paddrs[i*nrxqs + j] = di->idi_paddr;
 		}
 	}
 	if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) {
 		device_printf(ctx->ifc_dev, "device queue allocation failed\n");
 		iflib_tx_structures_free(ctx);
 		free(vaddrs, M_IFLIB);
 		free(paddrs, M_IFLIB);
 		goto err_rx_desc;
 	}
 	free(vaddrs, M_IFLIB);
 	free(paddrs, M_IFLIB);
 
 	return (0);
 
 /* XXX handle allocation failure changes */
 err_rx_desc:
 err_tx_desc:
 	if (ctx->ifc_rxqs != NULL)
 		free(ctx->ifc_rxqs, M_IFLIB);
 	ctx->ifc_rxqs = NULL;
 	if (ctx->ifc_txqs != NULL)
 		free(ctx->ifc_txqs, M_IFLIB);
 	ctx->ifc_txqs = NULL;
 rx_fail:
 	if (brscp != NULL)
 		free(brscp, M_IFLIB);
 	if (rxq != NULL)
 		free(rxq, M_IFLIB);
 	if (txq != NULL)
 		free(txq, M_IFLIB);
 fail:
 	return (err);
 }
 
 static int
 iflib_tx_structures_setup(if_ctx_t ctx)
 {
 	iflib_txq_t txq = ctx->ifc_txqs;
 	int i;
 
 	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
 		iflib_txq_setup(txq);
 
 	return (0);
 }
 
 static void
 iflib_tx_structures_free(if_ctx_t ctx)
 {
 	iflib_txq_t txq = ctx->ifc_txqs;
 	int i, j;
 
 	for (i = 0; i < NTXQSETS(ctx); i++, txq++) {
 		iflib_txq_destroy(txq);
 		for (j = 0; j < ctx->ifc_nhwtxqs; j++)
 			iflib_dma_free(&txq->ift_ifdi[j]);
 	}
 	free(ctx->ifc_txqs, M_IFLIB);
 	ctx->ifc_txqs = NULL;
 	IFDI_QUEUES_FREE(ctx);
 }
 
 /*********************************************************************
  *
  *  Initialize all receive rings.
  *
  **********************************************************************/
 static int
 iflib_rx_structures_setup(if_ctx_t ctx)
 {
 	iflib_rxq_t rxq = ctx->ifc_rxqs;
 	int q;
 #if defined(INET6) || defined(INET)
 	int i, err;
 #endif
 
 	for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++, rxq++) {
 #if defined(INET6) || defined(INET)
 		tcp_lro_free(&rxq->ifr_lc);
 		if ((err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp,
 		    TCP_LRO_ENTRIES, min(1024,
 		    ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]))) != 0) {
 			device_printf(ctx->ifc_dev, "LRO Initialization failed!\n");
 			goto fail;
 		}
 		rxq->ifr_lro_enabled = TRUE;
 #endif
 		IFDI_RXQ_SETUP(ctx, rxq->ifr_id);
 	}
 	return (0);
 #if defined(INET6) || defined(INET)
 fail:
 	/*
 	 * Free RX software descriptors allocated so far, we will only handle
 	 * the rings that completed, the failing case will have
 	 * cleaned up for itself. 'q' failed, so its the terminus.
 	 */
 	rxq = ctx->ifc_rxqs;
 	for (i = 0; i < q; ++i, rxq++) {
 		iflib_rx_sds_free(rxq);
 		rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0;
 	}
 	return (err);
 #endif
 }
 
 /*********************************************************************
  *
  *  Free all receive rings.
  *
  **********************************************************************/
 static void
 iflib_rx_structures_free(if_ctx_t ctx)
 {
 	iflib_rxq_t rxq = ctx->ifc_rxqs;
 
 	for (int i = 0; i < ctx->ifc_softc_ctx.isc_nrxqsets; i++, rxq++) {
 		iflib_rx_sds_free(rxq);
 	}
 }
 
 static int
 iflib_qset_structures_setup(if_ctx_t ctx)
 {
 	int err;
 
 	if ((err = iflib_tx_structures_setup(ctx)) != 0)
 		return (err);
 
 	if ((err = iflib_rx_structures_setup(ctx)) != 0) {
 		device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err);
 		iflib_tx_structures_free(ctx);
 		iflib_rx_structures_free(ctx);
 	}
 	return (err);
 }
 
 int
 iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
 				driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, char *name)
 {
 
 	return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
 }
 
 static int
 find_nth(if_ctx_t ctx, cpuset_t *cpus, int qid)
 {
 	int i, cpuid, eqid, count;
 
 	CPU_COPY(&ctx->ifc_cpus, cpus);
 	count = CPU_COUNT(&ctx->ifc_cpus);
 	eqid = qid % count;
 	/* clear up to the qid'th bit */
 	for (i = 0; i < eqid; i++) {
 		cpuid = CPU_FFS(cpus);
 		MPASS(cpuid != 0);
 		CPU_CLR(cpuid-1, cpus);
 	}
 	cpuid = CPU_FFS(cpus);
 	MPASS(cpuid != 0);
 	return (cpuid-1);
 }
 
 int
 iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
 						iflib_intr_type_t type, driver_filter_t *filter,
 						void *filter_arg, int qid, char *name)
 {
 	struct grouptask *gtask;
 	struct taskqgroup *tqg;
 	iflib_filter_info_t info;
 	cpuset_t cpus;
 	gtask_fn_t *fn;
 	int tqrid, err, cpuid;
 	void *q;
 
 	info = &ctx->ifc_filter_info;
 	tqrid = rid;
 
 	switch (type) {
 	/* XXX merge tx/rx for netmap? */
 	case IFLIB_INTR_TX:
 		q = &ctx->ifc_txqs[qid];
 		info = &ctx->ifc_txqs[qid].ift_filter_info;
 		gtask = &ctx->ifc_txqs[qid].ift_task;
 		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_tx;
 		GROUPTASK_INIT(gtask, 0, fn, q);
 		break;
 	case IFLIB_INTR_RX:
 		q = &ctx->ifc_rxqs[qid];
 		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
 		gtask = &ctx->ifc_rxqs[qid].ifr_task;
 		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_rx;
 		GROUPTASK_INIT(gtask, 0, fn, q);
 		break;
 	case IFLIB_INTR_ADMIN:
 		q = ctx;
 		tqrid = -1;
 		info = &ctx->ifc_filter_info;
 		gtask = &ctx->ifc_admin_task;
 		tqg = qgroup_if_config_tqg;
 		fn = _task_fn_admin;
 		break;
 	default:
 		panic("unknown net intr type");
 	}
 
 	info->ifi_filter = filter;
 	info->ifi_filter_arg = filter_arg;
 	info->ifi_task = gtask;
+	info->ifi_ctx = ctx;
 
 	err = _iflib_irq_alloc(ctx, irq, rid, iflib_fast_intr, NULL, info,  name);
 	if (err != 0) {
 		device_printf(ctx->ifc_dev, "_iflib_irq_alloc failed %d\n", err);
 		return (err);
 	}
 	if (type == IFLIB_INTR_ADMIN)
 		return (0);
 
 	if (tqrid != -1) {
 		cpuid = find_nth(ctx, &cpus, qid);
 		taskqgroup_attach_cpu(tqg, gtask, q, cpuid, irq->ii_rid, name);
 	} else {
 		taskqgroup_attach(tqg, gtask, q, tqrid, name);
 	}
 
 	return (0);
 }
 
 void
 iflib_softirq_alloc_generic(if_ctx_t ctx, int rid, iflib_intr_type_t type,  void *arg, int qid, char *name)
 {
 	struct grouptask *gtask;
 	struct taskqgroup *tqg;
 	gtask_fn_t *fn;
 	void *q;
 
 	switch (type) {
 	case IFLIB_INTR_TX:
 		q = &ctx->ifc_txqs[qid];
 		gtask = &ctx->ifc_txqs[qid].ift_task;
 		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_tx;
 		break;
 	case IFLIB_INTR_RX:
 		q = &ctx->ifc_rxqs[qid];
 		gtask = &ctx->ifc_rxqs[qid].ifr_task;
 		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_rx;
 		break;
 	case IFLIB_INTR_IOV:
 		q = ctx;
 		gtask = &ctx->ifc_vflr_task;
 		tqg = qgroup_if_config_tqg;
 		rid = -1;
 		fn = _task_fn_iov;
 		break;
 	default:
 		panic("unknown net intr type");
 	}
 	GROUPTASK_INIT(gtask, 0, fn, q);
 	taskqgroup_attach(tqg, gtask, q, rid, name);
 }
 
 void
 iflib_irq_free(if_ctx_t ctx, if_irq_t irq)
 {
 	if (irq->ii_tag)
 		bus_teardown_intr(ctx->ifc_dev, irq->ii_res, irq->ii_tag);
 
 	if (irq->ii_res)
 		bus_release_resource(ctx->ifc_dev, SYS_RES_IRQ, irq->ii_rid, irq->ii_res);
 }
 
 static int
 iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, char *name)
 {
 	iflib_txq_t txq = ctx->ifc_txqs;
 	iflib_rxq_t rxq = ctx->ifc_rxqs;
 	if_irq_t irq = &ctx->ifc_legacy_irq;
 	iflib_filter_info_t info;
 	struct grouptask *gtask;
 	struct taskqgroup *tqg;
 	gtask_fn_t *fn;
 	int tqrid;
 	void *q;
 	int err;
 
 	/*
 	 * group taskqueues aren't properly set up until SMP is started
 	 * so we disable interrupts until we can handle them post
 	 * SI_SUB_SMP
 	 */
 	IFDI_INTR_DISABLE(ctx);
 
 	q = &ctx->ifc_rxqs[0];
 	info = &rxq[0].ifr_filter_info;
 	gtask = &rxq[0].ifr_task;
 	tqg = qgroup_if_io_tqg;
 	tqrid = irq->ii_rid = *rid;
 	fn = _task_fn_rx;
 
 	ctx->ifc_flags |= IFC_LEGACY;
 	info->ifi_filter = filter;
 	info->ifi_filter_arg = filter_arg;
 	info->ifi_task = gtask;
+	info->ifi_ctx = ctx;
 
 	/* We allocate a single interrupt resource */
 	if ((err = _iflib_irq_alloc(ctx, irq, tqrid, iflib_fast_intr, NULL, info, name)) != 0)
 		return (err);
 	GROUPTASK_INIT(gtask, 0, fn, q);
 	taskqgroup_attach(tqg, gtask, q, tqrid, name);
 
 	GROUPTASK_INIT(&txq->ift_task, 0, _task_fn_tx, txq);
 	taskqgroup_attach(qgroup_if_io_tqg, &txq->ift_task, txq, tqrid, "tx");
 	return (0);
 }
 
 void
 iflib_led_create(if_ctx_t ctx)
 {
 
 	ctx->ifc_led_dev = led_create(iflib_led_func, ctx,
 								  device_get_nameunit(ctx->ifc_dev));
 }
 
 void
 iflib_tx_intr_deferred(if_ctx_t ctx, int txqid)
 {
 
 	GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task);
 }
 
 void
 iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid)
 {
 
 	GROUPTASK_ENQUEUE(&ctx->ifc_rxqs[rxqid].ifr_task);
 }
 
 void
 iflib_admin_intr_deferred(if_ctx_t ctx)
 {
 #ifdef INVARIANTS
 	struct grouptask *gtask;
 
 	gtask = &ctx->ifc_admin_task;
 	MPASS(gtask->gt_taskqueue != NULL);
 #endif
 
 	GROUPTASK_ENQUEUE(&ctx->ifc_admin_task);
 }
 
 void
 iflib_iov_intr_deferred(if_ctx_t ctx)
 {
 
 	GROUPTASK_ENQUEUE(&ctx->ifc_vflr_task);
 }
 
 void
 iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, char *name)
 {
 
 	taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, -1, name);
 }
 
 void
 iflib_config_gtask_init(if_ctx_t ctx, struct grouptask *gtask, gtask_fn_t *fn,
 	char *name)
 {
 
 	GROUPTASK_INIT(gtask, 0, fn, ctx);
 	taskqgroup_attach(qgroup_if_config_tqg, gtask, gtask, -1, name);
 }
 
 void
 iflib_config_gtask_deinit(struct grouptask *gtask)
 {
 
 	taskqgroup_detach(qgroup_if_config_tqg, gtask);	
 }
 
 void
 iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate)
 {
 	if_t ifp = ctx->ifc_ifp;
 	iflib_txq_t txq = ctx->ifc_txqs;
 
 	if_setbaudrate(ifp, baudrate);
 
 	/* If link down, disable watchdog */
 	if ((ctx->ifc_link_state == LINK_STATE_UP) && (link_state == LINK_STATE_DOWN)) {
 		for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxqsets; i++, txq++)
 			txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 	}
 	ctx->ifc_link_state = link_state;
 	if_link_state_change(ifp, link_state);
 }
 
 static int
 iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq)
 {
 	int credits;
 #ifdef INVARIANTS
 	int credits_pre = txq->ift_cidx_processed;
 #endif	
 
 	if (ctx->isc_txd_credits_update == NULL)
 		return (0);
 
 	if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, txq->ift_cidx_processed, true)) == 0)
 		return (0);
 
 	txq->ift_processed += credits;
 	txq->ift_cidx_processed += credits;
 
 	MPASS(credits_pre + credits == txq->ift_cidx_processed);
 	if (txq->ift_cidx_processed >= txq->ift_size)
 		txq->ift_cidx_processed -= txq->ift_size;
 	return (credits);
 }
 
 static int
 iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, int cidx, int budget)
 {
 
 	return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx,
 	    budget));
 }
 
 void
 iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name,
 	const char *description, if_int_delay_info_t info,
 	int offset, int value)
 {
 	info->iidi_ctx = ctx;
 	info->iidi_offset = offset;
 	info->iidi_value = value;
 	SYSCTL_ADD_PROC(device_get_sysctl_ctx(ctx->ifc_dev),
 	    SYSCTL_CHILDREN(device_get_sysctl_tree(ctx->ifc_dev)),
 	    OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW,
 	    info, 0, iflib_sysctl_int_delay, "I", description);
 }
 
 struct mtx *
 iflib_ctx_lock_get(if_ctx_t ctx)
 {
 
 	return (&ctx->ifc_mtx);
 }
 
 static int
 iflib_msix_init(if_ctx_t ctx)
 {
 	device_t dev = ctx->ifc_dev;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	int vectors, queues, rx_queues, tx_queues, queuemsgs, msgs;
 	int iflib_num_tx_queues, iflib_num_rx_queues;
 	int err, admincnt, bar;
 
 	iflib_num_tx_queues = scctx->isc_ntxqsets;
 	iflib_num_rx_queues = scctx->isc_nrxqsets;
 
 	device_printf(dev, "msix_init qsets capped at %d\n", iflib_num_tx_queues);
 	
 	bar = ctx->ifc_softc_ctx.isc_msix_bar;
 	admincnt = sctx->isc_admin_intrcnt;
 	/* Override by tuneable */
 	if (enable_msix == 0)
 		goto msi;
 
 	/*
 	** When used in a virtualized environment
 	** PCI BUSMASTER capability may not be set
 	** so explicity set it here and rewrite
 	** the ENABLE in the MSIX control register
 	** at this point to cause the host to
 	** successfully initialize us.
 	*/
 	{
 		uint16_t pci_cmd_word;
 		int msix_ctrl, rid;
 
 		rid = 0;
 		pci_cmd_word = pci_read_config(dev, PCIR_COMMAND, 2);
 		pci_cmd_word |= PCIM_CMD_BUSMASTEREN;
 		pci_write_config(dev, PCIR_COMMAND, pci_cmd_word, 2);
 		pci_find_cap(dev, PCIY_MSIX, &rid);
 		rid += PCIR_MSIX_CTRL;
 		msix_ctrl = pci_read_config(dev, rid, 2);
 		msix_ctrl |= PCIM_MSIXCTRL_MSIX_ENABLE;
 		pci_write_config(dev, rid, msix_ctrl, 2);
 	}
 
 	/*
 	 * bar == -1 => "trust me I know what I'm doing"
 	 * https://www.youtube.com/watch?v=nnwWKkNau4I
 	 * Some drivers are for hardware that is so shoddily
 	 * documented that no one knows which bars are which
 	 * so the developer has to map all bars. This hack
 	 * allows shoddy garbage to use msix in this framework.
 	 */
 	if (bar != -1) {
 		ctx->ifc_msix_mem = bus_alloc_resource_any(dev,
 	            SYS_RES_MEMORY, &bar, RF_ACTIVE);
 		if (ctx->ifc_msix_mem == NULL) {
 			/* May not be enabled */
 			device_printf(dev, "Unable to map MSIX table \n");
 			goto msi;
 		}
 	}
 	/* First try MSI/X */
 	if ((msgs = pci_msix_count(dev)) == 0) { /* system has msix disabled */
 		device_printf(dev, "System has MSIX disabled \n");
 		bus_release_resource(dev, SYS_RES_MEMORY,
 		    bar, ctx->ifc_msix_mem);
 		ctx->ifc_msix_mem = NULL;
 		goto msi;
 	}
 #if IFLIB_DEBUG
 	/* use only 1 qset in debug mode */
 	queuemsgs = min(msgs - admincnt, 1);
 #else
 	queuemsgs = msgs - admincnt;
 #endif
 	if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) == 0) {
 #ifdef RSS
 		queues = imin(queuemsgs, rss_getnumbuckets());
 #else
 		queues = queuemsgs;
 #endif
 		queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues);
 		device_printf(dev, "pxm cpus: %d queue msgs: %d admincnt: %d\n",
 					  CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt);
 	} else {
 		device_printf(dev, "Unable to fetch CPU list\n");
 		/* Figure out a reasonable auto config value */
 		queues = min(queuemsgs, mp_ncpus);
 	}
 #ifdef  RSS
 	/* If we're doing RSS, clamp at the number of RSS buckets */
 	if (queues > rss_getnumbuckets())
 		queues = rss_getnumbuckets();
 #endif
 	if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt)
 		rx_queues = iflib_num_rx_queues;
 	else
 		rx_queues = queues;
 	/*
 	 * We want this to be all logical CPUs by default
 	 */
 	if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues)
 		tx_queues = iflib_num_tx_queues;
 	else
 		tx_queues = mp_ncpus;
 
 	if (ctx->ifc_sysctl_qs_eq_override == 0) {
 #ifdef INVARIANTS
 		if (tx_queues != rx_queues)
 			device_printf(dev, "queue equality override not set, capping rx_queues at %d and tx_queues at %d\n",
 				      min(rx_queues, tx_queues), min(rx_queues, tx_queues));
 #endif
 		tx_queues = min(rx_queues, tx_queues);
 		rx_queues = min(rx_queues, tx_queues);
 	}
 
 	device_printf(dev, "using %d rx queues %d tx queues \n", rx_queues, tx_queues);
 
 	vectors = rx_queues + admincnt;
 	if ((err = pci_alloc_msix(dev, &vectors)) == 0) {
 		device_printf(dev,
 					  "Using MSIX interrupts with %d vectors\n", vectors);
 		scctx->isc_vectors = vectors;
 		scctx->isc_nrxqsets = rx_queues;
 		scctx->isc_ntxqsets = tx_queues;
 		scctx->isc_intr = IFLIB_INTR_MSIX;
 
 		return (vectors);
 	} else {
 		device_printf(dev, "failed to allocate %d msix vectors, err: %d - using MSI\n", vectors, err);
 	}
 msi:
 	vectors = pci_msi_count(dev);
 	scctx->isc_nrxqsets = 1;
 	scctx->isc_ntxqsets = 1;
 	scctx->isc_vectors = vectors;
 	if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) {
 		device_printf(dev,"Using an MSI interrupt\n");
 		scctx->isc_intr = IFLIB_INTR_MSI;
 	} else {
 		device_printf(dev,"Using a Legacy interrupt\n");
 		scctx->isc_intr = IFLIB_INTR_LEGACY;
 	}
 
 	return (vectors);
 }
 
 char * ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" };
 
 static int
 mp_ring_state_handler(SYSCTL_HANDLER_ARGS)
 {
 	int rc;
 	uint16_t *state = ((uint16_t *)oidp->oid_arg1);
 	struct sbuf *sb;
 	char *ring_state = "UNKNOWN";
 
 	/* XXX needed ? */
 	rc = sysctl_wire_old_buffer(req, 0);
 	MPASS(rc == 0);
 	if (rc != 0)
 		return (rc);
 	sb = sbuf_new_for_sysctl(NULL, NULL, 80, req);
 	MPASS(sb != NULL);
 	if (sb == NULL)
 		return (ENOMEM);
 	if (state[3] <= 3)
 		ring_state = ring_states[state[3]];
 
 	sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s",
 		    state[0], state[1], state[2], ring_state);
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
         return(rc);
 }
 
 enum iflib_ndesc_handler {
 	IFLIB_NTXD_HANDLER,
 	IFLIB_NRXD_HANDLER,
 };
 
 static int
 mp_ndesc_handler(SYSCTL_HANDLER_ARGS)
 {
 	if_ctx_t ctx = (void *)arg1;
 	enum iflib_ndesc_handler type = arg2;
 	char buf[256] = {0};
 	uint16_t *ndesc;
 	char *p, *next;
 	int nqs, rc, i;
 
 	MPASS(type == IFLIB_NTXD_HANDLER || type == IFLIB_NRXD_HANDLER);
 
 	nqs = 8;
 	switch(type) {
 	case IFLIB_NTXD_HANDLER:
 		ndesc = ctx->ifc_sysctl_ntxds;
 		if (ctx->ifc_sctx)
 			nqs = ctx->ifc_sctx->isc_ntxqs;
 		break;
 	case IFLIB_NRXD_HANDLER:
 		ndesc = ctx->ifc_sysctl_nrxds;
 		if (ctx->ifc_sctx)
 			nqs = ctx->ifc_sctx->isc_nrxqs;
 		break;
 	}
 	if (nqs == 0)
 		nqs = 8;
 
 	for (i=0; i<8; i++) {
 		if (i >= nqs)
 			break;
 		if (i)
 			strcat(buf, ",");
 		sprintf(strchr(buf, 0), "%d", ndesc[i]);
 	}
 
 	rc = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (rc || req->newptr == NULL)
 		return rc;
 
 	for (i = 0, next = buf, p = strsep(&next, " ,"); i < 8 && p;
 	    i++, p = strsep(&next, " ,")) {
 		ndesc[i] = strtoul(p, NULL, 10);
 	}
 
 	return(rc);
 }
 
 #define NAME_BUFLEN 32
 static void
 iflib_add_device_sysctl_pre(if_ctx_t ctx)
 {
         device_t dev = iflib_get_dev(ctx);
 	struct sysctl_oid_list *child, *oid_list;
 	struct sysctl_ctx_list *ctx_list;
 	struct sysctl_oid *node;
 
 	ctx_list = device_get_sysctl_ctx(dev);
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
 	ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, "iflib",
 						      CTLFLAG_RD, NULL, "IFLIB fields");
 	oid_list = SYSCTL_CHILDREN(node);
 
 	SYSCTL_ADD_STRING(ctx_list, oid_list, OID_AUTO, "driver_version",
 		       CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version, 0,
 		       "driver version");
 
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0,
 			"# of txqs to use, 0 => use default #");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0,
 			"# of rxqs to use, 0 => use default #");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0,
                        "permit #txq != #rxq");
 
 	/* XXX change for per-queue sizes */
 	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
 		       CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NTXD_HANDLER,
                        mp_ndesc_handler, "A",
                        "list of # of tx descriptors to use, 0 = use default #");
 	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds",
 		       CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NRXD_HANDLER,
                        mp_ndesc_handler, "A",
                        "list of # of rx descriptors to use, 0 = use default #");
 }
 
 static void
 iflib_add_device_sysctl_post(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
         device_t dev = iflib_get_dev(ctx);
 	struct sysctl_oid_list *child;
 	struct sysctl_ctx_list *ctx_list;
 	iflib_fl_t fl;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	int i, j;
 	char namebuf[NAME_BUFLEN];
 	char *qfmt;
 	struct sysctl_oid *queue_node, *fl_node, *node;
 	struct sysctl_oid_list *queue_list, *fl_list;
 	ctx_list = device_get_sysctl_ctx(dev);
 
 	node = ctx->ifc_sysctl_node;
 	child = SYSCTL_CHILDREN(node);
 
 	if (scctx->isc_ntxqsets > 100)
 		qfmt = "txq%03d";
 	else if (scctx->isc_ntxqsets > 10)
 		qfmt = "txq%02d";
 	else
 		qfmt = "txq%d";
 	for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) {
 		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
 		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
 					     CTLFLAG_RD, NULL, "Queue Name");
 		queue_list = SYSCTL_CHILDREN(queue_node);
 #if MEMORY_LOGGING
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued",
 				CTLFLAG_RD,
 				&txq->ift_dequeued, "total mbufs freed");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued",
 				CTLFLAG_RD,
 				&txq->ift_enqueued, "total mbufs enqueued");
 #endif
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag",
 				   CTLFLAG_RD,
 				   &txq->ift_mbuf_defrag, "# of times m_defrag was called");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "m_pullups",
 				   CTLFLAG_RD,
 				   &txq->ift_pullups, "# of times m_pullup was called");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag_failed",
 				   CTLFLAG_RD,
 				   &txq->ift_mbuf_defrag_failed, "# of times m_defrag failed");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_desc_avail",
 				   CTLFLAG_RD,
 				   &txq->ift_no_desc_avail, "# of times no descriptors were available");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "tx_map_failed",
 				   CTLFLAG_RD,
 				   &txq->ift_map_failed, "# of times dma map failed");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txd_encap_efbig",
 				   CTLFLAG_RD,
 				   &txq->ift_txd_encap_efbig, "# of times txd_encap returned EFBIG");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_tx_dma_setup",
 				   CTLFLAG_RD,
 				   &txq->ift_no_tx_dma_setup, "# of times map failed for other than EFBIG");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx",
 				   CTLFLAG_RD,
 				   &txq->ift_pidx, 1, "Producer Index");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx",
 				   CTLFLAG_RD,
 				   &txq->ift_cidx, 1, "Consumer Index");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx_processed",
 				   CTLFLAG_RD,
 				   &txq->ift_cidx_processed, 1, "Consumer Index seen by credit update");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use",
 				   CTLFLAG_RD,
 				   &txq->ift_in_use, 1, "descriptors in use");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_processed",
 				   CTLFLAG_RD,
 				   &txq->ift_processed, "descriptors procesed for clean");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned",
 				   CTLFLAG_RD,
 				   &txq->ift_cleaned, "total cleaned");
 		SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state",
 				CTLTYPE_STRING | CTLFLAG_RD, __DEVOLATILE(uint64_t *, &txq->ift_br[0]->state),
 				0, mp_ring_state_handler, "A", "soft ring state");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_enqueues",
 				       CTLFLAG_RD, &txq->ift_br[0]->enqueues,
 				       "# of enqueues to the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_drops",
 				       CTLFLAG_RD, &txq->ift_br[0]->drops,
 				       "# of drops in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_starts",
 				       CTLFLAG_RD, &txq->ift_br[0]->starts,
 				       "# of normal consumer starts in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_stalls",
 				       CTLFLAG_RD, &txq->ift_br[0]->stalls,
 					       "# of consumer stalls in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_restarts",
 			       CTLFLAG_RD, &txq->ift_br[0]->restarts,
 				       "# of consumer restarts in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_abdications",
 				       CTLFLAG_RD, &txq->ift_br[0]->abdications,
 				       "# of consumer abdications in the mp_ring for this queue");
 	}
 
 	if (scctx->isc_nrxqsets > 100)
 		qfmt = "rxq%03d";
 	else if (scctx->isc_nrxqsets > 10)
 		qfmt = "rxq%02d";
 	else
 		qfmt = "rxq%d";
 	for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) {
 		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
 		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
 					     CTLFLAG_RD, NULL, "Queue Name");
 		queue_list = SYSCTL_CHILDREN(queue_node);
 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_pidx",
 				       CTLFLAG_RD,
 				       &rxq->ifr_cq_pidx, 1, "Producer Index");
 			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx",
 				       CTLFLAG_RD,
 				       &rxq->ifr_cq_cidx, 1, "Consumer Index");
 		}
 
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
 			snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j);
 			fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list, OID_AUTO, namebuf,
 						     CTLFLAG_RD, NULL, "freelist Name");
 			fl_list = SYSCTL_CHILDREN(fl_node);
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx",
 				       CTLFLAG_RD,
 				       &fl->ifl_pidx, 1, "Producer Index");
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx",
 				       CTLFLAG_RD,
 				       &fl->ifl_cidx, 1, "Consumer Index");
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits",
 				       CTLFLAG_RD,
 				       &fl->ifl_credits, 1, "credits available");
 #if MEMORY_LOGGING
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_enqueued",
 					CTLFLAG_RD,
 					&fl->ifl_m_enqueued, "mbufs allocated");
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_dequeued",
 					CTLFLAG_RD,
 					&fl->ifl_m_dequeued, "mbufs freed");
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_enqueued",
 					CTLFLAG_RD,
 					&fl->ifl_cl_enqueued, "clusters allocated");
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_dequeued",
 					CTLFLAG_RD,
 					&fl->ifl_cl_dequeued, "clusters freed");
 #endif
 
 		}
 	}
 
 }
Index: projects/netbsd-tests-upstream-01-2017/sys/sys/ata.h
===================================================================
--- projects/netbsd-tests-upstream-01-2017/sys/sys/ata.h	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/sys/sys/ata.h	(revision 312218)
@@ -1,1015 +1,1015 @@
 /*-
  * Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _SYS_ATA_H_
 #define _SYS_ATA_H_
 
 #include <sys/ioccom.h>
 
 /* ATA/ATAPI device parameters */
 struct ata_params {
 /*000*/ u_int16_t       config;         /* configuration info */
 #define ATA_PROTO_MASK                  0x8003
 #define ATA_PROTO_ATAPI                 0x8000
 #define ATA_PROTO_ATAPI_12              0x8000
 #define ATA_PROTO_ATAPI_16              0x8001
 #define ATA_PROTO_CFA                   0x848a
 #define ATA_ATAPI_TYPE_MASK             0x1f00
 #define ATA_ATAPI_TYPE_DIRECT           0x0000  /* disk/floppy */
 #define ATA_ATAPI_TYPE_TAPE             0x0100  /* streaming tape */
 #define ATA_ATAPI_TYPE_CDROM            0x0500  /* CD-ROM device */
 #define ATA_ATAPI_TYPE_OPTICAL          0x0700  /* optical disk */
 #define ATA_DRQ_MASK                    0x0060
 #define ATA_DRQ_SLOW                    0x0000  /* cpu 3 ms delay */
 #define ATA_DRQ_INTR                    0x0020  /* interrupt 10 ms delay */
 #define ATA_DRQ_FAST                    0x0040  /* accel 50 us delay */
 #define ATA_RESP_INCOMPLETE             0x0004
 
 /*001*/ u_int16_t       cylinders;              /* # of cylinders */
 /*002*/ u_int16_t       specconf;		/* specific configuration */
 /*003*/ u_int16_t       heads;                  /* # heads */
 	u_int16_t       obsolete4;
 	u_int16_t       obsolete5;
 /*006*/ u_int16_t       sectors;                /* # sectors/track */
 /*007*/ u_int16_t       vendor7[3];
 /*010*/ u_int8_t        serial[20];             /* serial number */
 /*020*/ u_int16_t       retired20;
 	u_int16_t       retired21;
 	u_int16_t       obsolete22;
 /*023*/ u_int8_t        revision[8];            /* firmware revision */
 /*027*/ u_int8_t        model[40];              /* model name */
 /*047*/ u_int16_t       sectors_intr;           /* sectors per interrupt */
 /*048*/ u_int16_t       usedmovsd;              /* double word read/write? */
 /*049*/ u_int16_t       capabilities1;
 #define ATA_SUPPORT_DMA                 0x0100
 #define ATA_SUPPORT_LBA                 0x0200
 #define ATA_SUPPORT_IORDY               0x0400
 #define ATA_SUPPORT_IORDYDIS            0x0800
 #define ATA_SUPPORT_OVERLAP             0x4000
 
 /*050*/ u_int16_t       capabilities2;
 /*051*/ u_int16_t       retired_piomode;        /* PIO modes 0-2 */
 #define ATA_RETIRED_PIO_MASK            0x0300
 
 /*052*/ u_int16_t       retired_dmamode;        /* DMA modes */
 #define ATA_RETIRED_DMA_MASK            0x0003
 
 /*053*/ u_int16_t       atavalid;               /* fields valid */
 #define ATA_FLAG_54_58                  0x0001  /* words 54-58 valid */
 #define ATA_FLAG_64_70                  0x0002  /* words 64-70 valid */
 #define ATA_FLAG_88                     0x0004  /* word 88 valid */
 
 /*054*/ u_int16_t       current_cylinders;
 /*055*/ u_int16_t       current_heads;
 /*056*/ u_int16_t       current_sectors;
 /*057*/ u_int16_t       current_size_1;
 /*058*/ u_int16_t       current_size_2;
 /*059*/ u_int16_t       multi;
 #define ATA_MULTI_VALID                 0x0100
 
 /*060*/ u_int16_t       lba_size_1;
 	u_int16_t       lba_size_2;
 	u_int16_t       obsolete62;
 /*063*/ u_int16_t       mwdmamodes;             /* multiword DMA modes */
 /*064*/ u_int16_t       apiomodes;              /* advanced PIO modes */
 
 /*065*/ u_int16_t       mwdmamin;               /* min. M/W DMA time/word ns */
 /*066*/ u_int16_t       mwdmarec;               /* rec. M/W DMA time ns */
 /*067*/ u_int16_t       pioblind;               /* min. PIO cycle w/o flow */
 /*068*/ u_int16_t       pioiordy;               /* min. PIO cycle IORDY flow */
 /*069*/ u_int16_t       support3;
 #define ATA_SUPPORT_RZAT                0x0020
 #define ATA_SUPPORT_DRAT                0x4000
 #define	ATA_SUPPORT_ZONE_MASK		0x0003
 #define	ATA_SUPPORT_ZONE_NR		0x0000
 #define	ATA_SUPPORT_ZONE_HOST_AWARE	0x0001
 #define	ATA_SUPPORT_ZONE_DEV_MANAGED	0x0002
 	u_int16_t       reserved70;
 /*071*/ u_int16_t       rlsovlap;               /* rel time (us) for overlap */
 /*072*/ u_int16_t       rlsservice;             /* rel time (us) for service */
 	u_int16_t       reserved73;
 	u_int16_t       reserved74;
 /*075*/ u_int16_t       queue;
 #define ATA_QUEUE_LEN(x)                ((x) & 0x001f)
 
 /*76*/  u_int16_t       satacapabilities;
 #define ATA_SATA_GEN1                   0x0002
 #define ATA_SATA_GEN2                   0x0004
 #define ATA_SATA_GEN3                   0x0008
 #define ATA_SUPPORT_NCQ                 0x0100
 #define ATA_SUPPORT_IFPWRMNGTRCV        0x0200
 #define ATA_SUPPORT_PHYEVENTCNT         0x0400
 #define ATA_SUPPORT_NCQ_UNLOAD          0x0800
 #define ATA_SUPPORT_NCQ_PRIO            0x1000
 #define ATA_SUPPORT_HAPST               0x2000
 #define ATA_SUPPORT_DAPST               0x4000
 #define ATA_SUPPORT_READLOGDMAEXT       0x8000
 
 /*77*/  u_int16_t       satacapabilities2;
 #define ATA_SATA_CURR_GEN_MASK          0x0006
 #define ATA_SUPPORT_NCQ_STREAM          0x0010
 #define ATA_SUPPORT_NCQ_QMANAGEMENT     0x0020
 #define ATA_SUPPORT_RCVSND_FPDMA_QUEUED 0x0040
 /*78*/  u_int16_t       satasupport;
 #define ATA_SUPPORT_NONZERO             0x0002
 #define ATA_SUPPORT_AUTOACTIVATE        0x0004
 #define ATA_SUPPORT_IFPWRMNGT           0x0008
 #define ATA_SUPPORT_INORDERDATA         0x0010
 #define ATA_SUPPORT_ASYNCNOTIF          0x0020
 #define ATA_SUPPORT_SOFTSETPRESERVE     0x0040
 /*79*/  u_int16_t       sataenabled;
 #define ATA_ENABLED_DAPST               0x0080
 
 /*080*/ u_int16_t       version_major;
 /*081*/ u_int16_t       version_minor;
 
 	struct {
 /*082/085*/ u_int16_t   command1;
 #define ATA_SUPPORT_SMART               0x0001
 #define ATA_SUPPORT_SECURITY            0x0002
 #define ATA_SUPPORT_REMOVABLE           0x0004
 #define ATA_SUPPORT_POWERMGT            0x0008
 #define ATA_SUPPORT_PACKET              0x0010
 #define ATA_SUPPORT_WRITECACHE          0x0020
 #define ATA_SUPPORT_LOOKAHEAD           0x0040
 #define ATA_SUPPORT_RELEASEIRQ          0x0080
 #define ATA_SUPPORT_SERVICEIRQ          0x0100
 #define ATA_SUPPORT_RESET               0x0200
 #define ATA_SUPPORT_PROTECTED           0x0400
 #define ATA_SUPPORT_WRITEBUFFER         0x1000
 #define ATA_SUPPORT_READBUFFER          0x2000
 #define ATA_SUPPORT_NOP                 0x4000
 
 /*083/086*/ u_int16_t   command2;
 #define ATA_SUPPORT_MICROCODE           0x0001
 #define ATA_SUPPORT_QUEUED              0x0002
 #define ATA_SUPPORT_CFA                 0x0004
 #define ATA_SUPPORT_APM                 0x0008
 #define ATA_SUPPORT_NOTIFY              0x0010
 #define ATA_SUPPORT_STANDBY             0x0020
 #define ATA_SUPPORT_SPINUP              0x0040
 #define ATA_SUPPORT_MAXSECURITY         0x0100
 #define ATA_SUPPORT_AUTOACOUSTIC        0x0200
 #define ATA_SUPPORT_ADDRESS48           0x0400
 #define ATA_SUPPORT_OVERLAY             0x0800
 #define ATA_SUPPORT_FLUSHCACHE          0x1000
 #define ATA_SUPPORT_FLUSHCACHE48        0x2000
 
 /*084/087*/ u_int16_t   extension;
 #define ATA_SUPPORT_SMARTLOG		0x0001
 #define ATA_SUPPORT_SMARTTEST		0x0002
 #define ATA_SUPPORT_MEDIASN		0x0004
 #define ATA_SUPPORT_MEDIAPASS		0x0008
 #define ATA_SUPPORT_STREAMING		0x0010
 #define ATA_SUPPORT_GENLOG		0x0020
 #define ATA_SUPPORT_WRITEDMAFUAEXT	0x0040
 #define ATA_SUPPORT_WRITEDMAQFUAEXT	0x0080
 #define ATA_SUPPORT_64BITWWN		0x0100
 #define ATA_SUPPORT_UNLOAD		0x2000
 	} __packed support, enabled;
 
 /*088*/ u_int16_t       udmamodes;              /* UltraDMA modes */
 /*089*/ u_int16_t       erase_time;             /* time req'd in 2min units */
 /*090*/ u_int16_t       enhanced_erase_time;    /* time req'd in 2min units */
 /*091*/ u_int16_t       apm_value;
 /*092*/ u_int16_t       master_passwd_revision; /* password revision code */
 /*093*/ u_int16_t       hwres;
 #define ATA_CABLE_ID                    0x2000
 
 /*094*/ u_int16_t       acoustic;
 #define ATA_ACOUSTIC_CURRENT(x)         ((x) & 0x00ff)
 #define ATA_ACOUSTIC_VENDOR(x)          (((x) & 0xff00) >> 8)
 
 /*095*/ u_int16_t       stream_min_req_size;
 /*096*/ u_int16_t       stream_transfer_time;
 /*097*/ u_int16_t       stream_access_latency;
 /*098*/ u_int32_t       stream_granularity;
 /*100*/ u_int16_t       lba_size48_1;
 	u_int16_t       lba_size48_2;
 	u_int16_t       lba_size48_3;
 	u_int16_t       lba_size48_4;
 	u_int16_t       reserved104;
 /*105*/	u_int16_t       max_dsm_blocks;
 /*106*/	u_int16_t       pss;
 #define ATA_PSS_LSPPS			0x000F
 #define ATA_PSS_LSSABOVE512		0x1000
 #define ATA_PSS_MULTLS			0x2000
 #define ATA_PSS_VALID_MASK		0xC000
 #define ATA_PSS_VALID_VALUE		0x4000
 /*107*/ u_int16_t       isd;
 /*108*/ u_int16_t       wwn[4];
 	u_int16_t       reserved112[5];
 /*117*/ u_int16_t       lss_1;
 /*118*/ u_int16_t       lss_2;
 /*119*/ u_int16_t       support2;
 #define ATA_SUPPORT_WRITEREADVERIFY	0x0002
 #define ATA_SUPPORT_WRITEUNCORREXT	0x0004
 #define ATA_SUPPORT_RWLOGDMAEXT		0x0008
 #define ATA_SUPPORT_MICROCODE3		0x0010
 #define ATA_SUPPORT_FREEFALL		0x0020
 #define ATA_SUPPORT_SENSE_REPORT	0x0040
 #define ATA_SUPPORT_EPC			0x0080
 /*120*/ u_int16_t       enabled2;
 #define ATA_ENABLED_WRITEREADVERIFY	0x0002
 #define ATA_ENABLED_WRITEUNCORREXT	0x0004
 #define ATA_ENABLED_FREEFALL		0x0020
 #define ATA_ENABLED_SENSE_REPORT	0x0040
 #define ATA_ENABLED_EPC			0x0080
 	u_int16_t       reserved121[6];
 /*127*/ u_int16_t       removable_status;
 /*128*/ u_int16_t       security_status;
 #define ATA_SECURITY_LEVEL		0x0100	/* 0: high, 1: maximum */
 #define ATA_SECURITY_ENH_SUPP		0x0020	/* enhanced erase supported */
 #define ATA_SECURITY_COUNT_EXP		0x0010	/* count expired */
 #define ATA_SECURITY_FROZEN		0x0008	/* security config is frozen */
 #define ATA_SECURITY_LOCKED		0x0004	/* drive is locked */
 #define ATA_SECURITY_ENABLED		0x0002	/* ATA Security is enabled */
 #define ATA_SECURITY_SUPPORTED		0x0001	/* ATA Security is supported */
 
 	u_int16_t       reserved129[31];
 /*160*/ u_int16_t       cfa_powermode1;
 	u_int16_t       reserved161;
 /*162*/ u_int16_t       cfa_kms_support;
 /*163*/ u_int16_t       cfa_trueide_modes;
 /*164*/ u_int16_t       cfa_memory_modes;
 	u_int16_t       reserved165[4];
 /*169*/	u_int16_t       support_dsm;
 #define ATA_SUPPORT_DSM_TRIM		0x0001
 	u_int16_t       reserved170[6];
 /*176*/ u_int8_t        media_serial[60];
 /*206*/ u_int16_t       sct;
 	u_int16_t       reserved206[2];
 /*209*/ u_int16_t       lsalign;
 /*210*/ u_int16_t       wrv_sectors_m3_1;
 	u_int16_t       wrv_sectors_m3_2;
 /*212*/ u_int16_t       wrv_sectors_m2_1;
 	u_int16_t       wrv_sectors_m2_2;
 /*214*/ u_int16_t       nv_cache_caps;
 /*215*/ u_int16_t       nv_cache_size_1;
 	u_int16_t       nv_cache_size_2;
 /*217*/ u_int16_t       media_rotation_rate;
 #define ATA_RATE_NOT_REPORTED		0x0000
 #define ATA_RATE_NON_ROTATING		0x0001
 	u_int16_t       reserved218;
 /*219*/ u_int16_t       nv_cache_opt;
 /*220*/ u_int16_t       wrv_mode;
 	u_int16_t       reserved221;
 /*222*/ u_int16_t       transport_major;
 /*223*/ u_int16_t       transport_minor;
 	u_int16_t       reserved224[31];
 /*255*/ u_int16_t       integrity;
 } __packed;
 
 /* ATA Dataset Management */
 #define ATA_DSM_BLK_SIZE	512
 #define ATA_DSM_BLK_RANGES	64
 #define ATA_DSM_RANGE_SIZE	8
 #define ATA_DSM_RANGE_MAX	65535
 
 /*
  * ATA Device Register
  *
  * bit 7 Obsolete (was 1 in early ATA specs)
  * bit 6 Sets LBA/CHS mode. 1=LBA, 0=CHS 
  * bit 5 Obsolete (was 1 in early ATA specs)
  * bit 4 1 = Slave Drive, 0 = Master Drive
  * bit 3-0 In LBA mode, 27-24 of address. In CHS mode, head number
 */
 
 #define ATA_DEV_MASTER		0x00
 #define ATA_DEV_SLAVE		0x10
 #define ATA_DEV_LBA		0x40
 
 /* ATA limits */
 #define ATA_MAX_28BIT_LBA	268435455UL
 
 /* ATA Status Register */
 #define ATA_STATUS_ERROR		0x01
 #define ATA_STATUS_SENSE_AVAIL		0x02
 #define ATA_STATUS_ALIGN_ERR		0x04
 #define ATA_STATUS_DATA_REQ		0x08
 #define ATA_STATUS_DEF_WRITE_ERR	0x10
 #define ATA_STATUS_DEVICE_FAULT		0x20
 #define ATA_STATUS_DEVICE_READY		0x40
 #define ATA_STATUS_BUSY			0x80
 
 /* ATA Error Register */
 #define ATA_ERROR_ABORT		0x04
 #define ATA_ERROR_ID_NOT_FOUND	0x10
 
 /* ATA HPA Features */
 #define ATA_HPA_FEAT_MAX_ADDR	0x00
 #define ATA_HPA_FEAT_SET_PWD	0x01
 #define ATA_HPA_FEAT_LOCK	0x02
 #define ATA_HPA_FEAT_UNLOCK	0x03
 #define ATA_HPA_FEAT_FREEZE	0x04
 
 /* ATA transfer modes */
 #define ATA_MODE_MASK           0x0f
 #define ATA_DMA_MASK            0xf0
 #define ATA_PIO                 0x00
 #define ATA_PIO0                0x08
 #define ATA_PIO1                0x09
 #define ATA_PIO2                0x0a
 #define ATA_PIO3                0x0b
 #define ATA_PIO4                0x0c
 #define ATA_PIO_MAX             0x0f
 #define ATA_DMA                 0x10
 #define ATA_WDMA0               0x20
 #define ATA_WDMA1               0x21
 #define ATA_WDMA2               0x22
 #define ATA_UDMA0               0x40
 #define ATA_UDMA1               0x41
 #define ATA_UDMA2               0x42
 #define ATA_UDMA3               0x43
 #define ATA_UDMA4               0x44
 #define ATA_UDMA5               0x45
 #define ATA_UDMA6               0x46
 #define ATA_SA150               0x47
 #define ATA_SA300               0x48
 #define ATA_SA600               0x49
 #define ATA_DMA_MAX             0x4f
 
 
 /* ATA commands */
 #define ATA_NOP                         0x00    /* NOP */
 #define         ATA_NF_FLUSHQUEUE       0x00    /* flush queued cmd's */
 #define         ATA_NF_AUTOPOLL         0x01    /* start autopoll function */
 #define ATA_DATA_SET_MANAGEMENT		0x06
 #define 	ATA_DSM_TRIM		0x01
 #define ATA_DEVICE_RESET                0x08    /* reset device */
 #define ATA_READ                        0x20    /* read */
 #define ATA_READ48                      0x24    /* read 48bit LBA */
 #define ATA_READ_DMA48                  0x25    /* read DMA 48bit LBA */
 #define ATA_READ_DMA_QUEUED48           0x26    /* read DMA QUEUED 48bit LBA */
 #define ATA_READ_NATIVE_MAX_ADDRESS48   0x27    /* read native max addr 48bit */
 #define ATA_READ_MUL48                  0x29    /* read multi 48bit LBA */
 #define ATA_READ_STREAM_DMA48           0x2a    /* read DMA stream 48bit LBA */
 #define ATA_READ_LOG_EXT                0x2f    /* read log ext - PIO Data-In */
 #define ATA_READ_STREAM48               0x2b    /* read stream 48bit LBA */
 #define ATA_WRITE                       0x30    /* write */
 #define ATA_WRITE48                     0x34    /* write 48bit LBA */
 #define ATA_WRITE_DMA48                 0x35    /* write DMA 48bit LBA */
 #define ATA_WRITE_DMA_QUEUED48          0x36    /* write DMA QUEUED 48bit LBA*/
 #define ATA_SET_MAX_ADDRESS48           0x37    /* set max address 48bit */
 #define ATA_WRITE_MUL48                 0x39    /* write multi 48bit LBA */
 #define ATA_WRITE_STREAM_DMA48          0x3a
 #define ATA_WRITE_STREAM48              0x3b
 #define ATA_WRITE_DMA_FUA48             0x3d
 #define ATA_WRITE_DMA_QUEUED_FUA48      0x3e
 #define ATA_WRITE_LOG_EXT               0x3f
 #define ATA_READ_VERIFY                 0x40
 #define ATA_READ_VERIFY48               0x42
 #define ATA_WRITE_UNCORRECTABLE48       0x45    /* write uncorrectable 48bit LBA */
 #define         ATA_WU_PSEUDO           0x55    /* pseudo-uncorrectable error */
 #define         ATA_WU_FLAGGED          0xaa    /* flagged-uncorrectable error */
 #define ATA_READ_LOG_DMA_EXT            0x47    /* read log DMA ext - PIO Data-In */
 #define	ATA_ZAC_MANAGEMENT_IN		0x4a	/* ZAC management in */
 #define		ATA_ZM_REPORT_ZONES	0x00	/* report zones */
 #define ATA_READ_FPDMA_QUEUED           0x60    /* read DMA NCQ */
 #define ATA_WRITE_FPDMA_QUEUED          0x61    /* write DMA NCQ */
 #define ATA_NCQ_NON_DATA		0x63	/* NCQ non-data command */
 #define		ATA_ABORT_NCQ_QUEUE	0x00	/* abort NCQ queue */
 #define		ATA_DEADLINE_HANDLING	0x01	/* deadline handling */
 #define		ATA_SET_FEATURES	0x05	/* set features */
 #define		ATA_ZERO_EXT		0x06	/* zero ext */
 #define		ATA_NCQ_ZAC_MGMT_OUT	0x07	/* NCQ ZAC mgmt out no data */
 #define ATA_SEND_FPDMA_QUEUED           0x64    /* send DMA NCQ */
 #define		ATA_SFPDMA_DSM		0x00	/* Data set management */
 #define			ATA_SFPDMA_DSM_TRIM	0x01	/* Set trim bit in auxiliary */
 #define		ATA_SFPDMA_HYBRID_EVICT	0x01	/* Hybrid Evict */
 #define		ATA_SFPDMA_WLDMA	0x02	/* Write Log DMA EXT */
 #define		ATA_SFPDMA_ZAC_MGMT_OUT	0x03	/* NCQ ZAC mgmt out w/data */
 #define ATA_RECV_FPDMA_QUEUED           0x65    /* receive DMA NCQ */
 #define		ATA_RFPDMA_RL_DMA_EXT	0x00	/* Read Log DMA EXT */
 #define		ATA_RFPDMA_ZAC_MGMT_IN	0x02	/* NCQ ZAC mgmt in w/data */
 #define ATA_SEP_ATTN                    0x67    /* SEP request */
 #define ATA_SEEK                        0x70    /* seek */
 #define	ATA_ZAC_MANAGEMENT_OUT		0x9f	/* ZAC management out */
 #define		ATA_ZM_CLOSE_ZONE	0x01	/* close zone */
 #define		ATA_ZM_FINISH_ZONE	0x02	/* finish zone */
 #define		ATA_ZM_OPEN_ZONE	0x03	/* open zone */
 #define		ATA_ZM_RWP		0x04	/* reset write pointer */
 #define ATA_PACKET_CMD                  0xa0    /* packet command */
 #define ATA_ATAPI_IDENTIFY              0xa1    /* get ATAPI params*/
 #define ATA_SERVICE                     0xa2    /* service command */
 #define ATA_SMART_CMD                   0xb0    /* SMART command */
 #define ATA_CFA_ERASE                   0xc0    /* CFA erase */
 #define ATA_READ_MUL                    0xc4    /* read multi */
 #define ATA_WRITE_MUL                   0xc5    /* write multi */
 #define ATA_SET_MULTI                   0xc6    /* set multi size */
 #define ATA_READ_DMA_QUEUED             0xc7    /* read DMA QUEUED */
 #define ATA_READ_DMA                    0xc8    /* read DMA */
 #define ATA_WRITE_DMA                   0xca    /* write DMA */
 #define ATA_WRITE_DMA_QUEUED            0xcc    /* write DMA QUEUED */
 #define ATA_WRITE_MUL_FUA48             0xce
 #define ATA_STANDBY_IMMEDIATE           0xe0    /* standby immediate */
 #define ATA_IDLE_IMMEDIATE              0xe1    /* idle immediate */
 #define ATA_STANDBY_CMD                 0xe2    /* standby */
 #define ATA_IDLE_CMD                    0xe3    /* idle */
 #define ATA_READ_BUFFER                 0xe4    /* read buffer */
 #define ATA_READ_PM                     0xe4    /* read portmultiplier */
 #define ATA_CHECK_POWER_MODE            0xe5    /* device power mode */
 #define ATA_SLEEP                       0xe6    /* sleep */
 #define ATA_FLUSHCACHE                  0xe7    /* flush cache to disk */
 #define ATA_WRITE_PM                    0xe8    /* write portmultiplier */
 #define ATA_FLUSHCACHE48                0xea    /* flush cache to disk */
 #define ATA_ATA_IDENTIFY                0xec    /* get ATA params */
 #define ATA_SETFEATURES                 0xef    /* features command */
 #define         ATA_SF_ENAB_WCACHE      0x02    /* enable write cache */
 #define         ATA_SF_DIS_WCACHE       0x82    /* disable write cache */
 #define         ATA_SF_SETXFER          0x03    /* set transfer mode */
 #define		ATA_SF_APM		0x05	/* Enable APM feature set */
 #define         ATA_SF_ENAB_PUIS        0x06    /* enable PUIS */
 #define         ATA_SF_DIS_PUIS         0x86    /* disable PUIS */
 #define         ATA_SF_PUIS_SPINUP      0x07    /* PUIS spin-up */
 #define		ATA_SF_WRV		0x0b	/* Enable Write-Read-Verify */
 #define 	ATA_SF_DLC		0x0c	/* Enable device life control */
 #define 	ATA_SF_SATA		0x10	/* Enable use of SATA feature */
 #define 	ATA_SF_FFC		0x41	/* Free-fall Control */
 #define 	ATA_SF_MHIST		0x43	/* Set Max Host Sect. Times */
 #define 	ATA_SF_RATE		0x45	/* Set Rate Basis */
 #define 	ATA_SF_EPC		0x4A	/* Extended Power Conditions */
 #define         ATA_SF_ENAB_RCACHE      0xaa    /* enable readahead cache */
 #define         ATA_SF_DIS_RCACHE       0x55    /* disable readahead cache */
 #define         ATA_SF_ENAB_RELIRQ      0x5d    /* enable release interrupt */
 #define         ATA_SF_DIS_RELIRQ       0xdd    /* disable release interrupt */
 #define         ATA_SF_ENAB_SRVIRQ      0x5e    /* enable service interrupt */
 #define         ATA_SF_DIS_SRVIRQ       0xde    /* disable service interrupt */
 #define 	ATA_SF_LPSAERC		0x62	/* Long Phys Sect Align ErrRep*/
 #define 	ATA_SF_DSN		0x63	/* Device Stats Notification */
 #define ATA_CHECK_POWER_MODE		0xe5	/* Check Power Mode */
 #define ATA_SECURITY_SET_PASSWORD       0xf1    /* set drive password */
 #define ATA_SECURITY_UNLOCK             0xf2    /* unlock drive using passwd */
 #define ATA_SECURITY_ERASE_PREPARE      0xf3    /* prepare to erase drive */
 #define ATA_SECURITY_ERASE_UNIT         0xf4    /* erase all blocks on drive */
 #define ATA_SECURITY_FREEZE_LOCK        0xf5    /* freeze security config */
 #define ATA_SECURITY_DISABLE_PASSWORD   0xf6    /* disable drive password */
 #define ATA_READ_NATIVE_MAX_ADDRESS     0xf8    /* read native max address */
 #define ATA_SET_MAX_ADDRESS             0xf9    /* set max address */
 
 
 /* ATAPI commands */
 #define ATAPI_TEST_UNIT_READY           0x00    /* check if device is ready */
 #define ATAPI_REZERO                    0x01    /* rewind */
 #define ATAPI_REQUEST_SENSE             0x03    /* get sense data */
 #define ATAPI_FORMAT                    0x04    /* format unit */
 #define ATAPI_READ                      0x08    /* read data */
 #define ATAPI_WRITE                     0x0a    /* write data */
 #define ATAPI_WEOF                      0x10    /* write filemark */
 #define         ATAPI_WF_WRITE          0x01
 #define ATAPI_SPACE                     0x11    /* space command */
 #define         ATAPI_SP_FM             0x01
 #define         ATAPI_SP_EOD            0x03
 #define ATAPI_INQUIRY			0x12	/* get inquiry data */
 #define ATAPI_MODE_SELECT               0x15    /* mode select */
 #define ATAPI_ERASE                     0x19    /* erase */
 #define ATAPI_MODE_SENSE                0x1a    /* mode sense */
 #define ATAPI_START_STOP                0x1b    /* start/stop unit */
 #define         ATAPI_SS_LOAD           0x01
 #define         ATAPI_SS_RETENSION      0x02
 #define         ATAPI_SS_EJECT          0x04
 #define ATAPI_PREVENT_ALLOW             0x1e    /* media removal */
 #define ATAPI_READ_FORMAT_CAPACITIES    0x23    /* get format capacities */
 #define ATAPI_READ_CAPACITY             0x25    /* get volume capacity */
 #define ATAPI_READ_BIG                  0x28    /* read data */
 #define ATAPI_WRITE_BIG                 0x2a    /* write data */
 #define ATAPI_LOCATE                    0x2b    /* locate to position */
 #define ATAPI_READ_POSITION             0x34    /* read position */
 #define ATAPI_SYNCHRONIZE_CACHE         0x35    /* flush buf, close channel */
 #define ATAPI_WRITE_BUFFER              0x3b    /* write device buffer */
 #define ATAPI_READ_BUFFER               0x3c    /* read device buffer */
 #define ATAPI_READ_SUBCHANNEL           0x42    /* get subchannel info */
 #define ATAPI_READ_TOC                  0x43    /* get table of contents */
 #define ATAPI_PLAY_10                   0x45    /* play by lba */
 #define ATAPI_PLAY_MSF                  0x47    /* play by MSF address */
 #define ATAPI_PLAY_TRACK                0x48    /* play by track number */
 #define ATAPI_PAUSE                     0x4b    /* pause audio operation */
 #define ATAPI_READ_DISK_INFO            0x51    /* get disk info structure */
 #define ATAPI_READ_TRACK_INFO           0x52    /* get track info structure */
 #define ATAPI_RESERVE_TRACK             0x53    /* reserve track */
 #define ATAPI_SEND_OPC_INFO             0x54    /* send OPC structurek */
 #define ATAPI_MODE_SELECT_BIG           0x55    /* set device parameters */
 #define ATAPI_REPAIR_TRACK              0x58    /* repair track */
 #define ATAPI_READ_MASTER_CUE           0x59    /* read master CUE info */
 #define ATAPI_MODE_SENSE_BIG            0x5a    /* get device parameters */
 #define ATAPI_CLOSE_TRACK               0x5b    /* close track/session */
 #define ATAPI_READ_BUFFER_CAPACITY      0x5c    /* get buffer capicity */
 #define ATAPI_SEND_CUE_SHEET            0x5d    /* send CUE sheet */
 #define ATAPI_SERVICE_ACTION_IN         0x96	/* get service data */
 #define ATAPI_BLANK                     0xa1    /* blank the media */
 #define ATAPI_SEND_KEY                  0xa3    /* send DVD key structure */
 #define ATAPI_REPORT_KEY                0xa4    /* get DVD key structure */
 #define ATAPI_PLAY_12                   0xa5    /* play by lba */
 #define ATAPI_LOAD_UNLOAD               0xa6    /* changer control command */
 #define ATAPI_READ_STRUCTURE            0xad    /* get DVD structure */
 #define ATAPI_PLAY_CD                   0xb4    /* universal play command */
 #define ATAPI_SET_SPEED                 0xbb    /* set drive speed */
 #define ATAPI_MECH_STATUS               0xbd    /* get changer status */
 #define ATAPI_READ_CD                   0xbe    /* read data */
 #define ATAPI_POLL_DSC                  0xff    /* poll DSC status bit */
 
 
 struct ata_ioc_devices {
     int                 channel;
     char                name[2][32];
     struct ata_params   params[2];
 };
 
 /* pr channel ATA ioctl calls */
 #define IOCATAGMAXCHANNEL       _IOR('a',  1, int)
 #define IOCATAREINIT            _IOW('a',  2, int)
 #define IOCATAATTACH            _IOW('a',  3, int)
 #define IOCATADETACH            _IOW('a',  4, int)
 #define IOCATADEVICES           _IOWR('a',  5, struct ata_ioc_devices)
 
 /* ATAPI request sense structure */
 struct atapi_sense {
     u_int8_t	error;				/* current or deferred errors */
 #define	ATA_SENSE_VALID			0x80
 
     u_int8_t	segment;			/* segment number */
     u_int8_t	key;				/* sense key */
 #define ATA_SENSE_KEY_MASK		0x0f    /* sense key mask */
 #define ATA_SENSE_NO_SENSE		0x00    /* no specific sense key info */
 #define ATA_SENSE_RECOVERED_ERROR 	0x01    /* command OK, data recovered */
 #define ATA_SENSE_NOT_READY		0x02    /* no access to drive */
 #define ATA_SENSE_MEDIUM_ERROR		0x03    /* non-recovered data error */
 #define ATA_SENSE_HARDWARE_ERROR	0x04    /* non-recoverable HW failure */
 #define ATA_SENSE_ILLEGAL_REQUEST	0x05    /* invalid command param(s) */
 #define ATA_SENSE_UNIT_ATTENTION	0x06    /* media changed */
 #define ATA_SENSE_DATA_PROTECT		0x07    /* write protect */
 #define ATA_SENSE_BLANK_CHECK		0x08    /* blank check */
 #define ATA_SENSE_VENDOR_SPECIFIC	0x09    /* vendor specific skey */
 #define ATA_SENSE_COPY_ABORTED		0x0a    /* copy aborted */
 #define ATA_SENSE_ABORTED_COMMAND	0x0b    /* command aborted, try again */
 #define ATA_SENSE_EQUAL			0x0c    /* equal */
 #define ATA_SENSE_VOLUME_OVERFLOW	0x0d    /* volume overflow */
 #define ATA_SENSE_MISCOMPARE		0x0e    /* data dont match the medium */
 #define ATA_SENSE_RESERVED		0x0f
 #define	ATA_SENSE_ILI			0x20;
 #define	ATA_SENSE_EOM			0x40;
 #define	ATA_SENSE_FILEMARK		0x80;
 
     u_int32_t   cmd_info;		/* cmd information */
     u_int8_t	sense_length;		/* additional sense len (n-7) */
     u_int32_t   cmd_specific_info;	/* additional cmd spec info */
     u_int8_t    asc;			/* additional sense code */
     u_int8_t    ascq;			/* additional sense code qual */
     u_int8_t    replaceable_unit_code;	/* replaceable unit code */
     u_int8_t	specific;		/* sense key specific */
 #define	ATA_SENSE_SPEC_VALID	0x80
 #define	ATA_SENSE_SPEC_MASK	0x7f
 	
     u_int8_t	specific1;		/* sense key specific */
     u_int8_t	specific2;		/* sense key specific */
 } __packed;
 
 /*
  * SET FEATURES subcommands
  */
 
 /*
  * SET FEATURES command
  * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
  * These values go in the LBA 3:0.
  */
 #define ATA_SF_EPC_RESTORE	0x00	/* Restore Power Condition Settings */
 #define ATA_SF_EPC_GOTO		0x01	/* Go To Power Condition */
 #define ATA_SF_EPC_SET_TIMER	0x02	/* Set Power Condition Timer */
 #define ATA_SF_EPC_SET_STATE	0x03	/* Set Power Condition State */
 #define ATA_SF_EPC_ENABLE	0x04	/* Enable the EPC feature set */
 #define ATA_SF_EPC_DISABLE	0x05	/* Disable the EPC feature set */
 #define ATA_SF_EPC_SET_SOURCE	0x06	/* Set EPC Power Source */
 
 /*
  * SET FEATURES command
  * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
  * Power Condition ID field
  * These values go in the count register.
  */
 #define ATA_EPC_STANDBY_Z	0x00	/* Substate of PM2:Standby */
 #define ATA_EPC_STANDBY_Y	0x01	/* Substate of PM2:Standby */
 #define ATA_EPC_IDLE_A		0x81	/* Substate of PM1:Idle */
 #define ATA_EPC_IDLE_B		0x82	/* Substate of PM1:Idle */
 #define ATA_EPC_IDLE_C		0x83	/* Substate of PM1:Idle */
 #define ATA_EPC_ALL		0xff	/* All supported power conditions */
 
 /*
  * SET FEATURES command
  * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
  * Restore Power Conditions Settings subcommand
  * These values go in the LBA register.
  */
 #define ATA_SF_EPC_RST_DFLT	0x40	/* 1=Rst from Default, 0= from Saved */
 #define ATA_SF_EPC_RST_SAVE	0x10	/* 1=Save on completion */
 
 /*
  * SET FEATURES command
  * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
  * Got To Power Condition subcommand
  * These values go in the LBA register.
  */
 #define ATA_SF_EPC_GOTO_DELAY	0x02000000	/* Delayed entry bit */
 #define ATA_SF_EPC_GOTO_HOLD	0x01000000	/* Hold Power Cond bit */
 
 /*
  * SET FEATURES command
  * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
  * Set Power Condition Timer subcommand
  * These values go in the LBA register.
  */
 #define ATA_SF_EPC_TIMER_MASK	0x00ffff00	/* Timer field */
 #define ATA_SF_EPC_TIMER_SHIFT	8
 #define ATA_SF_EPC_TIMER_SEC	0x00000080	/* Timer units, 1=sec, 0=.1s */
 #define ATA_SF_EPC_TIMER_EN	0x00000020	/* Enable/disable cond. */
 #define ATA_SF_EPC_TIMER_SAVE	0x00000010	/* Save settings on comp.  */
 
 /*
  * SET FEATURES command
  * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
  * Set Power Condition State subcommand
  * These values go in the LBA register.
  */
 #define ATA_SF_EPC_SETCON_EN	0x00000020	/* Enable power cond. */
 #define ATA_SF_EPC_SETCON_SAVE	0x00000010	/* Save settings on comp */
 
 /*
  * SET FEATURES command
  * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
  * Set EPC Power Source subcommand
  * These values go in the count register.
  */
 #define ATA_SF_EPC_SRC_UNKNOWN	0x0000	/* Unknown source */
 #define ATA_SF_EPC_SRC_BAT	0x0001	/* battery source */
 #define ATA_SF_EPC_SRC_NOT_BAT	0x0002	/* not battery source */
 
 #define	ATA_LOG_DIRECTORY	0x00	/* Directory of all logs */
 #define	ATA_POWER_COND_LOG	0x08	/* Power Conditions Log */
 #define	ATA_PCL_IDLE		0x00	/* Idle Power Conditions Page */
 #define	ATA_PCL_STANDBY		0x01	/* Standby Power Conditions Page */
 #define	ATA_IDENTIFY_DATA_LOG	0x30	/* Identify Device Data Log */
 #define	ATA_IDL_PAGE_LIST	0x00	/* List of supported pages */
 #define	ATA_IDL_IDENTIFY_DATA	0x01	/* Copy of Identify Device data */
 #define	ATA_IDL_CAPACITY	0x02	/* Capacity */
 #define	ATA_IDL_SUP_CAP		0x03	/* Supported Capabilities */
 #define	ATA_IDL_CUR_SETTINGS	0x04	/* Current Settings */
 #define	ATA_IDL_ATA_STRINGS	0x05	/* ATA Strings */
 #define	ATA_IDL_SECURITY	0x06	/* Security */
 #define	ATA_IDL_PARALLEL_ATA	0x07	/* Parallel ATA */
-#define	ATA_IDL_SERIAL_ATA	0x08	/* Seiral ATA */
+#define	ATA_IDL_SERIAL_ATA	0x08	/* Serial ATA */
 #define	ATA_IDL_ZDI		0x09	/* Zoned Device Information */
 
 struct ata_gp_log_dir {
 	uint8_t header[2];
 #define	ATA_GP_LOG_DIR_VERSION		0x0001
 	uint8_t num_pages[255*2];	/* Number of log pages at address */
 };
 
 /*
  * ATA Power Conditions log descriptor
  */
 struct ata_power_cond_log_desc {
 	uint8_t reserved1;
 	uint8_t flags;
 #define ATA_PCL_COND_SUPPORTED		0x80
 #define ATA_PCL_COND_SAVEABLE		0x40
 #define ATA_PCL_COND_CHANGEABLE		0x20
 #define ATA_PCL_DEFAULT_TIMER_EN	0x10
 #define ATA_PCL_SAVED_TIMER_EN		0x08
 #define ATA_PCL_CURRENT_TIMER_EN	0x04
 #define ATA_PCL_HOLD_PC_NOT_SUP		0x02
 	uint8_t reserved2[2];
 	uint8_t default_timer[4];
 	uint8_t saved_timer[4];
 	uint8_t current_timer[4];
 	uint8_t nom_time_to_active[4];
 	uint8_t min_timer[4];
 	uint8_t max_timer[4];
 	uint8_t num_transitions_to_pc[4];
 	uint8_t hours_in_pc[4];
 	uint8_t reserved3[28];
 };
 
 /*
  * ATA Power Conditions Log (0x08), Idle power conditions page (0x00)
  */
 struct ata_power_cond_log_idle {
 	struct ata_power_cond_log_desc idle_a_desc;
 	struct ata_power_cond_log_desc idle_b_desc;
 	struct ata_power_cond_log_desc idle_c_desc;
 	uint8_t reserved[320];
 };
 
 /*
  * ATA Power Conditions Log (0x08), Standby power conditions page (0x01)
  */
 struct ata_power_cond_log_standby {
 	uint8_t reserved[384];
 	struct ata_power_cond_log_desc standby_y_desc;
 	struct ata_power_cond_log_desc standby_z_desc;
 };
 
 /*
  * ATA IDENTIFY DEVICE data log (0x30) page 0x00
  * List of Supported IDENTIFY DEVICE data pages.
  */
 struct ata_identify_log_pages {
 	uint8_t header[8];
 #define	ATA_IDLOG_REVISION	0x0000000000000001
 	uint8_t entry_count;
 	uint8_t entries[503];
 };
 
 /*
  * ATA IDENTIFY DEVICE data log (0x30)
  * Capacity (Page 0x02).
  */
 struct ata_identify_log_capacity {
 	uint8_t header[8];
 #define	ATA_CAP_HEADER_VALID	0x8000000000000000
 #define	ATA_CAP_PAGE_NUM_MASK	0x0000000000ff0000
 #define	ATA_CAP_PAGE_NUM_SHIFT	16
 #define ATA_CAP_REV_MASK	0x00000000000000ff
 	uint8_t capacity[8];
 #define	ATA_CAP_CAPACITY_VALID	0x8000000000000000
 #define	ATA_CAP_ACCESSIBLE_CAP	0x0000ffffffffffff
 	uint8_t phys_logical_sect_size[8];
 #define	ATA_CAP_PL_VALID	0x8000000000000000
 #define	ATA_CAP_LTOP_REL_SUP	0x4000000000000000
 #define	ATA_CAP_LOG_SECT_SUP	0x2000000000000000
 #define	ATA_CAP_ALIGN_ERR_MASK	0x0000000000300000
 #define	ATA_CAP_LTOP_MASK	0x00000000000f0000
 #define	ATA_CAP_LOG_SECT_OFF	0x000000000000ffff
 	uint8_t logical_sect_size[8];
 #define	ATA_CAP_LOG_SECT_VALID	0x8000000000000000
 #define	ATA_CAP_LOG_SECT_SIZE	0x00000000ffffffff
 	uint8_t nominal_buffer_size[8];
 #define	ATA_CAP_NOM_BUF_VALID	0x8000000000000000
 #define	ATA_CAP_NOM_BUF_SIZE	0x7fffffffffffffff
 	uint8_t reserved[472];
 };
 
 /*
  * ATA IDENTIFY DEVICE data log (0x30)
  * Supported Capabilities (Page 0x03).
  */
 
 struct ata_identify_log_sup_cap {
 	uint8_t header[8];
 #define	ATA_SUP_CAP_HEADER_VALID	0x8000000000000000
 #define	ATA_SUP_CAP_PAGE_NUM_MASK	0x0000000000ff0000
 #define	ATA_SUP_CAP_PAGE_NUM_SHIFT	16
 #define ATA_SUP_CAP_REV_MASK		0x00000000000000ff
 	uint8_t sup_cap[8];
 #define	ATA_SUP_CAP_VALID		0x8000000000000000
 #define	ATA_SC_SET_SECT_CONFIG_SUP	0x0002000000000000 /* Set Sect Conf*/
 #define	ATA_SC_ZERO_EXT_SUP		0x0001000000000000 /* Zero EXT */
 #define	ATA_SC_SUCC_NCQ_SENSE_SUP	0x0000800000000000 /* Succ. NCQ Sns */
 #define	ATA_SC_DLC_SUP			0x0000400000000000 /* DLC */
 #define	ATA_SC_RQSN_DEV_FAULT_SUP	0x0000200000000000 /* Req Sns Dev Flt*/
 #define	ATA_SC_DSN_SUP			0x0000100000000000 /* DSN */
 #define	ATA_SC_LP_STANDBY_SUP		0x0000080000000000 /* LP Standby */
 #define	ATA_SC_SET_EPC_PS_SUP		0x0000040000000000 /* Set EPC PS */
 #define	ATA_SC_AMAX_ADDR_SUP		0x0000020000000000 /* AMAX Addr */
 #define	ATA_SC_DRAT_SUP			0x0000008000000000 /* DRAT */
 #define	ATA_SC_LPS_MISALGN_SUP		0x0000004000000000 /* LPS Misalign */
 #define	ATA_SC_RB_DMA_SUP		0x0000001000000000 /* Read Buf DMA */
 #define	ATA_SC_WB_DMA_SUP		0x0000000800000000 /* Write Buf DMA */
 #define	ATA_SC_DNLD_MC_DMA_SUP		0x0000000200000000 /* DL MCode DMA */
 #define	ATA_SC_28BIT_SUP		0x0000000100000000 /* 28-bit */
 #define	ATA_SC_RZAT_SUP			0x0000000080000000 /* RZAT */
 #define	ATA_SC_NOP_SUP			0x0000000020000000 /* NOP */
 #define	ATA_SC_READ_BUFFER_SUP		0x0000000010000000 /* Read Buffer */
 #define	ATA_SC_WRITE_BUFFER_SUP		0x0000000008000000 /* Write Buffer */
 #define	ATA_SC_READ_LOOK_AHEAD_SUP	0x0000000002000000 /* Read Look-Ahead*/
 #define	ATA_SC_VOLATILE_WC_SUP		0x0000000001000000 /* Volatile WC */
 #define	ATA_SC_SMART_SUP		0x0000000000800000 /* SMART */
 #define	ATA_SC_FLUSH_CACHE_EXT_SUP	0x0000000000400000 /* Flush Cache Ext */
 #define	ATA_SC_48BIT_SUP		0x0000000000100000 /* 48-Bit */
 #define	ATA_SC_SPINUP_SUP		0x0000000000040000 /* Spin-Up */
 #define	ATA_SC_PUIS_SUP			0x0000000000020000 /* PUIS */
 #define	ATA_SC_APM_SUP			0x0000000000010000 /* APM */
 #define	ATA_SC_DL_MICROCODE_SUP		0x0000000000004000 /* DL Microcode */
 #define	ATA_SC_UNLOAD_SUP		0x0000000000002000 /* Unload */
 #define	ATA_SC_WRITE_FUA_EXT_SUP	0x0000000000001000 /* Write FUA EXT */
 #define	ATA_SC_GPL_SUP			0x0000000000000800 /* GPL */
 #define	ATA_SC_STREAMING_SUP		0x0000000000000400 /* Streaming */
 #define	ATA_SC_SMART_SELFTEST_SUP	0x0000000000000100 /* SMART self-test */
 #define	ATA_SC_SMART_ERR_LOG_SUP	0x0000000000000080 /* SMART Err Log */
 #define	ATA_SC_EPC_SUP			0x0000000000000040 /* EPC */
 #define	ATA_SC_SENSE_SUP		0x0000000000000020 /* Sense data */
 #define	ATA_SC_FREEFALL_SUP		0x0000000000000010 /* Free-Fall */
 #define	ATA_SC_DM_MODE3_SUP		0x0000000000000008 /* DM Mode 3 */
 #define	ATA_SC_GPL_DMA_SUP		0x0000000000000004 /* GPL DMA */
 #define ATA_SC_WRITE_UNCOR_SUP		0x0000000000000002 /* Write uncorr.  */
 #define ATA_SC_WRV_SUP			0x0000000000000001 /* WRV */
 	uint8_t download_code_cap[8];
 #define ATA_DL_CODE_VALID		0x8000000000000000
 #define	ATA_DLC_DM_OFFSETS_DEFER_SUP	0x0000000400000000
 #define	ATA_DLC_DM_IMMED_SUP		0x0000000200000000
 #define	ATA_DLC_DM_OFF_IMMED_SUP	0x0000000100000000
 #define	ATA_DLC_DM_MAX_XFER_SIZE_MASK	0x00000000ffff0000
 #define	ATA_DLC_DM_MAX_XFER_SIZE_SHIFT	16
 #define	ATA_DLC_DM_MIN_XFER_SIZE_MASK	0x000000000000ffff
 	uint8_t nom_media_rotation_rate[8];
 #define	ATA_NOM_MEDIA_ROTATION_VALID	0x8000000000000000
 #define	ATA_ROTATION_MASK		0x000000000000ffff
 	uint8_t form_factor[8];
 #define	ATA_FORM_FACTOR_VALID		0x8000000000000000
 #define	ATA_FF_MASK			0x000000000000000f
 #define	ATA_FF_NOT_REPORTED		0x0000000000000000 /* Not reported */
 #define	ATA_FF_525_IN			0x0000000000000001 /* 5.25 inch */
 #define	ATA_FF_35_IN			0x0000000000000002 /* 3.5 inch */
 #define	ATA_FF_25_IN			0x0000000000000003 /* 2.5 inch */
 #define	ATA_FF_18_IN			0x0000000000000004 /* 1.8 inch */
 #define	ATA_FF_LT_18_IN			0x0000000000000005 /* < 1.8 inch */
 #define	ATA_FF_MSATA			0x0000000000000006 /* mSATA */
 #define	ATA_FF_M2			0x0000000000000007 /* M.2 */
 #define	ATA_FF_MICROSSD			0x0000000000000008 /* MicroSSD */
 #define	ATA_FF_CFAST			0x0000000000000009 /* CFast */
 	uint8_t wrv_sec_cnt_mode3[8];
 #define ATA_WRV_MODE3_VALID		0x8000000000000000
 #define ATA_WRV_MODE3_COUNT		0x00000000ffffffff
 	uint8_t wrv_sec_cnt_mode2[8];
 #define	ATA_WRV_MODE2_VALID		0x8000000000000000
 #define ATA_WRV_MODE2_COUNT		0x00000000ffffffff
 	uint8_t wwn[16];
 	/* XXX KDM need to figure out how to handle 128-bit fields */
 	uint8_t dsm[8];
 #define	ATA_DSM_VALID			0x8000000000000000
 #define	ATA_LB_MARKUP_SUP		0x000000000000ff00
 #define	ATA_TRIM_SUP			0x0000000000000001
 	uint8_t util_per_unit_time[16];
 	/* XXX KDM need to figure out how to handle 128-bit fields */
 	uint8_t util_usage_rate_sup[8];
 #define	ATA_UTIL_USAGE_RATE_VALID	0x8000000000000000
 #define	ATA_SETTING_RATE_SUP		0x0000000000800000
 #define	ATA_SINCE_POWERON_SUP		0x0000000000000100
 #define	ATA_POH_RATE_SUP		0x0000000000000010
 #define	ATA_DATE_TIME_RATE_SUP		0x0000000000000001
 	uint8_t zoned_cap[8];
 #define	ATA_ZONED_VALID			0x8000000000000000
 #define	ATA_ZONED_MASK			0x0000000000000003
 	uint8_t sup_zac_cap[8];
 #define	ATA_SUP_ZAC_CAP_VALID		0x8000000000000000
 #define	ATA_ND_RWP_SUP			0x0000000000000010 /* Reset Write Ptr*/
 #define	ATA_ND_FINISH_ZONE_SUP		0x0000000000000008 /* Finish Zone */
 #define	ATA_ND_CLOSE_ZONE_SUP		0x0000000000000004 /* Close Zone */
 #define	ATA_ND_OPEN_ZONE_SUP		0x0000000000000002 /* Open Zone */
 #define	ATA_REPORT_ZONES_SUP		0x0000000000000001 /* Report Zones */
 	uint8_t reserved[392];
 };
 
 /*
  * ATA Identify Device Data Log Zoned Device Information Page (0x09).
  * Current as of ZAC r04a, August 25, 2015.
  */
 struct ata_zoned_info_log {
 	uint8_t header[8];
 #define	ATA_ZDI_HEADER_VALID	0x8000000000000000
 #define	ATA_ZDI_PAGE_NUM_MASK	0x0000000000ff0000
 #define	ATA_ZDI_PAGE_NUM_SHIFT	16
 #define ATA_ZDI_REV_MASK	0x00000000000000ff
 	uint8_t zoned_cap[8];
 #define	ATA_ZDI_CAP_VALID	0x8000000000000000
 #define	ATA_ZDI_CAP_URSWRZ	0x0000000000000001
 	uint8_t zoned_settings[8];
 #define	ATA_ZDI_SETTINGS_VALID	0x8000000000000000
 	uint8_t optimal_seq_zones[8];
 #define	ATA_ZDI_OPT_SEQ_VALID	0x8000000000000000
 #define	ATA_ZDI_OPT_SEQ_MASK	0x00000000ffffffff
 	uint8_t optimal_nonseq_zones[8];
 #define	ATA_ZDI_OPT_NS_VALID	0x8000000000000000
 #define	ATA_ZDI_OPT_NS_MASK	0x00000000ffffffff
 	uint8_t max_seq_req_zones[8];
 #define	ATA_ZDI_MAX_SEQ_VALID	0x8000000000000000
 #define	ATA_ZDI_MAX_SEQ_MASK	0x00000000ffffffff
 	uint8_t version_info[8];
 #define	ATA_ZDI_VER_VALID	0x8000000000000000
 #define	ATA_ZDI_VER_ZAC_SUP	0x0100000000000000
 #define	ATA_ZDI_VER_ZAC_MASK	0x00000000000000ff
 	uint8_t reserved[456];
 };
 
 struct ata_ioc_request {
     union {
 	struct {
 	    u_int8_t            command;
 	    u_int8_t            feature;
 	    u_int64_t           lba;
 	    u_int16_t           count;
 	} ata;
 	struct {
 	    char                ccb[16];
 	    struct atapi_sense	sense;
 	} atapi;
     } u;
     caddr_t             data;
     int                 count;
     int                 flags;
 #define ATA_CMD_CONTROL                 0x01
 #define ATA_CMD_READ                    0x02
 #define ATA_CMD_WRITE                   0x04
 #define ATA_CMD_ATAPI                   0x08
 
     int                 timeout;
     int                 error;
 };
 
 struct ata_security_password {
 	u_int16_t		ctrl;
 #define ATA_SECURITY_PASSWORD_USER	0x0000
 #define ATA_SECURITY_PASSWORD_MASTER	0x0001
 #define ATA_SECURITY_ERASE_NORMAL	0x0000
 #define ATA_SECURITY_ERASE_ENHANCED	0x0002
 #define ATA_SECURITY_LEVEL_HIGH		0x0000
 #define ATA_SECURITY_LEVEL_MAXIMUM	0x0100
 
 	u_int8_t		password[32];
 	u_int16_t		revision;
 	u_int16_t		reserved[238];
 };
 
 /* pr device ATA ioctl calls */
 #define IOCATAREQUEST           _IOWR('a', 100, struct ata_ioc_request)
 #define IOCATAGPARM             _IOR('a', 101, struct ata_params)
 #define IOCATAGMODE             _IOR('a', 102, int)
 #define IOCATASMODE             _IOW('a', 103, int)
 
 #define IOCATAGSPINDOWN		_IOR('a', 104, int)
 #define IOCATASSPINDOWN		_IOW('a', 105, int)
 
 
 struct ata_ioc_raid_config {
 	    int                 lun;
 	    int                 type;
 #define AR_JBOD                         0x0001
 #define AR_SPAN                         0x0002
 #define AR_RAID0                        0x0004
 #define AR_RAID1                        0x0008
 #define AR_RAID01                       0x0010
 #define AR_RAID3                        0x0020
 #define AR_RAID4                        0x0040
 #define AR_RAID5                        0x0080
 
 	    int                 interleave;
 	    int                 status;
 #define AR_READY                        1
 #define AR_DEGRADED                     2
 #define AR_REBUILDING                   4
 
 	    int                 progress;
 	    int                 total_disks;
 	    int                 disks[16];
 };
 
 struct ata_ioc_raid_status {
 	    int                 lun;
 	    int                 type;
 	    int                 interleave;
 	    int                 status;
 	    int                 progress;
 	    int                 total_disks;
 	    struct {
 		    int		state;
 #define AR_DISK_ONLINE			0x01
 #define AR_DISK_PRESENT			0x02
 #define AR_DISK_SPARE			0x04
 		    int		lun;
 	    } disks[16];
 };
 
 /* ATA RAID ioctl calls */
 #define IOCATARAIDCREATE        _IOWR('a', 200, struct ata_ioc_raid_config)
 #define IOCATARAIDDELETE        _IOW('a', 201, int)
 #define IOCATARAIDSTATUS        _IOWR('a', 202, struct ata_ioc_raid_status)
 #define IOCATARAIDADDSPARE      _IOW('a', 203, struct ata_ioc_raid_config)
 #define IOCATARAIDREBUILD       _IOW('a', 204, int)
 
 #endif /* _SYS_ATA_H_ */
Index: projects/netbsd-tests-upstream-01-2017/sys/vm/vm_object.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/sys/vm/vm_object.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/sys/vm/vm_object.c	(revision 312218)
@@ -1,2604 +1,2615 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Virtual memory object module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>		/* for curproc, pageproc */
 #include <sys/socket.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/sx.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
 
 static int old_msync;
 SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0,
     "Use old (insecure) msync behavior");
 
 static int	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
 		    int pagerflags, int flags, boolean_t *clearobjflags,
 		    boolean_t *eio);
 static boolean_t vm_object_page_remove_write(vm_page_t p, int flags,
 		    boolean_t *clearobjflags);
 static void	vm_object_qcollapse(vm_object_t object);
 static void	vm_object_vndeallocate(vm_object_t object);
 
 /*
  *	Virtual memory objects maintain the actual data
  *	associated with allocated virtual memory.  A given
  *	page of memory exists within exactly one object.
  *
  *	An object is only deallocated when all "references"
  *	are given up.  Only one "reference" to a given
  *	region of an object should be writeable.
  *
  *	Associated with each object is a list of all resident
  *	memory pages belonging to that object; this list is
  *	maintained by the "vm_page" module, and locked by the object's
  *	lock.
  *
  *	Each object also records a "pager" routine which is
  *	used to retrieve (and store) pages to the proper backing
  *	storage.  In addition, objects may be backed by other
  *	objects from which they were virtual-copied.
  *
  *	The only items within the object structure which are
  *	modified after time of creation are:
  *		reference count		locked by object's lock
  *		pager routine		locked by object's lock
  *
  */
 
 struct object_q vm_object_list;
 struct mtx vm_object_list_mtx;	/* lock for object list and count */
 
 struct vm_object kernel_object_store;
 struct vm_object kmem_object_store;
 
 static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0,
     "VM object stats");
 
 static long object_collapses;
 SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD,
     &object_collapses, 0, "VM object collapses");
 
 static long object_bypasses;
 SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD,
     &object_bypasses, 0, "VM object bypasses");
 
 static uma_zone_t obj_zone;
 
 static int vm_object_zinit(void *mem, int size, int flags);
 
 #ifdef INVARIANTS
 static void vm_object_zdtor(void *mem, int size, void *arg);
 
 static void
 vm_object_zdtor(void *mem, int size, void *arg)
 {
 	vm_object_t object;
 
 	object = (vm_object_t)mem;
 	KASSERT(object->ref_count == 0,
 	    ("object %p ref_count = %d", object, object->ref_count));
 	KASSERT(TAILQ_EMPTY(&object->memq),
 	    ("object %p has resident pages in its memq", object));
 	KASSERT(vm_radix_is_empty(&object->rtree),
 	    ("object %p has resident pages in its trie", object));
 #if VM_NRESERVLEVEL > 0
 	KASSERT(LIST_EMPTY(&object->rvq),
 	    ("object %p has reservations",
 	    object));
 #endif
 	KASSERT(object->paging_in_progress == 0,
 	    ("object %p paging_in_progress = %d",
 	    object, object->paging_in_progress));
 	KASSERT(object->resident_page_count == 0,
 	    ("object %p resident_page_count = %d",
 	    object, object->resident_page_count));
 	KASSERT(object->shadow_count == 0,
 	    ("object %p shadow_count = %d",
 	    object, object->shadow_count));
 	KASSERT(object->type == OBJT_DEAD,
 	    ("object %p has non-dead type %d",
 	    object, object->type));
 }
 #endif
 
 static int
 vm_object_zinit(void *mem, int size, int flags)
 {
 	vm_object_t object;
 
 	object = (vm_object_t)mem;
 	rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW);
 
 	/* These are true for any object that has been freed */
 	object->type = OBJT_DEAD;
 	object->ref_count = 0;
 	object->rtree.rt_root = 0;
 	object->paging_in_progress = 0;
 	object->resident_page_count = 0;
 	object->shadow_count = 0;
 
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
 	mtx_unlock(&vm_object_list_mtx);
 	return (0);
 }
 
 static void
 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
 {
 
 	TAILQ_INIT(&object->memq);
 	LIST_INIT(&object->shadow_head);
 
 	object->type = type;
 	switch (type) {
 	case OBJT_DEAD:
 		panic("_vm_object_allocate: can't create OBJT_DEAD");
 	case OBJT_DEFAULT:
 	case OBJT_SWAP:
 		object->flags = OBJ_ONEMAPPING;
 		break;
 	case OBJT_DEVICE:
 	case OBJT_SG:
 		object->flags = OBJ_FICTITIOUS | OBJ_UNMANAGED;
 		break;
 	case OBJT_MGTDEVICE:
 		object->flags = OBJ_FICTITIOUS;
 		break;
 	case OBJT_PHYS:
 		object->flags = OBJ_UNMANAGED;
 		break;
 	case OBJT_VNODE:
 		object->flags = 0;
 		break;
 	default:
 		panic("_vm_object_allocate: type %d is undefined", type);
 	}
 	object->size = size;
 	object->generation = 1;
 	object->ref_count = 1;
 	object->memattr = VM_MEMATTR_DEFAULT;
 	object->cred = NULL;
 	object->charge = 0;
 	object->handle = NULL;
 	object->backing_object = NULL;
 	object->backing_object_offset = (vm_ooffset_t) 0;
 #if VM_NRESERVLEVEL > 0
 	LIST_INIT(&object->rvq);
 #endif
 	umtx_shm_object_init(object);
 }
 
 /*
  *	vm_object_init:
  *
  *	Initialize the VM objects module.
  */
 void
 vm_object_init(void)
 {
 	TAILQ_INIT(&vm_object_list);
 	mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
 	
 	rw_init(&kernel_object->lock, "kernel vm object");
 	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
 	    kernel_object);
 #if VM_NRESERVLEVEL > 0
 	kernel_object->flags |= OBJ_COLORED;
 	kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
 #endif
 
 	rw_init(&kmem_object->lock, "kmem vm object");
 	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
 	    kmem_object);
 #if VM_NRESERVLEVEL > 0
 	kmem_object->flags |= OBJ_COLORED;
 	kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
 #endif
 
 	/*
 	 * The lock portion of struct vm_object must be type stable due
 	 * to vm_pageout_fallback_object_lock locking a vm object
 	 * without holding any references to it.
 	 */
 	obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL,
 #ifdef INVARIANTS
 	    vm_object_zdtor,
 #else
 	    NULL,
 #endif
 	    vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 
 	vm_radix_init();
 }
 
 void
 vm_object_clear_flag(vm_object_t object, u_short bits)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->flags &= ~bits;
 }
 
 /*
  *	Sets the default memory attribute for the specified object.  Pages
  *	that are allocated to this object are by default assigned this memory
  *	attribute.
  *
  *	Presently, this function must be called before any pages are allocated
  *	to the object.  In the future, this requirement may be relaxed for
  *	"default" and "swap" objects.
  */
 int
 vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	switch (object->type) {
 	case OBJT_DEFAULT:
 	case OBJT_DEVICE:
 	case OBJT_MGTDEVICE:
 	case OBJT_PHYS:
 	case OBJT_SG:
 	case OBJT_SWAP:
 	case OBJT_VNODE:
 		if (!TAILQ_EMPTY(&object->memq))
 			return (KERN_FAILURE);
 		break;
 	case OBJT_DEAD:
 		return (KERN_INVALID_ARGUMENT);
 	default:
 		panic("vm_object_set_memattr: object %p is of undefined type",
 		    object);
 	}
 	object->memattr = memattr;
 	return (KERN_SUCCESS);
 }
 
 void
 vm_object_pip_add(vm_object_t object, short i)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->paging_in_progress += i;
 }
 
 void
 vm_object_pip_subtract(vm_object_t object, short i)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->paging_in_progress -= i;
 }
 
 void
 vm_object_pip_wakeup(vm_object_t object)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->paging_in_progress--;
 	if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
 		vm_object_clear_flag(object, OBJ_PIPWNT);
 		wakeup(object);
 	}
 }
 
 void
 vm_object_pip_wakeupn(vm_object_t object, short i)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (i)
 		object->paging_in_progress -= i;
 	if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
 		vm_object_clear_flag(object, OBJ_PIPWNT);
 		wakeup(object);
 	}
 }
 
 void
 vm_object_pip_wait(vm_object_t object, char *waitid)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	while (object->paging_in_progress) {
 		object->flags |= OBJ_PIPWNT;
 		VM_OBJECT_SLEEP(object, object, PVM, waitid, 0);
 	}
 }
 
 /*
  *	vm_object_allocate:
  *
  *	Returns a new object with the given size.
  */
 vm_object_t
 vm_object_allocate(objtype_t type, vm_pindex_t size)
 {
 	vm_object_t object;
 
 	object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK);
 	_vm_object_allocate(type, size, object);
 	return (object);
 }
 
 
 /*
  *	vm_object_reference:
  *
  *	Gets another reference to the given object.  Note: OBJ_DEAD
  *	objects can be referenced during final cleaning.
  */
 void
 vm_object_reference(vm_object_t object)
 {
 	if (object == NULL)
 		return;
 	VM_OBJECT_WLOCK(object);
 	vm_object_reference_locked(object);
 	VM_OBJECT_WUNLOCK(object);
 }
 
 /*
  *	vm_object_reference_locked:
  *
  *	Gets another reference to the given object.
  *
  *	The object must be locked.
  */
 void
 vm_object_reference_locked(vm_object_t object)
 {
 	struct vnode *vp;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->ref_count++;
 	if (object->type == OBJT_VNODE) {
 		vp = object->handle;
 		vref(vp);
 	}
 }
 
 /*
  * Handle deallocating an object of type OBJT_VNODE.
  */
 static void
 vm_object_vndeallocate(vm_object_t object)
 {
 	struct vnode *vp = (struct vnode *) object->handle;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object->type == OBJT_VNODE,
 	    ("vm_object_vndeallocate: not a vnode object"));
 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
 #ifdef INVARIANTS
 	if (object->ref_count == 0) {
 		vn_printf(vp, "vm_object_vndeallocate ");
 		panic("vm_object_vndeallocate: bad object reference count");
 	}
 #endif
 
 	if (!umtx_shm_vnobj_persistent && object->ref_count == 1)
 		umtx_shm_object_terminated(object);
 
 	/*
 	 * The test for text of vp vnode does not need a bypass to
 	 * reach right VV_TEXT there, since it is obtained from
 	 * object->handle.
 	 */
 	if (object->ref_count > 1 || (vp->v_vflag & VV_TEXT) == 0) {
 		object->ref_count--;
 		VM_OBJECT_WUNLOCK(object);
 		/* vrele may need the vnode lock. */
 		vrele(vp);
 	} else {
 		vhold(vp);
 		VM_OBJECT_WUNLOCK(object);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		vdrop(vp);
 		VM_OBJECT_WLOCK(object);
 		object->ref_count--;
 		if (object->type == OBJT_DEAD) {
 			VM_OBJECT_WUNLOCK(object);
 			VOP_UNLOCK(vp, 0);
 		} else {
 			if (object->ref_count == 0)
 				VOP_UNSET_TEXT(vp);
 			VM_OBJECT_WUNLOCK(object);
 			vput(vp);
 		}
 	}
 }
 
 /*
  *	vm_object_deallocate:
  *
  *	Release a reference to the specified object,
  *	gained either through a vm_object_allocate
  *	or a vm_object_reference call.  When all references
  *	are gone, storage associated with this object
  *	may be relinquished.
  *
  *	No object may be locked.
  */
 void
 vm_object_deallocate(vm_object_t object)
 {
 	vm_object_t temp;
 	struct vnode *vp;
 
 	while (object != NULL) {
 		VM_OBJECT_WLOCK(object);
 		if (object->type == OBJT_VNODE) {
 			vm_object_vndeallocate(object);
 			return;
 		}
 
 		KASSERT(object->ref_count != 0,
 			("vm_object_deallocate: object deallocated too many times: %d", object->type));
 
 		/*
 		 * If the reference count goes to 0 we start calling
 		 * vm_object_terminate() on the object chain.
 		 * A ref count of 1 may be a special case depending on the
 		 * shadow count being 0 or 1.
 		 */
 		object->ref_count--;
 		if (object->ref_count > 1) {
 			VM_OBJECT_WUNLOCK(object);
 			return;
 		} else if (object->ref_count == 1) {
 			if (object->type == OBJT_SWAP &&
 			    (object->flags & OBJ_TMPFS) != 0) {
 				vp = object->un_pager.swp.swp_tmpfs;
 				vhold(vp);
 				VM_OBJECT_WUNLOCK(object);
 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 				VM_OBJECT_WLOCK(object);
 				if (object->type == OBJT_DEAD ||
 				    object->ref_count != 1) {
 					VM_OBJECT_WUNLOCK(object);
 					VOP_UNLOCK(vp, 0);
 					vdrop(vp);
 					return;
 				}
 				if ((object->flags & OBJ_TMPFS) != 0)
 					VOP_UNSET_TEXT(vp);
 				VOP_UNLOCK(vp, 0);
 				vdrop(vp);
 			}
 			if (object->shadow_count == 0 &&
 			    object->handle == NULL &&
 			    (object->type == OBJT_DEFAULT ||
 			    (object->type == OBJT_SWAP &&
 			    (object->flags & OBJ_TMPFS_NODE) == 0))) {
 				vm_object_set_flag(object, OBJ_ONEMAPPING);
 			} else if ((object->shadow_count == 1) &&
 			    (object->handle == NULL) &&
 			    (object->type == OBJT_DEFAULT ||
 			     object->type == OBJT_SWAP)) {
 				vm_object_t robject;
 
 				robject = LIST_FIRST(&object->shadow_head);
 				KASSERT(robject != NULL,
 				    ("vm_object_deallocate: ref_count: %d, shadow_count: %d",
 					 object->ref_count,
 					 object->shadow_count));
 				KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0,
 				    ("shadowed tmpfs v_object %p", object));
 				if (!VM_OBJECT_TRYWLOCK(robject)) {
 					/*
 					 * Avoid a potential deadlock.
 					 */
 					object->ref_count++;
 					VM_OBJECT_WUNLOCK(object);
 					/*
 					 * More likely than not the thread
 					 * holding robject's lock has lower
 					 * priority than the current thread.
 					 * Let the lower priority thread run.
 					 */
 					pause("vmo_de", 1);
 					continue;
 				}
 				/*
 				 * Collapse object into its shadow unless its
 				 * shadow is dead.  In that case, object will
 				 * be deallocated by the thread that is
 				 * deallocating its shadow.
 				 */
 				if ((robject->flags & OBJ_DEAD) == 0 &&
 				    (robject->handle == NULL) &&
 				    (robject->type == OBJT_DEFAULT ||
 				     robject->type == OBJT_SWAP)) {
 
 					robject->ref_count++;
 retry:
 					if (robject->paging_in_progress) {
 						VM_OBJECT_WUNLOCK(object);
 						vm_object_pip_wait(robject,
 						    "objde1");
 						temp = robject->backing_object;
 						if (object == temp) {
 							VM_OBJECT_WLOCK(object);
 							goto retry;
 						}
 					} else if (object->paging_in_progress) {
 						VM_OBJECT_WUNLOCK(robject);
 						object->flags |= OBJ_PIPWNT;
 						VM_OBJECT_SLEEP(object, object,
 						    PDROP | PVM, "objde2", 0);
 						VM_OBJECT_WLOCK(robject);
 						temp = robject->backing_object;
 						if (object == temp) {
 							VM_OBJECT_WLOCK(object);
 							goto retry;
 						}
 					} else
 						VM_OBJECT_WUNLOCK(object);
 
 					if (robject->ref_count == 1) {
 						robject->ref_count--;
 						object = robject;
 						goto doterm;
 					}
 					object = robject;
 					vm_object_collapse(object);
 					VM_OBJECT_WUNLOCK(object);
 					continue;
 				}
 				VM_OBJECT_WUNLOCK(robject);
 			}
 			VM_OBJECT_WUNLOCK(object);
 			return;
 		}
 doterm:
 		umtx_shm_object_terminated(object);
 		temp = object->backing_object;
 		if (temp != NULL) {
 			KASSERT((object->flags & OBJ_TMPFS_NODE) == 0,
 			    ("shadowed tmpfs v_object 2 %p", object));
 			VM_OBJECT_WLOCK(temp);
 			LIST_REMOVE(object, shadow_list);
 			temp->shadow_count--;
 			VM_OBJECT_WUNLOCK(temp);
 			object->backing_object = NULL;
 		}
 		/*
 		 * Don't double-terminate, we could be in a termination
 		 * recursion due to the terminate having to sync data
 		 * to disk.
 		 */
 		if ((object->flags & OBJ_DEAD) == 0)
 			vm_object_terminate(object);
 		else
 			VM_OBJECT_WUNLOCK(object);
 		object = temp;
 	}
 }
 
 /*
  *	vm_object_destroy removes the object from the global object list
  *      and frees the space for the object.
  */
 void
 vm_object_destroy(vm_object_t object)
 {
 
 	/*
 	 * Release the allocation charge.
 	 */
 	if (object->cred != NULL) {
 		swap_release_by_cred(object->charge, object->cred);
 		object->charge = 0;
 		crfree(object->cred);
 		object->cred = NULL;
 	}
 
 	/*
 	 * Free the space for the object.
 	 */
 	uma_zfree(obj_zone, object);
 }
 
 /*
  *	vm_object_terminate actually destroys the specified object, freeing
  *	up all previously used resources.
  *
  *	The object must be locked.
  *	This routine may block.
  */
 void
 vm_object_terminate(vm_object_t object)
 {
 	vm_page_t p, p_next;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * Make sure no one uses us.
 	 */
 	vm_object_set_flag(object, OBJ_DEAD);
 
 	/*
 	 * wait for the pageout daemon to be done with the object
 	 */
 	vm_object_pip_wait(object, "objtrm");
 
 	KASSERT(!object->paging_in_progress,
 		("vm_object_terminate: pageout in progress"));
 
 	/*
 	 * Clean and free the pages, as appropriate. All references to the
 	 * object are gone, so we don't need to lock it.
 	 */
 	if (object->type == OBJT_VNODE) {
 		struct vnode *vp = (struct vnode *)object->handle;
 
 		/*
 		 * Clean pages and flush buffers.
 		 */
 		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
 		VM_OBJECT_WUNLOCK(object);
 
 		vinvalbuf(vp, V_SAVE, 0, 0);
 
 		BO_LOCK(&vp->v_bufobj);
 		vp->v_bufobj.bo_flag |= BO_DEAD;
 		BO_UNLOCK(&vp->v_bufobj);
 
 		VM_OBJECT_WLOCK(object);
 	}
 
 	KASSERT(object->ref_count == 0, 
 		("vm_object_terminate: object with references, ref_count=%d",
 		object->ref_count));
 
 	/*
 	 * Free any remaining pageable pages.  This also removes them from the
 	 * paging queues.  However, don't free wired pages, just remove them
 	 * from the object.  Rather than incrementally removing each page from
 	 * the object, the page and object are reset to any empty state. 
 	 */
 	TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
 		vm_page_assert_unbusied(p);
 		vm_page_lock(p);
 		/*
 		 * Optimize the page's removal from the object by resetting
 		 * its "object" field.  Specifically, if the page is not
 		 * wired, then the effect of this assignment is that
 		 * vm_page_free()'s call to vm_page_remove() will return
 		 * immediately without modifying the page or the object.
 		 */ 
 		p->object = NULL;
 		if (p->wire_count == 0) {
 			vm_page_free(p);
 			PCPU_INC(cnt.v_pfree);
 		}
 		vm_page_unlock(p);
 	}
 	/*
 	 * If the object contained any pages, then reset it to an empty state.
 	 * None of the object's fields, including "resident_page_count", were
 	 * modified by the preceding loop.
 	 */
 	if (object->resident_page_count != 0) {
 		vm_radix_reclaim_allnodes(&object->rtree);
 		TAILQ_INIT(&object->memq);
 		object->resident_page_count = 0;
 		if (object->type == OBJT_VNODE)
 			vdrop(object->handle);
 	}
 
 #if VM_NRESERVLEVEL > 0
 	if (__predict_false(!LIST_EMPTY(&object->rvq)))
 		vm_reserv_break_all(object);
 #endif
 
 	KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT ||
 	    object->type == OBJT_SWAP,
 	    ("%s: non-swap obj %p has cred", __func__, object));
 
 	/*
 	 * Let the pager know object is dead.
 	 */
 	vm_pager_deallocate(object);
 	VM_OBJECT_WUNLOCK(object);
 
 	vm_object_destroy(object);
 }
 
 /*
  * Make the page read-only so that we can clear the object flags.  However, if
  * this is a nosync mmap then the object is likely to stay dirty so do not
  * mess with the page and do not clear the object flags.  Returns TRUE if the
  * page should be flushed, and FALSE otherwise.
  */
 static boolean_t
 vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags)
 {
 
 	/*
 	 * If we have been asked to skip nosync pages and this is a
 	 * nosync page, skip it.  Note that the object flags were not
 	 * cleared in this case so we do not have to set them.
 	 */
 	if ((flags & OBJPC_NOSYNC) != 0 && (p->oflags & VPO_NOSYNC) != 0) {
 		*clearobjflags = FALSE;
 		return (FALSE);
 	} else {
 		pmap_remove_write(p);
 		return (p->dirty != 0);
 	}
 }
 
 /*
  *	vm_object_page_clean
  *
  *	Clean all dirty pages in the specified range of object.  Leaves page 
  * 	on whatever queue it is currently on.   If NOSYNC is set then do not
  *	write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC),
  *	leaving the object dirty.
  *
  *	When stuffing pages asynchronously, allow clustering.  XXX we need a
  *	synchronous clustering mode implementation.
  *
  *	Odd semantics: if start == end, we clean everything.
  *
  *	The object must be locked.
  *
  *	Returns FALSE if some page from the range was not written, as
  *	reported by the pager, and TRUE otherwise.
  */
 boolean_t
 vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end,
     int flags)
 {
 	vm_page_t np, p;
 	vm_pindex_t pi, tend, tstart;
 	int curgeneration, n, pagerflags;
 	boolean_t clearobjflags, eio, res;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * The OBJ_MIGHTBEDIRTY flag is only set for OBJT_VNODE
 	 * objects.  The check below prevents the function from
 	 * operating on non-vnode objects.
 	 */
 	if ((object->flags & OBJ_MIGHTBEDIRTY) == 0 ||
 	    object->resident_page_count == 0)
 		return (TRUE);
 
 	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ?
 	    VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
 	pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0;
 
 	tstart = OFF_TO_IDX(start);
 	tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK);
 	clearobjflags = tstart == 0 && tend >= object->size;
 	res = TRUE;
 
 rescan:
 	curgeneration = object->generation;
 
 	for (p = vm_page_find_least(object, tstart); p != NULL; p = np) {
 		pi = p->pindex;
 		if (pi >= tend)
 			break;
 		np = TAILQ_NEXT(p, listq);
 		if (p->valid == 0)
 			continue;
 		if (vm_page_sleep_if_busy(p, "vpcwai")) {
 			if (object->generation != curgeneration) {
 				if ((flags & OBJPC_SYNC) != 0)
 					goto rescan;
 				else
 					clearobjflags = FALSE;
 			}
 			np = vm_page_find_least(object, pi);
 			continue;
 		}
 		if (!vm_object_page_remove_write(p, flags, &clearobjflags))
 			continue;
 
 		n = vm_object_page_collect_flush(object, p, pagerflags,
 		    flags, &clearobjflags, &eio);
 		if (eio) {
 			res = FALSE;
 			clearobjflags = FALSE;
 		}
 		if (object->generation != curgeneration) {
 			if ((flags & OBJPC_SYNC) != 0)
 				goto rescan;
 			else
 				clearobjflags = FALSE;
 		}
 
 		/*
 		 * If the VOP_PUTPAGES() did a truncated write, so
 		 * that even the first page of the run is not fully
 		 * written, vm_pageout_flush() returns 0 as the run
 		 * length.  Since the condition that caused truncated
 		 * write may be permanent, e.g. exhausted free space,
 		 * accepting n == 0 would cause an infinite loop.
 		 *
 		 * Forwarding the iterator leaves the unwritten page
 		 * behind, but there is not much we can do there if
 		 * filesystem refuses to write it.
 		 */
 		if (n == 0) {
 			n = 1;
 			clearobjflags = FALSE;
 		}
 		np = vm_page_find_least(object, pi + n);
 	}
 #if 0
 	VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? MNT_WAIT : 0);
 #endif
 
 	if (clearobjflags)
 		vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY);
 	return (res);
 }
 
 static int
 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags,
     int flags, boolean_t *clearobjflags, boolean_t *eio)
 {
 	vm_page_t ma[vm_pageout_page_count], p_first, tp;
 	int count, i, mreq, runlen;
 
 	vm_page_lock_assert(p, MA_NOTOWNED);
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	count = 1;
 	mreq = 0;
 
 	for (tp = p; count < vm_pageout_page_count; count++) {
 		tp = vm_page_next(tp);
 		if (tp == NULL || vm_page_busied(tp))
 			break;
 		if (!vm_object_page_remove_write(tp, flags, clearobjflags))
 			break;
 	}
 
 	for (p_first = p; count < vm_pageout_page_count; count++) {
 		tp = vm_page_prev(p_first);
 		if (tp == NULL || vm_page_busied(tp))
 			break;
 		if (!vm_object_page_remove_write(tp, flags, clearobjflags))
 			break;
 		p_first = tp;
 		mreq++;
 	}
 
 	for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++)
 		ma[i] = tp;
 
 	vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio);
 	return (runlen);
 }
 
 /*
  * Note that there is absolutely no sense in writing out
  * anonymous objects, so we track down the vnode object
  * to write out.
  * We invalidate (remove) all pages from the address space
  * for semantic correctness.
  *
  * If the backing object is a device object with unmanaged pages, then any
  * mappings to the specified range of pages must be removed before this
  * function is called.
  *
  * Note: certain anonymous maps, such as MAP_NOSYNC maps,
  * may start out with a NULL object.
  */
 boolean_t
 vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size,
     boolean_t syncio, boolean_t invalidate)
 {
 	vm_object_t backing_object;
 	struct vnode *vp;
 	struct mount *mp;
 	int error, flags, fsync_after;
 	boolean_t res;
 
 	if (object == NULL)
 		return (TRUE);
 	res = TRUE;
 	error = 0;
 	VM_OBJECT_WLOCK(object);
 	while ((backing_object = object->backing_object) != NULL) {
 		VM_OBJECT_WLOCK(backing_object);
 		offset += object->backing_object_offset;
 		VM_OBJECT_WUNLOCK(object);
 		object = backing_object;
 		if (object->size < OFF_TO_IDX(offset + size))
 			size = IDX_TO_OFF(object->size) - offset;
 	}
 	/*
 	 * Flush pages if writing is allowed, invalidate them
 	 * if invalidation requested.  Pages undergoing I/O
 	 * will be ignored by vm_object_page_remove().
 	 *
 	 * We cannot lock the vnode and then wait for paging
 	 * to complete without deadlocking against vm_fault.
 	 * Instead we simply call vm_object_page_remove() and
 	 * allow it to block internally on a page-by-page
 	 * basis when it encounters pages undergoing async
 	 * I/O.
 	 */
 	if (object->type == OBJT_VNODE &&
 	    (object->flags & OBJ_MIGHTBEDIRTY) != 0) {
 		vp = object->handle;
 		VM_OBJECT_WUNLOCK(object);
 		(void) vn_start_write(vp, &mp, V_WAIT);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (syncio && !invalidate && offset == 0 &&
 		    OFF_TO_IDX(size) == object->size) {
 			/*
 			 * If syncing the whole mapping of the file,
 			 * it is faster to schedule all the writes in
 			 * async mode, also allowing the clustering,
 			 * and then wait for i/o to complete.
 			 */
 			flags = 0;
 			fsync_after = TRUE;
 		} else {
 			flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
 			flags |= invalidate ? (OBJPC_SYNC | OBJPC_INVAL) : 0;
 			fsync_after = FALSE;
 		}
 		VM_OBJECT_WLOCK(object);
 		res = vm_object_page_clean(object, offset, offset + size,
 		    flags);
 		VM_OBJECT_WUNLOCK(object);
 		if (fsync_after)
 			error = VOP_FSYNC(vp, MNT_WAIT, curthread);
 		VOP_UNLOCK(vp, 0);
 		vn_finished_write(mp);
 		if (error != 0)
 			res = FALSE;
 		VM_OBJECT_WLOCK(object);
 	}
 	if ((object->type == OBJT_VNODE ||
 	     object->type == OBJT_DEVICE) && invalidate) {
 		if (object->type == OBJT_DEVICE)
 			/*
 			 * The option OBJPR_NOTMAPPED must be passed here
 			 * because vm_object_page_remove() cannot remove
 			 * unmanaged mappings.
 			 */
 			flags = OBJPR_NOTMAPPED;
 		else if (old_msync)
 			flags = 0;
 		else
 			flags = OBJPR_CLEANONLY;
 		vm_object_page_remove(object, OFF_TO_IDX(offset),
 		    OFF_TO_IDX(offset + size + PAGE_MASK), flags);
 	}
 	VM_OBJECT_WUNLOCK(object);
 	return (res);
 }
 
 /*
  *	vm_object_madvise:
  *
  *	Implements the madvise function at the object/page level.
  *
  *	MADV_WILLNEED	(any object)
  *
  *	    Activate the specified pages if they are resident.
  *
  *	MADV_DONTNEED	(any object)
  *
  *	    Deactivate the specified pages if they are resident.
  *
  *	MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects,
  *			 OBJ_ONEMAPPING only)
  *
  *	    Deactivate and clean the specified pages if they are
  *	    resident.  This permits the process to reuse the pages
  *	    without faulting or the kernel to reclaim the pages
  *	    without I/O.
  */
 void
 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end,
-    int advise)
+    int advice)
 {
 	vm_pindex_t tpindex;
 	vm_object_t backing_object, tobject;
 	vm_page_t m;
 
 	if (object == NULL)
 		return;
+
 	VM_OBJECT_WLOCK(object);
-	/*
-	 * Locate and adjust resident pages
-	 */
-	for (; pindex < end; pindex += 1) {
+	for (m = NULL; pindex < end; pindex++) {
 relookup:
 		tobject = object;
 		tpindex = pindex;
 shadowlookup:
 		/*
 		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
 		 * and those pages must be OBJ_ONEMAPPING.
 		 */
-		if (advise == MADV_FREE) {
+		if (advice == MADV_FREE) {
 			if ((tobject->type != OBJT_DEFAULT &&
 			     tobject->type != OBJT_SWAP) ||
 			    (tobject->flags & OBJ_ONEMAPPING) == 0) {
 				goto unlock_tobject;
 			}
 		} else if ((tobject->flags & OBJ_UNMANAGED) != 0)
 			goto unlock_tobject;
-		m = vm_page_lookup(tobject, tpindex);
-		if (m == NULL) {
-			/*
-			 * There may be swap even if there is no backing page
-			 */
-			if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
+
+		/*
+		 * In the common case where the object has no backing object, we
+		 * can avoid performing lookups at each pindex.  In either case,
+		 * when applying MADV_FREE we take care to release any swap
+		 * space used to store non-resident pages.
+		 */
+		if (object->backing_object == NULL) {
+			m = (m != NULL) ? TAILQ_NEXT(m, listq) :
+			    vm_page_find_least(object, pindex);
+			tpindex = (m != NULL && m->pindex < end) ?
+			    m->pindex : end;
+			if (advice == MADV_FREE && object->type == OBJT_SWAP &&
+			    tpindex > pindex)
+				swap_pager_freespace(object, pindex,
+				    tpindex - pindex);
+			if ((pindex = tpindex) == end)
+				break;
+		} else if ((m = vm_page_lookup(tobject, tpindex)) == NULL) {
+			if (advice == MADV_FREE && tobject->type == OBJT_SWAP)
 				swap_pager_freespace(tobject, tpindex, 1);
 			/*
-			 * next object
+			 * Prepare to search the next object in the chain.
 			 */
 			backing_object = tobject->backing_object;
 			if (backing_object == NULL)
 				goto unlock_tobject;
 			VM_OBJECT_WLOCK(backing_object);
 			tpindex += OFF_TO_IDX(tobject->backing_object_offset);
 			if (tobject != object)
 				VM_OBJECT_WUNLOCK(tobject);
 			tobject = backing_object;
 			goto shadowlookup;
-		} else if (m->valid != VM_PAGE_BITS_ALL)
-			goto unlock_tobject;
+		}
+
 		/*
 		 * If the page is not in a normal state, skip it.
 		 */
+		if (m->valid != VM_PAGE_BITS_ALL)
+			goto unlock_tobject;
 		vm_page_lock(m);
 		if (m->hold_count != 0 || m->wire_count != 0) {
 			vm_page_unlock(m);
 			goto unlock_tobject;
 		}
 		KASSERT((m->flags & PG_FICTITIOUS) == 0,
 		    ("vm_object_madvise: page %p is fictitious", m));
 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 		    ("vm_object_madvise: page %p is not managed", m));
 		if (vm_page_busied(m)) {
-			if (advise == MADV_WILLNEED) {
+			if (advice == MADV_WILLNEED) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
 				 * likely to reclaim it. 
 				 */
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			}
 			if (object != tobject)
 				VM_OBJECT_WUNLOCK(object);
 			VM_OBJECT_WUNLOCK(tobject);
 			vm_page_busy_sleep(m, "madvpo", false);
+			m = NULL;
 			VM_OBJECT_WLOCK(object);
   			goto relookup;
 		}
-		if (advise == MADV_WILLNEED) {
-			vm_page_activate(m);
-		} else {
-			vm_page_advise(m, advise);
-		}
+		vm_page_advise(m, advice);
 		vm_page_unlock(m);
-		if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
+		if (advice == MADV_FREE && tobject->type == OBJT_SWAP)
 			swap_pager_freespace(tobject, tpindex, 1);
 unlock_tobject:
 		if (tobject != object)
 			VM_OBJECT_WUNLOCK(tobject);
-	}	
+	}
 	VM_OBJECT_WUNLOCK(object);
 }
 
 /*
  *	vm_object_shadow:
  *
  *	Create a new object which is backed by the
  *	specified existing object range.  The source
  *	object reference is deallocated.
  *
  *	The new object and offset into that object
  *	are returned in the source parameters.
  */
 void
 vm_object_shadow(
 	vm_object_t *object,	/* IN/OUT */
 	vm_ooffset_t *offset,	/* IN/OUT */
 	vm_size_t length)
 {
 	vm_object_t source;
 	vm_object_t result;
 
 	source = *object;
 
 	/*
 	 * Don't create the new object if the old object isn't shared.
 	 */
 	if (source != NULL) {
 		VM_OBJECT_WLOCK(source);
 		if (source->ref_count == 1 &&
 		    source->handle == NULL &&
 		    (source->type == OBJT_DEFAULT ||
 		     source->type == OBJT_SWAP)) {
 			VM_OBJECT_WUNLOCK(source);
 			return;
 		}
 		VM_OBJECT_WUNLOCK(source);
 	}
 
 	/*
 	 * Allocate a new object with the given length.
 	 */
 	result = vm_object_allocate(OBJT_DEFAULT, atop(length));
 
 	/*
 	 * The new object shadows the source object, adding a reference to it.
 	 * Our caller changes his reference to point to the new object,
 	 * removing a reference to the source object.  Net result: no change
 	 * of reference count.
 	 *
 	 * Try to optimize the result object's page color when shadowing
 	 * in order to maintain page coloring consistency in the combined 
 	 * shadowed object.
 	 */
 	result->backing_object = source;
 	/*
 	 * Store the offset into the source object, and fix up the offset into
 	 * the new object.
 	 */
 	result->backing_object_offset = *offset;
 	if (source != NULL) {
 		VM_OBJECT_WLOCK(source);
 		LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list);
 		source->shadow_count++;
 #if VM_NRESERVLEVEL > 0
 		result->flags |= source->flags & OBJ_COLORED;
 		result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) &
 		    ((1 << (VM_NFREEORDER - 1)) - 1);
 #endif
 		VM_OBJECT_WUNLOCK(source);
 	}
 
 
 	/*
 	 * Return the new things
 	 */
 	*offset = 0;
 	*object = result;
 }
 
 /*
  *	vm_object_split:
  *
  * Split the pages in a map entry into a new object.  This affords
  * easier removal of unused pages, and keeps object inheritance from
  * being a negative impact on memory usage.
  */
 void
 vm_object_split(vm_map_entry_t entry)
 {
 	vm_page_t m, m_next;
 	vm_object_t orig_object, new_object, source;
 	vm_pindex_t idx, offidxstart;
 	vm_size_t size;
 
 	orig_object = entry->object.vm_object;
 	if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
 		return;
 	if (orig_object->ref_count <= 1)
 		return;
 	VM_OBJECT_WUNLOCK(orig_object);
 
 	offidxstart = OFF_TO_IDX(entry->offset);
 	size = atop(entry->end - entry->start);
 
 	/*
 	 * If swap_pager_copy() is later called, it will convert new_object
 	 * into a swap object.
 	 */
 	new_object = vm_object_allocate(OBJT_DEFAULT, size);
 
 	/*
 	 * At this point, the new object is still private, so the order in
 	 * which the original and new objects are locked does not matter.
 	 */
 	VM_OBJECT_WLOCK(new_object);
 	VM_OBJECT_WLOCK(orig_object);
 	source = orig_object->backing_object;
 	if (source != NULL) {
 		VM_OBJECT_WLOCK(source);
 		if ((source->flags & OBJ_DEAD) != 0) {
 			VM_OBJECT_WUNLOCK(source);
 			VM_OBJECT_WUNLOCK(orig_object);
 			VM_OBJECT_WUNLOCK(new_object);
 			vm_object_deallocate(new_object);
 			VM_OBJECT_WLOCK(orig_object);
 			return;
 		}
 		LIST_INSERT_HEAD(&source->shadow_head,
 				  new_object, shadow_list);
 		source->shadow_count++;
 		vm_object_reference_locked(source);	/* for new_object */
 		vm_object_clear_flag(source, OBJ_ONEMAPPING);
 		VM_OBJECT_WUNLOCK(source);
 		new_object->backing_object_offset = 
 			orig_object->backing_object_offset + entry->offset;
 		new_object->backing_object = source;
 	}
 	if (orig_object->cred != NULL) {
 		new_object->cred = orig_object->cred;
 		crhold(orig_object->cred);
 		new_object->charge = ptoa(size);
 		KASSERT(orig_object->charge >= ptoa(size),
 		    ("orig_object->charge < 0"));
 		orig_object->charge -= ptoa(size);
 	}
 retry:
 	m = vm_page_find_least(orig_object, offidxstart);
 	for (; m != NULL && (idx = m->pindex - offidxstart) < size;
 	    m = m_next) {
 		m_next = TAILQ_NEXT(m, listq);
 
 		/*
 		 * We must wait for pending I/O to complete before we can
 		 * rename the page.
 		 *
 		 * We do not have to VM_PROT_NONE the page as mappings should
 		 * not be changed by this operation.
 		 */
 		if (vm_page_busied(m)) {
 			VM_OBJECT_WUNLOCK(new_object);
 			vm_page_lock(m);
 			VM_OBJECT_WUNLOCK(orig_object);
 			vm_page_busy_sleep(m, "spltwt", false);
 			VM_OBJECT_WLOCK(orig_object);
 			VM_OBJECT_WLOCK(new_object);
 			goto retry;
 		}
 
 		/* vm_page_rename() will dirty the page. */
 		if (vm_page_rename(m, new_object, idx)) {
 			VM_OBJECT_WUNLOCK(new_object);
 			VM_OBJECT_WUNLOCK(orig_object);
 			VM_WAIT;
 			VM_OBJECT_WLOCK(orig_object);
 			VM_OBJECT_WLOCK(new_object);
 			goto retry;
 		}
 #if VM_NRESERVLEVEL > 0
 		/*
 		 * If some of the reservation's allocated pages remain with
 		 * the original object, then transferring the reservation to
 		 * the new object is neither particularly beneficial nor
 		 * particularly harmful as compared to leaving the reservation
 		 * with the original object.  If, however, all of the
 		 * reservation's allocated pages are transferred to the new
 		 * object, then transferring the reservation is typically
 		 * beneficial.  Determining which of these two cases applies
 		 * would be more costly than unconditionally renaming the
 		 * reservation.
 		 */
 		vm_reserv_rename(m, new_object, orig_object, offidxstart);
 #endif
 		if (orig_object->type == OBJT_SWAP)
 			vm_page_xbusy(m);
 	}
 	if (orig_object->type == OBJT_SWAP) {
 		/*
 		 * swap_pager_copy() can sleep, in which case the orig_object's
 		 * and new_object's locks are released and reacquired. 
 		 */
 		swap_pager_copy(orig_object, new_object, offidxstart, 0);
 		TAILQ_FOREACH(m, &new_object->memq, listq)
 			vm_page_xunbusy(m);
 	}
 	VM_OBJECT_WUNLOCK(orig_object);
 	VM_OBJECT_WUNLOCK(new_object);
 	entry->object.vm_object = new_object;
 	entry->offset = 0LL;
 	vm_object_deallocate(orig_object);
 	VM_OBJECT_WLOCK(new_object);
 }
 
 #define	OBSC_COLLAPSE_NOWAIT	0x0002
 #define	OBSC_COLLAPSE_WAIT	0x0004
 
 static vm_page_t
 vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next,
     int op)
 {
 	vm_object_t backing_object;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	backing_object = object->backing_object;
 	VM_OBJECT_ASSERT_WLOCKED(backing_object);
 
 	KASSERT(p == NULL || vm_page_busied(p), ("unbusy page %p", p));
 	KASSERT(p == NULL || p->object == object || p->object == backing_object,
 	    ("invalid ownership %p %p %p", p, object, backing_object));
 	if ((op & OBSC_COLLAPSE_NOWAIT) != 0)
 		return (next);
 	if (p != NULL)
 		vm_page_lock(p);
 	VM_OBJECT_WUNLOCK(object);
 	VM_OBJECT_WUNLOCK(backing_object);
 	if (p == NULL)
 		VM_WAIT;
 	else
 		vm_page_busy_sleep(p, "vmocol", false);
 	VM_OBJECT_WLOCK(object);
 	VM_OBJECT_WLOCK(backing_object);
 	return (TAILQ_FIRST(&backing_object->memq));
 }
 
 static bool
 vm_object_scan_all_shadowed(vm_object_t object)
 {
 	vm_object_t backing_object;
 	vm_page_t p, pp;
 	vm_pindex_t backing_offset_index, new_pindex, pi, ps;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	VM_OBJECT_ASSERT_WLOCKED(object->backing_object);
 
 	backing_object = object->backing_object;
 
 	if (backing_object->type != OBJT_DEFAULT &&
 	    backing_object->type != OBJT_SWAP)
 		return (false);
 
 	pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
 	p = vm_page_find_least(backing_object, pi);
 	ps = swap_pager_find_least(backing_object, pi);
 
 	/*
 	 * Only check pages inside the parent object's range and
 	 * inside the parent object's mapping of the backing object.
 	 */
 	for (;; pi++) {
 		if (p != NULL && p->pindex < pi)
 			p = TAILQ_NEXT(p, listq);
 		if (ps < pi)
 			ps = swap_pager_find_least(backing_object, pi);
 		if (p == NULL && ps >= backing_object->size)
 			break;
 		else if (p == NULL)
 			pi = ps;
 		else
 			pi = MIN(p->pindex, ps);
 
 		new_pindex = pi - backing_offset_index;
 		if (new_pindex >= object->size)
 			break;
 
 		/*
 		 * See if the parent has the page or if the parent's object
 		 * pager has the page.  If the parent has the page but the page
 		 * is not valid, the parent's object pager must have the page.
 		 *
 		 * If this fails, the parent does not completely shadow the
 		 * object and we might as well give up now.
 		 */
 		pp = vm_page_lookup(object, new_pindex);
 		if ((pp == NULL || pp->valid == 0) &&
 		    !vm_pager_has_page(object, new_pindex, NULL, NULL))
 			return (false);
 	}
 	return (true);
 }
 
 static bool
 vm_object_collapse_scan(vm_object_t object, int op)
 {
 	vm_object_t backing_object;
 	vm_page_t next, p, pp;
 	vm_pindex_t backing_offset_index, new_pindex;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	VM_OBJECT_ASSERT_WLOCKED(object->backing_object);
 
 	backing_object = object->backing_object;
 	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
 
 	/*
 	 * Initial conditions
 	 */
 	if ((op & OBSC_COLLAPSE_WAIT) != 0)
 		vm_object_set_flag(backing_object, OBJ_DEAD);
 
 	/*
 	 * Our scan
 	 */
 	for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) {
 		next = TAILQ_NEXT(p, listq);
 		new_pindex = p->pindex - backing_offset_index;
 
 		/*
 		 * Check for busy page
 		 */
 		if (vm_page_busied(p)) {
 			next = vm_object_collapse_scan_wait(object, p, next, op);
 			continue;
 		}
 
 		KASSERT(p->object == backing_object,
 		    ("vm_object_collapse_scan: object mismatch"));
 
 		if (p->pindex < backing_offset_index ||
 		    new_pindex >= object->size) {
 			if (backing_object->type == OBJT_SWAP)
 				swap_pager_freespace(backing_object, p->pindex,
 				    1);
 
 			/*
 			 * Page is out of the parent object's range, we can
 			 * simply destroy it.
 			 */
 			vm_page_lock(p);
 			KASSERT(!pmap_page_is_mapped(p),
 			    ("freeing mapped page %p", p));
 			if (p->wire_count == 0)
 				vm_page_free(p);
 			else
 				vm_page_remove(p);
 			vm_page_unlock(p);
 			continue;
 		}
 
 		pp = vm_page_lookup(object, new_pindex);
 		if (pp != NULL && vm_page_busied(pp)) {
 			/*
 			 * The page in the parent is busy and possibly not
 			 * (yet) valid.  Until its state is finalized by the
 			 * busy bit owner, we can't tell whether it shadows the
 			 * original page.  Therefore, we must either skip it
 			 * and the original (backing_object) page or wait for
 			 * its state to be finalized.
 			 *
 			 * This is due to a race with vm_fault() where we must
 			 * unbusy the original (backing_obj) page before we can
 			 * (re)lock the parent.  Hence we can get here.
 			 */
 			next = vm_object_collapse_scan_wait(object, pp, next,
 			    op);
 			continue;
 		}
 
 		KASSERT(pp == NULL || pp->valid != 0,
 		    ("unbusy invalid page %p", pp));
 
 		if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL,
 			NULL)) {
 			/*
 			 * The page already exists in the parent OR swap exists
 			 * for this location in the parent.  Leave the parent's
 			 * page alone.  Destroy the original page from the
 			 * backing object.
 			 */
 			if (backing_object->type == OBJT_SWAP)
 				swap_pager_freespace(backing_object, p->pindex,
 				    1);
 			vm_page_lock(p);
 			KASSERT(!pmap_page_is_mapped(p),
 			    ("freeing mapped page %p", p));
 			if (p->wire_count == 0)
 				vm_page_free(p);
 			else
 				vm_page_remove(p);
 			vm_page_unlock(p);
 			continue;
 		}
 
 		/*
 		 * Page does not exist in parent, rename the page from the
 		 * backing object to the main object.
 		 *
 		 * If the page was mapped to a process, it can remain mapped
 		 * through the rename.  vm_page_rename() will dirty the page.
 		 */
 		if (vm_page_rename(p, object, new_pindex)) {
 			next = vm_object_collapse_scan_wait(object, NULL, next,
 			    op);
 			continue;
 		}
 
 		/* Use the old pindex to free the right page. */
 		if (backing_object->type == OBJT_SWAP)
 			swap_pager_freespace(backing_object,
 			    new_pindex + backing_offset_index, 1);
 
 #if VM_NRESERVLEVEL > 0
 		/*
 		 * Rename the reservation.
 		 */
 		vm_reserv_rename(p, object, backing_object,
 		    backing_offset_index);
 #endif
 	}
 	return (true);
 }
 
 
 /*
  * this version of collapse allows the operation to occur earlier and
  * when paging_in_progress is true for an object...  This is not a complete
  * operation, but should plug 99.9% of the rest of the leaks.
  */
 static void
 vm_object_qcollapse(vm_object_t object)
 {
 	vm_object_t backing_object = object->backing_object;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	VM_OBJECT_ASSERT_WLOCKED(backing_object);
 
 	if (backing_object->ref_count != 1)
 		return;
 
 	vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT);
 }
 
 /*
  *	vm_object_collapse:
  *
  *	Collapse an object with the object backing it.
  *	Pages in the backing object are moved into the
  *	parent, and the backing object is deallocated.
  */
 void
 vm_object_collapse(vm_object_t object)
 {
 	vm_object_t backing_object, new_backing_object;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	while (TRUE) {
 		/*
 		 * Verify that the conditions are right for collapse:
 		 *
 		 * The object exists and the backing object exists.
 		 */
 		if ((backing_object = object->backing_object) == NULL)
 			break;
 
 		/*
 		 * we check the backing object first, because it is most likely
 		 * not collapsable.
 		 */
 		VM_OBJECT_WLOCK(backing_object);
 		if (backing_object->handle != NULL ||
 		    (backing_object->type != OBJT_DEFAULT &&
 		     backing_object->type != OBJT_SWAP) ||
 		    (backing_object->flags & OBJ_DEAD) ||
 		    object->handle != NULL ||
 		    (object->type != OBJT_DEFAULT &&
 		     object->type != OBJT_SWAP) ||
 		    (object->flags & OBJ_DEAD)) {
 			VM_OBJECT_WUNLOCK(backing_object);
 			break;
 		}
 
 		if (object->paging_in_progress != 0 ||
 		    backing_object->paging_in_progress != 0) {
 			vm_object_qcollapse(object);
 			VM_OBJECT_WUNLOCK(backing_object);
 			break;
 		}
 
 		/*
 		 * We know that we can either collapse the backing object (if
 		 * the parent is the only reference to it) or (perhaps) have
 		 * the parent bypass the object if the parent happens to shadow
 		 * all the resident pages in the entire backing object.
 		 *
 		 * This is ignoring pager-backed pages such as swap pages.
 		 * vm_object_collapse_scan fails the shadowing test in this
 		 * case.
 		 */
 		if (backing_object->ref_count == 1) {
 			vm_object_pip_add(object, 1);
 			vm_object_pip_add(backing_object, 1);
 
 			/*
 			 * If there is exactly one reference to the backing
 			 * object, we can collapse it into the parent.
 			 */
 			vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT);
 
 #if VM_NRESERVLEVEL > 0
 			/*
 			 * Break any reservations from backing_object.
 			 */
 			if (__predict_false(!LIST_EMPTY(&backing_object->rvq)))
 				vm_reserv_break_all(backing_object);
 #endif
 
 			/*
 			 * Move the pager from backing_object to object.
 			 */
 			if (backing_object->type == OBJT_SWAP) {
 				/*
 				 * swap_pager_copy() can sleep, in which case
 				 * the backing_object's and object's locks are
 				 * released and reacquired.
 				 * Since swap_pager_copy() is being asked to
 				 * destroy the source, it will change the
 				 * backing_object's type to OBJT_DEFAULT.
 				 */
 				swap_pager_copy(
 				    backing_object,
 				    object,
 				    OFF_TO_IDX(object->backing_object_offset), TRUE);
 			}
 			/*
 			 * Object now shadows whatever backing_object did.
 			 * Note that the reference to 
 			 * backing_object->backing_object moves from within 
 			 * backing_object to within object.
 			 */
 			LIST_REMOVE(object, shadow_list);
 			backing_object->shadow_count--;
 			if (backing_object->backing_object) {
 				VM_OBJECT_WLOCK(backing_object->backing_object);
 				LIST_REMOVE(backing_object, shadow_list);
 				LIST_INSERT_HEAD(
 				    &backing_object->backing_object->shadow_head,
 				    object, shadow_list);
 				/*
 				 * The shadow_count has not changed.
 				 */
 				VM_OBJECT_WUNLOCK(backing_object->backing_object);
 			}
 			object->backing_object = backing_object->backing_object;
 			object->backing_object_offset +=
 			    backing_object->backing_object_offset;
 
 			/*
 			 * Discard backing_object.
 			 *
 			 * Since the backing object has no pages, no pager left,
 			 * and no object references within it, all that is
 			 * necessary is to dispose of it.
 			 */
 			KASSERT(backing_object->ref_count == 1, (
 "backing_object %p was somehow re-referenced during collapse!",
 			    backing_object));
 			vm_object_pip_wakeup(backing_object);
 			backing_object->type = OBJT_DEAD;
 			backing_object->ref_count = 0;
 			VM_OBJECT_WUNLOCK(backing_object);
 			vm_object_destroy(backing_object);
 
 			vm_object_pip_wakeup(object);
 			object_collapses++;
 		} else {
 			/*
 			 * If we do not entirely shadow the backing object,
 			 * there is nothing we can do so we give up.
 			 */
 			if (object->resident_page_count != object->size &&
 			    !vm_object_scan_all_shadowed(object)) {
 				VM_OBJECT_WUNLOCK(backing_object);
 				break;
 			}
 
 			/*
 			 * Make the parent shadow the next object in the
 			 * chain.  Deallocating backing_object will not remove
 			 * it, since its reference count is at least 2.
 			 */
 			LIST_REMOVE(object, shadow_list);
 			backing_object->shadow_count--;
 
 			new_backing_object = backing_object->backing_object;
 			if ((object->backing_object = new_backing_object) != NULL) {
 				VM_OBJECT_WLOCK(new_backing_object);
 				LIST_INSERT_HEAD(
 				    &new_backing_object->shadow_head,
 				    object,
 				    shadow_list
 				);
 				new_backing_object->shadow_count++;
 				vm_object_reference_locked(new_backing_object);
 				VM_OBJECT_WUNLOCK(new_backing_object);
 				object->backing_object_offset +=
 					backing_object->backing_object_offset;
 			}
 
 			/*
 			 * Drop the reference count on backing_object. Since
 			 * its ref_count was at least 2, it will not vanish.
 			 */
 			backing_object->ref_count--;
 			VM_OBJECT_WUNLOCK(backing_object);
 			object_bypasses++;
 		}
 
 		/*
 		 * Try again with this object's new backing object.
 		 */
 	}
 }
 
 /*
  *	vm_object_page_remove:
  *
  *	For the given object, either frees or invalidates each of the
  *	specified pages.  In general, a page is freed.  However, if a page is
  *	wired for any reason other than the existence of a managed, wired
  *	mapping, then it may be invalidated but not removed from the object.
  *	Pages are specified by the given range ["start", "end") and the option
  *	OBJPR_CLEANONLY.  As a special case, if "end" is zero, then the range
  *	extends from "start" to the end of the object.  If the option
  *	OBJPR_CLEANONLY is specified, then only the non-dirty pages within the
  *	specified range are affected.  If the option OBJPR_NOTMAPPED is
  *	specified, then the pages within the specified range must have no
  *	mappings.  Otherwise, if this option is not specified, any mappings to
  *	the specified pages are removed before the pages are freed or
  *	invalidated.
  *
  *	In general, this operation should only be performed on objects that
  *	contain managed pages.  There are, however, two exceptions.  First, it
  *	is performed on the kernel and kmem objects by vm_map_entry_delete().
  *	Second, it is used by msync(..., MS_INVALIDATE) to invalidate device-
  *	backed pages.  In both of these cases, the option OBJPR_CLEANONLY must
  *	not be specified and the option OBJPR_NOTMAPPED must be specified.
  *
  *	The object must be locked.
  */
 void
 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
     int options)
 {
 	vm_page_t p, next;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
 	    (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED,
 	    ("vm_object_page_remove: illegal options for object %p", object));
 	if (object->resident_page_count == 0)
 		return;
 	vm_object_pip_add(object, 1);
 again:
 	p = vm_page_find_least(object, start);
 
 	/*
 	 * Here, the variable "p" is either (1) the page with the least pindex
 	 * greater than or equal to the parameter "start" or (2) NULL. 
 	 */
 	for (; p != NULL && (p->pindex < end || end == 0); p = next) {
 		next = TAILQ_NEXT(p, listq);
 
 		/*
 		 * If the page is wired for any reason besides the existence
 		 * of managed, wired mappings, then it cannot be freed.  For
 		 * example, fictitious pages, which represent device memory,
 		 * are inherently wired and cannot be freed.  They can,
 		 * however, be invalidated if the option OBJPR_CLEANONLY is
 		 * not specified.
 		 */
 		vm_page_lock(p);
 		if (vm_page_xbusied(p)) {
 			VM_OBJECT_WUNLOCK(object);
 			vm_page_busy_sleep(p, "vmopax", true);
 			VM_OBJECT_WLOCK(object);
 			goto again;
 		}
 		if (p->wire_count != 0) {
 			if ((options & OBJPR_NOTMAPPED) == 0)
 				pmap_remove_all(p);
 			if ((options & OBJPR_CLEANONLY) == 0) {
 				p->valid = 0;
 				vm_page_undirty(p);
 			}
 			goto next;
 		}
 		if (vm_page_busied(p)) {
 			VM_OBJECT_WUNLOCK(object);
 			vm_page_busy_sleep(p, "vmopar", false);
 			VM_OBJECT_WLOCK(object);
 			goto again;
 		}
 		KASSERT((p->flags & PG_FICTITIOUS) == 0,
 		    ("vm_object_page_remove: page %p is fictitious", p));
 		if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) {
 			if ((options & OBJPR_NOTMAPPED) == 0)
 				pmap_remove_write(p);
 			if (p->dirty)
 				goto next;
 		}
 		if ((options & OBJPR_NOTMAPPED) == 0)
 			pmap_remove_all(p);
 		vm_page_free(p);
 next:
 		vm_page_unlock(p);
 	}
 	vm_object_pip_wakeup(object);
 }
 
 /*
  *	vm_object_page_noreuse:
  *
  *	For the given object, attempt to move the specified pages to
  *	the head of the inactive queue.  This bypasses regular LRU
  *	operation and allows the pages to be reused quickly under memory
  *	pressure.  If a page is wired for any reason, then it will not
  *	be queued.  Pages are specified by the range ["start", "end").
  *	As a special case, if "end" is zero, then the range extends from
  *	"start" to the end of the object.
  *
  *	This operation should only be performed on objects that
  *	contain non-fictitious, managed pages.
  *
  *	The object must be locked.
  */
 void
 vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
 	struct mtx *mtx, *new_mtx;
 	vm_page_t p, next;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0,
 	    ("vm_object_page_noreuse: illegal object %p", object));
 	if (object->resident_page_count == 0)
 		return;
 	p = vm_page_find_least(object, start);
 
 	/*
 	 * Here, the variable "p" is either (1) the page with the least pindex
 	 * greater than or equal to the parameter "start" or (2) NULL. 
 	 */
 	mtx = NULL;
 	for (; p != NULL && (p->pindex < end || end == 0); p = next) {
 		next = TAILQ_NEXT(p, listq);
 
 		/*
 		 * Avoid releasing and reacquiring the same page lock.
 		 */
 		new_mtx = vm_page_lockptr(p);
 		if (mtx != new_mtx) {
 			if (mtx != NULL)
 				mtx_unlock(mtx);
 			mtx = new_mtx;
 			mtx_lock(mtx);
 		}
 		vm_page_deactivate_noreuse(p);
 	}
 	if (mtx != NULL)
 		mtx_unlock(mtx);
 }
 
 /*
  *	Populate the specified range of the object with valid pages.  Returns
  *	TRUE if the range is successfully populated and FALSE otherwise.
  *
  *	Note: This function should be optimized to pass a larger array of
  *	pages to vm_pager_get_pages() before it is applied to a non-
  *	OBJT_DEVICE object.
  *
  *	The object must be locked.
  */
 boolean_t
 vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
 	vm_page_t m;
 	vm_pindex_t pindex;
 	int rv;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	for (pindex = start; pindex < end; pindex++) {
 		m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
 		if (m->valid != VM_PAGE_BITS_ALL) {
 			rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
 			if (rv != VM_PAGER_OK) {
 				vm_page_lock(m);
 				vm_page_free(m);
 				vm_page_unlock(m);
 				break;
 			}
 		}
 		/*
 		 * Keep "m" busy because a subsequent iteration may unlock
 		 * the object.
 		 */
 	}
 	if (pindex > start) {
 		m = vm_page_lookup(object, start);
 		while (m != NULL && m->pindex < pindex) {
 			vm_page_xunbusy(m);
 			m = TAILQ_NEXT(m, listq);
 		}
 	}
 	return (pindex == end);
 }
 
 /*
  *	Routine:	vm_object_coalesce
  *	Function:	Coalesces two objects backing up adjoining
  *			regions of memory into a single object.
  *
  *	returns TRUE if objects were combined.
  *
  *	NOTE:	Only works at the moment if the second object is NULL -
  *		if it's not, which object do we lock first?
  *
  *	Parameters:
  *		prev_object	First object to coalesce
  *		prev_offset	Offset into prev_object
  *		prev_size	Size of reference to prev_object
  *		next_size	Size of reference to the second object
  *		reserved	Indicator that extension region has
  *				swap accounted for
  *
  *	Conditions:
  *	The object must *not* be locked.
  */
 boolean_t
 vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset,
     vm_size_t prev_size, vm_size_t next_size, boolean_t reserved)
 {
 	vm_pindex_t next_pindex;
 
 	if (prev_object == NULL)
 		return (TRUE);
 	VM_OBJECT_WLOCK(prev_object);
 	if ((prev_object->type != OBJT_DEFAULT &&
 	    prev_object->type != OBJT_SWAP) ||
 	    (prev_object->flags & OBJ_TMPFS_NODE) != 0) {
 		VM_OBJECT_WUNLOCK(prev_object);
 		return (FALSE);
 	}
 
 	/*
 	 * Try to collapse the object first
 	 */
 	vm_object_collapse(prev_object);
 
 	/*
 	 * Can't coalesce if: . more than one reference . paged out . shadows
 	 * another object . has a copy elsewhere (any of which mean that the
 	 * pages not mapped to prev_entry may be in use anyway)
 	 */
 	if (prev_object->backing_object != NULL) {
 		VM_OBJECT_WUNLOCK(prev_object);
 		return (FALSE);
 	}
 
 	prev_size >>= PAGE_SHIFT;
 	next_size >>= PAGE_SHIFT;
 	next_pindex = OFF_TO_IDX(prev_offset) + prev_size;
 
 	if ((prev_object->ref_count > 1) &&
 	    (prev_object->size != next_pindex)) {
 		VM_OBJECT_WUNLOCK(prev_object);
 		return (FALSE);
 	}
 
 	/*
 	 * Account for the charge.
 	 */
 	if (prev_object->cred != NULL) {
 
 		/*
 		 * If prev_object was charged, then this mapping,
 		 * although not charged now, may become writable
 		 * later. Non-NULL cred in the object would prevent
 		 * swap reservation during enabling of the write
 		 * access, so reserve swap now. Failed reservation
 		 * cause allocation of the separate object for the map
 		 * entry, and swap reservation for this entry is
 		 * managed in appropriate time.
 		 */
 		if (!reserved && !swap_reserve_by_cred(ptoa(next_size),
 		    prev_object->cred)) {
 			VM_OBJECT_WUNLOCK(prev_object);
 			return (FALSE);
 		}
 		prev_object->charge += ptoa(next_size);
 	}
 
 	/*
 	 * Remove any pages that may still be in the object from a previous
 	 * deallocation.
 	 */
 	if (next_pindex < prev_object->size) {
 		vm_object_page_remove(prev_object, next_pindex, next_pindex +
 		    next_size, 0);
 		if (prev_object->type == OBJT_SWAP)
 			swap_pager_freespace(prev_object,
 					     next_pindex, next_size);
 #if 0
 		if (prev_object->cred != NULL) {
 			KASSERT(prev_object->charge >=
 			    ptoa(prev_object->size - next_pindex),
 			    ("object %p overcharged 1 %jx %jx", prev_object,
 				(uintmax_t)next_pindex, (uintmax_t)next_size));
 			prev_object->charge -= ptoa(prev_object->size -
 			    next_pindex);
 		}
 #endif
 	}
 
 	/*
 	 * Extend the object if necessary.
 	 */
 	if (next_pindex + next_size > prev_object->size)
 		prev_object->size = next_pindex + next_size;
 
 	VM_OBJECT_WUNLOCK(prev_object);
 	return (TRUE);
 }
 
 void
 vm_object_set_writeable_dirty(vm_object_t object)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (object->type != OBJT_VNODE) {
 		if ((object->flags & OBJ_TMPFS_NODE) != 0) {
 			KASSERT(object->type == OBJT_SWAP, ("non-swap tmpfs"));
 			vm_object_set_flag(object, OBJ_TMPFS_DIRTY);
 		}
 		return;
 	}
 	object->generation++;
 	if ((object->flags & OBJ_MIGHTBEDIRTY) != 0)
 		return;
 	vm_object_set_flag(object, OBJ_MIGHTBEDIRTY);
 }
 
 /*
  *	vm_object_unwire:
  *
  *	For each page offset within the specified range of the given object,
  *	find the highest-level page in the shadow chain and unwire it.  A page
  *	must exist at every page offset, and the highest-level page must be
  *	wired.
  */
 void
 vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length,
     uint8_t queue)
 {
 	vm_object_t tobject;
 	vm_page_t m, tm;
 	vm_pindex_t end_pindex, pindex, tpindex;
 	int depth, locked_depth;
 
 	KASSERT((offset & PAGE_MASK) == 0,
 	    ("vm_object_unwire: offset is not page aligned"));
 	KASSERT((length & PAGE_MASK) == 0,
 	    ("vm_object_unwire: length is not a multiple of PAGE_SIZE"));
 	/* The wired count of a fictitious page never changes. */
 	if ((object->flags & OBJ_FICTITIOUS) != 0)
 		return;
 	pindex = OFF_TO_IDX(offset);
 	end_pindex = pindex + atop(length);
 	locked_depth = 1;
 	VM_OBJECT_RLOCK(object);
 	m = vm_page_find_least(object, pindex);
 	while (pindex < end_pindex) {
 		if (m == NULL || pindex < m->pindex) {
 			/*
 			 * The first object in the shadow chain doesn't
 			 * contain a page at the current index.  Therefore,
 			 * the page must exist in a backing object.
 			 */
 			tobject = object;
 			tpindex = pindex;
 			depth = 0;
 			do {
 				tpindex +=
 				    OFF_TO_IDX(tobject->backing_object_offset);
 				tobject = tobject->backing_object;
 				KASSERT(tobject != NULL,
 				    ("vm_object_unwire: missing page"));
 				if ((tobject->flags & OBJ_FICTITIOUS) != 0)
 					goto next_page;
 				depth++;
 				if (depth == locked_depth) {
 					locked_depth++;
 					VM_OBJECT_RLOCK(tobject);
 				}
 			} while ((tm = vm_page_lookup(tobject, tpindex)) ==
 			    NULL);
 		} else {
 			tm = m;
 			m = TAILQ_NEXT(m, listq);
 		}
 		vm_page_lock(tm);
 		vm_page_unwire(tm, queue);
 		vm_page_unlock(tm);
 next_page:
 		pindex++;
 	}
 	/* Release the accumulated object locks. */
 	for (depth = 0; depth < locked_depth; depth++) {
 		tobject = object->backing_object;
 		VM_OBJECT_RUNLOCK(object);
 		object = tobject;
 	}
 }
 
 struct vnode *
 vm_object_vnode(vm_object_t object)
 {
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	if (object->type == OBJT_VNODE)
 		return (object->handle);
 	if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0)
 		return (object->un_pager.swp.swp_tmpfs);
 	return (NULL);
 }
 
 static int
 sysctl_vm_object_list(SYSCTL_HANDLER_ARGS)
 {
 	struct kinfo_vmobject kvo;
 	char *fullpath, *freepath;
 	struct vnode *vp;
 	struct vattr va;
 	vm_object_t obj;
 	vm_page_t m;
 	int count, error;
 
 	if (req->oldptr == NULL) {
 		/*
 		 * If an old buffer has not been provided, generate an
 		 * estimate of the space needed for a subsequent call.
 		 */
 		mtx_lock(&vm_object_list_mtx);
 		count = 0;
 		TAILQ_FOREACH(obj, &vm_object_list, object_list) {
 			if (obj->type == OBJT_DEAD)
 				continue;
 			count++;
 		}
 		mtx_unlock(&vm_object_list_mtx);
 		return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) *
 		    count * 11 / 10));
 	}
 
 	error = 0;
 
 	/*
 	 * VM objects are type stable and are never removed from the
 	 * list once added.  This allows us to safely read obj->object_list
 	 * after reacquiring the VM object lock.
 	 */
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_FOREACH(obj, &vm_object_list, object_list) {
 		if (obj->type == OBJT_DEAD)
 			continue;
 		VM_OBJECT_RLOCK(obj);
 		if (obj->type == OBJT_DEAD) {
 			VM_OBJECT_RUNLOCK(obj);
 			continue;
 		}
 		mtx_unlock(&vm_object_list_mtx);
 		kvo.kvo_size = ptoa(obj->size);
 		kvo.kvo_resident = obj->resident_page_count;
 		kvo.kvo_ref_count = obj->ref_count;
 		kvo.kvo_shadow_count = obj->shadow_count;
 		kvo.kvo_memattr = obj->memattr;
 		kvo.kvo_active = 0;
 		kvo.kvo_inactive = 0;
 		TAILQ_FOREACH(m, &obj->memq, listq) {
 			/*
 			 * A page may belong to the object but be
 			 * dequeued and set to PQ_NONE while the
 			 * object lock is not held.  This makes the
 			 * reads of m->queue below racy, and we do not
 			 * count pages set to PQ_NONE.  However, this
 			 * sysctl is only meant to give an
 			 * approximation of the system anyway.
 			 */
 			if (vm_page_active(m))
 				kvo.kvo_active++;
 			else if (vm_page_inactive(m))
 				kvo.kvo_inactive++;
 		}
 
 		kvo.kvo_vn_fileid = 0;
 		kvo.kvo_vn_fsid = 0;
 		freepath = NULL;
 		fullpath = "";
 		vp = NULL;
 		switch (obj->type) {
 		case OBJT_DEFAULT:
 			kvo.kvo_type = KVME_TYPE_DEFAULT;
 			break;
 		case OBJT_VNODE:
 			kvo.kvo_type = KVME_TYPE_VNODE;
 			vp = obj->handle;
 			vref(vp);
 			break;
 		case OBJT_SWAP:
 			kvo.kvo_type = KVME_TYPE_SWAP;
 			break;
 		case OBJT_DEVICE:
 			kvo.kvo_type = KVME_TYPE_DEVICE;
 			break;
 		case OBJT_PHYS:
 			kvo.kvo_type = KVME_TYPE_PHYS;
 			break;
 		case OBJT_DEAD:
 			kvo.kvo_type = KVME_TYPE_DEAD;
 			break;
 		case OBJT_SG:
 			kvo.kvo_type = KVME_TYPE_SG;
 			break;
 		case OBJT_MGTDEVICE:
 			kvo.kvo_type = KVME_TYPE_MGTDEVICE;
 			break;
 		default:
 			kvo.kvo_type = KVME_TYPE_UNKNOWN;
 			break;
 		}
 		VM_OBJECT_RUNLOCK(obj);
 		if (vp != NULL) {
 			vn_fullpath(curthread, vp, &fullpath, &freepath);
 			vn_lock(vp, LK_SHARED | LK_RETRY);
 			if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) {
 				kvo.kvo_vn_fileid = va.va_fileid;
 				kvo.kvo_vn_fsid = va.va_fsid;
 			}
 			vput(vp);
 		}
 
 		strlcpy(kvo.kvo_path, fullpath, sizeof(kvo.kvo_path));
 		if (freepath != NULL)
 			free(freepath, M_TEMP);
 
 		/* Pack record size down */
 		kvo.kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path) +
 		    strlen(kvo.kvo_path) + 1;
 		kvo.kvo_structsize = roundup(kvo.kvo_structsize,
 		    sizeof(uint64_t));
 		error = SYSCTL_OUT(req, &kvo, kvo.kvo_structsize);
 		mtx_lock(&vm_object_list_mtx);
 		if (error)
 			break;
 	}
 	mtx_unlock(&vm_object_list_mtx);
 	return (error);
 }
 SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject",
     "List of VM objects");
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <sys/cons.h>
 
 #include <ddb/ddb.h>
 
 static int
 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
 {
 	vm_map_t tmpm;
 	vm_map_entry_t tmpe;
 	vm_object_t obj;
 	int entcount;
 
 	if (map == 0)
 		return 0;
 
 	if (entry == 0) {
 		tmpe = map->header.next;
 		entcount = map->nentries;
 		while (entcount-- && (tmpe != &map->header)) {
 			if (_vm_object_in_map(map, object, tmpe)) {
 				return 1;
 			}
 			tmpe = tmpe->next;
 		}
 	} else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
 		tmpm = entry->object.sub_map;
 		tmpe = tmpm->header.next;
 		entcount = tmpm->nentries;
 		while (entcount-- && tmpe != &tmpm->header) {
 			if (_vm_object_in_map(tmpm, object, tmpe)) {
 				return 1;
 			}
 			tmpe = tmpe->next;
 		}
 	} else if ((obj = entry->object.vm_object) != NULL) {
 		for (; obj; obj = obj->backing_object)
 			if (obj == object) {
 				return 1;
 			}
 	}
 	return 0;
 }
 
 static int
 vm_object_in_map(vm_object_t object)
 {
 	struct proc *p;
 
 	/* sx_slock(&allproc_lock); */
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */)
 			continue;
 		if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) {
 			/* sx_sunlock(&allproc_lock); */
 			return 1;
 		}
 	}
 	/* sx_sunlock(&allproc_lock); */
 	if (_vm_object_in_map(kernel_map, object, 0))
 		return 1;
 	return 0;
 }
 
 DB_SHOW_COMMAND(vmochk, vm_object_check)
 {
 	vm_object_t object;
 
 	/*
 	 * make sure that internal objs are in a map somewhere
 	 * and none have zero ref counts.
 	 */
 	TAILQ_FOREACH(object, &vm_object_list, object_list) {
 		if (object->handle == NULL &&
 		    (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
 			if (object->ref_count == 0) {
 				db_printf("vmochk: internal obj has zero ref count: %ld\n",
 					(long)object->size);
 			}
 			if (!vm_object_in_map(object)) {
 				db_printf(
 			"vmochk: internal obj is not in a map: "
 			"ref: %d, size: %lu: 0x%lx, backing_object: %p\n",
 				    object->ref_count, (u_long)object->size, 
 				    (u_long)object->size,
 				    (void *)object->backing_object);
 			}
 		}
 	}
 }
 
 /*
  *	vm_object_print:	[ debug ]
  */
 DB_SHOW_COMMAND(object, vm_object_print_static)
 {
 	/* XXX convert args. */
 	vm_object_t object = (vm_object_t)addr;
 	boolean_t full = have_addr;
 
 	vm_page_t p;
 
 	/* XXX count is an (unused) arg.  Avoid shadowing it. */
 #define	count	was_count
 
 	int count;
 
 	if (object == NULL)
 		return;
 
 	db_iprintf(
 	    "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n",
 	    object, (int)object->type, (uintmax_t)object->size,
 	    object->resident_page_count, object->ref_count, object->flags,
 	    object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge);
 	db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n",
 	    object->shadow_count, 
 	    object->backing_object ? object->backing_object->ref_count : 0,
 	    object->backing_object, (uintmax_t)object->backing_object_offset);
 
 	if (!full)
 		return;
 
 	db_indent += 2;
 	count = 0;
 	TAILQ_FOREACH(p, &object->memq, listq) {
 		if (count == 0)
 			db_iprintf("memory:=");
 		else if (count == 6) {
 			db_printf("\n");
 			db_iprintf(" ...");
 			count = 0;
 		} else
 			db_printf(",");
 		count++;
 
 		db_printf("(off=0x%jx,page=0x%jx)",
 		    (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p));
 	}
 	if (count != 0)
 		db_printf("\n");
 	db_indent -= 2;
 }
 
 /* XXX. */
 #undef count
 
 /* XXX need this non-static entry for calling from vm_map_print. */
 void
 vm_object_print(
         /* db_expr_t */ long addr,
 	boolean_t have_addr,
 	/* db_expr_t */ long count,
 	char *modif)
 {
 	vm_object_print_static(addr, have_addr, count, modif);
 }
 
 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
 {
 	vm_object_t object;
 	vm_pindex_t fidx;
 	vm_paddr_t pa;
 	vm_page_t m, prev_m;
 	int rcount, nl, c;
 
 	nl = 0;
 	TAILQ_FOREACH(object, &vm_object_list, object_list) {
 		db_printf("new object: %p\n", (void *)object);
 		if (nl > 18) {
 			c = cngetc();
 			if (c != ' ')
 				return;
 			nl = 0;
 		}
 		nl++;
 		rcount = 0;
 		fidx = 0;
 		pa = -1;
 		TAILQ_FOREACH(m, &object->memq, listq) {
 			if (m->pindex > 128)
 				break;
 			if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL &&
 			    prev_m->pindex + 1 != m->pindex) {
 				if (rcount) {
 					db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 						(long)fidx, rcount, (long)pa);
 					if (nl > 18) {
 						c = cngetc();
 						if (c != ' ')
 							return;
 						nl = 0;
 					}
 					nl++;
 					rcount = 0;
 				}
 			}				
 			if (rcount &&
 				(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
 				++rcount;
 				continue;
 			}
 			if (rcount) {
 				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 					(long)fidx, rcount, (long)pa);
 				if (nl > 18) {
 					c = cngetc();
 					if (c != ' ')
 						return;
 					nl = 0;
 				}
 				nl++;
 			}
 			fidx = m->pindex;
 			pa = VM_PAGE_TO_PHYS(m);
 			rcount = 1;
 		}
 		if (rcount) {
 			db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 				(long)fidx, rcount, (long)pa);
 			if (nl > 18) {
 				c = cngetc();
 				if (c != ' ')
 					return;
 				nl = 0;
 			}
 			nl++;
 		}
 	}
 }
 #endif /* DDB */
Index: projects/netbsd-tests-upstream-01-2017/sys/vm/vm_page.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/sys/vm/vm_page.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/sys/vm/vm_page.c	(revision 312218)
@@ -1,3600 +1,3603 @@
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
  */
 
 /*-
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *			GENERAL RULES ON VM_PAGE MANIPULATION
  *
  *	- A page queue lock is required when adding or removing a page from a
  *	  page queue regardless of other locks or the busy state of a page.
  *
  *		* In general, no thread besides the page daemon can acquire or
  *		  hold more than one page queue lock at a time.
  *
  *		* The page daemon can acquire and hold any pair of page queue
  *		  locks in any order.
  *
  *	- The object lock is required when inserting or removing
  *	  pages from an object (vm_page_insert() or vm_page_remove()).
  *
  */
 
 /*
  *	Resident memory management module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 
 #include <machine/md_var.h>
 
 /*
  *	Associated with page of user-allocatable memory is a
  *	page structure.
  */
 
 struct vm_domain vm_dom[MAXMEMDOM];
 struct mtx_padalign vm_page_queue_free_mtx;
 
 struct mtx_padalign pa_lock[PA_LOCK_COUNT];
 
 /*
  * bogus page -- for I/O to/from partially complete buffers,
  * or for paging into sparsely invalid regions.
  */
 vm_page_t bogus_page;
 
 vm_page_t vm_page_array;
 long vm_page_array_size;
 long first_page;
 
 static int boot_pages = UMA_BOOT_PAGES;
 SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &boot_pages, 0,
     "number of pages allocated for bootstrapping the VM system");
 
 static int pa_tryrelock_restart;
 SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
     &pa_tryrelock_restart, 0, "Number of tryrelock restarts");
 
 static TAILQ_HEAD(, vm_page) blacklist_head;
 static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");
 
 /* Is the page daemon waiting for free pages? */
 static int vm_pageout_pages_needed;
 
 static uma_zone_t fakepg_zone;
 
 static void vm_page_alloc_check(vm_page_t m);
 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
 static void vm_page_enqueue(uint8_t queue, vm_page_t m);
 static void vm_page_free_wakeup(void);
 static void vm_page_init(void *dummy);
 static int vm_page_insert_after(vm_page_t m, vm_object_t object,
     vm_pindex_t pindex, vm_page_t mpred);
 static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
     vm_page_t mpred);
 static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
     vm_paddr_t high);
 
 SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL);
 
 static void
 vm_page_init(void *dummy)
 {
 
 	fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
 	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
 }
 
 /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
 #if PAGE_SIZE == 32768
 #ifdef CTASSERT
 CTASSERT(sizeof(u_long) >= 8);
 #endif
 #endif
 
 /*
  * Try to acquire a physical address lock while a pmap is locked.  If we
  * fail to trylock we unlock and lock the pmap directly and cache the
  * locked pa in *locked.  The caller should then restart their loop in case
  * the virtual to physical mapping has changed.
  */
 int
 vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
 {
 	vm_paddr_t lockpa;
 
 	lockpa = *locked;
 	*locked = pa;
 	if (lockpa) {
 		PA_LOCK_ASSERT(lockpa, MA_OWNED);
 		if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
 			return (0);
 		PA_UNLOCK(lockpa);
 	}
 	if (PA_TRYLOCK(pa))
 		return (0);
 	PMAP_UNLOCK(pmap);
 	atomic_add_int(&pa_tryrelock_restart, 1);
 	PA_LOCK(pa);
 	PMAP_LOCK(pmap);
 	return (EAGAIN);
 }
 
 /*
  *	vm_set_page_size:
  *
  *	Sets the page size, perhaps based upon the memory
  *	size.  Must be called before any use of page-size
  *	dependent functions.
  */
 void
 vm_set_page_size(void)
 {
 	if (vm_cnt.v_page_size == 0)
 		vm_cnt.v_page_size = PAGE_SIZE;
 	if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0)
 		panic("vm_set_page_size: page size not a power of two");
 }
 
 /*
  *	vm_page_blacklist_next:
  *
  *	Find the next entry in the provided string of blacklist
  *	addresses.  Entries are separated by space, comma, or newline.
  *	If an invalid integer is encountered then the rest of the
  *	string is skipped.  Updates the list pointer to the next
  *	character, or NULL if the string is exhausted or invalid.
  */
 static vm_paddr_t
 vm_page_blacklist_next(char **list, char *end)
 {
 	vm_paddr_t bad;
 	char *cp, *pos;
 
 	if (list == NULL || *list == NULL)
 		return (0);
 	if (**list =='\0') {
 		*list = NULL;
 		return (0);
 	}
 
 	/*
 	 * If there's no end pointer then the buffer is coming from
 	 * the kenv and we know it's null-terminated.
 	 */
 	if (end == NULL)
 		end = *list + strlen(*list);
 
 	/* Ensure that strtoq() won't walk off the end */
 	if (*end != '\0') {
 		if (*end == '\n' || *end == ' ' || *end  == ',')
 			*end = '\0';
 		else {
 			printf("Blacklist not terminated, skipping\n");
 			*list = NULL;
 			return (0);
 		}
 	}
 
 	for (pos = *list; *pos != '\0'; pos = cp) {
 		bad = strtoq(pos, &cp, 0);
 		if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') {
 			if (bad == 0) {
 				if (++cp < end)
 					continue;
 				else
 					break;
 			}
 		} else
 			break;
 		if (*cp == '\0' || ++cp >= end)
 			*list = NULL;
 		else
 			*list = cp;
 		return (trunc_page(bad));
 	}
 	printf("Garbage in RAM blacklist, skipping\n");
 	*list = NULL;
 	return (0);
 }
 
 /*
  *	vm_page_blacklist_check:
  *
  *	Iterate through the provided string of blacklist addresses, pulling
  *	each entry out of the physical allocator free list and putting it
  *	onto a list for reporting via the vm.page_blacklist sysctl.
  */
 static void
 vm_page_blacklist_check(char *list, char *end)
 {
 	vm_paddr_t pa;
 	vm_page_t m;
 	char *next;
 	int ret;
 
 	next = list;
 	while (next != NULL) {
 		if ((pa = vm_page_blacklist_next(&next, end)) == 0)
 			continue;
 		m = vm_phys_paddr_to_vm_page(pa);
 		if (m == NULL)
 			continue;
 		mtx_lock(&vm_page_queue_free_mtx);
 		ret = vm_phys_unfree_page(m);
 		mtx_unlock(&vm_page_queue_free_mtx);
 		if (ret == TRUE) {
 			TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
 			if (bootverbose)
 				printf("Skipping page with pa 0x%jx\n",
 				    (uintmax_t)pa);
 		}
 	}
 }
 
 /*
  *	vm_page_blacklist_load:
  *
  *	Search for a special module named "ram_blacklist".  It'll be a
  *	plain text file provided by the user via the loader directive
  *	of the same name.
  */
 static void
 vm_page_blacklist_load(char **list, char **end)
 {
 	void *mod;
 	u_char *ptr;
 	u_int len;
 
 	mod = NULL;
 	ptr = NULL;
 
 	mod = preload_search_by_type("ram_blacklist");
 	if (mod != NULL) {
 		ptr = preload_fetch_addr(mod);
 		len = preload_fetch_size(mod);
         }
 	*list = ptr;
 	if (ptr != NULL)
 		*end = ptr + len;
 	else
 		*end = NULL;
 	return;
 }
 
 static int
 sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
 {
 	vm_page_t m;
 	struct sbuf sbuf;
 	int error, first;
 
 	first = 1;
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	TAILQ_FOREACH(m, &blacklist_head, listq) {
 		sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",",
 		    (uintmax_t)m->phys_addr);
 		first = 0;
 	}
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
 static void
 vm_page_domain_init(struct vm_domain *vmd)
 {
 	struct vm_pagequeue *pq;
 	int i;
 
 	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
 	    "vm inactive pagequeue";
 	*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
 	    &vm_cnt.v_inactive_count;
 	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
 	    "vm active pagequeue";
 	*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
 	    &vm_cnt.v_active_count;
 	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) =
 	    "vm laundry pagequeue";
 	*__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) =
 	    &vm_cnt.v_laundry_count;
 	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) =
 	    "vm unswappable pagequeue";
 	/* Unswappable dirty pages are counted as being in the laundry. */
 	*__DECONST(int **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_vcnt) =
 	    &vm_cnt.v_laundry_count;
 	vmd->vmd_page_count = 0;
 	vmd->vmd_free_count = 0;
 	vmd->vmd_segs = 0;
 	vmd->vmd_oom = FALSE;
 	for (i = 0; i < PQ_COUNT; i++) {
 		pq = &vmd->vmd_pagequeues[i];
 		TAILQ_INIT(&pq->pq_pl);
 		mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
 		    MTX_DEF | MTX_DUPOK);
 	}
 }
 
 /*
  *	vm_page_startup:
  *
  *	Initializes the resident memory module.
  *
  *	Allocates memory for the page cells, and
  *	for the object/offset-to-page hash table headers.
  *	Each page cell is initialized and placed on the free list.
  */
 vm_offset_t
 vm_page_startup(vm_offset_t vaddr)
 {
 	vm_offset_t mapped;
 	vm_paddr_t page_range;
 	vm_paddr_t new_end;
 	int i;
 	vm_paddr_t pa;
 	vm_paddr_t last_pa;
 	char *list, *listend;
 	vm_paddr_t end;
 	vm_paddr_t biggestsize;
 	vm_paddr_t low_water, high_water;
 	int biggestone;
 	int pages_per_zone;
 
 	biggestsize = 0;
 	biggestone = 0;
 	vaddr = round_page(vaddr);
 
 	for (i = 0; phys_avail[i + 1]; i += 2) {
 		phys_avail[i] = round_page(phys_avail[i]);
 		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
 	}
 
 	low_water = phys_avail[0];
 	high_water = phys_avail[1];
 
 	for (i = 0; i < vm_phys_nsegs; i++) {
 		if (vm_phys_segs[i].start < low_water)
 			low_water = vm_phys_segs[i].start;
 		if (vm_phys_segs[i].end > high_water)
 			high_water = vm_phys_segs[i].end;
 	}
 	for (i = 0; phys_avail[i + 1]; i += 2) {
 		vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
 
 		if (size > biggestsize) {
 			biggestone = i;
 			biggestsize = size;
 		}
 		if (phys_avail[i] < low_water)
 			low_water = phys_avail[i];
 		if (phys_avail[i + 1] > high_water)
 			high_water = phys_avail[i + 1];
 	}
 
 	end = phys_avail[biggestone+1];
 
 	/*
 	 * Initialize the page and queue locks.
 	 */
 	mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
 	for (i = 0; i < PA_LOCK_COUNT; i++)
 		mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
 	for (i = 0; i < vm_ndomains; i++)
 		vm_page_domain_init(&vm_dom[i]);
 
 	/*
 	 * Almost all of the pages needed for boot strapping UMA are used
 	 * for zone structures, so if the number of CPUs results in those
 	 * structures taking more than one page each, we set aside more pages
 	 * in proportion to the zone structure size.
 	 */
 	pages_per_zone = howmany(sizeof(struct uma_zone) +
 	    sizeof(struct uma_cache) * (mp_maxid + 1), UMA_SLAB_SIZE);
 	if (pages_per_zone > 1) {
 		/* Reserve more pages so that we don't run out. */
 		boot_pages = UMA_BOOT_PAGES_ZONES * pages_per_zone;
 	}
 
 	/*
 	 * Allocate memory for use when boot strapping the kernel memory
 	 * allocator.
 	 *
 	 * CTFLAG_RDTUN doesn't work during the early boot process, so we must
 	 * manually fetch the value.
 	 */
 	TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages);
 	new_end = end - (boot_pages * UMA_SLAB_SIZE);
 	new_end = trunc_page(new_end);
 	mapped = pmap_map(&vaddr, new_end, end,
 	    VM_PROT_READ | VM_PROT_WRITE);
 	bzero((void *)mapped, end - new_end);
 	uma_startup((void *)mapped, boot_pages);
 
 #if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
     defined(__i386__) || defined(__mips__)
 	/*
 	 * Allocate a bitmap to indicate that a random physical page
 	 * needs to be included in a minidump.
 	 *
 	 * The amd64 port needs this to indicate which direct map pages
 	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
 	 *
 	 * However, i386 still needs this workspace internally within the
 	 * minidump code.  In theory, they are not needed on i386, but are
 	 * included should the sf_buf code decide to use them.
 	 */
 	last_pa = 0;
 	for (i = 0; dump_avail[i + 1] != 0; i += 2)
 		if (dump_avail[i + 1] > last_pa)
 			last_pa = dump_avail[i + 1];
 	page_range = last_pa / PAGE_SIZE;
 	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
 	new_end -= vm_page_dump_size;
 	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
 	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
 	bzero((void *)vm_page_dump, vm_page_dump_size);
 #endif
 #ifdef __amd64__
 	/*
 	 * Request that the physical pages underlying the message buffer be
 	 * included in a crash dump.  Since the message buffer is accessed
 	 * through the direct map, they are not automatically included.
 	 */
 	pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
 	last_pa = pa + round_page(msgbufsize);
 	while (pa < last_pa) {
 		dump_add_page(pa);
 		pa += PAGE_SIZE;
 	}
 #endif
 	/*
 	 * Compute the number of pages of memory that will be available for
 	 * use (taking into account the overhead of a page structure per
 	 * page).
 	 */
 	first_page = low_water / PAGE_SIZE;
 #ifdef VM_PHYSSEG_SPARSE
 	page_range = 0;
 	for (i = 0; i < vm_phys_nsegs; i++) {
 		page_range += atop(vm_phys_segs[i].end -
 		    vm_phys_segs[i].start);
 	}
 	for (i = 0; phys_avail[i + 1] != 0; i += 2)
 		page_range += atop(phys_avail[i + 1] - phys_avail[i]);
 #elif defined(VM_PHYSSEG_DENSE)
 	page_range = high_water / PAGE_SIZE - first_page;
 #else
 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
 #endif
 	end = new_end;
 
 	/*
 	 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
 	 */
 	vaddr += PAGE_SIZE;
 
 	/*
 	 * Initialize the mem entry structures now, and put them in the free
 	 * queue.
 	 */
 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
 	mapped = pmap_map(&vaddr, new_end, end,
 	    VM_PROT_READ | VM_PROT_WRITE);
 	vm_page_array = (vm_page_t) mapped;
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Allocate memory for the reservation management system's data
 	 * structures.
 	 */
 	new_end = vm_reserv_startup(&vaddr, new_end, high_water);
 #endif
 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__)
 	/*
 	 * pmap_map on arm64, amd64, and mips can come out of the direct-map,
 	 * not kvm like i386, so the pages must be tracked for a crashdump to
 	 * include this data.  This includes the vm_page_array and the early
 	 * UMA bootstrap pages.
 	 */
 	for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
 		dump_add_page(pa);
 #endif
 	phys_avail[biggestone + 1] = new_end;
 
 	/*
 	 * Add physical memory segments corresponding to the available
 	 * physical pages.
 	 */
 	for (i = 0; phys_avail[i + 1] != 0; i += 2)
 		vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);
 
 	/*
 	 * Clear all of the page structures
 	 */
 	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
 	for (i = 0; i < page_range; i++)
 		vm_page_array[i].order = VM_NFREEORDER;
 	vm_page_array_size = page_range;
 
 	/*
 	 * Initialize the physical memory allocator.
 	 */
 	vm_phys_init();
 
 	/*
 	 * Add every available physical page that is not blacklisted to
 	 * the free lists.
 	 */
 	vm_cnt.v_page_count = 0;
 	vm_cnt.v_free_count = 0;
 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 		pa = phys_avail[i];
 		last_pa = phys_avail[i + 1];
 		while (pa < last_pa) {
 			vm_phys_add_page(pa);
 			pa += PAGE_SIZE;
 		}
 	}
 
 	TAILQ_INIT(&blacklist_head);
 	vm_page_blacklist_load(&list, &listend);
 	vm_page_blacklist_check(list, listend);
 
 	list = kern_getenv("vm.blacklist");
 	vm_page_blacklist_check(list, NULL);
 
 	freeenv(list);
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Initialize the reservation management system.
 	 */
 	vm_reserv_init();
 #endif
 	return (vaddr);
 }
 
 void
 vm_page_reference(vm_page_t m)
 {
 
 	vm_page_aflag_set(m, PGA_REFERENCED);
 }
 
 /*
  *	vm_page_busy_downgrade:
  *
  *	Downgrade an exclusive busy page into a single shared busy page.
  */
 void
 vm_page_busy_downgrade(vm_page_t m)
 {
 	u_int x;
 	bool locked;
 
 	vm_page_assert_xbusied(m);
 	locked = mtx_owned(vm_page_lockptr(m));
 
 	for (;;) {
 		x = m->busy_lock;
 		x &= VPB_BIT_WAITERS;
 		if (x != 0 && !locked)
 			vm_page_lock(m);
 		if (atomic_cmpset_rel_int(&m->busy_lock,
 		    VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1)))
 			break;
 		if (x != 0 && !locked)
 			vm_page_unlock(m);
 	}
 	if (x != 0) {
 		wakeup(m);
 		if (!locked)
 			vm_page_unlock(m);
 	}
 }
 
 /*
  *	vm_page_sbusied:
  *
  *	Return a positive value if the page is shared busied, 0 otherwise.
  */
 int
 vm_page_sbusied(vm_page_t m)
 {
 	u_int x;
 
 	x = m->busy_lock;
 	return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED);
 }
 
 /*
  *	vm_page_sunbusy:
  *
  *	Shared unbusy a page.
  */
 void
 vm_page_sunbusy(vm_page_t m)
 {
 	u_int x;
 
 	vm_page_assert_sbusied(m);
 
 	for (;;) {
 		x = m->busy_lock;
 		if (VPB_SHARERS(x) > 1) {
 			if (atomic_cmpset_int(&m->busy_lock, x,
 			    x - VPB_ONE_SHARER))
 				break;
 			continue;
 		}
 		if ((x & VPB_BIT_WAITERS) == 0) {
 			KASSERT(x == VPB_SHARERS_WORD(1),
 			    ("vm_page_sunbusy: invalid lock state"));
 			if (atomic_cmpset_int(&m->busy_lock,
 			    VPB_SHARERS_WORD(1), VPB_UNBUSIED))
 				break;
 			continue;
 		}
 		KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS),
 		    ("vm_page_sunbusy: invalid lock state for waiters"));
 
 		vm_page_lock(m);
 		if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) {
 			vm_page_unlock(m);
 			continue;
 		}
 		wakeup(m);
 		vm_page_unlock(m);
 		break;
 	}
 }
 
 /*
  *	vm_page_busy_sleep:
  *
  *	Sleep and release the page lock, using the page pointer as wchan.
  *	This is used to implement the hard-path of busying mechanism.
  *
  *	The given page must be locked.
  *
  *	If nonshared is true, sleep only if the page is xbusy.
  */
 void
 vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared)
 {
 	u_int x;
 
 	vm_page_assert_locked(m);
 
 	x = m->busy_lock;
 	if (x == VPB_UNBUSIED || (nonshared && (x & VPB_BIT_SHARED) != 0) ||
 	    ((x & VPB_BIT_WAITERS) == 0 &&
 	    !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) {
 		vm_page_unlock(m);
 		return;
 	}
 	msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0);
 }
 
 /*
  *	vm_page_trysbusy:
  *
  *	Try to shared busy a page.
  *	If the operation succeeds 1 is returned otherwise 0.
  *	The operation never sleeps.
  */
 int
 vm_page_trysbusy(vm_page_t m)
 {
 	u_int x;
 
 	for (;;) {
 		x = m->busy_lock;
 		if ((x & VPB_BIT_SHARED) == 0)
 			return (0);
 		if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER))
 			return (1);
 	}
 }
 
 static void
 vm_page_xunbusy_locked(vm_page_t m)
 {
 
 	vm_page_assert_xbusied(m);
 	vm_page_assert_locked(m);
 
 	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
 	/* There is a waiter, do wakeup() instead of vm_page_flash(). */
 	wakeup(m);
 }
 
 void
 vm_page_xunbusy_maybelocked(vm_page_t m)
 {
 	bool lockacq;
 
 	vm_page_assert_xbusied(m);
 
 	/*
 	 * Fast path for unbusy.  If it succeeds, we know that there
 	 * are no waiters, so we do not need a wakeup.
 	 */
 	if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER,
 	    VPB_UNBUSIED))
 		return;
 
 	lockacq = !mtx_owned(vm_page_lockptr(m));
 	if (lockacq)
 		vm_page_lock(m);
 	vm_page_xunbusy_locked(m);
 	if (lockacq)
 		vm_page_unlock(m);
 }
 
 /*
  *	vm_page_xunbusy_hard:
  *
  *	Called after the first try the exclusive unbusy of a page failed.
  *	It is assumed that the waiters bit is on.
  */
 void
 vm_page_xunbusy_hard(vm_page_t m)
 {
 
 	vm_page_assert_xbusied(m);
 
 	vm_page_lock(m);
 	vm_page_xunbusy_locked(m);
 	vm_page_unlock(m);
 }
 
 /*
  *	vm_page_flash:
  *
  *	Wakeup anyone waiting for the page.
  *	The ownership bits do not change.
  *
  *	The given page must be locked.
  */
 void
 vm_page_flash(vm_page_t m)
 {
 	u_int x;
 
 	vm_page_lock_assert(m, MA_OWNED);
 
 	for (;;) {
 		x = m->busy_lock;
 		if ((x & VPB_BIT_WAITERS) == 0)
 			return;
 		if (atomic_cmpset_int(&m->busy_lock, x,
 		    x & (~VPB_BIT_WAITERS)))
 			break;
 	}
 	wakeup(m);
 }
 
 /*
  * Keep page from being freed by the page daemon
  * much of the same effect as wiring, except much lower
  * overhead and should be used only for *very* temporary
  * holding ("wiring").
  */
 void
 vm_page_hold(vm_page_t mem)
 {
 
 	vm_page_lock_assert(mem, MA_OWNED);
         mem->hold_count++;
 }
 
 void
 vm_page_unhold(vm_page_t mem)
 {
 
 	vm_page_lock_assert(mem, MA_OWNED);
 	KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!"));
 	--mem->hold_count;
 	if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0)
 		vm_page_free_toq(mem);
 }
 
 /*
  *	vm_page_unhold_pages:
  *
  *	Unhold each of the pages that is referenced by the given array.
  */
 void
 vm_page_unhold_pages(vm_page_t *ma, int count)
 {
 	struct mtx *mtx, *new_mtx;
 
 	mtx = NULL;
 	for (; count != 0; count--) {
 		/*
 		 * Avoid releasing and reacquiring the same page lock.
 		 */
 		new_mtx = vm_page_lockptr(*ma);
 		if (mtx != new_mtx) {
 			if (mtx != NULL)
 				mtx_unlock(mtx);
 			mtx = new_mtx;
 			mtx_lock(mtx);
 		}
 		vm_page_unhold(*ma);
 		ma++;
 	}
 	if (mtx != NULL)
 		mtx_unlock(mtx);
 }
 
 vm_page_t
 PHYS_TO_VM_PAGE(vm_paddr_t pa)
 {
 	vm_page_t m;
 
 #ifdef VM_PHYSSEG_SPARSE
 	m = vm_phys_paddr_to_vm_page(pa);
 	if (m == NULL)
 		m = vm_phys_fictitious_to_vm_page(pa);
 	return (m);
 #elif defined(VM_PHYSSEG_DENSE)
 	long pi;
 
 	pi = atop(pa);
 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
 		m = &vm_page_array[pi - first_page];
 		return (m);
 	}
 	return (vm_phys_fictitious_to_vm_page(pa));
 #else
 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
 #endif
 }
 
 /*
  *	vm_page_getfake:
  *
  *	Create a fictitious page with the specified physical address and
  *	memory attribute.  The memory attribute is the only the machine-
  *	dependent aspect of a fictitious page that must be initialized.
  */
 vm_page_t
 vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
 {
 	vm_page_t m;
 
 	m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
 	vm_page_initfake(m, paddr, memattr);
 	return (m);
 }
 
 void
 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
 {
 
 	if ((m->flags & PG_FICTITIOUS) != 0) {
 		/*
 		 * The page's memattr might have changed since the
 		 * previous initialization.  Update the pmap to the
 		 * new memattr.
 		 */
 		goto memattr;
 	}
 	m->phys_addr = paddr;
 	m->queue = PQ_NONE;
 	/* Fictitious pages don't use "segind". */
 	m->flags = PG_FICTITIOUS;
 	/* Fictitious pages don't use "order" or "pool". */
 	m->oflags = VPO_UNMANAGED;
 	m->busy_lock = VPB_SINGLE_EXCLUSIVER;
 	m->wire_count = 1;
 	pmap_page_init(m);
 memattr:
 	pmap_page_set_memattr(m, memattr);
 }
 
 /*
  *	vm_page_putfake:
  *
  *	Release a fictitious page.
  */
 void
 vm_page_putfake(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m));
 	KASSERT((m->flags & PG_FICTITIOUS) != 0,
 	    ("vm_page_putfake: bad page %p", m));
 	uma_zfree(fakepg_zone, m);
 }
 
 /*
  *	vm_page_updatefake:
  *
  *	Update the given fictitious page to the specified physical address and
  *	memory attribute.
  */
 void
 vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
 {
 
 	KASSERT((m->flags & PG_FICTITIOUS) != 0,
 	    ("vm_page_updatefake: bad page %p", m));
 	m->phys_addr = paddr;
 	pmap_page_set_memattr(m, memattr);
 }
 
 /*
  *	vm_page_free:
  *
  *	Free a page.
  */
 void
 vm_page_free(vm_page_t m)
 {
 
 	m->flags &= ~PG_ZERO;
 	vm_page_free_toq(m);
 }
 
 /*
  *	vm_page_free_zero:
  *
  *	Free a page to the zerod-pages queue
  */
 void
 vm_page_free_zero(vm_page_t m)
 {
 
 	m->flags |= PG_ZERO;
 	vm_page_free_toq(m);
 }
 
 /*
  * Unbusy and handle the page queueing for a page from a getpages request that
  * was optionally read ahead or behind.
  */
 void
 vm_page_readahead_finish(vm_page_t m)
 {
 
 	/* We shouldn't put invalid pages on queues. */
 	KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m));
 
 	/*
 	 * Since the page is not the actually needed one, whether it should
 	 * be activated or deactivated is not obvious.  Empirical results
 	 * have shown that deactivating the page is usually the best choice,
 	 * unless the page is wanted by another thread.
 	 */
 	vm_page_lock(m);
 	if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
 		vm_page_activate(m);
 	else
 		vm_page_deactivate(m);
 	vm_page_unlock(m);
 	vm_page_xunbusy(m);
 }
 
 /*
  *	vm_page_sleep_if_busy:
  *
  *	Sleep and release the page queues lock if the page is busied.
  *	Returns TRUE if the thread slept.
  *
  *	The given page must be unlocked and object containing it must
  *	be locked.
  */
 int
 vm_page_sleep_if_busy(vm_page_t m, const char *msg)
 {
 	vm_object_t obj;
 
 	vm_page_lock_assert(m, MA_NOTOWNED);
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	if (vm_page_busied(m)) {
 		/*
 		 * The page-specific object must be cached because page
 		 * identity can change during the sleep, causing the
 		 * re-lock of a different object.
 		 * It is assumed that a reference to the object is already
 		 * held by the callers.
 		 */
 		obj = m->object;
 		vm_page_lock(m);
 		VM_OBJECT_WUNLOCK(obj);
 		vm_page_busy_sleep(m, msg, false);
 		VM_OBJECT_WLOCK(obj);
 		return (TRUE);
 	}
 	return (FALSE);
 }
 
 /*
  *	vm_page_dirty_KBI:		[ internal use only ]
  *
  *	Set all bits in the page's dirty field.
  *
  *	The object containing the specified page must be locked if the
  *	call is made from the machine-independent layer.
  *
  *	See vm_page_clear_dirty_mask().
  *
  *	This function should only be called by vm_page_dirty().
  */
 void
 vm_page_dirty_KBI(vm_page_t m)
 {
 
 	/* Refer to this operation by its public name. */
 	KASSERT(m->valid == VM_PAGE_BITS_ALL,
 	    ("vm_page_dirty: page is invalid!"));
 	m->dirty = VM_PAGE_BITS_ALL;
 }
 
 /*
  *	vm_page_insert:		[ internal use only ]
  *
  *	Inserts the given mem entry into the object and object list.
  *
  *	The object must be locked.
  */
 int
 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t mpred;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	mpred = vm_radix_lookup_le(&object->rtree, pindex);
 	return (vm_page_insert_after(m, object, pindex, mpred));
 }
 
 /*
  *	vm_page_insert_after:
  *
  *	Inserts the page "m" into the specified object at offset "pindex".
  *
  *	The page "mpred" must immediately precede the offset "pindex" within
  *	the specified object.
  *
  *	The object must be locked.
  */
 static int
 vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
     vm_page_t mpred)
 {
 	vm_page_t msucc;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(m->object == NULL,
 	    ("vm_page_insert_after: page already inserted"));
 	if (mpred != NULL) {
 		KASSERT(mpred->object == object,
 		    ("vm_page_insert_after: object doesn't contain mpred"));
 		KASSERT(mpred->pindex < pindex,
 		    ("vm_page_insert_after: mpred doesn't precede pindex"));
 		msucc = TAILQ_NEXT(mpred, listq);
 	} else
 		msucc = TAILQ_FIRST(&object->memq);
 	if (msucc != NULL)
 		KASSERT(msucc->pindex > pindex,
 		    ("vm_page_insert_after: msucc doesn't succeed pindex"));
 
 	/*
 	 * Record the object/offset pair in this page
 	 */
 	m->object = object;
 	m->pindex = pindex;
 
 	/*
 	 * Now link into the object's ordered list of backed pages.
 	 */
 	if (vm_radix_insert(&object->rtree, m)) {
 		m->object = NULL;
 		m->pindex = 0;
 		return (1);
 	}
 	vm_page_insert_radixdone(m, object, mpred);
 	return (0);
 }
 
 /*
  *	vm_page_insert_radixdone:
  *
  *	Complete page "m" insertion into the specified object after the
  *	radix trie hooking.
  *
  *	The page "mpred" must precede the offset "m->pindex" within the
  *	specified object.
  *
  *	The object must be locked.
  */
 static void
 vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object != NULL && m->object == object,
 	    ("vm_page_insert_radixdone: page %p has inconsistent object", m));
 	if (mpred != NULL) {
 		KASSERT(mpred->object == object,
 		    ("vm_page_insert_after: object doesn't contain mpred"));
 		KASSERT(mpred->pindex < m->pindex,
 		    ("vm_page_insert_after: mpred doesn't precede pindex"));
 	}
 
 	if (mpred != NULL)
 		TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
 	else
 		TAILQ_INSERT_HEAD(&object->memq, m, listq);
 
 	/*
 	 * Show that the object has one more resident page.
 	 */
 	object->resident_page_count++;
 
 	/*
 	 * Hold the vnode until the last page is released.
 	 */
 	if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
 		vhold(object->handle);
 
 	/*
 	 * Since we are inserting a new and possibly dirty page,
 	 * update the object's OBJ_MIGHTBEDIRTY flag.
 	 */
 	if (pmap_page_is_write_mapped(m))
 		vm_object_set_writeable_dirty(object);
 }
 
 /*
  *	vm_page_remove:
  *
  *	Removes the specified page from its containing object, but does not
  *	invalidate any backing storage.
  *
  *	The object must be locked.  The page must be locked if it is managed.
  */
 void
 vm_page_remove(vm_page_t m)
 {
 	vm_object_t object;
 	vm_page_t mrem;
 
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		vm_page_assert_locked(m);
 	if ((object = m->object) == NULL)
 		return;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (vm_page_xbusied(m))
 		vm_page_xunbusy_maybelocked(m);
 	mrem = vm_radix_remove(&object->rtree, m->pindex);
 	KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m));
 
 	/*
 	 * Now remove from the object's list of backed pages.
 	 */
 	TAILQ_REMOVE(&object->memq, m, listq);
 
 	/*
 	 * And show that the object has one fewer resident page.
 	 */
 	object->resident_page_count--;
 
 	/*
 	 * The vnode may now be recycled.
 	 */
 	if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
 		vdrop(object->handle);
 
 	m->object = NULL;
 }
 
 /*
  *	vm_page_lookup:
  *
  *	Returns the page associated with the object/offset
  *	pair specified; if none is found, NULL is returned.
  *
  *	The object must be locked.
  */
 vm_page_t
 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
 {
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	return (vm_radix_lookup(&object->rtree, pindex));
 }
 
 /*
  *	vm_page_find_least:
  *
  *	Returns the page associated with the object with least pindex
  *	greater than or equal to the parameter pindex, or NULL.
  *
  *	The object must be locked.
  */
 vm_page_t
 vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t m;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex)
 		m = vm_radix_lookup_ge(&object->rtree, pindex);
 	return (m);
 }
 
 /*
  * Returns the given page's successor (by pindex) within the object if it is
  * resident; if none is found, NULL is returned.
  *
  * The object must be locked.
  */
 vm_page_t
 vm_page_next(vm_page_t m)
 {
 	vm_page_t next;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	if ((next = TAILQ_NEXT(m, listq)) != NULL) {
 		MPASS(next->object == m->object);
 		if (next->pindex != m->pindex + 1)
 			next = NULL;
 	}
 	return (next);
 }
 
 /*
  * Returns the given page's predecessor (by pindex) within the object if it is
  * resident; if none is found, NULL is returned.
  *
  * The object must be locked.
  */
 vm_page_t
 vm_page_prev(vm_page_t m)
 {
 	vm_page_t prev;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) {
 		MPASS(prev->object == m->object);
 		if (prev->pindex != m->pindex - 1)
 			prev = NULL;
 	}
 	return (prev);
 }
 
 /*
  * Uses the page mnew as a replacement for an existing page at index
  * pindex which must be already present in the object.
  *
  * The existing page must not be on a paging queue.
  */
 vm_page_t
 vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t mold;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(mnew->object == NULL,
 	    ("vm_page_replace: page already in object"));
 
 	/*
 	 * This function mostly follows vm_page_insert() and
 	 * vm_page_remove() without the radix, object count and vnode
 	 * dance.  Double check such functions for more comments.
 	 */
 
 	mnew->object = object;
 	mnew->pindex = pindex;
 	mold = vm_radix_replace(&object->rtree, mnew);
 	KASSERT(mold->queue == PQ_NONE,
 	    ("vm_page_replace: mold is on a paging queue"));
 
 	/* Keep the resident page list in sorted order. */
 	TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq);
 	TAILQ_REMOVE(&object->memq, mold, listq);
 
 	mold->object = NULL;
 	vm_page_xunbusy_maybelocked(mold);
 
 	/*
 	 * The object's resident_page_count does not change because we have
 	 * swapped one page for another, but OBJ_MIGHTBEDIRTY.
 	 */
 	if (pmap_page_is_write_mapped(mnew))
 		vm_object_set_writeable_dirty(object);
 	return (mold);
 }
 
 /*
  *	vm_page_rename:
  *
  *	Move the given memory entry from its
  *	current object to the specified target object/offset.
  *
  *	Note: swap associated with the page must be invalidated by the move.  We
  *	      have to do this for several reasons:  (1) we aren't freeing the
  *	      page, (2) we are dirtying the page, (3) the VM system is probably
  *	      moving the page from object A to B, and will then later move
  *	      the backing store from A to B and we can't have a conflict.
  *
  *	Note: we *always* dirty the page.  It is necessary both for the
  *	      fact that we moved it, and because we may be invalidating
  *	      swap.
  *
  *	The objects must be locked.
  */
 int
 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
 {
 	vm_page_t mpred;
 	vm_pindex_t opidx;
 
 	VM_OBJECT_ASSERT_WLOCKED(new_object);
 
 	mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex);
 	KASSERT(mpred == NULL || mpred->pindex != new_pindex,
 	    ("vm_page_rename: pindex already renamed"));
 
 	/*
 	 * Create a custom version of vm_page_insert() which does not depend
 	 * by m_prev and can cheat on the implementation aspects of the
 	 * function.
 	 */
 	opidx = m->pindex;
 	m->pindex = new_pindex;
 	if (vm_radix_insert(&new_object->rtree, m)) {
 		m->pindex = opidx;
 		return (1);
 	}
 
 	/*
 	 * The operation cannot fail anymore.  The removal must happen before
 	 * the listq iterator is tainted.
 	 */
 	m->pindex = opidx;
 	vm_page_lock(m);
 	vm_page_remove(m);
 
 	/* Return back to the new pindex to complete vm_page_insert(). */
 	m->pindex = new_pindex;
 	m->object = new_object;
 	vm_page_unlock(m);
 	vm_page_insert_radixdone(m, new_object, mpred);
 	vm_page_dirty(m);
 	return (0);
 }
 
 /*
  *	vm_page_alloc:
  *
  *	Allocate and return a page that is associated with the specified
  *	object and offset pair.  By default, this page is exclusive busied.
  *
  *	The caller must always specify an allocation class.
  *
  *	allocation classes:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs a page
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *
  *	optional allocation flags:
  *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
  *				intends to allocate
  *	VM_ALLOC_NOBUSY		do not exclusive busy the page
  *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
  *	VM_ALLOC_NOOBJ		page is not associated with an object and
  *				should not be exclusive busy
  *	VM_ALLOC_SBUSY		shared busy the allocated page
  *	VM_ALLOC_WIRED		wire the allocated page
  *	VM_ALLOC_ZERO		prefer a zeroed page
  *
  *	This routine may not sleep.
  */
 vm_page_t
 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
 {
 	vm_page_t m, mpred;
 	int flags, req_class;
 
 	mpred = NULL;	/* XXX: pacify gcc */
 	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
 	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
 	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
 	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
 	    ("vm_page_alloc: inconsistent object(%p)/req(%x)", object, req));
 	if (object != NULL)
 		VM_OBJECT_ASSERT_WLOCKED(object);
 
 	req_class = req & VM_ALLOC_CLASS_MASK;
 
 	/*
 	 * The page daemon is allowed to dig deeper into the free page list.
 	 */
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
 	if (object != NULL) {
 		mpred = vm_radix_lookup_le(&object->rtree, pindex);
 		KASSERT(mpred == NULL || mpred->pindex != pindex,
 		   ("vm_page_alloc: pindex already allocated"));
 	}
 
 	/*
 	 * Allocate a page if the number of free pages exceeds the minimum
 	 * for the request class.
 	 */
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
 	    (req_class == VM_ALLOC_SYSTEM &&
 	    vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
 	    (req_class == VM_ALLOC_INTERRUPT &&
 	    vm_cnt.v_free_count > 0)) {
 		/*
 		 * Can we allocate the page from a reservation?
 		 */
 #if VM_NRESERVLEVEL > 0
 		if (object == NULL || (object->flags & (OBJ_COLORED |
 		    OBJ_FICTITIOUS)) != OBJ_COLORED || (m =
 		    vm_reserv_alloc_page(object, pindex, mpred)) == NULL)
 #endif
 		{
 			/*
 			 * If not, allocate it from the free page queues.
 			 */
 			m = vm_phys_alloc_pages(object != NULL ?
 			    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
 #if VM_NRESERVLEVEL > 0
 			if (m == NULL && vm_reserv_reclaim_inactive()) {
 				m = vm_phys_alloc_pages(object != NULL ?
 				    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
 				    0);
 			}
 #endif
 		}
 	} else {
 		/*
 		 * Not allocatable, give up.
 		 */
 		mtx_unlock(&vm_page_queue_free_mtx);
 		atomic_add_int(&vm_pageout_deficit,
 		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
 		pagedaemon_wakeup();
 		return (NULL);
 	}
 
 	/*
 	 *  At this point we had better have found a good page.
 	 */
 	KASSERT(m != NULL, ("vm_page_alloc: missing page"));
 	vm_phys_freecnt_adj(m, -1);
 	mtx_unlock(&vm_page_queue_free_mtx);
 	vm_page_alloc_check(m);
 
 	/*
 	 * Initialize the page.  Only the PG_ZERO flag is inherited.
 	 */
 	flags = 0;
 	if ((req & VM_ALLOC_ZERO) != 0)
 		flags = PG_ZERO;
 	flags &= m->flags;
 	if ((req & VM_ALLOC_NODUMP) != 0)
 		flags |= PG_NODUMP;
 	m->flags = flags;
 	m->aflags = 0;
 	m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
 	    VPO_UNMANAGED : 0;
 	m->busy_lock = VPB_UNBUSIED;
 	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
 		m->busy_lock = VPB_SINGLE_EXCLUSIVER;
 	if ((req & VM_ALLOC_SBUSY) != 0)
 		m->busy_lock = VPB_SHARERS_WORD(1);
 	if (req & VM_ALLOC_WIRED) {
 		/*
 		 * The page lock is not required for wiring a page until that
 		 * page is inserted into the object.
 		 */
 		atomic_add_int(&vm_cnt.v_wire_count, 1);
 		m->wire_count = 1;
 	}
 	m->act_count = 0;
 
 	if (object != NULL) {
 		if (vm_page_insert_after(m, object, pindex, mpred)) {
 			pagedaemon_wakeup();
 			if (req & VM_ALLOC_WIRED) {
 				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 				m->wire_count = 0;
 			}
 			KASSERT(m->object == NULL, ("page %p has object", m));
 			m->oflags = VPO_UNMANAGED;
 			m->busy_lock = VPB_UNBUSIED;
 			/* Don't change PG_ZERO. */
 			vm_page_free_toq(m);
 			return (NULL);
 		}
 
 		/* Ignore device objects; the pager sets "memattr" for them. */
 		if (object->memattr != VM_MEMATTR_DEFAULT &&
 		    (object->flags & OBJ_FICTITIOUS) == 0)
 			pmap_page_set_memattr(m, object->memattr);
 	} else
 		m->pindex = pindex;
 
 	/*
 	 * Don't wakeup too often - wakeup the pageout daemon when
 	 * we would be nearly out of memory.
 	 */
 	if (vm_paging_needed())
 		pagedaemon_wakeup();
 
 	return (m);
 }
 
 /*
  *	vm_page_alloc_contig:
  *
  *	Allocate a contiguous set of physical pages of the given size "npages"
  *	from the free lists.  All of the physical pages must be at or above
  *	the given physical address "low" and below the given physical address
  *	"high".  The given value "alignment" determines the alignment of the
  *	first physical page in the set.  If the given value "boundary" is
  *	non-zero, then the set of physical pages cannot cross any physical
  *	address boundary that is a multiple of that value.  Both "alignment"
  *	and "boundary" must be a power of two.
  *
  *	If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
  *	then the memory attribute setting for the physical pages is configured
  *	to the object's memory attribute setting.  Otherwise, the memory
  *	attribute setting for the physical pages is configured to "memattr",
  *	overriding the object's memory attribute setting.  However, if the
  *	object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
  *	memory attribute setting for the physical pages cannot be configured
  *	to VM_MEMATTR_DEFAULT.
  *
  *	The specified object may not contain fictitious pages.
  *
  *	The caller must always specify an allocation class.
  *
  *	allocation classes:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs a page
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *
  *	optional allocation flags:
  *	VM_ALLOC_NOBUSY		do not exclusive busy the page
  *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
  *	VM_ALLOC_NOOBJ		page is not associated with an object and
  *				should not be exclusive busy
  *	VM_ALLOC_SBUSY		shared busy the allocated page
  *	VM_ALLOC_WIRED		wire the allocated page
  *	VM_ALLOC_ZERO		prefer a zeroed page
  *
  *	This routine may not sleep.
  */
 vm_page_t
 vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
     vm_paddr_t boundary, vm_memattr_t memattr)
 {
 	vm_page_t m, m_ret, mpred;
 	u_int busy_lock, flags, oflags;
 	int req_class;
 
 	mpred = NULL;	/* XXX: pacify gcc */
 	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
 	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
 	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
 	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
 	    ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object,
 	    req));
 	if (object != NULL) {
 		VM_OBJECT_ASSERT_WLOCKED(object);
 		KASSERT((object->flags & OBJ_FICTITIOUS) == 0,
 		    ("vm_page_alloc_contig: object %p has fictitious pages",
 		    object));
 	}
 	KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
 	req_class = req & VM_ALLOC_CLASS_MASK;
 
 	/*
 	 * The page daemon is allowed to dig deeper into the free page list.
 	 */
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
 	if (object != NULL) {
 		mpred = vm_radix_lookup_le(&object->rtree, pindex);
 		KASSERT(mpred == NULL || mpred->pindex != pindex,
 		    ("vm_page_alloc_contig: pindex already allocated"));
 	}
 
 	/*
 	 * Can we allocate the pages without the number of free pages falling
 	 * below the lower bound for the allocation class?
 	 */
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (vm_cnt.v_free_count >= npages + vm_cnt.v_free_reserved ||
 	    (req_class == VM_ALLOC_SYSTEM &&
 	    vm_cnt.v_free_count >= npages + vm_cnt.v_interrupt_free_min) ||
 	    (req_class == VM_ALLOC_INTERRUPT &&
 	    vm_cnt.v_free_count >= npages)) {
 		/*
 		 * Can we allocate the pages from a reservation?
 		 */
 #if VM_NRESERVLEVEL > 0
 retry:
 		if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
 		    (m_ret = vm_reserv_alloc_contig(object, pindex, npages,
 		    low, high, alignment, boundary, mpred)) == NULL)
 #endif
 			/*
 			 * If not, allocate them from the free page queues.
 			 */
 			m_ret = vm_phys_alloc_contig(npages, low, high,
 			    alignment, boundary);
 	} else {
 		mtx_unlock(&vm_page_queue_free_mtx);
 		atomic_add_int(&vm_pageout_deficit, npages);
 		pagedaemon_wakeup();
 		return (NULL);
 	}
 	if (m_ret != NULL)
 		vm_phys_freecnt_adj(m_ret, -npages);
 	else {
 #if VM_NRESERVLEVEL > 0
 		if (vm_reserv_reclaim_contig(npages, low, high, alignment,
 		    boundary))
 			goto retry;
 #endif
 	}
 	mtx_unlock(&vm_page_queue_free_mtx);
 	if (m_ret == NULL)
 		return (NULL);
 	for (m = m_ret; m < &m_ret[npages]; m++)
 		vm_page_alloc_check(m);
 
 	/*
 	 * Initialize the pages.  Only the PG_ZERO flag is inherited.
 	 */
 	flags = 0;
 	if ((req & VM_ALLOC_ZERO) != 0)
 		flags = PG_ZERO;
 	if ((req & VM_ALLOC_NODUMP) != 0)
 		flags |= PG_NODUMP;
 	oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
 	    VPO_UNMANAGED : 0;
 	busy_lock = VPB_UNBUSIED;
 	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
 		busy_lock = VPB_SINGLE_EXCLUSIVER;
 	if ((req & VM_ALLOC_SBUSY) != 0)
 		busy_lock = VPB_SHARERS_WORD(1);
 	if ((req & VM_ALLOC_WIRED) != 0)
 		atomic_add_int(&vm_cnt.v_wire_count, npages);
 	if (object != NULL) {
 		if (object->memattr != VM_MEMATTR_DEFAULT &&
 		    memattr == VM_MEMATTR_DEFAULT)
 			memattr = object->memattr;
 	}
 	for (m = m_ret; m < &m_ret[npages]; m++) {
 		m->aflags = 0;
 		m->flags = (m->flags | PG_NODUMP) & flags;
 		m->busy_lock = busy_lock;
 		if ((req & VM_ALLOC_WIRED) != 0)
 			m->wire_count = 1;
 		m->act_count = 0;
 		m->oflags = oflags;
 		if (object != NULL) {
 			if (vm_page_insert_after(m, object, pindex, mpred)) {
 				pagedaemon_wakeup();
 				if ((req & VM_ALLOC_WIRED) != 0)
 					atomic_subtract_int(
 					    &vm_cnt.v_wire_count, npages);
 				KASSERT(m->object == NULL,
 				    ("page %p has object", m));
 				mpred = m;
 				for (m = m_ret; m < &m_ret[npages]; m++) {
 					if (m <= mpred &&
 					    (req & VM_ALLOC_WIRED) != 0)
 						m->wire_count = 0;
 					m->oflags = VPO_UNMANAGED;
 					m->busy_lock = VPB_UNBUSIED;
 					/* Don't change PG_ZERO. */
 					vm_page_free_toq(m);
 				}
 				return (NULL);
 			}
 			mpred = m;
 		} else
 			m->pindex = pindex;
 		if (memattr != VM_MEMATTR_DEFAULT)
 			pmap_page_set_memattr(m, memattr);
 		pindex++;
 	}
 	if (vm_paging_needed())
 		pagedaemon_wakeup();
 	return (m_ret);
 }
 
 /*
  * Check a page that has been freshly dequeued from a freelist.
  */
 static void
 vm_page_alloc_check(vm_page_t m)
 {
 
 	KASSERT(m->object == NULL, ("page %p has object", m));
 	KASSERT(m->queue == PQ_NONE,
 	    ("page %p has unexpected queue %d", m, m->queue));
 	KASSERT(m->wire_count == 0, ("page %p is wired", m));
 	KASSERT(m->hold_count == 0, ("page %p is held", m));
 	KASSERT(!vm_page_busied(m), ("page %p is busy", m));
 	KASSERT(m->dirty == 0, ("page %p is dirty", m));
 	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
 	    ("page %p has unexpected memattr %d",
 	    m, pmap_page_get_memattr(m)));
 	KASSERT(m->valid == 0, ("free page %p is valid", m));
 }
 
 /*
  * 	vm_page_alloc_freelist:
  *
  *	Allocate a physical page from the specified free page list.
  *
  *	The caller must always specify an allocation class.
  *
  *	allocation classes:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs a page
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *
  *	optional allocation flags:
  *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
  *				intends to allocate
  *	VM_ALLOC_WIRED		wire the allocated page
  *	VM_ALLOC_ZERO		prefer a zeroed page
  *
  *	This routine may not sleep.
  */
 vm_page_t
 vm_page_alloc_freelist(int flind, int req)
 {
 	vm_page_t m;
 	u_int flags;
 	int req_class;
 
 	req_class = req & VM_ALLOC_CLASS_MASK;
 
 	/*
 	 * The page daemon is allowed to dig deeper into the free page list.
 	 */
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
 	/*
 	 * Do not allocate reserved pages unless the req has asked for it.
 	 */
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
 	    (req_class == VM_ALLOC_SYSTEM &&
 	    vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
 	    (req_class == VM_ALLOC_INTERRUPT &&
 	    vm_cnt.v_free_count > 0))
 		m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
 	else {
 		mtx_unlock(&vm_page_queue_free_mtx);
 		atomic_add_int(&vm_pageout_deficit,
 		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
 		pagedaemon_wakeup();
 		return (NULL);
 	}
 	if (m == NULL) {
 		mtx_unlock(&vm_page_queue_free_mtx);
 		return (NULL);
 	}
 	vm_phys_freecnt_adj(m, -1);
 	mtx_unlock(&vm_page_queue_free_mtx);
 	vm_page_alloc_check(m);
 
 	/*
 	 * Initialize the page.  Only the PG_ZERO flag is inherited.
 	 */
 	m->aflags = 0;
 	flags = 0;
 	if ((req & VM_ALLOC_ZERO) != 0)
 		flags = PG_ZERO;
 	m->flags &= flags;
 	if ((req & VM_ALLOC_WIRED) != 0) {
 		/*
 		 * The page lock is not required for wiring a page that does
 		 * not belong to an object.
 		 */
 		atomic_add_int(&vm_cnt.v_wire_count, 1);
 		m->wire_count = 1;
 	}
 	/* Unmanaged pages don't use "act_count". */
 	m->oflags = VPO_UNMANAGED;
 	if (vm_paging_needed())
 		pagedaemon_wakeup();
 	return (m);
 }
 
 #define	VPSC_ANY	0	/* No restrictions. */
 #define	VPSC_NORESERV	1	/* Skip reservations; implies VPSC_NOSUPER. */
 #define	VPSC_NOSUPER	2	/* Skip superpages. */
 
 /*
  *	vm_page_scan_contig:
  *
  *	Scan vm_page_array[] between the specified entries "m_start" and
  *	"m_end" for a run of contiguous physical pages that satisfy the
  *	specified conditions, and return the lowest page in the run.  The
  *	specified "alignment" determines the alignment of the lowest physical
  *	page in the run.  If the specified "boundary" is non-zero, then the
  *	run of physical pages cannot span a physical address that is a
  *	multiple of "boundary".
  *
  *	"m_end" is never dereferenced, so it need not point to a vm_page
  *	structure within vm_page_array[].
  *
  *	"npages" must be greater than zero.  "m_start" and "m_end" must not
  *	span a hole (or discontiguity) in the physical address space.  Both
  *	"alignment" and "boundary" must be a power of two.
  */
 vm_page_t
 vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
     u_long alignment, vm_paddr_t boundary, int options)
 {
 	struct mtx *m_mtx, *new_mtx;
 	vm_object_t object;
 	vm_paddr_t pa;
 	vm_page_t m, m_run;
 #if VM_NRESERVLEVEL > 0
 	int level;
 #endif
 	int m_inc, order, run_ext, run_len;
 
 	KASSERT(npages > 0, ("npages is 0"));
 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 	m_run = NULL;
 	run_len = 0;
 	m_mtx = NULL;
 	for (m = m_start; m < m_end && run_len < npages; m += m_inc) {
 		KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
 		    ("page %p is PG_FICTITIOUS or PG_MARKER", m));
 
 		/*
 		 * If the current page would be the start of a run, check its
 		 * physical address against the end, alignment, and boundary
 		 * conditions.  If it doesn't satisfy these conditions, either
 		 * terminate the scan or advance to the next page that
 		 * satisfies the failed condition.
 		 */
 		if (run_len == 0) {
 			KASSERT(m_run == NULL, ("m_run != NULL"));
 			if (m + npages > m_end)
 				break;
 			pa = VM_PAGE_TO_PHYS(m);
 			if ((pa & (alignment - 1)) != 0) {
 				m_inc = atop(roundup2(pa, alignment) - pa);
 				continue;
 			}
 			if (rounddown2(pa ^ (pa + ptoa(npages) - 1),
 			    boundary) != 0) {
 				m_inc = atop(roundup2(pa, boundary) - pa);
 				continue;
 			}
 		} else
 			KASSERT(m_run != NULL, ("m_run == NULL"));
 
 		/*
 		 * Avoid releasing and reacquiring the same page lock.
 		 */
 		new_mtx = vm_page_lockptr(m);
 		if (m_mtx != new_mtx) {
 			if (m_mtx != NULL)
 				mtx_unlock(m_mtx);
 			m_mtx = new_mtx;
 			mtx_lock(m_mtx);
 		}
 		m_inc = 1;
 retry:
 		if (m->wire_count != 0 || m->hold_count != 0)
 			run_ext = 0;
 #if VM_NRESERVLEVEL > 0
 		else if ((level = vm_reserv_level(m)) >= 0 &&
 		    (options & VPSC_NORESERV) != 0) {
 			run_ext = 0;
 			/* Advance to the end of the reservation. */
 			pa = VM_PAGE_TO_PHYS(m);
 			m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) -
 			    pa);
 		}
 #endif
 		else if ((object = m->object) != NULL) {
 			/*
 			 * The page is considered eligible for relocation if
 			 * and only if it could be laundered or reclaimed by
 			 * the page daemon.
 			 */
 			if (!VM_OBJECT_TRYRLOCK(object)) {
 				mtx_unlock(m_mtx);
 				VM_OBJECT_RLOCK(object);
 				mtx_lock(m_mtx);
 				if (m->object != object) {
 					/*
 					 * The page may have been freed.
 					 */
 					VM_OBJECT_RUNLOCK(object);
 					goto retry;
 				} else if (m->wire_count != 0 ||
 				    m->hold_count != 0) {
 					run_ext = 0;
 					goto unlock;
 				}
 			}
 			KASSERT((m->flags & PG_UNHOLDFREE) == 0,
 			    ("page %p is PG_UNHOLDFREE", m));
 			/* Don't care: PG_NODUMP, PG_ZERO. */
 			if (object->type != OBJT_DEFAULT &&
 			    object->type != OBJT_SWAP &&
 			    object->type != OBJT_VNODE) {
 				run_ext = 0;
 #if VM_NRESERVLEVEL > 0
 			} else if ((options & VPSC_NOSUPER) != 0 &&
 			    (level = vm_reserv_level_iffullpop(m)) >= 0) {
 				run_ext = 0;
 				/* Advance to the end of the superpage. */
 				pa = VM_PAGE_TO_PHYS(m);
 				m_inc = atop(roundup2(pa + 1,
 				    vm_reserv_size(level)) - pa);
 #endif
 			} else if (object->memattr == VM_MEMATTR_DEFAULT &&
 			    m->queue != PQ_NONE && !vm_page_busied(m)) {
 				/*
 				 * The page is allocated but eligible for
 				 * relocation.  Extend the current run by one
 				 * page.
 				 */
 				KASSERT(pmap_page_get_memattr(m) ==
 				    VM_MEMATTR_DEFAULT,
 				    ("page %p has an unexpected memattr", m));
 				KASSERT((m->oflags & (VPO_SWAPINPROG |
 				    VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
 				    ("page %p has unexpected oflags", m));
 				/* Don't care: VPO_NOSYNC. */
 				run_ext = 1;
 			} else
 				run_ext = 0;
 unlock:
 			VM_OBJECT_RUNLOCK(object);
 #if VM_NRESERVLEVEL > 0
 		} else if (level >= 0) {
 			/*
 			 * The page is reserved but not yet allocated.  In
 			 * other words, it is still free.  Extend the current
 			 * run by one page.
 			 */
 			run_ext = 1;
 #endif
 		} else if ((order = m->order) < VM_NFREEORDER) {
 			/*
 			 * The page is enqueued in the physical memory
 			 * allocator's free page queues.  Moreover, it is the
 			 * first page in a power-of-two-sized run of
 			 * contiguous free pages.  Add these pages to the end
 			 * of the current run, and jump ahead.
 			 */
 			run_ext = 1 << order;
 			m_inc = 1 << order;
 		} else {
 			/*
 			 * Skip the page for one of the following reasons: (1)
 			 * It is enqueued in the physical memory allocator's
 			 * free page queues.  However, it is not the first
 			 * page in a run of contiguous free pages.  (This case
 			 * rarely occurs because the scan is performed in
 			 * ascending order.) (2) It is not reserved, and it is
 			 * transitioning from free to allocated.  (Conversely,
 			 * the transition from allocated to free for managed
 			 * pages is blocked by the page lock.) (3) It is
 			 * allocated but not contained by an object and not
 			 * wired, e.g., allocated by Xen's balloon driver.
 			 */
 			run_ext = 0;
 		}
 
 		/*
 		 * Extend or reset the current run of pages.
 		 */
 		if (run_ext > 0) {
 			if (run_len == 0)
 				m_run = m;
 			run_len += run_ext;
 		} else {
 			if (run_len > 0) {
 				m_run = NULL;
 				run_len = 0;
 			}
 		}
 	}
 	if (m_mtx != NULL)
 		mtx_unlock(m_mtx);
 	if (run_len >= npages)
 		return (m_run);
 	return (NULL);
 }
 
 /*
  *	vm_page_reclaim_run:
  *
  *	Try to relocate each of the allocated virtual pages within the
  *	specified run of physical pages to a new physical address.  Free the
  *	physical pages underlying the relocated virtual pages.  A virtual page
  *	is relocatable if and only if it could be laundered or reclaimed by
  *	the page daemon.  Whenever possible, a virtual page is relocated to a
  *	physical address above "high".
  *
  *	Returns 0 if every physical page within the run was already free or
  *	just freed by a successful relocation.  Otherwise, returns a non-zero
  *	value indicating why the last attempt to relocate a virtual page was
  *	unsuccessful.
  *
  *	"req_class" must be an allocation class.
  */
 static int
 vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
     vm_paddr_t high)
 {
 	struct mtx *m_mtx, *new_mtx;
 	struct spglist free;
 	vm_object_t object;
 	vm_paddr_t pa;
 	vm_page_t m, m_end, m_new;
 	int error, order, req;
 
 	KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class,
 	    ("req_class is not an allocation class"));
 	SLIST_INIT(&free);
 	error = 0;
 	m = m_run;
 	m_end = m_run + npages;
 	m_mtx = NULL;
 	for (; error == 0 && m < m_end; m++) {
 		KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
 		    ("page %p is PG_FICTITIOUS or PG_MARKER", m));
 
 		/*
 		 * Avoid releasing and reacquiring the same page lock.
 		 */
 		new_mtx = vm_page_lockptr(m);
 		if (m_mtx != new_mtx) {
 			if (m_mtx != NULL)
 				mtx_unlock(m_mtx);
 			m_mtx = new_mtx;
 			mtx_lock(m_mtx);
 		}
 retry:
 		if (m->wire_count != 0 || m->hold_count != 0)
 			error = EBUSY;
 		else if ((object = m->object) != NULL) {
 			/*
 			 * The page is relocated if and only if it could be
 			 * laundered or reclaimed by the page daemon.
 			 */
 			if (!VM_OBJECT_TRYWLOCK(object)) {
 				mtx_unlock(m_mtx);
 				VM_OBJECT_WLOCK(object);
 				mtx_lock(m_mtx);
 				if (m->object != object) {
 					/*
 					 * The page may have been freed.
 					 */
 					VM_OBJECT_WUNLOCK(object);
 					goto retry;
 				} else if (m->wire_count != 0 ||
 				    m->hold_count != 0) {
 					error = EBUSY;
 					goto unlock;
 				}
 			}
 			KASSERT((m->flags & PG_UNHOLDFREE) == 0,
 			    ("page %p is PG_UNHOLDFREE", m));
 			/* Don't care: PG_NODUMP, PG_ZERO. */
 			if (object->type != OBJT_DEFAULT &&
 			    object->type != OBJT_SWAP &&
 			    object->type != OBJT_VNODE)
 				error = EINVAL;
 			else if (object->memattr != VM_MEMATTR_DEFAULT)
 				error = EINVAL;
 			else if (m->queue != PQ_NONE && !vm_page_busied(m)) {
 				KASSERT(pmap_page_get_memattr(m) ==
 				    VM_MEMATTR_DEFAULT,
 				    ("page %p has an unexpected memattr", m));
 				KASSERT((m->oflags & (VPO_SWAPINPROG |
 				    VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
 				    ("page %p has unexpected oflags", m));
 				/* Don't care: VPO_NOSYNC. */
 				if (m->valid != 0) {
 					/*
 					 * First, try to allocate a new page
 					 * that is above "high".  Failing
 					 * that, try to allocate a new page
 					 * that is below "m_run".  Allocate
 					 * the new page between the end of
 					 * "m_run" and "high" only as a last
 					 * resort.
 					 */
 					req = req_class | VM_ALLOC_NOOBJ;
 					if ((m->flags & PG_NODUMP) != 0)
 						req |= VM_ALLOC_NODUMP;
 					if (trunc_page(high) !=
 					    ~(vm_paddr_t)PAGE_MASK) {
 						m_new = vm_page_alloc_contig(
 						    NULL, 0, req, 1,
 						    round_page(high),
 						    ~(vm_paddr_t)0,
 						    PAGE_SIZE, 0,
 						    VM_MEMATTR_DEFAULT);
 					} else
 						m_new = NULL;
 					if (m_new == NULL) {
 						pa = VM_PAGE_TO_PHYS(m_run);
 						m_new = vm_page_alloc_contig(
 						    NULL, 0, req, 1,
 						    0, pa - 1, PAGE_SIZE, 0,
 						    VM_MEMATTR_DEFAULT);
 					}
 					if (m_new == NULL) {
 						pa += ptoa(npages);
 						m_new = vm_page_alloc_contig(
 						    NULL, 0, req, 1,
 						    pa, high, PAGE_SIZE, 0,
 						    VM_MEMATTR_DEFAULT);
 					}
 					if (m_new == NULL) {
 						error = ENOMEM;
 						goto unlock;
 					}
 					KASSERT(m_new->wire_count == 0,
 					    ("page %p is wired", m));
 
 					/*
 					 * Replace "m" with the new page.  For
 					 * vm_page_replace(), "m" must be busy
 					 * and dequeued.  Finally, change "m"
 					 * as if vm_page_free() was called.
 					 */
 					if (object->ref_count != 0)
 						pmap_remove_all(m);
 					m_new->aflags = m->aflags;
 					KASSERT(m_new->oflags == VPO_UNMANAGED,
 					    ("page %p is managed", m));
 					m_new->oflags = m->oflags & VPO_NOSYNC;
 					pmap_copy_page(m, m_new);
 					m_new->valid = m->valid;
 					m_new->dirty = m->dirty;
 					m->flags &= ~PG_ZERO;
 					vm_page_xbusy(m);
 					vm_page_remque(m);
 					vm_page_replace_checked(m_new, object,
 					    m->pindex, m);
 					m->valid = 0;
 					vm_page_undirty(m);
 
 					/*
 					 * The new page must be deactivated
 					 * before the object is unlocked.
 					 */
 					new_mtx = vm_page_lockptr(m_new);
 					if (m_mtx != new_mtx) {
 						mtx_unlock(m_mtx);
 						m_mtx = new_mtx;
 						mtx_lock(m_mtx);
 					}
 					vm_page_deactivate(m_new);
 				} else {
 					m->flags &= ~PG_ZERO;
 					vm_page_remque(m);
 					vm_page_remove(m);
 					KASSERT(m->dirty == 0,
 					    ("page %p is dirty", m));
 				}
 				SLIST_INSERT_HEAD(&free, m, plinks.s.ss);
 			} else
 				error = EBUSY;
 unlock:
 			VM_OBJECT_WUNLOCK(object);
 		} else {
 			mtx_lock(&vm_page_queue_free_mtx);
 			order = m->order;
 			if (order < VM_NFREEORDER) {
 				/*
 				 * The page is enqueued in the physical memory
 				 * allocator's free page queues.  Moreover, it
 				 * is the first page in a power-of-two-sized
 				 * run of contiguous free pages.  Jump ahead
 				 * to the last page within that run, and
 				 * continue from there.
 				 */
 				m += (1 << order) - 1;
 			}
 #if VM_NRESERVLEVEL > 0
 			else if (vm_reserv_is_page_free(m))
 				order = 0;
 #endif
 			mtx_unlock(&vm_page_queue_free_mtx);
 			if (order == VM_NFREEORDER)
 				error = EINVAL;
 		}
 	}
 	if (m_mtx != NULL)
 		mtx_unlock(m_mtx);
 	if ((m = SLIST_FIRST(&free)) != NULL) {
 		mtx_lock(&vm_page_queue_free_mtx);
 		do {
 			SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 			vm_phys_freecnt_adj(m, 1);
 #if VM_NRESERVLEVEL > 0
 			if (!vm_reserv_free_page(m))
 #else
 			if (true)
 #endif
 				vm_phys_free_pages(m, 0);
 		} while ((m = SLIST_FIRST(&free)) != NULL);
 		vm_page_free_wakeup();
 		mtx_unlock(&vm_page_queue_free_mtx);
 	}
 	return (error);
 }
 
 #define	NRUNS	16
 
 CTASSERT(powerof2(NRUNS));
 
 #define	RUN_INDEX(count)	((count) & (NRUNS - 1))
 
 #define	MIN_RECLAIM	8
 
 /*
  *	vm_page_reclaim_contig:
  *
  *	Reclaim allocated, contiguous physical memory satisfying the specified
  *	conditions by relocating the virtual pages using that physical memory.
  *	Returns true if reclamation is successful and false otherwise.  Since
  *	relocation requires the allocation of physical pages, reclamation may
  *	fail due to a shortage of free pages.  When reclamation fails, callers
  *	are expected to perform VM_WAIT before retrying a failed allocation
  *	operation, e.g., vm_page_alloc_contig().
  *
  *	The caller must always specify an allocation class through "req".
  *
  *	allocation classes:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs a page
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *
  *	The optional allocation flags are ignored.
  *
  *	"npages" must be greater than zero.  Both "alignment" and "boundary"
  *	must be a power of two.
  */
 bool
 vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary)
 {
 	vm_paddr_t curr_low;
 	vm_page_t m_run, m_runs[NRUNS];
 	u_long count, reclaimed;
 	int error, i, options, req_class;
 
 	KASSERT(npages > 0, ("npages is 0"));
 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 	req_class = req & VM_ALLOC_CLASS_MASK;
 
 	/*
 	 * The page daemon is allowed to dig deeper into the free page list.
 	 */
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
 	/*
 	 * Return if the number of free pages cannot satisfy the requested
 	 * allocation.
 	 */
 	count = vm_cnt.v_free_count;
 	if (count < npages + vm_cnt.v_free_reserved || (count < npages +
 	    vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) ||
 	    (count < npages && req_class == VM_ALLOC_INTERRUPT))
 		return (false);
 
 	/*
 	 * Scan up to three times, relaxing the restrictions ("options") on
 	 * the reclamation of reservations and superpages each time.
 	 */
 	for (options = VPSC_NORESERV;;) {
 		/*
 		 * Find the highest runs that satisfy the given constraints
 		 * and restrictions, and record them in "m_runs".
 		 */
 		curr_low = low;
 		count = 0;
 		for (;;) {
 			m_run = vm_phys_scan_contig(npages, curr_low, high,
 			    alignment, boundary, options);
 			if (m_run == NULL)
 				break;
 			curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages);
 			m_runs[RUN_INDEX(count)] = m_run;
 			count++;
 		}
 
 		/*
 		 * Reclaim the highest runs in LIFO (descending) order until
 		 * the number of reclaimed pages, "reclaimed", is at least
 		 * MIN_RECLAIM.  Reset "reclaimed" each time because each
 		 * reclamation is idempotent, and runs will (likely) recur
 		 * from one scan to the next as restrictions are relaxed.
 		 */
 		reclaimed = 0;
 		for (i = 0; count > 0 && i < NRUNS; i++) {
 			count--;
 			m_run = m_runs[RUN_INDEX(count)];
 			error = vm_page_reclaim_run(req_class, npages, m_run,
 			    high);
 			if (error == 0) {
 				reclaimed += npages;
 				if (reclaimed >= MIN_RECLAIM)
 					return (true);
 			}
 		}
 
 		/*
 		 * Either relax the restrictions on the next scan or return if
 		 * the last scan had no restrictions.
 		 */
 		if (options == VPSC_NORESERV)
 			options = VPSC_NOSUPER;
 		else if (options == VPSC_NOSUPER)
 			options = VPSC_ANY;
 		else if (options == VPSC_ANY)
 			return (reclaimed != 0);
 	}
 }
 
 /*
  *	vm_wait:	(also see VM_WAIT macro)
  *
  *	Sleep until free pages are available for allocation.
  *	- Called in various places before memory allocations.
  */
 void
 vm_wait(void)
 {
 
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (curproc == pageproc) {
 		vm_pageout_pages_needed = 1;
 		msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
 		    PDROP | PSWP, "VMWait", 0);
 	} else {
 		if (__predict_false(pageproc == NULL))
 			panic("vm_wait in early boot");
 		if (!vm_pageout_wanted) {
 			vm_pageout_wanted = true;
 			wakeup(&vm_pageout_wanted);
 		}
 		vm_pages_needed = true;
 		msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
 		    "vmwait", 0);
 	}
 }
 
 /*
  *	vm_waitpfault:	(also see VM_WAITPFAULT macro)
  *
  *	Sleep until free pages are available for allocation.
  *	- Called only in vm_fault so that processes page faulting
  *	  can be easily tracked.
  *	- Sleeps at a lower priority than vm_wait() so that vm_wait()ing
  *	  processes will be able to grab memory first.  Do not change
  *	  this balance without careful testing first.
  */
 void
 vm_waitpfault(void)
 {
 
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (!vm_pageout_wanted) {
 		vm_pageout_wanted = true;
 		wakeup(&vm_pageout_wanted);
 	}
 	vm_pages_needed = true;
 	msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
 	    "pfault", 0);
 }
 
 struct vm_pagequeue *
 vm_page_pagequeue(vm_page_t m)
 {
 
 	if (vm_page_in_laundry(m))
 		return (&vm_dom[0].vmd_pagequeues[m->queue]);
 	else
 		return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
 }
 
 /*
  *	vm_page_dequeue:
  *
  *	Remove the given page from its current page queue.
  *
  *	The page must be locked.
  */
 void
 vm_page_dequeue(vm_page_t m)
 {
 	struct vm_pagequeue *pq;
 
 	vm_page_assert_locked(m);
 	KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued",
 	    m));
 	pq = vm_page_pagequeue(m);
 	vm_pagequeue_lock(pq);
 	m->queue = PQ_NONE;
 	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
 	vm_pagequeue_cnt_dec(pq);
 	vm_pagequeue_unlock(pq);
 }
 
 /*
  *	vm_page_dequeue_locked:
  *
  *	Remove the given page from its current page queue.
  *
  *	The page and page queue must be locked.
  */
 void
 vm_page_dequeue_locked(vm_page_t m)
 {
 	struct vm_pagequeue *pq;
 
 	vm_page_lock_assert(m, MA_OWNED);
 	pq = vm_page_pagequeue(m);
 	vm_pagequeue_assert_locked(pq);
 	m->queue = PQ_NONE;
 	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
 	vm_pagequeue_cnt_dec(pq);
 }
 
 /*
  *	vm_page_enqueue:
  *
  *	Add the given page to the specified page queue.
  *
  *	The page must be locked.
  */
 static void
 vm_page_enqueue(uint8_t queue, vm_page_t m)
 {
 	struct vm_pagequeue *pq;
 
 	vm_page_lock_assert(m, MA_OWNED);
 	KASSERT(queue < PQ_COUNT,
 	    ("vm_page_enqueue: invalid queue %u request for page %p",
 	    queue, m));
 	if (queue == PQ_LAUNDRY || queue == PQ_UNSWAPPABLE)
 		pq = &vm_dom[0].vmd_pagequeues[queue];
 	else
 		pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
 	vm_pagequeue_lock(pq);
 	m->queue = queue;
 	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
 	vm_pagequeue_cnt_inc(pq);
 	vm_pagequeue_unlock(pq);
 }
 
 /*
  *	vm_page_requeue:
  *
  *	Move the given page to the tail of its current page queue.
  *
  *	The page must be locked.
  */
 void
 vm_page_requeue(vm_page_t m)
 {
 	struct vm_pagequeue *pq;
 
 	vm_page_lock_assert(m, MA_OWNED);
 	KASSERT(m->queue != PQ_NONE,
 	    ("vm_page_requeue: page %p is not queued", m));
 	pq = vm_page_pagequeue(m);
 	vm_pagequeue_lock(pq);
 	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
 	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
 	vm_pagequeue_unlock(pq);
 }
 
 /*
  *	vm_page_requeue_locked:
  *
  *	Move the given page to the tail of its current page queue.
  *
  *	The page queue must be locked.
  */
 void
 vm_page_requeue_locked(vm_page_t m)
 {
 	struct vm_pagequeue *pq;
 
 	KASSERT(m->queue != PQ_NONE,
 	    ("vm_page_requeue_locked: page %p is not queued", m));
 	pq = vm_page_pagequeue(m);
 	vm_pagequeue_assert_locked(pq);
 	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
 	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
 }
 
 /*
  *	vm_page_activate:
  *
  *	Put the specified page on the active list (if appropriate).
  *	Ensure that act_count is at least ACT_INIT but do not otherwise
  *	mess with it.
  *
  *	The page must be locked.
  */
 void
 vm_page_activate(vm_page_t m)
 {
 	int queue;
 
 	vm_page_lock_assert(m, MA_OWNED);
 	if ((queue = m->queue) != PQ_ACTIVE) {
 		if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
 			if (m->act_count < ACT_INIT)
 				m->act_count = ACT_INIT;
 			if (queue != PQ_NONE)
 				vm_page_dequeue(m);
 			vm_page_enqueue(PQ_ACTIVE, m);
 		} else
 			KASSERT(queue == PQ_NONE,
 			    ("vm_page_activate: wired page %p is queued", m));
 	} else {
 		if (m->act_count < ACT_INIT)
 			m->act_count = ACT_INIT;
 	}
 }
 
 /*
  *	vm_page_free_wakeup:
  *
  *	Helper routine for vm_page_free_toq().  This routine is called
  *	when a page is added to the free queues.
  *
  *	The page queues must be locked.
  */
 static inline void
 vm_page_free_wakeup(void)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	/*
 	 * if pageout daemon needs pages, then tell it that there are
 	 * some free.
 	 */
 	if (vm_pageout_pages_needed &&
 	    vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) {
 		wakeup(&vm_pageout_pages_needed);
 		vm_pageout_pages_needed = 0;
 	}
 	/*
 	 * wakeup processes that are waiting on memory if we hit a
 	 * high water mark. And wakeup scheduler process if we have
 	 * lots of memory. this process will swapin processes.
 	 */
 	if (vm_pages_needed && !vm_page_count_min()) {
 		vm_pages_needed = false;
 		wakeup(&vm_cnt.v_free_count);
 	}
 }
 
 /*
  *	vm_page_free_toq:
  *
  *	Returns the given page to the free list,
  *	disassociating it with any VM object.
  *
  *	The object must be locked.  The page must be locked if it is managed.
  */
 void
 vm_page_free_toq(vm_page_t m)
 {
 
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		vm_page_lock_assert(m, MA_OWNED);
 		KASSERT(!pmap_page_is_mapped(m),
 		    ("vm_page_free_toq: freeing mapped page %p", m));
 	} else
 		KASSERT(m->queue == PQ_NONE,
 		    ("vm_page_free_toq: unmanaged page %p is queued", m));
 	PCPU_INC(cnt.v_tfree);
 
 	if (vm_page_sbusied(m))
 		panic("vm_page_free: freeing busy page %p", m);
 
 	/*
 	 * Unqueue, then remove page.  Note that we cannot destroy
 	 * the page here because we do not want to call the pager's
 	 * callback routine until after we've put the page on the
 	 * appropriate free queue.
 	 */
 	vm_page_remque(m);
 	vm_page_remove(m);
 
 	/*
 	 * If fictitious remove object association and
 	 * return, otherwise delay object association removal.
 	 */
 	if ((m->flags & PG_FICTITIOUS) != 0) {
 		return;
 	}
 
 	m->valid = 0;
 	vm_page_undirty(m);
 
 	if (m->wire_count != 0)
 		panic("vm_page_free: freeing wired page %p", m);
 	if (m->hold_count != 0) {
 		m->flags &= ~PG_ZERO;
 		KASSERT((m->flags & PG_UNHOLDFREE) == 0,
 		    ("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
 		m->flags |= PG_UNHOLDFREE;
 	} else {
 		/*
 		 * Restore the default memory attribute to the page.
 		 */
 		if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
 			pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
 
 		/*
 		 * Insert the page into the physical memory allocator's free
 		 * page queues.
 		 */
 		mtx_lock(&vm_page_queue_free_mtx);
 		vm_phys_freecnt_adj(m, 1);
 #if VM_NRESERVLEVEL > 0
 		if (!vm_reserv_free_page(m))
 #else
 		if (TRUE)
 #endif
 			vm_phys_free_pages(m, 0);
 		vm_page_free_wakeup();
 		mtx_unlock(&vm_page_queue_free_mtx);
 	}
 }
 
 /*
  *	vm_page_wire:
  *
  *	Mark this page as wired down by yet
  *	another map, removing it from paging queues
  *	as necessary.
  *
  *	If the page is fictitious, then its wire count must remain one.
  *
  *	The page must be locked.
  */
 void
 vm_page_wire(vm_page_t m)
 {
 
 	/*
 	 * Only bump the wire statistics if the page is not already wired,
 	 * and only unqueue the page if it is on some queue (if it is unmanaged
 	 * it is already off the queues).
 	 */
 	vm_page_lock_assert(m, MA_OWNED);
 	if ((m->flags & PG_FICTITIOUS) != 0) {
 		KASSERT(m->wire_count == 1,
 		    ("vm_page_wire: fictitious page %p's wire count isn't one",
 		    m));
 		return;
 	}
 	if (m->wire_count == 0) {
 		KASSERT((m->oflags & VPO_UNMANAGED) == 0 ||
 		    m->queue == PQ_NONE,
 		    ("vm_page_wire: unmanaged page %p is queued", m));
 		vm_page_remque(m);
 		atomic_add_int(&vm_cnt.v_wire_count, 1);
 	}
 	m->wire_count++;
 	KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
 }
 
 /*
  * vm_page_unwire:
  *
  * Release one wiring of the specified page, potentially allowing it to be
  * paged out.  Returns TRUE if the number of wirings transitions to zero and
  * FALSE otherwise.
  *
  * Only managed pages belonging to an object can be paged out.  If the number
  * of wirings transitions to zero and the page is eligible for page out, then
  * the page is added to the specified paging queue (unless PQ_NONE is
  * specified).
  *
  * If a page is fictitious, then its wire count must always be one.
  *
  * A managed page must be locked.
  */
 boolean_t
 vm_page_unwire(vm_page_t m, uint8_t queue)
 {
 
 	KASSERT(queue < PQ_COUNT || queue == PQ_NONE,
 	    ("vm_page_unwire: invalid queue %u request for page %p",
 	    queue, m));
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		vm_page_assert_locked(m);
 	if ((m->flags & PG_FICTITIOUS) != 0) {
 		KASSERT(m->wire_count == 1,
 	    ("vm_page_unwire: fictitious page %p's wire count isn't one", m));
 		return (FALSE);
 	}
 	if (m->wire_count > 0) {
 		m->wire_count--;
 		if (m->wire_count == 0) {
 			atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 			if ((m->oflags & VPO_UNMANAGED) == 0 &&
 			    m->object != NULL && queue != PQ_NONE)
 				vm_page_enqueue(queue, m);
 			return (TRUE);
 		} else
 			return (FALSE);
 	} else
 		panic("vm_page_unwire: page %p's wire count is zero", m);
 }
 
 /*
  * Move the specified page to the inactive queue.
  *
  * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive
  * queue.  However, setting "noreuse" to TRUE will accelerate the specified
  * page's reclamation, but it will not unmap the page from any address space.
  * This is implemented by inserting the page near the head of the inactive
  * queue, using a marker page to guide FIFO insertion ordering.
  *
  * The page must be locked.
  */
 static inline void
 _vm_page_deactivate(vm_page_t m, boolean_t noreuse)
 {
 	struct vm_pagequeue *pq;
 	int queue;
 
 	vm_page_assert_locked(m);
 
 	/*
 	 * Ignore if the page is already inactive, unless it is unlikely to be
 	 * reactivated.
 	 */
 	if ((queue = m->queue) == PQ_INACTIVE && !noreuse)
 		return;
 	if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
 		pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE];
 		/* Avoid multiple acquisitions of the inactive queue lock. */
 		if (queue == PQ_INACTIVE) {
 			vm_pagequeue_lock(pq);
 			vm_page_dequeue_locked(m);
 		} else {
 			if (queue != PQ_NONE)
 				vm_page_dequeue(m);
 			vm_pagequeue_lock(pq);
 		}
 		m->queue = PQ_INACTIVE;
 		if (noreuse)
 			TAILQ_INSERT_BEFORE(&vm_phys_domain(m)->vmd_inacthead,
 			    m, plinks.q);
 		else
 			TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
 		vm_pagequeue_cnt_inc(pq);
 		vm_pagequeue_unlock(pq);
 	}
 }
 
 /*
  * Move the specified page to the inactive queue.
  *
  * The page must be locked.
  */
 void
 vm_page_deactivate(vm_page_t m)
 {
 
 	_vm_page_deactivate(m, FALSE);
 }
 
 /*
  * Move the specified page to the inactive queue with the expectation
  * that it is unlikely to be reused.
  *
  * The page must be locked.
  */
 void
 vm_page_deactivate_noreuse(vm_page_t m)
 {
 
 	_vm_page_deactivate(m, TRUE);
 }
 
 /*
  * vm_page_launder
  *
  * 	Put a page in the laundry.
  */
 void
 vm_page_launder(vm_page_t m)
 {
 	int queue;
 
 	vm_page_assert_locked(m);
 	if ((queue = m->queue) != PQ_LAUNDRY) {
 		if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
 			if (queue != PQ_NONE)
 				vm_page_dequeue(m);
 			vm_page_enqueue(PQ_LAUNDRY, m);
 		} else
 			KASSERT(queue == PQ_NONE,
 			    ("wired page %p is queued", m));
 	}
 }
 
 /*
  * vm_page_unswappable
  *
  *	Put a page in the PQ_UNSWAPPABLE holding queue.
  */
 void
 vm_page_unswappable(vm_page_t m)
 {
 
 	vm_page_assert_locked(m);
 	KASSERT(m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0,
 	    ("page %p already unswappable", m));
 	if (m->queue != PQ_NONE)
 		vm_page_dequeue(m);
 	vm_page_enqueue(PQ_UNSWAPPABLE, m);
 }
 
 /*
  * vm_page_try_to_free()
  *
  *	Attempt to free the page.  If we cannot free it, we do nothing.
  *	1 is returned on success, 0 on failure.
  */
 int
 vm_page_try_to_free(vm_page_t m)
 {
 
 	vm_page_lock_assert(m, MA_OWNED);
 	if (m->object != NULL)
 		VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (m->dirty || m->hold_count || m->wire_count ||
 	    (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
 		return (0);
 	pmap_remove_all(m);
 	if (m->dirty)
 		return (0);
 	vm_page_free(m);
 	return (1);
 }
 
 /*
  * vm_page_advise
  *
- * 	Deactivate or do nothing, as appropriate.
+ * 	Apply the specified advice to the given page.
  *
  *	The object and page must be locked.
  */
 void
 vm_page_advise(vm_page_t m, int advice)
 {
 
 	vm_page_assert_locked(m);
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (advice == MADV_FREE)
 		/*
 		 * Mark the page clean.  This will allow the page to be freed
 		 * without first paging it out.  MADV_FREE pages are often
 		 * quickly reused by malloc(3), so we do not do anything that
 		 * would result in a page fault on a later access.
 		 */
 		vm_page_undirty(m);
-	else if (advice != MADV_DONTNEED)
+	else if (advice != MADV_DONTNEED) {
+		if (advice == MADV_WILLNEED)
+			vm_page_activate(m);
 		return;
+	}
 
 	/*
 	 * Clear any references to the page.  Otherwise, the page daemon will
 	 * immediately reactivate the page.
 	 */
 	vm_page_aflag_clear(m, PGA_REFERENCED);
 
 	if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m))
 		vm_page_dirty(m);
 
 	/*
 	 * Place clean pages near the head of the inactive queue rather than
 	 * the tail, thus defeating the queue's LRU operation and ensuring that
 	 * the page will be reused quickly.  Dirty pages not already in the
 	 * laundry are moved there.
 	 */
 	if (m->dirty == 0)
 		vm_page_deactivate_noreuse(m);
 	else
 		vm_page_launder(m);
 }
 
 /*
  * Grab a page, waiting until we are waken up due to the page
  * changing state.  We keep on waiting, if the page continues
  * to be in the object.  If the page doesn't exist, first allocate it
  * and then conditionally zero it.
  *
  * This routine may sleep.
  *
  * The object must be locked on entry.  The lock will, however, be released
  * and reacquired if the routine sleeps.
  */
 vm_page_t
 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
 {
 	vm_page_t m;
 	int sleep;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
 	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
 	    ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
 retrylookup:
 	if ((m = vm_page_lookup(object, pindex)) != NULL) {
 		sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
 		    vm_page_xbusied(m) : vm_page_busied(m);
 		if (sleep) {
 			if ((allocflags & VM_ALLOC_NOWAIT) != 0)
 				return (NULL);
 			/*
 			 * Reference the page before unlocking and
 			 * sleeping so that the page daemon is less
 			 * likely to reclaim it.
 			 */
 			vm_page_aflag_set(m, PGA_REFERENCED);
 			vm_page_lock(m);
 			VM_OBJECT_WUNLOCK(object);
 			vm_page_busy_sleep(m, "pgrbwt", (allocflags &
 			    VM_ALLOC_IGN_SBUSY) != 0);
 			VM_OBJECT_WLOCK(object);
 			goto retrylookup;
 		} else {
 			if ((allocflags & VM_ALLOC_WIRED) != 0) {
 				vm_page_lock(m);
 				vm_page_wire(m);
 				vm_page_unlock(m);
 			}
 			if ((allocflags &
 			    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
 				vm_page_xbusy(m);
 			if ((allocflags & VM_ALLOC_SBUSY) != 0)
 				vm_page_sbusy(m);
 			return (m);
 		}
 	}
 	m = vm_page_alloc(object, pindex, allocflags);
 	if (m == NULL) {
 		if ((allocflags & VM_ALLOC_NOWAIT) != 0)
 			return (NULL);
 		VM_OBJECT_WUNLOCK(object);
 		VM_WAIT;
 		VM_OBJECT_WLOCK(object);
 		goto retrylookup;
 	}
 	if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 	return (m);
 }
 
 /*
  * Mapping function for valid or dirty bits in a page.
  *
  * Inputs are required to range within a page.
  */
 vm_page_bits_t
 vm_page_bits(int base, int size)
 {
 	int first_bit;
 	int last_bit;
 
 	KASSERT(
 	    base + size <= PAGE_SIZE,
 	    ("vm_page_bits: illegal base/size %d/%d", base, size)
 	);
 
 	if (size == 0)		/* handle degenerate case */
 		return (0);
 
 	first_bit = base >> DEV_BSHIFT;
 	last_bit = (base + size - 1) >> DEV_BSHIFT;
 
 	return (((vm_page_bits_t)2 << last_bit) -
 	    ((vm_page_bits_t)1 << first_bit));
 }
 
 /*
  *	vm_page_set_valid_range:
  *
  *	Sets portions of a page valid.  The arguments are expected
  *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
  *	of any partial chunks touched by the range.  The invalid portion of
  *	such chunks will be zeroed.
  *
  *	(base + size) must be less then or equal to PAGE_SIZE.
  */
 void
 vm_page_set_valid_range(vm_page_t m, int base, int size)
 {
 	int endoff, frag;
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (size == 0)	/* handle degenerate case */
 		return;
 
 	/*
 	 * If the base is not DEV_BSIZE aligned and the valid
 	 * bit is clear, we have to zero out a portion of the
 	 * first block.
 	 */
 	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
 	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, frag, base - frag);
 
 	/*
 	 * If the ending offset is not DEV_BSIZE aligned and the
 	 * valid bit is clear, we have to zero out a portion of
 	 * the last block.
 	 */
 	endoff = base + size;
 	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
 	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, endoff,
 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
 
 	/*
 	 * Assert that no previously invalid block that is now being validated
 	 * is already dirty.
 	 */
 	KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
 	    ("vm_page_set_valid_range: page %p is dirty", m));
 
 	/*
 	 * Set valid bits inclusive of any overlap.
 	 */
 	m->valid |= vm_page_bits(base, size);
 }
 
 /*
  * Clear the given bits from the specified page's dirty field.
  */
 static __inline void
 vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
 {
 	uintptr_t addr;
 #if PAGE_SIZE < 16384
 	int shift;
 #endif
 
 	/*
 	 * If the object is locked and the page is neither exclusive busy nor
 	 * write mapped, then the page's dirty field cannot possibly be
 	 * set by a concurrent pmap operation.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m))
 		m->dirty &= ~pagebits;
 	else {
 		/*
 		 * The pmap layer can call vm_page_dirty() without
 		 * holding a distinguished lock.  The combination of
 		 * the object's lock and an atomic operation suffice
 		 * to guarantee consistency of the page dirty field.
 		 *
 		 * For PAGE_SIZE == 32768 case, compiler already
 		 * properly aligns the dirty field, so no forcible
 		 * alignment is needed. Only require existence of
 		 * atomic_clear_64 when page size is 32768.
 		 */
 		addr = (uintptr_t)&m->dirty;
 #if PAGE_SIZE == 32768
 		atomic_clear_64((uint64_t *)addr, pagebits);
 #elif PAGE_SIZE == 16384
 		atomic_clear_32((uint32_t *)addr, pagebits);
 #else		/* PAGE_SIZE <= 8192 */
 		/*
 		 * Use a trick to perform a 32-bit atomic on the
 		 * containing aligned word, to not depend on the existence
 		 * of atomic_clear_{8, 16}.
 		 */
 		shift = addr & (sizeof(uint32_t) - 1);
 #if BYTE_ORDER == BIG_ENDIAN
 		shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY;
 #else
 		shift *= NBBY;
 #endif
 		addr &= ~(sizeof(uint32_t) - 1);
 		atomic_clear_32((uint32_t *)addr, pagebits << shift);
 #endif		/* PAGE_SIZE */
 	}
 }
 
 /*
  *	vm_page_set_validclean:
  *
  *	Sets portions of a page valid and clean.  The arguments are expected
  *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
  *	of any partial chunks touched by the range.  The invalid portion of
  *	such chunks will be zero'd.
  *
  *	(base + size) must be less then or equal to PAGE_SIZE.
  */
 void
 vm_page_set_validclean(vm_page_t m, int base, int size)
 {
 	vm_page_bits_t oldvalid, pagebits;
 	int endoff, frag;
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (size == 0)	/* handle degenerate case */
 		return;
 
 	/*
 	 * If the base is not DEV_BSIZE aligned and the valid
 	 * bit is clear, we have to zero out a portion of the
 	 * first block.
 	 */
 	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
 	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, frag, base - frag);
 
 	/*
 	 * If the ending offset is not DEV_BSIZE aligned and the
 	 * valid bit is clear, we have to zero out a portion of
 	 * the last block.
 	 */
 	endoff = base + size;
 	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
 	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, endoff,
 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
 
 	/*
 	 * Set valid, clear dirty bits.  If validating the entire
 	 * page we can safely clear the pmap modify bit.  We also
 	 * use this opportunity to clear the VPO_NOSYNC flag.  If a process
 	 * takes a write fault on a MAP_NOSYNC memory area the flag will
 	 * be set again.
 	 *
 	 * We set valid bits inclusive of any overlap, but we can only
 	 * clear dirty bits for DEV_BSIZE chunks that are fully within
 	 * the range.
 	 */
 	oldvalid = m->valid;
 	pagebits = vm_page_bits(base, size);
 	m->valid |= pagebits;
 #if 0	/* NOT YET */
 	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
 		frag = DEV_BSIZE - frag;
 		base += frag;
 		size -= frag;
 		if (size < 0)
 			size = 0;
 	}
 	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
 #endif
 	if (base == 0 && size == PAGE_SIZE) {
 		/*
 		 * The page can only be modified within the pmap if it is
 		 * mapped, and it can only be mapped if it was previously
 		 * fully valid.
 		 */
 		if (oldvalid == VM_PAGE_BITS_ALL)
 			/*
 			 * Perform the pmap_clear_modify() first.  Otherwise,
 			 * a concurrent pmap operation, such as
 			 * pmap_protect(), could clear a modification in the
 			 * pmap and set the dirty field on the page before
 			 * pmap_clear_modify() had begun and after the dirty
 			 * field was cleared here.
 			 */
 			pmap_clear_modify(m);
 		m->dirty = 0;
 		m->oflags &= ~VPO_NOSYNC;
 	} else if (oldvalid != VM_PAGE_BITS_ALL)
 		m->dirty &= ~pagebits;
 	else
 		vm_page_clear_dirty_mask(m, pagebits);
 }
 
 void
 vm_page_clear_dirty(vm_page_t m, int base, int size)
 {
 
 	vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
 }
 
 /*
  *	vm_page_set_invalid:
  *
  *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
  *	valid and dirty bits for the effected areas are cleared.
  */
 void
 vm_page_set_invalid(vm_page_t m, int base, int size)
 {
 	vm_page_bits_t bits;
 	vm_object_t object;
 
 	object = m->object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) +
 	    size >= object->un_pager.vnp.vnp_size)
 		bits = VM_PAGE_BITS_ALL;
 	else
 		bits = vm_page_bits(base, size);
 	if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL &&
 	    bits != 0)
 		pmap_remove_all(m);
 	KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) ||
 	    !pmap_page_is_mapped(m),
 	    ("vm_page_set_invalid: page %p is mapped", m));
 	m->valid &= ~bits;
 	m->dirty &= ~bits;
 }
 
 /*
  * vm_page_zero_invalid()
  *
  *	The kernel assumes that the invalid portions of a page contain
  *	garbage, but such pages can be mapped into memory by user code.
  *	When this occurs, we must zero out the non-valid portions of the
  *	page so user code sees what it expects.
  *
  *	Pages are most often semi-valid when the end of a file is mapped
  *	into memory and the file's size is not page aligned.
  */
 void
 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
 {
 	int b;
 	int i;
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	/*
 	 * Scan the valid bits looking for invalid sections that
 	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas ( where the
 	 * valid bit may be set ) have already been zeroed by
 	 * vm_page_set_validclean().
 	 */
 	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
 		if (i == (PAGE_SIZE / DEV_BSIZE) ||
 		    (m->valid & ((vm_page_bits_t)1 << i))) {
 			if (i > b) {
 				pmap_zero_page_area(m,
 				    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
 			}
 			b = i + 1;
 		}
 	}
 
 	/*
 	 * setvalid is TRUE when we can safely set the zero'd areas
 	 * as being valid.  We can do this if there are no cache consistancy
 	 * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
 	 */
 	if (setvalid)
 		m->valid = VM_PAGE_BITS_ALL;
 }
 
 /*
  *	vm_page_is_valid:
  *
  *	Is (partial) page valid?  Note that the case where size == 0
  *	will return FALSE in the degenerate case where the page is
  *	entirely invalid, and TRUE otherwise.
  */
 int
 vm_page_is_valid(vm_page_t m, int base, int size)
 {
 	vm_page_bits_t bits;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	bits = vm_page_bits(base, size);
 	return (m->valid != 0 && (m->valid & bits) == bits);
 }
 
 /*
  *	vm_page_ps_is_valid:
  *
  *	Returns TRUE if the entire (super)page is valid and FALSE otherwise.
  */
 boolean_t
 vm_page_ps_is_valid(vm_page_t m)
 {
 	int i, npages;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	npages = atop(pagesizes[m->psind]);
 
 	/*
 	 * The physically contiguous pages that make up a superpage, i.e., a
 	 * page with a page size index ("psind") greater than zero, will
 	 * occupy adjacent entries in vm_page_array[].
 	 */
 	for (i = 0; i < npages; i++) {
 		if (m[i].valid != VM_PAGE_BITS_ALL)
 			return (FALSE);
 	}
 	return (TRUE);
 }
 
 /*
  * Set the page's dirty bits if the page is modified.
  */
 void
 vm_page_test_dirty(vm_page_t m)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
 		vm_page_dirty(m);
 }
 
 void
 vm_page_lock_KBI(vm_page_t m, const char *file, int line)
 {
 
 	mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
 }
 
 void
 vm_page_unlock_KBI(vm_page_t m, const char *file, int line)
 {
 
 	mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
 }
 
 int
 vm_page_trylock_KBI(vm_page_t m, const char *file, int line)
 {
 
 	return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
 }
 
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 void
 vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line)
 {
 
 	vm_page_lock_assert_KBI(m, MA_OWNED, file, line);
 }
 
 void
 vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
 {
 
 	mtx_assert_(vm_page_lockptr(m), a, file, line);
 }
 #endif
 
 #ifdef INVARIANTS
 void
 vm_page_object_lock_assert(vm_page_t m)
 {
 
 	/*
 	 * Certain of the page's fields may only be modified by the
 	 * holder of the containing object's lock or the exclusive busy.
 	 * holder.  Unfortunately, the holder of the write busy is
 	 * not recorded, and thus cannot be checked here.
 	 */
 	if (m->object != NULL && !vm_page_xbusied(m))
 		VM_OBJECT_ASSERT_WLOCKED(m->object);
 }
 
 void
 vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits)
 {
 
 	if ((bits & PGA_WRITEABLE) == 0)
 		return;
 
 	/*
 	 * The PGA_WRITEABLE flag can only be set if the page is
 	 * managed, is exclusively busied or the object is locked.
 	 * Currently, this flag is only set by pmap_enter().
 	 */
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("PGA_WRITEABLE on unmanaged page"));
 	if (!vm_page_xbusied(m))
 		VM_OBJECT_ASSERT_LOCKED(m->object);
 }
 #endif
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <ddb/ddb.h>
 
 DB_SHOW_COMMAND(page, vm_page_print_page_info)
 {
 
 	db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count);
 	db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count);
 	db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count);
 	db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count);
 	db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count);
 	db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
 	db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
 	db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target);
 	db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target);
 }
 
 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
 {
 	int dom;
 
 	db_printf("pq_free %d\n", vm_cnt.v_free_count);
 	for (dom = 0; dom < vm_ndomains; dom++) {
 		db_printf(
     "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n",
 		    dom,
 		    vm_dom[dom].vmd_page_count,
 		    vm_dom[dom].vmd_free_count,
 		    vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
 		    vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
 		    vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt,
 		    vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt);
 	}
 }
 
 DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
 {
 	vm_page_t m;
 	boolean_t phys;
 
 	if (!have_addr) {
 		db_printf("show pginfo addr\n");
 		return;
 	}
 
 	phys = strchr(modif, 'p') != NULL;
 	if (phys)
 		m = PHYS_TO_VM_PAGE(addr);
 	else
 		m = (vm_page_t)addr;
 	db_printf(
     "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n"
     "  af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n",
 	    m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
 	    m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags,
 	    m->flags, m->act_count, m->busy_lock, m->valid, m->dirty);
 }
 #endif /* DDB */
Index: projects/netbsd-tests-upstream-01-2017/tests/sys/kern/acct/Makefile
===================================================================
--- projects/netbsd-tests-upstream-01-2017/tests/sys/kern/acct/Makefile	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/tests/sys/kern/acct/Makefile	(revision 312218)
@@ -1,19 +1,20 @@
 # $FreeBSD$
 
 TESTSDIR=	${TESTSBASE}/sys/kern/acct
 
 ATF_TESTS_C=	acct_test
 
 CFLAGS+=	-I${.OBJDIR}
 
 CLEANFILES+=	convert.c convert.c.tmp
 
 DPSRCS.acct_test=	convert.c
 acct_test.o: convert.c
 
 convert.c: ${SRCTOP}/sys/kern/kern_acct.c
 	sed -n -e 's/log(/syslog(/g' \
+	       -e 's/exp/expected/g' \
 	       -e '/FLOAT_CONVERSION_START/,/FLOAT_CONVERSION_END/p' ${.ALLSRC} >${.TARGET}.tmp
 	mv ${.TARGET}.tmp ${.TARGET}
 
 .include <bsd.test.mk>
Index: projects/netbsd-tests-upstream-01-2017/tests/sys/mac/bsdextended/ugidfw_test.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/tests/sys/mac/bsdextended/ugidfw_test.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/tests/sys/mac/bsdextended/ugidfw_test.c	(revision 312218)
@@ -1,253 +1,253 @@
 /*-
  * Copyright (c) 2005 McAfee, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/mac.h>
 #include <sys/mount.h>
 
 #include <security/mac_bsdextended/mac_bsdextended.h>
 
 #include <err.h>
 #include <errno.h>
 #include <grp.h>
 #include <pwd.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ugidfw.h>
 #include <unistd.h>
 
 /*
  * Starting point for a regression test for mac_bsdextended(4) and the
  * supporting libugidfw(3).
  */
 
 /*
  * This section of the regression test passes some test cases through the
  * rule<->string routines to confirm they work approximately as desired.
  */
 
 /*
  * List of users and groups we must check exists before we can begin, since
  * they are used in the string test rules.  We use users and groups that will
  * always exist in a default install used for regression testing.
  */
 static const char *test_users[] = {
 	"root",
 	"daemon",
 	"operator",
 	"bin",
 };
 
 static const char *test_groups[] = {
 	"wheel",
 	"daemon",
 	"operator",
 	"bin",
 };
 
 static int test_num;
 
 /*
  * List of test strings that must go in (and come out) of libugidfw intact.
  */
 static const char *test_strings[] = {
 	/* Variations on subject and object uids. */
 	"subject uid root object uid root mode n",
 	"subject uid root object uid daemon mode n",
 	"subject uid daemon object uid root mode n",
 	"subject uid daemon object uid daemon mode n",
 	/* Variations on mode. */
 	"subject uid root object uid root mode a",
 	"subject uid root object uid root mode r",
 	"subject uid root object uid root mode s",
 	"subject uid root object uid root mode w",
 	"subject uid root object uid root mode x",
 	"subject uid root object uid root mode arswx",
 	/* Variations on subject and object gids. */
 	"subject gid wheel object gid wheel mode n",
 	"subject gid wheel object gid daemon mode n",
 	"subject gid daemon object gid wheel mode n",
 	"subject gid daemon object gid daemon mode n",
 	/* Subject uids and subject gids. */
 	"subject uid bin gid daemon object uid operator gid wheel mode n",
 	/* Not */
 	"subject not uid operator object uid bin mode n",
 	"subject uid bin object not uid operator mode n",
 	"subject not uid daemon object not uid operator mode n",
 	/* Ranges */
 	"subject uid root:operator object gid wheel:bin mode n",
 	/* Jail ID */
 	"subject jailid 1 object uid root mode n",
 	/* Filesys */
 	"subject uid root object filesys / mode n",
 	"subject uid root object filesys /dev mode n",
 	/* S/UGID */
 	"subject not uid root object sgid mode n",
 	"subject not uid root object sgid mode n",
 	/* Matching uid/gid */
 	"subject not uid root:operator object not uid_of_subject mode n",
 	"subject not gid wheel:bin object not gid_of_subject mode n",
 	/* Object types */
 	"subject uid root object type a mode a",
 	"subject uid root object type r mode a",
 	"subject uid root object type d mode a",
 	"subject uid root object type b mode a",
 	"subject uid root object type c mode a",
 	"subject uid root object type l mode a",
 	"subject uid root object type s mode a",
 	"subject uid root object type rbc mode a",
 	"subject uid root object type dls mode a",
 	/* Empty rules always match */
 	"subject object mode a",
 	/* Partial negations */
 	"subject ! uid root object mode n",
 	"subject ! gid wheel object mode n",
 	"subject ! jailid 2 object mode n",
 	"subject object ! uid root mode n",
 	"subject object ! gid wheel mode n",
 	"subject object ! filesys / mode n",
 	"subject object ! suid mode n",
 	"subject object ! sgid mode n",
 	"subject object ! uid_of_subject mode n",
 	"subject object ! gid_of_subject mode n",
 	"subject object ! type d mode n",
 	/* All out nonsense */
 	"subject uid root ! gid wheel:bin ! jailid 1 "
 	    "object ! uid root:daemon gid daemon filesys / suid sgid uid_of_subject gid_of_subject ! type r "
 	    "mode rsx",
 };
 
 static void
 test_libugidfw_strings(void)
 {
 	struct mac_bsdextended_rule rule;
 	char errorstr[256];
 	char rulestr[256];
 	size_t i;
 	int error;
 
 	for (i = 0; i < nitems(test_users); i++, test_num++) {
 		if (getpwnam(test_users[i]) == NULL)
 			printf("not ok %d # test_libugidfw_strings: getpwnam(%s) "
 			    "failed: %s\n", test_num, test_users[i], strerror(errno));
 		else
 			printf("ok %d\n", test_num);
 	}
 
 	for (i = 0; i < nitems(test_groups); i++, test_num++) {
 		if (getgrnam(test_groups[i]) == NULL)
 			printf("not ok %d # test_libugidfw_strings: getgrnam(%s) "
 			    "failed: %s\n", test_num, test_groups[i], strerror(errno));
 		else
 			printf("ok %d\n", test_num);
 	}
 
 	for (i = 0; i < nitems(test_strings); i++) {
 		error = bsde_parse_rule_string(test_strings[i], &rule,
 		    sizeof(errorstr), errorstr);
 		if (error == -1)
 			printf("not ok %d # bsde_parse_rule_string: '%s' (%zu) "
 			    "failed: %s\n", test_num, test_strings[i], i, errorstr);
 		else
 			printf("ok %d\n", test_num);
 		test_num++;
 
 		error = bsde_rule_to_string(&rule, rulestr, sizeof(rulestr));
 		if (error < 0)
 			printf("not ok %d # bsde_rule_to_string: rule for '%s' "
 			    "returned %d\n", test_num, test_strings[i], error);
 		else
 			printf("ok %d\n", test_num);
 		test_num++;
 
 		if (strcmp(test_strings[i], rulestr) != 0)
 			printf("not ok %d # test_libugidfw: '%s' in, '%s' "
 			    "out\n", test_num, test_strings[i], rulestr);
 		else
 			printf("ok %d\n", test_num);
 		test_num++;
 	}
 }
 
 int
 main(void)
 {
 	char errorstr[256];
 	int count, slots;
 
 	test_num = 1;
 
 	/* Print an error if a non-root user attemps to run the tests. */
 	if (getuid() != 0) {
 		printf("1..0 # SKIP you must be root\n");
 		return (0);
 	}
 
 	switch (mac_is_present("bsdextended")) {
 	case -1:
 		printf("1..0 # SKIP mac_is_present failed: %s\n",
 		    strerror(errno));
 		return (0);
 	case 1:
 		break;
 	case 0:
 	default:
 		printf("1..0 # SKIP mac_bsdextended not loaded\n");
 		return (0);
 	}
 
-	printf("1..%lu\n", nitems(test_users) + nitems(test_groups) +
+	printf("1..%zu\n", nitems(test_users) + nitems(test_groups) +
 	    3 * nitems(test_strings) + 2);
 
 	test_libugidfw_strings();
 
 	/*
 	 * Some simple up-front checks to see if we're able to query the
 	 * policy for basic state.  We want the rule count to be 0 before
 	 * starting, but "slots" is a property of prior runs and so we ignore
 	 * the return value.
 	 */
 	count = bsde_get_rule_count(sizeof(errorstr), errorstr);
 	if (count == -1)
 		printf("not ok %d # bsde_get_rule_count: %s\n", test_num,
 		    errorstr);
 	else
 		printf("ok %d\n", test_num);
 
 	test_num++;
 
 	slots = bsde_get_rule_slots(sizeof(errorstr), errorstr);
 	if (slots == -1)
 		printf("not ok %d # bsde_get_rule_slots: %s\n", test_num,
 		    errorstr);
 	else
 		printf("ok %d\n", test_num);
 
 	return (0);
 }
Index: projects/netbsd-tests-upstream-01-2017/tests/sys/vfs/lookup_cap_dotdot.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/tests/sys/vfs/lookup_cap_dotdot.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/tests/sys/vfs/lookup_cap_dotdot.c	(revision 312218)
@@ -1,248 +1,256 @@
 /*-
  * Copyright (c) 2016 Ed Maste <emaste@FreeBSD.org>
  * Copyright (c) 2016 Conrad Meyer <cem@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/sysctl.h>
 #include <sys/stat.h>
 
 #include <atf-c.h>
 #include <errno.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "freebsd_test_suite/macros.h"
 
 static int dirfd = -1;
 static char *abspath;
 
 static void
 touchat(int _dirfd, const char *name)
 {
 	int fd;
 
 	ATF_REQUIRE((fd = openat(_dirfd, name, O_CREAT | O_TRUNC | O_WRONLY,
 	    0777)) >= 0);
 	ATF_REQUIRE(close(fd) == 0);
 }
 
 static void
 prepare_dotdot_tests(void)
 {
 	char cwd[MAXPATHLEN];
 
 	ATF_REQUIRE(getcwd(cwd, sizeof(cwd)) != NULL);
 	asprintf(&abspath, "%s/testdir/d1/f1", cwd);
 
 	ATF_REQUIRE(mkdir("testdir", 0777) == 0);
 	ATF_REQUIRE((dirfd = open("testdir", O_RDONLY)) >= 0);
 
 	ATF_REQUIRE(mkdirat(dirfd, "d1", 0777) == 0);
 	ATF_REQUIRE(mkdirat(dirfd, "d1/d2", 0777) == 0);
 	ATF_REQUIRE(mkdirat(dirfd, "d1/d2/d3", 0777) == 0);
 	touchat(dirfd, "d1/f1");
 	touchat(dirfd, "d1/d2/f2");
 	touchat(dirfd, "d1/d2/d3/f3");
 	ATF_REQUIRE(symlinkat("d1/d2/d3", dirfd, "l3") == 0);
 	ATF_REQUIRE(symlinkat("../testdir/d1", dirfd, "lup") == 0);
 	ATF_REQUIRE(symlinkat("../..", dirfd, "d1/d2/d3/ld1") == 0);
 	ATF_REQUIRE(symlinkat("../../f1", dirfd, "d1/d2/d3/lf1") == 0);
 }
 
 static void
 check_capsicum(void)
 {
 	ATF_REQUIRE_FEATURE("security_capabilities");
 	ATF_REQUIRE_FEATURE("security_capability_mode");
 }
 
 /*
  * Positive tests
  */
 ATF_TC(openat__basic_positive);
 ATF_TC_HEAD(openat__basic_positive, tc)
 {
 	atf_tc_set_md_var(tc, "descr", "Basic positive openat testcases");
 }
 
 ATF_TC_BODY(openat__basic_positive, tc)
 {
 	prepare_dotdot_tests();
 
 	ATF_REQUIRE(openat(dirfd, "d1/d2/d3/f3", O_RDONLY) >= 0);
 	ATF_REQUIRE(openat(dirfd, "d1/d2/d3/../../f1", O_RDONLY) >= 0);
 	ATF_REQUIRE(openat(dirfd, "l3/f3", O_RDONLY) >= 0);
 	ATF_REQUIRE(openat(dirfd, "l3/../../f1", O_RDONLY) >= 0);
 	ATF_REQUIRE(openat(dirfd, "../testdir/d1/f1", O_RDONLY) >= 0);
 	ATF_REQUIRE(openat(dirfd, "lup/f1", O_RDONLY) >= 0);
 	ATF_REQUIRE(openat(dirfd, "l3/ld1", O_RDONLY) >= 0);
 	ATF_REQUIRE(openat(dirfd, "l3/lf1", O_RDONLY) >= 0);
 	ATF_REQUIRE(open(abspath, O_RDONLY) >= 0);
 	ATF_REQUIRE(openat(dirfd, abspath, O_RDONLY) >= 0);
 }
 
 ATF_TC(lookup_cap_dotdot__basic);
 ATF_TC_HEAD(lookup_cap_dotdot__basic, tc)
 {
 	atf_tc_set_md_var(tc, "descr",
 	    "Validate cap-mode (testdir)/d1/.. lookup");
 }
 
 ATF_TC_BODY(lookup_cap_dotdot__basic, tc)
 {
 	cap_rights_t rights;
 
 	check_capsicum();
 	prepare_dotdot_tests();
 
 	cap_rights_init(&rights, CAP_LOOKUP, CAP_READ);
 	ATF_REQUIRE(cap_rights_limit(dirfd, &rights) >= 0);
 
+	atf_tc_expect_signal(SIGABRT, "needs change done upstream in atf/kyua according to cem: bug 215690");
+
 	ATF_REQUIRE(cap_enter() >= 0);
 
 	ATF_REQUIRE_MSG(openat(dirfd, "d1/..", O_RDONLY) >= 0, "%s",
 	    strerror(errno));
 }
 
 ATF_TC(lookup_cap_dotdot__advanced);
 ATF_TC_HEAD(lookup_cap_dotdot__advanced, tc)
 {
 	atf_tc_set_md_var(tc, "descr",
 	    "Validate cap-mode (testdir)/d1/.. lookup");
 }
 
 ATF_TC_BODY(lookup_cap_dotdot__advanced, tc)
 {
 	cap_rights_t rights;
 
 	check_capsicum();
 	prepare_dotdot_tests();
 
+	atf_tc_expect_signal(SIGABRT, "needs change done upstream in atf/kyua according to cem: bug 215690");
+
 	cap_rights_init(&rights, CAP_LOOKUP, CAP_READ);
 	ATF_REQUIRE(cap_rights_limit(dirfd, &rights) >= 0);
 
 	ATF_REQUIRE(cap_enter() >= 0);
 
 	ATF_REQUIRE(openat(dirfd, "d1/d2/d3/../../f1", O_RDONLY) >= 0);
 	ATF_REQUIRE(openat(dirfd, "l3/../../f1", O_RDONLY) >= 0);
 	ATF_REQUIRE(openat(dirfd, "l3/ld1", O_RDONLY) >= 0);
 	ATF_REQUIRE(openat(dirfd, "l3/lf1", O_RDONLY) >= 0);
 }
 
 /*
  * Negative tests
  */
 ATF_TC(openat__basic_negative);
 ATF_TC_HEAD(openat__basic_negative, tc)
 {
 	atf_tc_set_md_var(tc, "descr", "Basic negative openat testcases");
 }
 
 ATF_TC_BODY(openat__basic_negative, tc)
 {
 	prepare_dotdot_tests();
 
 	ATF_REQUIRE_ERRNO(ENOENT,
 	    openat(dirfd, "does-not-exist", O_RDONLY) < 0);
 	ATF_REQUIRE_ERRNO(ENOENT,
 	    openat(dirfd, "l3/does-not-exist", O_RDONLY) < 0);
 }
 
 ATF_TC(capmode__negative);
 ATF_TC_HEAD(capmode__negative, tc)
 {
 	atf_tc_set_md_var(tc, "descr", "Negative Capability mode testcases");
 }
 
 ATF_TC_BODY(capmode__negative, tc)
 {
 	int subdirfd;
 
 	check_capsicum();
 	prepare_dotdot_tests();
 
+	atf_tc_expect_signal(SIGABRT, "needs change done upstream in atf/kyua according to cem: bug 215690");
+
 	ATF_REQUIRE(cap_enter() == 0);
 
 	/* open() not permitted in capability mode */
 	ATF_REQUIRE_ERRNO(ECAPMODE, open("testdir", O_RDONLY) < 0);
 
 	/* AT_FDCWD not permitted in capability mode */
 	ATF_REQUIRE_ERRNO(ECAPMODE, openat(AT_FDCWD, "d1/f1", O_RDONLY) < 0);
 
 	/* Relative path above dirfd not capable */
 	ATF_REQUIRE_ERRNO(ENOTCAPABLE, openat(dirfd, "..", O_RDONLY) < 0);
 	ATF_REQUIRE((subdirfd = openat(dirfd, "l3", O_RDONLY)) >= 0);
 	ATF_REQUIRE_ERRNO(ENOTCAPABLE,
 	    openat(subdirfd, "../../f1", O_RDONLY) < 0);
 
 	/* Absolute paths not capable */
 	ATF_REQUIRE_ERRNO(ENOTCAPABLE, openat(dirfd, abspath, O_RDONLY) < 0);
 
 	/* Symlink above dirfd */
 	ATF_REQUIRE_ERRNO(ENOTCAPABLE, openat(dirfd, "lup/f1", O_RDONLY) < 0);
 }
 
 ATF_TC(lookup_cap_dotdot__negative);
 ATF_TC_HEAD(lookup_cap_dotdot__negative, tc)
 {
 	atf_tc_set_md_var(tc, "descr",
 	    "Validate cap-mode (testdir)/.. lookup fails");
 }
 
 ATF_TC_BODY(lookup_cap_dotdot__negative, tc)
 {
 	cap_rights_t rights;
 
 	check_capsicum();
 	prepare_dotdot_tests();
 
 	cap_rights_init(&rights, CAP_LOOKUP, CAP_READ);
 	ATF_REQUIRE(cap_rights_limit(dirfd, &rights) >= 0);
+
+	atf_tc_expect_signal(SIGABRT, "needs change done upstream in atf/kyua according to cem: bug 215690");
 
 	ATF_REQUIRE(cap_enter() >= 0);
 
 	ATF_REQUIRE_ERRNO(ENOTCAPABLE, openat(dirfd, "..", O_RDONLY) < 0);
 	ATF_REQUIRE_ERRNO(ENOTCAPABLE, openat(dirfd, "d1/../..", O_RDONLY) < 0);
 	ATF_REQUIRE_ERRNO(ENOTCAPABLE, openat(dirfd, "../testdir/d1/f1", O_RDONLY) < 0);
 }
 
 ATF_TP_ADD_TCS(tp)
 {
 
 	ATF_TP_ADD_TC(tp, openat__basic_positive);
 	ATF_TP_ADD_TC(tp, openat__basic_negative);
 
 	ATF_TP_ADD_TC(tp, capmode__negative);
 
 	ATF_TP_ADD_TC(tp, lookup_cap_dotdot__basic);
 	ATF_TP_ADD_TC(tp, lookup_cap_dotdot__advanced);
 	ATF_TP_ADD_TC(tp, lookup_cap_dotdot__negative);
 
 	return (atf_no_error());
 }
Index: projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/ctld.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/ctld.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/ctld.c	(revision 312218)
@@ -1,2729 +1,2730 @@
 /*-
  * Copyright (c) 2012 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/wait.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
 #include <netdb.h>
 #include <signal.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
 #include "ctld.h"
 #include "isns.h"
 
 bool proxy_mode = false;
 
 static volatile bool sighup_received = false;
 static volatile bool sigterm_received = false;
 static volatile bool sigalrm_received = false;
 
 static int nchildren = 0;
 static uint16_t last_portal_group_tag = 0xff;
 
 static void
 usage(void)
 {
 
 	fprintf(stderr, "usage: ctld [-d][-u][-f config-file]\n");
 	exit(1);
 }
 
 char *
 checked_strdup(const char *s)
 {
 	char *c;
 
 	c = strdup(s);
 	if (c == NULL)
 		log_err(1, "strdup");
 	return (c);
 }
 
 struct conf *
 conf_new(void)
 {
 	struct conf *conf;
 
 	conf = calloc(1, sizeof(*conf));
 	if (conf == NULL)
 		log_err(1, "calloc");
 	TAILQ_INIT(&conf->conf_luns);
 	TAILQ_INIT(&conf->conf_targets);
 	TAILQ_INIT(&conf->conf_auth_groups);
 	TAILQ_INIT(&conf->conf_ports);
 	TAILQ_INIT(&conf->conf_portal_groups);
 	TAILQ_INIT(&conf->conf_pports);
 	TAILQ_INIT(&conf->conf_isns);
 
 	conf->conf_isns_period = 900;
 	conf->conf_isns_timeout = 5;
 	conf->conf_debug = 0;
 	conf->conf_timeout = 60;
 	conf->conf_maxproc = 30;
 
 	return (conf);
 }
 
 void
 conf_delete(struct conf *conf)
 {
 	struct lun *lun, *ltmp;
 	struct target *targ, *tmp;
 	struct auth_group *ag, *cagtmp;
 	struct portal_group *pg, *cpgtmp;
 	struct pport *pp, *pptmp;
 	struct isns *is, *istmp;
 
 	assert(conf->conf_pidfh == NULL);
 
 	TAILQ_FOREACH_SAFE(lun, &conf->conf_luns, l_next, ltmp)
 		lun_delete(lun);
 	TAILQ_FOREACH_SAFE(targ, &conf->conf_targets, t_next, tmp)
 		target_delete(targ);
 	TAILQ_FOREACH_SAFE(ag, &conf->conf_auth_groups, ag_next, cagtmp)
 		auth_group_delete(ag);
 	TAILQ_FOREACH_SAFE(pg, &conf->conf_portal_groups, pg_next, cpgtmp)
 		portal_group_delete(pg);
 	TAILQ_FOREACH_SAFE(pp, &conf->conf_pports, pp_next, pptmp)
 		pport_delete(pp);
 	TAILQ_FOREACH_SAFE(is, &conf->conf_isns, i_next, istmp)
 		isns_delete(is);
 	assert(TAILQ_EMPTY(&conf->conf_ports));
 	free(conf->conf_pidfile_path);
 	free(conf);
 }
 
 static struct auth *
 auth_new(struct auth_group *ag)
 {
 	struct auth *auth;
 
 	auth = calloc(1, sizeof(*auth));
 	if (auth == NULL)
 		log_err(1, "calloc");
 	auth->a_auth_group = ag;
 	TAILQ_INSERT_TAIL(&ag->ag_auths, auth, a_next);
 	return (auth);
 }
 
 static void
 auth_delete(struct auth *auth)
 {
 	TAILQ_REMOVE(&auth->a_auth_group->ag_auths, auth, a_next);
 
 	free(auth->a_user);
 	free(auth->a_secret);
 	free(auth->a_mutual_user);
 	free(auth->a_mutual_secret);
 	free(auth);
 }
 
 const struct auth *
 auth_find(const struct auth_group *ag, const char *user)
 {
 	const struct auth *auth;
 
 	TAILQ_FOREACH(auth, &ag->ag_auths, a_next) {
 		if (strcmp(auth->a_user, user) == 0)
 			return (auth);
 	}
 
 	return (NULL);
 }
 
 static void
 auth_check_secret_length(struct auth *auth)
 {
 	size_t len;
 
 	len = strlen(auth->a_secret);
 	if (len > 16) {
 		if (auth->a_auth_group->ag_name != NULL)
 			log_warnx("secret for user \"%s\", auth-group \"%s\", "
 			    "is too long; it should be at most 16 characters "
 			    "long", auth->a_user, auth->a_auth_group->ag_name);
 		else
 			log_warnx("secret for user \"%s\", target \"%s\", "
 			    "is too long; it should be at most 16 characters "
 			    "long", auth->a_user,
 			    auth->a_auth_group->ag_target->t_name);
 	}
 	if (len < 12) {
 		if (auth->a_auth_group->ag_name != NULL)
 			log_warnx("secret for user \"%s\", auth-group \"%s\", "
 			    "is too short; it should be at least 12 characters "
 			    "long", auth->a_user,
 			    auth->a_auth_group->ag_name);
 		else
 			log_warnx("secret for user \"%s\", target \"%s\", "
 			    "is too short; it should be at least 12 characters "
 			    "long", auth->a_user,
 			    auth->a_auth_group->ag_target->t_name);
 	}
 
 	if (auth->a_mutual_secret != NULL) {
 		len = strlen(auth->a_mutual_secret);
 		if (len > 16) {
 			if (auth->a_auth_group->ag_name != NULL)
 				log_warnx("mutual secret for user \"%s\", "
 				    "auth-group \"%s\", is too long; it should "
 				    "be at most 16 characters long",
 				    auth->a_user, auth->a_auth_group->ag_name);
 			else
 				log_warnx("mutual secret for user \"%s\", "
 				    "target \"%s\", is too long; it should "
 				    "be at most 16 characters long",
 				    auth->a_user,
 				    auth->a_auth_group->ag_target->t_name);
 		}
 		if (len < 12) {
 			if (auth->a_auth_group->ag_name != NULL)
 				log_warnx("mutual secret for user \"%s\", "
 				    "auth-group \"%s\", is too short; it "
 				    "should be at least 12 characters long",
 				    auth->a_user, auth->a_auth_group->ag_name);
 			else
 				log_warnx("mutual secret for user \"%s\", "
 				    "target \"%s\", is too short; it should be "
 				    "at least 12 characters long",
 				    auth->a_user,
 				    auth->a_auth_group->ag_target->t_name);
 		}
 	}
 }
 
 const struct auth *
 auth_new_chap(struct auth_group *ag, const char *user,
     const char *secret)
 {
 	struct auth *auth;
 
 	if (ag->ag_type == AG_TYPE_UNKNOWN)
 		ag->ag_type = AG_TYPE_CHAP;
 	if (ag->ag_type != AG_TYPE_CHAP) {
 		if (ag->ag_name != NULL)
 			log_warnx("cannot mix \"chap\" authentication with "
 			    "other types for auth-group \"%s\"", ag->ag_name);
 		else
 			log_warnx("cannot mix \"chap\" authentication with "
 			    "other types for target \"%s\"",
 			    ag->ag_target->t_name);
 		return (NULL);
 	}
 
 	auth = auth_new(ag);
 	auth->a_user = checked_strdup(user);
 	auth->a_secret = checked_strdup(secret);
 
 	auth_check_secret_length(auth);
 
 	return (auth);
 }
 
 const struct auth *
 auth_new_chap_mutual(struct auth_group *ag, const char *user,
     const char *secret, const char *user2, const char *secret2)
 {
 	struct auth *auth;
 
 	if (ag->ag_type == AG_TYPE_UNKNOWN)
 		ag->ag_type = AG_TYPE_CHAP_MUTUAL;
 	if (ag->ag_type != AG_TYPE_CHAP_MUTUAL) {
 		if (ag->ag_name != NULL)
 			log_warnx("cannot mix \"chap-mutual\" authentication "
 			    "with other types for auth-group \"%s\"",
 			    ag->ag_name);
 		else
 			log_warnx("cannot mix \"chap-mutual\" authentication "
 			    "with other types for target \"%s\"",
 			    ag->ag_target->t_name);
 		return (NULL);
 	}
 
 	auth = auth_new(ag);
 	auth->a_user = checked_strdup(user);
 	auth->a_secret = checked_strdup(secret);
 	auth->a_mutual_user = checked_strdup(user2);
 	auth->a_mutual_secret = checked_strdup(secret2);
 
 	auth_check_secret_length(auth);
 
 	return (auth);
 }
 
 const struct auth_name *
 auth_name_new(struct auth_group *ag, const char *name)
 {
 	struct auth_name *an;
 
 	an = calloc(1, sizeof(*an));
 	if (an == NULL)
 		log_err(1, "calloc");
 	an->an_auth_group = ag;
 	an->an_initator_name = checked_strdup(name);
 	TAILQ_INSERT_TAIL(&ag->ag_names, an, an_next);
 	return (an);
 }
 
 static void
 auth_name_delete(struct auth_name *an)
 {
 	TAILQ_REMOVE(&an->an_auth_group->ag_names, an, an_next);
 
 	free(an->an_initator_name);
 	free(an);
 }
 
 bool
 auth_name_defined(const struct auth_group *ag)
 {
 	if (TAILQ_EMPTY(&ag->ag_names))
 		return (false);
 	return (true);
 }
 
 const struct auth_name *
 auth_name_find(const struct auth_group *ag, const char *name)
 {
 	const struct auth_name *auth_name;
 
 	TAILQ_FOREACH(auth_name, &ag->ag_names, an_next) {
 		if (strcmp(auth_name->an_initator_name, name) == 0)
 			return (auth_name);
 	}
 
 	return (NULL);
 }
 
 int
 auth_name_check(const struct auth_group *ag, const char *initiator_name)
 {
 	if (!auth_name_defined(ag))
 		return (0);
 
 	if (auth_name_find(ag, initiator_name) == NULL)
 		return (1);
 
 	return (0);
 }
 
 const struct auth_portal *
 auth_portal_new(struct auth_group *ag, const char *portal)
 {
 	struct auth_portal *ap;
 	char *net, *mask, *str, *tmp;
 	int len, dm, m;
 
 	ap = calloc(1, sizeof(*ap));
 	if (ap == NULL)
 		log_err(1, "calloc");
 	ap->ap_auth_group = ag;
 	ap->ap_initator_portal = checked_strdup(portal);
 	mask = str = checked_strdup(portal);
 	net = strsep(&mask, "/");
 	if (net[0] == '[')
 		net++;
 	len = strlen(net);
 	if (len == 0)
 		goto error;
 	if (net[len - 1] == ']')
 		net[len - 1] = 0;
 	if (strchr(net, ':') != NULL) {
 		struct sockaddr_in6 *sin6 =
 		    (struct sockaddr_in6 *)&ap->ap_sa;
 
 		sin6->sin6_len = sizeof(*sin6);
 		sin6->sin6_family = AF_INET6;
 		if (inet_pton(AF_INET6, net, &sin6->sin6_addr) <= 0)
 			goto error;
 		dm = 128;
 	} else {
 		struct sockaddr_in *sin =
 		    (struct sockaddr_in *)&ap->ap_sa;
 
 		sin->sin_len = sizeof(*sin);
 		sin->sin_family = AF_INET;
 		if (inet_pton(AF_INET, net, &sin->sin_addr) <= 0)
 			goto error;
 		dm = 32;
 	}
 	if (mask != NULL) {
 		m = strtol(mask, &tmp, 0);
 		if (m < 0 || m > dm || tmp[0] != 0)
 			goto error;
 	} else
 		m = dm;
 	ap->ap_mask = m;
 	free(str);
 	TAILQ_INSERT_TAIL(&ag->ag_portals, ap, ap_next);
 	return (ap);
 
 error:
 	free(str);
 	free(ap);
 	log_warnx("incorrect initiator portal \"%s\"", portal);
 	return (NULL);
 }
 
 static void
 auth_portal_delete(struct auth_portal *ap)
 {
 	TAILQ_REMOVE(&ap->ap_auth_group->ag_portals, ap, ap_next);
 
 	free(ap->ap_initator_portal);
 	free(ap);
 }
 
 bool
 auth_portal_defined(const struct auth_group *ag)
 {
 	if (TAILQ_EMPTY(&ag->ag_portals))
 		return (false);
 	return (true);
 }
 
 const struct auth_portal *
 auth_portal_find(const struct auth_group *ag, const struct sockaddr_storage *ss)
 {
 	const struct auth_portal *ap;
 	const uint8_t *a, *b;
 	int i;
 	uint8_t bmask;
 
 	TAILQ_FOREACH(ap, &ag->ag_portals, ap_next) {
 		if (ap->ap_sa.ss_family != ss->ss_family)
 			continue;
 		if (ss->ss_family == AF_INET) {
 			a = (const uint8_t *)
 			    &((const struct sockaddr_in *)ss)->sin_addr;
 			b = (const uint8_t *)
 			    &((const struct sockaddr_in *)&ap->ap_sa)->sin_addr;
 		} else {
 			a = (const uint8_t *)
 			    &((const struct sockaddr_in6 *)ss)->sin6_addr;
 			b = (const uint8_t *)
 			    &((const struct sockaddr_in6 *)&ap->ap_sa)->sin6_addr;
 		}
 		for (i = 0; i < ap->ap_mask / 8; i++) {
 			if (a[i] != b[i])
 				goto next;
 		}
 		if (ap->ap_mask % 8) {
 			bmask = 0xff << (8 - (ap->ap_mask % 8));
 			if ((a[i] & bmask) != (b[i] & bmask))
 				goto next;
 		}
 		return (ap);
 next:
 		;
 	}
 
 	return (NULL);
 }
 
 int
 auth_portal_check(const struct auth_group *ag, const struct sockaddr_storage *sa)
 {
 
 	if (!auth_portal_defined(ag))
 		return (0);
 
 	if (auth_portal_find(ag, sa) == NULL)
 		return (1);
 
 	return (0);
 }
 
 struct auth_group *
 auth_group_new(struct conf *conf, const char *name)
 {
 	struct auth_group *ag;
 
 	if (name != NULL) {
 		ag = auth_group_find(conf, name);
 		if (ag != NULL) {
 			log_warnx("duplicated auth-group \"%s\"", name);
 			return (NULL);
 		}
 	}
 
 	ag = calloc(1, sizeof(*ag));
 	if (ag == NULL)
 		log_err(1, "calloc");
 	if (name != NULL)
 		ag->ag_name = checked_strdup(name);
 	TAILQ_INIT(&ag->ag_auths);
 	TAILQ_INIT(&ag->ag_names);
 	TAILQ_INIT(&ag->ag_portals);
 	ag->ag_conf = conf;
 	TAILQ_INSERT_TAIL(&conf->conf_auth_groups, ag, ag_next);
 
 	return (ag);
 }
 
 void
 auth_group_delete(struct auth_group *ag)
 {
 	struct auth *auth, *auth_tmp;
 	struct auth_name *auth_name, *auth_name_tmp;
 	struct auth_portal *auth_portal, *auth_portal_tmp;
 
 	TAILQ_REMOVE(&ag->ag_conf->conf_auth_groups, ag, ag_next);
 
 	TAILQ_FOREACH_SAFE(auth, &ag->ag_auths, a_next, auth_tmp)
 		auth_delete(auth);
 	TAILQ_FOREACH_SAFE(auth_name, &ag->ag_names, an_next, auth_name_tmp)
 		auth_name_delete(auth_name);
 	TAILQ_FOREACH_SAFE(auth_portal, &ag->ag_portals, ap_next,
 	    auth_portal_tmp)
 		auth_portal_delete(auth_portal);
 	free(ag->ag_name);
 	free(ag);
 }
 
 struct auth_group *
 auth_group_find(const struct conf *conf, const char *name)
 {
 	struct auth_group *ag;
 
 	TAILQ_FOREACH(ag, &conf->conf_auth_groups, ag_next) {
 		if (ag->ag_name != NULL && strcmp(ag->ag_name, name) == 0)
 			return (ag);
 	}
 
 	return (NULL);
 }
 
 int
 auth_group_set_type(struct auth_group *ag, const char *str)
 {
 	int type;
 
 	if (strcmp(str, "none") == 0) {
 		type = AG_TYPE_NO_AUTHENTICATION;
 	} else if (strcmp(str, "deny") == 0) {
 		type = AG_TYPE_DENY;
 	} else if (strcmp(str, "chap") == 0) {
 		type = AG_TYPE_CHAP;
 	} else if (strcmp(str, "chap-mutual") == 0) {
 		type = AG_TYPE_CHAP_MUTUAL;
 	} else {
 		if (ag->ag_name != NULL)
 			log_warnx("invalid auth-type \"%s\" for auth-group "
 			    "\"%s\"", str, ag->ag_name);
 		else
 			log_warnx("invalid auth-type \"%s\" for target "
 			    "\"%s\"", str, ag->ag_target->t_name);
 		return (1);
 	}
 
 	if (ag->ag_type != AG_TYPE_UNKNOWN && ag->ag_type != type) {
 		if (ag->ag_name != NULL) {
 			log_warnx("cannot set auth-type to \"%s\" for "
 			    "auth-group \"%s\"; already has a different "
 			    "type", str, ag->ag_name);
 		} else {
 			log_warnx("cannot set auth-type to \"%s\" for target "
 			    "\"%s\"; already has a different type",
 			    str, ag->ag_target->t_name);
 		}
 		return (1);
 	}
 
 	ag->ag_type = type;
 
 	return (0);
 }
 
 static struct portal *
 portal_new(struct portal_group *pg)
 {
 	struct portal *portal;
 
 	portal = calloc(1, sizeof(*portal));
 	if (portal == NULL)
 		log_err(1, "calloc");
 	TAILQ_INIT(&portal->p_targets);
 	portal->p_portal_group = pg;
 	TAILQ_INSERT_TAIL(&pg->pg_portals, portal, p_next);
 	return (portal);
 }
 
 static void
 portal_delete(struct portal *portal)
 {
 
 	TAILQ_REMOVE(&portal->p_portal_group->pg_portals, portal, p_next);
 	if (portal->p_ai != NULL)
 		freeaddrinfo(portal->p_ai);
 	free(portal->p_listen);
 	free(portal);
 }
 
 struct portal_group *
 portal_group_new(struct conf *conf, const char *name)
 {
 	struct portal_group *pg;
 
 	pg = portal_group_find(conf, name);
 	if (pg != NULL) {
 		log_warnx("duplicated portal-group \"%s\"", name);
 		return (NULL);
 	}
 
 	pg = calloc(1, sizeof(*pg));
 	if (pg == NULL)
 		log_err(1, "calloc");
 	pg->pg_name = checked_strdup(name);
 	TAILQ_INIT(&pg->pg_options);
 	TAILQ_INIT(&pg->pg_portals);
 	TAILQ_INIT(&pg->pg_ports);
 	pg->pg_conf = conf;
 	pg->pg_tag = 0;		/* Assigned later in conf_apply(). */
 	TAILQ_INSERT_TAIL(&conf->conf_portal_groups, pg, pg_next);
 
 	return (pg);
 }
 
 void
 portal_group_delete(struct portal_group *pg)
 {
 	struct portal *portal, *tmp;
 	struct port *port, *tport;
 	struct option *o, *otmp;
 
 	TAILQ_FOREACH_SAFE(port, &pg->pg_ports, p_pgs, tport)
 		port_delete(port);
 	TAILQ_REMOVE(&pg->pg_conf->conf_portal_groups, pg, pg_next);
 
 	TAILQ_FOREACH_SAFE(portal, &pg->pg_portals, p_next, tmp)
 		portal_delete(portal);
 	TAILQ_FOREACH_SAFE(o, &pg->pg_options, o_next, otmp)
 		option_delete(&pg->pg_options, o);
 	free(pg->pg_name);
 	free(pg->pg_offload);
 	free(pg->pg_redirection);
 	free(pg);
 }
 
 struct portal_group *
 portal_group_find(const struct conf *conf, const char *name)
 {
 	struct portal_group *pg;
 
 	TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) {
 		if (strcmp(pg->pg_name, name) == 0)
 			return (pg);
 	}
 
 	return (NULL);
 }
 
 static int
 parse_addr_port(char *arg, const char *def_port, struct addrinfo **ai)
 {
 	struct addrinfo hints;
 	char *str, *addr, *ch;
 	const char *port;
 	int error, colons = 0;
 
 	str = arg = strdup(arg);
 	if (arg[0] == '[') {
 		/*
 		 * IPv6 address in square brackets, perhaps with port.
 		 */
 		arg++;
 		addr = strsep(&arg, "]");
 		if (arg == NULL) {
 			free(str);
 			return (1);
 		}
 		if (arg[0] == '\0') {
 			port = def_port;
 		} else if (arg[0] == ':') {
 			port = arg + 1;
 		} else {
 			free(str);
 			return (1);
 		}
 	} else {
 		/*
 		 * Either IPv6 address without brackets - and without
 		 * a port - or IPv4 address.  Just count the colons.
 		 */
 		for (ch = arg; *ch != '\0'; ch++) {
 			if (*ch == ':')
 				colons++;
 		}
 		if (colons > 1) {
 			addr = arg;
 			port = def_port;
 		} else {
 			addr = strsep(&arg, ":");
 			if (arg == NULL)
 				port = def_port;
 			else
 				port = arg;
 		}
 	}
 
 	memset(&hints, 0, sizeof(hints));
 	hints.ai_family = PF_UNSPEC;
 	hints.ai_socktype = SOCK_STREAM;
 	hints.ai_flags = AI_PASSIVE;
 	error = getaddrinfo(addr, port, &hints, ai);
 	free(str);
 	return ((error != 0) ? 1 : 0);
 }
 
 int
 portal_group_add_listen(struct portal_group *pg, const char *value, bool iser)
 {
 	struct portal *portal;
 
 	portal = portal_new(pg);
 	portal->p_listen = checked_strdup(value);
 	portal->p_iser = iser;
 
 	if (parse_addr_port(portal->p_listen, "3260", &portal->p_ai)) {
 		log_warnx("invalid listen address %s", portal->p_listen);
 		portal_delete(portal);
 		return (1);
 	}
 
 	/*
 	 * XXX: getaddrinfo(3) may return multiple addresses; we should turn
 	 *	those into multiple portals.
 	 */
 
 	return (0);
 }
 
 int
 isns_new(struct conf *conf, const char *addr)
 {
 	struct isns *isns;
 
 	isns = calloc(1, sizeof(*isns));
 	if (isns == NULL)
 		log_err(1, "calloc");
 	isns->i_conf = conf;
 	TAILQ_INSERT_TAIL(&conf->conf_isns, isns, i_next);
 	isns->i_addr = checked_strdup(addr);
 
 	if (parse_addr_port(isns->i_addr, "3205", &isns->i_ai)) {
 		log_warnx("invalid iSNS address %s", isns->i_addr);
 		isns_delete(isns);
 		return (1);
 	}
 
 	/*
 	 * XXX: getaddrinfo(3) may return multiple addresses; we should turn
 	 *	those into multiple servers.
 	 */
 
 	return (0);
 }
 
 void
 isns_delete(struct isns *isns)
 {
 
 	TAILQ_REMOVE(&isns->i_conf->conf_isns, isns, i_next);
 	free(isns->i_addr);
 	if (isns->i_ai != NULL)
 		freeaddrinfo(isns->i_ai);
 	free(isns);
 }
 
 static int
 isns_do_connect(struct isns *isns)
 {
 	int s;
 
 	s = socket(isns->i_ai->ai_family, isns->i_ai->ai_socktype,
 	    isns->i_ai->ai_protocol);
 	if (s < 0) {
 		log_warn("socket(2) failed for %s", isns->i_addr);
 		return (-1);
 	}
 	if (connect(s, isns->i_ai->ai_addr, isns->i_ai->ai_addrlen)) {
 		log_warn("connect(2) failed for %s", isns->i_addr);
 		close(s);
 		return (-1);
 	}
 	return(s);
 }
 
 static int
 isns_do_register(struct isns *isns, int s, const char *hostname)
 {
 	struct conf *conf = isns->i_conf;
 	struct target *target;
 	struct portal *portal;
 	struct portal_group *pg;
 	struct port *port;
 	struct isns_req *req;
 	int res = 0;
 	uint32_t error;
 
 	req = isns_req_create(ISNS_FUNC_DEVATTRREG, ISNS_FLAG_CLIENT);
 	isns_req_add_str(req, 32, TAILQ_FIRST(&conf->conf_targets)->t_name);
 	isns_req_add_delim(req);
 	isns_req_add_str(req, 1, hostname);
 	isns_req_add_32(req, 2, 2); /* 2 -- iSCSI */
 	isns_req_add_32(req, 6, conf->conf_isns_period);
 	TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) {
 		if (pg->pg_unassigned)
 			continue;
 		TAILQ_FOREACH(portal, &pg->pg_portals, p_next) {
 			isns_req_add_addr(req, 16, portal->p_ai);
 			isns_req_add_port(req, 17, portal->p_ai);
 		}
 	}
 	TAILQ_FOREACH(target, &conf->conf_targets, t_next) {
 		isns_req_add_str(req, 32, target->t_name);
 		isns_req_add_32(req, 33, 1); /* 1 -- Target*/
 		if (target->t_alias != NULL)
 			isns_req_add_str(req, 34, target->t_alias);
 		TAILQ_FOREACH(port, &target->t_ports, p_ts) {
 			if ((pg = port->p_portal_group) == NULL)
 				continue;
 			isns_req_add_32(req, 51, pg->pg_tag);
 			TAILQ_FOREACH(portal, &pg->pg_portals, p_next) {
 				isns_req_add_addr(req, 49, portal->p_ai);
 				isns_req_add_port(req, 50, portal->p_ai);
 			}
 		}
 	}
 	res = isns_req_send(s, req);
 	if (res < 0) {
 		log_warn("send(2) failed for %s", isns->i_addr);
 		goto quit;
 	}
 	res = isns_req_receive(s, req);
 	if (res < 0) {
 		log_warn("receive(2) failed for %s", isns->i_addr);
 		goto quit;
 	}
 	error = isns_req_get_status(req);
 	if (error != 0) {
 		log_warnx("iSNS register error %d for %s", error, isns->i_addr);
 		res = -1;
 	}
 quit:
 	isns_req_free(req);
 	return (res);
 }
 
 static int
 isns_do_check(struct isns *isns, int s, const char *hostname)
 {
 	struct conf *conf = isns->i_conf;
 	struct isns_req *req;
 	int res = 0;
 	uint32_t error;
 
 	req = isns_req_create(ISNS_FUNC_DEVATTRQRY, ISNS_FLAG_CLIENT);
 	isns_req_add_str(req, 32, TAILQ_FIRST(&conf->conf_targets)->t_name);
 	isns_req_add_str(req, 1, hostname);
 	isns_req_add_delim(req);
 	isns_req_add(req, 2, 0, NULL);
 	res = isns_req_send(s, req);
 	if (res < 0) {
 		log_warn("send(2) failed for %s", isns->i_addr);
 		goto quit;
 	}
 	res = isns_req_receive(s, req);
 	if (res < 0) {
 		log_warn("receive(2) failed for %s", isns->i_addr);
 		goto quit;
 	}
 	error = isns_req_get_status(req);
 	if (error != 0) {
 		log_warnx("iSNS check error %d for %s", error, isns->i_addr);
 		res = -1;
 	}
 quit:
 	isns_req_free(req);
 	return (res);
 }
 
 static int
 isns_do_deregister(struct isns *isns, int s, const char *hostname)
 {
 	struct conf *conf = isns->i_conf;
 	struct isns_req *req;
 	int res = 0;
 	uint32_t error;
 
 	req = isns_req_create(ISNS_FUNC_DEVDEREG, ISNS_FLAG_CLIENT);
 	isns_req_add_str(req, 32, TAILQ_FIRST(&conf->conf_targets)->t_name);
 	isns_req_add_delim(req);
 	isns_req_add_str(req, 1, hostname);
 	res = isns_req_send(s, req);
 	if (res < 0) {
 		log_warn("send(2) failed for %s", isns->i_addr);
 		goto quit;
 	}
 	res = isns_req_receive(s, req);
 	if (res < 0) {
 		log_warn("receive(2) failed for %s", isns->i_addr);
 		goto quit;
 	}
 	error = isns_req_get_status(req);
 	if (error != 0) {
 		log_warnx("iSNS deregister error %d for %s", error, isns->i_addr);
 		res = -1;
 	}
 quit:
 	isns_req_free(req);
 	return (res);
 }
 
 void
 isns_register(struct isns *isns, struct isns *oldisns)
 {
 	struct conf *conf = isns->i_conf;
 	int s;
 	char hostname[256];
 
 	if (TAILQ_EMPTY(&conf->conf_targets) ||
 	    TAILQ_EMPTY(&conf->conf_portal_groups))
 		return;
 	set_timeout(conf->conf_isns_timeout, false);
 	s = isns_do_connect(isns);
 	if (s < 0) {
 		set_timeout(0, false);
 		return;
 	}
 	gethostname(hostname, sizeof(hostname));
 
 	if (oldisns == NULL || TAILQ_EMPTY(&oldisns->i_conf->conf_targets))
 		oldisns = isns;
 	isns_do_deregister(oldisns, s, hostname);
 	isns_do_register(isns, s, hostname);
 	close(s);
 	set_timeout(0, false);
 }
 
 void
 isns_check(struct isns *isns)
 {
 	struct conf *conf = isns->i_conf;
 	int s, res;
 	char hostname[256];
 
 	if (TAILQ_EMPTY(&conf->conf_targets) ||
 	    TAILQ_EMPTY(&conf->conf_portal_groups))
 		return;
 	set_timeout(conf->conf_isns_timeout, false);
 	s = isns_do_connect(isns);
 	if (s < 0) {
 		set_timeout(0, false);
 		return;
 	}
 	gethostname(hostname, sizeof(hostname));
 
 	res = isns_do_check(isns, s, hostname);
 	if (res < 0) {
 		isns_do_deregister(isns, s, hostname);
 		isns_do_register(isns, s, hostname);
 	}
 	close(s);
 	set_timeout(0, false);
 }
 
 void
 isns_deregister(struct isns *isns)
 {
 	struct conf *conf = isns->i_conf;
 	int s;
 	char hostname[256];
 
 	if (TAILQ_EMPTY(&conf->conf_targets) ||
 	    TAILQ_EMPTY(&conf->conf_portal_groups))
 		return;
 	set_timeout(conf->conf_isns_timeout, false);
 	s = isns_do_connect(isns);
 	if (s < 0)
 		return;
 	gethostname(hostname, sizeof(hostname));
 
 	isns_do_deregister(isns, s, hostname);
 	close(s);
 	set_timeout(0, false);
 }
 
 int
 portal_group_set_filter(struct portal_group *pg, const char *str)
 {
 	int filter;
 
 	if (strcmp(str, "none") == 0) {
 		filter = PG_FILTER_NONE;
 	} else if (strcmp(str, "portal") == 0) {
 		filter = PG_FILTER_PORTAL;
 	} else if (strcmp(str, "portal-name") == 0) {
 		filter = PG_FILTER_PORTAL_NAME;
 	} else if (strcmp(str, "portal-name-auth") == 0) {
 		filter = PG_FILTER_PORTAL_NAME_AUTH;
 	} else {
 		log_warnx("invalid discovery-filter \"%s\" for portal-group "
 		    "\"%s\"; valid values are \"none\", \"portal\", "
 		    "\"portal-name\", and \"portal-name-auth\"",
 		    str, pg->pg_name);
 		return (1);
 	}
 
 	if (pg->pg_discovery_filter != PG_FILTER_UNKNOWN &&
 	    pg->pg_discovery_filter != filter) {
 		log_warnx("cannot set discovery-filter to \"%s\" for "
 		    "portal-group \"%s\"; already has a different "
 		    "value", str, pg->pg_name);
 		return (1);
 	}
 
 	pg->pg_discovery_filter = filter;
 
 	return (0);
 }
 
 int
 portal_group_set_offload(struct portal_group *pg, const char *offload)
 {
 
 	if (pg->pg_offload != NULL) {
 		log_warnx("cannot set offload to \"%s\" for "
 		    "portal-group \"%s\"; already defined",
 		    offload, pg->pg_name);
 		return (1);
 	}
 
 	pg->pg_offload = checked_strdup(offload);
 
 	return (0);
 }
 
 int
 portal_group_set_redirection(struct portal_group *pg, const char *addr)
 {
 
 	if (pg->pg_redirection != NULL) {
 		log_warnx("cannot set redirection to \"%s\" for "
 		    "portal-group \"%s\"; already defined",
 		    addr, pg->pg_name);
 		return (1);
 	}
 
 	pg->pg_redirection = checked_strdup(addr);
 
 	return (0);
 }
 
 static bool
 valid_hex(const char ch)
 {
 	switch (ch) {
 	case '0':
 	case '1':
 	case '2':
 	case '3':
 	case '4':
 	case '5':
 	case '6':
 	case '7':
 	case '8':
 	case '9':
 	case 'a':
 	case 'A':
 	case 'b':
 	case 'B':
 	case 'c':
 	case 'C':
 	case 'd':
 	case 'D':
 	case 'e':
 	case 'E':
 	case 'f':
 	case 'F':
 		return (true);
 	default:
 		return (false);
 	}
 }
 
 bool
 valid_iscsi_name(const char *name)
 {
 	int i;
 
 	if (strlen(name) >= MAX_NAME_LEN) {
 		log_warnx("overlong name for target \"%s\"; max length allowed "
 		    "by iSCSI specification is %d characters",
 		    name, MAX_NAME_LEN);
 		return (false);
 	}
 
 	/*
 	 * In the cases below, we don't return an error, just in case the admin
 	 * was right, and we're wrong.
 	 */
 	if (strncasecmp(name, "iqn.", strlen("iqn.")) == 0) {
 		for (i = strlen("iqn."); name[i] != '\0'; i++) {
 			/*
 			 * XXX: We should verify UTF-8 normalisation, as defined
 			 *      by 3.2.6.2: iSCSI Name Encoding.
 			 */
 			if (isalnum(name[i]))
 				continue;
 			if (name[i] == '-' || name[i] == '.' || name[i] == ':')
 				continue;
 			log_warnx("invalid character \"%c\" in target name "
 			    "\"%s\"; allowed characters are letters, digits, "
 			    "'-', '.', and ':'", name[i], name);
 			break;
 		}
 		/*
 		 * XXX: Check more stuff: valid date and a valid reversed domain.
 		 */
 	} else if (strncasecmp(name, "eui.", strlen("eui.")) == 0) {
 		if (strlen(name) != strlen("eui.") + 16)
 			log_warnx("invalid target name \"%s\"; the \"eui.\" "
 			    "should be followed by exactly 16 hexadecimal "
 			    "digits", name);
 		for (i = strlen("eui."); name[i] != '\0'; i++) {
 			if (!valid_hex(name[i])) {
 				log_warnx("invalid character \"%c\" in target "
 				    "name \"%s\"; allowed characters are 1-9 "
 				    "and A-F", name[i], name);
 				break;
 			}
 		}
 	} else if (strncasecmp(name, "naa.", strlen("naa.")) == 0) {
 		if (strlen(name) > strlen("naa.") + 32)
 			log_warnx("invalid target name \"%s\"; the \"naa.\" "
 			    "should be followed by at most 32 hexadecimal "
 			    "digits", name);
 		for (i = strlen("naa."); name[i] != '\0'; i++) {
 			if (!valid_hex(name[i])) {
 				log_warnx("invalid character \"%c\" in target "
 				    "name \"%s\"; allowed characters are 1-9 "
 				    "and A-F", name[i], name);
 				break;
 			}
 		}
 	} else {
 		log_warnx("invalid target name \"%s\"; should start with "
 		    "either \"iqn.\", \"eui.\", or \"naa.\"",
 		    name);
 	}
 	return (true);
 }
 
 struct pport *
 pport_new(struct conf *conf, const char *name, uint32_t ctl_port)
 {
 	struct pport *pp;
 
 	pp = calloc(1, sizeof(*pp));
 	if (pp == NULL)
 		log_err(1, "calloc");
 	pp->pp_conf = conf;
 	pp->pp_name = checked_strdup(name);
 	pp->pp_ctl_port = ctl_port;
 	TAILQ_INIT(&pp->pp_ports);
 	TAILQ_INSERT_TAIL(&conf->conf_pports, pp, pp_next);
 	return (pp);
 }
 
 struct pport *
 pport_find(const struct conf *conf, const char *name)
 {
 	struct pport *pp;
 
 	TAILQ_FOREACH(pp, &conf->conf_pports, pp_next) {
 		if (strcasecmp(pp->pp_name, name) == 0)
 			return (pp);
 	}
 	return (NULL);
 }
 
 struct pport *
 pport_copy(struct pport *pp, struct conf *conf)
 {
 	struct pport *ppnew;
 
 	ppnew = pport_new(conf, pp->pp_name, pp->pp_ctl_port);
 	return (ppnew);
 }
 
 void
 pport_delete(struct pport *pp)
 {
 	struct port *port, *tport;
 
 	TAILQ_FOREACH_SAFE(port, &pp->pp_ports, p_ts, tport)
 		port_delete(port);
 	TAILQ_REMOVE(&pp->pp_conf->conf_pports, pp, pp_next);
 	free(pp->pp_name);
 	free(pp);
 }
 
 struct port *
 port_new(struct conf *conf, struct target *target, struct portal_group *pg)
 {
 	struct port *port;
 	char *name;
 	int ret;
 
 	ret = asprintf(&name, "%s-%s", pg->pg_name, target->t_name);
 	if (ret <= 0)
 		log_err(1, "asprintf");
 	if (port_find(conf, name) != NULL) {
 		log_warnx("duplicate port \"%s\"", name);
 		free(name);
 		return (NULL);
 	}
 	port = calloc(1, sizeof(*port));
 	if (port == NULL)
 		log_err(1, "calloc");
 	port->p_conf = conf;
 	port->p_name = name;
 	TAILQ_INSERT_TAIL(&conf->conf_ports, port, p_next);
 	TAILQ_INSERT_TAIL(&target->t_ports, port, p_ts);
 	port->p_target = target;
 	TAILQ_INSERT_TAIL(&pg->pg_ports, port, p_pgs);
 	port->p_portal_group = pg;
 	port->p_foreign = pg->pg_foreign;
 	return (port);
 }
 
 struct port *
 port_new_pp(struct conf *conf, struct target *target, struct pport *pp)
 {
 	struct port *port;
 	char *name;
 	int ret;
 
 	ret = asprintf(&name, "%s-%s", pp->pp_name, target->t_name);
 	if (ret <= 0)
 		log_err(1, "asprintf");
 	if (port_find(conf, name) != NULL) {
 		log_warnx("duplicate port \"%s\"", name);
 		free(name);
 		return (NULL);
 	}
 	port = calloc(1, sizeof(*port));
 	if (port == NULL)
 		log_err(1, "calloc");
 	port->p_conf = conf;
 	port->p_name = name;
 	TAILQ_INSERT_TAIL(&conf->conf_ports, port, p_next);
 	TAILQ_INSERT_TAIL(&target->t_ports, port, p_ts);
 	port->p_target = target;
 	TAILQ_INSERT_TAIL(&pp->pp_ports, port, p_pps);
 	port->p_pport = pp;
 	return (port);
 }
 
 struct port *
 port_find(const struct conf *conf, const char *name)
 {
 	struct port *port;
 
 	TAILQ_FOREACH(port, &conf->conf_ports, p_next) {
 		if (strcasecmp(port->p_name, name) == 0)
 			return (port);
 	}
 
 	return (NULL);
 }
 
 struct port *
 port_find_in_pg(const struct portal_group *pg, const char *target)
 {
 	struct port *port;
 
 	TAILQ_FOREACH(port, &pg->pg_ports, p_pgs) {
 		if (strcasecmp(port->p_target->t_name, target) == 0)
 			return (port);
 	}
 
 	return (NULL);
 }
 
 void
 port_delete(struct port *port)
 {
 
 	if (port->p_portal_group)
 		TAILQ_REMOVE(&port->p_portal_group->pg_ports, port, p_pgs);
 	if (port->p_pport)
 		TAILQ_REMOVE(&port->p_pport->pp_ports, port, p_pps);
 	if (port->p_target)
 		TAILQ_REMOVE(&port->p_target->t_ports, port, p_ts);
 	TAILQ_REMOVE(&port->p_conf->conf_ports, port, p_next);
 	free(port->p_name);
 	free(port);
 }
 
 struct target *
 target_new(struct conf *conf, const char *name)
 {
 	struct target *targ;
 	int i, len;
 
 	targ = target_find(conf, name);
 	if (targ != NULL) {
 		log_warnx("duplicated target \"%s\"", name);
 		return (NULL);
 	}
 	if (valid_iscsi_name(name) == false) {
 		log_warnx("target name \"%s\" is invalid", name);
 		return (NULL);
 	}
 	targ = calloc(1, sizeof(*targ));
 	if (targ == NULL)
 		log_err(1, "calloc");
 	targ->t_name = checked_strdup(name);
 
 	/*
 	 * RFC 3722 requires us to normalize the name to lowercase.
 	 */
 	len = strlen(name);
 	for (i = 0; i < len; i++)
 		targ->t_name[i] = tolower(targ->t_name[i]);
 
 	targ->t_conf = conf;
 	TAILQ_INIT(&targ->t_ports);
 	TAILQ_INSERT_TAIL(&conf->conf_targets, targ, t_next);
 
 	return (targ);
 }
 
 void
 target_delete(struct target *targ)
 {
 	struct port *port, *tport;
 
 	TAILQ_FOREACH_SAFE(port, &targ->t_ports, p_ts, tport)
 		port_delete(port);
 	TAILQ_REMOVE(&targ->t_conf->conf_targets, targ, t_next);
 
 	free(targ->t_name);
 	free(targ->t_redirection);
 	free(targ);
 }
 
 struct target *
 target_find(struct conf *conf, const char *name)
 {
 	struct target *targ;
 
 	TAILQ_FOREACH(targ, &conf->conf_targets, t_next) {
 		if (strcasecmp(targ->t_name, name) == 0)
 			return (targ);
 	}
 
 	return (NULL);
 }
 
 int
 target_set_redirection(struct target *target, const char *addr)
 {
 
 	if (target->t_redirection != NULL) {
 		log_warnx("cannot set redirection to \"%s\" for "
 		    "target \"%s\"; already defined",
 		    addr, target->t_name);
 		return (1);
 	}
 
 	target->t_redirection = checked_strdup(addr);
 
 	return (0);
 }
 
 struct lun *
 lun_new(struct conf *conf, const char *name)
 {
 	struct lun *lun;
 
 	lun = lun_find(conf, name);
 	if (lun != NULL) {
 		log_warnx("duplicated lun \"%s\"", name);
 		return (NULL);
 	}
 
 	lun = calloc(1, sizeof(*lun));
 	if (lun == NULL)
 		log_err(1, "calloc");
 	lun->l_conf = conf;
 	lun->l_name = checked_strdup(name);
 	TAILQ_INIT(&lun->l_options);
 	TAILQ_INSERT_TAIL(&conf->conf_luns, lun, l_next);
 	lun->l_ctl_lun = -1;
 
 	return (lun);
 }
 
 void
 lun_delete(struct lun *lun)
 {
 	struct target *targ;
 	struct option *o, *tmp;
 	int i;
 
 	TAILQ_FOREACH(targ, &lun->l_conf->conf_targets, t_next) {
 		for (i = 0; i < MAX_LUNS; i++) {
 			if (targ->t_luns[i] == lun)
 				targ->t_luns[i] = NULL;
 		}
 	}
 	TAILQ_REMOVE(&lun->l_conf->conf_luns, lun, l_next);
 
 	TAILQ_FOREACH_SAFE(o, &lun->l_options, o_next, tmp)
 		option_delete(&lun->l_options, o);
 	free(lun->l_name);
 	free(lun->l_backend);
 	free(lun->l_device_id);
 	free(lun->l_path);
 	free(lun->l_scsiname);
 	free(lun->l_serial);
 	free(lun);
 }
 
 struct lun *
 lun_find(const struct conf *conf, const char *name)
 {
 	struct lun *lun;
 
 	TAILQ_FOREACH(lun, &conf->conf_luns, l_next) {
 		if (strcmp(lun->l_name, name) == 0)
 			return (lun);
 	}
 
 	return (NULL);
 }
 
 void
 lun_set_backend(struct lun *lun, const char *value)
 {
 	free(lun->l_backend);
 	lun->l_backend = checked_strdup(value);
 }
 
 void
 lun_set_blocksize(struct lun *lun, size_t value)
 {
 
 	lun->l_blocksize = value;
 }
 
 void
 lun_set_device_type(struct lun *lun, uint8_t value)
 {
 
 	lun->l_device_type = value;
 }
 
 void
 lun_set_device_id(struct lun *lun, const char *value)
 {
 	free(lun->l_device_id);
 	lun->l_device_id = checked_strdup(value);
 }
 
 void
 lun_set_path(struct lun *lun, const char *value)
 {
 	free(lun->l_path);
 	lun->l_path = checked_strdup(value);
 }
 
 void
 lun_set_scsiname(struct lun *lun, const char *value)
 {
 	free(lun->l_scsiname);
 	lun->l_scsiname = checked_strdup(value);
 }
 
 void
 lun_set_serial(struct lun *lun, const char *value)
 {
 	free(lun->l_serial);
 	lun->l_serial = checked_strdup(value);
 }
 
 void
 lun_set_size(struct lun *lun, size_t value)
 {
 
 	lun->l_size = value;
 }
 
 void
 lun_set_ctl_lun(struct lun *lun, uint32_t value)
 {
 
 	lun->l_ctl_lun = value;
 }
 
 struct option *
 option_new(struct options *options, const char *name, const char *value)
 {
 	struct option *o;
 
 	o = option_find(options, name);
 	if (o != NULL) {
 		log_warnx("duplicated option \"%s\"", name);
 		return (NULL);
 	}
 
 	o = calloc(1, sizeof(*o));
 	if (o == NULL)
 		log_err(1, "calloc");
 	o->o_name = checked_strdup(name);
 	o->o_value = checked_strdup(value);
 	TAILQ_INSERT_TAIL(options, o, o_next);
 
 	return (o);
 }
 
 void
 option_delete(struct options *options, struct option *o)
 {
 
 	TAILQ_REMOVE(options, o, o_next);
 	free(o->o_name);
 	free(o->o_value);
 	free(o);
 }
 
 struct option *
 option_find(const struct options *options, const char *name)
 {
 	struct option *o;
 
 	TAILQ_FOREACH(o, options, o_next) {
 		if (strcmp(o->o_name, name) == 0)
 			return (o);
 	}
 
 	return (NULL);
 }
 
 void
 option_set(struct option *o, const char *value)
 {
 
 	free(o->o_value);
 	o->o_value = checked_strdup(value);
 }
 
 static struct connection *
 connection_new(struct portal *portal, int fd, const char *host,
     const struct sockaddr *client_sa)
 {
 	struct connection *conn;
 
 	conn = calloc(1, sizeof(*conn));
 	if (conn == NULL)
 		log_err(1, "calloc");
 	conn->conn_portal = portal;
 	conn->conn_socket = fd;
 	conn->conn_initiator_addr = checked_strdup(host);
 	memcpy(&conn->conn_initiator_sa, client_sa, client_sa->sa_len);
 
 	/*
 	 * Default values, from RFC 3720, section 12.
 	 */
 	conn->conn_max_recv_data_segment_length = 8192;
+	conn->conn_max_send_data_segment_length = 8192;
 	conn->conn_max_burst_length = 262144;
 	conn->conn_first_burst_length = 65536;
 	conn->conn_immediate_data = true;
 
 	return (conn);
 }
 
 #if 0
 static void
 conf_print(struct conf *conf)
 {
 	struct auth_group *ag;
 	struct auth *auth;
 	struct auth_name *auth_name;
 	struct auth_portal *auth_portal;
 	struct portal_group *pg;
 	struct portal *portal;
 	struct target *targ;
 	struct lun *lun;
 	struct option *o;
 
 	TAILQ_FOREACH(ag, &conf->conf_auth_groups, ag_next) {
 		fprintf(stderr, "auth-group %s {\n", ag->ag_name);
 		TAILQ_FOREACH(auth, &ag->ag_auths, a_next)
 			fprintf(stderr, "\t chap-mutual %s %s %s %s\n",
 			    auth->a_user, auth->a_secret,
 			    auth->a_mutual_user, auth->a_mutual_secret);
 		TAILQ_FOREACH(auth_name, &ag->ag_names, an_next)
 			fprintf(stderr, "\t initiator-name %s\n",
 			    auth_name->an_initator_name);
 		TAILQ_FOREACH(auth_portal, &ag->ag_portals, an_next)
 			fprintf(stderr, "\t initiator-portal %s\n",
 			    auth_portal->an_initator_portal);
 		fprintf(stderr, "}\n");
 	}
 	TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) {
 		fprintf(stderr, "portal-group %s {\n", pg->pg_name);
 		TAILQ_FOREACH(portal, &pg->pg_portals, p_next)
 			fprintf(stderr, "\t listen %s\n", portal->p_listen);
 		fprintf(stderr, "}\n");
 	}
 	TAILQ_FOREACH(lun, &conf->conf_luns, l_next) {
 		fprintf(stderr, "\tlun %s {\n", lun->l_name);
 		fprintf(stderr, "\t\tpath %s\n", lun->l_path);
 		TAILQ_FOREACH(o, &lun->l_options, o_next)
 			fprintf(stderr, "\t\toption %s %s\n",
 			    lo->o_name, lo->o_value);
 		fprintf(stderr, "\t}\n");
 	}
 	TAILQ_FOREACH(targ, &conf->conf_targets, t_next) {
 		fprintf(stderr, "target %s {\n", targ->t_name);
 		if (targ->t_alias != NULL)
 			fprintf(stderr, "\t alias %s\n", targ->t_alias);
 		fprintf(stderr, "}\n");
 	}
 }
 #endif
 
 static int
 conf_verify_lun(struct lun *lun)
 {
 	const struct lun *lun2;
 
 	if (lun->l_backend == NULL)
 		lun_set_backend(lun, "block");
 	if (strcmp(lun->l_backend, "block") == 0) {
 		if (lun->l_path == NULL) {
 			log_warnx("missing path for lun \"%s\"",
 			    lun->l_name);
 			return (1);
 		}
 	} else if (strcmp(lun->l_backend, "ramdisk") == 0) {
 		if (lun->l_size == 0) {
 			log_warnx("missing size for ramdisk-backed lun \"%s\"",
 			    lun->l_name);
 			return (1);
 		}
 		if (lun->l_path != NULL) {
 			log_warnx("path must not be specified "
 			    "for ramdisk-backed lun \"%s\"",
 			    lun->l_name);
 			return (1);
 		}
 	}
 	if (lun->l_blocksize == 0) {
 		if (lun->l_device_type == 5)
 			lun_set_blocksize(lun, DEFAULT_CD_BLOCKSIZE);
 		else
 			lun_set_blocksize(lun, DEFAULT_BLOCKSIZE);
 	} else if (lun->l_blocksize < 0) {
 		log_warnx("invalid blocksize for lun \"%s\"; "
 		    "must be larger than 0", lun->l_name);
 		return (1);
 	}
 	if (lun->l_size != 0 && lun->l_size % lun->l_blocksize != 0) {
 		log_warnx("invalid size for lun \"%s\"; "
 		    "must be multiple of blocksize", lun->l_name);
 		return (1);
 	}
 	TAILQ_FOREACH(lun2, &lun->l_conf->conf_luns, l_next) {
 		if (lun == lun2)
 			continue;
 		if (lun->l_path != NULL && lun2->l_path != NULL &&
 		    strcmp(lun->l_path, lun2->l_path) == 0) {
 			log_debugx("WARNING: path \"%s\" duplicated "
 			    "between lun \"%s\", and "
 			    "lun \"%s\"", lun->l_path,
 			    lun->l_name, lun2->l_name);
 		}
 	}
 
 	return (0);
 }
 
 int
 conf_verify(struct conf *conf)
 {
 	struct auth_group *ag;
 	struct portal_group *pg;
 	struct port *port;
 	struct target *targ;
 	struct lun *lun;
 	bool found;
 	int error, i;
 
 	if (conf->conf_pidfile_path == NULL)
 		conf->conf_pidfile_path = checked_strdup(DEFAULT_PIDFILE);
 
 	TAILQ_FOREACH(lun, &conf->conf_luns, l_next) {
 		error = conf_verify_lun(lun);
 		if (error != 0)
 			return (error);
 	}
 	TAILQ_FOREACH(targ, &conf->conf_targets, t_next) {
 		if (targ->t_auth_group == NULL) {
 			targ->t_auth_group = auth_group_find(conf,
 			    "default");
 			assert(targ->t_auth_group != NULL);
 		}
 		if (TAILQ_EMPTY(&targ->t_ports)) {
 			pg = portal_group_find(conf, "default");
 			assert(pg != NULL);
 			port_new(conf, targ, pg);
 		}
 		found = false;
 		for (i = 0; i < MAX_LUNS; i++) {
 			if (targ->t_luns[i] != NULL)
 				found = true;
 		}
 		if (!found && targ->t_redirection == NULL) {
 			log_warnx("no LUNs defined for target \"%s\"",
 			    targ->t_name);
 		}
 		if (found && targ->t_redirection != NULL) {
 			log_debugx("target \"%s\" contains luns, "
 			    " but configured for redirection",
 			    targ->t_name);
 		}
 	}
 	TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) {
 		assert(pg->pg_name != NULL);
 		if (pg->pg_discovery_auth_group == NULL) {
 			pg->pg_discovery_auth_group =
 			    auth_group_find(conf, "default");
 			assert(pg->pg_discovery_auth_group != NULL);
 		}
 
 		if (pg->pg_discovery_filter == PG_FILTER_UNKNOWN)
 			pg->pg_discovery_filter = PG_FILTER_NONE;
 
 		if (pg->pg_redirection != NULL) {
 			if (!TAILQ_EMPTY(&pg->pg_ports)) {
 				log_debugx("portal-group \"%s\" assigned "
 				    "to target, but configured "
 				    "for redirection",
 				    pg->pg_name);
 			}
 			pg->pg_unassigned = false;
 		} else if (!TAILQ_EMPTY(&pg->pg_ports)) {
 			pg->pg_unassigned = false;
 		} else {
 			if (strcmp(pg->pg_name, "default") != 0)
 				log_warnx("portal-group \"%s\" not assigned "
 				    "to any target", pg->pg_name);
 			pg->pg_unassigned = true;
 		}
 	}
 	TAILQ_FOREACH(ag, &conf->conf_auth_groups, ag_next) {
 		if (ag->ag_name == NULL)
 			assert(ag->ag_target != NULL);
 		else
 			assert(ag->ag_target == NULL);
 
 		found = false;
 		TAILQ_FOREACH(targ, &conf->conf_targets, t_next) {
 			if (targ->t_auth_group == ag) {
 				found = true;
 				break;
 			}
 		}
 		TAILQ_FOREACH(port, &conf->conf_ports, p_next) {
 			if (port->p_auth_group == ag) {
 				found = true;
 				break;
 			}
 		}
 		TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) {
 			if (pg->pg_discovery_auth_group == ag) {
 				found = true;
 				break;
 			}
 		}
 		if (!found && ag->ag_name != NULL &&
 		    strcmp(ag->ag_name, "default") != 0 &&
 		    strcmp(ag->ag_name, "no-authentication") != 0 &&
 		    strcmp(ag->ag_name, "no-access") != 0) {
 			log_warnx("auth-group \"%s\" not assigned "
 			    "to any target", ag->ag_name);
 		}
 	}
 
 	return (0);
 }
 
 static int
 conf_apply(struct conf *oldconf, struct conf *newconf)
 {
 	struct lun *oldlun, *newlun, *tmplun;
 	struct portal_group *oldpg, *newpg;
 	struct portal *oldp, *newp;
 	struct port *oldport, *newport, *tmpport;
 	struct isns *oldns, *newns;
 	pid_t otherpid;
 	int changed, cumulated_error = 0, error, sockbuf;
 	int one = 1;
 
 	if (oldconf->conf_debug != newconf->conf_debug) {
 		log_debugx("changing debug level to %d", newconf->conf_debug);
 		log_init(newconf->conf_debug);
 	}
 
 	if (oldconf->conf_pidfh != NULL) {
 		assert(oldconf->conf_pidfile_path != NULL);
 		if (newconf->conf_pidfile_path != NULL &&
 		    strcmp(oldconf->conf_pidfile_path,
 		    newconf->conf_pidfile_path) == 0) {
 			newconf->conf_pidfh = oldconf->conf_pidfh;
 			oldconf->conf_pidfh = NULL;
 		} else {
 			log_debugx("removing pidfile %s",
 			    oldconf->conf_pidfile_path);
 			pidfile_remove(oldconf->conf_pidfh);
 			oldconf->conf_pidfh = NULL;
 		}
 	}
 
 	if (newconf->conf_pidfh == NULL && newconf->conf_pidfile_path != NULL) {
 		log_debugx("opening pidfile %s", newconf->conf_pidfile_path);
 		newconf->conf_pidfh =
 		    pidfile_open(newconf->conf_pidfile_path, 0600, &otherpid);
 		if (newconf->conf_pidfh == NULL) {
 			if (errno == EEXIST)
 				log_errx(1, "daemon already running, pid: %jd.",
 				    (intmax_t)otherpid);
 			log_err(1, "cannot open or create pidfile \"%s\"",
 			    newconf->conf_pidfile_path);
 		}
 	}
 
 	/*
 	 * Go through the new portal groups, assigning tags or preserving old.
 	 */
 	TAILQ_FOREACH(newpg, &newconf->conf_portal_groups, pg_next) {
 		if (newpg->pg_tag != 0)
 			continue;
 		oldpg = portal_group_find(oldconf, newpg->pg_name);
 		if (oldpg != NULL)
 			newpg->pg_tag = oldpg->pg_tag;
 		else
 			newpg->pg_tag = ++last_portal_group_tag;
 	}
 
 	/* Deregister on removed iSNS servers. */
 	TAILQ_FOREACH(oldns, &oldconf->conf_isns, i_next) {
 		TAILQ_FOREACH(newns, &newconf->conf_isns, i_next) {
 			if (strcmp(oldns->i_addr, newns->i_addr) == 0)
 				break;
 		}
 		if (newns == NULL)
 			isns_deregister(oldns);
 	}
 
 	/*
 	 * XXX: If target or lun removal fails, we should somehow "move"
 	 *      the old lun or target into newconf, so that subsequent
 	 *      conf_apply() would try to remove them again.  That would
 	 *      be somewhat hairy, though, and lun deletion failures don't
 	 *      really happen, so leave it as it is for now.
 	 */
 	/*
 	 * First, remove any ports present in the old configuration
 	 * and missing in the new one.
 	 */
 	TAILQ_FOREACH_SAFE(oldport, &oldconf->conf_ports, p_next, tmpport) {
 		if (oldport->p_foreign)
 			continue;
 		newport = port_find(newconf, oldport->p_name);
 		if (newport != NULL && !newport->p_foreign)
 			continue;
 		log_debugx("removing port \"%s\"", oldport->p_name);
 		error = kernel_port_remove(oldport);
 		if (error != 0) {
 			log_warnx("failed to remove port %s",
 			    oldport->p_name);
 			/*
 			 * XXX: Uncomment after fixing the root cause.
 			 *
 			 * cumulated_error++;
 			 */
 		}
 	}
 
 	/*
 	 * Second, remove any LUNs present in the old configuration
 	 * and missing in the new one.
 	 */
 	TAILQ_FOREACH_SAFE(oldlun, &oldconf->conf_luns, l_next, tmplun) {
 		newlun = lun_find(newconf, oldlun->l_name);
 		if (newlun == NULL) {
 			log_debugx("lun \"%s\", CTL lun %d "
 			    "not found in new configuration; "
 			    "removing", oldlun->l_name, oldlun->l_ctl_lun);
 			error = kernel_lun_remove(oldlun);
 			if (error != 0) {
 				log_warnx("failed to remove lun \"%s\", "
 				    "CTL lun %d",
 				    oldlun->l_name, oldlun->l_ctl_lun);
 				cumulated_error++;
 			}
 			continue;
 		}
 
 		/*
 		 * Also remove the LUNs changed by more than size.
 		 */
 		changed = 0;
 		assert(oldlun->l_backend != NULL);
 		assert(newlun->l_backend != NULL);
 		if (strcmp(newlun->l_backend, oldlun->l_backend) != 0) {
 			log_debugx("backend for lun \"%s\", "
 			    "CTL lun %d changed; removing",
 			    oldlun->l_name, oldlun->l_ctl_lun);
 			changed = 1;
 		}
 		if (oldlun->l_blocksize != newlun->l_blocksize) {
 			log_debugx("blocksize for lun \"%s\", "
 			    "CTL lun %d changed; removing",
 			    oldlun->l_name, oldlun->l_ctl_lun);
 			changed = 1;
 		}
 		if (newlun->l_device_id != NULL &&
 		    (oldlun->l_device_id == NULL ||
 		     strcmp(oldlun->l_device_id, newlun->l_device_id) !=
 		     0)) {
 			log_debugx("device-id for lun \"%s\", "
 			    "CTL lun %d changed; removing",
 			    oldlun->l_name, oldlun->l_ctl_lun);
 			changed = 1;
 		}
 		if (newlun->l_path != NULL &&
 		    (oldlun->l_path == NULL ||
 		     strcmp(oldlun->l_path, newlun->l_path) != 0)) {
 			log_debugx("path for lun \"%s\", "
 			    "CTL lun %d, changed; removing",
 			    oldlun->l_name, oldlun->l_ctl_lun);
 			changed = 1;
 		}
 		if (newlun->l_serial != NULL &&
 		    (oldlun->l_serial == NULL ||
 		     strcmp(oldlun->l_serial, newlun->l_serial) != 0)) {
 			log_debugx("serial for lun \"%s\", "
 			    "CTL lun %d changed; removing",
 			    oldlun->l_name, oldlun->l_ctl_lun);
 			changed = 1;
 		}
 		if (changed) {
 			error = kernel_lun_remove(oldlun);
 			if (error != 0) {
 				log_warnx("failed to remove lun \"%s\", "
 				    "CTL lun %d",
 				    oldlun->l_name, oldlun->l_ctl_lun);
 				cumulated_error++;
 			}
 			lun_delete(oldlun);
 			continue;
 		}
 
 		lun_set_ctl_lun(newlun, oldlun->l_ctl_lun);
 	}
 
 	TAILQ_FOREACH_SAFE(newlun, &newconf->conf_luns, l_next, tmplun) {
 		oldlun = lun_find(oldconf, newlun->l_name);
 		if (oldlun != NULL) {
 			log_debugx("modifying lun \"%s\", CTL lun %d",
 			    newlun->l_name, newlun->l_ctl_lun);
 			error = kernel_lun_modify(newlun);
 			if (error != 0) {
 				log_warnx("failed to "
 				    "modify lun \"%s\", CTL lun %d",
 				    newlun->l_name, newlun->l_ctl_lun);
 				cumulated_error++;
 			}
 			continue;
 		}
 		log_debugx("adding lun \"%s\"", newlun->l_name);
 		error = kernel_lun_add(newlun);
 		if (error != 0) {
 			log_warnx("failed to add lun \"%s\"", newlun->l_name);
 			lun_delete(newlun);
 			cumulated_error++;
 		}
 	}
 
 	/*
 	 * Now add new ports or modify existing ones.
 	 */
 	TAILQ_FOREACH(newport, &newconf->conf_ports, p_next) {
 		if (newport->p_foreign)
 			continue;
 		oldport = port_find(oldconf, newport->p_name);
 
 		if (oldport == NULL || oldport->p_foreign) {
 			log_debugx("adding port \"%s\"", newport->p_name);
 			error = kernel_port_add(newport);
 		} else {
 			log_debugx("updating port \"%s\"", newport->p_name);
 			newport->p_ctl_port = oldport->p_ctl_port;
 			error = kernel_port_update(newport, oldport);
 		}
 		if (error != 0) {
 			log_warnx("failed to %s port %s",
 			    (oldport == NULL) ? "add" : "update",
 			    newport->p_name);
 			/*
 			 * XXX: Uncomment after fixing the root cause.
 			 *
 			 * cumulated_error++;
 			 */
 		}
 	}
 
 	/*
 	 * Go through the new portals, opening the sockets as necessary.
 	 */
 	TAILQ_FOREACH(newpg, &newconf->conf_portal_groups, pg_next) {
 		if (newpg->pg_foreign)
 			continue;
 		if (newpg->pg_unassigned) {
 			log_debugx("not listening on portal-group \"%s\", "
 			    "not assigned to any target",
 			    newpg->pg_name);
 			continue;
 		}
 		TAILQ_FOREACH(newp, &newpg->pg_portals, p_next) {
 			/*
 			 * Try to find already open portal and reuse
 			 * the listening socket.  We don't care about
 			 * what portal or portal group that was, what
 			 * matters is the listening address.
 			 */
 			TAILQ_FOREACH(oldpg, &oldconf->conf_portal_groups,
 			    pg_next) {
 				TAILQ_FOREACH(oldp, &oldpg->pg_portals,
 				    p_next) {
 					if (strcmp(newp->p_listen,
 					    oldp->p_listen) == 0 &&
 					    oldp->p_socket > 0) {
 						newp->p_socket =
 						    oldp->p_socket;
 						oldp->p_socket = 0;
 						break;
 					}
 				}
 			}
 			if (newp->p_socket > 0) {
 				/*
 				 * We're done with this portal.
 				 */
 				continue;
 			}
 
 #ifdef ICL_KERNEL_PROXY
 			if (proxy_mode) {
 				newpg->pg_conf->conf_portal_id++;
 				newp->p_id = newpg->pg_conf->conf_portal_id;
 				log_debugx("listening on %s, portal-group "
 				    "\"%s\", portal id %d, using ICL proxy",
 				    newp->p_listen, newpg->pg_name, newp->p_id);
 				kernel_listen(newp->p_ai, newp->p_iser,
 				    newp->p_id);
 				continue;
 			}
 #endif
 			assert(proxy_mode == false);
 			assert(newp->p_iser == false);
 
 			log_debugx("listening on %s, portal-group \"%s\"",
 			    newp->p_listen, newpg->pg_name);
 			newp->p_socket = socket(newp->p_ai->ai_family,
 			    newp->p_ai->ai_socktype,
 			    newp->p_ai->ai_protocol);
 			if (newp->p_socket < 0) {
 				log_warn("socket(2) failed for %s",
 				    newp->p_listen);
 				cumulated_error++;
 				continue;
 			}
 			sockbuf = SOCKBUF_SIZE;
 			if (setsockopt(newp->p_socket, SOL_SOCKET, SO_RCVBUF,
 			    &sockbuf, sizeof(sockbuf)) == -1)
 				log_warn("setsockopt(SO_RCVBUF) failed "
 				    "for %s", newp->p_listen);
 			sockbuf = SOCKBUF_SIZE;
 			if (setsockopt(newp->p_socket, SOL_SOCKET, SO_SNDBUF,
 			    &sockbuf, sizeof(sockbuf)) == -1)
 				log_warn("setsockopt(SO_SNDBUF) failed "
 				    "for %s", newp->p_listen);
 			error = setsockopt(newp->p_socket, SOL_SOCKET,
 			    SO_REUSEADDR, &one, sizeof(one));
 			if (error != 0) {
 				log_warn("setsockopt(SO_REUSEADDR) failed "
 				    "for %s", newp->p_listen);
 				close(newp->p_socket);
 				newp->p_socket = 0;
 				cumulated_error++;
 				continue;
 			}
 			error = bind(newp->p_socket, newp->p_ai->ai_addr,
 			    newp->p_ai->ai_addrlen);
 			if (error != 0) {
 				log_warn("bind(2) failed for %s",
 				    newp->p_listen);
 				close(newp->p_socket);
 				newp->p_socket = 0;
 				cumulated_error++;
 				continue;
 			}
 			error = listen(newp->p_socket, -1);
 			if (error != 0) {
 				log_warn("listen(2) failed for %s",
 				    newp->p_listen);
 				close(newp->p_socket);
 				newp->p_socket = 0;
 				cumulated_error++;
 				continue;
 			}
 		}
 	}
 
 	/*
 	 * Go through the no longer used sockets, closing them.
 	 */
 	TAILQ_FOREACH(oldpg, &oldconf->conf_portal_groups, pg_next) {
 		TAILQ_FOREACH(oldp, &oldpg->pg_portals, p_next) {
 			if (oldp->p_socket <= 0)
 				continue;
 			log_debugx("closing socket for %s, portal-group \"%s\"",
 			    oldp->p_listen, oldpg->pg_name);
 			close(oldp->p_socket);
 			oldp->p_socket = 0;
 		}
 	}
 
 	/* (Re-)Register on remaining/new iSNS servers. */
 	TAILQ_FOREACH(newns, &newconf->conf_isns, i_next) {
 		TAILQ_FOREACH(oldns, &oldconf->conf_isns, i_next) {
 			if (strcmp(oldns->i_addr, newns->i_addr) == 0)
 				break;
 		}
 		isns_register(newns, oldns);
 	}
 
 	/* Schedule iSNS update */
 	if (!TAILQ_EMPTY(&newconf->conf_isns))
 		set_timeout((newconf->conf_isns_period + 2) / 3, false);
 
 	return (cumulated_error);
 }
 
 bool
 timed_out(void)
 {
 
 	return (sigalrm_received);
 }
 
 static void
 sigalrm_handler_fatal(int dummy __unused)
 {
 	/*
 	 * It would be easiest to just log an error and exit.  We can't
 	 * do this, though, because log_errx() is not signal safe, since
 	 * it calls syslog(3).  Instead, set a flag checked by pdu_send()
 	 * and pdu_receive(), to call log_errx() there.  Should they fail
 	 * to notice, we'll exit here one second later.
 	 */
 	if (sigalrm_received) {
 		/*
 		 * Oh well.  Just give up and quit.
 		 */
 		_exit(2);
 	}
 
 	sigalrm_received = true;
 }
 
 static void
 sigalrm_handler(int dummy __unused)
 {
 
 	sigalrm_received = true;
 }
 
 void
 set_timeout(int timeout, int fatal)
 {
 	struct sigaction sa;
 	struct itimerval itv;
 	int error;
 
 	if (timeout <= 0) {
 		log_debugx("session timeout disabled");
 		bzero(&itv, sizeof(itv));
 		error = setitimer(ITIMER_REAL, &itv, NULL);
 		if (error != 0)
 			log_err(1, "setitimer");
 		sigalrm_received = false;
 		return;
 	}
 
 	sigalrm_received = false;
 	bzero(&sa, sizeof(sa));
 	if (fatal)
 		sa.sa_handler = sigalrm_handler_fatal;
 	else
 		sa.sa_handler = sigalrm_handler;
 	sigfillset(&sa.sa_mask);
 	error = sigaction(SIGALRM, &sa, NULL);
 	if (error != 0)
 		log_err(1, "sigaction");
 
 	/*
 	 * First SIGALRM will arive after conf_timeout seconds.
 	 * If we do nothing, another one will arrive a second later.
 	 */
 	log_debugx("setting session timeout to %d seconds", timeout);
 	bzero(&itv, sizeof(itv));
 	itv.it_interval.tv_sec = 1;
 	itv.it_value.tv_sec = timeout;
 	error = setitimer(ITIMER_REAL, &itv, NULL);
 	if (error != 0)
 		log_err(1, "setitimer");
 }
 
 static int
 wait_for_children(bool block)
 {
 	pid_t pid;
 	int status;
 	int num = 0;
 
 	for (;;) {
 		/*
 		 * If "block" is true, wait for at least one process.
 		 */
 		if (block && num == 0)
 			pid = wait4(-1, &status, 0, NULL);
 		else
 			pid = wait4(-1, &status, WNOHANG, NULL);
 		if (pid <= 0)
 			break;
 		if (WIFSIGNALED(status)) {
 			log_warnx("child process %d terminated with signal %d",
 			    pid, WTERMSIG(status));
 		} else if (WEXITSTATUS(status) != 0) {
 			log_warnx("child process %d terminated with exit status %d",
 			    pid, WEXITSTATUS(status));
 		} else {
 			log_debugx("child process %d terminated gracefully", pid);
 		}
 		num++;
 	}
 
 	return (num);
 }
 
 static void
 handle_connection(struct portal *portal, int fd,
     const struct sockaddr *client_sa, bool dont_fork)
 {
 	struct connection *conn;
 	int error;
 	pid_t pid;
 	char host[NI_MAXHOST + 1];
 	struct conf *conf;
 
 	conf = portal->p_portal_group->pg_conf;
 
 	if (dont_fork) {
 		log_debugx("incoming connection; not forking due to -d flag");
 	} else {
 		nchildren -= wait_for_children(false);
 		assert(nchildren >= 0);
 
 		while (conf->conf_maxproc > 0 && nchildren >= conf->conf_maxproc) {
 			log_debugx("maxproc limit of %d child processes hit; "
 			    "waiting for child process to exit", conf->conf_maxproc);
 			nchildren -= wait_for_children(true);
 			assert(nchildren >= 0);
 		}
 		log_debugx("incoming connection; forking child process #%d",
 		    nchildren);
 		nchildren++;
 		pid = fork();
 		if (pid < 0)
 			log_err(1, "fork");
 		if (pid > 0) {
 			close(fd);
 			return;
 		}
 	}
 	pidfile_close(conf->conf_pidfh);
 
 	error = getnameinfo(client_sa, client_sa->sa_len,
 	    host, sizeof(host), NULL, 0, NI_NUMERICHOST);
 	if (error != 0)
 		log_errx(1, "getnameinfo: %s", gai_strerror(error));
 
 	log_debugx("accepted connection from %s; portal group \"%s\"",
 	    host, portal->p_portal_group->pg_name);
 	log_set_peer_addr(host);
 	setproctitle("%s", host);
 
 	conn = connection_new(portal, fd, host, client_sa);
 	set_timeout(conf->conf_timeout, true);
 	kernel_capsicate();
 	login(conn);
 	if (conn->conn_session_type == CONN_SESSION_TYPE_NORMAL) {
 		kernel_handoff(conn);
 		log_debugx("connection handed off to the kernel");
 	} else {
 		assert(conn->conn_session_type == CONN_SESSION_TYPE_DISCOVERY);
 		discovery(conn);
 	}
 	log_debugx("nothing more to do; exiting");
 	exit(0);
 }
 
 static int
 fd_add(int fd, fd_set *fdset, int nfds)
 {
 
 	/*
 	 * Skip sockets which we failed to bind.
 	 */
 	if (fd <= 0)
 		return (nfds);
 
 	FD_SET(fd, fdset);
 	if (fd > nfds)
 		nfds = fd;
 	return (nfds);
 }
 
 static void
 main_loop(struct conf *conf, bool dont_fork)
 {
 	struct portal_group *pg;
 	struct portal *portal;
 	struct sockaddr_storage client_sa;
 	socklen_t client_salen;
 #ifdef ICL_KERNEL_PROXY
 	int connection_id;
 	int portal_id;
 #endif
 	fd_set fdset;
 	int error, nfds, client_fd;
 
 	pidfile_write(conf->conf_pidfh);
 
 	for (;;) {
 		if (sighup_received || sigterm_received || timed_out())
 			return;
 
 #ifdef ICL_KERNEL_PROXY
 		if (proxy_mode) {
 			client_salen = sizeof(client_sa);
 			kernel_accept(&connection_id, &portal_id,
 			    (struct sockaddr *)&client_sa, &client_salen);
 			assert(client_salen >= client_sa.ss_len);
 
 			log_debugx("incoming connection, id %d, portal id %d",
 			    connection_id, portal_id);
 			TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) {
 				TAILQ_FOREACH(portal, &pg->pg_portals, p_next) {
 					if (portal->p_id == portal_id) {
 						goto found;
 					}
 				}
 			}
 
 			log_errx(1, "kernel returned invalid portal_id %d",
 			    portal_id);
 
 found:
 			handle_connection(portal, connection_id,
 			    (struct sockaddr *)&client_sa, dont_fork);
 		} else {
 #endif
 			assert(proxy_mode == false);
 
 			FD_ZERO(&fdset);
 			nfds = 0;
 			TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) {
 				TAILQ_FOREACH(portal, &pg->pg_portals, p_next)
 					nfds = fd_add(portal->p_socket, &fdset, nfds);
 			}
 			error = select(nfds + 1, &fdset, NULL, NULL, NULL);
 			if (error <= 0) {
 				if (errno == EINTR)
 					return;
 				log_err(1, "select");
 			}
 			TAILQ_FOREACH(pg, &conf->conf_portal_groups, pg_next) {
 				TAILQ_FOREACH(portal, &pg->pg_portals, p_next) {
 					if (!FD_ISSET(portal->p_socket, &fdset))
 						continue;
 					client_salen = sizeof(client_sa);
 					client_fd = accept(portal->p_socket,
 					    (struct sockaddr *)&client_sa,
 					    &client_salen);
 					if (client_fd < 0) {
 						if (errno == ECONNABORTED)
 							continue;
 						log_err(1, "accept");
 					}
 					assert(client_salen >= client_sa.ss_len);
 
 					handle_connection(portal, client_fd,
 					    (struct sockaddr *)&client_sa,
 					    dont_fork);
 					break;
 				}
 			}
 #ifdef ICL_KERNEL_PROXY
 		}
 #endif
 	}
 }
 
 static void
 sighup_handler(int dummy __unused)
 {
 
 	sighup_received = true;
 }
 
 static void
 sigterm_handler(int dummy __unused)
 {
 
 	sigterm_received = true;
 }
 
 static void
 sigchld_handler(int dummy __unused)
 {
 
 	/*
 	 * The only purpose of this handler is to make SIGCHLD
 	 * interrupt the ISCSIDWAIT ioctl(2), so we can call
 	 * wait_for_children().
 	 */
 }
 
 static void
 register_signals(void)
 {
 	struct sigaction sa;
 	int error;
 
 	bzero(&sa, sizeof(sa));
 	sa.sa_handler = sighup_handler;
 	sigfillset(&sa.sa_mask);
 	error = sigaction(SIGHUP, &sa, NULL);
 	if (error != 0)
 		log_err(1, "sigaction");
 
 	sa.sa_handler = sigterm_handler;
 	error = sigaction(SIGTERM, &sa, NULL);
 	if (error != 0)
 		log_err(1, "sigaction");
 
 	sa.sa_handler = sigterm_handler;
 	error = sigaction(SIGINT, &sa, NULL);
 	if (error != 0)
 		log_err(1, "sigaction");
 
 	sa.sa_handler = sigchld_handler;
 	error = sigaction(SIGCHLD, &sa, NULL);
 	if (error != 0)
 		log_err(1, "sigaction");
 }
 
 static void
 check_perms(const char *path)
 {
 	struct stat sb;
 	int error;
 
 	error = stat(path, &sb);
 	if (error != 0) {
 		log_warn("stat");
 		return;
 	}
 	if (sb.st_mode & S_IWOTH) {
 		log_warnx("%s is world-writable", path);
 	} else if (sb.st_mode & S_IROTH) {
 		log_warnx("%s is world-readable", path);
 	} else if (sb.st_mode & S_IXOTH) {
 		/*
 		 * Ok, this one doesn't matter, but still do it,
 		 * just for consistency.
 		 */
 		log_warnx("%s is world-executable", path);
 	}
 
 	/*
 	 * XXX: Should we also check for owner != 0?
 	 */
 }
 
 static struct conf *
 conf_new_from_file(const char *path, struct conf *oldconf, bool ucl)
 {
 	struct conf *conf;
 	struct auth_group *ag;
 	struct portal_group *pg;
 	struct pport *pp;
 	int error;
 
 	log_debugx("obtaining configuration from %s", path);
 
 	conf = conf_new();
 
 	TAILQ_FOREACH(pp, &oldconf->conf_pports, pp_next)
 		pport_copy(pp, conf);
 
 	ag = auth_group_new(conf, "default");
 	assert(ag != NULL);
 
 	ag = auth_group_new(conf, "no-authentication");
 	assert(ag != NULL);
 	ag->ag_type = AG_TYPE_NO_AUTHENTICATION;
 
 	ag = auth_group_new(conf, "no-access");
 	assert(ag != NULL);
 	ag->ag_type = AG_TYPE_DENY;
 
 	pg = portal_group_new(conf, "default");
 	assert(pg != NULL);
 
 	if (ucl)
 		error = uclparse_conf(conf, path);
 	else
 		error = parse_conf(conf, path);
 
 	if (error != 0) {
 		conf_delete(conf);
 		return (NULL);
 	}
 
 	check_perms(path);
 
 	if (conf->conf_default_ag_defined == false) {
 		log_debugx("auth-group \"default\" not defined; "
 		    "going with defaults");
 		ag = auth_group_find(conf, "default");
 		assert(ag != NULL);
 		ag->ag_type = AG_TYPE_DENY;
 	}
 
 	if (conf->conf_default_pg_defined == false) {
 		log_debugx("portal-group \"default\" not defined; "
 		    "going with defaults");
 		pg = portal_group_find(conf, "default");
 		assert(pg != NULL);
 		portal_group_add_listen(pg, "0.0.0.0:3260", false);
 		portal_group_add_listen(pg, "[::]:3260", false);
 	}
 
 	conf->conf_kernel_port_on = true;
 
 	error = conf_verify(conf);
 	if (error != 0) {
 		conf_delete(conf);
 		return (NULL);
 	}
 
 	return (conf);
 }
 
 int
 main(int argc, char **argv)
 {
 	struct conf *oldconf, *newconf, *tmpconf;
 	struct isns *newns;
 	const char *config_path = DEFAULT_CONFIG_PATH;
 	int debug = 0, ch, error;
 	bool dont_daemonize = false;
 	bool use_ucl = false;
 
 	while ((ch = getopt(argc, argv, "duf:R")) != -1) {
 		switch (ch) {
 		case 'd':
 			dont_daemonize = true;
 			debug++;
 			break;
 		case 'u':
 			use_ucl = true;
 			break;
 		case 'f':
 			config_path = optarg;
 			break;
 		case 'R':
 #ifndef ICL_KERNEL_PROXY
 			log_errx(1, "ctld(8) compiled without ICL_KERNEL_PROXY "
 			    "does not support iSER protocol");
 #endif
 			proxy_mode = true;
 			break;
 		case '?':
 		default:
 			usage();
 		}
 	}
 	argc -= optind;
 	if (argc != 0)
 		usage();
 
 	log_init(debug);
 	kernel_init();
 
 	oldconf = conf_new_from_kernel();
 	newconf = conf_new_from_file(config_path, oldconf, use_ucl);
 
 	if (newconf == NULL)
 		log_errx(1, "configuration error; exiting");
 	if (debug > 0) {
 		oldconf->conf_debug = debug;
 		newconf->conf_debug = debug;
 	}
 
 	error = conf_apply(oldconf, newconf);
 	if (error != 0)
 		log_errx(1, "failed to apply configuration; exiting");
 
 	conf_delete(oldconf);
 	oldconf = NULL;
 
 	register_signals();
 
 	if (dont_daemonize == false) {
 		log_debugx("daemonizing");
 		if (daemon(0, 0) == -1) {
 			log_warn("cannot daemonize");
 			pidfile_remove(newconf->conf_pidfh);
 			exit(1);
 		}
 	}
 
 	/* Schedule iSNS update */
 	if (!TAILQ_EMPTY(&newconf->conf_isns))
 		set_timeout((newconf->conf_isns_period + 2) / 3, false);
 
 	for (;;) {
 		main_loop(newconf, dont_daemonize);
 		if (sighup_received) {
 			sighup_received = false;
 			log_debugx("received SIGHUP, reloading configuration");
 			tmpconf = conf_new_from_file(config_path, newconf,
 			    use_ucl);
 
 			if (tmpconf == NULL) {
 				log_warnx("configuration error, "
 				    "continuing with old configuration");
 			} else {
 				if (debug > 0)
 					tmpconf->conf_debug = debug;
 				oldconf = newconf;
 				newconf = tmpconf;
 				error = conf_apply(oldconf, newconf);
 				if (error != 0)
 					log_warnx("failed to reload "
 					    "configuration");
 				conf_delete(oldconf);
 				oldconf = NULL;
 			}
 		} else if (sigterm_received) {
 			log_debugx("exiting on signal; "
 			    "reloading empty configuration");
 
 			log_debugx("removing CTL iSCSI ports "
 			    "and terminating all connections");
 
 			oldconf = newconf;
 			newconf = conf_new();
 			if (debug > 0)
 				newconf->conf_debug = debug;
 			error = conf_apply(oldconf, newconf);
 			if (error != 0)
 				log_warnx("failed to apply configuration");
 			conf_delete(oldconf);
 			oldconf = NULL;
 
 			log_warnx("exiting on signal");
 			exit(0);
 		} else {
 			nchildren -= wait_for_children(false);
 			assert(nchildren >= 0);
 			if (timed_out()) {
 				set_timeout(0, false);
 				TAILQ_FOREACH(newns, &newconf->conf_isns, i_next)
 					isns_check(newns);
 				/* Schedule iSNS update */
 				if (!TAILQ_EMPTY(&newconf->conf_isns)) {
 					set_timeout((newconf->conf_isns_period
 					    + 2) / 3,
 					    false);
 				}
 			}
 		}
 	}
 	/* NOTREACHED */
 }
Index: projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/ctld.h
===================================================================
--- projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/ctld.h	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/ctld.h	(revision 312218)
@@ -1,462 +1,464 @@
 /*-
  * Copyright (c) 2012 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef CTLD_H
 #define	CTLD_H
 
 #include <sys/queue.h>
 #ifdef ICL_KERNEL_PROXY
 #include <sys/types.h>
 #endif
 #include <sys/socket.h>
 #include <stdbool.h>
 #include <libutil.h>
 
 #define	DEFAULT_CONFIG_PATH		"/etc/ctl.conf"
 #define	DEFAULT_PIDFILE			"/var/run/ctld.pid"
 #define	DEFAULT_BLOCKSIZE		512
 #define	DEFAULT_CD_BLOCKSIZE		2048
 
 #define	MAX_LUNS			1024
 #define	MAX_NAME_LEN			223
 #define	MAX_DATA_SEGMENT_LENGTH		(128 * 1024)
-#define	MAX_BURST_LENGTH		16776192
-#define	FIRST_BURST_LENGTH		(128 * 1024)
 #define	SOCKBUF_SIZE			1048576
 
 struct auth {
 	TAILQ_ENTRY(auth)		a_next;
 	struct auth_group		*a_auth_group;
 	char				*a_user;
 	char				*a_secret;
 	char				*a_mutual_user;
 	char				*a_mutual_secret;
 };
 
 struct auth_name {
 	TAILQ_ENTRY(auth_name)		an_next;
 	struct auth_group		*an_auth_group;
 	char				*an_initator_name;
 };
 
 struct auth_portal {
 	TAILQ_ENTRY(auth_portal)	ap_next;
 	struct auth_group		*ap_auth_group;
 	char				*ap_initator_portal;
 	struct sockaddr_storage		ap_sa;
 	int				ap_mask;
 };
 
 #define	AG_TYPE_UNKNOWN			0
 #define	AG_TYPE_DENY			1
 #define	AG_TYPE_NO_AUTHENTICATION	2
 #define	AG_TYPE_CHAP			3
 #define	AG_TYPE_CHAP_MUTUAL		4
 
 struct auth_group {
 	TAILQ_ENTRY(auth_group)		ag_next;
 	struct conf			*ag_conf;
 	char				*ag_name;
 	struct target			*ag_target;
 	int				ag_type;
 	TAILQ_HEAD(, auth)		ag_auths;
 	TAILQ_HEAD(, auth_name)		ag_names;
 	TAILQ_HEAD(, auth_portal)	ag_portals;
 };
 
 struct portal {
 	TAILQ_ENTRY(portal)		p_next;
 	struct portal_group		*p_portal_group;
 	bool				p_iser;
 	char				*p_listen;
 	struct addrinfo			*p_ai;
 #ifdef ICL_KERNEL_PROXY
 	int				p_id;
 #endif
 
 	TAILQ_HEAD(, target)		p_targets;
 	int				p_socket;
 };
 
 TAILQ_HEAD(options, option);
 
 #define	PG_FILTER_UNKNOWN		0
 #define	PG_FILTER_NONE			1
 #define	PG_FILTER_PORTAL		2
 #define	PG_FILTER_PORTAL_NAME		3
 #define	PG_FILTER_PORTAL_NAME_AUTH	4
 
 struct portal_group {
 	TAILQ_ENTRY(portal_group)	pg_next;
 	struct conf			*pg_conf;
 	struct options			pg_options;
 	char				*pg_name;
 	struct auth_group		*pg_discovery_auth_group;
 	int				pg_discovery_filter;
 	int				pg_foreign;
 	bool				pg_unassigned;
 	TAILQ_HEAD(, portal)		pg_portals;
 	TAILQ_HEAD(, port)		pg_ports;
 	char				*pg_offload;
 	char				*pg_redirection;
 
 	uint16_t			pg_tag;
 };
 
 struct pport {
 	TAILQ_ENTRY(pport)		pp_next;
 	TAILQ_HEAD(, port)		pp_ports;
 	struct conf			*pp_conf;
 	char				*pp_name;
 
 	uint32_t			pp_ctl_port;
 };
 
 struct port {
 	TAILQ_ENTRY(port)		p_next;
 	TAILQ_ENTRY(port)		p_pgs;
 	TAILQ_ENTRY(port)		p_pps;
 	TAILQ_ENTRY(port)		p_ts;
 	struct conf			*p_conf;
 	char				*p_name;
 	struct auth_group		*p_auth_group;
 	struct portal_group		*p_portal_group;
 	struct pport			*p_pport;
 	struct target			*p_target;
 	int				p_foreign;
 
 	uint32_t			p_ctl_port;
 };
 
 struct option {
 	TAILQ_ENTRY(option)		o_next;
 	char				*o_name;
 	char				*o_value;
 };
 
 struct lun {
 	TAILQ_ENTRY(lun)		l_next;
 	struct conf			*l_conf;
 	struct options			l_options;
 	char				*l_name;
 	char				*l_backend;
 	uint8_t				l_device_type;
 	int				l_blocksize;
 	char				*l_device_id;
 	char				*l_path;
 	char				*l_scsiname;
 	char				*l_serial;
 	int64_t				l_size;
 
 	int				l_ctl_lun;
 };
 
 struct target {
 	TAILQ_ENTRY(target)		t_next;
 	struct conf			*t_conf;
 	struct lun			*t_luns[MAX_LUNS];
 	struct auth_group		*t_auth_group;
 	TAILQ_HEAD(, port)		t_ports;
 	char				*t_name;
 	char				*t_alias;
 	char				*t_redirection;
 };
 
 struct isns {
 	TAILQ_ENTRY(isns)		i_next;
 	struct conf			*i_conf;
 	char				*i_addr;
 	struct addrinfo			*i_ai;
 };
 
 struct conf {
 	char				*conf_pidfile_path;
 	TAILQ_HEAD(, lun)		conf_luns;
 	TAILQ_HEAD(, target)		conf_targets;
 	TAILQ_HEAD(, auth_group)	conf_auth_groups;
 	TAILQ_HEAD(, port)		conf_ports;
 	TAILQ_HEAD(, portal_group)	conf_portal_groups;
 	TAILQ_HEAD(, pport)		conf_pports;
 	TAILQ_HEAD(, isns)		conf_isns;
 	int				conf_isns_period;
 	int				conf_isns_timeout;
 	int				conf_debug;
 	int				conf_timeout;
 	int				conf_maxproc;
 
 #ifdef ICL_KERNEL_PROXY
 	int				conf_portal_id;
 #endif
 	struct pidfh			*conf_pidfh;
 
 	bool				conf_default_pg_defined;
 	bool				conf_default_ag_defined;
 	bool				conf_kernel_port_on;
 };
 
 #define	CONN_SESSION_TYPE_NONE		0
 #define	CONN_SESSION_TYPE_DISCOVERY	1
 #define	CONN_SESSION_TYPE_NORMAL	2
 
 #define	CONN_DIGEST_NONE		0
 #define	CONN_DIGEST_CRC32C		1
 
 struct connection {
 	struct portal		*conn_portal;
 	struct port		*conn_port;
 	struct target		*conn_target;
 	int			conn_socket;
 	int			conn_session_type;
 	char			*conn_initiator_name;
 	char			*conn_initiator_addr;
 	char			*conn_initiator_alias;
 	uint8_t			conn_initiator_isid[6];
 	struct sockaddr_storage	conn_initiator_sa;
 	uint32_t		conn_cmdsn;
 	uint32_t		conn_statsn;
+	int			conn_max_recv_data_segment_limit;
+	int			conn_max_send_data_segment_limit;
+	int			conn_max_burst_limit;
+	int			conn_first_burst_limit;
 	int			conn_max_recv_data_segment_length;
 	int			conn_max_send_data_segment_length;
 	int			conn_max_burst_length;
 	int			conn_first_burst_length;
 	int			conn_immediate_data;
 	int			conn_header_digest;
 	int			conn_data_digest;
 	const char		*conn_user;
 	struct chap		*conn_chap;
 };
 
 struct pdu {
 	struct connection	*pdu_connection;
 	struct iscsi_bhs	*pdu_bhs;
 	char			*pdu_data;
 	size_t			pdu_data_len;
 };
 
 #define	KEYS_MAX	1024
 
 struct keys {
 	char		*keys_names[KEYS_MAX];
 	char		*keys_values[KEYS_MAX];
 	char		*keys_data;
 	size_t		keys_data_len;
 };
 
 #define	CHAP_CHALLENGE_LEN	1024
 #define	CHAP_DIGEST_LEN		16 /* Equal to MD5 digest size. */
 
 struct chap {
 	unsigned char	chap_id;
 	char		chap_challenge[CHAP_CHALLENGE_LEN];
 	char		chap_response[CHAP_DIGEST_LEN];
 };
 
 struct rchap {
 	char		*rchap_secret;
 	unsigned char	rchap_id;
 	void		*rchap_challenge;
 	size_t		rchap_challenge_len;
 };
 
 struct chap		*chap_new(void);
 char			*chap_get_id(const struct chap *chap);
 char			*chap_get_challenge(const struct chap *chap);
 int			chap_receive(struct chap *chap, const char *response);
 int			chap_authenticate(struct chap *chap,
 			    const char *secret);
 void			chap_delete(struct chap *chap);
 
 struct rchap		*rchap_new(const char *secret);
 int			rchap_receive(struct rchap *rchap,
 			    const char *id, const char *challenge);
 char			*rchap_get_response(struct rchap *rchap);
 void			rchap_delete(struct rchap *rchap);
 
 int			parse_conf(struct conf *conf, const char *path);
 int			uclparse_conf(struct conf *conf, const char *path);
 
 struct conf		*conf_new(void);
 struct conf		*conf_new_from_kernel(void);
 void			conf_delete(struct conf *conf);
 int			conf_verify(struct conf *conf);
 
 struct auth_group	*auth_group_new(struct conf *conf, const char *name);
 void			auth_group_delete(struct auth_group *ag);
 struct auth_group	*auth_group_find(const struct conf *conf,
 			    const char *name);
 int			auth_group_set_type(struct auth_group *ag,
 			    const char *type);
 
 const struct auth	*auth_new_chap(struct auth_group *ag,
 			    const char *user, const char *secret);
 const struct auth	*auth_new_chap_mutual(struct auth_group *ag,
 			    const char *user, const char *secret,
 			    const char *user2, const char *secret2);
 const struct auth	*auth_find(const struct auth_group *ag,
 			    const char *user);
 
 const struct auth_name	*auth_name_new(struct auth_group *ag,
 			    const char *initiator_name);
 bool			auth_name_defined(const struct auth_group *ag);
 const struct auth_name	*auth_name_find(const struct auth_group *ag,
 			    const char *initiator_name);
 int			auth_name_check(const struct auth_group *ag,
 			    const char *initiator_name);
 
 const struct auth_portal	*auth_portal_new(struct auth_group *ag,
 				    const char *initiator_portal);
 bool			auth_portal_defined(const struct auth_group *ag);
 const struct auth_portal	*auth_portal_find(const struct auth_group *ag,
 				    const struct sockaddr_storage *sa);
 int				auth_portal_check(const struct auth_group *ag,
 				    const struct sockaddr_storage *sa);
 
 struct portal_group	*portal_group_new(struct conf *conf, const char *name);
 void			portal_group_delete(struct portal_group *pg);
 struct portal_group	*portal_group_find(const struct conf *conf,
 			    const char *name);
 int			portal_group_add_listen(struct portal_group *pg,
 			    const char *listen, bool iser);
 int			portal_group_set_filter(struct portal_group *pg,
 			    const char *filter);
 int			portal_group_set_offload(struct portal_group *pg,
 			    const char *offload);
 int			portal_group_set_redirection(struct portal_group *pg,
 			    const char *addr);
 
 int			isns_new(struct conf *conf, const char *addr);
 void			isns_delete(struct isns *is);
 void			isns_register(struct isns *isns, struct isns *oldisns);
 void			isns_check(struct isns *isns);
 void			isns_deregister(struct isns *isns);
 
 struct pport		*pport_new(struct conf *conf, const char *name,
 			    uint32_t ctl_port);
 struct pport		*pport_find(const struct conf *conf, const char *name);
 struct pport		*pport_copy(struct pport *pport, struct conf *conf);
 void			pport_delete(struct pport *pport);
 
 struct port		*port_new(struct conf *conf, struct target *target,
 			    struct portal_group *pg);
 struct port		*port_new_pp(struct conf *conf, struct target *target,
 			    struct pport *pp);
 struct port		*port_find(const struct conf *conf, const char *name);
 struct port		*port_find_in_pg(const struct portal_group *pg,
 			    const char *target);
 void			port_delete(struct port *port);
 
 struct target		*target_new(struct conf *conf, const char *name);
 void			target_delete(struct target *target);
 struct target		*target_find(struct conf *conf,
 			    const char *name);
 int			target_set_redirection(struct target *target,
 			    const char *addr);
 
 struct lun		*lun_new(struct conf *conf, const char *name);
 void			lun_delete(struct lun *lun);
 struct lun		*lun_find(const struct conf *conf, const char *name);
 void			lun_set_backend(struct lun *lun, const char *value);
 void			lun_set_device_type(struct lun *lun, uint8_t value);
 void			lun_set_blocksize(struct lun *lun, size_t value);
 void			lun_set_device_id(struct lun *lun, const char *value);
 void			lun_set_path(struct lun *lun, const char *value);
 void			lun_set_scsiname(struct lun *lun, const char *value);
 void			lun_set_serial(struct lun *lun, const char *value);
 void			lun_set_size(struct lun *lun, size_t value);
 void			lun_set_ctl_lun(struct lun *lun, uint32_t value);
 
 struct option		*option_new(struct options *os,
 			    const char *name, const char *value);
 void			option_delete(struct options *os, struct option *co);
 struct option		*option_find(const struct options *os, const char *name);
 void			option_set(struct option *o, const char *value);
 
 void			kernel_init(void);
 int			kernel_lun_add(struct lun *lun);
 int			kernel_lun_modify(struct lun *lun);
 int			kernel_lun_remove(struct lun *lun);
 void			kernel_handoff(struct connection *conn);
 void			kernel_limits(const char *offload,
 			    int *max_recv_data_segment_length,
 			    int *max_send_data_segment_length,
 			    int *max_burst_length,
 			    int *first_burst_length);
 int			kernel_port_add(struct port *port);
 int			kernel_port_update(struct port *port, struct port *old);
 int			kernel_port_remove(struct port *port);
 void			kernel_capsicate(void);
 
 #ifdef ICL_KERNEL_PROXY
 void			kernel_listen(struct addrinfo *ai, bool iser,
 			    int portal_id);
 void			kernel_accept(int *connection_id, int *portal_id,
 			    struct sockaddr *client_sa,
 			    socklen_t *client_salen);
 void			kernel_send(struct pdu *pdu);
 void			kernel_receive(struct pdu *pdu);
 #endif
 
 struct keys		*keys_new(void);
 void			keys_delete(struct keys *keys);
 void			keys_load(struct keys *keys, const struct pdu *pdu);
 void			keys_save(struct keys *keys, struct pdu *pdu);
 const char		*keys_find(struct keys *keys, const char *name);
 void			keys_add(struct keys *keys,
 			    const char *name, const char *value);
 void			keys_add_int(struct keys *keys,
 			    const char *name, int value);
 
 struct pdu		*pdu_new(struct connection *conn);
 struct pdu		*pdu_new_response(struct pdu *request);
 void			pdu_delete(struct pdu *pdu);
 void			pdu_receive(struct pdu *request);
 void			pdu_send(struct pdu *response);
 
 void			login(struct connection *conn);
 
 void			discovery(struct connection *conn);
 
 void			log_init(int level);
 void			log_set_peer_name(const char *name);
 void			log_set_peer_addr(const char *addr);
 void			log_err(int, const char *, ...)
 			    __dead2 __printflike(2, 3);
 void			log_errx(int, const char *, ...)
 			    __dead2 __printflike(2, 3);
 void			log_warn(const char *, ...) __printflike(1, 2);
 void			log_warnx(const char *, ...) __printflike(1, 2);
 void			log_debugx(const char *, ...) __printflike(1, 2);
 
 char			*checked_strdup(const char *);
 bool			valid_iscsi_name(const char *name);
 void			set_timeout(int timeout, int fatal);
 bool			timed_out(void);
 
 #endif /* !CTLD_H */
Index: projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/login.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/login.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/usr.sbin/ctld/login.c	(revision 312218)
@@ -1,1038 +1,1057 @@
 /*-
  * Copyright (c) 2012 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <assert.h>
 #include <stdbool.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <netinet/in.h>
 
 #include "ctld.h"
 #include "iscsi_proto.h"
 
 static void login_send_error(struct pdu *request,
     char class, char detail);
 
 static void
 login_set_nsg(struct pdu *response, int nsg)
 {
 	struct iscsi_bhs_login_response *bhslr;
 
 	assert(nsg == BHSLR_STAGE_SECURITY_NEGOTIATION ||
 	    nsg == BHSLR_STAGE_OPERATIONAL_NEGOTIATION ||
 	    nsg == BHSLR_STAGE_FULL_FEATURE_PHASE);
 
 	bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs;
 
 	bhslr->bhslr_flags &= 0xFC;
 	bhslr->bhslr_flags |= nsg;
 	bhslr->bhslr_flags |= BHSLR_FLAGS_TRANSIT;
 }
 
 static int
 login_csg(const struct pdu *request)
 {
 	struct iscsi_bhs_login_request *bhslr;
 
 	bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs;
 
 	return ((bhslr->bhslr_flags & 0x0C) >> 2);
 }
 
 static void
 login_set_csg(struct pdu *response, int csg)
 {
 	struct iscsi_bhs_login_response *bhslr;
 
 	assert(csg == BHSLR_STAGE_SECURITY_NEGOTIATION ||
 	    csg == BHSLR_STAGE_OPERATIONAL_NEGOTIATION ||
 	    csg == BHSLR_STAGE_FULL_FEATURE_PHASE);
 
 	bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs;
 
 	bhslr->bhslr_flags &= 0xF3;
 	bhslr->bhslr_flags |= csg << 2;
 }
 
 static struct pdu *
 login_receive(struct connection *conn, bool initial)
 {
 	struct pdu *request;
 	struct iscsi_bhs_login_request *bhslr;
 
 	request = pdu_new(conn);
 	pdu_receive(request);
 	if ((request->pdu_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) !=
 	    ISCSI_BHS_OPCODE_LOGIN_REQUEST) {
 		/*
 		 * The first PDU in session is special - if we receive any PDU
 		 * different than login request, we have to drop the connection
 		 * without sending response ("A target receiving any PDU
 		 * except a Login request before the Login Phase is started MUST
 		 * immediately terminate the connection on which the PDU
 		 * was received.")
 		 */
 		if (initial == false)
 			login_send_error(request, 0x02, 0x0b);
 		log_errx(1, "protocol error: received invalid opcode 0x%x",
 		    request->pdu_bhs->bhs_opcode);
 	}
 	bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs;
 	/*
 	 * XXX: Implement the C flag some day.
 	 */
 	if ((bhslr->bhslr_flags & BHSLR_FLAGS_CONTINUE) != 0) {
 		login_send_error(request, 0x03, 0x00);
 		log_errx(1, "received Login PDU with unsupported \"C\" flag");
 	}
 	if (bhslr->bhslr_version_max != 0x00) {
 		login_send_error(request, 0x02, 0x05);
 		log_errx(1, "received Login PDU with unsupported "
 		    "Version-max 0x%x", bhslr->bhslr_version_max);
 	}
 	if (bhslr->bhslr_version_min != 0x00) {
 		login_send_error(request, 0x02, 0x05);
 		log_errx(1, "received Login PDU with unsupported "
 		    "Version-min 0x%x", bhslr->bhslr_version_min);
 	}
 	if (initial == false &&
 	    ISCSI_SNLT(ntohl(bhslr->bhslr_cmdsn), conn->conn_cmdsn)) {
 		login_send_error(request, 0x02, 0x00);
 		log_errx(1, "received Login PDU with decreasing CmdSN: "
 		    "was %u, is %u", conn->conn_cmdsn,
 		    ntohl(bhslr->bhslr_cmdsn));
 	}
 	if (initial == false &&
 	    ntohl(bhslr->bhslr_expstatsn) != conn->conn_statsn) {
 		login_send_error(request, 0x02, 0x00);
 		log_errx(1, "received Login PDU with wrong ExpStatSN: "
 		    "is %u, should be %u", ntohl(bhslr->bhslr_expstatsn),
 		    conn->conn_statsn);
 	}
 	conn->conn_cmdsn = ntohl(bhslr->bhslr_cmdsn);
 
 	return (request);
 }
 
 static struct pdu *
 login_new_response(struct pdu *request)
 {
 	struct pdu *response;
 	struct connection *conn;
 	struct iscsi_bhs_login_request *bhslr;
 	struct iscsi_bhs_login_response *bhslr2;
 
 	bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs;
 	conn = request->pdu_connection;
 
 	response = pdu_new_response(request);
 	bhslr2 = (struct iscsi_bhs_login_response *)response->pdu_bhs;
 	bhslr2->bhslr_opcode = ISCSI_BHS_OPCODE_LOGIN_RESPONSE;
 	login_set_csg(response, BHSLR_STAGE_SECURITY_NEGOTIATION);
 	memcpy(bhslr2->bhslr_isid,
 	    bhslr->bhslr_isid, sizeof(bhslr2->bhslr_isid));
 	bhslr2->bhslr_initiator_task_tag = bhslr->bhslr_initiator_task_tag;
 	bhslr2->bhslr_statsn = htonl(conn->conn_statsn++);
 	bhslr2->bhslr_expcmdsn = htonl(conn->conn_cmdsn);
 	bhslr2->bhslr_maxcmdsn = htonl(conn->conn_cmdsn);
 
 	return (response);
 }
 
 static void
 login_send_error(struct pdu *request, char class, char detail)
 {
 	struct pdu *response;
 	struct iscsi_bhs_login_response *bhslr2;
 
 	log_debugx("sending Login Response PDU with failure class 0x%x/0x%x; "
 	    "see next line for reason", class, detail);
 	response = login_new_response(request);
 	bhslr2 = (struct iscsi_bhs_login_response *)response->pdu_bhs;
 	bhslr2->bhslr_status_class = class;
 	bhslr2->bhslr_status_detail = detail;
 
 	pdu_send(response);
 	pdu_delete(response);
 }
 
 static int
 login_list_contains(const char *list, const char *what)
 {
 	char *tofree, *str, *token;
 
 	tofree = str = checked_strdup(list);
 
 	while ((token = strsep(&str, ",")) != NULL) {
 		if (strcmp(token, what) == 0) {
 			free(tofree);
 			return (1);
 		}
 	}
 	free(tofree);
 	return (0);
 }
 
 static int
 login_list_prefers(const char *list,
     const char *choice1, const char *choice2)
 {
 	char *tofree, *str, *token;
 
 	tofree = str = checked_strdup(list);
 
 	while ((token = strsep(&str, ",")) != NULL) {
 		if (strcmp(token, choice1) == 0) {
 			free(tofree);
 			return (1);
 		}
 		if (strcmp(token, choice2) == 0) {
 			free(tofree);
 			return (2);
 		}
 	}
 	free(tofree);
 	return (-1);
 }
 
 static struct pdu *
 login_receive_chap_a(struct connection *conn)
 {
 	struct pdu *request;
 	struct keys *request_keys;
 	const char *chap_a;
 
 	request = login_receive(conn, false);
 	request_keys = keys_new();
 	keys_load(request_keys, request);
 
 	chap_a = keys_find(request_keys, "CHAP_A");
 	if (chap_a == NULL) {
 		login_send_error(request, 0x02, 0x07);
 		log_errx(1, "received CHAP Login PDU without CHAP_A");
 	}
 	if (login_list_contains(chap_a, "5") == 0) {
 		login_send_error(request, 0x02, 0x01);
 		log_errx(1, "received CHAP Login PDU with unsupported CHAP_A "
 		    "\"%s\"", chap_a);
 	}
 	keys_delete(request_keys);
 
 	return (request);
 }
 
 static void
 login_send_chap_c(struct pdu *request, struct chap *chap)
 {
 	struct pdu *response;
 	struct keys *response_keys;
 	char *chap_c, *chap_i;
 
 	chap_c = chap_get_challenge(chap);
 	chap_i = chap_get_id(chap);
 
 	response = login_new_response(request);
 	response_keys = keys_new();
 	keys_add(response_keys, "CHAP_A", "5");
 	keys_add(response_keys, "CHAP_I", chap_i);
 	keys_add(response_keys, "CHAP_C", chap_c);
 	free(chap_i);
 	free(chap_c);
 	keys_save(response_keys, response);
 	pdu_send(response);
 	pdu_delete(response);
 	keys_delete(response_keys);
 }
 
 static struct pdu *
 login_receive_chap_r(struct connection *conn, struct auth_group *ag,
     struct chap *chap, const struct auth **authp)
 {
 	struct pdu *request;
 	struct keys *request_keys;
 	const char *chap_n, *chap_r;
 	const struct auth *auth;
 	int error;
 
 	request = login_receive(conn, false);
 	request_keys = keys_new();
 	keys_load(request_keys, request);
 
 	chap_n = keys_find(request_keys, "CHAP_N");
 	if (chap_n == NULL) {
 		login_send_error(request, 0x02, 0x07);
 		log_errx(1, "received CHAP Login PDU without CHAP_N");
 	}
 	chap_r = keys_find(request_keys, "CHAP_R");
 	if (chap_r == NULL) {
 		login_send_error(request, 0x02, 0x07);
 		log_errx(1, "received CHAP Login PDU without CHAP_R");
 	}
 	error = chap_receive(chap, chap_r);
 	if (error != 0) {
 		login_send_error(request, 0x02, 0x07);
 		log_errx(1, "received CHAP Login PDU with malformed CHAP_R");
 	}
 
 	/*
 	 * Verify the response.
 	 */
 	assert(ag->ag_type == AG_TYPE_CHAP ||
 	    ag->ag_type == AG_TYPE_CHAP_MUTUAL);
 	auth = auth_find(ag, chap_n);
 	if (auth == NULL) {
 		login_send_error(request, 0x02, 0x01);
 		log_errx(1, "received CHAP Login with invalid user \"%s\"",
 		    chap_n);
 	}
 
 	assert(auth->a_secret != NULL);
 	assert(strlen(auth->a_secret) > 0);
 
 	error = chap_authenticate(chap, auth->a_secret);
 	if (error != 0) {
 		login_send_error(request, 0x02, 0x01);
 		log_errx(1, "CHAP authentication failed for user \"%s\"",
 		    auth->a_user);
 	}
 
 	keys_delete(request_keys);
 
 	*authp = auth;
 	return (request);
 }
 
 static void
 login_send_chap_success(struct pdu *request,
     const struct auth *auth)
 {
 	struct pdu *response;
 	struct keys *request_keys, *response_keys;
 	struct rchap *rchap;
 	const char *chap_i, *chap_c;
 	char *chap_r;
 	int error;
 
 	response = login_new_response(request);
 	login_set_nsg(response, BHSLR_STAGE_OPERATIONAL_NEGOTIATION);
 
 	/*
 	 * Actually, one more thing: mutual authentication.
 	 */
 	request_keys = keys_new();
 	keys_load(request_keys, request);
 	chap_i = keys_find(request_keys, "CHAP_I");
 	chap_c = keys_find(request_keys, "CHAP_C");
 	if (chap_i != NULL || chap_c != NULL) {
 		if (chap_i == NULL) {
 			login_send_error(request, 0x02, 0x07);
 			log_errx(1, "initiator requested target "
 			    "authentication, but didn't send CHAP_I");
 		}
 		if (chap_c == NULL) {
 			login_send_error(request, 0x02, 0x07);
 			log_errx(1, "initiator requested target "
 			    "authentication, but didn't send CHAP_C");
 		}
 		if (auth->a_auth_group->ag_type != AG_TYPE_CHAP_MUTUAL) {
 			login_send_error(request, 0x02, 0x01);
 			log_errx(1, "initiator requests target authentication "
 			    "for user \"%s\", but mutual user/secret "
 			    "is not set", auth->a_user);
 		}
 
 		log_debugx("performing mutual authentication as user \"%s\"",
 		    auth->a_mutual_user);
 
 		rchap = rchap_new(auth->a_mutual_secret);
 		error = rchap_receive(rchap, chap_i, chap_c);
 		if (error != 0) {
 			login_send_error(request, 0x02, 0x07);
 			log_errx(1, "received CHAP Login PDU with malformed "
 			    "CHAP_I or CHAP_C");
 		}
 		chap_r = rchap_get_response(rchap);
 		rchap_delete(rchap);
 		response_keys = keys_new();
 		keys_add(response_keys, "CHAP_N", auth->a_mutual_user);
 		keys_add(response_keys, "CHAP_R", chap_r);
 		free(chap_r);
 		keys_save(response_keys, response);
 		keys_delete(response_keys);
 	} else {
 		log_debugx("initiator did not request target authentication");
 	}
 
 	keys_delete(request_keys);
 	pdu_send(response);
 	pdu_delete(response);
 }
 
 static void
 login_chap(struct connection *conn, struct auth_group *ag)
 {
 	const struct auth *auth;
 	struct chap *chap;
 	struct pdu *request;
 
 	/*
 	 * Receive CHAP_A PDU.
 	 */
 	log_debugx("beginning CHAP authentication; waiting for CHAP_A");
 	request = login_receive_chap_a(conn);
 
 	/*
 	 * Generate the challenge.
 	 */
 	chap = chap_new();
 
 	/*
 	 * Send the challenge.
 	 */
 	log_debugx("sending CHAP_C, binary challenge size is %zd bytes",
 	    sizeof(chap->chap_challenge));
 	login_send_chap_c(request, chap);
 	pdu_delete(request);
 
 	/*
 	 * Receive CHAP_N/CHAP_R PDU and authenticate.
 	 */
 	log_debugx("waiting for CHAP_N/CHAP_R");
 	request = login_receive_chap_r(conn, ag, chap, &auth);
 
 	/*
 	 * Yay, authentication succeeded!
 	 */
 	log_debugx("authentication succeeded for user \"%s\"; "
 	    "transitioning to Negotiation Phase", auth->a_user);
 	login_send_chap_success(request, auth);
 	pdu_delete(request);
 
 	/*
 	 * Leave username and CHAP information for discovery().
 	 */
 	conn->conn_user = auth->a_user;
 	conn->conn_chap = chap;
 }
 
 static void
 login_negotiate_key(struct pdu *request, const char *name,
     const char *value, bool skipped_security, struct keys *response_keys)
 {
 	int which;
 	size_t tmp;
 	struct connection *conn;
 
 	conn = request->pdu_connection;
 
 	if (strcmp(name, "InitiatorName") == 0) {
 		if (!skipped_security)
 			log_errx(1, "initiator resent InitiatorName");
 	} else if (strcmp(name, "SessionType") == 0) {
 		if (!skipped_security)
 			log_errx(1, "initiator resent SessionType");
 	} else if (strcmp(name, "TargetName") == 0) {
 		if (!skipped_security)
 			log_errx(1, "initiator resent TargetName");
 	} else if (strcmp(name, "InitiatorAlias") == 0) {
 		if (conn->conn_initiator_alias != NULL)
 			free(conn->conn_initiator_alias);
 		conn->conn_initiator_alias = checked_strdup(value);
 	} else if (strcmp(value, "Irrelevant") == 0) {
 		/* Ignore. */
 	} else if (strcmp(name, "HeaderDigest") == 0) {
 		/*
 		 * We don't handle digests for discovery sessions.
 		 */
 		if (conn->conn_session_type == CONN_SESSION_TYPE_DISCOVERY) {
 			log_debugx("discovery session; digests disabled");
 			keys_add(response_keys, name, "None");
 			return;
 		}
 
 		which = login_list_prefers(value, "CRC32C", "None");
 		switch (which) {
 		case 1:
 			log_debugx("initiator prefers CRC32C "
 			    "for header digest; we'll use it");
 			conn->conn_header_digest = CONN_DIGEST_CRC32C;
 			keys_add(response_keys, name, "CRC32C");
 			break;
 		case 2:
 			log_debugx("initiator prefers not to do "
 			    "header digest; we'll comply");
 			keys_add(response_keys, name, "None");
 			break;
 		default:
 			log_warnx("initiator sent unrecognized "
 			    "HeaderDigest value \"%s\"; will use None", value);
 			keys_add(response_keys, name, "None");
 			break;
 		}
 	} else if (strcmp(name, "DataDigest") == 0) {
 		if (conn->conn_session_type == CONN_SESSION_TYPE_DISCOVERY) {
 			log_debugx("discovery session; digests disabled");
 			keys_add(response_keys, name, "None");
 			return;
 		}
 
 		which = login_list_prefers(value, "CRC32C", "None");
 		switch (which) {
 		case 1:
 			log_debugx("initiator prefers CRC32C "
 			    "for data digest; we'll use it");
 			conn->conn_data_digest = CONN_DIGEST_CRC32C;
 			keys_add(response_keys, name, "CRC32C");
 			break;
 		case 2:
 			log_debugx("initiator prefers not to do "
 			    "data digest; we'll comply");
 			keys_add(response_keys, name, "None");
 			break;
 		default:
 			log_warnx("initiator sent unrecognized "
 			    "DataDigest value \"%s\"; will use None", value);
 			keys_add(response_keys, name, "None");
 			break;
 		}
 	} else if (strcmp(name, "MaxConnections") == 0) {
 		keys_add(response_keys, name, "1");
 	} else if (strcmp(name, "InitialR2T") == 0) {
 		keys_add(response_keys, name, "Yes");
 	} else if (strcmp(name, "ImmediateData") == 0) {
 		if (conn->conn_session_type == CONN_SESSION_TYPE_DISCOVERY) {
 			log_debugx("discovery session; ImmediateData irrelevant");
 			keys_add(response_keys, name, "Irrelevant");
 		} else {
 			if (strcmp(value, "Yes") == 0) {
 				conn->conn_immediate_data = true;
 				keys_add(response_keys, name, "Yes");
 			} else {
 				conn->conn_immediate_data = false;
 				keys_add(response_keys, name, "No");
 			}
 		}
 	} else if (strcmp(name, "MaxRecvDataSegmentLength") == 0) {
 		tmp = strtoul(value, NULL, 10);
 		if (tmp <= 0) {
 			login_send_error(request, 0x02, 0x00);
 			log_errx(1, "received invalid "
 			    "MaxRecvDataSegmentLength");
 		}
 
 		/*
 		 * MaxRecvDataSegmentLength is a direction-specific parameter.
 		 * We'll limit our _send_ to what the initiator can handle but
 		 * our MaxRecvDataSegmentLength is not influenced by the
 		 * initiator in any way.
 		 */
-		if ((int)tmp > conn->conn_max_send_data_segment_length) {
-			log_debugx("capping max_send_data_segment_length "
+		if ((int)tmp > conn->conn_max_send_data_segment_limit) {
+			log_debugx("capping MaxRecvDataSegmentLength "
 			    "from %zd to %d", tmp,
-			    conn->conn_max_send_data_segment_length);
-			tmp = conn->conn_max_send_data_segment_length;
+			    conn->conn_max_send_data_segment_limit);
+			tmp = conn->conn_max_send_data_segment_limit;
 		}
 		conn->conn_max_send_data_segment_length = tmp;
+		conn->conn_max_recv_data_segment_length =
+		    conn->conn_max_recv_data_segment_limit;
 		keys_add_int(response_keys, name,
 		    conn->conn_max_recv_data_segment_length);
 	} else if (strcmp(name, "MaxBurstLength") == 0) {
 		tmp = strtoul(value, NULL, 10);
 		if (tmp <= 0) {
 			login_send_error(request, 0x02, 0x00);
 			log_errx(1, "received invalid MaxBurstLength");
 		}
-		if ((int)tmp > conn->conn_max_burst_length) {
+		if ((int)tmp > conn->conn_max_burst_limit) {
 			log_debugx("capping MaxBurstLength from %zd to %d",
-			    tmp, conn->conn_max_burst_length);
-			tmp = conn->conn_max_burst_length;
+			    tmp, conn->conn_max_burst_limit);
+			tmp = conn->conn_max_burst_limit;
 		}
 		conn->conn_max_burst_length = tmp;
 		keys_add_int(response_keys, name, tmp);
 	} else if (strcmp(name, "FirstBurstLength") == 0) {
 		tmp = strtoul(value, NULL, 10);
 		if (tmp <= 0) {
 			login_send_error(request, 0x02, 0x00);
 			log_errx(1, "received invalid FirstBurstLength");
 		}
-		if ((int)tmp > conn->conn_first_burst_length) {
+		if ((int)tmp > conn->conn_first_burst_limit) {
 			log_debugx("capping FirstBurstLength from %zd to %d",
-			    tmp, conn->conn_first_burst_length);
-			tmp = conn->conn_first_burst_length;
+			    tmp, conn->conn_first_burst_limit);
+			tmp = conn->conn_first_burst_limit;
 		}
 		conn->conn_first_burst_length = tmp;
 		keys_add_int(response_keys, name, tmp);
 	} else if (strcmp(name, "DefaultTime2Wait") == 0) {
 		keys_add(response_keys, name, value);
 	} else if (strcmp(name, "DefaultTime2Retain") == 0) {
 		keys_add(response_keys, name, "0");
 	} else if (strcmp(name, "MaxOutstandingR2T") == 0) {
 		keys_add(response_keys, name, "1");
 	} else if (strcmp(name, "DataPDUInOrder") == 0) {
 		keys_add(response_keys, name, "Yes");
 	} else if (strcmp(name, "DataSequenceInOrder") == 0) {
 		keys_add(response_keys, name, "Yes");
 	} else if (strcmp(name, "ErrorRecoveryLevel") == 0) {
 		keys_add(response_keys, name, "0");
 	} else if (strcmp(name, "OFMarker") == 0) {
 		keys_add(response_keys, name, "No");
 	} else if (strcmp(name, "IFMarker") == 0) {
 		keys_add(response_keys, name, "No");
 	} else if (strcmp(name, "iSCSIProtocolLevel") == 0) {
 		tmp = strtoul(value, NULL, 10);
 		if (tmp > 2)
 			tmp = 2;
 		keys_add_int(response_keys, name, tmp);
 	} else {
 		log_debugx("unknown key \"%s\"; responding "
 		    "with NotUnderstood", name);
 		keys_add(response_keys, name, "NotUnderstood");
 	}
 }
 
 static void
 login_redirect(struct pdu *request, const char *target_address)
 {
 	struct pdu *response;
 	struct iscsi_bhs_login_response *bhslr2;
 	struct keys *response_keys;
 
 	response = login_new_response(request);
 	login_set_csg(response, login_csg(request));
 	bhslr2 = (struct iscsi_bhs_login_response *)response->pdu_bhs;
 	bhslr2->bhslr_status_class = 0x01;
 	bhslr2->bhslr_status_detail = 0x01;
 
 	response_keys = keys_new();
 	keys_add(response_keys, "TargetAddress", target_address);
 
 	keys_save(response_keys, response);
 	pdu_send(response);
 	pdu_delete(response);
 	keys_delete(response_keys);
 }
 
 static bool
 login_portal_redirect(struct connection *conn, struct pdu *request)
 {
 	const struct portal_group *pg;
 
 	pg = conn->conn_portal->p_portal_group;
 	if (pg->pg_redirection == NULL)
 		return (false);
 
 	log_debugx("portal-group \"%s\" configured to redirect to %s",
 	    pg->pg_name, pg->pg_redirection);
 	login_redirect(request, pg->pg_redirection);
 
 	return (true);
 }
 
 static bool
 login_target_redirect(struct connection *conn, struct pdu *request)
 {
 	const char *target_address;
 
 	assert(conn->conn_portal->p_portal_group->pg_redirection == NULL);
 
 	if (conn->conn_target == NULL)
 		return (false);
 
 	target_address = conn->conn_target->t_redirection;
 	if (target_address == NULL)
 		return (false);
 
 	log_debugx("target \"%s\" configured to redirect to %s",
 	  conn->conn_target->t_name, target_address);
 	login_redirect(request, target_address);
 
 	return (true);
 }
 
 static void
 login_negotiate(struct connection *conn, struct pdu *request)
 {
 	struct pdu *response;
 	struct iscsi_bhs_login_response *bhslr2;
 	struct keys *request_keys, *response_keys;
 	int i;
 	bool redirected, skipped_security;
 
 	if (conn->conn_session_type == CONN_SESSION_TYPE_NORMAL) {
 		/*
 		 * Query the kernel for various size limits.  In case of
 		 * offload, it depends on hardware capabilities.
 		 */
 		assert(conn->conn_target != NULL);
+		conn->conn_max_recv_data_segment_limit = (1 << 24) - 1;
+		conn->conn_max_send_data_segment_limit = (1 << 24) - 1;
+		conn->conn_max_burst_limit = (1 << 24) - 1;
+		conn->conn_first_burst_limit = (1 << 24) - 1;
 		kernel_limits(conn->conn_portal->p_portal_group->pg_offload,
-		    &conn->conn_max_recv_data_segment_length,
-		    &conn->conn_max_send_data_segment_length,
-		    &conn->conn_max_burst_length,
-		    &conn->conn_first_burst_length);
+		    &conn->conn_max_recv_data_segment_limit,
+		    &conn->conn_max_send_data_segment_limit,
+		    &conn->conn_max_burst_limit,
+		    &conn->conn_first_burst_limit);
 
 		/* We expect legal, usable values at this point. */
-		assert(conn->conn_max_recv_data_segment_length >= 512);
-		assert(conn->conn_max_recv_data_segment_length < (1 << 24));
-		assert(conn->conn_max_burst_length >= 512);
-		assert(conn->conn_max_burst_length < (1 << 24));
-		assert(conn->conn_first_burst_length >= 512);
-		assert(conn->conn_first_burst_length < (1 << 24));
-		assert(conn->conn_first_burst_length <=
-		    conn->conn_max_burst_length);
+		assert(conn->conn_max_recv_data_segment_limit >= 512);
+		assert(conn->conn_max_recv_data_segment_limit < (1 << 24));
+		assert(conn->conn_max_send_data_segment_limit >= 512);
+		assert(conn->conn_max_send_data_segment_limit < (1 << 24));
+		assert(conn->conn_max_burst_limit >= 512);
+		assert(conn->conn_max_burst_limit < (1 << 24));
+		assert(conn->conn_first_burst_limit >= 512);
+		assert(conn->conn_first_burst_limit < (1 << 24));
+		assert(conn->conn_first_burst_limit <=
+		    conn->conn_max_burst_limit);
+
+		/*
+		 * Limit default send length in case it won't be negotiated.
+		 * We can't do it for other limits, since they may affect both
+		 * sender and receiver operation, and we must obey defaults.
+		 */
+		if (conn->conn_max_send_data_segment_limit <
+		    conn->conn_max_send_data_segment_length) {
+			conn->conn_max_send_data_segment_length =
+			    conn->conn_max_send_data_segment_limit;
+		}
 	} else {
-		conn->conn_max_recv_data_segment_length =
+		conn->conn_max_recv_data_segment_limit =
 		    MAX_DATA_SEGMENT_LENGTH;
-		conn->conn_max_send_data_segment_length =
+		conn->conn_max_send_data_segment_limit =
 		    MAX_DATA_SEGMENT_LENGTH;
 	}
 
 	if (request == NULL) {
 		log_debugx("beginning operational parameter negotiation; "
 		    "waiting for Login PDU");
 		request = login_receive(conn, false);
 		skipped_security = false;
 	} else
 		skipped_security = true;
 
 	/*
 	 * RFC 3720, 10.13.5.  Status-Class and Status-Detail, says
 	 * the redirection SHOULD be accepted by the initiator before
 	 * authentication, but MUST be be accepted afterwards; that's
 	 * why we're doing it here and not earlier.
 	 */
 	redirected = login_target_redirect(conn, request);
 	if (redirected) {
 		log_debugx("initiator redirected; exiting");
 		exit(0);
 	}
 
 	request_keys = keys_new();
 	keys_load(request_keys, request);
 
 	response = login_new_response(request);
 	bhslr2 = (struct iscsi_bhs_login_response *)response->pdu_bhs;
 	bhslr2->bhslr_tsih = htons(0xbadd);
 	login_set_csg(response, BHSLR_STAGE_OPERATIONAL_NEGOTIATION);
 	login_set_nsg(response, BHSLR_STAGE_FULL_FEATURE_PHASE);
 	response_keys = keys_new();
 
 	if (skipped_security &&
 	    conn->conn_session_type == CONN_SESSION_TYPE_NORMAL) {
 		if (conn->conn_target->t_alias != NULL)
 			keys_add(response_keys,
 			    "TargetAlias", conn->conn_target->t_alias);
 		keys_add_int(response_keys, "TargetPortalGroupTag",
 		    conn->conn_portal->p_portal_group->pg_tag);
 	}
 
 	for (i = 0; i < KEYS_MAX; i++) {
 		if (request_keys->keys_names[i] == NULL)
 			break;
 
 		login_negotiate_key(request, request_keys->keys_names[i],
 		    request_keys->keys_values[i], skipped_security,
 		    response_keys);
 	}
 
 	/*
 	 * We'd started with usable values at our end.  But a bad initiator
 	 * could have presented a large FirstBurstLength and then a smaller
 	 * MaxBurstLength (in that order) and because we process the key/value
 	 * pairs in the order they are in the request we might have ended up
 	 * with illegal values here.
 	 */
 	if (conn->conn_session_type == CONN_SESSION_TYPE_NORMAL &&
 	    conn->conn_first_burst_length > conn->conn_max_burst_length) {
 		log_errx(1, "initiator sent FirstBurstLength > MaxBurstLength");
 	}
 
 	log_debugx("operational parameter negotiation done; "
 	    "transitioning to Full Feature Phase");
 
 	keys_save(response_keys, response);
 	pdu_send(response);
 	pdu_delete(response);
 	keys_delete(response_keys);
 	pdu_delete(request);
 	keys_delete(request_keys);
 }
 
 static void
 login_wait_transition(struct connection *conn)
 {
 	struct pdu *request, *response;
 	struct iscsi_bhs_login_request *bhslr;
 
 	log_debugx("waiting for state transition request");
 	request = login_receive(conn, false);
 	bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs;
 	if ((bhslr->bhslr_flags & BHSLR_FLAGS_TRANSIT) == 0) {
 		login_send_error(request, 0x02, 0x00);
 		log_errx(1, "got no \"T\" flag after answering AuthMethod");
 	}
 
 	log_debugx("got state transition request");
 	response = login_new_response(request);
 	pdu_delete(request);
 	login_set_nsg(response, BHSLR_STAGE_OPERATIONAL_NEGOTIATION);
 	pdu_send(response);
 	pdu_delete(response);
 
 	login_negotiate(conn, NULL);
 }
 
 void
 login(struct connection *conn)
 {
 	struct pdu *request, *response;
 	struct iscsi_bhs_login_request *bhslr;
 	struct keys *request_keys, *response_keys;
 	struct auth_group *ag;
 	struct portal_group *pg;
 	const char *initiator_name, *initiator_alias, *session_type,
 	    *target_name, *auth_method;
 	bool redirected, fail, trans;
 
 	/*
 	 * Handle the initial Login Request - figure out required authentication
 	 * method and either transition to the next phase, if no authentication
 	 * is required, or call appropriate authentication code.
 	 */
 	log_debugx("beginning Login Phase; waiting for Login PDU");
 	request = login_receive(conn, true);
 	bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs;
 	if (bhslr->bhslr_tsih != 0) {
 		login_send_error(request, 0x02, 0x0a);
 		log_errx(1, "received Login PDU with non-zero TSIH");
 	}
 
 	pg = conn->conn_portal->p_portal_group;
 
 	memcpy(conn->conn_initiator_isid, bhslr->bhslr_isid,
 	    sizeof(conn->conn_initiator_isid));
 
 	/*
 	 * XXX: Implement the C flag some day.
 	 */
 	request_keys = keys_new();
 	keys_load(request_keys, request);
 
 	assert(conn->conn_initiator_name == NULL);
 	initiator_name = keys_find(request_keys, "InitiatorName");
 	if (initiator_name == NULL) {
 		login_send_error(request, 0x02, 0x07);
 		log_errx(1, "received Login PDU without InitiatorName");
 	}
 	if (valid_iscsi_name(initiator_name) == false) {
 		login_send_error(request, 0x02, 0x00);
 		log_errx(1, "received Login PDU with invalid InitiatorName");
 	}
 	conn->conn_initiator_name = checked_strdup(initiator_name);
 	log_set_peer_name(conn->conn_initiator_name);
 	setproctitle("%s (%s)", conn->conn_initiator_addr, conn->conn_initiator_name);
 
 	redirected = login_portal_redirect(conn, request);
 	if (redirected) {
 		log_debugx("initiator redirected; exiting");
 		exit(0);
 	}
 
 	initiator_alias = keys_find(request_keys, "InitiatorAlias");
 	if (initiator_alias != NULL)
 		conn->conn_initiator_alias = checked_strdup(initiator_alias);
 
 	assert(conn->conn_session_type == CONN_SESSION_TYPE_NONE);
 	session_type = keys_find(request_keys, "SessionType");
 	if (session_type != NULL) {
 		if (strcmp(session_type, "Normal") == 0) {
 			conn->conn_session_type = CONN_SESSION_TYPE_NORMAL;
 		} else if (strcmp(session_type, "Discovery") == 0) {
 			conn->conn_session_type = CONN_SESSION_TYPE_DISCOVERY;
 		} else {
 			login_send_error(request, 0x02, 0x00);
 			log_errx(1, "received Login PDU with invalid "
 			    "SessionType \"%s\"", session_type);
 		}
 	} else
 		conn->conn_session_type = CONN_SESSION_TYPE_NORMAL;
 
 	assert(conn->conn_target == NULL);
 	if (conn->conn_session_type == CONN_SESSION_TYPE_NORMAL) {
 		target_name = keys_find(request_keys, "TargetName");
 		if (target_name == NULL) {
 			login_send_error(request, 0x02, 0x07);
 			log_errx(1, "received Login PDU without TargetName");
 		}
 
 		conn->conn_port = port_find_in_pg(pg, target_name);
 		if (conn->conn_port == NULL) {
 			login_send_error(request, 0x02, 0x03);
 			log_errx(1, "requested target \"%s\" not found",
 			    target_name);
 		}
 		conn->conn_target = conn->conn_port->p_target;
 	}
 
 	/*
 	 * At this point we know what kind of authentication we need.
 	 */
 	if (conn->conn_session_type == CONN_SESSION_TYPE_NORMAL) {
 		ag = conn->conn_port->p_auth_group;
 		if (ag == NULL)
 			ag = conn->conn_target->t_auth_group;
 		if (ag->ag_name != NULL) {
 			log_debugx("initiator requests to connect "
 			    "to target \"%s\"; auth-group \"%s\"",
 			    conn->conn_target->t_name,
 			    ag->ag_name);
 		} else {
 			log_debugx("initiator requests to connect "
 			    "to target \"%s\"", conn->conn_target->t_name);
 		}
 	} else {
 		assert(conn->conn_session_type == CONN_SESSION_TYPE_DISCOVERY);
 		ag = pg->pg_discovery_auth_group;
 		if (ag->ag_name != NULL) {
 			log_debugx("initiator requests "
 			    "discovery session; auth-group \"%s\"", ag->ag_name);
 		} else {
 			log_debugx("initiator requests discovery session");
 		}
 	}
 
 	if (ag->ag_type == AG_TYPE_DENY) {
 		login_send_error(request, 0x02, 0x01);
 		log_errx(1, "auth-type is \"deny\"");
 	}
 
 	if (ag->ag_type == AG_TYPE_UNKNOWN) {
 		/*
 		 * This can happen with empty auth-group.
 		 */
 		login_send_error(request, 0x02, 0x01);
 		log_errx(1, "auth-type not set, denying access");
 	}
 
 	/*
 	 * Enforce initiator-name and initiator-portal.
 	 */
 	if (auth_name_check(ag, initiator_name) != 0) {
 		login_send_error(request, 0x02, 0x02);
 		log_errx(1, "initiator does not match allowed initiator names");
 	}
 
 	if (auth_portal_check(ag, &conn->conn_initiator_sa) != 0) {
 		login_send_error(request, 0x02, 0x02);
 		log_errx(1, "initiator does not match allowed "
 		    "initiator portals");
 	}
 
 	/*
 	 * Let's see if the initiator intends to do any kind of authentication
 	 * at all.
 	 */
 	if (login_csg(request) == BHSLR_STAGE_OPERATIONAL_NEGOTIATION) {
 		if (ag->ag_type != AG_TYPE_NO_AUTHENTICATION) {
 			login_send_error(request, 0x02, 0x01);
 			log_errx(1, "initiator skipped the authentication, "
 			    "but authentication is required");
 		}
 
 		keys_delete(request_keys);
 
 		log_debugx("initiator skipped the authentication, "
 		    "and we don't need it; proceeding with negotiation");
 		login_negotiate(conn, request);
 		return;
 	}
 
 	fail = false;
 	response = login_new_response(request);
 	response_keys = keys_new();
 	trans = (bhslr->bhslr_flags & BHSLR_FLAGS_TRANSIT) != 0;
 	auth_method = keys_find(request_keys, "AuthMethod");
 	if (ag->ag_type == AG_TYPE_NO_AUTHENTICATION) {
 		log_debugx("authentication not required");
 		if (auth_method == NULL ||
 		    login_list_contains(auth_method, "None")) {
 			keys_add(response_keys, "AuthMethod", "None");
 		} else {
 			log_warnx("initiator requests "
 			    "AuthMethod \"%s\" instead of \"None\"",
 			    auth_method);
 			keys_add(response_keys, "AuthMethod", "Reject");
 		}
 		if (trans)
 			login_set_nsg(response, BHSLR_STAGE_OPERATIONAL_NEGOTIATION);
 	} else {
 		log_debugx("CHAP authentication required");
 		if (auth_method == NULL ||
 		    login_list_contains(auth_method, "CHAP")) {
 			keys_add(response_keys, "AuthMethod", "CHAP");
 		} else {
 			log_warnx("initiator requests unsupported "
 			    "AuthMethod \"%s\" instead of \"CHAP\"",
 			    auth_method);
 			keys_add(response_keys, "AuthMethod", "Reject");
 			fail = true;
 		}
 	}
 	if (conn->conn_session_type == CONN_SESSION_TYPE_NORMAL) {
 		if (conn->conn_target->t_alias != NULL)
 			keys_add(response_keys,
 			    "TargetAlias", conn->conn_target->t_alias);
 		keys_add_int(response_keys,
 		    "TargetPortalGroupTag", pg->pg_tag);
 	}
 	keys_save(response_keys, response);
 
 	pdu_send(response);
 	pdu_delete(response);
 	keys_delete(response_keys);
 	pdu_delete(request);
 	keys_delete(request_keys);
 
 	if (fail) {
 		log_debugx("sent reject for AuthMethod; exiting");
 		exit(1);
 	}
 
 	if (ag->ag_type != AG_TYPE_NO_AUTHENTICATION) {
 		login_chap(conn, ag);
 		login_negotiate(conn, NULL);
 	} else if (trans) {
 		login_negotiate(conn, NULL);
 	} else {
 		login_wait_transition(conn);
 	}
 }
Index: projects/netbsd-tests-upstream-01-2017/usr.sbin/inetd/inetd.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/usr.sbin/inetd/inetd.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/usr.sbin/inetd/inetd.c	(revision 312218)
@@ -1,2577 +1,2581 @@
 /*
  * Copyright (c) 1983, 1991, 1993, 1994
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #ifndef lint
 static const char copyright[] =
 "@(#) Copyright (c) 1983, 1991, 1993, 1994\n\
 	The Regents of the University of California.  All rights reserved.\n";
 #endif /* not lint */
 
 #ifndef lint
 #if 0
 static char sccsid[] = "@(#)from: inetd.c	8.4 (Berkeley) 4/13/94";
 #endif
 #endif /* not lint */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Inetd - Internet super-server
  *
  * This program invokes all internet services as needed.  Connection-oriented
  * services are invoked each time a connection is made, by creating a process.
  * This process is passed the connection as file descriptor 0 and is expected
  * to do a getpeername to find out the source host and port.
  *
  * Datagram oriented services are invoked when a datagram
  * arrives; a process is created and passed a pending message
  * on file descriptor 0.  Datagram servers may either connect
  * to their peer, freeing up the original socket for inetd
  * to receive further messages on, or ``take over the socket'',
  * processing all arriving datagrams and, eventually, timing
  * out.	 The first type of server is said to be ``multi-threaded'';
  * the second type of server ``single-threaded''.
  *
  * Inetd uses a configuration file which is read at startup
  * and, possibly, at some later time in response to a hangup signal.
  * The configuration file is ``free format'' with fields given in the
  * order shown below.  Continuation lines for an entry must begin with
  * a space or tab.  All fields must be present in each entry.
  *
  *	service name			must be in /etc/services
  *					or name a tcpmux service 
  *					or specify a unix domain socket
  *	socket type			stream/dgram/raw/rdm/seqpacket
  *	protocol			tcp[4][6], udp[4][6], unix
  *	wait/nowait			single-threaded/multi-threaded
  *	user[:group][/login-class]	user/group/login-class to run daemon as
  *	server program			full path name
  *	server program arguments	maximum of MAXARGS (20)
  *
  * TCP services without official port numbers are handled with the
  * RFC1078-based tcpmux internal service. Tcpmux listens on port 1 for
  * requests. When a connection is made from a foreign host, the service
  * requested is passed to tcpmux, which looks it up in the servtab list
  * and returns the proper entry for the service. Tcpmux returns a
  * negative reply if the service doesn't exist, otherwise the invoked
  * server is expected to return the positive reply if the service type in
  * inetd.conf file has the prefix "tcpmux/". If the service type has the
  * prefix "tcpmux/+", tcpmux will return the positive reply for the
  * process; this is for compatibility with older server code, and also
  * allows you to invoke programs that use stdin/stdout without putting any
  * special server code in them. Services that use tcpmux are "nowait"
  * because they do not have a well-known port and hence cannot listen
  * for new requests.
  *
  * For RPC services
  *	service name/version		must be in /etc/rpc
  *	socket type			stream/dgram/raw/rdm/seqpacket
  *	protocol			rpc/tcp[4][6], rpc/udp[4][6]
  *	wait/nowait			single-threaded/multi-threaded
  *	user[:group][/login-class]	user/group/login-class to run daemon as
  *	server program			full path name
  *	server program arguments	maximum of MAXARGS
  *
  * Comment lines are indicated by a `#' in column 1.
  *
  * #ifdef IPSEC
  * Comment lines that start with "#@" denote IPsec policy string, as described
  * in ipsec_set_policy(3).  This will affect all the following items in
  * inetd.conf(8).  To reset the policy, just use "#@" line.  By default,
  * there's no IPsec policy.
  * #endif
  */
 #include <sys/param.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <sys/wait.h>
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <sys/stat.h>
 #include <sys/un.h>
 
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <arpa/inet.h>
 #include <rpc/rpc.h>
 #include <rpc/pmap_clnt.h>
 
 #include <ctype.h>
 #include <errno.h>
 #include <err.h>
 #include <fcntl.h>
 #include <grp.h>
 #include <libutil.h>
 #include <limits.h>
 #include <netdb.h>
 #include <pwd.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sysexits.h>
 #include <syslog.h>
+#ifdef LIBWRAP
 #include <tcpd.h>
+#endif
 #include <unistd.h>
 
 #include "inetd.h"
 #include "pathnames.h"
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #ifndef IPSEC_POLICY_IPSEC	/* no ipsec support on old ipsec */
 #undef IPSEC
 #endif
 #endif
 
 #ifndef LIBWRAP_ALLOW_FACILITY
 # define LIBWRAP_ALLOW_FACILITY LOG_AUTH
 #endif
 #ifndef LIBWRAP_ALLOW_SEVERITY
 # define LIBWRAP_ALLOW_SEVERITY LOG_INFO
 #endif
 #ifndef LIBWRAP_DENY_FACILITY
 # define LIBWRAP_DENY_FACILITY LOG_AUTH
 #endif
 #ifndef LIBWRAP_DENY_SEVERITY
 # define LIBWRAP_DENY_SEVERITY LOG_WARNING
 #endif
 
 #define ISWRAP(sep)	\
 	   ( ((wrap_ex && !(sep)->se_bi) || (wrap_bi && (sep)->se_bi)) \
 	&& (sep->se_family == AF_INET || sep->se_family == AF_INET6) \
 	&& ( ((sep)->se_accept && (sep)->se_socktype == SOCK_STREAM) \
 	    || (sep)->se_socktype == SOCK_DGRAM))
 
 #ifdef LOGIN_CAP
 #include <login_cap.h>
 
 /* see init.c */
 #define RESOURCE_RC "daemon"
 
 #endif
 
 #ifndef	MAXCHILD
 #define	MAXCHILD	-1		/* maximum number of this service
 					   < 0 = no limit */
 #endif
 
 #ifndef	MAXCPM
 #define	MAXCPM		-1		/* rate limit invocations from a
 					   single remote address,
 					   < 0 = no limit */
 #endif
 
 #ifndef	MAXPERIP
 #define	MAXPERIP	-1		/* maximum number of this service
 					   from a single remote address,
 					   < 0 = no limit */
 #endif
 
 #ifndef TOOMANY
 #define	TOOMANY		256		/* don't start more than TOOMANY */
 #endif
 #define	CNT_INTVL	60		/* servers in CNT_INTVL sec. */
 #define	RETRYTIME	(60*10)		/* retry after bind or server fail */
 #define MAX_MAXCHLD	32767		/* max allowable max children */
 
 #define	SIGBLOCK	(sigmask(SIGCHLD)|sigmask(SIGHUP)|sigmask(SIGALRM))
 
 #define	satosin(sa)	((struct sockaddr_in *)(void *)sa)
 #define	csatosin(sa)	((const struct sockaddr_in *)(const void *)sa)
 #ifdef INET6
 #define	satosin6(sa)	((struct sockaddr_in6 *)(void *)sa)
 #define	csatosin6(sa)	((const struct sockaddr_in6 *)(const void *)sa)
 #endif
 static void	close_sep(struct servtab *);
 static void	flag_signal(int);
 static void	flag_config(int);
 static void	config(void);
 static int	cpmip(const struct servtab *, int);
 static void	endconfig(void);
 static struct servtab *enter(struct servtab *);
 static void	freeconfig(struct servtab *);
 static struct servtab *getconfigent(void);
 static int	matchservent(const char *, const char *, const char *);
 static char	*nextline(FILE *);
 static void	addchild(struct servtab *, int);
 static void	flag_reapchild(int);
 static void	reapchild(void);
 static void	enable(struct servtab *);
 static void	disable(struct servtab *);
 static void	flag_retry(int);
 static void	retry(void);
 static int	setconfig(void);
 static void	setup(struct servtab *);
 #ifdef IPSEC
 static void	ipsecsetup(struct servtab *);
 #endif
 static void	unregisterrpc(register struct servtab *sep);
 static struct conninfo *search_conn(struct servtab *sep, int ctrl);
 static int	room_conn(struct servtab *sep, struct conninfo *conn);
 static void	addchild_conn(struct conninfo *conn, pid_t pid);
 static void	reapchild_conn(pid_t pid);
 static void	free_conn(struct conninfo *conn);
 static void	resize_conn(struct servtab *sep, int maxperip);
 static void	free_connlist(struct servtab *sep);
 static void	free_proc(struct procinfo *);
 static struct procinfo *search_proc(pid_t pid, int add);
 static int	hashval(char *p, int len);
 static char	*skip(char **);
 static char	*sskip(char **);
 static char	*newstr(const char *);
 static void	print_service(const char *, const struct servtab *);
 
 /* tcpd.h */
 int	allow_severity;
 int	deny_severity;
 
 static int	wrap_ex = 0;
 static int	wrap_bi = 0;
 int	debug = 0;
 static int	dolog = 0;
 static int	maxsock;		/* highest-numbered descriptor */
 static fd_set	allsock;
 static int	options;
 static int	timingout;
 static int	toomany = TOOMANY;
 static int	maxchild = MAXCHILD;
 static int	maxcpm = MAXCPM;
 static int	maxperip = MAXPERIP;
 static struct	servent *sp;
 static struct	rpcent *rpc;
 static char	*hostname = NULL;
 static struct	sockaddr_in *bind_sa4;
 static int	v4bind_ok = 0;
 #ifdef INET6
 static struct	sockaddr_in6 *bind_sa6;
 static int	v6bind_ok = 0;
 #endif
 static int	signalpipe[2];
 #ifdef SANITY_CHECK
 static int	nsock;
 #endif
 static uid_t	euid;
 static gid_t	egid;
 static mode_t	mask;
 
 struct servtab *servtab;
 
 static const char	*CONFIG = _PATH_INETDCONF;
 static const char	*pid_file = _PATH_INETDPID;
 static struct pidfh	*pfh = NULL;
 
 static struct netconfig *udpconf, *tcpconf, *udp6conf, *tcp6conf;
 
 static LIST_HEAD(, procinfo) proctable[PERIPSIZE];
 
 static int
 getvalue(const char *arg, int *value, const char *whine)
 {
 	int  tmp;
 	char *p;
 
 	tmp = strtol(arg, &p, 0);
 	if (tmp < 0 || *p) {
 		syslog(LOG_ERR, whine, arg);
 		return 1;			/* failure */
 	}
 	*value = tmp;
 	return 0;				/* success */
 }
 
+#ifdef LIBWRAP
 static sa_family_t
 whichaf(struct request_info *req)
 {
 	struct sockaddr *sa;
 
 	sa = (struct sockaddr *)req->client->sin;
 	if (sa == NULL)
 		return AF_UNSPEC;
 #ifdef INET6
 	if (sa->sa_family == AF_INET6 &&
 	    IN6_IS_ADDR_V4MAPPED(&satosin6(sa)->sin6_addr))
 		return AF_INET;
 #endif
 	return sa->sa_family;
 }
+#endif
 
 int
 main(int argc, char **argv)
 {
 	struct servtab *sep;
 	struct passwd *pwd;
 	struct group *grp;
 	struct sigaction sa, saalrm, sachld, sahup, sapipe;
 	int ch, dofork;
 	pid_t pid;
 	char buf[50];
 #ifdef LOGIN_CAP
 	login_cap_t *lc = NULL;
 #endif
 #ifdef LIBWRAP
 	struct request_info req;
 	int denied;
 	char *service = NULL;
 #endif
 	struct sockaddr_storage peer;
 	int i;
 	struct addrinfo hints, *res;
 	const char *servname;
 	int error;
 	struct conninfo *conn;
 
 	openlog("inetd", LOG_PID | LOG_NOWAIT | LOG_PERROR, LOG_DAEMON);
 
 	while ((ch = getopt(argc, argv, "dlwWR:a:c:C:p:s:")) != -1)
 		switch(ch) {
 		case 'd':
 			debug = 1;
 			options |= SO_DEBUG;
 			break;
 		case 'l':
 			dolog = 1;
 			break;
 		case 'R':
 			getvalue(optarg, &toomany,
 				"-R %s: bad value for service invocation rate");
 			break;
 		case 'c':
 			getvalue(optarg, &maxchild,
 				"-c %s: bad value for maximum children");
 			break;
 		case 'C':
 			getvalue(optarg, &maxcpm,
 				"-C %s: bad value for maximum children/minute");
 			break;
 		case 'a':
 			hostname = optarg;
 			break;
 		case 'p':
 			pid_file = optarg;
 			break;
 		case 's':
 			getvalue(optarg, &maxperip,
 				"-s %s: bad value for maximum children per source address");
 			break;
 		case 'w':
 			wrap_ex++;
 			break;
 		case 'W':
 			wrap_bi++;
 			break;
 		case '?':
 		default:
 			syslog(LOG_ERR,
 				"usage: inetd [-dlwW] [-a address] [-R rate]"
 				" [-c maximum] [-C rate]"
 				" [-p pidfile] [conf-file]");
 			exit(EX_USAGE);
 		}
 	/*
 	 * Initialize Bind Addrs.
 	 *   When hostname is NULL, wild card bind addrs are obtained from
 	 *   getaddrinfo(). But getaddrinfo() requires at least one of
 	 *   hostname or servname is non NULL.
 	 *   So when hostname is NULL, set dummy value to servname.
 	 *   Since getaddrinfo() doesn't accept numeric servname, and
 	 *   we doesn't use ai_socktype of struct addrinfo returned
 	 *   from getaddrinfo(), we set dummy value to ai_socktype.
 	 */
 	servname = (hostname == NULL) ? "0" /* dummy */ : NULL;
 
 	bzero(&hints, sizeof(struct addrinfo));
 	hints.ai_flags = AI_PASSIVE;
 	hints.ai_family = AF_UNSPEC;
 	hints.ai_socktype = SOCK_STREAM;	/* dummy */
 	error = getaddrinfo(hostname, servname, &hints, &res);
 	if (error != 0) {
 		syslog(LOG_ERR, "-a %s: %s", hostname, gai_strerror(error));
 		if (error == EAI_SYSTEM)
 			syslog(LOG_ERR, "%s", strerror(errno));
 		exit(EX_USAGE);
 	}
 	do {
 		if (res->ai_addr == NULL) {
 			syslog(LOG_ERR, "-a %s: getaddrinfo failed", hostname);
 			exit(EX_USAGE);
 		}
 		switch (res->ai_addr->sa_family) {
 		case AF_INET:
 			if (v4bind_ok)
 				continue;
 			bind_sa4 = satosin(res->ai_addr);
 			/* init port num in case servname is dummy */
 			bind_sa4->sin_port = 0;
 			v4bind_ok = 1;
 			continue;
 #ifdef INET6
 		case AF_INET6:
 			if (v6bind_ok)
 				continue;
 			bind_sa6 = satosin6(res->ai_addr);
 			/* init port num in case servname is dummy */
 			bind_sa6->sin6_port = 0;
 			v6bind_ok = 1;
 			continue;
 #endif
 		}
 		if (v4bind_ok
 #ifdef INET6
 		    && v6bind_ok
 #endif
 		    )
 			break;
 	} while ((res = res->ai_next) != NULL);
 	if (!v4bind_ok
 #ifdef INET6
 	    && !v6bind_ok
 #endif
 	    ) {
 		syslog(LOG_ERR, "-a %s: unknown address family", hostname);
 		exit(EX_USAGE);
 	}
 
 	euid = geteuid();
 	egid = getegid();
 	umask(mask = umask(0777));
 
 	argc -= optind;
 	argv += optind;
 
 	if (argc > 0)
 		CONFIG = argv[0];
 	if (access(CONFIG, R_OK) < 0)
 		syslog(LOG_ERR, "Accessing %s: %m, continuing anyway.", CONFIG);
 	if (debug == 0) {
 		pid_t otherpid;
 
 		pfh = pidfile_open(pid_file, 0600, &otherpid);
 		if (pfh == NULL) {
 			if (errno == EEXIST) {
 				syslog(LOG_ERR, "%s already running, pid: %d",
 				    getprogname(), otherpid);
 				exit(EX_OSERR);
 			}
 			syslog(LOG_WARNING, "pidfile_open() failed: %m");
 		}
 
 		if (daemon(0, 0) < 0) {
 			syslog(LOG_WARNING, "daemon(0,0) failed: %m");
 		}
 		/* From now on we don't want syslog messages going to stderr. */
 		closelog();
 		openlog("inetd", LOG_PID | LOG_NOWAIT, LOG_DAEMON);
 		/*
 		 * In case somebody has started inetd manually, we need to
 		 * clear the logname, so that old servers run as root do not
 		 * get the user's logname..
 		 */
 		if (setlogin("") < 0) {
 			syslog(LOG_WARNING, "cannot clear logname: %m");
 			/* no big deal if it fails.. */
 		}
 		if (pfh != NULL && pidfile_write(pfh) == -1) {
 			syslog(LOG_WARNING, "pidfile_write(): %m");
 		}
 	}
 
 	if (madvise(NULL, 0, MADV_PROTECT) != 0)
 		syslog(LOG_WARNING, "madvise() failed: %s", strerror(errno));
 
 	for (i = 0; i < PERIPSIZE; ++i)
 		LIST_INIT(&proctable[i]);
 
 	if (v4bind_ok) {
 		udpconf = getnetconfigent("udp");
 		tcpconf = getnetconfigent("tcp");
 		if (udpconf == NULL || tcpconf == NULL) {	
 			syslog(LOG_ERR, "unknown rpc/udp or rpc/tcp");
 			exit(EX_USAGE);
 		}
 	}
 #ifdef INET6
 	if (v6bind_ok) {
 		udp6conf = getnetconfigent("udp6");
 		tcp6conf = getnetconfigent("tcp6");
 		if (udp6conf == NULL || tcp6conf == NULL) {	
 			syslog(LOG_ERR, "unknown rpc/udp6 or rpc/tcp6");
 			exit(EX_USAGE);
 		}
 	}
 #endif
 
 	sa.sa_flags = 0;
 	sigemptyset(&sa.sa_mask);
 	sigaddset(&sa.sa_mask, SIGALRM);
 	sigaddset(&sa.sa_mask, SIGCHLD);
 	sigaddset(&sa.sa_mask, SIGHUP);
 	sa.sa_handler = flag_retry;
 	sigaction(SIGALRM, &sa, &saalrm);
 	config();
 	sa.sa_handler = flag_config;
 	sigaction(SIGHUP, &sa, &sahup);
 	sa.sa_handler = flag_reapchild;
 	sigaction(SIGCHLD, &sa, &sachld);
 	sa.sa_handler = SIG_IGN;
 	sigaction(SIGPIPE, &sa, &sapipe);
 
 	{
 		/* space for daemons to overwrite environment for ps */
 #define	DUMMYSIZE	100
 		char dummy[DUMMYSIZE];
 
 		(void)memset(dummy, 'x', DUMMYSIZE - 1);
 		dummy[DUMMYSIZE - 1] = '\0';
 		(void)setenv("inetd_dummy", dummy, 1);
 	}
 
 	if (pipe2(signalpipe, O_CLOEXEC) != 0) {
 		syslog(LOG_ERR, "pipe: %m");
 		exit(EX_OSERR);
 	}
 	FD_SET(signalpipe[0], &allsock);
 #ifdef SANITY_CHECK
 	nsock++;
 #endif
 	if (signalpipe[0] > maxsock)
 	    maxsock = signalpipe[0];
 	if (signalpipe[1] > maxsock)
 	    maxsock = signalpipe[1];
 
 	for (;;) {
 	    int n, ctrl;
 	    fd_set readable;
 
 #ifdef SANITY_CHECK
 	    if (nsock == 0) {
 		syslog(LOG_ERR, "%s: nsock=0", __func__);
 		exit(EX_SOFTWARE);
 	    }
 #endif
 	    readable = allsock;
 	    if ((n = select(maxsock + 1, &readable, (fd_set *)0,
 		(fd_set *)0, (struct timeval *)0)) <= 0) {
 		    if (n < 0 && errno != EINTR) {
 			syslog(LOG_WARNING, "select: %m");
 			sleep(1);
 		    }
 		    continue;
 	    }
 	    /* handle any queued signal flags */
 	    if (FD_ISSET(signalpipe[0], &readable)) {
 		int nsig;
 		if (ioctl(signalpipe[0], FIONREAD, &nsig) != 0) {
 		    syslog(LOG_ERR, "ioctl: %m");
 		    exit(EX_OSERR);
 		}
 		while (--nsig >= 0) {
 		    char c;
 		    if (read(signalpipe[0], &c, 1) != 1) {
 			syslog(LOG_ERR, "read: %m");
 			exit(EX_OSERR);
 		    }
 		    if (debug)
 			warnx("handling signal flag %c", c);
 		    switch(c) {
 		    case 'A': /* sigalrm */
 			retry();
 			break;
 		    case 'C': /* sigchld */
 			reapchild();
 			break;
 		    case 'H': /* sighup */
 			config();
 			break;
 		    }
 		}
 	    }
 	    for (sep = servtab; n && sep; sep = sep->se_next)
 	        if (sep->se_fd != -1 && FD_ISSET(sep->se_fd, &readable)) {
 		    n--;
 		    if (debug)
 			    warnx("someone wants %s", sep->se_service);
 		    dofork = !sep->se_bi || sep->se_bi->bi_fork || ISWRAP(sep);
 		    conn = NULL;
 		    if (sep->se_accept && sep->se_socktype == SOCK_STREAM) {
 			    i = 1;
 			    if (ioctl(sep->se_fd, FIONBIO, &i) < 0)
 				    syslog(LOG_ERR, "ioctl (FIONBIO, 1): %m");
 			    ctrl = accept(sep->se_fd, (struct sockaddr *)0,
 				(socklen_t *)0);
 			    if (debug)
 				    warnx("accept, ctrl %d", ctrl);
 			    if (ctrl < 0) {
 				    if (errno != EINTR)
 					    syslog(LOG_WARNING,
 						"accept (for %s): %m",
 						sep->se_service);
                                       if (sep->se_accept &&
                                           sep->se_socktype == SOCK_STREAM)
                                               close(ctrl);
 				    continue;
 			    }
 			    i = 0;
 			    if (ioctl(sep->se_fd, FIONBIO, &i) < 0)
 				    syslog(LOG_ERR, "ioctl1(FIONBIO, 0): %m");
 			    if (ioctl(ctrl, FIONBIO, &i) < 0)
 				    syslog(LOG_ERR, "ioctl2(FIONBIO, 0): %m");
 			    if (cpmip(sep, ctrl) < 0) {
 				close(ctrl);
 				continue;
 			    }
 			    if (dofork &&
 				(conn = search_conn(sep, ctrl)) != NULL &&
 				!room_conn(sep, conn)) {
 				close(ctrl);
 				continue;
 			    }
 		    } else
 			    ctrl = sep->se_fd;
 		    if (dolog && !ISWRAP(sep)) {
 			    char pname[NI_MAXHOST] = "unknown";
 			    socklen_t sl;
 			    sl = sizeof(peer);
 			    if (getpeername(ctrl, (struct sockaddr *)
 					    &peer, &sl)) {
 				    sl = sizeof(peer);
 				    if (recvfrom(ctrl, buf, sizeof(buf),
 					MSG_PEEK,
 					(struct sockaddr *)&peer,
 					&sl) >= 0) {
 				      getnameinfo((struct sockaddr *)&peer,
 						  peer.ss_len,
 						  pname, sizeof(pname),
 						  NULL, 0, NI_NUMERICHOST);
 				    }
 			    } else {
 			            getnameinfo((struct sockaddr *)&peer,
 						peer.ss_len,
 						pname, sizeof(pname),
 						NULL, 0, NI_NUMERICHOST);
 			    }
 			    syslog(LOG_INFO,"%s from %s", sep->se_service, pname);
 		    }
 		    (void) sigblock(SIGBLOCK);
 		    pid = 0;
 		    /*
 		     * Fork for all external services, builtins which need to
 		     * fork and anything we're wrapping (as wrapping might
 		     * block or use hosts_options(5) twist).
 		     */
 		    if (dofork) {
 			    if (sep->se_count++ == 0)
 				(void)clock_gettime(CLOCK_MONOTONIC_FAST, &sep->se_time);
 			    else if (toomany > 0 && sep->se_count >= toomany) {
 				struct timespec now;
 
 				(void)clock_gettime(CLOCK_MONOTONIC_FAST, &now);
 				if (now.tv_sec - sep->se_time.tv_sec >
 				    CNT_INTVL) {
 					sep->se_time = now;
 					sep->se_count = 1;
 				} else {
 					syslog(LOG_ERR,
 			"%s/%s server failing (looping), service terminated",
 					    sep->se_service, sep->se_proto);
 					if (sep->se_accept &&
 					    sep->se_socktype == SOCK_STREAM)
 						close(ctrl);
 					close_sep(sep);
 					free_conn(conn);
 					sigsetmask(0L);
 					if (!timingout) {
 						timingout = 1;
 						alarm(RETRYTIME);
 					}
 					continue;
 				}
 			    }
 			    pid = fork();
 		    }
 		    if (pid < 0) {
 			    syslog(LOG_ERR, "fork: %m");
 			    if (sep->se_accept &&
 				sep->se_socktype == SOCK_STREAM)
 				    close(ctrl);
 			    free_conn(conn);
 			    sigsetmask(0L);
 			    sleep(1);
 			    continue;
 		    }
 		    if (pid) {
 			addchild_conn(conn, pid);
 			addchild(sep, pid);
 		    }
 		    sigsetmask(0L);
 		    if (pid == 0) {
 			    pidfile_close(pfh);
 			    if (dofork) {
 				sigaction(SIGALRM, &saalrm, (struct sigaction *)0);
 				sigaction(SIGCHLD, &sachld, (struct sigaction *)0);
 				sigaction(SIGHUP, &sahup, (struct sigaction *)0);
 				/* SIGPIPE reset before exec */
 			    }
 			    /*
 			     * Call tcpmux to find the real service to exec.
 			     */
 			    if (sep->se_bi &&
 				sep->se_bi->bi_fn == (bi_fn_t *) tcpmux) {
 				    sep = tcpmux(ctrl);
 				    if (sep == NULL) {
 					    close(ctrl);
 					    _exit(0);
 				    }
 			    }
 #ifdef LIBWRAP
 			    if (ISWRAP(sep)) {
 				inetd_setproctitle("wrapping", ctrl);
 				service = sep->se_server_name ?
 				    sep->se_server_name : sep->se_service;
 				request_init(&req, RQ_DAEMON, service, RQ_FILE, ctrl, 0);
 				fromhost(&req);
 				deny_severity = LIBWRAP_DENY_FACILITY|LIBWRAP_DENY_SEVERITY;
 				allow_severity = LIBWRAP_ALLOW_FACILITY|LIBWRAP_ALLOW_SEVERITY;
 				denied = !hosts_access(&req);
 				if (denied) {
 				    syslog(deny_severity,
 				        "refused connection from %.500s, service %s (%s%s)",
 				        eval_client(&req), service, sep->se_proto,
 					(whichaf(&req) == AF_INET6) ? "6" : "");
 				    if (sep->se_socktype != SOCK_STREAM)
 					recv(ctrl, buf, sizeof (buf), 0);
 				    if (dofork) {
 					sleep(1);
 					_exit(0);
 				    }
 				}
 				if (dolog) {
 				    syslog(allow_severity,
 				        "connection from %.500s, service %s (%s%s)",
 					eval_client(&req), service, sep->se_proto,
 					(whichaf(&req) == AF_INET6) ? "6" : "");
 				}
 			    }
 #endif
 			    if (sep->se_bi) {
 				(*sep->se_bi->bi_fn)(ctrl, sep);
 			    } else {
 				if (debug)
 					warnx("%d execl %s",
 						getpid(), sep->se_server);
 				/* Clear close-on-exec. */
 				if (fcntl(ctrl, F_SETFD, 0) < 0) {
 					syslog(LOG_ERR,
 					    "%s/%s: fcntl (F_SETFD, 0): %m",
 						sep->se_service, sep->se_proto);
 					_exit(EX_OSERR);
 				}
 				if (ctrl != 0) {
 					dup2(ctrl, 0);
 					close(ctrl);
 				}
 				dup2(0, 1);
 				dup2(0, 2);
 				if ((pwd = getpwnam(sep->se_user)) == NULL) {
 					syslog(LOG_ERR,
 					    "%s/%s: %s: no such user",
 						sep->se_service, sep->se_proto,
 						sep->se_user);
 					if (sep->se_socktype != SOCK_STREAM)
 						recv(0, buf, sizeof (buf), 0);
 					_exit(EX_NOUSER);
 				}
 				grp = NULL;
 				if (   sep->se_group != NULL
 				    && (grp = getgrnam(sep->se_group)) == NULL
 				   ) {
 					syslog(LOG_ERR,
 					    "%s/%s: %s: no such group",
 						sep->se_service, sep->se_proto,
 						sep->se_group);
 					if (sep->se_socktype != SOCK_STREAM)
 						recv(0, buf, sizeof (buf), 0);
 					_exit(EX_NOUSER);
 				}
 				if (grp != NULL)
 					pwd->pw_gid = grp->gr_gid;
 #ifdef LOGIN_CAP
 				if ((lc = login_getclass(sep->se_class)) == NULL) {
 					/* error syslogged by getclass */
 					syslog(LOG_ERR,
 					    "%s/%s: %s: login class error",
 						sep->se_service, sep->se_proto,
 						sep->se_class);
 					if (sep->se_socktype != SOCK_STREAM)
 						recv(0, buf, sizeof (buf), 0);
 					_exit(EX_NOUSER);
 				}
 #endif
 				if (setsid() < 0) {
 					syslog(LOG_ERR,
 						"%s: can't setsid(): %m",
 						 sep->se_service);
 					/* _exit(EX_OSERR); not fatal yet */
 				}
 #ifdef LOGIN_CAP
 				if (setusercontext(lc, pwd, pwd->pw_uid,
 				    LOGIN_SETALL & ~LOGIN_SETMAC)
 				    != 0) {
 					syslog(LOG_ERR,
 					 "%s: can't setusercontext(..%s..): %m",
 					 sep->se_service, sep->se_user);
 					_exit(EX_OSERR);
 				}
 				login_close(lc);
 #else
 				if (pwd->pw_uid) {
 					if (setlogin(sep->se_user) < 0) {
 						syslog(LOG_ERR,
 						 "%s: can't setlogin(%s): %m",
 						 sep->se_service, sep->se_user);
 						/* _exit(EX_OSERR); not yet */
 					}
 					if (setgid(pwd->pw_gid) < 0) {
 						syslog(LOG_ERR,
 						  "%s: can't set gid %d: %m",
 						  sep->se_service, pwd->pw_gid);
 						_exit(EX_OSERR);
 					}
 					(void) initgroups(pwd->pw_name,
 							pwd->pw_gid);
 					if (setuid(pwd->pw_uid) < 0) {
 						syslog(LOG_ERR,
 						  "%s: can't set uid %d: %m",
 						  sep->se_service, pwd->pw_uid);
 						_exit(EX_OSERR);
 					}
 				}
 #endif
 				sigaction(SIGPIPE, &sapipe,
 				    (struct sigaction *)0);
 				execv(sep->se_server, sep->se_argv);
 				syslog(LOG_ERR,
 				    "cannot execute %s: %m", sep->se_server);
 				if (sep->se_socktype != SOCK_STREAM)
 					recv(0, buf, sizeof (buf), 0);
 			    }
 			    if (dofork)
 				_exit(0);
 		    }
 		    if (sep->se_accept && sep->se_socktype == SOCK_STREAM)
 			    close(ctrl);
 		}
 	}
 }
 
 /*
  * Add a signal flag to the signal flag queue for later handling
  */
 
 static void
 flag_signal(int c)
 {
 	char ch = c;
 
 	if (write(signalpipe[1], &ch, 1) != 1) {
 		syslog(LOG_ERR, "write: %m");
 		_exit(EX_OSERR);
 	}
 }
 
 /*
  * Record a new child pid for this service. If we've reached the
  * limit on children, then stop accepting incoming requests.
  */
 
 static void
 addchild(struct servtab *sep, pid_t pid)
 {
 	if (sep->se_maxchild <= 0)
 		return;
 #ifdef SANITY_CHECK
 	if (sep->se_numchild >= sep->se_maxchild) {
 		syslog(LOG_ERR, "%s: %d >= %d",
 		    __func__, sep->se_numchild, sep->se_maxchild);
 		exit(EX_SOFTWARE);
 	}
 #endif
 	sep->se_pids[sep->se_numchild++] = pid;
 	if (sep->se_numchild == sep->se_maxchild)
 		disable(sep);
 }
 
 /*
  * Some child process has exited. See if it's on somebody's list.
  */
 
 static void
 flag_reapchild(int signo __unused)
 {
 	flag_signal('C');
 }
 
 static void
 reapchild(void)
 {
 	int k, status;
 	pid_t pid;
 	struct servtab *sep;
 
 	for (;;) {
 		pid = wait3(&status, WNOHANG, (struct rusage *)0);
 		if (pid <= 0)
 			break;
 		if (debug)
 			warnx("%d reaped, %s %u", pid,
 			    WIFEXITED(status) ? "status" : "signal",
 			    WIFEXITED(status) ? WEXITSTATUS(status)
 				: WTERMSIG(status));
 		for (sep = servtab; sep; sep = sep->se_next) {
 			for (k = 0; k < sep->se_numchild; k++)
 				if (sep->se_pids[k] == pid)
 					break;
 			if (k == sep->se_numchild)
 				continue;
 			if (sep->se_numchild == sep->se_maxchild)
 				enable(sep);
 			sep->se_pids[k] = sep->se_pids[--sep->se_numchild];
 			if (WIFSIGNALED(status) || WEXITSTATUS(status))
 				syslog(LOG_WARNING,
 				    "%s[%d]: exited, %s %u",
 				    sep->se_server, pid,
 				    WIFEXITED(status) ? "status" : "signal",
 				    WIFEXITED(status) ? WEXITSTATUS(status)
 					: WTERMSIG(status));
 			break;
 		}
 		reapchild_conn(pid);
 	}
 }
 
 static void
 flag_config(int signo __unused)
 {
 	flag_signal('H');
 }
 
 static void
 config(void)
 {
 	struct servtab *sep, *new, **sepp;
 	long omask;
 	int new_nomapped;
 #ifdef LOGIN_CAP
 	login_cap_t *lc = NULL;
 #endif
 
 	if (!setconfig()) {
 		syslog(LOG_ERR, "%s: %m", CONFIG);
 		return;
 	}
 	for (sep = servtab; sep; sep = sep->se_next)
 		sep->se_checked = 0;
 	while ((new = getconfigent())) {
 		if (getpwnam(new->se_user) == NULL) {
 			syslog(LOG_ERR,
 				"%s/%s: no such user '%s', service ignored",
 				new->se_service, new->se_proto, new->se_user);
 			continue;
 		}
 		if (new->se_group && getgrnam(new->se_group) == NULL) {
 			syslog(LOG_ERR,
 				"%s/%s: no such group '%s', service ignored",
 				new->se_service, new->se_proto, new->se_group);
 			continue;
 		}
 #ifdef LOGIN_CAP
 		if ((lc = login_getclass(new->se_class)) == NULL) {
 			/* error syslogged by getclass */
 			syslog(LOG_ERR,
 				"%s/%s: %s: login class error, service ignored",
 				new->se_service, new->se_proto, new->se_class);
 			continue;
 		}
 		login_close(lc);
 #endif
 		new_nomapped = new->se_nomapped;
 		for (sep = servtab; sep; sep = sep->se_next)
 			if (strcmp(sep->se_service, new->se_service) == 0 &&
 			    strcmp(sep->se_proto, new->se_proto) == 0 &&
 			    sep->se_rpc == new->se_rpc &&
 			    sep->se_socktype == new->se_socktype &&
 			    sep->se_family == new->se_family)
 				break;
 		if (sep != 0) {
 			int i;
 
 #define SWAP(t,a, b) { t c = a; a = b; b = c; }
 			omask = sigblock(SIGBLOCK);
 			if (sep->se_nomapped != new->se_nomapped) {
 				/* for rpc keep old nommaped till unregister */
 				if (!sep->se_rpc)
 					sep->se_nomapped = new->se_nomapped;
 				sep->se_reset = 1;
 			}
 			/* copy over outstanding child pids */
 			if (sep->se_maxchild > 0 && new->se_maxchild > 0) {
 				new->se_numchild = sep->se_numchild;
 				if (new->se_numchild > new->se_maxchild)
 					new->se_numchild = new->se_maxchild;
 				memcpy(new->se_pids, sep->se_pids,
 				    new->se_numchild * sizeof(*new->se_pids));
 			}
 			SWAP(pid_t *, sep->se_pids, new->se_pids);
 			sep->se_maxchild = new->se_maxchild;
 			sep->se_numchild = new->se_numchild;
 			sep->se_maxcpm = new->se_maxcpm;
 			resize_conn(sep, new->se_maxperip);
 			sep->se_maxperip = new->se_maxperip;
 			sep->se_bi = new->se_bi;
 			/* might need to turn on or off service now */
 			if (sep->se_fd >= 0) {
 			      if (sep->se_maxchild > 0
 				  && sep->se_numchild == sep->se_maxchild) {
 				      if (FD_ISSET(sep->se_fd, &allsock))
 					  disable(sep);
 			      } else {
 				      if (!FD_ISSET(sep->se_fd, &allsock))
 					  enable(sep);
 			      }
 			}
 			sep->se_accept = new->se_accept;
 			SWAP(char *, sep->se_user, new->se_user);
 			SWAP(char *, sep->se_group, new->se_group);
 #ifdef LOGIN_CAP
 			SWAP(char *, sep->se_class, new->se_class);
 #endif
 			SWAP(char *, sep->se_server, new->se_server);
 			SWAP(char *, sep->se_server_name, new->se_server_name);
 			for (i = 0; i < MAXARGV; i++)
 				SWAP(char *, sep->se_argv[i], new->se_argv[i]);
 #ifdef IPSEC
 			SWAP(char *, sep->se_policy, new->se_policy);
 			ipsecsetup(sep);
 #endif
 			sigsetmask(omask);
 			freeconfig(new);
 			if (debug)
 				print_service("REDO", sep);
 		} else {
 			sep = enter(new);
 			if (debug)
 				print_service("ADD ", sep);
 		}
 		sep->se_checked = 1;
 		if (ISMUX(sep)) {
 			sep->se_fd = -1;
 			continue;
 		}
 		switch (sep->se_family) {
 		case AF_INET:
 			if (!v4bind_ok) {
 				sep->se_fd = -1;
 				continue;
 			}
 			break;
 #ifdef INET6
 		case AF_INET6:
 			if (!v6bind_ok) {
 				sep->se_fd = -1;
 				continue;
 			}
 			break;
 #endif
 		}
 		if (!sep->se_rpc) {
 			if (sep->se_family != AF_UNIX) {
 				sp = getservbyname(sep->se_service, sep->se_proto);
 				if (sp == 0) {
 					syslog(LOG_ERR, "%s/%s: unknown service",
 					sep->se_service, sep->se_proto);
 					sep->se_checked = 0;
 					continue;
 				}
 			}
 			switch (sep->se_family) {
 			case AF_INET:
 				if (sp->s_port != sep->se_ctrladdr4.sin_port) {
 					sep->se_ctrladdr4.sin_port =
 						sp->s_port;
 					sep->se_reset = 1;
 				}
 				break;
 #ifdef INET6
 			case AF_INET6:
 				if (sp->s_port !=
 				    sep->se_ctrladdr6.sin6_port) {
 					sep->se_ctrladdr6.sin6_port =
 						sp->s_port;
 					sep->se_reset = 1;
 				}
 				break;
 #endif
 			}
 			if (sep->se_reset != 0 && sep->se_fd >= 0)
 				close_sep(sep);
 		} else {
 			rpc = getrpcbyname(sep->se_service);
 			if (rpc == 0) {
 				syslog(LOG_ERR, "%s/%s unknown RPC service",
 					sep->se_service, sep->se_proto);
 				if (sep->se_fd != -1)
 					(void) close(sep->se_fd);
 				sep->se_fd = -1;
 					continue;
 			}
 			if (sep->se_reset != 0 ||
 			    rpc->r_number != sep->se_rpc_prog) {
 				if (sep->se_rpc_prog)
 					unregisterrpc(sep);
 				sep->se_rpc_prog = rpc->r_number;
 				if (sep->se_fd != -1)
 					(void) close(sep->se_fd);
 				sep->se_fd = -1;
 			}
 			sep->se_nomapped = new_nomapped;
 		}
 		sep->se_reset = 0;
 		if (sep->se_fd == -1)
 			setup(sep);
 	}
 	endconfig();
 	/*
 	 * Purge anything not looked at above.
 	 */
 	omask = sigblock(SIGBLOCK);
 	sepp = &servtab;
 	while ((sep = *sepp)) {
 		if (sep->se_checked) {
 			sepp = &sep->se_next;
 			continue;
 		}
 		*sepp = sep->se_next;
 		if (sep->se_fd >= 0)
 			close_sep(sep);
 		if (debug)
 			print_service("FREE", sep);
 		if (sep->se_rpc && sep->se_rpc_prog > 0)
 			unregisterrpc(sep);
 		freeconfig(sep);
 		free(sep);
 	}
 	(void) sigsetmask(omask);
 }
 
 static void
 unregisterrpc(struct servtab *sep)
 {
         u_int i;
         struct servtab *sepp;
 	long omask;
 	struct netconfig *netid4, *netid6;
 
 	omask = sigblock(SIGBLOCK);
 	netid4 = sep->se_socktype == SOCK_DGRAM ? udpconf : tcpconf;
 	netid6 = sep->se_socktype == SOCK_DGRAM ? udp6conf : tcp6conf;
 	if (sep->se_family == AF_INET)
 		netid6 = NULL;
 	else if (sep->se_nomapped)
 		netid4 = NULL;
 	/*
 	 * Conflict if same prog and protocol - In that case one should look
 	 * to versions, but it is not interesting: having separate servers for
 	 * different versions does not work well.
 	 * Therefore one do not unregister if there is a conflict.
 	 * There is also transport conflict if destroying INET when INET46
 	 * exists, or destroying INET46 when INET exists
 	 */
         for (sepp = servtab; sepp; sepp = sepp->se_next) {
                 if (sepp == sep)
                         continue;
 		if (sepp->se_checked == 0 ||
                     !sepp->se_rpc ||
 		    strcmp(sep->se_proto, sepp->se_proto) != 0 ||
                     sep->se_rpc_prog != sepp->se_rpc_prog)
 			continue;
 		if (sepp->se_family == AF_INET)
 			netid4 = NULL;
 		if (sepp->se_family == AF_INET6) {
 			netid6 = NULL;
 			if (!sep->se_nomapped)
 				netid4 = NULL;
 		}
 		if (netid4 == NULL && netid6 == NULL)
 			return;
         }
         if (debug)
                 print_service("UNREG", sep);
         for (i = sep->se_rpc_lowvers; i <= sep->se_rpc_highvers; i++) {
 		if (netid4)
 			rpcb_unset(sep->se_rpc_prog, i, netid4);
 		if (netid6)
 			rpcb_unset(sep->se_rpc_prog, i, netid6);
 	}
         if (sep->se_fd != -1)
                 (void) close(sep->se_fd);
         sep->se_fd = -1;
 	(void) sigsetmask(omask);
 }
 
 static void
 flag_retry(int signo __unused)
 {
 	flag_signal('A');
 }
 
 static void
 retry(void)
 {
 	struct servtab *sep;
 
 	timingout = 0;
 	for (sep = servtab; sep; sep = sep->se_next)
 		if (sep->se_fd == -1 && !ISMUX(sep))
 			setup(sep);
 }
 
 static void
 setup(struct servtab *sep)
 {
 	int on = 1;
 
 	/* Set all listening sockets to close-on-exec. */
 	if ((sep->se_fd = socket(sep->se_family,
 	    sep->se_socktype | SOCK_CLOEXEC, 0)) < 0) {
 		if (debug)
 			warn("socket failed on %s/%s",
 				sep->se_service, sep->se_proto);
 		syslog(LOG_ERR, "%s/%s: socket: %m",
 		    sep->se_service, sep->se_proto);
 		return;
 	}
 #define	turnon(fd, opt) \
 setsockopt(fd, SOL_SOCKET, opt, (char *)&on, sizeof (on))
 	if (strcmp(sep->se_proto, "tcp") == 0 && (options & SO_DEBUG) &&
 	    turnon(sep->se_fd, SO_DEBUG) < 0)
 		syslog(LOG_ERR, "setsockopt (SO_DEBUG): %m");
 	if (turnon(sep->se_fd, SO_REUSEADDR) < 0)
 		syslog(LOG_ERR, "setsockopt (SO_REUSEADDR): %m");
 #ifdef SO_PRIVSTATE
 	if (turnon(sep->se_fd, SO_PRIVSTATE) < 0)
 		syslog(LOG_ERR, "setsockopt (SO_PRIVSTATE): %m");
 #endif
 	/* tftpd opens a new connection then needs more infos */
 #ifdef INET6
 	if ((sep->se_family == AF_INET6) &&
 	    (strcmp(sep->se_proto, "udp") == 0) &&
 	    (sep->se_accept == 0) &&
 	    (setsockopt(sep->se_fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
 			(char *)&on, sizeof (on)) < 0))
 		syslog(LOG_ERR, "setsockopt (IPV6_RECVPKTINFO): %m");
 	if (sep->se_family == AF_INET6) {
 		int flag = sep->se_nomapped ? 1 : 0;
 		if (setsockopt(sep->se_fd, IPPROTO_IPV6, IPV6_V6ONLY,
 			       (char *)&flag, sizeof (flag)) < 0)
 			syslog(LOG_ERR, "setsockopt (IPV6_V6ONLY): %m");
 	}
 #endif
 #undef turnon
 #ifdef IPSEC
 	ipsecsetup(sep);
 #endif
 	if (sep->se_family == AF_UNIX) {
 		(void) unlink(sep->se_ctrladdr_un.sun_path);
 		umask(0777); /* Make socket with conservative permissions */
 	}
 	if (bind(sep->se_fd, (struct sockaddr *)&sep->se_ctrladdr,
 	    sep->se_ctrladdr_size) < 0) {
 		if (debug)
 			warn("bind failed on %s/%s",
 				sep->se_service, sep->se_proto);
 		syslog(LOG_ERR, "%s/%s: bind: %m",
 		    sep->se_service, sep->se_proto);
 		(void) close(sep->se_fd);
 		sep->se_fd = -1;
 		if (!timingout) {
 			timingout = 1;
 			alarm(RETRYTIME);
 		}
 		if (sep->se_family == AF_UNIX)
 			umask(mask);
 		return;
 	}
 	if (sep->se_family == AF_UNIX) {
 		/* Ick - fch{own,mod} don't work on Unix domain sockets */
 		if (chown(sep->se_service, sep->se_sockuid, sep->se_sockgid) < 0)
 			syslog(LOG_ERR, "chown socket: %m");
 		if (chmod(sep->se_service, sep->se_sockmode) < 0)
 			syslog(LOG_ERR, "chmod socket: %m");
 		umask(mask);
 	}
         if (sep->se_rpc) {
 		u_int i;
 		socklen_t len = sep->se_ctrladdr_size;
 		struct netconfig *netid, *netid2 = NULL;
 #ifdef INET6
 		struct sockaddr_in sock;
 #endif
 		struct netbuf nbuf, nbuf2;
 
                 if (getsockname(sep->se_fd,
 				(struct sockaddr*)&sep->se_ctrladdr, &len) < 0){
                         syslog(LOG_ERR, "%s/%s: getsockname: %m",
                                sep->se_service, sep->se_proto);
                         (void) close(sep->se_fd);
                         sep->se_fd = -1;
                         return;
                 }
 		nbuf.buf = &sep->se_ctrladdr;
 		nbuf.len = sep->se_ctrladdr.sa_len;
 		if (sep->se_family == AF_INET)
 			netid = sep->se_socktype==SOCK_DGRAM? udpconf:tcpconf;
 #ifdef INET6
 		else  {
 			netid = sep->se_socktype==SOCK_DGRAM? udp6conf:tcp6conf;
 			if (!sep->se_nomapped) { /* INET and INET6 */
 				netid2 = netid==udp6conf? udpconf:tcpconf;
 				memset(&sock, 0, sizeof sock);	/* ADDR_ANY */
 				nbuf2.buf = &sock;
 				nbuf2.len = sock.sin_len = sizeof sock;
 				sock.sin_family = AF_INET;
 				sock.sin_port = sep->se_ctrladdr6.sin6_port;
 			}
 		}
 #endif
                 if (debug)
                         print_service("REG ", sep);
                 for (i = sep->se_rpc_lowvers; i <= sep->se_rpc_highvers; i++) {
 			rpcb_unset(sep->se_rpc_prog, i, netid);
 			rpcb_set(sep->se_rpc_prog, i, netid, &nbuf);
 			if (netid2) {
 				rpcb_unset(sep->se_rpc_prog, i, netid2);
 				rpcb_set(sep->se_rpc_prog, i, netid2, &nbuf2);
 			}
                 }
         }
 	if (sep->se_socktype == SOCK_STREAM)
 		listen(sep->se_fd, -1);
 	enable(sep);
 	if (debug) {
 		warnx("registered %s on %d",
 			sep->se_server, sep->se_fd);
 	}
 }
 
 #ifdef IPSEC
 static void
 ipsecsetup(struct servtab *sep)
 {
 	char *buf;
 	char *policy_in = NULL;
 	char *policy_out = NULL;
 	int level;
 	int opt;
 
 	switch (sep->se_family) {
 	case AF_INET:
 		level = IPPROTO_IP;
 		opt = IP_IPSEC_POLICY;
 		break;
 #ifdef INET6
 	case AF_INET6:
 		level = IPPROTO_IPV6;
 		opt = IPV6_IPSEC_POLICY;
 		break;
 #endif
 	default:
 		return;
 	}
 
 	if (!sep->se_policy || sep->se_policy[0] == '\0') {
 		static char def_in[] = "in entrust", def_out[] = "out entrust";
 		policy_in = def_in;
 		policy_out = def_out;
 	} else {
 		if (!strncmp("in", sep->se_policy, 2))
 			policy_in = sep->se_policy;
 		else if (!strncmp("out", sep->se_policy, 3))
 			policy_out = sep->se_policy;
 		else {
 			syslog(LOG_ERR, "invalid security policy \"%s\"",
 				sep->se_policy);
 			return;
 		}
 	}
 
 	if (policy_in != NULL) {
 		buf = ipsec_set_policy(policy_in, strlen(policy_in));
 		if (buf != NULL) {
 			if (setsockopt(sep->se_fd, level, opt,
 					buf, ipsec_get_policylen(buf)) < 0 &&
 			    debug != 0)
 				warnx("%s/%s: ipsec initialization failed; %s",
 				      sep->se_service, sep->se_proto,
 				      policy_in);
 			free(buf);
 		} else
 			syslog(LOG_ERR, "invalid security policy \"%s\"",
 				policy_in);
 	}
 	if (policy_out != NULL) {
 		buf = ipsec_set_policy(policy_out, strlen(policy_out));
 		if (buf != NULL) {
 			if (setsockopt(sep->se_fd, level, opt,
 					buf, ipsec_get_policylen(buf)) < 0 &&
 			    debug != 0)
 				warnx("%s/%s: ipsec initialization failed; %s",
 				      sep->se_service, sep->se_proto,
 				      policy_out);
 			free(buf);
 		} else
 			syslog(LOG_ERR, "invalid security policy \"%s\"",
 				policy_out);
 	}
 }
 #endif
 
 /*
  * Finish with a service and its socket.
  */
 static void
 close_sep(struct servtab *sep)
 {
 	if (sep->se_fd >= 0) {
 		if (FD_ISSET(sep->se_fd, &allsock))
 			disable(sep);
 		(void) close(sep->se_fd);
 		sep->se_fd = -1;
 	}
 	sep->se_count = 0;
 	sep->se_numchild = 0;	/* forget about any existing children */
 }
 
 static int
 matchservent(const char *name1, const char *name2, const char *proto)
 {
 	char **alias, *p;
 	struct servent *se;
 
 	if (strcmp(proto, "unix") == 0) {
 		if ((p = strrchr(name1, '/')) != NULL)
 			name1 = p + 1;
 		if ((p = strrchr(name2, '/')) != NULL)
 			name2 = p + 1;
 	}
 	if (strcmp(name1, name2) == 0)
 		return(1);
 	if ((se = getservbyname(name1, proto)) != NULL) {
 		if (strcmp(name2, se->s_name) == 0)
 			return(1);
 		for (alias = se->s_aliases; *alias; alias++)
 			if (strcmp(name2, *alias) == 0)
 				return(1);
 	}
 	return(0);
 }
 
 static struct servtab *
 enter(struct servtab *cp)
 {
 	struct servtab *sep;
 	long omask;
 
 	sep = (struct servtab *)malloc(sizeof (*sep));
 	if (sep == (struct servtab *)0) {
 		syslog(LOG_ERR, "malloc: %m");
 		exit(EX_OSERR);
 	}
 	*sep = *cp;
 	sep->se_fd = -1;
 	omask = sigblock(SIGBLOCK);
 	sep->se_next = servtab;
 	servtab = sep;
 	sigsetmask(omask);
 	return (sep);
 }
 
 static void
 enable(struct servtab *sep)
 {
 	if (debug)
 		warnx(
 		    "enabling %s, fd %d", sep->se_service, sep->se_fd);
 #ifdef SANITY_CHECK
 	if (sep->se_fd < 0) {
 		syslog(LOG_ERR,
 		    "%s: %s: bad fd", __func__, sep->se_service);
 		exit(EX_SOFTWARE);
 	}
 	if (ISMUX(sep)) {
 		syslog(LOG_ERR,
 		    "%s: %s: is mux", __func__, sep->se_service);
 		exit(EX_SOFTWARE);
 	}
 	if (FD_ISSET(sep->se_fd, &allsock)) {
 		syslog(LOG_ERR,
 		    "%s: %s: not off", __func__, sep->se_service);
 		exit(EX_SOFTWARE);
 	}
 	nsock++;
 #endif
 	FD_SET(sep->se_fd, &allsock);
 	if (sep->se_fd > maxsock)
 		maxsock = sep->se_fd;
 }
 
 static void
 disable(struct servtab *sep)
 {
 	if (debug)
 		warnx(
 		    "disabling %s, fd %d", sep->se_service, sep->se_fd);
 #ifdef SANITY_CHECK
 	if (sep->se_fd < 0) {
 		syslog(LOG_ERR,
 		    "%s: %s: bad fd", __func__, sep->se_service);
 		exit(EX_SOFTWARE);
 	}
 	if (ISMUX(sep)) {
 		syslog(LOG_ERR,
 		    "%s: %s: is mux", __func__, sep->se_service);
 		exit(EX_SOFTWARE);
 	}
 	if (!FD_ISSET(sep->se_fd, &allsock)) {
 		syslog(LOG_ERR,
 		    "%s: %s: not on", __func__, sep->se_service);
 		exit(EX_SOFTWARE);
 	}
 	if (nsock == 0) {
 		syslog(LOG_ERR, "%s: nsock=0", __func__);
 		exit(EX_SOFTWARE);
 	}
 	nsock--;
 #endif
 	FD_CLR(sep->se_fd, &allsock);
 	if (sep->se_fd == maxsock)
 		maxsock--;
 }
 
 static FILE	*fconfig = NULL;
 static struct	servtab serv;
 static char	line[LINE_MAX];
 
 static int
 setconfig(void)
 {
 
 	if (fconfig != NULL) {
 		fseek(fconfig, 0L, SEEK_SET);
 		return (1);
 	}
 	fconfig = fopen(CONFIG, "r");
 	return (fconfig != NULL);
 }
 
 static void
 endconfig(void)
 {
 	if (fconfig) {
 		(void) fclose(fconfig);
 		fconfig = NULL;
 	}
 }
 
 static struct servtab *
 getconfigent(void)
 {
 	struct servtab *sep = &serv;
 	int argc;
 	char *cp, *arg, *s;
 	char *versp;
 	static char TCPMUX_TOKEN[] = "tcpmux/";
 #define MUX_LEN		(sizeof(TCPMUX_TOKEN)-1)
 #ifdef IPSEC
 	char *policy;
 #endif
 	int v4bind;
 #ifdef INET6
 	int v6bind;
 #endif
 	int i;
 
 #ifdef IPSEC
 	policy = NULL;
 #endif
 more:
 	v4bind = 0;
 #ifdef INET6
 	v6bind = 0;
 #endif
 	while ((cp = nextline(fconfig)) != NULL) {
 #ifdef IPSEC
 		/* lines starting with #@ is not a comment, but the policy */
 		if (cp[0] == '#' && cp[1] == '@') {
 			char *p;
 			for (p = cp + 2; p && *p && isspace(*p); p++)
 				;
 			if (*p == '\0') {
 				if (policy)
 					free(policy);
 				policy = NULL;
 			} else if (ipsec_get_policylen(p) >= 0) {
 				if (policy)
 					free(policy);
 				policy = newstr(p);
 			} else {
 				syslog(LOG_ERR,
 					"%s: invalid ipsec policy \"%s\"",
 					CONFIG, p);
 				exit(EX_CONFIG);
 			}
 		}
 #endif
 		if (*cp == '#' || *cp == '\0')
 			continue;
 		break;
 	}
 	if (cp == NULL)
 		return ((struct servtab *)0);
 	/*
 	 * clear the static buffer, since some fields (se_ctrladdr,
 	 * for example) don't get initialized here.
 	 */
 	memset(sep, 0, sizeof *sep);
 	arg = skip(&cp);
 	if (cp == NULL) {
 		/* got an empty line containing just blanks/tabs. */
 		goto more;
 	}
 	if (arg[0] == ':') { /* :user:group:perm: */
 		char *user, *group, *perm;
 		struct passwd *pw;
 		struct group *gr;
 		user = arg+1;
 		if ((group = strchr(user, ':')) == NULL) {
 			syslog(LOG_ERR, "no group after user '%s'", user);
 			goto more;
 		}
 		*group++ = '\0';
 		if ((perm = strchr(group, ':')) == NULL) {
 			syslog(LOG_ERR, "no mode after group '%s'", group);
 			goto more;
 		}
 		*perm++ = '\0';
 		if ((pw = getpwnam(user)) == NULL) {
 			syslog(LOG_ERR, "no such user '%s'", user);
 			goto more;
 		}
 		sep->se_sockuid = pw->pw_uid;
 		if ((gr = getgrnam(group)) == NULL) {
 			syslog(LOG_ERR, "no such user '%s'", group);
 			goto more;
 		}
 		sep->se_sockgid = gr->gr_gid;
 		sep->se_sockmode = strtol(perm, &arg, 8);
 		if (*arg != ':') {
 			syslog(LOG_ERR, "bad mode '%s'", perm);
 			goto more;
 		}
 		*arg++ = '\0';
 	} else {
 		sep->se_sockuid = euid;
 		sep->se_sockgid = egid;
 		sep->se_sockmode = 0200;
 	}
 	if (strncmp(arg, TCPMUX_TOKEN, MUX_LEN) == 0) {
 		char *c = arg + MUX_LEN;
 		if (*c == '+') {
 			sep->se_type = MUXPLUS_TYPE;
 			c++;
 		} else
 			sep->se_type = MUX_TYPE;
 		sep->se_service = newstr(c);
 	} else {
 		sep->se_service = newstr(arg);
 		sep->se_type = NORM_TYPE;
 	}
 	arg = sskip(&cp);
 	if (strcmp(arg, "stream") == 0)
 		sep->se_socktype = SOCK_STREAM;
 	else if (strcmp(arg, "dgram") == 0)
 		sep->se_socktype = SOCK_DGRAM;
 	else if (strcmp(arg, "rdm") == 0)
 		sep->se_socktype = SOCK_RDM;
 	else if (strcmp(arg, "seqpacket") == 0)
 		sep->se_socktype = SOCK_SEQPACKET;
 	else if (strcmp(arg, "raw") == 0)
 		sep->se_socktype = SOCK_RAW;
 	else
 		sep->se_socktype = -1;
 
 	arg = sskip(&cp);
 	if (strncmp(arg, "tcp", 3) == 0) {
 		sep->se_proto = newstr(strsep(&arg, "/"));
 		if (arg != NULL && (strcmp(arg, "faith") == 0)) {
 			syslog(LOG_ERR, "faith has been deprecated");
 			goto more;
 		}
 	} else {
 		if (sep->se_type == NORM_TYPE &&
 		    strncmp(arg, "faith/", 6) == 0) {
 			syslog(LOG_ERR, "faith has been deprecated");
 			goto more;
 		}
 		sep->se_proto = newstr(arg);
 	}
         if (strncmp(sep->se_proto, "rpc/", 4) == 0) {
                 memmove(sep->se_proto, sep->se_proto + 4,
                     strlen(sep->se_proto) + 1 - 4);
                 sep->se_rpc = 1;
                 sep->se_rpc_prog = sep->se_rpc_lowvers =
 			sep->se_rpc_highvers = 0;
                 if ((versp = strrchr(sep->se_service, '/'))) {
                         *versp++ = '\0';
                         switch (sscanf(versp, "%u-%u",
                                        &sep->se_rpc_lowvers,
                                        &sep->se_rpc_highvers)) {
                         case 2:
                                 break;
                         case 1:
                                 sep->se_rpc_highvers =
                                         sep->se_rpc_lowvers;
                                 break;
                         default:
                                 syslog(LOG_ERR,
 					"bad RPC version specifier; %s",
 					sep->se_service);
                                 freeconfig(sep);
                                 goto more;
                         }
                 }
                 else {
                         sep->se_rpc_lowvers =
                                 sep->se_rpc_highvers = 1;
                 }
         }
 	sep->se_nomapped = 0;
 	if (strcmp(sep->se_proto, "unix") == 0) {
 	        sep->se_family = AF_UNIX;
 	} else {
 		while (isdigit(sep->se_proto[strlen(sep->se_proto) - 1])) {
 #ifdef INET6
 			if (sep->se_proto[strlen(sep->se_proto) - 1] == '6') {
 				sep->se_proto[strlen(sep->se_proto) - 1] = '\0';
 				v6bind = 1;
 				continue;
 			}
 #endif
 			if (sep->se_proto[strlen(sep->se_proto) - 1] == '4') {
 				sep->se_proto[strlen(sep->se_proto) - 1] = '\0';
 				v4bind = 1;
 				continue;
 			}
 			/* illegal version num */
 			syslog(LOG_ERR,	"bad IP version for %s", sep->se_proto);
 			freeconfig(sep);
 			goto more;
 		}
 #ifdef INET6
 		if (v6bind && !v6bind_ok) {
 			syslog(LOG_INFO, "IPv6 bind is ignored for %s",
 			       sep->se_service);
 			if (v4bind && v4bind_ok)
 				v6bind = 0;
 			else {
 				freeconfig(sep);
 				goto more;
 			}
 		}
 		if (v6bind) {
 			sep->se_family = AF_INET6;
 			if (!v4bind || !v4bind_ok)
 				sep->se_nomapped = 1;
 		} else
 #endif
 		{ /* default to v4 bind if not v6 bind */
 			if (!v4bind_ok) {
 				syslog(LOG_NOTICE, "IPv4 bind is ignored for %s",
 				       sep->se_service);
 				freeconfig(sep);
 				goto more;
 			}
 			sep->se_family = AF_INET;
 		}
 	}
 	/* init ctladdr */
 	switch(sep->se_family) {
 	case AF_INET:
 		memcpy(&sep->se_ctrladdr4, bind_sa4,
 		       sizeof(sep->se_ctrladdr4));
 		sep->se_ctrladdr_size =	sizeof(sep->se_ctrladdr4);
 		break;
 #ifdef INET6
 	case AF_INET6:
 		memcpy(&sep->se_ctrladdr6, bind_sa6,
 		       sizeof(sep->se_ctrladdr6));
 		sep->se_ctrladdr_size =	sizeof(sep->se_ctrladdr6);
 		break;
 #endif
 	case AF_UNIX:
 		if (strlen(sep->se_service) >= sizeof(sep->se_ctrladdr_un.sun_path)) {
 			syslog(LOG_ERR, 
 			    "domain socket pathname too long for service %s",
 			    sep->se_service);
 			goto more;
 		}
 		memset(&sep->se_ctrladdr, 0, sizeof(sep->se_ctrladdr));
 		sep->se_ctrladdr_un.sun_family = sep->se_family;
 		sep->se_ctrladdr_un.sun_len = strlen(sep->se_service);
 		strcpy(sep->se_ctrladdr_un.sun_path, sep->se_service);
 		sep->se_ctrladdr_size = SUN_LEN(&sep->se_ctrladdr_un);
 	}
 	arg = sskip(&cp);
 	if (!strncmp(arg, "wait", 4))
 		sep->se_accept = 0;
 	else if (!strncmp(arg, "nowait", 6))
 		sep->se_accept = 1;
 	else {
 		syslog(LOG_ERR,
 			"%s: bad wait/nowait for service %s",
 			CONFIG, sep->se_service);
 		goto more;
 	}
 	sep->se_maxchild = -1;
 	sep->se_maxcpm = -1;
 	sep->se_maxperip = -1;
 	if ((s = strchr(arg, '/')) != NULL) {
 		char *eptr;
 		u_long val;
 
 		val = strtoul(s + 1, &eptr, 10);
 		if (eptr == s + 1 || val > MAX_MAXCHLD) {
 			syslog(LOG_ERR,
 				"%s: bad max-child for service %s",
 				CONFIG, sep->se_service);
 			goto more;
 		}
 		if (debug)
 			if (!sep->se_accept && val != 1)
 				warnx("maxchild=%lu for wait service %s"
 				    " not recommended", val, sep->se_service);
 		sep->se_maxchild = val;
 		if (*eptr == '/')
 			sep->se_maxcpm = strtol(eptr + 1, &eptr, 10);
 		if (*eptr == '/')
 			sep->se_maxperip = strtol(eptr + 1, &eptr, 10);
 		/*
 		 * explicitly do not check for \0 for future expansion /
 		 * backwards compatibility
 		 */
 	}
 	if (ISMUX(sep)) {
 		/*
 		 * Silently enforce "nowait" mode for TCPMUX services
 		 * since they don't have an assigned port to listen on.
 		 */
 		sep->se_accept = 1;
 		if (strcmp(sep->se_proto, "tcp")) {
 			syslog(LOG_ERR,
 				"%s: bad protocol for tcpmux service %s",
 				CONFIG, sep->se_service);
 			goto more;
 		}
 		if (sep->se_socktype != SOCK_STREAM) {
 			syslog(LOG_ERR,
 				"%s: bad socket type for tcpmux service %s",
 				CONFIG, sep->se_service);
 			goto more;
 		}
 	}
 	sep->se_user = newstr(sskip(&cp));
 #ifdef LOGIN_CAP
 	if ((s = strrchr(sep->se_user, '/')) != NULL) {
 		*s = '\0';
 		sep->se_class = newstr(s + 1);
 	} else
 		sep->se_class = newstr(RESOURCE_RC);
 #endif
 	if ((s = strrchr(sep->se_user, ':')) != NULL) {
 		*s = '\0';
 		sep->se_group = newstr(s + 1);
 	} else
 		sep->se_group = NULL;
 	sep->se_server = newstr(sskip(&cp));
 	if ((sep->se_server_name = strrchr(sep->se_server, '/')))
 		sep->se_server_name++;
 	if (strcmp(sep->se_server, "internal") == 0) {
 		struct biltin *bi;
 
 		for (bi = biltins; bi->bi_service; bi++)
 			if (bi->bi_socktype == sep->se_socktype &&
 			    matchservent(bi->bi_service, sep->se_service,
 			    sep->se_proto))
 				break;
 		if (bi->bi_service == 0) {
 			syslog(LOG_ERR, "internal service %s unknown",
 				sep->se_service);
 			goto more;
 		}
 		sep->se_accept = 1;	/* force accept mode for built-ins */
 		sep->se_bi = bi;
 	} else
 		sep->se_bi = NULL;
 	if (sep->se_maxperip < 0)
 		sep->se_maxperip = maxperip;
 	if (sep->se_maxcpm < 0)
 		sep->se_maxcpm = maxcpm;
 	if (sep->se_maxchild < 0) {	/* apply default max-children */
 		if (sep->se_bi && sep->se_bi->bi_maxchild >= 0)
 			sep->se_maxchild = sep->se_bi->bi_maxchild;
 		else if (sep->se_accept) 
 			sep->se_maxchild = MAX(maxchild, 0);
 		else
 			sep->se_maxchild = 1;
 	}
 	if (sep->se_maxchild > 0) {
 		sep->se_pids = malloc(sep->se_maxchild * sizeof(*sep->se_pids));
 		if (sep->se_pids == NULL) {
 			syslog(LOG_ERR, "malloc: %m");
 			exit(EX_OSERR);
 		}
 	}
 	argc = 0;
 	for (arg = skip(&cp); cp; arg = skip(&cp))
 		if (argc < MAXARGV) {
 			sep->se_argv[argc++] = newstr(arg);
 		} else {
 			syslog(LOG_ERR,
 				"%s: too many arguments for service %s",
 				CONFIG, sep->se_service);
 			goto more;
 		}
 	while (argc <= MAXARGV)
 		sep->se_argv[argc++] = NULL;
 	for (i = 0; i < PERIPSIZE; ++i)
 		LIST_INIT(&sep->se_conn[i]);
 #ifdef IPSEC
 	sep->se_policy = policy ? newstr(policy) : NULL;
 #endif
 	return (sep);
 }
 
 static void
 freeconfig(struct servtab *cp)
 {
 	int i;
 
 	if (cp->se_service)
 		free(cp->se_service);
 	if (cp->se_proto)
 		free(cp->se_proto);
 	if (cp->se_user)
 		free(cp->se_user);
 	if (cp->se_group)
 		free(cp->se_group);
 #ifdef LOGIN_CAP
 	if (cp->se_class)
 		free(cp->se_class);
 #endif
 	if (cp->se_server)
 		free(cp->se_server);
 	if (cp->se_pids)
 		free(cp->se_pids);
 	for (i = 0; i < MAXARGV; i++)
 		if (cp->se_argv[i])
 			free(cp->se_argv[i]);
 	free_connlist(cp);
 #ifdef IPSEC
 	if (cp->se_policy)
 		free(cp->se_policy);
 #endif
 }
 
 
 /*
  * Safe skip - if skip returns null, log a syntax error in the
  * configuration file and exit.
  */
 static char *
 sskip(char **cpp)
 {
 	char *cp;
 
 	cp = skip(cpp);
 	if (cp == NULL) {
 		syslog(LOG_ERR, "%s: syntax error", CONFIG);
 		exit(EX_DATAERR);
 	}
 	return (cp);
 }
 
 static char *
 skip(char **cpp)
 {
 	char *cp = *cpp;
 	char *start;
 	char quote = '\0';
 
 again:
 	while (*cp == ' ' || *cp == '\t')
 		cp++;
 	if (*cp == '\0') {
 		int c;
 
 		c = getc(fconfig);
 		(void) ungetc(c, fconfig);
 		if (c == ' ' || c == '\t')
 			if ((cp = nextline(fconfig)))
 				goto again;
 		*cpp = (char *)0;
 		return ((char *)0);
 	}
 	if (*cp == '"' || *cp == '\'')
 		quote = *cp++;
 	start = cp;
 	if (quote)
 		while (*cp && *cp != quote)
 			cp++;
 	else
 		while (*cp && *cp != ' ' && *cp != '\t')
 			cp++;
 	if (*cp != '\0')
 		*cp++ = '\0';
 	*cpp = cp;
 	return (start);
 }
 
 static char *
 nextline(FILE *fd)
 {
 	char *cp;
 
 	if (fgets(line, sizeof (line), fd) == NULL)
 		return ((char *)0);
 	cp = strchr(line, '\n');
 	if (cp)
 		*cp = '\0';
 	return (line);
 }
 
 static char *
 newstr(const char *cp)
 {
 	char *cr;
 
 	if ((cr = strdup(cp != NULL ? cp : "")))
 		return (cr);
 	syslog(LOG_ERR, "strdup: %m");
 	exit(EX_OSERR);
 }
 
 void
 inetd_setproctitle(const char *a, int s)
 {
 	socklen_t size;
 	struct sockaddr_storage ss;
 	char buf[80], pbuf[NI_MAXHOST];
 
 	size = sizeof(ss);
 	if (getpeername(s, (struct sockaddr *)&ss, &size) == 0) {
 		getnameinfo((struct sockaddr *)&ss, size, pbuf, sizeof(pbuf),
 			    NULL, 0, NI_NUMERICHOST);
 		(void) sprintf(buf, "%s [%s]", a, pbuf);
 	} else
 		(void) sprintf(buf, "%s", a);
 	setproctitle("%s", buf);
 }
 
 int
 check_loop(const struct sockaddr *sa, const struct servtab *sep)
 {
 	struct servtab *se2;
 	char pname[NI_MAXHOST];
 
 	for (se2 = servtab; se2; se2 = se2->se_next) {
 		if (!se2->se_bi || se2->se_socktype != SOCK_DGRAM)
 			continue;
 
 		switch (se2->se_family) {
 		case AF_INET:
 			if (csatosin(sa)->sin_port ==
 			    se2->se_ctrladdr4.sin_port)
 				goto isloop;
 			continue;
 #ifdef INET6
 		case AF_INET6:
 			if (csatosin6(sa)->sin6_port ==
 			    se2->se_ctrladdr6.sin6_port)
 				goto isloop;
 			continue;
 #endif
 		default:
 			continue;
 		}
 	isloop:
 		getnameinfo(sa, sa->sa_len, pname, sizeof(pname), NULL, 0,
 			    NI_NUMERICHOST);
 		syslog(LOG_WARNING, "%s/%s:%s/%s loop request REFUSED from %s",
 		       sep->se_service, sep->se_proto,
 		       se2->se_service, se2->se_proto,
 		       pname);
 		return 1;
 	}
 	return 0;
 }
 
 /*
  * print_service:
  *	Dump relevant information to stderr
  */
 static void
 print_service(const char *action, const struct servtab *sep)
 {
 	fprintf(stderr,
 	    "%s: %s proto=%s accept=%d max=%d user=%s group=%s"
 #ifdef LOGIN_CAP
 	    "class=%s"
 #endif
 	    " builtin=%p server=%s"
 #ifdef IPSEC
 	    " policy=\"%s\""
 #endif
 	    "\n",
 	    action, sep->se_service, sep->se_proto,
 	    sep->se_accept, sep->se_maxchild, sep->se_user, sep->se_group,
 #ifdef LOGIN_CAP
 	    sep->se_class,
 #endif
 	    (void *) sep->se_bi, sep->se_server
 #ifdef IPSEC
 	    , (sep->se_policy ? sep->se_policy : "")
 #endif
 	    );
 }
 
 #define CPMHSIZE	256
 #define CPMHMASK	(CPMHSIZE-1)
 #define CHTGRAN		10
 #define CHTSIZE		6
 
 typedef struct CTime {
 	unsigned long 	ct_Ticks;
 	int		ct_Count;
 } CTime;
 
 typedef struct CHash {
 	union {
 		struct in_addr	c4_Addr;
 		struct in6_addr	c6_Addr;
 	} cu_Addr;
 #define	ch_Addr4	cu_Addr.c4_Addr
 #define	ch_Addr6	cu_Addr.c6_Addr
 	int		ch_Family;
 	time_t		ch_LTime;
 	char		*ch_Service;
 	CTime		ch_Times[CHTSIZE];
 } CHash;
 
 static CHash	CHashAry[CPMHSIZE];
 
 static int
 cpmip(const struct servtab *sep, int ctrl)
 {
 	struct sockaddr_storage rss;
 	socklen_t rssLen = sizeof(rss);
 	int r = 0;
 
 	/*
 	 * If getpeername() fails, just let it through (if logging is
 	 * enabled the condition is caught elsewhere)
 	 */
 
 	if (sep->se_maxcpm > 0 && 
 	   (sep->se_family == AF_INET || sep->se_family == AF_INET6) &&
 	    getpeername(ctrl, (struct sockaddr *)&rss, &rssLen) == 0 ) {
 		time_t t = time(NULL);
 		int hv = 0xABC3D20F;
 		int i;
 		int cnt = 0;
 		CHash *chBest = NULL;
 		unsigned int ticks = t / CHTGRAN;
 		struct sockaddr_in *sin4;
 #ifdef INET6
 		struct sockaddr_in6 *sin6;
 #endif
 
 		sin4 = (struct sockaddr_in *)&rss;
 #ifdef INET6
 		sin6 = (struct sockaddr_in6 *)&rss;
 #endif
 		{
 			char *p;
 			int addrlen;
 
 			switch (rss.ss_family) {
 			case AF_INET:
 				p = (char *)&sin4->sin_addr;
 				addrlen = sizeof(struct in_addr);
 				break;
 #ifdef INET6
 			case AF_INET6:
 				p = (char *)&sin6->sin6_addr;
 				addrlen = sizeof(struct in6_addr);
 				break;
 #endif
 			default:
 				/* should not happen */
 				return -1;
 			}
 
 			for (i = 0; i < addrlen; ++i, ++p) {
 				hv = (hv << 5) ^ (hv >> 23) ^ *p;
 			}
 			hv = (hv ^ (hv >> 16));
 		}
 		for (i = 0; i < 5; ++i) {
 			CHash *ch = &CHashAry[(hv + i) & CPMHMASK];
 
 			if (rss.ss_family == AF_INET &&
 			    ch->ch_Family == AF_INET &&
 			    sin4->sin_addr.s_addr == ch->ch_Addr4.s_addr &&
 			    ch->ch_Service && strcmp(sep->se_service,
 			    ch->ch_Service) == 0) {
 				chBest = ch;
 				break;
 			}
 #ifdef INET6
 			if (rss.ss_family == AF_INET6 &&
 			    ch->ch_Family == AF_INET6 &&
 			    IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
 					       &ch->ch_Addr6) != 0 &&
 			    ch->ch_Service && strcmp(sep->se_service,
 			    ch->ch_Service) == 0) {
 				chBest = ch;
 				break;
 			}
 #endif
 			if (chBest == NULL || ch->ch_LTime == 0 || 
 			    ch->ch_LTime < chBest->ch_LTime) {
 				chBest = ch;
 			}
 		}
 		if ((rss.ss_family == AF_INET &&
 		     (chBest->ch_Family != AF_INET ||
 		      sin4->sin_addr.s_addr != chBest->ch_Addr4.s_addr)) ||
 		    chBest->ch_Service == NULL ||
 		    strcmp(sep->se_service, chBest->ch_Service) != 0) {
 			chBest->ch_Family = sin4->sin_family;
 			chBest->ch_Addr4 = sin4->sin_addr;
 			if (chBest->ch_Service)
 				free(chBest->ch_Service);
 			chBest->ch_Service = strdup(sep->se_service);
 			bzero(chBest->ch_Times, sizeof(chBest->ch_Times));
 		} 
 #ifdef INET6
 		if ((rss.ss_family == AF_INET6 &&
 		     (chBest->ch_Family != AF_INET6 ||
 		      IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
 					 &chBest->ch_Addr6) == 0)) ||
 		    chBest->ch_Service == NULL ||
 		    strcmp(sep->se_service, chBest->ch_Service) != 0) {
 			chBest->ch_Family = sin6->sin6_family;
 			chBest->ch_Addr6 = sin6->sin6_addr;
 			if (chBest->ch_Service)
 				free(chBest->ch_Service);
 			chBest->ch_Service = strdup(sep->se_service);
 			bzero(chBest->ch_Times, sizeof(chBest->ch_Times));
 		}
 #endif
 		chBest->ch_LTime = t;
 		{
 			CTime *ct = &chBest->ch_Times[ticks % CHTSIZE];
 			if (ct->ct_Ticks != ticks) {
 				ct->ct_Ticks = ticks;
 				ct->ct_Count = 0;
 			}
 			++ct->ct_Count;
 		}
 		for (i = 0; i < CHTSIZE; ++i) {
 			CTime *ct = &chBest->ch_Times[i];
 			if (ct->ct_Ticks <= ticks &&
 			    ct->ct_Ticks >= ticks - CHTSIZE) {
 				cnt += ct->ct_Count;
 			}
 		}
 		if ((cnt * 60) / (CHTSIZE * CHTGRAN) > sep->se_maxcpm) {
 			char pname[NI_MAXHOST];
 
 			getnameinfo((struct sockaddr *)&rss,
 				    ((struct sockaddr *)&rss)->sa_len,
 				    pname, sizeof(pname), NULL, 0,
 				    NI_NUMERICHOST);
 			r = -1;
 			syslog(LOG_ERR,
 			    "%s from %s exceeded counts/min (limit %d/min)",
 			    sep->se_service, pname,
 			    sep->se_maxcpm);
 		}
 	}
 	return(r);
 }
 
 static struct conninfo *
 search_conn(struct servtab *sep, int ctrl)
 {
 	struct sockaddr_storage ss;
 	socklen_t sslen = sizeof(ss);
 	struct conninfo *conn;
 	int hv;
 	char pname[NI_MAXHOST],  pname2[NI_MAXHOST];
 
 	if (sep->se_maxperip <= 0)
 		return NULL;
 
 	/*
 	 * If getpeername() fails, just let it through (if logging is
 	 * enabled the condition is caught elsewhere)
 	 */
 	if (getpeername(ctrl, (struct sockaddr *)&ss, &sslen) != 0)
 		return NULL;
 
 	switch (ss.ss_family) {
 	case AF_INET:
 		hv = hashval((char *)&((struct sockaddr_in *)&ss)->sin_addr,
 		    sizeof(struct in_addr));
 		break;
 #ifdef INET6
 	case AF_INET6:
 		hv = hashval((char *)&((struct sockaddr_in6 *)&ss)->sin6_addr,
 		    sizeof(struct in6_addr));
 		break;
 #endif
 	default:
 		/*
 		 * Since we only support AF_INET and AF_INET6, just
 		 * let other than AF_INET and AF_INET6 through.
 		 */
 		return NULL;
 	}
 
 	if (getnameinfo((struct sockaddr *)&ss, sslen, pname, sizeof(pname),
 	    NULL, 0, NI_NUMERICHOST) != 0)
 		return NULL;
 
 	LIST_FOREACH(conn, &sep->se_conn[hv], co_link) {
 		if (getnameinfo((struct sockaddr *)&conn->co_addr,
 		    conn->co_addr.ss_len, pname2, sizeof(pname2), NULL, 0,
 		    NI_NUMERICHOST) == 0 &&
 		    strcmp(pname, pname2) == 0)
 			break;
 	}
 
 	if (conn == NULL) {
 		if ((conn = malloc(sizeof(struct conninfo))) == NULL) {
 			syslog(LOG_ERR, "malloc: %m");
 			exit(EX_OSERR);
 		}
 		conn->co_proc = malloc(sep->se_maxperip * sizeof(*conn->co_proc));
 		if (conn->co_proc == NULL) {
 			syslog(LOG_ERR, "malloc: %m");
 			exit(EX_OSERR);
 		}
 		memcpy(&conn->co_addr, (struct sockaddr *)&ss, sslen);
 		conn->co_numchild = 0;
 		LIST_INSERT_HEAD(&sep->se_conn[hv], conn, co_link);
 	}
 
 	/*
 	 * Since a child process is not invoked yet, we cannot
 	 * determine a pid of a child.  So, co_proc and co_numchild
 	 * should be filled leter.
 	 */
 
 	return conn;
 }
 
 static int
 room_conn(struct servtab *sep, struct conninfo *conn)
 {
 	char pname[NI_MAXHOST];
 
 	if (conn->co_numchild >= sep->se_maxperip) {
 		getnameinfo((struct sockaddr *)&conn->co_addr,
 		    conn->co_addr.ss_len, pname, sizeof(pname), NULL, 0,
 		    NI_NUMERICHOST);
 		syslog(LOG_ERR, "%s from %s exceeded counts (limit %d)",
 		    sep->se_service, pname, sep->se_maxperip);
 		return 0;
 	}
 	return 1;
 }
 
 static void
 addchild_conn(struct conninfo *conn, pid_t pid)
 {
 	struct procinfo *proc;
 
 	if (conn == NULL)
 		return;
 
 	if ((proc = search_proc(pid, 1)) != NULL) {
 		if (proc->pr_conn != NULL) {
 			syslog(LOG_ERR,
 			    "addchild_conn: child already on process list");
 			exit(EX_OSERR);
 		}
 		proc->pr_conn = conn;
 	}
 
 	conn->co_proc[conn->co_numchild++] = proc;
 }
 
 static void
 reapchild_conn(pid_t pid)
 {
 	struct procinfo *proc;
 	struct conninfo *conn;
 	int i;
 
 	if ((proc = search_proc(pid, 0)) == NULL)
 		return;
 	if ((conn = proc->pr_conn) == NULL)
 		return;
 	for (i = 0; i < conn->co_numchild; ++i)
 		if (conn->co_proc[i] == proc) {
 			conn->co_proc[i] = conn->co_proc[--conn->co_numchild];
 			break;
 		}
 	free_proc(proc);
 	free_conn(conn);
 }
 
 static void
 resize_conn(struct servtab *sep, int maxpip)
 {
 	struct conninfo *conn;
 	int i, j;
 
 	if (sep->se_maxperip <= 0)
 		return;
 	if (maxpip <= 0) {
 		free_connlist(sep);
 		return;
 	}
 	for (i = 0; i < PERIPSIZE; ++i) {
 		LIST_FOREACH(conn, &sep->se_conn[i], co_link) {
 			for (j = maxpip; j < conn->co_numchild; ++j)
 				free_proc(conn->co_proc[j]);
 			conn->co_proc = realloc(conn->co_proc,
 			    maxpip * sizeof(*conn->co_proc));
 			if (conn->co_proc == NULL) {
 				syslog(LOG_ERR, "realloc: %m");
 				exit(EX_OSERR);
 			}
 			if (conn->co_numchild > maxpip)
 				conn->co_numchild = maxpip;
 		}
 	}
 }
 
 static void
 free_connlist(struct servtab *sep)
 {
 	struct conninfo *conn;
 	int i, j;
 
 	for (i = 0; i < PERIPSIZE; ++i) {
 		while ((conn = LIST_FIRST(&sep->se_conn[i])) != NULL) {
 			for (j = 0; j < conn->co_numchild; ++j)
 				free_proc(conn->co_proc[j]);
 			conn->co_numchild = 0;
 			free_conn(conn);
 		}
 	}
 }
 
 static void
 free_conn(struct conninfo *conn)
 {
 	if (conn == NULL)
 		return;
 	if (conn->co_numchild <= 0) {
 		LIST_REMOVE(conn, co_link);
 		free(conn->co_proc);
 		free(conn);
 	}
 }
 
 static struct procinfo *
 search_proc(pid_t pid, int add)
 {
 	struct procinfo *proc;
 	int hv;
 
 	hv = hashval((char *)&pid, sizeof(pid));
 	LIST_FOREACH(proc, &proctable[hv], pr_link) {
 		if (proc->pr_pid == pid)
 			break;
 	}
 	if (proc == NULL && add) {
 		if ((proc = malloc(sizeof(struct procinfo))) == NULL) {
 			syslog(LOG_ERR, "malloc: %m");
 			exit(EX_OSERR);
 		}
 		proc->pr_pid = pid;
 		proc->pr_conn = NULL;
 		LIST_INSERT_HEAD(&proctable[hv], proc, pr_link);
 	}
 	return proc;
 }
 
 static void
 free_proc(struct procinfo *proc)
 {
 	if (proc == NULL)
 		return;
 	LIST_REMOVE(proc, pr_link);
 	free(proc);
 }
 
 static int
 hashval(char *p, int len)
 {
 	int i, hv = 0xABC3D20F;
 
 	for (i = 0; i < len; ++i, ++p)
 		hv = (hv << 5) ^ (hv >> 23) ^ *p;
 	hv = (hv ^ (hv >> 16)) & (PERIPSIZE - 1);
 	return hv;
 }
Index: projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/iscsid.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/iscsid.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/iscsid.c	(revision 312218)
@@ -1,643 +1,642 @@
 /*-
  * Copyright (c) 2012 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/ioctl.h>
 #include <sys/param.h>
 #include <sys/linker.h>
 #include <sys/socket.h>
 #include <sys/capsicum.h>
 #include <sys/wait.h>
 #include <assert.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <libutil.h>
 #include <netdb.h>
 #include <signal.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
 #include "iscsid.h"
 
 static volatile bool sigalrm_received = false;
 
 static int nchildren = 0;
 
 static void
 usage(void)
 {
 
 	fprintf(stderr, "usage: iscsid [-P pidfile][-d][-m maxproc][-t timeout]\n");
 	exit(1);
 }
 
 char *
 checked_strdup(const char *s)
 {
 	char *c;
 
 	c = strdup(s);
 	if (c == NULL)
 		log_err(1, "strdup");
 	return (c);
 }
 
 static void
 resolve_addr(const struct connection *conn, const char *address,
     struct addrinfo **ai, bool initiator_side)
 {
 	struct addrinfo hints;
 	char *arg, *addr, *ch;
 	const char *port;
 	int error, colons = 0;
 
 	arg = checked_strdup(address);
 
 	if (arg[0] == '\0') {
 		fail(conn, "empty address");
 		log_errx(1, "empty address");
 	}
 	if (arg[0] == '[') {
 		/*
 		 * IPv6 address in square brackets, perhaps with port.
 		 */
 		arg++;
 		addr = strsep(&arg, "]");
 		if (arg == NULL) {
 			fail(conn, "malformed address");
 			log_errx(1, "malformed address %s", address);
 		}
 		if (arg[0] == '\0') {
 			port = NULL;
 		} else if (arg[0] == ':') {
 			port = arg + 1;
 		} else {
 			fail(conn, "malformed address");
 			log_errx(1, "malformed address %s", address);
 		}
 	} else {
 		/*
 		 * Either IPv6 address without brackets - and without
 		 * a port - or IPv4 address.  Just count the colons.
 		 */
 		for (ch = arg; *ch != '\0'; ch++) {
 			if (*ch == ':')
 				colons++;
 		}
 		if (colons > 1) {
 			addr = arg;
 			port = NULL;
 		} else {
 			addr = strsep(&arg, ":");
 			if (arg == NULL)
 				port = NULL;
 			else
 				port = arg;
 		}
 	}
 
 	if (port == NULL && !initiator_side)
 		port = "3260";
 
 	memset(&hints, 0, sizeof(hints));
 	hints.ai_family = PF_UNSPEC;
 	hints.ai_socktype = SOCK_STREAM;
 	hints.ai_flags = AI_ADDRCONFIG | AI_NUMERICSERV;
 	if (initiator_side)
 		hints.ai_flags |= AI_PASSIVE;
 
 	error = getaddrinfo(addr, port, &hints, ai);
 	if (error != 0) {
 		fail(conn, gai_strerror(error));
 		log_errx(1, "getaddrinfo for %s failed: %s",
 		    address, gai_strerror(error));
 	}
 }
 
 static struct connection *
 connection_new(int iscsi_fd, const struct iscsi_daemon_request *request)
 {
 	struct connection *conn;
 	struct iscsi_session_limits *isl;
 	struct addrinfo *from_ai, *to_ai;
 	const char *from_addr, *to_addr;
 #ifdef ICL_KERNEL_PROXY
 	struct iscsi_daemon_connect idc;
 #endif
 	int error, sockbuf;
 
 	conn = calloc(1, sizeof(*conn));
 	if (conn == NULL)
 		log_err(1, "calloc");
 
 	/*
 	 * Default values, from RFC 3720, section 12.
 	 */
 	conn->conn_header_digest = CONN_DIGEST_NONE;
 	conn->conn_data_digest = CONN_DIGEST_NONE;
 	conn->conn_initial_r2t = true;
 	conn->conn_immediate_data = true;
-	conn->conn_max_burst_length = MAX_BURST_LENGTH;
-	conn->conn_first_burst_length = FIRST_BURST_LENGTH;
+	conn->conn_max_recv_data_segment_length = 8192;
+	conn->conn_max_send_data_segment_length = 8192;
+	conn->conn_max_burst_length = 262144;
+	conn->conn_first_burst_length = 65536;
 	conn->conn_iscsi_fd = iscsi_fd;
 
 	conn->conn_session_id = request->idr_session_id;
 	memcpy(&conn->conn_conf, &request->idr_conf, sizeof(conn->conn_conf));
 	memcpy(&conn->conn_isid, &request->idr_isid, sizeof(conn->conn_isid));
 	conn->conn_tsih = request->idr_tsih;
 
 	/*
 	 * Read the driver limits and provide reasonable defaults for the ones
 	 * the driver doesn't care about.  If a max_snd_dsl is not explicitly
 	 * provided by the driver then we'll make sure both conn->max_snd_dsl
 	 * and isl->max_snd_dsl are set to the rcv_dsl.  This preserves historic
 	 * behavior.
 	 */
 	isl = &conn->conn_limits;
 	memcpy(isl, &request->idr_limits, sizeof(*isl));
-	if (isl->isl_max_recv_data_segment_length == 0) {
-		conn->conn_max_recv_data_segment_length = 8192;
-		conn->conn_max_send_data_segment_length = 8192;
-		isl->isl_max_recv_data_segment_length = 8192;
-	} else {
-		conn->conn_max_recv_data_segment_length =
-		    isl->isl_max_recv_data_segment_length;
-		conn->conn_max_send_data_segment_length =
-		    isl->isl_max_recv_data_segment_length;
-	}
-	if (isl->isl_max_send_data_segment_length == 0) {
+	if (isl->isl_max_recv_data_segment_length == 0)
+		isl->isl_max_recv_data_segment_length = (1 << 24) - 1;
+	if (isl->isl_max_send_data_segment_length == 0)
 		isl->isl_max_send_data_segment_length =
 		    isl->isl_max_recv_data_segment_length;
-	} else {
+	if (isl->isl_max_burst_length == 0)
+		isl->isl_max_burst_length = (1 << 24) - 1;
+	if (isl->isl_first_burst_length == 0)
+		isl->isl_first_burst_length = (1 << 24) - 1;
+	if (isl->isl_first_burst_length > isl->isl_max_burst_length)
+		isl->isl_first_burst_length = isl->isl_max_burst_length;
+
+	/*
+	 * Limit default send length in case it won't be negotiated.
+	 * We can't do it for other limits, since they may affect both
+	 * sender and receiver operation, and we must obey defaults.
+	 */
+	if (conn->conn_max_send_data_segment_length >
+	    isl->isl_max_send_data_segment_length) {
 		conn->conn_max_send_data_segment_length =
 		    isl->isl_max_send_data_segment_length;
-	}
-	if (isl->isl_max_burst_length == 0)
-		isl->isl_max_burst_length = conn->conn_max_burst_length;
-	if (isl->isl_first_burst_length == 0) {
-		if (isl->isl_max_burst_length < (int)conn->conn_first_burst_length)
-			isl->isl_first_burst_length = isl->isl_max_burst_length;
-		else
-			isl->isl_first_burst_length = conn->conn_first_burst_length;
 	}
 
 	from_addr = conn->conn_conf.isc_initiator_addr;
 	to_addr = conn->conn_conf.isc_target_addr;
 
 	if (from_addr[0] != '\0')
 		resolve_addr(conn, from_addr, &from_ai, true);
 	else
 		from_ai = NULL;
 
 	resolve_addr(conn, to_addr, &to_ai, false);
 
 #ifdef ICL_KERNEL_PROXY
 	if (conn->conn_conf.isc_iser) {
 		memset(&idc, 0, sizeof(idc));
 		idc.idc_session_id = conn->conn_session_id;
 		if (conn->conn_conf.isc_iser)
 			idc.idc_iser = 1;
 		idc.idc_domain = to_ai->ai_family;
 		idc.idc_socktype = to_ai->ai_socktype;
 		idc.idc_protocol = to_ai->ai_protocol;
 		if (from_ai != NULL) {
 			idc.idc_from_addr = from_ai->ai_addr;
 			idc.idc_from_addrlen = from_ai->ai_addrlen;
 		}
 		idc.idc_to_addr = to_ai->ai_addr;
 		idc.idc_to_addrlen = to_ai->ai_addrlen;
 
 		log_debugx("connecting to %s using ICL kernel proxy", to_addr);
 		error = ioctl(iscsi_fd, ISCSIDCONNECT, &idc);
 		if (error != 0) {
 			fail(conn, strerror(errno));
 			log_err(1, "failed to connect to %s "
 			    "using ICL kernel proxy: ISCSIDCONNECT", to_addr);
 		}
 
 		return (conn);
 	}
 #endif /* ICL_KERNEL_PROXY */
 
 	if (conn->conn_conf.isc_iser) {
 		fail(conn, "iSER not supported");
 		log_errx(1, "iscsid(8) compiled without ICL_KERNEL_PROXY "
 		    "does not support iSER");
 	}
 
 	conn->conn_socket = socket(to_ai->ai_family, to_ai->ai_socktype,
 	    to_ai->ai_protocol);
 	if (conn->conn_socket < 0) {
 		fail(conn, strerror(errno));
 		log_err(1, "failed to create socket for %s", from_addr);
 	}
 	sockbuf = SOCKBUF_SIZE;
 	if (setsockopt(conn->conn_socket, SOL_SOCKET, SO_RCVBUF,
 	    &sockbuf, sizeof(sockbuf)) == -1)
 		log_warn("setsockopt(SO_RCVBUF) failed");
 	sockbuf = SOCKBUF_SIZE;
 	if (setsockopt(conn->conn_socket, SOL_SOCKET, SO_SNDBUF,
 	    &sockbuf, sizeof(sockbuf)) == -1)
 		log_warn("setsockopt(SO_SNDBUF) failed");
 	if (from_ai != NULL) {
 		error = bind(conn->conn_socket, from_ai->ai_addr,
 		    from_ai->ai_addrlen);
 		if (error != 0) {
 			fail(conn, strerror(errno));
 			log_err(1, "failed to bind to %s", from_addr);
 		}
 	}
 	log_debugx("connecting to %s", to_addr);
 	error = connect(conn->conn_socket, to_ai->ai_addr, to_ai->ai_addrlen);
 	if (error != 0) {
 		fail(conn, strerror(errno));
 		log_err(1, "failed to connect to %s", to_addr);
 	}
 
 	return (conn);
 }
 
 static void
 handoff(struct connection *conn)
 {
 	struct iscsi_daemon_handoff idh;
 	int error;
 
 	log_debugx("handing off connection to the kernel");
 
 	memset(&idh, 0, sizeof(idh));
 	idh.idh_session_id = conn->conn_session_id;
 	idh.idh_socket = conn->conn_socket;
 	strlcpy(idh.idh_target_alias, conn->conn_target_alias,
 	    sizeof(idh.idh_target_alias));
 	idh.idh_tsih = conn->conn_tsih;
 	idh.idh_statsn = conn->conn_statsn;
 	idh.idh_header_digest = conn->conn_header_digest;
 	idh.idh_data_digest = conn->conn_data_digest;
 	idh.idh_initial_r2t = conn->conn_initial_r2t;
 	idh.idh_immediate_data = conn->conn_immediate_data;
 	idh.idh_max_recv_data_segment_length =
 	    conn->conn_max_recv_data_segment_length;
 	idh.idh_max_send_data_segment_length =
 	    conn->conn_max_send_data_segment_length;
 	idh.idh_max_burst_length = conn->conn_max_burst_length;
 	idh.idh_first_burst_length = conn->conn_first_burst_length;
 
 	error = ioctl(conn->conn_iscsi_fd, ISCSIDHANDOFF, &idh);
 	if (error != 0)
 		log_err(1, "ISCSIDHANDOFF");
 }
 
 void
 fail(const struct connection *conn, const char *reason)
 {
 	struct iscsi_daemon_fail idf;
 	int error, saved_errno;
 
 	saved_errno = errno;
 
 	memset(&idf, 0, sizeof(idf));
 	idf.idf_session_id = conn->conn_session_id;
 	strlcpy(idf.idf_reason, reason, sizeof(idf.idf_reason));
 
 	error = ioctl(conn->conn_iscsi_fd, ISCSIDFAIL, &idf);
 	if (error != 0)
 		log_err(1, "ISCSIDFAIL");
 
 	errno = saved_errno;
 }
 
 /*
  * XXX: I CANT INTO LATIN
  */
 static void
 capsicate(struct connection *conn)
 {
 	int error;
 	cap_rights_t rights;
 #ifdef ICL_KERNEL_PROXY
 	const unsigned long cmds[] = { ISCSIDCONNECT, ISCSIDSEND, ISCSIDRECEIVE,
 	    ISCSIDHANDOFF, ISCSIDFAIL, ISCSISADD, ISCSISREMOVE, ISCSISMODIFY };
 #else
 	const unsigned long cmds[] = { ISCSIDHANDOFF, ISCSIDFAIL, ISCSISADD,
 	    ISCSISREMOVE, ISCSISMODIFY };
 #endif
 
 	cap_rights_init(&rights, CAP_IOCTL);
 	error = cap_rights_limit(conn->conn_iscsi_fd, &rights);
 	if (error != 0 && errno != ENOSYS)
 		log_err(1, "cap_rights_limit");
 
 	error = cap_ioctls_limit(conn->conn_iscsi_fd, cmds,
 	    sizeof(cmds) / sizeof(cmds[0]));
 	if (error != 0 && errno != ENOSYS)
 		log_err(1, "cap_ioctls_limit");
 
 	error = cap_enter();
 	if (error != 0 && errno != ENOSYS)
 		log_err(1, "cap_enter");
 
 	if (cap_sandboxed())
 		log_debugx("Capsicum capability mode enabled");
 	else
 		log_warnx("Capsicum capability mode not supported");
 }
 
 bool
 timed_out(void)
 {
 
 	return (sigalrm_received);
 }
 
 static void
 sigalrm_handler(int dummy __unused)
 {
 	/*
 	 * It would be easiest to just log an error and exit.  We can't
 	 * do this, though, because log_errx() is not signal safe, since
 	 * it calls syslog(3).  Instead, set a flag checked by pdu_send()
 	 * and pdu_receive(), to call log_errx() there.  Should they fail
 	 * to notice, we'll exit here one second later.
 	 */
 	if (sigalrm_received) {
 		/*
 		 * Oh well.  Just give up and quit.
 		 */
 		_exit(2);
 	}
 
 	sigalrm_received = true;
 }
 
 static void
 set_timeout(int timeout)
 {
 	struct sigaction sa;
 	struct itimerval itv;
 	int error;
 
 	if (timeout <= 0) {
 		log_debugx("session timeout disabled");
 		return;
 	}
 
 	bzero(&sa, sizeof(sa));
 	sa.sa_handler = sigalrm_handler;
 	sigfillset(&sa.sa_mask);
 	error = sigaction(SIGALRM, &sa, NULL);
 	if (error != 0)
 		log_err(1, "sigaction");
 
 	/*
 	 * First SIGALRM will arive after conf_timeout seconds.
 	 * If we do nothing, another one will arrive a second later.
 	 */
 	bzero(&itv, sizeof(itv));
 	itv.it_interval.tv_sec = 1;
 	itv.it_value.tv_sec = timeout;
 
 	log_debugx("setting session timeout to %d seconds",
 	    timeout);
 	error = setitimer(ITIMER_REAL, &itv, NULL);
 	if (error != 0)
 		log_err(1, "setitimer");
 }
 
 static void
 sigchld_handler(int dummy __unused)
 {
 
 	/*
 	 * The only purpose of this handler is to make SIGCHLD
 	 * interrupt the ISCSIDWAIT ioctl(2), so we can call
 	 * wait_for_children().
 	 */
 }
 
 static void
 register_sigchld(void)
 {
 	struct sigaction sa;
 	int error;
 
 	bzero(&sa, sizeof(sa));
 	sa.sa_handler = sigchld_handler;
 	sigfillset(&sa.sa_mask);
 	error = sigaction(SIGCHLD, &sa, NULL);
 	if (error != 0)
 		log_err(1, "sigaction");
 
 }
 
 static void
 handle_request(int iscsi_fd, const struct iscsi_daemon_request *request, int timeout)
 {
 	struct connection *conn;
 
 	log_set_peer_addr(request->idr_conf.isc_target_addr);
 	if (request->idr_conf.isc_target[0] != '\0') {
 		log_set_peer_name(request->idr_conf.isc_target);
 		setproctitle("%s (%s)", request->idr_conf.isc_target_addr, request->idr_conf.isc_target);
 	} else {
 		setproctitle("%s", request->idr_conf.isc_target_addr);
 	}
 
 	conn = connection_new(iscsi_fd, request);
 	set_timeout(timeout);
 	capsicate(conn);
 	login(conn);
 	if (conn->conn_conf.isc_discovery != 0)
 		discovery(conn);
 	else
 		handoff(conn);
 
 	log_debugx("nothing more to do; exiting");
 	exit (0);
 }
 
 static int
 wait_for_children(bool block)
 {
 	pid_t pid;
 	int status;
 	int num = 0;
 
 	for (;;) {
 		/*
 		 * If "block" is true, wait for at least one process.
 		 */
 		if (block && num == 0)
 			pid = wait4(-1, &status, 0, NULL);
 		else
 			pid = wait4(-1, &status, WNOHANG, NULL);
 		if (pid <= 0)
 			break;
 		if (WIFSIGNALED(status)) {
 			log_warnx("child process %d terminated with signal %d",
 			    pid, WTERMSIG(status));
 		} else if (WEXITSTATUS(status) != 0) {
 			log_warnx("child process %d terminated with exit status %d",
 			    pid, WEXITSTATUS(status));
 		} else {
 			log_debugx("child process %d terminated gracefully", pid);
 		}
 		num++;
 	}
 
 	return (num);
 }
 
 int
 main(int argc, char **argv)
 {
 	int ch, debug = 0, error, iscsi_fd, maxproc = 30, retval, saved_errno,
 	    timeout = 60;
 	bool dont_daemonize = false;
 	struct pidfh *pidfh;
 	pid_t pid, otherpid;
 	const char *pidfile_path = DEFAULT_PIDFILE;
 	struct iscsi_daemon_request request;
 
 	while ((ch = getopt(argc, argv, "P:dl:m:t:")) != -1) {
 		switch (ch) {
 		case 'P':
 			pidfile_path = optarg;
 			break;
 		case 'd':
 			dont_daemonize = true;
 			debug++;
 			break;
 		case 'l':
 			debug = atoi(optarg);
 			break;
 		case 'm':
 			maxproc = atoi(optarg);
 			break;
 		case 't':
 			timeout = atoi(optarg);
 			break;
 		case '?':
 		default:
 			usage();
 		}
 	}
 	argc -= optind;
 	if (argc != 0)
 		usage();
 
 	log_init(debug);
 
 	pidfh = pidfile_open(pidfile_path, 0600, &otherpid);
 	if (pidfh == NULL) {
 		if (errno == EEXIST)
 			log_errx(1, "daemon already running, pid: %jd.",
 			    (intmax_t)otherpid);
 		log_err(1, "cannot open or create pidfile \"%s\"",
 		    pidfile_path);
 	}
 
 	iscsi_fd = open(ISCSI_PATH, O_RDWR);
 	if (iscsi_fd < 0 && errno == ENOENT) {
 		saved_errno = errno;
 		retval = kldload("iscsi");
 		if (retval != -1)
 			iscsi_fd = open(ISCSI_PATH, O_RDWR);
 		else
 			errno = saved_errno;
 	}
 	if (iscsi_fd < 0)
 		log_err(1, "failed to open %s", ISCSI_PATH);
 
 	if (dont_daemonize == false) {
 		if (daemon(0, 0) == -1) {
 			log_warn("cannot daemonize");
 			pidfile_remove(pidfh);
 			exit(1);
 		}
 	}
 
 	pidfile_write(pidfh);
 
 	register_sigchld();
 
 	for (;;) {
 		log_debugx("waiting for request from the kernel");
 
 		memset(&request, 0, sizeof(request));
 		error = ioctl(iscsi_fd, ISCSIDWAIT, &request);
 		if (error != 0) {
 			if (errno == EINTR) {
 				nchildren -= wait_for_children(false);
 				assert(nchildren >= 0);
 				continue;
 			}
 
 			log_err(1, "ISCSIDWAIT");
 		}
 
 		if (dont_daemonize) {
 			log_debugx("not forking due to -d flag; "
 			    "will exit after servicing a single request");
 		} else {
 			nchildren -= wait_for_children(false);
 			assert(nchildren >= 0);
 
 			while (maxproc > 0 && nchildren >= maxproc) {
 				log_debugx("maxproc limit of %d child processes hit; "
 				    "waiting for child process to exit", maxproc);
 				nchildren -= wait_for_children(true);
 				assert(nchildren >= 0);
 			}
 			log_debugx("incoming connection; forking child process #%d",
 			    nchildren);
 			nchildren++;
 
 			pid = fork();
 			if (pid < 0)
 				log_err(1, "fork");
 			if (pid > 0)
 				continue;
 		}
 
 		pidfile_close(pidfh);
 		handle_request(iscsi_fd, &request, timeout);
 	}
 
 	return (0);
 }
Index: projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/iscsid.h
===================================================================
--- projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/iscsid.h	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/iscsid.h	(revision 312218)
@@ -1,152 +1,150 @@
 /*-
  * Copyright (c) 2012 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef ISCSID_H
 #define	ISCSID_H
 
 #include <stdbool.h>
 #include <stdint.h>
 
 #include <iscsi_ioctl.h>
 
 #define	DEFAULT_PIDFILE			"/var/run/iscsid.pid"
 
 #define	CONN_DIGEST_NONE		0
 #define	CONN_DIGEST_CRC32C		1
 
 #define	CONN_MUTUAL_CHALLENGE_LEN	1024
 #define	SOCKBUF_SIZE			1048576
-#define	MAX_BURST_LENGTH		(256 * 1024)
-#define	FIRST_BURST_LENGTH		(128 * 1024)
 
 struct connection {
 	int			conn_iscsi_fd;
 	int			conn_socket;
 	unsigned int		conn_session_id;
 	struct iscsi_session_conf	conn_conf;
 	struct iscsi_session_limits	conn_limits;
 	char			conn_target_alias[ISCSI_ADDR_LEN];
 	uint8_t			conn_isid[6];
 	uint16_t		conn_tsih;
 	uint32_t		conn_statsn;
 	int			conn_header_digest;
 	int			conn_data_digest;
 	bool			conn_initial_r2t;
 	bool			conn_immediate_data;
 	int			conn_max_recv_data_segment_length;
 	int			conn_max_send_data_segment_length;
 	int			conn_max_burst_length;
 	int			conn_first_burst_length;
 	struct chap		*conn_mutual_chap;
 };
 
 struct pdu {
 	struct connection	*pdu_connection;
 	struct iscsi_bhs	*pdu_bhs;
 	char			*pdu_data;
 	size_t			pdu_data_len;
 };
 
 #define	KEYS_MAX		1024
 
 struct keys {
 	char			*keys_names[KEYS_MAX];
 	char			*keys_values[KEYS_MAX];
 	char			*keys_data;
 	size_t			keys_data_len;
 };
 
 #define	CHAP_CHALLENGE_LEN	1024
 #define	CHAP_DIGEST_LEN		16 /* Equal to MD5 digest size. */
 
 struct chap {
 	unsigned char	chap_id;
 	char		chap_challenge[CHAP_CHALLENGE_LEN];
 	char		chap_response[CHAP_DIGEST_LEN];
 };
 
 struct rchap {
 	char		*rchap_secret;
 	unsigned char	rchap_id;
 	void		*rchap_challenge;
 	size_t		rchap_challenge_len;
 };
 
 struct chap		*chap_new(void);
 char			*chap_get_id(const struct chap *chap);
 char			*chap_get_challenge(const struct chap *chap);
 int			chap_receive(struct chap *chap, const char *response);
 int			chap_authenticate(struct chap *chap,
 			    const char *secret);
 void			chap_delete(struct chap *chap);
 
 struct rchap		*rchap_new(const char *secret);
 int			rchap_receive(struct rchap *rchap,
 			    const char *id, const char *challenge);
 char			*rchap_get_response(struct rchap *rchap);
 void			rchap_delete(struct rchap *rchap);
 
 struct keys		*keys_new(void);
 void			keys_delete(struct keys *key);
 void			keys_load(struct keys *keys, const struct pdu *pdu);
 void			keys_save(struct keys *keys, struct pdu *pdu);
 const char		*keys_find(struct keys *keys, const char *name);
 void			keys_add(struct keys *keys,
 			    const char *name, const char *value);
 void			keys_add_int(struct keys *keys,
 			    const char *name, int value);
 
 struct pdu		*pdu_new(struct connection *ic);
 struct pdu		*pdu_new_response(struct pdu *request);
 void			pdu_receive(struct pdu *request);
 void			pdu_send(struct pdu *response);
 void			pdu_delete(struct pdu *ip);
 
 void			login(struct connection *ic);
 
 void			discovery(struct connection *ic);
 
 void			log_init(int level);
 void			log_set_peer_name(const char *name);
 void			log_set_peer_addr(const char *addr);
 void			log_err(int, const char *, ...)
 			    __dead2 __printflike(2, 3);
 void			log_errx(int, const char *, ...)
 			    __dead2 __printflike(2, 3);
 void			log_warn(const char *, ...) __printflike(1, 2);
 void			log_warnx(const char *, ...) __printflike(1, 2);
 void			log_debugx(const char *, ...) __printflike(1, 2);
 
 char			*checked_strdup(const char *);
 bool			timed_out(void);
 void			fail(const struct connection *, const char *);
 
 #endif /* !ISCSID_H */
Index: projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/login.c
===================================================================
--- projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/login.c	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017/usr.sbin/iscsid/login.c	(revision 312218)
@@ -1,887 +1,890 @@
 /*-
  * Copyright (c) 2012 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/ioctl.h>
 #include <assert.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <netinet/in.h>
 
 #include "iscsid.h"
 #include "iscsi_proto.h"
 
 static int
 login_nsg(const struct pdu *response)
 {
 	struct iscsi_bhs_login_response *bhslr;
 
 	bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs;
 
 	return (bhslr->bhslr_flags & 0x03);
 }
 
 static void
 login_set_nsg(struct pdu *request, int nsg)
 {
 	struct iscsi_bhs_login_request *bhslr;
 
 	assert(nsg == BHSLR_STAGE_SECURITY_NEGOTIATION ||
 	    nsg == BHSLR_STAGE_OPERATIONAL_NEGOTIATION ||
 	    nsg == BHSLR_STAGE_FULL_FEATURE_PHASE);
 
 	bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs;
 
 	bhslr->bhslr_flags &= 0xFC;
 	bhslr->bhslr_flags |= nsg;
 }
 
 static void
 login_set_csg(struct pdu *request, int csg)
 {
 	struct iscsi_bhs_login_request *bhslr;
 
 	assert(csg == BHSLR_STAGE_SECURITY_NEGOTIATION ||
 	    csg == BHSLR_STAGE_OPERATIONAL_NEGOTIATION ||
 	    csg == BHSLR_STAGE_FULL_FEATURE_PHASE);
 
 	bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs;
 
 	bhslr->bhslr_flags &= 0xF3;
 	bhslr->bhslr_flags |= csg << 2;
 }
 
 static const char *
 login_target_error_str(int class, int detail)
 {
 	static char msg[128];
 
 	/*
 	 * RFC 3270, 10.13.5.  Status-Class and Status-Detail
 	 */
 	switch (class) {
 	case 0x01:
 		switch (detail) {
 		case 0x01:
 			return ("Target moved temporarily");
 		case 0x02:
 			return ("Target moved permanently");
 		default:
 			snprintf(msg, sizeof(msg), "unknown redirection; "
 			    "Status-Class 0x%x, Status-Detail 0x%x",
 			    class, detail);
 			return (msg);
 		}
 	case 0x02:
 		switch (detail) {
 		case 0x00:
 			return ("Initiator error");
 		case 0x01:
 			return ("Authentication failure");
 		case 0x02:
 			return ("Authorization failure");
 		case 0x03:
 			return ("Not found");
 		case 0x04:
 			return ("Target removed");
 		case 0x05:
 			return ("Unsupported version");
 		case 0x06:
 			return ("Too many connections");
 		case 0x07:
 			return ("Missing parameter");
 		case 0x08:
 			return ("Can't include in session");
 		case 0x09:
 			return ("Session type not supported");
 		case 0x0a:
 			return ("Session does not exist");
 		case 0x0b:
 			return ("Invalid during login");
 		default:
 			snprintf(msg, sizeof(msg), "unknown initiator error; "
 			    "Status-Class 0x%x, Status-Detail 0x%x",
 			    class, detail);
 			return (msg);
 		}
 	case 0x03:
 		switch (detail) {
 		case 0x00:
 			return ("Target error");
 		case 0x01:
 			return ("Service unavailable");
 		case 0x02:
 			return ("Out of resources");
 		default:
 			snprintf(msg, sizeof(msg), "unknown target error; "
 			    "Status-Class 0x%x, Status-Detail 0x%x",
 			    class, detail);
 			return (msg);
 		}
 	default:
 		snprintf(msg, sizeof(msg), "unknown error; "
 		    "Status-Class 0x%x, Status-Detail 0x%x",
 		    class, detail);
 		return (msg);
 	}
 }
 
 static void
 kernel_modify(const struct connection *conn, const char *target_address)
 {
 	struct iscsi_session_modify ism;
 	int error;
 
 	memset(&ism, 0, sizeof(ism));
 	ism.ism_session_id = conn->conn_session_id;
 	memcpy(&ism.ism_conf, &conn->conn_conf, sizeof(ism.ism_conf));
 	strlcpy(ism.ism_conf.isc_target_addr, target_address,
 	    sizeof(ism.ism_conf.isc_target));
 	error = ioctl(conn->conn_iscsi_fd, ISCSISMODIFY, &ism);
 	if (error != 0) {
 		log_err(1, "failed to redirect to %s: ISCSISMODIFY",
 		    target_address);
 	}
 }
 
 /*
  * XXX:	The way it works is suboptimal; what should happen is described
  *	in draft-gilligan-iscsi-fault-tolerance-00.  That, however, would
  *	be much more complicated: we would need to keep "dependencies"
  *	for sessions, so that, in case described in draft and using draft
  *	terminology, we would have three sessions: one for discovery,
  *	one for initial target portal, and one for redirect portal.
  *	This would allow us to "backtrack" on connection failure,
  *	as described in draft.
  */
 static void
 login_handle_redirection(struct connection *conn, struct pdu *response)
 {
 	struct iscsi_bhs_login_response *bhslr;
 	struct keys *response_keys;
 	const char *target_address;
 
 	bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs;
 	assert (bhslr->bhslr_status_class == 1);
 
 	response_keys = keys_new();
 	keys_load(response_keys, response);
 
 	target_address = keys_find(response_keys, "TargetAddress");
 	if (target_address == NULL)
 		log_errx(1, "received redirection without TargetAddress");
 	if (target_address[0] == '\0')
 		log_errx(1, "received redirection with empty TargetAddress");
 	if (strlen(target_address) >=
 	    sizeof(conn->conn_conf.isc_target_addr) - 1)
 		log_errx(1, "received TargetAddress is too long");
 
 	log_debugx("received redirection to \"%s\"", target_address);
 	kernel_modify(conn, target_address);
 	keys_delete(response_keys);
 }
 
 static struct pdu *
 login_receive(struct connection *conn)
 {
 	struct pdu *response;
 	struct iscsi_bhs_login_response *bhslr;
 	const char *errorstr;
 	static bool initial = true;
 
 	response = pdu_new(conn);
 	pdu_receive(response);
 	if (response->pdu_bhs->bhs_opcode != ISCSI_BHS_OPCODE_LOGIN_RESPONSE) {
 		log_errx(1, "protocol error: received invalid opcode 0x%x",
 		    response->pdu_bhs->bhs_opcode);
 	}
 	bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs;
 	/*
 	 * XXX: Implement the C flag some day.
 	 */
 	if ((bhslr->bhslr_flags & BHSLR_FLAGS_CONTINUE) != 0)
 		log_errx(1, "received Login PDU with unsupported \"C\" flag");
 	if (bhslr->bhslr_version_max != 0x00)
 		log_errx(1, "received Login PDU with unsupported "
 		    "Version-max 0x%x", bhslr->bhslr_version_max);
 	if (bhslr->bhslr_version_active != 0x00)
 		log_errx(1, "received Login PDU with unsupported "
 		    "Version-active 0x%x", bhslr->bhslr_version_active);
 	if (bhslr->bhslr_status_class == 1) {
 		login_handle_redirection(conn, response);
 		log_debugx("redirection handled; exiting");
 		exit(0);
 	}
 	if (bhslr->bhslr_status_class != 0) {
 		errorstr = login_target_error_str(bhslr->bhslr_status_class,
 		    bhslr->bhslr_status_detail);
 		fail(conn, errorstr);
 		log_errx(1, "target returned error: %s", errorstr);
 	}
 	if (initial == false &&
 	    ntohl(bhslr->bhslr_statsn) != conn->conn_statsn + 1) {
 		/*
 		 * It's a warning, not an error, to work around what seems
 		 * to be bug in NetBSD iSCSI target.
 		 */
 		log_warnx("received Login PDU with wrong StatSN: "
 		    "is %u, should be %u", ntohl(bhslr->bhslr_statsn),
 		    conn->conn_statsn + 1);
 	}
 	conn->conn_tsih = ntohs(bhslr->bhslr_tsih);
 	conn->conn_statsn = ntohl(bhslr->bhslr_statsn);
 
 	initial = false;
 
 	return (response);
 }
 
 static struct pdu *
 login_new_request(struct connection *conn, int csg)
 {
 	struct pdu *request;
 	struct iscsi_bhs_login_request *bhslr;
 	int nsg;
 
 	request = pdu_new(conn);
 	bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs;
 	bhslr->bhslr_opcode = ISCSI_BHS_OPCODE_LOGIN_REQUEST |
 	    ISCSI_BHS_OPCODE_IMMEDIATE;
 
 	bhslr->bhslr_flags = BHSLR_FLAGS_TRANSIT;
 	switch (csg) {
 	case BHSLR_STAGE_SECURITY_NEGOTIATION:
 		nsg = BHSLR_STAGE_OPERATIONAL_NEGOTIATION;
 		break;
 	case BHSLR_STAGE_OPERATIONAL_NEGOTIATION:
 		nsg = BHSLR_STAGE_FULL_FEATURE_PHASE;
 		break;
 	default:
 		assert(!"invalid csg");
 		log_errx(1, "invalid csg %d", csg);
 	}
 	login_set_csg(request, csg);
 	login_set_nsg(request, nsg);
 
 	memcpy(bhslr->bhslr_isid, &conn->conn_isid, sizeof(bhslr->bhslr_isid));
 	bhslr->bhslr_tsih = htons(conn->conn_tsih);
 	bhslr->bhslr_initiator_task_tag = 0;
 	bhslr->bhslr_cmdsn = 0;
 	bhslr->bhslr_expstatsn = htonl(conn->conn_statsn + 1);
 
 	return (request);
 }
 
 static int
 login_list_prefers(const char *list,
     const char *choice1, const char *choice2)
 {
 	char *tofree, *str, *token;
 
 	tofree = str = checked_strdup(list);
 
 	while ((token = strsep(&str, ",")) != NULL) {
 		if (strcmp(token, choice1) == 0) {
 			free(tofree);
 			return (1);
 		}
 		if (strcmp(token, choice2) == 0) {
 			free(tofree);
 			return (2);
 		}
 	}
 	free(tofree);
 	return (-1);
 }
 
 static void
 login_negotiate_key(struct connection *conn, const char *name,
     const char *value)
 {
 	struct iscsi_session_limits *isl;
 	int which, tmp;
 
 	isl = &conn->conn_limits;
 	if (strcmp(name, "TargetAlias") == 0) {
 		strlcpy(conn->conn_target_alias, value,
 		    sizeof(conn->conn_target_alias));
 	} else if (strcmp(value, "Irrelevant") == 0) {
 		/* Ignore. */
 	} else if (strcmp(name, "HeaderDigest") == 0) {
 		which = login_list_prefers(value, "CRC32C", "None");
 		switch (which) {
 		case 1:
 			log_debugx("target prefers CRC32C "
 			    "for header digest; we'll use it");
 			conn->conn_header_digest = CONN_DIGEST_CRC32C;
 			break;
 		case 2:
 			log_debugx("target prefers not to do "
 			    "header digest; we'll comply");
 			break;
 		default:
 			log_warnx("target sent unrecognized "
 			    "HeaderDigest value \"%s\"; will use None", value);
 			break;
 		}
 	} else if (strcmp(name, "DataDigest") == 0) {
 		which = login_list_prefers(value, "CRC32C", "None");
 		switch (which) {
 		case 1:
 			log_debugx("target prefers CRC32C "
 			    "for data digest; we'll use it");
 			conn->conn_data_digest = CONN_DIGEST_CRC32C;
 			break;
 		case 2:
 			log_debugx("target prefers not to do "
 			    "data digest; we'll comply");
 			break;
 		default:
 			log_warnx("target sent unrecognized "
 			    "DataDigest value \"%s\"; will use None", value);
 			break;
 		}
 	} else if (strcmp(name, "MaxConnections") == 0) {
 		/* Ignore. */
 	} else if (strcmp(name, "InitialR2T") == 0) {
 		if (strcmp(value, "Yes") == 0)
 			conn->conn_initial_r2t = true;
 		else
 			conn->conn_initial_r2t = false;
 	} else if (strcmp(name, "ImmediateData") == 0) {
 		if (strcmp(value, "Yes") == 0)
 			conn->conn_immediate_data = true;
 		else
 			conn->conn_immediate_data = false;
 	} else if (strcmp(name, "MaxRecvDataSegmentLength") == 0) {
 		tmp = strtoul(value, NULL, 10);
 		if (tmp <= 0)
 			log_errx(1, "received invalid "
 			    "MaxRecvDataSegmentLength");
 		if (tmp > isl->isl_max_send_data_segment_length) {
 			log_debugx("capping max_send_data_segment_length "
 			    "from %d to %d", tmp,
 			    isl->isl_max_send_data_segment_length);
 			tmp = isl->isl_max_send_data_segment_length;
 		}
 		conn->conn_max_send_data_segment_length = tmp;
+		/* We received target's limit, that means it accepted our's. */
+		conn->conn_max_recv_data_segment_length =
+		    isl->isl_max_recv_data_segment_length;
 	} else if (strcmp(name, "MaxBurstLength") == 0) {
 		tmp = strtoul(value, NULL, 10);
 		if (tmp <= 0)
 			log_errx(1, "received invalid MaxBurstLength");
 		if (tmp > isl->isl_max_burst_length) {
 			log_debugx("capping MaxBurstLength "
 			    "from %d to %d", tmp, isl->isl_max_burst_length);
 			tmp = isl->isl_max_burst_length;
 		}
 		conn->conn_max_burst_length = tmp;
 	} else if (strcmp(name, "FirstBurstLength") == 0) {
 		tmp = strtoul(value, NULL, 10);
 		if (tmp <= 0)
 			log_errx(1, "received invalid FirstBurstLength");
 		if (tmp > isl->isl_first_burst_length) {
 			log_debugx("capping FirstBurstLength "
 			    "from %d to %d", tmp, isl->isl_first_burst_length);
 			tmp = isl->isl_first_burst_length;
 		}
 		conn->conn_first_burst_length = tmp;
 	} else if (strcmp(name, "DefaultTime2Wait") == 0) {
 		/* Ignore */
 	} else if (strcmp(name, "DefaultTime2Retain") == 0) {
 		/* Ignore */
 	} else if (strcmp(name, "MaxOutstandingR2T") == 0) {
 		/* Ignore */
 	} else if (strcmp(name, "DataPDUInOrder") == 0) {
 		/* Ignore */
 	} else if (strcmp(name, "DataSequenceInOrder") == 0) {
 		/* Ignore */
 	} else if (strcmp(name, "ErrorRecoveryLevel") == 0) {
 		/* Ignore */
 	} else if (strcmp(name, "OFMarker") == 0) {
 		/* Ignore */
 	} else if (strcmp(name, "IFMarker") == 0) {
 		/* Ignore */
 	} else if (strcmp(name, "RDMAExtensions") == 0) {
 		if (conn->conn_conf.isc_iser == 1 &&
 		    strcmp(value, "Yes") != 0) {
 			log_errx(1, "received unsupported RDMAExtensions");
 		}
 	} else if (strcmp(name, "InitiatorRecvDataSegmentLength") == 0) {
 		tmp = strtoul(value, NULL, 10);
 		if (tmp <= 0)
 			log_errx(1, "received invalid "
 			    "InitiatorRecvDataSegmentLength");
 		if ((int)tmp > isl->isl_max_recv_data_segment_length) {
 			log_debugx("capping InitiatorRecvDataSegmentLength "
 			    "from %d to %d", tmp,
 			    isl->isl_max_recv_data_segment_length);
 			tmp = isl->isl_max_recv_data_segment_length;
 		}
 		conn->conn_max_recv_data_segment_length = tmp;
 	} else if (strcmp(name, "TargetPortalGroupTag") == 0) {
 		/* Ignore */
 	} else if (strcmp(name, "TargetRecvDataSegmentLength") == 0) {
 		tmp = strtoul(value, NULL, 10);
 		if (tmp <= 0) {
 			log_errx(1,
 			    "received invalid TargetRecvDataSegmentLength");
 		}
 		if (tmp > isl->isl_max_send_data_segment_length) {
 			log_debugx("capping TargetRecvDataSegmentLength "
 			    "from %d to %d", tmp,
 			    isl->isl_max_send_data_segment_length);
 			tmp = isl->isl_max_send_data_segment_length;
 		}
 		conn->conn_max_send_data_segment_length = tmp;
 	} else {
 		log_debugx("unknown key \"%s\"; ignoring",  name);
 	}
 }
 
 static void
 login_negotiate(struct connection *conn)
 {
 	struct pdu *request, *response;
 	struct keys *request_keys, *response_keys;
 	struct iscsi_bhs_login_response *bhslr;
 	int i, nrequests = 0;
 	struct iscsi_session_limits *isl;
 
 	log_debugx("beginning operational parameter negotiation");
 	request = login_new_request(conn, BHSLR_STAGE_OPERATIONAL_NEGOTIATION);
 	request_keys = keys_new();
 
 	isl = &conn->conn_limits;
 	log_debugx("Limits for offload \"%s\" are "
 	    "MaxRecvDataSegment=%d, max_send_dsl=%d, "
 	    "MaxBurstLength=%d, FirstBurstLength=%d",
 	    conn->conn_conf.isc_offload, isl->isl_max_recv_data_segment_length,
 	    isl->isl_max_send_data_segment_length, isl->isl_max_burst_length,
 	    isl->isl_first_burst_length);
 
 	/*
 	 * The following keys are irrelevant for discovery sessions.
 	 */
 	if (conn->conn_conf.isc_discovery == 0) {
 		if (conn->conn_conf.isc_header_digest != 0)
 			keys_add(request_keys, "HeaderDigest", "CRC32C");
 		else
 			keys_add(request_keys, "HeaderDigest", "None");
 		if (conn->conn_conf.isc_data_digest != 0)
 			keys_add(request_keys, "DataDigest", "CRC32C");
 		else
 			keys_add(request_keys, "DataDigest", "None");
 
 		keys_add(request_keys, "ImmediateData", "Yes");
 		keys_add_int(request_keys, "MaxBurstLength",
 		    isl->isl_max_burst_length);
 		keys_add_int(request_keys, "FirstBurstLength",
 		    isl->isl_first_burst_length);
 		keys_add(request_keys, "InitialR2T", "Yes");
 		keys_add(request_keys, "MaxOutstandingR2T", "1");
 		if (conn->conn_conf.isc_iser == 1) {
 			keys_add_int(request_keys, "InitiatorRecvDataSegmentLength",
 			    isl->isl_max_recv_data_segment_length);
 			keys_add_int(request_keys, "TargetRecvDataSegmentLength",
 			    isl->isl_max_send_data_segment_length);
 			keys_add(request_keys, "RDMAExtensions", "Yes");
 		} else {
 			keys_add_int(request_keys, "MaxRecvDataSegmentLength",
 			    isl->isl_max_recv_data_segment_length);
 		}
 	} else {
 		keys_add(request_keys, "HeaderDigest", "None");
 		keys_add(request_keys, "DataDigest", "None");
 		keys_add_int(request_keys, "MaxRecvDataSegmentLength",
 		    isl->isl_max_recv_data_segment_length);
 	}
 
 	keys_add(request_keys, "DefaultTime2Wait", "0");
 	keys_add(request_keys, "DefaultTime2Retain", "0");
 	keys_add(request_keys, "ErrorRecoveryLevel", "0");
 	keys_save(request_keys, request);
 	keys_delete(request_keys);
 	request_keys = NULL;
 	pdu_send(request);
 	pdu_delete(request);
 	request = NULL;
 
 	response = login_receive(conn);
 	response_keys = keys_new();
 	keys_load(response_keys, response);
 	for (i = 0; i < KEYS_MAX; i++) {
 		if (response_keys->keys_names[i] == NULL)
 			break;
 
 		login_negotiate_key(conn,
 		    response_keys->keys_names[i], response_keys->keys_values[i]);
 	}
 
 	keys_delete(response_keys);
 	response_keys = NULL;
 
 	for (;;) {
 		bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs;
 		if ((bhslr->bhslr_flags & BHSLR_FLAGS_TRANSIT) != 0)
 			break;
 
 		nrequests++;
 		if (nrequests > 5) {
 			log_warnx("received login response "
 			    "without the \"T\" flag too many times; giving up");
 			break;
 		}
 
 		log_debugx("received login response "
 		    "without the \"T\" flag; sending another request");
 
 		pdu_delete(response);
 
 		request = login_new_request(conn,
 		    BHSLR_STAGE_OPERATIONAL_NEGOTIATION);
 		pdu_send(request);
 		pdu_delete(request);
 
 		response = login_receive(conn);
 	}
 
 	if (login_nsg(response) != BHSLR_STAGE_FULL_FEATURE_PHASE)
 		log_warnx("received final login response with wrong NSG 0x%x",
 		    login_nsg(response));
 	pdu_delete(response);
 
 	log_debugx("operational parameter negotiation done; "
 	    "transitioning to Full Feature phase");
 }
 
 static void
 login_send_chap_a(struct connection *conn)
 {
 	struct pdu *request;
 	struct keys *request_keys;
 
 	request = login_new_request(conn, BHSLR_STAGE_SECURITY_NEGOTIATION);
 	request_keys = keys_new();
 	keys_add(request_keys, "CHAP_A", "5");
 	keys_save(request_keys, request);
 	keys_delete(request_keys);
 	pdu_send(request);
 	pdu_delete(request);
 }
 
 static void
 login_send_chap_r(struct pdu *response)
 {
 	struct connection *conn;
 	struct pdu *request;
 	struct keys *request_keys, *response_keys;
 	struct rchap *rchap;
 	const char *chap_a, *chap_c, *chap_i;
 	char *chap_r;
 	int error;
         char *mutual_chap_c, *mutual_chap_i;
 
 	/*
 	 * As in the rest of the initiator, 'request' means
 	 * 'initiator -> target', and 'response' means 'target -> initiator',
 	 *
 	 * So, here the 'response' from the target is the packet that contains
 	 * CHAP challenge; our CHAP response goes into 'request'.
 	 */
 
 	conn = response->pdu_connection;
 
 	response_keys = keys_new();
 	keys_load(response_keys, response);
 
 	/*
 	 * First, compute the response.
 	 */
 	chap_a = keys_find(response_keys, "CHAP_A");
 	if (chap_a == NULL)
 		log_errx(1, "received CHAP packet without CHAP_A");
 	chap_c = keys_find(response_keys, "CHAP_C");
 	if (chap_c == NULL)
 		log_errx(1, "received CHAP packet without CHAP_C");
 	chap_i = keys_find(response_keys, "CHAP_I");
 	if (chap_i == NULL)
 		log_errx(1, "received CHAP packet without CHAP_I");
 
 	if (strcmp(chap_a, "5") != 0) {
 		log_errx(1, "received CHAP packet "
 		    "with unsupported CHAP_A \"%s\"", chap_a);
 	}
 
 	rchap = rchap_new(conn->conn_conf.isc_secret);
 	error = rchap_receive(rchap, chap_i, chap_c);
 	if (error != 0) {
 		log_errx(1, "received CHAP packet "
 		    "with malformed CHAP_I or CHAP_C");
 	}
 	chap_r = rchap_get_response(rchap);
 	rchap_delete(rchap);
 
 	keys_delete(response_keys);
 
 	request = login_new_request(conn, BHSLR_STAGE_SECURITY_NEGOTIATION);
 	request_keys = keys_new();
 	keys_add(request_keys, "CHAP_N", conn->conn_conf.isc_user);
 	keys_add(request_keys, "CHAP_R", chap_r);
 	free(chap_r);
 
 	/*
 	 * If we want mutual authentication, we're expected to send
 	 * our CHAP_I/CHAP_C now.
 	 */
 	if (conn->conn_conf.isc_mutual_user[0] != '\0') {
 		log_debugx("requesting mutual authentication; "
 		    "binary challenge size is %zd bytes",
 		    sizeof(conn->conn_mutual_chap->chap_challenge));
 
 		assert(conn->conn_mutual_chap == NULL);
 		conn->conn_mutual_chap = chap_new();
 		mutual_chap_i = chap_get_id(conn->conn_mutual_chap);
 		mutual_chap_c = chap_get_challenge(conn->conn_mutual_chap);
 		keys_add(request_keys, "CHAP_I", mutual_chap_i);
 		keys_add(request_keys, "CHAP_C", mutual_chap_c);
 		free(mutual_chap_i);
 		free(mutual_chap_c);
 	}
 
 	keys_save(request_keys, request);
 	keys_delete(request_keys);
 	pdu_send(request);
 	pdu_delete(request);
 }
 
 static void
 login_verify_mutual(const struct pdu *response)
 {
 	struct connection *conn;
 	struct keys *response_keys;
 	const char *chap_n, *chap_r;
 	int error;
 
 	conn = response->pdu_connection;
 
 	response_keys = keys_new();
 	keys_load(response_keys, response);
 
         chap_n = keys_find(response_keys, "CHAP_N");
         if (chap_n == NULL)
                 log_errx(1, "received CHAP Response PDU without CHAP_N");
         chap_r = keys_find(response_keys, "CHAP_R");
         if (chap_r == NULL)
                 log_errx(1, "received CHAP Response PDU without CHAP_R");
 
 	error = chap_receive(conn->conn_mutual_chap, chap_r);
 	if (error != 0)
                 log_errx(1, "received CHAP Response PDU with invalid CHAP_R");
 
 	if (strcmp(chap_n, conn->conn_conf.isc_mutual_user) != 0) {
 		fail(conn, "Mutual CHAP failed");
 		log_errx(1, "mutual CHAP authentication failed: wrong user");
 	}
 
 	error = chap_authenticate(conn->conn_mutual_chap,
 	    conn->conn_conf.isc_mutual_secret);
 	if (error != 0) {
 		fail(conn, "Mutual CHAP failed");
                 log_errx(1, "mutual CHAP authentication failed: wrong secret");
 	}
 
 	keys_delete(response_keys);
 	chap_delete(conn->conn_mutual_chap);
 	conn->conn_mutual_chap = NULL;
 
 	log_debugx("mutual CHAP authentication succeeded");
 }
 
 static void
 login_chap(struct connection *conn)
 {
 	struct pdu *response;
 
 	log_debugx("beginning CHAP authentication; sending CHAP_A");
 	login_send_chap_a(conn);
 
 	log_debugx("waiting for CHAP_A/CHAP_C/CHAP_I");
 	response = login_receive(conn);
 
 	log_debugx("sending CHAP_N/CHAP_R");
 	login_send_chap_r(response);
 	pdu_delete(response);
 
 	/*
 	 * XXX: Make sure this is not susceptible to MITM.
 	 */
 
 	log_debugx("waiting for CHAP result");
 	response = login_receive(conn);
 	if (conn->conn_conf.isc_mutual_user[0] != '\0')
 		login_verify_mutual(response);
 	pdu_delete(response);
 
 	log_debugx("CHAP authentication done");
 }
 
 void
 login(struct connection *conn)
 {
 	struct pdu *request, *response;
 	struct keys *request_keys, *response_keys;
 	struct iscsi_bhs_login_response *bhslr2;
 	const char *auth_method;
 	int i;
 
 	log_debugx("beginning Login phase; sending Login PDU");
 	request = login_new_request(conn, BHSLR_STAGE_SECURITY_NEGOTIATION);
 	request_keys = keys_new();
 	if (conn->conn_conf.isc_mutual_user[0] != '\0') {
 		keys_add(request_keys, "AuthMethod", "CHAP");
 	} else if (conn->conn_conf.isc_user[0] != '\0') {
 		/*
 		 * Give target a chance to skip authentication if it
 		 * doesn't feel like it.
 		 *
 		 * None is first, CHAP second; this is to work around
 		 * what seems to be LIO (Linux target) bug: otherwise,
 		 * if target is configured with no authentication,
 		 * and we are configured to authenticate, the target
 		 * will erroneously respond with AuthMethod=CHAP
 		 * instead of AuthMethod=None, and will subsequently
 		 * fail the connection.  This usually happens with
 		 * Discovery sessions, which default to no authentication.
 		 */
 		keys_add(request_keys, "AuthMethod", "None,CHAP");
 	} else {
 		keys_add(request_keys, "AuthMethod", "None");
 	}
 	keys_add(request_keys, "InitiatorName",
 	    conn->conn_conf.isc_initiator);
 	if (conn->conn_conf.isc_initiator_alias[0] != '\0') {
 		keys_add(request_keys, "InitiatorAlias",
 		    conn->conn_conf.isc_initiator_alias);
 	}
 	if (conn->conn_conf.isc_discovery == 0) {
 		keys_add(request_keys, "SessionType", "Normal");
 		keys_add(request_keys,
 		    "TargetName", conn->conn_conf.isc_target);
 	} else {
 		keys_add(request_keys, "SessionType", "Discovery");
 	}
 	keys_save(request_keys, request);
 	keys_delete(request_keys);
 	pdu_send(request);
 	pdu_delete(request);
 
 	response = login_receive(conn);
 
 	response_keys = keys_new();
 	keys_load(response_keys, response);
 
 	for (i = 0; i < KEYS_MAX; i++) {
 		if (response_keys->keys_names[i] == NULL)
 			break;
 
 		/*
 		 * Not interested in AuthMethod at this point; we only need
 		 * to parse things such as TargetAlias.
 		 *
 		 * XXX: This is somewhat ugly.  We should have a way to apply
 		 *      all the keys to the session and use that by default
 		 *      instead of discarding them.
 		 */
 		if (strcmp(response_keys->keys_names[i], "AuthMethod") == 0)
 			continue;
 
 		login_negotiate_key(conn,
 		    response_keys->keys_names[i], response_keys->keys_values[i]);
 	}
 
 	bhslr2 = (struct iscsi_bhs_login_response *)response->pdu_bhs;
 	if ((bhslr2->bhslr_flags & BHSLR_FLAGS_TRANSIT) != 0 &&
 	    login_nsg(response) == BHSLR_STAGE_OPERATIONAL_NEGOTIATION) {
 		if (conn->conn_conf.isc_mutual_user[0] != '\0') {
 			log_errx(1, "target requested transition "
 			    "to operational parameter negotiation, "
 			    "but we require mutual CHAP");
 		}
 
 		log_debugx("target requested transition "
 		    "to operational parameter negotiation");
 		keys_delete(response_keys);
 		pdu_delete(response);
 		login_negotiate(conn);
 		return;
 	}
 
 	auth_method = keys_find(response_keys, "AuthMethod");
 	if (auth_method == NULL)
 		log_errx(1, "received response without AuthMethod");
 	if (strcmp(auth_method, "None") == 0) {
 		if (conn->conn_conf.isc_mutual_user[0] != '\0') {
 			log_errx(1, "target does not require authantication, "
 			    "but we require mutual CHAP");
 		}
 
 		log_debugx("target does not require authentication");
 		keys_delete(response_keys);
 		pdu_delete(response);
 		login_negotiate(conn);
 		return;
 	}
 
 	if (strcmp(auth_method, "CHAP") != 0) {
 		fail(conn, "Unsupported AuthMethod");
 		log_errx(1, "received response "
 		    "with unsupported AuthMethod \"%s\"", auth_method);
 	}
 
 	if (conn->conn_conf.isc_user[0] == '\0' ||
 	    conn->conn_conf.isc_secret[0] == '\0') {
 		fail(conn, "Authentication required");
 		log_errx(1, "target requests CHAP authentication, but we don't "
 		    "have user and secret");
 	}
 
 	keys_delete(response_keys);
 	response_keys = NULL;
 	pdu_delete(response);
 	response = NULL;
 
 	login_chap(conn);
 	login_negotiate(conn);
 }
Index: projects/netbsd-tests-upstream-01-2017
===================================================================
--- projects/netbsd-tests-upstream-01-2017	(revision 312217)
+++ projects/netbsd-tests-upstream-01-2017	(revision 312218)

Property changes on: projects/netbsd-tests-upstream-01-2017
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r312125-312217